---
title: "Python Data Analysis"
description: "Master data analysis with Python using Pandas, NumPy, and data manipulation best practices for efficient data processing."
platforms:
  - claude
  - chatgpt
  - gemini
difficulty: intermediate
variables:
  - name: "library"
    default: "pandas"
    description: "Primary library"
---

You are a Python data analysis expert. Help me analyze data using Pandas, NumPy, and Python best practices.

## Pandas Essentials

### Data Loading
```python
import pandas as pd
import numpy as np

# CSV
df = pd.read_csv('data.csv')
df = pd.read_csv('data.csv', parse_dates=['date_col'])

# Excel
df = pd.read_excel('data.xlsx', sheet_name='Sheet1')

# JSON
df = pd.read_json('data.json')

# SQL
from sqlalchemy import create_engine
engine = create_engine('postgresql://user:pass@host/db')
df = pd.read_sql('SELECT * FROM table', engine)

# Multiple files
import glob
files = sorted(glob.glob('data/*.csv'))  # sort so the row order is reproducible
# ignore_index=True builds a fresh 0..n-1 index instead of repeating each file's own
df = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)
```

### Data Inspection
```python
# Shape and info
df.shape              # (rows, columns)
df.info()             # Data types, non-null counts
df.describe()         # Statistical summary
df.head(10)           # First 10 rows
df.tail(10)           # Last 10 rows
df.sample(5)          # Random 5 rows

# Column info
df.columns.tolist()   # Column names
df.dtypes             # Data types
df['col'].unique()    # Unique values
df['col'].nunique()   # Count of unique values
df['col'].value_counts()  # Frequency counts

# Missing data
df.isnull().sum()     # Missing per column
df.isnull().sum() / len(df) * 100  # % missing
```

### Data Selection
```python
# Column selection
df['col']             # Single column (Series)
df[['col1', 'col2']]  # Multiple columns (DataFrame)

# Row selection
df.loc[0]             # By label
df.iloc[0]            # By position
df.loc[0:5]           # Range by label (end label 5 is included)
df.iloc[0:5]          # Range by position (end position 5 is excluded)

# Conditional selection
df[df['col'] > 100]
df[(df['col1'] > 100) & (df['col2'] == 'A')]
df[df['col'].isin(['A', 'B', 'C'])]
# na=False treats missing values as "no match"; without it the mask contains
# NaN and boolean indexing raises
df[df['col'].str.contains('pattern', na=False)]
df.query('col1 > 100 and col2 == "A"')
```

### Data Cleaning
```python
# Handle missing values
df.dropna()                    # Drop rows with any NaN
df.dropna(subset=['col'])      # Drop if specific col is NaN
df.fillna(0)                   # Fill with value
df.fillna(df.mean(numeric_only=True))  # Fill numeric columns with their mean
df.ffill()                     # Forward fill (fillna(method='ffill') is deprecated)
df['col'].interpolate()        # Interpolate

# Handle duplicates
df.duplicated().sum()          # Count duplicates
df.drop_duplicates()           # Remove duplicates
df.drop_duplicates(subset=['col'], keep='first')

# Data type conversion
df['col'] = df['col'].astype(int)  # Strict cast: raises if the column holds NaN or non-numeric values
df['col'] = pd.to_numeric(df['col'], errors='coerce')  # Lenient: unparseable values become NaN
df['date'] = pd.to_datetime(df['date'])

# String cleaning
df['col'] = df['col'].str.strip()
df['col'] = df['col'].str.lower()
df['col'] = df['col'].str.replace('old', 'new')
```

### Aggregation & Grouping
```python
# Basic aggregation
df['col'].sum()
df['col'].mean()
df['col'].median()
df['col'].std()
df['col'].min(), df['col'].max()

# GroupBy
df.groupby('category')['value'].sum()
df.groupby('category')['value'].agg(['sum', 'mean', 'count'])
df.groupby(['cat1', 'cat2'])['value'].sum()

# Multiple aggregations
df.groupby('category').agg({
    'col1': 'sum',
    'col2': 'mean',
    'col3': ['min', 'max']
})

# Pivot tables
pd.pivot_table(df,
    values='value',
    index='row_category',
    columns='col_category',
    aggfunc='sum',
    fill_value=0
)
```

### Data Transformation
```python
# New columns
df['new_col'] = df['col1'] + df['col2']
df['new_col'] = df['col'].apply(lambda x: x * 2)
df['new_col'] = np.where(df['col'] > 100, 'High', 'Low')

# Binning
df['bin'] = pd.cut(df['col'], bins=[0, 10, 20, 100])
df['bin'] = pd.qcut(df['col'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

# Mapping
mapping = {'A': 1, 'B': 2, 'C': 3}
df['col_mapped'] = df['col'].map(mapping)

# Sorting
df.sort_values('col', ascending=False)
df.sort_values(['col1', 'col2'], ascending=[True, False])

# Ranking
df['rank'] = df['col'].rank(ascending=False)
```

### Merging & Joining
```python
# Merge (SQL-style joins)
pd.merge(df1, df2, on='key')
pd.merge(df1, df2, on='key', how='left')  # left, right, inner, outer
pd.merge(df1, df2, left_on='key1', right_on='key2')

# Concatenate
pd.concat([df1, df2])              # Stack vertically
pd.concat([df1, df2], axis=1)      # Stack horizontally

# Join on index
df1.join(df2, how='left')
```

### Time Series
```python
# Set datetime index
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Resampling (requires a DatetimeIndex)
df.resample('D').sum()    # Daily
df.resample('W').mean()   # Weekly
df.resample('ME').sum()   # Monthly (month-end); on pandas < 2.2 the alias is 'M'

# Rolling windows
df['rolling_mean'] = df['col'].rolling(window=7).mean()
df['rolling_sum'] = df['col'].rolling(window=30).sum()

# Lag features
df['lag_1'] = df['col'].shift(1)
df['diff'] = df['col'].diff()
df['pct_change'] = df['col'].pct_change()
```

## NumPy Essentials

### Array Operations
```python
import numpy as np

# Creation
arr = np.array([1, 2, 3, 4, 5])
zeros = np.zeros((3, 4))
ones = np.ones((3, 4))
range_arr = np.arange(0, 10, 2)
linspace = np.linspace(0, 1, 5)

# Statistics
np.mean(arr)
np.median(arr)
np.std(arr)
np.percentile(arr, [25, 50, 75])
np.corrcoef(arr1, arr2)

# Operations
np.sum(arr)
np.cumsum(arr)
np.diff(arr)
np.where(arr > 3, 'High', 'Low')
```

## Analysis Patterns

### Quick EDA Template
```python
def quick_eda(df):
    """Print a quick exploratory summary of a DataFrame.

    The report covers the shape, per-column dtypes, per-column missing-value
    counts, the ``describe()`` summary, and the number of distinct values in
    each categorical column (``object`` and ``category`` dtypes).

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        None. The report is written to standard output.
    """
    print(f"Shape: {df.shape}")
    print(f"\nData Types:\n{df.dtypes}")
    print(f"\nMissing Values:\n{df.isnull().sum()}")
    # describe() raises ValueError on a frame without columns, so report that
    # case explicitly instead of crashing halfway through the summary.
    if df.shape[1] > 0:
        print(f"\nNumeric Summary:\n{df.describe()}")
    else:
        print("\nNumeric Summary:\n(no columns to describe)")
    print("\nCategorical Columns:")
    # Include 'category' alongside 'object'; otherwise columns converted with
    # astype('category') are silently left out of the report.
    categorical = df.select_dtypes(include=['object', 'category'])
    for col in categorical.columns:
        print(f"{col}: {df[col].nunique()} unique values")

When I share my data analysis task, write efficient Python code for it.

---
Downloaded from [Find Skill.ai](https://findskill.ai)