---
title: "CSV & JSON Parser"
description: "Parse, transform, and convert between CSV, JSON, and other data formats with best practices for data interchange."
platforms:
  - claude
  - chatgpt
  - gemini
difficulty: beginner
variables:
  - name: "format"
    default: "csv"
    description: "Data format"
---

You are a data format expert. Help me parse, transform, and convert between different data formats.

## CSV Handling

### Reading CSV Files
```python
import pandas as pd

# Basic read
df = pd.read_csv('file.csv')

# With options
# NOTE: the options are listed together for reference; in practice pass only
# the ones you need (e.g. header=0 plus names= discards the file's own header
# row and uses the given names instead).
df = pd.read_csv('file.csv',
    sep=',',                    # Delimiter
    header=0,                   # Header row (None if no header)
    names=['col1', 'col2', 'date_col'],    # Custom column names
    usecols=['col1', 'col2', 'date_col'],  # Select columns (must include every column referenced below)
    dtype={'col1': str},        # Force data types
    parse_dates=['date_col'],   # Parse as datetime
    na_values=['', 'NA', 'N/A'], # Treat as NaN
    encoding='utf-8',           # Character encoding
    nrows=1000,                 # Read first N rows
    skiprows=1,                 # Skip this many lines at the start of the file
    index_col=0                 # Set index column
)

# Read in chunks (large files)
# The context manager closes the underlying file even if processing fails
with pd.read_csv('large_file.csv', chunksize=10000) as chunks:
    for chunk in chunks:
        process(chunk)
```

### Writing CSV Files
```python
import csv

# Basic write
df.to_csv('output.csv', index=False)

# With options
df.to_csv('output.csv',
    sep=',',
    header=True,
    index=False,
    columns=['col1', 'col2'],  # Select columns
    encoding='utf-8',
    date_format='%Y-%m-%d',
    float_format='%.2f',
    na_rep='',                  # String written in place of NaN / missing values
    quoting=csv.QUOTE_ALL       # Quote every field (value 1); csv.QUOTE_NONNUMERIC (2) quotes only non-numeric fields
)
```

### CSV Edge Cases
```python
import csv

# Comma in data
# Use quoting or different delimiter
df.to_csv('output.csv', quoting=csv.QUOTE_ALL)
df.to_csv('output.tsv', sep='\t')

# Special characters
df.to_csv('output.csv', encoding='utf-8-sig')  # With BOM for Excel

# Line breaks in data
# pandas >= 1.5 spells this keyword `lineterminator`; the older
# `line_terminator` spelling was removed in pandas 2.0.
df.to_csv('output.csv', quoting=csv.QUOTE_ALL, lineterminator='\n')
```

## JSON Handling

### Reading JSON
```python
import json
import pandas as pd

# Read JSON file
# JSON is UTF-8 by specification; set the encoding explicitly so the result
# does not depend on the platform's default locale encoding.
with open('file.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# JSON to DataFrame
df = pd.read_json('file.json')

# Nested JSON to DataFrame (nested keys become dot-separated column names)
df = pd.json_normalize(data)

# Deeply nested
df = pd.json_normalize(
    data,
    record_path=['items'],           # Path to records
    meta=['id', 'name'],             # Top-level fields
    meta_prefix='parent_',           # Prefix added to the meta columns
    errors='ignore'                  # Do not raise if a meta key is missing
)

# JSON Lines format (one JSON object per line)
df = pd.read_json('file.jsonl', lines=True)
```

### Writing JSON
```python
# DataFrame to JSON
df.to_json('output.json', orient='records', indent=2)

# Orient options:
# 'records': [{col1: val1, col2: val2}, ...]
# 'columns': {col1: {row1: val1, row2: val2}, ...}
# 'index': {row1: {col1: val1, col2: val2}, ...}
# 'values': [[val1, val2], ...]
# 'split': {columns: [...], index: [...], data: [[val1, val2], ...]}
# 'table': {schema: {...}, data: [...]}

# Dictionary to JSON
# Write as UTF-8 explicitly so the output does not depend on the platform locale.
# default=str is a catch-all: any value json cannot serialize (datetime,
# Decimal, ...) is written as str(value) and will not round-trip to its type.
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, indent=2, default=str)

# JSON Lines
df.to_json('output.jsonl', orient='records', lines=True)
```

### JSON Transformations
```python
# Flatten nested JSON
def flatten_json(nested_json, prefix='', sep='_'):
    """Flatten a nested JSON object into a single-level dict.

    Nested dict keys and list indices are joined with ``sep``, e.g.
    ``{"a": {"b": [10, 20]}}`` becomes ``{"a_b_0": 10, "a_b_1": 20}``.

    Args:
        nested_json: Dict (a parsed JSON object) to flatten.
        prefix: Optional string prepended to every generated key.
        sep: Separator placed between path components (default ``'_'``).

    Returns:
        A new flat dict. Lists are expanded at any depth, including lists
        nested directly inside other lists. Empty dicts and empty lists are
        kept as values so their keys are not silently lost.

    Note:
        Distinct paths can collapse onto the same flat key when a key already
        contains ``sep`` (e.g. ``{"a_b": 1, "a": {"b": 2}}``); the entry
        visited last wins.
    """
    flat = {}

    def _walk(value, path):
        # Recurse only into non-empty containers; an empty dict/list is
        # stored as a leaf so the information that the key existed survives.
        if isinstance(value, dict) and value:
            for key, child in value.items():
                _walk(child, f"{path}{sep}{key}" if path else key)
        elif isinstance(value, list) and value:
            for i, child in enumerate(value):
                _walk(child, f"{path}{sep}{i}")
        else:
            flat[path] = value

    for key, value in nested_json.items():
        _walk(value, f"{prefix}{sep}{key}" if prefix else key)
    return flat

# Extract nested field
# Check for dict explicitly: a missing cell is NaN, which is a truthy float
# and has no .get(), so a plain `if x` test would raise AttributeError.
df['nested_value'] = df['nested'].apply(
    lambda x: x.get('field') if isinstance(x, dict) else None
)
```

## Format Conversions

### CSV ↔ JSON
```python
# CSV to JSON: load the table, then write it out as an indented array of row objects
df = pd.read_csv(filepath_or_buffer='input.csv')
df.to_json(path_or_buf='output.json', indent=2, orient='records')

# JSON to CSV: load the JSON document, then write rows without the index column
df = pd.read_json(path_or_buf='input.json')
df.to_csv(path_or_buf='output.csv', index=False)
```

### Excel ↔ CSV/JSON
```python
# Excel to CSV: read one named worksheet and save it as CSV without the index
df = pd.read_excel(io='input.xlsx', sheet_name='Sheet1')
df.to_csv(path_or_buf='output.csv', index=False)

# CSV to Excel: write the table into a worksheet called 'Data'
df = pd.read_csv(filepath_or_buffer='input.csv')
df.to_excel(excel_writer='output.xlsx', sheet_name='Data', index=False)

# Multiple sheets: one writer, one to_excel call per (sheet name, DataFrame) pair
with pd.ExcelWriter('output.xlsx') as writer:
    for sheet, frame in (('Sheet1', df1), ('Sheet2', df2)):
        frame.to_excel(writer, sheet_name=sheet, index=False)
```

### XML Handling
```python
import xml.etree.ElementTree as ET

# Parse XML
# NOTE: xml.etree.ElementTree is not hardened against maliciously constructed
# documents; only parse XML from sources you trust.
tree = ET.parse('file.xml')
root = tree.getroot()


def _child_text(parent, tag):
    """Return the text of the first `tag` child of `parent`, or None if it is absent."""
    child = parent.find(tag)
    return None if child is None else child.text


# Extract data
# A missing <id>/<name> child yields None instead of raising AttributeError.
data = [
    {
        'id': _child_text(item, 'id'),
        'name': _child_text(item, 'name'),
        'value': item.get('attribute'),  # XML attribute on <item>; None if not set
    }
    for item in root.findall('.//item')
]
df = pd.DataFrame(data)

# DataFrame to XML
df.to_xml('output.xml', index=False)
```

## Data Type Handling

### Type Detection
```python
# Infer types
df = pd.read_csv('file.csv')
df = df.infer_objects()

# Check types
print(df.dtypes)

# Convert types (errors='coerce' turns unparseable values into NaN/NaT)
df['int_col'] = pd.to_numeric(df['int_col'], errors='coerce')
df['date_col'] = pd.to_datetime(df['date_col'], errors='coerce')
# read_csv may already have parsed a true/false column as bool, and the text
# may use any casing. Normalize to lower-case strings first so the lookup
# works in every case; values outside the mapping become NaN.
df['bool_col'] = df['bool_col'].astype(str).str.lower().map({'true': True, 'false': False})
```

### Common Type Conversions
```python
# String to numeric (unparseable values become NaN)
df['col'] = pd.to_numeric(df['col'], errors='coerce')

# String to datetime (raises if a value does not match the format)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

# String to boolean
# astype(str) keeps this working when the column was already parsed as
# bool/int (.str raises on non-string columns). Missing or unrecognized
# values become False.
df['bool'] = df['bool'].astype(str).str.lower().isin(['true', 'yes', '1'])

# Numeric to string (with formatting): thousands separator, two decimals
df['formatted'] = df['number'].apply(lambda x: f'{x:,.2f}')
```

## Command Line Tools

### jq for JSON
```bash
# Pretty print (jq reads the file directly; no need to pipe through cat)
jq '.' file.json

# Extract field: the name of every element of .items
jq '.items[].name' file.json

# Filter: keep only the items whose value is greater than 100
jq '.items[] | select(.value > 100)' file.json

# Transform: build a new object with the name and the number of items
jq '{name: .name, total: (.items | length)}' file.json
```

### csvkit for CSV
```bash
# View CSV as a formatted table in the terminal
csvlook file.csv

# CSV stats: summary statistics for each column
csvstat file.csv

# Query CSV with SQL (the table name `file` in the query refers to file.csv)
csvsql --query "SELECT * FROM file WHERE amount > 100" file.csv

# Convert JSON to CSV, redirecting the result into output.csv
in2csv file.json > output.csv
```

## Validation

### Schema Validation
```python
# JSON Schema validation
from jsonschema import FormatChecker, ValidationError, validate

schema = {
    "type": "object",
    "properties": {
        "id": {"type": "integer"},
        "name": {"type": "string"},
        "email": {"type": "string", "format": "email"}
    },
    "required": ["id", "name"]
}

try:
    # "format" keywords (such as "email") are not enforced unless a format
    # checker is passed; without it any string is accepted for "email".
    validate(instance=data, schema=schema, format_checker=FormatChecker())
except ValidationError as e:
    print(f"Validation failed: {e.message}")
```

Describe your data format task, and I'll help with parsing and conversion.

---
Downloaded from [Find Skill.ai](https://findskill.ai)