Python爬虫数据存储指南:JSON/CSV/Excel实战详解

JSON文件存储精要

(代码示例:豆瓣电影数据抓取与存储)

import json
import requests
from datetime import datetime

url = 'https://movie.douban.com/j/search_subjects'
params = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 50,
    'page_start': 0
}

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

response = requests.get(url, params=params, headers=headers)
data = response.json()

# 添加抓取时间戳
for item in data['subjects']:
    item['crawl_time'] = datetime.now().isoformat()

# 自定义序列化函数处理datetime对象
class ComplexEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)

with open('movies.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2, cls=ComplexEncoder)

# 验证数据完整性
with open('movies.json', 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)
    print(f"成功加载{len(loaded_data['subjects'])}条电影数据")

实战技巧:

  1. 处理中文编码时务必设置ensure_ascii=False
  2. 使用indent参数提升可读性但会增大文件体积
  3. 嵌套写入时推荐使用json.JSONEncoder处理自定义对象
  4. 增量存储可采用分块写入模式
  5. 使用jq工具进行JSON数据验证(jq . movies.json

CSV文件存储进阶

(案例:链家二手房数据存储优化)

import csv
import chardet
from collections import OrderedDict

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
    return result['encoding']

# 带BOM的UTF-8处理
def safe_csv_writer(file_obj, fieldnames):
    # 处理Windows Excel中文乱码问题
    file_obj.write('\ufeff')
    writer = csv.DictWriter(file_obj, fieldnames=fieldnames)
    return writer

# 数据清洗函数
def clean_price(price_str):
    return int(price_str.replace('万', '').replace(',', ''))

# 示例数据
houses = [
    OrderedDict([('区域', '浦东'), ('小区', '汤臣一品'), ('价格', '15000万'), ('户型', '4室2厅')]),
    OrderedDict([('区域', '静安'), ('小区', '中粮海景'), ('价格', '9800万'), ('户型', '3室1厅')])
]

with open('lianjia.csv', 'a', newline='', encoding='utf-8-sig') as f:
    writer = safe_csv_writer(f, fieldnames=houses[0].keys())
    writer.writeheader()
    for row in houses:
        cleaned_row = row.copy()
        cleaned_row['价格'] = clean_price(row['价格'])
        writer.writerow(cleaned_row)

# 数据验证
detected_enc = detect_encoding('lianjia.csv')
print(f"文件编码检测结果:{detected_enc}")

with open('lianjia.csv', 'r', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(f"{row['小区']} - {row['价格']}")

关键技术点:

  1. 使用OrderedDict保持字段顺序
  2. 处理BOM头解决Excel中文乱码
  3. 自动检测文件编码
  4. 数据清洗与类型转换
  5. 追加写入模式的应用

Excel文件处理大全

(openpyxl与pandas双方案对比)

# 方案一:openpyxl精细操作
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font, Alignment

wb = Workbook()
ws = wb.active
ws.title = "股票数据"

# 设置表头样式
header_font = Font(bold=True, color="FFFFFF")
header_fill = PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid")

headers = ['代码', '名称', '最新价', '涨跌幅']
for col_num, header in enumerate(headers, 1):
    cell = ws.cell(row=1, column=col_num, value=header)
    cell.font = header_font
    cell.fill = header_fill

# 示例数据
stocks = [
    ('600519', '贵州茅台', 1720.5, 2.18),
    ('000858', '五粮液', 158.3, 1.65),
    ('300750', '宁德时代', 410.0, -0.73)
]

for row_num, stock in enumerate(stocks, 2):
    for col_num, value in enumerate(stock, 1):
        ws.cell(row=row_num, column=col_num, value=value)

# 自动调整列宽
for col in ws.columns:
    max_length = 0
    column = col[0].column_letter
    for cell in col:
        try:
            if len(str(cell.value)) > max_length:
                max_length = len(str(cell.value))
        except:
            pass
    adjusted_width = (max_length + 2) * 1.2
    ws.column_dimensions[column].width = adjusted_width

wb.save('stocks_openpyxl.xlsx')

# 方案二:pandas快速导出
import pandas as pd
from io import BytesIO

# 创建示例DataFrame
data = {
    '城市': ['北京', '上海', '广州'],
    'GDP(万亿)': [4.03, 4.32, 2.82],
    '人口(万)': [2189, 2487, 1867]
}
df = pd.DataFrame(data)

# 使用ExcelWriter实现多sheet写入
with pd.ExcelWriter('city_data.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='基础数据', index=False)
    
    # 添加统计sheet
    stats = df.describe().reset_index()
    stats.to_excel(writer, sheet_name='统计分析', index=False)
    
    # 添加图表(需要xlsxwriter)
    workbook = writer.book
    worksheet = writer.sheets['基础数据']
    chart = workbook.add_chart({'type': 'column'})
    
    chart.add_series({
        'name': 'GDP(万亿)',
        'categories': '=基础数据!$A$2:$A$4',
        'values': '=基础数据!$B$2:$B$4',
    })
    
    worksheet.insert_chart('E2', chart)

# 追加数据到已有文件
def append_to_excel(filename, df, sheet_name):
    with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

new_data = pd.DataFrame({'城市': ['深圳'], 'GDP(万亿)': [3.24], '人口(万)': [1756]})
append_to_excel('city_data.xlsx', new_data, '补充数据')

高阶技巧:

  1. 使用样式模板统一报表格式
  2. 利用数据验证创建下拉菜单
  3. 添加条件格式突出关键数据
  4. 设置公式自动计算指标
  5. 使用冻结窗格方便浏览

存储方案选型指南

  1. 性能对比测试(万条数据):

    • JSON:写入1.2s / 读取0.8s / 文件大小3.8MB
    • CSV:写入0.6s / 读取0.4s / 文件大小2.1MB
    • Excel:写入4.5s / 读取2.3s / 文件大小6.2MB
  2. 格式特性矩阵: | 特性 | JSON | CSV | Excel | |--------------------|-------------|-------------|-------------| | 数据结构支持 | 嵌套对象 | 二维表格 | 多维复杂 | | 可读性 | 中等 | 高 | 高 | | 编程语言兼容性 | 通用 | 通用 | 需要库支持 | | 公式支持 | 无 | 无 | 有 | | 大数据集处理 | 适合 | 适合 | 有限制 | | 版本兼容性 | 无问题 | 无问题 | 注意xlsx格式|

  3. 异常处理规范:

import logging
from json.decoder import JSONDecodeError
from csv import Error as CSVError

def safe_save(data, filename):
    try:
        if filename.endswith('.json'):
            with open(filename, 'w') as f:
                json.dump(data, f, indent=2)
        elif filename.endswith('.csv'):
            with open(filename, 'w', newline='') as f:
                writer = csv.writer(f)
                writer.writerows(data)
        elif filename.endswith(('.xls', '.xlsx')):
            pd.DataFrame(data).to_excel(filename, index=False)
        else:
            raise ValueError("不支持的文件格式")
            
    except (JSONDecodeError, CSVError, PermissionError) as e:
        logging.error(f"存储失败:{str(e)}")
        raise
    finally:
        if 'f' in locals():
            f.close()

云存储集成示例

(AWS S3 + 本地缓存方案)

import boto3
from smart_open import open

def save_to_cloud(data, local_path, s3_path):
    # 本地备份
    with open(local_path, 'w') as f:
        json.dump(data, f)
    
    # 上传到S3
    s3 = boto3.resource('s3',
        aws_access_key_id='YOUR_KEY',
        aws_secret_access_key='YOUR_SECRET')
    
    with open(s3_path, 'w', transport_params={'client': s3.meta.client}) as f:
        json.dump(data, f)

# 使用示例
data = {'temperature': [22.3, 23.1, 24.5], 'humidity': [45, 43, 47]}
save_to_cloud(data, 
             local_path='weather_data.json',
             s3_path='s3://my-bucket/weather/2023-08.json')
正文到此结束
评论插件初始化中...
Loading...