Python Crawler Data Storage Guide: JSON/CSV/Excel in Practice
JSON File Storage Essentials
(Code example: scraping and storing Douban movie data)
import json
import requests
from datetime import datetime

url = 'https://movie.douban.com/j/search_subjects'
params = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 50,
    'page_start': 0
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
response = requests.get(url, params=params, headers=headers)
data = response.json()

# Add a crawl timestamp to every record
for item in data['subjects']:
    item['crawl_time'] = datetime.now().isoformat()

# Custom encoder so any remaining datetime objects serialize as ISO strings
class ComplexEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        return super().default(obj)

with open('movies.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2, cls=ComplexEncoder)

# Verify data integrity by reloading the file
with open('movies.json', 'r', encoding='utf-8') as f:
    loaded_data = json.load(f)
print(f"Loaded {len(loaded_data['subjects'])} movie records")
Practical tips:
- Always set ensure_ascii=False when storing Chinese text
- The indent parameter improves readability but increases file size
- Subclass json.JSONEncoder to handle custom objects in nested structures
- For incremental storage, write records in chunks (see the JSON Lines sketch below)
- Validate the JSON output with the jq tool: jq . movies.json
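The chunked, incremental pattern mentioned above can be as simple as JSON Lines: each record is appended as one line, so the crawler never re-serializes the whole dataset. A minimal sketch; the records.jsonl file name and the helper names are illustrative, not part of the original example.

import json

# Hypothetical helper: append records one per line (JSON Lines) so a crawler
# can flush results incrementally without rewriting the whole file
def append_records(records, path='records.jsonl'):
    with open(path, 'a', encoding='utf-8') as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

# Reading back: parse the file line by line instead of loading it all at once
def iter_records(path='records.jsonl'):
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            yield json.loads(line)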
Advanced CSV Storage
(Case study: optimizing storage of Lianjia second-hand housing data)
import csv
import chardet
from collections import OrderedDict

def detect_encoding(file_path):
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read())
        return result['encoding']
# Handle the UTF-8 BOM so Windows Excel displays Chinese text correctly
def safe_csv_writer(file_obj, fieldnames):
    # Write the BOM only when the file is still empty, so repeated
    # append runs do not insert duplicate BOMs
    if file_obj.tell() == 0:
        file_obj.write('\ufeff')
    writer = csv.DictWriter(file_obj, fieldnames=fieldnames)
    return writer
# Data cleaning: strip the '万' unit and thousands separators, return an integer
def clean_price(price_str):
    return int(price_str.replace('万', '').replace(',', ''))

# Sample data
houses = [
    OrderedDict([('区域', '浦东'), ('小区', '汤臣一品'), ('价格', '15000万'), ('户型', '4室2厅')]),
    OrderedDict([('区域', '静安'), ('小区', '中粮海景'), ('价格', '9800万'), ('户型', '3室1厅')])
]
# Append mode: rows accumulate across runs; the BOM and header are written
# only when the file is new (plain utf-8 here, since the BOM is added manually)
with open('lianjia.csv', 'a', newline='', encoding='utf-8') as f:
    write_header = f.tell() == 0
    writer = safe_csv_writer(f, fieldnames=houses[0].keys())
    if write_header:
        writer.writeheader()
    for row in houses:
        cleaned_row = row.copy()
        cleaned_row['价格'] = clean_price(row['价格'])
        writer.writerow(cleaned_row)
# Verify the result
detected_enc = detect_encoding('lianjia.csv')
print(f"Detected file encoding: {detected_enc}")
with open('lianjia.csv', 'r', encoding='utf-8-sig') as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(f"{row['小区']} - {row['价格']}")
Key techniques:
- OrderedDict keeps the field order stable (plain dicts also preserve insertion order on Python 3.7+)
- Writing a BOM prevents garbled Chinese when the CSV is opened in Excel
- Automatic file-encoding detection with chardet
- Data cleaning and type conversion before writing
- Append mode for accumulating rows across crawl runs (see the sketch below)
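A small follow-up sketch for the append-mode point, assuming the lianjia.csv produced above already exists: a later crawl run simply appends a second batch, and the BOM/header guards keep them from being duplicated. The extra listing ('示例小区') is made-up sample data.

# Hypothetical second batch appended on a later crawl run
more_houses = [
    OrderedDict([('区域', '徐汇'), ('小区', '示例小区'), ('价格', '2,300万'), ('户型', '2室2厅')])
]

with open('lianjia.csv', 'a', newline='', encoding='utf-8') as f:
    write_header = f.tell() == 0          # False here: the file already has a header
    writer = safe_csv_writer(f, fieldnames=more_houses[0].keys())
    if write_header:
        writer.writeheader()
    for row in more_houses:
        cleaned_row = row.copy()
        cleaned_row['价格'] = clean_price(row['价格'])
        writer.writerow(cleaned_row)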
Excel File Handling in Depth
(openpyxl vs. pandas: two approaches compared)
# Approach 1: fine-grained control with openpyxl
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font, PatternFill, Alignment

wb = Workbook()
ws = wb.active
ws.title = "股票数据"

# Header row styling
header_font = Font(bold=True, color="FFFFFF")
header_fill = PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid")
headers = ['代码', '名称', '最新价', '涨跌幅']
for col_num, header in enumerate(headers, 1):
    cell = ws.cell(row=1, column=col_num, value=header)
    cell.font = header_font
    cell.fill = header_fill
# Sample data
stocks = [
    ('600519', '贵州茅台', 1720.5, 2.18),
    ('000858', '五粮液', 158.3, 1.65),
    ('300750', '宁德时代', 410.0, -0.73)
]
for row_num, stock in enumerate(stocks, 2):
    for col_num, value in enumerate(stock, 1):
        ws.cell(row=row_num, column=col_num, value=value)
# Auto-fit each column width to its longest cell value
for col in ws.columns:
    max_length = 0
    column = get_column_letter(col[0].column)
    for cell in col:
        if cell.value is not None:
            max_length = max(max_length, len(str(cell.value)))
    ws.column_dimensions[column].width = (max_length + 2) * 1.2

wb.save('stocks_openpyxl.xlsx')
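A quick sanity check (a small sketch, not required by the flow above): reload the workbook just saved and print the data rows.

from openpyxl import load_workbook

# Reload the file written above and iterate over the data rows
wb_check = load_workbook('stocks_openpyxl.xlsx')
ws_check = wb_check['股票数据']
for row in ws_check.iter_rows(min_row=2, values_only=True):
    print(row)   # e.g. ('600519', '贵州茅台', 1720.5, 2.18)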
# Approach 2: quick export with pandas
import pandas as pd

# Build a sample DataFrame
data = {
    '城市': ['北京', '上海', '广州'],
    'GDP(万亿)': [4.03, 4.32, 2.82],
    '人口(万)': [2189, 2487, 1867]
}
df = pd.DataFrame(data)

# Write multiple sheets through a single ExcelWriter
with pd.ExcelWriter('city_data.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='基础数据', index=False)
    # Add a statistics sheet
    stats = df.describe().reset_index()
    stats.to_excel(writer, sheet_name='统计分析', index=False)
    # Add a chart (requires the xlsxwriter engine)
    workbook = writer.book
    worksheet = writer.sheets['基础数据']
    chart = workbook.add_chart({'type': 'column'})
    chart.add_series({
        'name': 'GDP(万亿)',
        'categories': '=基础数据!$A$2:$A$4',
        'values': '=基础数据!$B$2:$B$4',
    })
    worksheet.insert_chart('E2', chart)
# Append a new sheet to an existing workbook (requires the openpyxl engine)
def append_to_excel(filename, df, sheet_name):
    with pd.ExcelWriter(filename, engine='openpyxl', mode='a') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)

new_data = pd.DataFrame({'城市': ['深圳'], 'GDP(万亿)': [3.24], '人口(万)': [1756]})
append_to_excel('city_data.xlsx', new_data, '补充数据')
Advanced techniques (illustrated in the sketch after this list):
- Use style templates to keep report formatting consistent
- Create dropdown menus with data validation
- Highlight key figures with conditional formatting
- Use formulas so metrics are computed automatically
- Freeze panes to keep headers visible while scrolling
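A minimal openpyxl sketch covering the last four points, applied to the stocks_openpyxl.xlsx workbook built earlier; the cell ranges, the dropdown values, and the red fill color are illustrative assumptions.

from openpyxl import load_workbook
from openpyxl.styles import PatternFill
from openpyxl.formatting.rule import CellIsRule
from openpyxl.worksheet.datavalidation import DataValidation

wb = load_workbook('stocks_openpyxl.xlsx')
ws = wb['股票数据']

# Freeze the header row so it stays visible while scrolling
ws.freeze_panes = 'A2'

# Conditional formatting: mark negative 涨跌幅 values with a red fill
red_fill = PatternFill(start_color='FFC7CE', end_color='FFC7CE', fill_type='solid')
ws.conditional_formatting.add(
    'D2:D4', CellIsRule(operator='lessThan', formula=['0'], fill=red_fill))

# Data validation: a dropdown of watchlist tags in column E (illustrative values)
dv = DataValidation(type='list', formula1='"自选,关注,忽略"', allow_blank=True)
ws.add_data_validation(dv)
dv.add('E2:E4')

# Formula: let Excel compute the average 涨跌幅 itself
ws['D6'] = '=AVERAGE(D2:D4)'

wb.save('stocks_openpyxl.xlsx')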
Storage Format Selection Guide

Performance comparison (10,000 records):
- JSON: write 1.2s / read 0.8s / file size 3.8MB
- CSV: write 0.6s / read 0.4s / file size 2.1MB
- Excel: write 4.5s / read 2.3s / file size 6.2MB

Format feature matrix:

| Feature | JSON | CSV | Excel |
|------------------------|----------------|----------------|------------------------|
| Data structure support | Nested objects | Flat 2-D table | Multi-dimensional, complex |
| Readability | Medium | High | High |
| Language compatibility | Universal | Universal | Requires a library |
| Formula support | No | No | Yes |
| Large datasets | Suitable | Suitable | Limited |
| Version compatibility | No issues | No issues | Mind the xls/xlsx difference |

Exception handling guidelines:
import json
import csv
import logging
import pandas as pd
from csv import Error as CSVError

def safe_save(data, filename):
    try:
        if filename.endswith('.json'):
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
        elif filename.endswith('.csv'):
            with open(filename, 'w', newline='', encoding='utf-8-sig') as f:
                writer = csv.writer(f)
                writer.writerows(data)
        elif filename.endswith(('.xls', '.xlsx')):
            pd.DataFrame(data).to_excel(filename, index=False)
        else:
            raise ValueError("Unsupported file format")
    except (TypeError, CSVError, PermissionError, OSError) as e:
        # TypeError covers objects json cannot serialize; the with-blocks
        # close the files themselves, so no extra cleanup is needed
        logging.error(f"Save failed: {e}")
        raise
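A brief usage sketch for safe_save; the backup file names are illustrative, and the JSON call assumes the loaded_data dict from the JSON section is still in scope.

logging.basicConfig(level=logging.INFO)

# JSON: dicts and lists are dumped as-is
safe_save(loaded_data, 'movies_backup.json')

# CSV: csv.writer expects row sequences, so pass a header row plus value rows
rows = [['代码', '名称'], ['600519', '贵州茅台'], ['000858', '五粮液']]
safe_save(rows, 'stocks_backup.csv')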
Cloud Storage Integration Example
(AWS S3 with a local cache)
import json
import boto3
from smart_open import open  # smart_open's open() handles both local paths and s3:// URLs

def save_to_cloud(data, local_path, s3_path):
    # Local backup copy
    with open(local_path, 'w') as f:
        json.dump(data, f)
    # Upload to S3
    s3 = boto3.resource('s3',
                        aws_access_key_id='YOUR_KEY',
                        aws_secret_access_key='YOUR_SECRET')
    with open(s3_path, 'w', transport_params={'client': s3.meta.client}) as f:
        json.dump(data, f)

# Usage example
data = {'temperature': [22.3, 23.1, 24.5], 'humidity': [45, 43, 47]}
save_to_cloud(data,
              local_path='weather_data.json',
              s3_path='s3://my-bucket/weather/2023-08.json')
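Reading the object back uses the same smart_open interface; a short sketch reusing the bucket path and credential placeholders from above.

def load_from_cloud(s3_path, client):
    # Stream the S3 object and parse it as JSON
    with open(s3_path, 'r', transport_params={'client': client}) as f:
        return json.load(f)

s3 = boto3.resource('s3',
                    aws_access_key_id='YOUR_KEY',
                    aws_secret_access_key='YOUR_SECRET')
restored = load_from_cloud('s3://my-bucket/weather/2023-08.json', s3.meta.client)
print(restored['temperature'])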