goofish-api-huggingface / data_parser.py
luoluoluo22's picture
添加调试信息并更新文档,强调必须设置GOOFISH_COOKIES环境变量
77d5d95
import json
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field
class ItemLocation(BaseModel):
    """Pydantic model that carries a single location string."""

    # Area text; declared without a default.
    area: str
class ItemPrice(BaseModel):
    """Pydantic model that carries a single numeric price."""

    # Price value; declared without a default.
    price: float
class ItemDetail(BaseModel):
    """One parsed search-result listing, as produced by ``parse_search_result``."""

    # Listing title.
    title: str
    # Price as a float.
    price: float
    # Item identifier; also used to build ``detail_url``.
    item_id: str
    # Area text attached to the listing.
    area: str
    # Seller's nickname.
    seller_nick: str
    # Publish time kept as text; may be None.
    publish_time: Optional[str]
    # Picture URLs for the listing.
    pics: List[str]
    # "Want" counter; defaults to 0.
    want_count: int = 0
    # URL of the item detail page; defaults to an empty string.
    detail_url: str = ""
def safe_int(value: Any, default: int = 0) -> int:
    """Convert *value* to ``int`` without raising.

    Args:
        value: Usually a string taken from an API payload, but numbers and
            ``None`` are accepted as well.
        default: Returned when *value* is missing (``None`` or ``""``) or
            cannot be converted.

    Returns:
        ``int(value)`` on success, otherwise *default*.
    """
    try:
        # Only None and the empty string mean "missing". A genuine 0 is a
        # valid value and must not be replaced by a non-zero default.
        if value is None or value == "":
            return default
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float("inf")), which int() raises instead
        # of ValueError.
        return default
def safe_float(value: Any, default: float = 0.0) -> float:
    """Convert *value* to ``float`` without raising.

    Args:
        value: Usually a string taken from an API payload, but numbers and
            ``None`` are accepted as well.
        default: Returned when *value* is missing (``None`` or ``""``) or
            cannot be converted.

    Returns:
        ``float(value)`` on success, otherwise *default*.
    """
    try:
        # Only None and the empty string mean "missing". A genuine 0 is a
        # valid value and must not be replaced by a non-zero default.
        if value is None or value == "":
            return default
        return float(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers integers too large for a float, e.g. 10**400.
        return default
def _extract_result_list(raw_data: Dict) -> Optional[List]:
    """Validate the response envelope and return ``data.resultList``.

    Prints the same debugging output as before at every step. Returns
    ``None`` whenever the envelope is malformed so the caller can bail out.
    """
    # Print the top-level structure of the raw payload.
    print(f"原始数据类型: {type(raw_data)}")
    if not isinstance(raw_data, dict):
        print(f"原始数据不是字典: {raw_data}")
        return None

    print(f"原始数据顶级键: {list(raw_data.keys())}")
    # Status reported by the API, if any.
    if 'ret' in raw_data:
        print(f"API返回状态: {raw_data['ret']}")
    # Error message reported by the API, if any.
    if isinstance(raw_data.get('data'), dict) and 'msg' in raw_data['data']:
        print(f"API返回消息: {raw_data['data']['msg']}")

    if 'data' not in raw_data:
        print("无效的数据格式:缺少 'data' 字段")
        return None

    data = raw_data['data']
    print(f"data字段类型: {type(data)}")
    if not isinstance(data, dict):
        print(f"data字段不是字典: {data}")
        return None
    print(f"data字段键: {list(data.keys())}")

    if 'resultList' not in data:
        print("无效的数据格式:缺少 'resultList' 字段")
        print(f"可用的字段: {list(data.keys())}")
        # A redirect URL in place of results is reported as an expired or
        # invalid cookie.
        if 'url' in data:
            print(f"发现重定向URL: {data['url']}")
            print("这表明cookie已过期或无效,需要重新登录获取新cookie")
        return None

    result_list = data['resultList']
    # The key can be present but null or of the wrong type; len() and
    # enumerate() on such a value would raise outside the per-item try.
    if not isinstance(result_list, list):
        print(f"无效的数据格式:'resultList' 不是列表: {type(result_list)}")
        return None
    return result_list


def _extract_ex_content(idx: int, item_data: Dict) -> Optional[Dict]:
    """Walk ``data -> item -> main -> exContent`` of one result entry.

    Prints which key is missing and returns ``None`` when the path is
    incomplete or one of the intermediate nodes is not a dict.
    """
    node = item_data
    for key in ('data', 'item', 'main', 'exContent'):
        if not isinstance(node, dict) or key not in node:
            print(f"商品 {idx} 缺少 '{key}' 字段")
            return None
        node = node[key]
    return node


def _build_item_detail(ex_content: Dict) -> ItemDetail:
    """Build an ``ItemDetail`` from one ``exContent`` dict."""
    # detailParams may be absent or explicitly null.
    detail_params = ex_content.get('detailParams') or {}

    # Price.
    price = safe_float(detail_params.get('soldPrice', 0))

    # Item id, normalised to text so a numeric id is not rejected by the model.
    raw_id = ex_content.get('itemId')
    item_id = '' if raw_id is None else str(raw_id)

    # Detail page URL.
    detail_url = f"https://www.goofish.com/item?id={item_id}" if item_id else ""

    # A null publish time must not turn into the literal string "None".
    raw_publish_time = detail_params.get('publishTime', '')
    publish_time = '' if raw_publish_time is None else str(raw_publish_time)

    pic_url = ex_content.get('picUrl')

    return ItemDetail(
        # "or ''" keeps an item whose text field is null instead of letting
        # model validation drop it.
        title=ex_content.get('title') or '',
        price=price,
        item_id=item_id,
        area=ex_content.get('area') or '',
        seller_nick=ex_content.get('userNickName') or '',
        publish_time=publish_time,
        pics=[pic_url] if pic_url else [],
        want_count=safe_int(ex_content.get('want', '0')),
        detail_url=detail_url,
    )


def parse_search_result(raw_data: Dict) -> List[ItemDetail]:
    """Parse a raw search response into a list of ``ItemDetail`` objects.

    The payload is read as ``raw_data['data']['resultList']``, where each
    entry holds its fields under ``data.item.main.exContent``. Parsing is
    best-effort: a malformed envelope yields an empty list, and a malformed
    entry is reported and skipped without aborting the rest. Progress and
    diagnostics are written to stdout.

    Args:
        raw_data: Decoded JSON response from the search API.

    Returns:
        The successfully parsed items, in response order (possibly empty).
    """
    print("开始解析数据...")

    result_list = _extract_result_list(raw_data)
    if result_list is None:
        return []

    print(f"找到 {len(result_list)} 个商品")
    items = []
    for idx, item_data in enumerate(result_list):
        # Deliberately broad: one bad entry must not discard the whole page.
        try:
            ex_content = _extract_ex_content(idx, item_data)
            if ex_content is None:
                continue
            item_detail = _build_item_detail(ex_content)
            items.append(item_detail)
            print(f"成功解析商品 {idx}: {item_detail.title[:30]}...")
        except Exception as e:
            print(f"解析商品 {idx} 时出错: {str(e)}")
            continue

    print(f"成功解析 {len(items)} 个商品")
    return items