Spaces:
Configuration error
Configuration error
File size: 7,729 Bytes
54056c6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
'''NOTE: refunded records are not excluded yet.'''
import pandas as pd
import requests
from PIL import Image
import os
from typing import List, Dict
import json
import re
class DataProcessor:
    """Clean and annotate Taobao purchase records held in a pandas DataFrame.

    Garment type, exposure level and style are inferred by testing whether
    fixed Chinese keywords occur as substrings of the product title. Colour
    and size are parsed out of the free-text specification column.
    """

    # Keyword tables used for substring matching against product titles.
    _TOP_KEYWORDS = (
        '背心', '上衣', 'T恤', '抹胸', '吊带', '露脐',
        '短袖', '衬衫', '外套', '夹克', '卫衣',
    )
    _BOTTOM_KEYWORDS = ('短裤', '长裤', '裤子', '半身裙')
    _DRESS_KEYWORDS = ('连衣裙', '连体裤', '套装', '长裙', '吊带裙', '背带裤')
    _ACCESSORY_KEYWORDS = (
        '帽子', '项链', '耳环', '手链', '戒指', '发饰', '围巾',
        '手套', '袜子', '包', '腰带', '眼镜', '口罩', '鞋',
    )
    # Checked in this order; the first level with a matching keyword wins.
    _EXPOSURE_LEVELS = (
        ('high', ('抹胸', '露脐', '吊带')),
        ('medium', ('短袖', '背心', '短裙', '短裤')),
        ('low', ('长裙', '长裤', '毛呢')),
    )
    # Checked in this order; the first keyword found in the title wins.
    _STYLE_KEYWORDS = ('通勤', '辣妹', '运动', '学院', '复古', '法式')

    # Specification parsing. \uff1a / \uff0c are the full-width colon and
    # comma; they are spelled as escapes so that both the ASCII and the
    # full-width punctuation are matched regardless of text normalisation.
    _COLOR_FIELD_RE = re.compile(r'(?:颜色分类|主要颜色)[:\uff1a]([^:\uff1a]+)')
    _COLOR_WORD_RE = re.compile(r'([^\s,\uff0c]+色)')
    _COLOR_SPLIT_RE = re.compile(r'[,\uff0c\s\-]+')
    _SIZE_FIELD_RE = re.compile(r'尺码[:\uff1a]([^:\uff1a]+)')
    _SIZE_NOISE_RE = re.compile(r'[\[\]【】\(\)]')

    def __init__(self, data_dir: str):
        """Remember the data directory and the metadata file path inside it."""
        self.data_dir = data_dir
        self.metadata_file = os.path.join(data_dir, "metadata.json")

    @staticmethod
    def _contains_any(name, keywords) -> bool:
        """Return True if *name* is a string containing any of *keywords*.

        Non-string values (e.g. NaN for a missing title) never match; a bare
        ``kw in name`` would raise TypeError for them.
        """
        return isinstance(name, str) and any(kw in name for kw in keywords)

    def is_top(self, name: str) -> bool:
        """Return True if the title contains a top/upper-garment keyword."""
        return self._contains_any(name, self._TOP_KEYWORDS)

    def is_bottom(self, name: str) -> bool:
        """Return True if the title contains a bottom-garment keyword."""
        return self._contains_any(name, self._BOTTOM_KEYWORDS)

    def is_dress(self, name: str) -> bool:
        """Return True if the title contains a dress/jumpsuit/set keyword."""
        return self._contains_any(name, self._DRESS_KEYWORDS)

    def is_accessory(self, name: str) -> bool:
        """Return True if the title contains an accessory keyword."""
        return self._contains_any(name, self._ACCESSORY_KEYWORDS)

    def estimate_exposure(self, name: str) -> str:
        """Estimate the exposure level from title keywords.

        Returns 'high', 'medium' or 'low' for the first level (in that
        order) with a matching keyword, otherwise 'unknown'.
        """
        for level, keywords in self._EXPOSURE_LEVELS:
            if self._contains_any(name, keywords):
                return level
        return 'unknown'

    def extract_style(self, name: str, fallback: str = None) -> str:
        """Return the first style keyword found in *name*.

        The matched Chinese keyword itself is returned (not an English
        label). When nothing matches, *fallback* is returned if it is
        truthy, otherwise 'unknown'.
        """
        if isinstance(name, str):
            for kw in self._STYLE_KEYWORDS:
                if kw in name:
                    return kw
        return fallback or 'unknown'

    def extract_color_size(self, spec: str) -> tuple:
        """Extract ``(color, size)`` from a specification string.

        Either element is '' when the corresponding field is absent; a
        missing (NaN/None) specification yields ``('', '')``.
        """
        color = ''
        size = ''
        if pd.isna(spec):
            return color, size
        text = str(spec)

        color_field = self._COLOR_FIELD_RE.search(text)
        if color_field:
            color_text = color_field.group(1).strip()
            # Prefer a token ending in "色" (colour).
            color_word = self._COLOR_WORD_RE.search(color_text)
            if color_word:
                color = color_word.group(1)
            else:
                # Otherwise keep the first segment, which covers compound
                # colour names that do not end in "色" (e.g. "浆果玫红").
                color = self._COLOR_SPLIT_RE.split(color_text)[0].strip()

        size_field = self._SIZE_FIELD_RE.search(text)
        if size_field:
            # Drop bracket characters that sometimes wrap the size value.
            size = self._SIZE_NOISE_RE.sub('', size_field.group(1).strip())
        return color, size

    def _classify_type(self, title) -> str:
        """Map a title to a garment category label; the first match wins."""
        if self.is_top(title):
            return '上衣'
        if self.is_bottom(title):
            return '下装'
        if self.is_dress(title):
            return '连衣裙/裤'
        if self.is_accessory(title):
            return '配饰'
        return '未知'

    def process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean Taobao purchase records and keep only clothing rows.

        Reads the ``title``, ``image_url`` and ``specification`` columns and
        adds ``type``, ``style``, ``exposure_level``, ``color``, ``size`` and
        ``is_clothing``. A row counts as clothing when its type is
        recognised and both colour and size were extracted. Refunded orders
        are NOT filtered out yet.

        The input frame is left untouched; a new DataFrame is returned.
        """
        # Work on a copy so the caller's DataFrame is not modified.
        df = df.copy()

        # Strip the trailing "[交易快照]" marker from titles.
        df['title'] = (
            df['title'].str.replace(r'\[交易快照\]$', '', regex=True).str.strip()
        )
        # Point thumbnail URLs at the larger 640x640 rendition.
        df['image_url'] = df['image_url'].str.replace(
            r'_80x80\.jpg$', '_640x640.jpg', regex=True
        )

        # Create the annotation columns up front so they keep their position
        # ahead of color/size in the output column order.
        for column in ('type', 'style', 'exposure_level'):
            if column not in df.columns:
                df[column] = ""

        # Explicit column names keep this assignment valid for an empty frame.
        df[['color', 'size']] = pd.DataFrame(
            [self.extract_color_size(spec) for spec in df['specification']],
            index=df.index,
            columns=['color', 'size'],
        )

        titles = df['title']
        df['type'] = titles.apply(self._classify_type)
        df['exposure_level'] = titles.apply(self.estimate_exposure)
        df['style'] = titles.apply(lambda title: self.extract_style(str(title)))

        # Clothing = recognised type AND non-empty colour AND non-empty size.
        df['is_clothing'] = (
            df['type'].ne('未知') & df['color'].ne('') & df['size'].ne('')
        )

        # Refund filtering is intentionally disabled for now:
        # df = df[~df['status'].str.contains('查看退款', na=False)]

        # Drop non-clothing items.
        return df[df['is_clothing']]
# def download_images(self, image_urls: List[str], output_dir: str):
# """下载并保存图片"""
# os.makedirs(output_dir, exist_ok=True)
# for url in image_urls:
# try:
# response = requests.get(url)
# if response.status_code == 200:
# filename = os.path.join(output_dir, f"{hash(url)}.jpg")
# with open(filename, "wb") as f:
# f.write(response.content)
# except Exception as e:
# print(f"Error downloading {url}: {str(e)}")
# def save_metadata(self, metadata: Dict):
# """保存元数据"""
# with open(self.metadata_file, "w", encoding="utf-8") as f:
# json.dump(metadata, f, ensure_ascii=False, indent=2)
# def load_metadata(self) -> Dict:
# """加载元数据"""
# if os.path.exists(self.metadata_file):
# with open(self.metadata_file, "r", encoding="utf-8") as f:
# return json.load(f)
# return {}
# def process_image(self, image_path: str) -> Dict:
# """处理单张图片,提取特征"""
# try:
# with Image.open(image_path) as img:
# # 这里可以添加图像处理逻辑
# # 例如:调整大小、格式转换等
# return {
# "width": img.width,
# "height": img.height,
# "format": img.format,
# "path": image_path
# }
# except Exception as e:
# print(f"Error processing image {image_path}: {str(e)}")
# return {}
def main():
    """Load the raw Taobao purchase CSV, clean it, and write the result.

    Reads ``data/taobao_purchases.csv``, runs it through
    ``DataProcessor.process_data`` and saves the processed rows to
    ``data/processed_taobao_purchases.csv``.
    """
    cleaner = DataProcessor(data_dir="data")

    purchases = pd.read_csv("data/taobao_purchases.csv")
    cleaned = cleaner.process_data(purchases)

    # The output is written as 'utf-8-sig', i.e. UTF-8 with a leading BOM.
    output_path = "data/processed_taobao_purchases.csv"
    cleaned.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"\nProcessed data saved to: {output_path}")
if __name__ == "__main__":
    # Run the pipeline only when executed as a script, not on import.
    main()