'''暂未剔除退款的记录''' import pandas as pd import requests from PIL import Image import os from typing import List, Dict import json import re class DataProcessor: def __init__(self, data_dir: str): self.data_dir = data_dir self.metadata_file = os.path.join(data_dir, "metadata.json") def is_top(self, name: str) -> bool: """判断是否为上衣""" keywords = ['背心', '上衣', 'T恤', '抹胸', '吊带', '露脐', '短袖', '衬衫', '外套', '夹克', '卫衣'] return any(kw in name for kw in keywords) def is_bottom(self, name: str) -> bool: """判断是否为下装""" keywords = ['短裤', '长裤', '裤子', '半身裙'] return any(kw in name for kw in keywords) def is_dress(self, name: str) -> bool: """判断是否为连衣裙/连体裤""" keywords = ['连衣裙', '连体裤', '套装', '长裙', '吊带裙', '背带裤'] return any(kw in name for kw in keywords) def is_accessory(self, name: str) -> bool: """判断是否为配饰""" keywords = ['帽子', '项链', '耳环', '手链', '戒指', '发饰', '围巾', '手套', '袜子', '包', '腰带', '眼镜', '口罩', '帽子', '鞋','袜子'] return any(kw in name for kw in keywords) def estimate_exposure(self, name: str) -> str: """估算露肤度""" high = ['抹胸', '露脐', '吊带'] medium = ['短袖', '背心', '短裙', '短裤'] low = ['长裙', '长裤', '毛呢'] if any(kw in name for kw in high): return 'high' elif any(kw in name for kw in medium): return 'medium' elif any(kw in name for kw in low): return 'low' return 'unknown' def extract_style(self, name: str, fallback: str = None) -> str: """提取风格关键词""" keywords = ['通勤', '辣妹', '运动', '学院', '复古', '法式'] style_map = { '通勤': 'commuter', '辣妹': 'trendy', '运动': 'sports', '学院': 'academic', '复古': 'retro', '法式': 'french' } for kw in keywords: if kw in name: return kw #style_map[kw] return fallback if fallback else 'unknown' def extract_color_size(self, spec: str) -> tuple: """从规格中提取颜色和尺码""" color = '' size = '' if pd.isna(spec): return color, size # 尝试提取颜色 color_section = re.search(r'(?:颜色分类|主要颜色)[::]([^::]+)', str(spec)) if color_section: # 获取颜色部分的文本并清理 color_text = color_section.group(1).strip() # 先尝试匹配 xxx色 color_match = re.search(r'([^\s,,]+色)', color_text) if color_match: color = color_match.group(1) else: # 如果没找到xxx色,则保留第一段非空文本(处理类似"浆果玫红"这样的组合词) color = re.split(r'[,,\s\-]+', color_text)[0].strip() # 尝试提取尺码 size_section = re.search(r'尺码[::]([^::]+)', str(spec)) if size_section: size = size_section.group(1).strip() # 清理尺码中的特殊字符和乱码 size = re.sub(r'[\[\]【】\(\)]', '', size) return color, size def process_data(self, df: pd.DataFrame) -> pd.DataFrame: """处理淘宝购买记录数据""" # 清理标题中的[交易快照] df['title'] = df['title'].str.replace(r'\[交易快照\]$', '', regex=True).str.strip() # 清理 image_url 中的 _80x80.jpg df['image_url'] = df['image_url'].str.replace(r'_80x80\.jpg$', '_640x640.jpg', regex=True) # 添加必要的列 if "type" not in df.columns: df["type"] = "" # 服装类型 if "style" not in df.columns: df["style"] = "" # 风格 if "exposure_level" not in df.columns: df["exposure_level"] = "" # 露肤度 # 从specification提取颜色和尺码 df[['color', 'size']] = pd.DataFrame( df['specification'].apply(self.extract_color_size).tolist(), index=df.index ) # 添加新的处理逻辑 df['type'] = df['title'].apply(lambda x: '上衣' if self.is_top(x) else ('下装' if self.is_bottom(x) else ('连衣裙/裤' if self.is_dress(x) else ('配饰' if self.is_accessory(x) else '未知')))) df['exposure_level'] = df['title'].apply(self.estimate_exposure) df['style'] = df.apply(lambda row: self.extract_style(str(row['title'])), axis=1) # 判断是否是服饰(类型不为未知,且颜色和尺码都不为空) df['is_clothing'] = (df['type'].apply(lambda x: x != '未知') & df['color'].str.len().gt(0) & df['size'].str.len().gt(0)) # 剔除退款的记录 !!暂未剔除 # df = df[~df['status'].str.contains('查看退款', na=False)] # 剔除非服装类商品 df = df[df['is_clothing'] == True] return df # def download_images(self, image_urls: List[str], output_dir: str): # """下载并保存图片""" # os.makedirs(output_dir, exist_ok=True) # for url in image_urls: # try: # response = requests.get(url) # if response.status_code == 200: # filename = os.path.join(output_dir, f"{hash(url)}.jpg") # with open(filename, "wb") as f: # f.write(response.content) # except Exception as e: # print(f"Error downloading {url}: {str(e)}") # def save_metadata(self, metadata: Dict): # """保存元数据""" # with open(self.metadata_file, "w", encoding="utf-8") as f: # json.dump(metadata, f, ensure_ascii=False, indent=2) # def load_metadata(self) -> Dict: # """加载元数据""" # if os.path.exists(self.metadata_file): # with open(self.metadata_file, "r", encoding="utf-8") as f: # return json.load(f) # return {} # def process_image(self, image_path: str) -> Dict: # """处理单张图片,提取特征""" # try: # with Image.open(image_path) as img: # # 这里可以添加图像处理逻辑 # # 例如:调整大小、格式转换等 # return { # "width": img.width, # "height": img.height, # "format": img.format, # "path": image_path # } # except Exception as e: # print(f"Error processing image {image_path}: {str(e)}") # return {} def main(): # 初始化数据处理器 processor = DataProcessor(data_dir="data") # 加载淘宝购买数据 raw_data = pd.read_csv("data/taobao_purchases.csv") df = processor.process_data(raw_data) # 保存处理后的数据 output_path = "data/processed_taobao_purchases.csv" df.to_csv(output_path, index=False, encoding='utf-8-sig') print(f"\nProcessed data saved to: {output_path}") if __name__ == "__main__": main()