File size: 7,729 Bytes
54056c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
"""Clean exported Taobao purchase records. NOTE: refunded orders are not filtered out yet."""



import pandas as pd
import requests
from PIL import Image
import os
from typing import List, Dict
import json
import re

class DataProcessor:
    """Clean and annotate exported Taobao purchase records.

    Garment type, skin-exposure level and style are inferred purely from
    Chinese keywords found in the item title; colour and size are parsed
    from the free-text ``specification`` column.
    """

    # Keyword tables are class-level constants so they are built only once.
    TOP_KEYWORDS = ('背心', '上衣', 'T恤', '抹胸', '吊带', '露脐', '短袖', '衬衫', '外套', '夹克', '卫衣')
    BOTTOM_KEYWORDS = ('短裤', '长裤', '裤子', '半身裙')
    DRESS_KEYWORDS = ('连衣裙', '连体裤', '套装', '长裙', '吊带裙', '背带裤')
    ACCESSORY_KEYWORDS = ('帽子', '项链', '耳环', '手链', '戒指', '发饰', '围巾', '手套', '袜子', '包', '腰带', '眼镜', '口罩', '鞋')

    # Exposure tiers are tested in this order: high, then medium, then low.
    HIGH_EXPOSURE_KEYWORDS = ('抹胸', '露脐', '吊带')
    MEDIUM_EXPOSURE_KEYWORDS = ('短袖', '背心', '短裙', '短裤')
    LOW_EXPOSURE_KEYWORDS = ('长裙', '长裤', '毛呢')

    # Style keywords in priority order (the first one found in a title wins).
    STYLE_KEYWORDS = ('通勤', '辣妹', '运动', '学院', '复古', '法式')
    # English labels for the style keywords. Not applied at the moment:
    # extract_style returns the Chinese keyword itself.
    STYLE_LABELS = {
        '通勤': 'commuter',
        '辣妹': 'trendy',
        '运动': 'sports',
        '学院': 'academic',
        '复古': 'retro',
        '法式': 'french',
    }

    # Specification parsing. Both ASCII and full-width punctuation are
    # accepted; the full-width characters are spelled as \uXXXX escapes so
    # they survive Unicode normalisation of this source file
    # (\uff1a = full-width colon, \uff0c = full-width comma,
    # \uff08 / \uff09 = full-width parentheses).
    _COLOR_SECTION_RE = re.compile(r'(?:颜色分类|主要颜色)[:\uff1a]([^:\uff1a]+)')
    _COLOR_WORD_RE = re.compile(r'([^\s,\uff0c]+色)')
    _COLOR_SPLIT_RE = re.compile(r'[,\uff0c\s\-]+')
    _SIZE_SECTION_RE = re.compile(r'尺码[:\uff1a]([^:\uff1a]+)')
    _SIZE_NOISE_RE = re.compile(r'[\[\]【】()\uff08\uff09]')

    def __init__(self, data_dir: str):
        """Remember the data directory and the metadata file path inside it."""
        self.data_dir = data_dir
        self.metadata_file = os.path.join(data_dir, "metadata.json")

    @staticmethod
    def _has_keyword(name, keywords) -> bool:
        """Return True when *name* is a string containing any of *keywords*.

        Missing titles reach the keyword helpers as NaN/None; treating every
        non-string as "no match" avoids a TypeError from ``kw in name``.
        """
        return isinstance(name, str) and any(kw in name for kw in keywords)

    def is_top(self, name: str) -> bool:
        """Return True if the title contains a top (upper-body) keyword."""
        return self._has_keyword(name, self.TOP_KEYWORDS)

    def is_bottom(self, name: str) -> bool:
        """Return True if the title contains a bottom (lower-body) keyword."""
        return self._has_keyword(name, self.BOTTOM_KEYWORDS)

    def is_dress(self, name: str) -> bool:
        """Return True if the title contains a dress / jumpsuit / set keyword."""
        return self._has_keyword(name, self.DRESS_KEYWORDS)

    def is_accessory(self, name: str) -> bool:
        """Return True if the title contains an accessory keyword."""
        return self._has_keyword(name, self.ACCESSORY_KEYWORDS)

    def estimate_exposure(self, name: str) -> str:
        """Estimate skin exposure from the title.

        Returns 'high', 'medium' or 'low' for the first tier whose keywords
        match (checked in that order), or 'unknown' when nothing matches or
        the title is missing.
        """
        if self._has_keyword(name, self.HIGH_EXPOSURE_KEYWORDS):
            return 'high'
        if self._has_keyword(name, self.MEDIUM_EXPOSURE_KEYWORDS):
            return 'medium'
        if self._has_keyword(name, self.LOW_EXPOSURE_KEYWORDS):
            return 'low'
        return 'unknown'

    def extract_style(self, name: str, fallback: str = None) -> str:
        """Return the first style keyword found in the title.

        The Chinese keyword itself is returned. When no keyword matches (or
        the title is not a string) the result is *fallback* if it is truthy,
        otherwise 'unknown'.
        """
        if isinstance(name, str):
            for kw in self.STYLE_KEYWORDS:
                if kw in name:
                    return kw
        return fallback if fallback else 'unknown'

    def extract_color_size(self, spec: str) -> tuple:
        """Parse ``(color, size)`` out of a specification string.

        Either element is '' when the corresponding section is absent; a
        missing (NaN/None) specification yields ``('', '')``.
        """
        color = ''
        size = ''

        if pd.isna(spec):
            return color, size

        text = str(spec)

        color_section = self._COLOR_SECTION_RE.search(text)
        if color_section:
            color_text = color_section.group(1).strip()
            # Prefer a token ending in "色"; otherwise keep the first chunk so
            # compound names without that suffix (e.g. "浆果玫红") survive.
            color_word = self._COLOR_WORD_RE.search(color_text)
            if color_word:
                color = color_word.group(1)
            else:
                color = self._COLOR_SPLIT_RE.split(color_text)[0].strip()

        size_section = self._SIZE_SECTION_RE.search(text)
        if size_section:
            # Drop bracket characters that wrap or pollute the size label.
            size = self._SIZE_NOISE_RE.sub('', size_section.group(1).strip())

        return color, size

    def _classify_type(self, name) -> str:
        """Map a title to its garment-type label.

        Dress keywords are tested first: several of them contain a top
        keyword as a substring ('吊带裙' contains '吊带'), so testing tops
        first would make those dress keywords unreachable.
        """
        if self.is_dress(name):
            return '连衣裙/裤'
        if self.is_top(name):
            return '上衣'
        if self.is_bottom(name):
            return '下装'
        if self.is_accessory(name):
            return '配饰'
        return '未知'

    def process_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Clean Taobao purchase records and keep only clothing rows.

        Reads the ``title``, ``image_url`` and ``specification`` columns and
        adds ``type``, ``style``, ``exposure_level``, ``color``, ``size`` and
        ``is_clothing``. A row counts as clothing when its type is recognised
        and both colour and size were parsed.

        The input frame is not modified; a new, independent frame is returned.
        """
        # Work on a copy so the caller's frame is left untouched.
        df = df.copy()

        # Strip the trailing "[交易快照]" marker from titles.
        df['title'] = df['title'].str.replace(r'\[交易快照\]$', '', regex=True).str.strip()

        # Swap the 80x80 thumbnail suffix for the 640x640 variant.
        df['image_url'] = df['image_url'].str.replace(r'_80x80\.jpg$', '_640x640.jpg', regex=True)

        # Create the annotation columns up front when absent; they are filled
        # below, and creating them here keeps them ahead of color/size.
        if "type" not in df.columns:
            df["type"] = ""
        if "style" not in df.columns:
            df["style"] = ""
        if "exposure_level" not in df.columns:
            df["exposure_level"] = ""

        # Column-by-column assignment also works for an empty frame, where a
        # two-column assignment from a zero-column helper frame would fail.
        parsed = [self.extract_color_size(spec) for spec in df['specification']]
        df['color'] = pd.Series([c for c, _ in parsed], index=df.index, dtype=object)
        df['size'] = pd.Series([s for _, s in parsed], index=df.index, dtype=object)

        df['type'] = df['title'].map(self._classify_type)
        df['exposure_level'] = df['title'].map(self.estimate_exposure)
        df['style'] = df['title'].map(self.extract_style)

        df['is_clothing'] = (
            df['type'].ne('未知')
            & df['color'].ne('')
            & df['size'].ne('')
        )

        # TODO: refunded orders are NOT removed yet. Candidate filter:
        # df = df[~df['status'].str.contains('查看退款', na=False)]

        # Drop non-clothing rows; copy so the result is not a view of df.
        return df[df['is_clothing']].copy()
    
    # def download_images(self, image_urls: List[str], output_dir: str):
    #     """下载并保存图片"""
    #     os.makedirs(output_dir, exist_ok=True)
        
    #     for url in image_urls:
    #         try:
    #             response = requests.get(url)
    #             if response.status_code == 200:
    #                 filename = os.path.join(output_dir, f"{hash(url)}.jpg")
    #                 with open(filename, "wb") as f:
    #                     f.write(response.content)
    #         except Exception as e:
    #             print(f"Error downloading {url}: {str(e)}")
    
    # def save_metadata(self, metadata: Dict):
    #     """保存元数据"""
    #     with open(self.metadata_file, "w", encoding="utf-8") as f:
    #         json.dump(metadata, f, ensure_ascii=False, indent=2)
    
    # def load_metadata(self) -> Dict:
    #     """加载元数据"""
    #     if os.path.exists(self.metadata_file):
    #         with open(self.metadata_file, "r", encoding="utf-8") as f:
    #             return json.load(f)
    #     return {}
    
    # def process_image(self, image_path: str) -> Dict:
    #     """处理单张图片,提取特征"""
    #     try:
    #         with Image.open(image_path) as img:
    #             # 这里可以添加图像处理逻辑
    #             # 例如:调整大小、格式转换等
    #             return {
    #                 "width": img.width,
    #                 "height": img.height,
    #                 "format": img.format,
    #                 "path": image_path
    #             }
    #     except Exception as e:
    #         print(f"Error processing image {image_path}: {str(e)}")
    #         return {} 


def main():
    """Run the cleaning pipeline on the Taobao purchase export.

    Reads ``data/taobao_purchases.csv``, passes it through
    ``DataProcessor.process_data`` and writes the result to
    ``data/processed_taobao_purchases.csv``.
    """
    source_csv = "data/taobao_purchases.csv"
    output_path = "data/processed_taobao_purchases.csv"

    cleaned = DataProcessor(data_dir="data").process_data(pd.read_csv(source_csv))

    # utf-8-sig prefixes the file with a BOM - presumably so spreadsheet
    # tools detect the encoding of the Chinese text; confirm before changing.
    cleaned.to_csv(output_path, index=False, encoding='utf-8-sig')
    print(f"\nProcessed data saved to: {output_path}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()