File size: 9,983 Bytes
1fccc5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
"""

📊 VALIDATION DATA CREATOR

===========================

Helper script to create validation CSV for confidence calibration.



Three modes:

1. Sample from existing categories (automated)

2. Manual entry (interactive)

3. Verify an existing validation CSV against the category list



Output format:

    product_title,true_category_id

    "Oxygen Sensor Tool",12345

    "Hydraulic Oil Additive",67890



Usage:

    # Automated sampling:

    python create_validation_data.py auto data/category_id_path_only.csv

    

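    # Manual entry:

    python create_validation_data.py manual

    # Verify a validation file against the category list:

    python create_validation_data.py verify data/validation.csv data/category_id_path_only.csv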

"""

import pandas as pd
import sys
from pathlib import Path
import random


def _format_category_id(value):
    """Return a category ID as text, writing float IDs such as 123.0 as "123"."""
    # pandas turns an integer column into floats as soon as one cell is
    # missing; undo that so the written ID matches the one in the source CSV.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def _make_product_title(category_path, rng):
    """Build a synthetic product title from the last (up to 3) path levels."""
    # Strip each level and drop empty ones so "A / B" or a leading "/" does
    # not leave stray spaces in the title.
    levels = [level.strip() for level in category_path.split('/')]
    levels = [level for level in levels if level]
    title = ' '.join(levels[-3:]) or category_path.strip()

    variations = (
        title,
        f"{title} kit",
        f"{title} tool",
        f"{title} set",
        f"professional {title}",
        f"{title} replacement",
    )
    return rng.choice(variations)


def sample_from_categories(csv_path, num_samples=100, output_file='data/validation.csv', seed=42):
    """Create a validation CSV by sampling categories and synthesising titles.

    The first two columns of the input CSV are treated as ``category_id`` and
    ``category_path``. Up to ``num_samples`` rows are sampled, and for each one
    a product title is derived from the last (up to three) ``/``-separated
    levels of the path, optionally decorated with a word such as "kit".

    Args:
        csv_path: Path to the categories CSV (needs at least two columns).
        num_samples: Maximum number of validation rows to generate.
        output_file: Destination CSV; missing parent directories are created.
        seed: Seed used for both the row sampling and the title variation, so
            the same input always produces the same output file.

    Returns:
        True when the file was written; False when ``num_samples`` is not
        positive, the CSV has fewer than two columns, or no row has both an
        ID and a path.
    """
    print("\n" + "="*80)
    print("📊 AUTO-GENERATING VALIDATION DATA")
    print("="*80 + "\n")

    if num_samples < 1:
        print("❌ num_samples must be at least 1")
        return False

    # Load categories
    print(f"Loading: {csv_path}")
    df = pd.read_csv(csv_path)

    if len(df.columns) < 2:
        print("❌ CSV must have at least 2 columns (category_id, category_path)")
        return False

    df.columns = ['category_id', 'category_path'] + list(df.columns[2:])
    # A row without an ID cannot be a ground-truth label, so drop it as well.
    df = df.dropna(subset=['category_id', 'category_path'])

    if df.empty:
        print("❌ No usable categories found in CSV")
        return False

    print(f"✅ Loaded {len(df):,} categories\n")

    # Sample categories
    sample_size = min(num_samples, len(df))
    sampled = df.sample(n=sample_size, random_state=seed)

    print(f"📝 Generating {sample_size} validation entries...\n")

    # A private generator keeps the output reproducible and leaves the global
    # ``random`` state untouched.
    rng = random.Random(seed)
    validation_data = [
        {
            'product_title': _make_product_title(str(cat_path), rng),
            'true_category_id': _format_category_id(cat_id),
        }
        for cat_id, cat_path in zip(sampled['category_id'], sampled['category_path'])
    ]

    val_df = pd.DataFrame(validation_data, columns=['product_title', 'true_category_id'])

    # Save
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    val_df.to_csv(output_path, index=False)

    print(f"✅ Created validation file: {output_path}")
    print(f"   Entries: {len(val_df):,}")

    # Show samples
    print("\n📝 Sample entries:")
    preview = val_df.head(5)
    for number, (title, cat_id) in enumerate(
            zip(preview['product_title'], preview['true_category_id']), start=1):
        print(f"   {number}. \"{title}\" → {cat_id}")

    print("\n" + "="*80)
    print("✅ VALIDATION DATA CREATED!")
    print("="*80)
    print("\nNext step: Train with calibration")
    print(f"   python train_fixed_v2.py data/category_id_path_only.csv data/tags.json {output_path}")
    print("="*80 + "\n")

    return True


def manual_entry(output_file='data/validation_manual.csv'):
    """Interactively collect (product title, category ID) pairs into a CSV.

    Prompts repeatedly for a title and a category ID until the user presses
    CTRL+C or standard input ends, then writes everything collected so far to
    ``output_file`` with the columns ``product_title`` and
    ``true_category_id``.

    Args:
        output_file: Destination CSV; missing parent directories are created.

    Returns:
        True when at least one entry was written, False when nothing was
        entered.
    """
    print("\n" + "="*80)
    print("📝 MANUAL VALIDATION DATA ENTRY")
    print("="*80)
    print("\nEnter product titles and their correct category IDs.")
    print("Press CTRL+C (or send end-of-input) when done.\n")

    validation_data = []

    try:
        while True:
            print(f"\n--- Entry #{len(validation_data) + 1} ---")

            title = input("Product title: ").strip()
            if not title:
                print("⚠️  Title cannot be empty")
                continue

            # Re-ask only for the ID so a typo does not throw away the title.
            cat_id = input("Category ID: ").strip()
            while not cat_id:
                print("⚠️  Category ID cannot be empty")
                cat_id = input("Category ID: ").strip()

            validation_data.append({
                'product_title': title,
                'true_category_id': cat_id
            })

            print(f"✅ Added: \"{title}\" → {cat_id}")

    except (KeyboardInterrupt, EOFError):
        # EOFError covers CTRL+D / CTRL+Z and piped input running out; both
        # signals mean "done", and a half-typed entry is simply discarded.
        print("\n\n📊 Entry complete!")

    if not validation_data:
        print("❌ No entries created")
        return False

    val_df = pd.DataFrame(validation_data, columns=['product_title', 'true_category_id'])

    # Save
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    val_df.to_csv(output_path, index=False)

    print(f"\n✅ Created validation file: {output_path}")
    print(f"   Entries: {len(val_df):,}")

    print("\n" + "="*80)
    print("✅ VALIDATION DATA CREATED!")
    print("="*80)
    print("\nNext step: Train with calibration")
    print(f"   python train_fixed_v2.py data/category_id_path_only.csv data/tags.json {output_path}")
    print("="*80 + "\n")

    return True


def _normalize_category_id(value):
    """Return a category ID as comparable text (float 123.0 and int 123 -> "123")."""
    # pandas reads an integer column as floats when any cell is missing, so
    # the same ID can arrive as 123 in one file and 123.0 in the other.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


def verify_validation_file(validation_csv, categories_csv):
    """Check that every validation row references a known category ID.

    The validation CSV must contain the columns ``product_title`` and
    ``true_category_id``. The first column of the categories CSV is taken as
    the list of valid IDs. Each offending row is printed, followed by a
    summary.

    Args:
        validation_csv: Path to the validation CSV to check.
        categories_csv: Path to the categories CSV supplying valid IDs.

    Returns:
        True when every row has an ID present in the categories CSV; False
        when a required column is missing or any row has a missing or
        unknown ID.
    """
    print("\n" + "="*80)
    print("🔍 VERIFYING VALIDATION DATA")
    print("="*80 + "\n")

    # Load validation data
    print(f"Loading validation: {validation_csv}")
    val_df = pd.read_csv(validation_csv)

    if not {'product_title', 'true_category_id'}.issubset(val_df.columns):
        print("❌ Validation CSV must have: product_title, true_category_id")
        return False

    print(f"✅ Loaded {len(val_df):,} validation entries\n")

    # Load categories; only the first column (the ID) is needed, so a
    # single-column file is accepted too.
    print(f"Loading categories: {categories_csv}")
    cat_df = pd.read_csv(categories_csv)

    # Drop missing IDs so "nan" never counts as a valid category.
    valid_ids = {_normalize_category_id(raw) for raw in cat_df.iloc[:, 0].dropna()}
    print(f"✅ Loaded {len(valid_ids):,} valid category IDs\n")

    # Verify
    print("Checking validation entries...")
    invalid_count = 0

    for title, raw_id in zip(val_df['product_title'], val_df['true_category_id']):
        if pd.isna(raw_id):
            print(f"❌ Missing category ID for \"{title}\"")
            invalid_count += 1
            continue

        cat_id = _normalize_category_id(raw_id)
        if cat_id not in valid_ids:
            print(f"❌ Invalid ID: {cat_id} for \"{title}\"")
            invalid_count += 1

    if invalid_count == 0:
        print("✅ All validation entries are valid!")
    else:
        print(f"\n⚠️  Found {invalid_count} invalid entries")

    # Summary
    print("\n" + "="*80)
    print("📊 VALIDATION DATA SUMMARY")
    print("="*80)
    print(f"Total entries: {len(val_df):,}")
    print(f"Valid entries: {len(val_df) - invalid_count:,}")
    print(f"Invalid entries: {invalid_count}")
    print("="*80 + "\n")

    return invalid_count == 0


def _print_usage():
    """Print the command-line help shown when no mode is given."""
    print("Usage:")
    print("  python create_validation_data.py auto <csv_path> [num_samples] [output_file]")
    print("  python create_validation_data.py manual [output_file]")
    print("  python create_validation_data.py verify <validation_csv> <categories_csv>")
    print("\nExamples:")
    print("  # Auto-generate 100 samples:")
    print("  python create_validation_data.py auto data/category_id_path_only.csv")
    print()
    print("  # Auto-generate 200 samples:")
    print("  python create_validation_data.py auto data/category_id_path_only.csv 200")
    print()
    print("  # Manual entry:")
    print("  python create_validation_data.py manual")
    print()
    print("  # Verify validation file:")
    print("  python create_validation_data.py verify data/validation.csv data/category_id_path_only.csv")
    print()


def main():
    """Command-line entry point: dispatch on the mode given in ``sys.argv[1]``.

    Supported modes are ``auto``, ``manual`` and ``verify``; with no arguments
    the usage text is printed.

    Returns:
        A process exit status: 0 on success (or when only the usage text was
        shown), 1 when the arguments are invalid, an input file is missing,
        or the selected operation reports failure.
    """
    print("\n" + "="*80)
    print("📊 VALIDATION DATA CREATOR")
    print("="*80 + "\n")

    if len(sys.argv) < 2:
        _print_usage()
        return 0

    mode = sys.argv[1].lower()

    if mode == 'auto':
        if len(sys.argv) < 3:
            print("❌ CSV path required for auto mode")
            print("   python create_validation_data.py auto data/category_id_path_only.csv")
            return 1

        csv_path = sys.argv[2]

        # Report a bad sample count as a usage error instead of a traceback.
        try:
            num_samples = int(sys.argv[3]) if len(sys.argv) > 3 else 100
        except ValueError:
            print(f"❌ num_samples must be an integer, got: {sys.argv[3]}")
            return 1
        if num_samples < 1:
            print("❌ num_samples must be at least 1")
            return 1

        output_file = sys.argv[4] if len(sys.argv) > 4 else 'data/validation.csv'

        if not Path(csv_path).exists():
            print(f"❌ File not found: {csv_path}")
            return 1

        return 0 if sample_from_categories(csv_path, num_samples, output_file) else 1

    if mode == 'manual':
        output_file = sys.argv[2] if len(sys.argv) > 2 else 'data/validation_manual.csv'
        return 0 if manual_entry(output_file) else 1

    if mode == 'verify':
        if len(sys.argv) < 4:
            print("❌ Both validation CSV and categories CSV required")
            print("   python create_validation_data.py verify data/validation.csv data/category_id_path_only.csv")
            return 1

        validation_csv = sys.argv[2]
        categories_csv = sys.argv[3]

        for required_path in (validation_csv, categories_csv):
            if not Path(required_path).exists():
                print(f"❌ File not found: {required_path}")
                return 1

        return 0 if verify_validation_file(validation_csv, categories_csv) else 1

    print(f"❌ Unknown mode: {mode}")
    print("   Use: auto, manual, or verify")
    return 1


if __name__ == "__main__":
    # Propagate success/failure to the shell so the script can be chained.
    sys.exit(main())