zen-vton committed on
Commit
1fccc5c
Β·
verified Β·
1 Parent(s): ad9b761

Upload 11 files

Browse files
Files changed (11) hide show
  1. .gitignore +16 -0
  2. api_server.py +1377 -0
  3. check.py +365 -0
  4. fix.py +270 -0
  5. gradio_app.py +259 -0
  6. miss.py +421 -0
  7. path.py +141 -0
  8. requirements.txt +28 -0
  9. synonyms.py +853 -365
  10. train_products.py +421 -0
  11. validation_data.py +310 -0
.gitignore ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ .Python
4
+ venv/
5
+ env/
6
+ .vscode/
7
+ .idea/
8
+ .DS_Store
9
+ *.bin
10
+ *.safetensors
11
+ *.log
12
+ cache/*.faiss
13
+ cache/*.npy
14
+ !cache/metadata.pkl
15
+ !cache/model_info.json
16
+ !cache/cross_store_synonyms.pkl
api_server.py ADDED
@@ -0,0 +1,1377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ο»Ώ# """
2
+ # 🎯 COMPLETE API SERVER - Matches Cross-Store Training System
3
+ # =============================================================
4
+ # βœ… Works with cross-store synonyms (washing machine = laundry machine)
5
+ # βœ… Uses auto-tags from training
6
+ # βœ… Single model (fast predictions)
7
+ # βœ… Guaranteed category_id match
8
+ # βœ… Real-time classification
9
+ # """
10
+
11
+ # from flask import Flask, request, jsonify, render_template_string
12
+ # from sentence_transformers import SentenceTransformer
13
+ # import faiss
14
+ # import pickle
15
+ # import numpy as np
16
+ # from pathlib import Path
17
+ # import time
18
+ # import re
19
+
20
+ # app = Flask(__name__)
21
+
22
+ # # ============================================================================
23
+ # # GLOBAL VARIABLES
24
+ # # ============================================================================
25
+
26
+ # CACHE_DIR = Path('cache')
27
+
28
+ # # Model
29
+ # encoder = None
30
+ # faiss_index = None
31
+ # metadata = []
32
+ # cross_store_synonyms = {}
33
+
34
+
35
+ # # ============================================================================
36
+ # # CROSS-STORE SYNONYM DATABASE (Same as training)
37
+ # # ============================================================================
38
+
39
+ # def build_cross_store_synonyms():
40
+ # """Build cross-store synonym database"""
41
+ # synonyms = {
42
+ # # Appliances
43
+ # 'washing machine': {'laundry machine', 'washer', 'clothes washer', 'washing appliance'},
44
+ # 'laundry machine': {'washing machine', 'washer', 'clothes washer'},
45
+ # 'dryer': {'drying machine', 'clothes dryer', 'tumble dryer'},
46
+ # 'refrigerator': {'fridge', 'cooler', 'ice box', 'cooling appliance'},
47
+ # 'dishwasher': {'dish washer', 'dish cleaning machine'},
48
+ # 'microwave': {'microwave oven', 'micro wave'},
49
+ # 'vacuum': {'vacuum cleaner', 'hoover', 'vac'},
50
+
51
+ # # Electronics
52
+ # 'tv': {'television', 'telly', 'smart tv', 'display'},
53
+ # 'laptop': {'notebook', 'portable computer', 'laptop computer'},
54
+ # 'mobile': {'phone', 'cell phone', 'smartphone', 'cellphone'},
55
+ # 'tablet': {'ipad', 'tab', 'tablet computer'},
56
+ # 'headphones': {'headset', 'earphones', 'earbuds', 'ear buds'},
57
+ # 'speaker': {'audio speaker', 'sound system', 'speakers'},
58
+
59
+ # # Furniture
60
+ # 'sofa': {'couch', 'settee', 'divan'},
61
+ # 'wardrobe': {'closet', 'armoire', 'cupboard'},
62
+ # 'drawer': {'chest of drawers', 'dresser'},
63
+
64
+ # # Clothing
65
+ # 'pants': {'trousers', 'slacks', 'bottoms'},
66
+ # 'sweater': {'jumper', 'pullover', 'sweatshirt'},
67
+ # 'sneakers': {'trainers', 'tennis shoes', 'running shoes'},
68
+ # 'jacket': {'coat', 'blazer', 'outerwear'},
69
+
70
+ # # Kitchen
71
+ # 'cooker': {'stove', 'range', 'cooking range'},
72
+ # 'blender': {'mixer', 'food processor', 'liquidizer'},
73
+ # 'kettle': {'electric kettle', 'water boiler'},
74
+
75
+ # # Baby/Kids
76
+ # 'stroller': {'pram', 'pushchair', 'buggy', 'baby carriage'},
77
+ # 'diaper': {'nappy', 'nappies'},
78
+ # 'pacifier': {'dummy', 'soother'},
79
+
80
+ # # Tools
81
+ # 'wrench': {'spanner', 'adjustable wrench'},
82
+ # 'flashlight': {'torch', 'flash light'},
83
+ # 'screwdriver': {'screw driver'},
84
+
85
+ # # Home
86
+ # 'tap': {'faucet', 'water tap'},
87
+ # 'bin': {'trash can', 'garbage can', 'waste bin'},
88
+ # 'curtain': {'drape', 'window covering'},
89
+
90
+ # # Crafts/Office
91
+ # 'guillotine': {'paper cutter', 'paper trimmer', 'blade cutter'},
92
+ # 'trimmer': {'cutter', 'cutting tool', 'edge cutter'},
93
+ # 'stapler': {'stapling machine', 'staple gun'},
94
+
95
+ # # Books/Media
96
+ # 'magazine': {'periodical', 'journal', 'publication'},
97
+ # 'comic': {'comic book', 'graphic novel', 'manga'},
98
+ # 'ebook': {'e-book', 'digital book', 'electronic book'},
99
+
100
+ # # General
101
+ # 'kids': {'children', 'child', 'childrens', 'youth', 'junior'},
102
+ # 'women': {'womens', 'ladies', 'female', 'lady'},
103
+ # 'men': {'mens', 'male', 'gentleman'},
104
+ # 'baby': {'infant', 'newborn', 'toddler'},
105
+ # }
106
+
107
+ # # Build bidirectional mapping
108
+ # expanded = {}
109
+ # for term, syns in synonyms.items():
110
+ # expanded[term] = syns.copy()
111
+ # for syn in syns:
112
+ # if syn not in expanded:
113
+ # expanded[syn] = set()
114
+ # expanded[syn].add(term)
115
+ # expanded[syn].update(syns - {syn})
116
+
117
+ # return expanded
118
+
119
+
120
+ # # ============================================================================
121
+ # # HELPER FUNCTIONS
122
+ # # ============================================================================
123
+
124
+ # def clean_text(text):
125
+ # """Clean and normalize text"""
126
+ # if not text:
127
+ # return ""
128
+ # text = str(text).lower()
129
+ # text = re.sub(r'[^\w\s-]', ' ', text)
130
+ # text = re.sub(r'\s+', ' ', text).strip()
131
+ # return text
132
+
133
+
134
+ # def extract_cross_store_terms(text):
135
+ # """Extract terms with cross-store variations"""
136
+ # cleaned = clean_text(text)
137
+ # words = cleaned.split()
138
+
139
+ # all_terms = set()
140
+ # all_terms.add(cleaned) # Full text
141
+
142
+ # # Single words
143
+ # for word in words:
144
+ # if len(word) > 2:
145
+ # all_terms.add(word)
146
+ # # Add cross-store synonyms
147
+ # if word in cross_store_synonyms:
148
+ # all_terms.update(cross_store_synonyms[word])
149
+
150
+ # # 2-word phrases
151
+ # for i in range(len(words) - 1):
152
+ # if len(words[i]) > 2 and len(words[i+1]) > 2:
153
+ # phrase = f"{words[i]} {words[i+1]}"
154
+ # all_terms.add(phrase)
155
+ # if phrase in cross_store_synonyms:
156
+ # all_terms.update(cross_store_synonyms[phrase])
157
+
158
+ # # 3-word phrases
159
+ # if len(words) >= 3:
160
+ # for i in range(len(words) - 2):
161
+ # if all(len(w) > 2 for w in words[i:i+3]):
162
+ # phrase = f"{words[i]} {words[i+1]} {words[i+2]}"
163
+ # all_terms.add(phrase)
164
+
165
+ # return list(all_terms)
166
+
167
+
168
+ # def build_enhanced_query(title, description=""):
169
+ # """Build enhanced query with cross-store intelligence"""
170
+ # # Extract terms with variations
171
+ # all_terms = extract_cross_store_terms(f"{title} {description}")
172
+
173
+ # # Clean product terms
174
+ # product_terms = [t for t in clean_text(f"{title} {description}").split() if len(t) > 2]
175
+
176
+ # # Build query
177
+ # # Emphasize original + all variations
178
+ # product_text = ' '.join(product_terms)
179
+ # variations_text = ' '.join(all_terms[:30]) # Top 30 variations
180
+
181
+ # # Repeat for emphasis
182
+ # emphasized = ' '.join([product_text] * 3)
183
+
184
+ # query = f"{emphasized} {variations_text} {title} {description}"
185
+
186
+ # return query, all_terms[:20]
187
+
188
+
189
+ # def encode_query(text):
190
+ # """Encode query using the trained model"""
191
+ # embedding = encoder.encode(
192
+ # text,
193
+ # convert_to_numpy=True,
194
+ # normalize_embeddings=True
195
+ # )
196
+
197
+ # if embedding.ndim == 1:
198
+ # embedding = embedding.reshape(1, -1)
199
+
200
+ # return embedding.astype('float32')
201
+
202
+
203
+ # def classify_product(title, description="", top_k=5):
204
+ # """
205
+ # Classify product using trained system
206
+ # Returns: category_id, category_path, confidence, and alternatives
207
+ # """
208
+ # start_time = time.time()
209
+
210
+ # # Step 1: Build enhanced query with cross-store synonyms
211
+ # query, matched_terms = build_enhanced_query(title, description)
212
+
213
+ # # Step 2: Encode query
214
+ # query_embedding = encode_query(query)
215
+
216
+ # # Step 3: Search FAISS index
217
+ # distances, indices = faiss_index.search(query_embedding, top_k)
218
+
219
+ # # Step 4: Get results
220
+ # results = []
221
+ # for i in range(len(indices[0])):
222
+ # idx = indices[0][i]
223
+ # if idx < len(metadata):
224
+ # meta = metadata[idx]
225
+ # confidence = float(distances[0][i]) * 100
226
+
227
+ # # Get final product name
228
+ # levels = meta.get('levels', [])
229
+ # final_product = levels[-1] if levels else meta['category_path'].split('/')[-1]
230
+
231
+ # results.append({
232
+ # 'rank': i + 1,
233
+ # 'category_id': meta['category_id'],
234
+ # 'category_path': meta['category_path'],
235
+ # 'final_product': final_product,
236
+ # 'confidence': round(confidence, 2),
237
+ # 'depth': meta.get('depth', 0)
238
+ # })
239
+
240
+ # # Best result
241
+ # best = results[0] if results else None
242
+
243
+ # if not best:
244
+ # return {
245
+ # 'error': 'No results found',
246
+ # 'product': title
247
+ # }
248
+
249
+ # # Confidence level
250
+ # conf_pct = best['confidence']
251
+ # if conf_pct >= 90:
252
+ # conf_level = "EXCELLENT"
253
+ # elif conf_pct >= 85:
254
+ # conf_level = "VERY HIGH"
255
+ # elif conf_pct >= 80:
256
+ # conf_level = "HIGH"
257
+ # elif conf_pct >= 75:
258
+ # conf_level = "GOOD"
259
+ # elif conf_pct >= 70:
260
+ # conf_level = "MEDIUM"
261
+ # else:
262
+ # conf_level = "LOW"
263
+
264
+ # processing_time = (time.time() - start_time) * 1000
265
+
266
+ # return {
267
+ # 'product': title,
268
+ # 'category_id': best['category_id'],
269
+ # 'category_path': best['category_path'],
270
+ # 'final_product': best['final_product'],
271
+ # 'confidence': f"{conf_level} ({conf_pct:.2f}%)",
272
+ # 'confidence_percent': conf_pct,
273
+ # 'depth': best['depth'],
274
+ # 'matched_terms': matched_terms,
275
+ # 'top_5_results': results,
276
+ # 'processing_time_ms': round(processing_time, 2)
277
+ # }
278
+
279
+
280
+ # # ============================================================================
281
+ # # SERVER INITIALIZATION
282
+ # # ============================================================================
283
+
284
+ # def load_server():
285
+ # """Load all trained data"""
286
+ # global encoder, faiss_index, metadata, cross_store_synonyms
287
+
288
+ # print("\n" + "="*80)
289
+ # print("πŸ”„ LOADING TRAINED MODEL")
290
+ # print("="*80 + "\n")
291
+
292
+ # # Load model
293
+ # print("πŸ“₯ Loading sentence transformer...")
294
+ # encoder = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
295
+ # print("βœ… Model loaded\n")
296
+
297
+ # # Load FAISS index
298
+ # print("πŸ“₯ Loading FAISS index...")
299
+ # index_path = CACHE_DIR / 'main_index.faiss'
300
+ # if not index_path.exists():
301
+ # raise FileNotFoundError(f"FAISS index not found: {index_path}\nPlease run training first!")
302
+ # faiss_index = faiss.read_index(str(index_path))
303
+ # print(f"βœ… Index loaded ({faiss_index.ntotal:,} vectors)\n")
304
+
305
+ # # Load metadata
306
+ # print("πŸ“₯ Loading metadata...")
307
+ # meta_path = CACHE_DIR / 'metadata.pkl'
308
+ # if not meta_path.exists():
309
+ # raise FileNotFoundError(f"Metadata not found: {meta_path}\nPlease run training first!")
310
+ # with open(meta_path, 'rb') as f:
311
+ # metadata = pickle.load(f)
312
+ # print(f"βœ… Metadata loaded ({len(metadata):,} categories)\n")
313
+
314
+ # # Load cross-store synonyms
315
+ # print("πŸ“₯ Loading cross-store synonyms...")
316
+ # syn_path = CACHE_DIR / 'cross_store_synonyms.pkl'
317
+ # if syn_path.exists():
318
+ # with open(syn_path, 'rb') as f:
319
+ # cross_store_synonyms = pickle.load(f)
320
+ # print(f"βœ… Cross-store synonyms loaded ({len(cross_store_synonyms)} terms)\n")
321
+ # else:
322
+ # print("⚠️ Cross-store synonyms not found, building default set...")
323
+ # cross_store_synonyms = build_cross_store_synonyms()
324
+ # print(f"βœ… Built {len(cross_store_synonyms)} synonym mappings\n")
325
+
326
+ # print("="*80)
327
+ # print("βœ… SERVER READY!")
328
+ # print("="*80 + "\n")
329
+
330
+
331
+ # # ============================================================================
332
+ # # HTML INTERFACE
333
+ # # ============================================================================
334
+
335
+ # HTML_TEMPLATE = """
336
+ # <!DOCTYPE html>
337
+ # <html>
338
+ # <head>
339
+ # <title>🎯 Product Category Classifier</title>
340
+ # <meta charset="UTF-8">
341
+ # <meta name="viewport" content="width=device-width, initial-scale=1.0">
342
+ # <style>
343
+ # * { margin: 0; padding: 0; box-sizing: border-box; }
344
+ # body {
345
+ # font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
346
+ # background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
347
+ # min-height: 100vh;
348
+ # padding: 20px;
349
+ # }
350
+ # .container { max-width: 1200px; margin: 0 auto; }
351
+ # .header {
352
+ # text-align: center;
353
+ # color: white;
354
+ # margin-bottom: 30px;
355
+ # }
356
+ # .header h1 { font-size: 2.5em; margin-bottom: 10px; }
357
+ # .badge {
358
+ # background: rgba(255,255,255,0.2);
359
+ # padding: 8px 20px;
360
+ # border-radius: 20px;
361
+ # display: inline-block;
362
+ # margin: 5px;
363
+ # font-size: 0.9em;
364
+ # }
365
+ # .card {
366
+ # background: white;
367
+ # border-radius: 20px;
368
+ # padding: 30px;
369
+ # box-shadow: 0 10px 40px rgba(0,0,0,0.2);
370
+ # }
371
+ # .success-box {
372
+ # background: #d4edda;
373
+ # padding: 15px;
374
+ # border-radius: 8px;
375
+ # margin-bottom: 20px;
376
+ # border-left: 4px solid #28a745;
377
+ # color: #155724;
378
+ # }
379
+ # .form-group { margin-bottom: 20px; }
380
+ # label {
381
+ # display: block;
382
+ # font-weight: 600;
383
+ # margin-bottom: 8px;
384
+ # color: #333;
385
+ # }
386
+ # input, textarea {
387
+ # width: 100%;
388
+ # padding: 12px;
389
+ # border: 2px solid #e0e0e0;
390
+ # border-radius: 8px;
391
+ # font-size: 1em;
392
+ # }
393
+ # input:focus, textarea:focus {
394
+ # outline: none;
395
+ # border-color: #667eea;
396
+ # }
397
+ # textarea { min-height: 80px; resize: vertical; }
398
+ # button {
399
+ # width: 100%;
400
+ # padding: 15px;
401
+ # background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
402
+ # color: white;
403
+ # border: none;
404
+ # border-radius: 10px;
405
+ # font-size: 1.1em;
406
+ # cursor: pointer;
407
+ # font-weight: 600;
408
+ # transition: transform 0.2s;
409
+ # }
410
+ # button:hover { transform: translateY(-2px); }
411
+ # .results { display: none; margin-top: 20px; }
412
+ # .results.show { display: block; animation: fadeIn 0.5s; }
413
+ # @keyframes fadeIn {
414
+ # from { opacity: 0; transform: translateY(10px); }
415
+ # to { opacity: 1; transform: translateY(0); }
416
+ # }
417
+ # .section {
418
+ # background: #f8f9fa;
419
+ # padding: 20px;
420
+ # border-radius: 12px;
421
+ # margin-bottom: 15px;
422
+ # border-left: 4px solid #667eea;
423
+ # }
424
+ # .section h3 { color: #667eea; margin-bottom: 12px; }
425
+ # .result-item {
426
+ # background: white;
427
+ # padding: 15px;
428
+ # border-radius: 8px;
429
+ # margin-bottom: 10px;
430
+ # border-left: 3px solid #667eea;
431
+ # }
432
+ # .tag {
433
+ # display: inline-block;
434
+ # background: #667eea;
435
+ # color: white;
436
+ # padding: 6px 12px;
437
+ # border-radius: 15px;
438
+ # margin: 3px;
439
+ # font-size: 0.9em;
440
+ # }
441
+ # .conf-excellent { background: #4caf50; }
442
+ # .conf-very { background: #8bc34a; }
443
+ # .conf-high { background: #cddc39; color: #333; }
444
+ # .conf-good { background: #ff9800; }
445
+ # .conf-medium { background: #ff5722; }
446
+ # .conf-low { background: #9e9e9e; }
447
+ # .loading { display: none; text-align: center; padding: 20px; }
448
+ # .loading.show { display: block; }
449
+ # .spinner {
450
+ # border: 4px solid #f3f3f3;
451
+ # border-top: 4px solid #667eea;
452
+ # border-radius: 50%;
453
+ # width: 40px;
454
+ # height: 40px;
455
+ # animation: spin 1s linear infinite;
456
+ # margin: 0 auto;
457
+ # }
458
+ # @keyframes spin {
459
+ # 0% { transform: rotate(0deg); }
460
+ # 100% { transform: rotate(360deg); }
461
+ # }
462
+ # </style>
463
+ # </head>
464
+ # <body>
465
+ # <div class="container">
466
+ # <div class="header">
467
+ # <h1>🎯 Product Category Classifier</h1>
468
+ # <div class="badge">Cross-Store Intelligence</div>
469
+ # <div class="badge">Auto-Tag Support</div>
470
+ # <div class="badge">Real-Time</div>
471
+ # </div>
472
+
473
+ # <div class="card">
474
+ # <div class="success-box">
475
+ # <strong>βœ… Cross-Store Synonyms Active!</strong><br>
476
+ # Understands: washing machine = laundry machine | tv = television | kids = children
477
+ # </div>
478
+
479
+ # <div class="form-group">
480
+ # <label>Product Title *</label>
481
+ # <input type="text" id="title" placeholder="e.g., Washing Machine or Laundry Machine" />
482
+ # </div>
483
+
484
+ # <div class="form-group">
485
+ # <label>Description (Optional)</label>
486
+ # <textarea id="desc" placeholder="Additional details..."></textarea>
487
+ # </div>
488
+
489
+ # <button onclick="classify()">🎯 Classify Product</button>
490
+
491
+ # <div class="loading" id="loading">
492
+ # <div class="spinner"></div>
493
+ # <p style="margin-top: 10px; color: #666;">Analyzing...</p>
494
+ # </div>
495
+
496
+ # <div class="results" id="results">
497
+ # <div class="section">
498
+ # <h3>βœ… Best Match</h3>
499
+ # <div class="result-item">
500
+ # <div style="margin-bottom: 10px;">
501
+ # <strong>Product:</strong> <span id="product"></span>
502
+ # </div>
503
+ # <div style="margin-bottom: 10px;">
504
+ # <strong>Category ID:</strong>
505
+ # <span id="catId" style="font-size: 1.2em; color: #28a745; font-weight: bold;"></span>
506
+ # </div>
507
+ # <div style="margin-bottom: 10px;">
508
+ # <strong>Final Product:</strong> <span id="finalProd" style="font-weight: 600;"></span>
509
+ # </div>
510
+ # <div style="margin-bottom: 10px;">
511
+ # <strong>Full Path:</strong><br>
512
+ # <span id="path" style="color: #666; font-size: 0.95em;"></span>
513
+ # </div>
514
+ # <div style="margin-bottom: 10px;">
515
+ # <strong>Confidence:</strong>
516
+ # <span id="confidence" class="tag"></span>
517
+ # </div>
518
+ # <div style="font-size: 0.9em; color: #666;">
519
+ # <strong>Depth:</strong> <span id="depth"></span> levels |
520
+ # <strong>Time:</strong> <span id="time"></span>ms
521
+ # </div>
522
+ # </div>
523
+ # </div>
524
+
525
+ # <div class="section">
526
+ # <h3>πŸ”— Matched Terms (Cross-Store Variations)</h3>
527
+ # <div id="matchedTerms"></div>
528
+ # </div>
529
+
530
+ # <div class="section">
531
+ # <h3>πŸ“‹ Top 5 Alternative Matches</h3>
532
+ # <div id="alternatives"></div>
533
+ # </div>
534
+ # </div>
535
+ # </div>
536
+ # </div>
537
+
538
+ # <script>
539
+ # async function classify() {
540
+ # const title = document.getElementById('title').value.trim();
541
+ # const desc = document.getElementById('desc').value.trim();
542
+
543
+ # if (!title) {
544
+ # alert('Please enter a product title');
545
+ # return;
546
+ # }
547
+
548
+ # document.getElementById('loading').classList.add('show');
549
+ # document.getElementById('results').classList.remove('show');
550
+
551
+ # try {
552
+ # const response = await fetch('/classify', {
553
+ # method: 'POST',
554
+ # headers: { 'Content-Type': 'application/json' },
555
+ # body: JSON.stringify({ title, description: desc })
556
+ # });
557
+
558
+ # if (!response.ok) throw new Error('Classification failed');
559
+
560
+ # const data = await response.json();
561
+ # displayResults(data);
562
+ # } catch (error) {
563
+ # alert('Error: ' + error.message);
564
+ # } finally {
565
+ # document.getElementById('loading').classList.remove('show');
566
+ # }
567
+ # }
568
+
569
+ # function displayResults(data) {
570
+ # document.getElementById('results').classList.add('show');
571
+
572
+ # document.getElementById('product').textContent = data.product;
573
+ # document.getElementById('catId').textContent = data.category_id;
574
+ # document.getElementById('finalProd').textContent = data.final_product;
575
+ # document.getElementById('path').textContent = data.category_path;
576
+ # document.getElementById('depth').textContent = data.depth;
577
+ # document.getElementById('time').textContent = data.processing_time_ms;
578
+
579
+ # const conf = document.getElementById('confidence');
580
+ # conf.textContent = data.confidence;
581
+ # const confClass = data.confidence.split(' ')[0].toLowerCase().replace('_', '-');
582
+ # conf.className = 'tag conf-' + confClass;
583
+
584
+ # const matchedHtml = data.matched_terms.map(t => `<span class="tag">${t}</span>`).join('');
585
+ # document.getElementById('matchedTerms').innerHTML = matchedHtml;
586
+
587
+ # let altHtml = '';
588
+ # data.top_5_results.forEach((item, i) => {
589
+ # const cls = i === 0 ? 'style="background: #e8f5e9;"' : '';
590
+ # altHtml += `
591
+ # <div class="result-item" ${cls}>
592
+ # <strong>${item.rank}.</strong> ${item.final_product}
593
+ # <span class="tag" style="background: #999;">${item.confidence}%</span>
594
+ # <div style="font-size: 0.85em; color: #666; margin-top: 5px;">
595
+ # ID: ${item.category_id}
596
+ # </div>
597
+ # </div>
598
+ # `;
599
+ # });
600
+ # document.getElementById('alternatives').innerHTML = altHtml;
601
+ # }
602
+
603
+ # document.getElementById('title').addEventListener('keypress', function(e) {
604
+ # if (e.key === 'Enter') classify();
605
+ # });
606
+ # </script>
607
+ # </body>
608
+ # </html>
609
+ # """
610
+
611
+
612
+ # # ============================================================================
613
+ # # FLASK ROUTES
614
+ # # ============================================================================
615
+
616
+ # @app.route('/')
617
+ # def index():
618
+ # """Serve the web interface"""
619
+ # return render_template_string(HTML_TEMPLATE)
620
+
621
+
622
+ # @app.route('/classify', methods=['POST'])
623
+ # def classify_route():
624
+ # """API endpoint for classification"""
625
+ # data = request.json
626
+ # title = data.get('title', '').strip()
627
+ # description = data.get('description', '').strip()
628
+
629
+ # if not title:
630
+ # return jsonify({'error': 'Title required'}), 400
631
+
632
+ # try:
633
+ # result = classify_product(title, description)
634
+ # return jsonify(result)
635
+ # except Exception as e:
636
+ # print(f"Error: {e}")
637
+ # return jsonify({'error': str(e)}), 500
638
+
639
+
640
+ # @app.route('/health')
641
+ # def health():
642
+ # """Health check endpoint"""
643
+ # return jsonify({
644
+ # 'status': 'healthy',
645
+ # 'categories': len(metadata),
646
+ # 'cross_store_synonyms': len(cross_store_synonyms),
647
+ # 'model': 'all-mpnet-base-v2'
648
+ # })
649
+
650
+
651
+ # # ============================================================================
652
+ # # MAIN
653
+ # # ============================================================================
654
+
655
+ # if __name__ == '__main__':
656
+ # try:
657
+ # load_server()
658
+
659
+ # print("\n🌐 Server starting...")
660
+ # print(" URL: http://localhost:5000")
661
+ # print(" Press CTRL+C to stop\n")
662
+
663
+ # app.run(host='0.0.0.0', port=5000, debug=False)
664
+
665
+ # except FileNotFoundError as e:
666
+ # print(f"\n❌ ERROR: {e}")
667
+ # print("\nπŸ’‘ Solution: Run training first:")
668
+ # print(" python train.py data/category_id_path_only.csv\n")
669
+ # except Exception as e:
670
+ # print(f"\n❌ UNEXPECTED ERROR: {e}\n")
671
+
672
+
673
+
674
+
675
+
676
+ #!/usr/bin/env python3
677
+ """
678
+ API Server for product category classification
679
+ Merged UI + classification logic
680
+ Model: intfloat/e5-base-v2 (must match training)
681
+
682
+ Usage:
683
+ python api_server.py
684
+
685
+ Requirements:
686
+     pip install flask sentence-transformers faiss-cpu numpy
687
+
688
+ Files expected in cache/:
689
+ - main_index.faiss
690
+ - metadata.pkl
691
+ - cross_store_synonyms.pkl (optional)
692
+
693
+ """
694
+
695
+ from flask import Flask, request, jsonify, render_template_string
696
+ from sentence_transformers import SentenceTransformer
697
+ import faiss
698
+ import pickle
699
+ import numpy as np
700
+ from pathlib import Path
701
+ import time
702
+ import re
703
+ import os
704
+ from typing import List
705
+
706
+ # ============================================================================
707
+ # CONFIG
708
+ # ============================================================================
709
+
710
# Directory holding all training artifacts (FAISS index, metadata, synonyms).
CACHE_DIR = Path('cache')
MODEL_NAME = 'intfloat/e5-base-v2'  # <-- MUST match the model used during training
FAISS_INDEX_PATH = CACHE_DIR / 'main_index.faiss'
METADATA_PATH = CACHE_DIR / 'metadata.pkl'
SYN_PATH = CACHE_DIR / 'cross_store_synonyms.pkl'

# Server globals -- presumably populated once at startup by a load routine
# not visible in this chunk, then read by the request handlers; confirm.
encoder = None             # SentenceTransformer instance used by encode_query()
faiss_index = None         # FAISS index searched for category matches
metadata = []              # per-vector category metadata records
cross_store_synonyms = {}  # term -> set of equivalent terms
721
+
722
+ # ============================================================================
723
+ # CROSS-STORE SYNONYM FALLBACK
724
+ # ============================================================================
725
+
726
def build_cross_store_synonyms():
    """Build the default cross-store synonym mapping (fallback).

    If a trained ``cross_store_synonyms.pkl`` exists in the cache the server
    loads that file instead; this function is only used when no file exists.

    Returns:
        dict[str, set[str]]: term -> set of equivalent terms.  The mapping is
        symmetric (if ``b`` is a synonym of ``a`` then ``a`` is a synonym of
        ``b``) and all synonyms of a term are also linked to each other.
    """
    synonyms = {
        'washing machine': {'laundry machine', 'washer', 'clothes washer', 'washing appliance'},
        'laundry machine': {'washing machine', 'washer', 'clothes washer'},
        'dryer': {'drying machine', 'clothes dryer', 'tumble dryer'},
        'refrigerator': {'fridge', 'cooler', 'ice box', 'cooling appliance'},
        'dishwasher': {'dish washer', 'dish cleaning machine'},
        'microwave': {'microwave oven', 'micro wave'},
        'vacuum': {'vacuum cleaner', 'hoover', 'vac'},
        'tv': {'television', 'telly', 'smart tv', 'display'},
        'laptop': {'notebook', 'portable computer', 'laptop computer'},
        'mobile': {'phone', 'cell phone', 'smartphone', 'cellphone'},
        'tablet': {'ipad', 'tab', 'tablet computer'},
        'headphones': {'headset', 'earphones', 'earbuds', 'ear buds'},
        'speaker': {'audio speaker', 'sound system', 'speakers'},
        'sofa': {'couch', 'settee', 'divan'},
        'wardrobe': {'closet', 'armoire', 'cupboard'},
        'drawer': {'chest of drawers', 'dresser'},
        'pants': {'trousers', 'slacks', 'bottoms'},
        'sweater': {'jumper', 'pullover', 'sweatshirt'},
        'sneakers': {'trainers', 'tennis shoes', 'running shoes'},
        'jacket': {'coat', 'blazer', 'outerwear'},
        'cooker': {'stove', 'range', 'cooking range'},
        'blender': {'mixer', 'food processor', 'liquidizer'},
        'kettle': {'electric kettle', 'water boiler'},
        'stroller': {'pram', 'pushchair', 'buggy', 'baby carriage'},
        'diaper': {'nappy', 'nappies'},
        'pacifier': {'dummy', 'soother'},
        'wrench': {'spanner', 'adjustable wrench'},
        'flashlight': {'torch', 'flash light'},
        'screwdriver': {'screw driver'},
        'tap': {'faucet', 'water tap'},
        'bin': {'trash can', 'garbage can', 'waste bin'},
        'curtain': {'drape', 'window covering'},
        'guillotine': {'paper cutter', 'paper trimmer', 'blade cutter'},
        'trimmer': {'cutter', 'cutting tool', 'edge cutter'},
        'stapler': {'stapling machine', 'staple gun'},
        'magazine': {'periodical', 'journal', 'publication'},
        'comic': {'comic book', 'graphic novel', 'manga'},
        'ebook': {'e-book', 'digital book', 'electronic book'},
        'kids': {'children', 'child', 'childrens', 'youth', 'junior'},
        'women': {'womens', 'ladies', 'female', 'lady'},
        'men': {'mens', 'male', 'gentleman'},
        'baby': {'infant', 'newborn', 'toddler'},
    }

    expanded = {}
    for term, syns in synonyms.items():
        # BUG FIX: merge with setdefault/update instead of plain assignment.
        # Plain `expanded[term] = set(syns)` clobbered synonyms already
        # accumulated when this term appeared earlier as another entry's
        # synonym (e.g. 'laundry machine' lost 'washing appliance' picked up
        # while processing 'washing machine').
        expanded.setdefault(term, set()).update(syns)
        for syn in syns:
            expanded.setdefault(syn, set()).add(term)
            # Link sibling synonyms to each other as well.
            expanded[syn].update(syns - {syn})
    return expanded
786
+
787
+ # ============================================================================
788
+ # TEXT CLEANING / QUERY BUILDING
789
+ # ============================================================================
790
+
791
def clean_text(text: str) -> str:
    """Normalize *text*: lowercase it, replace punctuation (except dashes)
    with spaces, and collapse runs of whitespace into single spaces.

    Falsy input (``None``, ``""``, ``0``) yields the empty string.
    """
    if not text:
        return ""
    lowered = str(text).lower()
    # Drop everything that is not a word character, whitespace, or dash.
    no_punct = re.sub(r"[^\w\s-]", " ", lowered)
    # split()/join collapses and trims all whitespace in one pass.
    return " ".join(no_punct.split())
799
+
800
+
801
def extract_cross_store_terms(text: str) -> List[str]:
    """Collect candidate search terms from *text*.

    Included terms: the full cleaned text, every word longer than two
    characters, adjacent 2- and 3-word phrases of such words, plus any
    cross-store synonyms registered for a word or 2-word phrase.

    Returns:
        list[str]: the collected terms in arbitrary (set) order.
    """
    cleaned = clean_text(text)
    tokens = cleaned.split()

    terms = {cleaned}  # always keep the full cleaned text

    # Single words, expanded with their cross-store synonyms.
    for tok in tokens:
        if len(tok) > 2:
            terms.add(tok)
            terms.update(cross_store_synonyms.get(tok, ()))

    # Adjacent 2-word phrases, also expanded with synonyms.
    for left, right in zip(tokens, tokens[1:]):
        if len(left) > 2 and len(right) > 2:
            bigram = f"{left} {right}"
            terms.add(bigram)
            terms.update(cross_store_synonyms.get(bigram, ()))

    # Adjacent 3-word phrases (no synonym lookup, as in the 2-word case the
    # synonym table is only consulted for words and bigrams).
    for a, b, c in zip(tokens, tokens[1:], tokens[2:]):
        if len(a) > 2 and len(b) > 2 and len(c) > 2:
            terms.add(f"{a} {b} {c}")

    return list(terms)
831
+
832
def build_enhanced_query(title, description="", max_synonyms=10):
    """Build a retrieval query that emphasizes the title.

    The cleaned title is repeated 3x (to dominate the embedding), followed by
    up to *max_synonyms* cross-store variation terms extracted from the title
    and description.

    Returns (query_string, first_20_terms_for_display).
    """
    cleaned_title = clean_text(title)
    cleaned_desc = clean_text(description)

    # Cross-store variation terms from title + description
    variation_terms = extract_cross_store_terms(f"{cleaned_title} {cleaned_desc}")

    # Weight the original title 3x, then append the top synonyms
    query_parts = [cleaned_title, cleaned_title, cleaned_title]
    query_parts.extend(variation_terms[:max_synonyms])

    return ' '.join(query_parts), variation_terms[:20]
844
+
845
+ # ============================================================================
846
+ # ENCODER / FAISS
847
+ # ============================================================================
848
+
849
def encode_query(text: str) -> np.ndarray:
    """Encode *text* into a normalized float32 row vector for FAISS search."""
    vector = encoder.encode(text, convert_to_numpy=True, normalize_embeddings=True)
    # FAISS expects a 2-D (n, dim) matrix; promote a single vector to one row
    if vector.ndim == 1:
        vector = vector[np.newaxis, :]
    return vector.astype('float32')
854
+
855
+ def classify_product(title, description="", top_k=5):
856
+ """Classify product using e5-base embeddings with cross-store optimization"""
857
+ start_time = time.time()
858
+
859
+ # Step 1: Build enhanced query
860
+ query_text, matched_terms = build_enhanced_query(title, description)
861
+
862
+ # Step 2: Encode query
863
+ query_embedding = encoder.encode(
864
+ query_text,
865
+ convert_to_numpy=True,
866
+ normalize_embeddings=True
867
+ ).astype('float32')
868
+
869
+ if query_embedding.ndim == 1:
870
+ query_embedding = query_embedding.reshape(1, -1)
871
+
872
+ # Step 3: FAISS search
873
+ distances, indices = faiss_index.search(query_embedding, top_k)
874
+
875
+ results = []
876
+ for i, idx in enumerate(indices[0]):
877
+ if idx >= len(metadata):
878
+ continue
879
+ meta = metadata[idx]
880
+ # Convert FAISS distance to cosine similarity
881
+ similarity = 1 - distances[0][i]
882
+ confidence_pct = float(similarity) * 100
883
+
884
+ final_product = meta.get('levels', [])[-1] if meta.get('levels') else meta['category_path'].split('/')[-1]
885
+
886
+ results.append({
887
+ 'rank': i + 1,
888
+ 'category_id': meta['category_id'],
889
+ 'category_path': meta['category_path'],
890
+ 'final_product': final_product,
891
+ 'confidence': round(confidence_pct, 2),
892
+ 'depth': meta.get('depth', 0)
893
+ })
894
+
895
+ if not results:
896
+ return {'error': 'No results found', 'product': title}
897
+
898
+ # Pick best match
899
+ best = results[0]
900
+ conf_pct = best['confidence']
901
+ if conf_pct >= 90:
902
+ conf_level = "EXCELLENT"
903
+ elif conf_pct >= 85:
904
+ conf_level = "VERY HIGH"
905
+ elif conf_pct >= 80:
906
+ conf_level = "HIGH"
907
+ elif conf_pct >= 75:
908
+ conf_level = "GOOD"
909
+ elif conf_pct >= 70:
910
+ conf_level = "MEDIUM"
911
+ else:
912
+ conf_level = "LOW"
913
+
914
+ processing_time = (time.time() - start_time) * 1000
915
+
916
+ return {
917
+ 'product': title,
918
+ 'category_id': best['category_id'],
919
+ 'category_path': best['category_path'],
920
+ 'final_product': best['final_product'],
921
+ 'confidence': f"{conf_level} ({conf_pct:.2f}%)",
922
+ 'confidence_percent': conf_pct,
923
+ 'depth': best['depth'],
924
+ 'matched_terms': matched_terms,
925
+ 'top_5_results': results,
926
+ 'processing_time_ms': round(processing_time, 2)
927
+ }
928
+ # FAISS returns squared L2 distances or inner product depending on index type.
929
+ # We'll treat lower distance as better. We convert to a 0-100-ish confidence by
930
+ # using a simple heuristic: score = 100 - normalized_distance*100 (clamped).
931
+
932
+ # Determine a normalization constant: use mean of top distance if available
933
+ flat_dist = distances[0]
934
+ max_d = float(np.max(flat_dist)) if flat_dist.size else 1.0
935
+ min_d = float(np.min(flat_dist)) if flat_dist.size else 0.0
936
+ range_d = max(1e-6, max_d - min_d)
937
+
938
+ for i, idx in enumerate(indices[0]):
939
+ if idx < 0 or idx >= len(metadata):
940
+ continue
941
+ meta = metadata[idx]
942
+ raw_d = float(distances[0][i])
943
+ # normalize and invert to make higher -> better
944
+ norm = (raw_d - min_d) / range_d
945
+ conf = max(0.0, min(100.0, 100.0 * (1.0 - norm)))
946
+
947
+ levels = meta.get('levels') or []
948
+ final_product = levels[-1] if levels else meta.get('category_path', '').split('/')[-1]
949
+
950
+ results.append({
951
+ 'rank': i + 1,
952
+ 'category_id': meta.get('category_id'),
953
+ 'category_path': meta.get('category_path'),
954
+ 'final_product': final_product,
955
+ 'confidence': round(conf, 2),
956
+ 'depth': meta.get('depth', 0)
957
+ })
958
+
959
+ if not results:
960
+ return {
961
+ 'error': 'No results found',
962
+ 'product': title
963
+ }
964
+
965
+ best = results[0]
966
+ conf_pct = best['confidence']
967
+ if conf_pct >= 90:
968
+ conf_level = "EXCELLENT"
969
+ elif conf_pct >= 85:
970
+ conf_level = "VERY HIGH"
971
+ elif conf_pct >= 80:
972
+ conf_level = "HIGH"
973
+ elif conf_pct >= 75:
974
+ conf_level = "GOOD"
975
+ elif conf_pct >= 70:
976
+ conf_level = "MEDIUM"
977
+ else:
978
+ conf_level = "LOW"
979
+
980
+ processing_time = (time.time() - start_time) * 1000.0
981
+
982
+ return {
983
+ 'product': title,
984
+ 'category_id': best['category_id'],
985
+ 'category_path': best['category_path'],
986
+ 'final_product': best['final_product'],
987
+ 'confidence': f"{conf_level} ({conf_pct:.2f}%)",
988
+ 'confidence_percent': conf_pct,
989
+ 'depth': best['depth'],
990
+ 'matched_terms': matched_terms,
991
+ 'top_5_results': results,
992
+ 'processing_time_ms': round(processing_time, 2)
993
+ }
994
+
995
+ # ============================================================================
996
+ # SERVER LOAD
997
+ # ============================================================================
998
+
999
def load_server():
    """Load all model artifacts into module globals.

    Populates: encoder (sentence transformer), faiss_index, metadata
    (category list) and cross_store_synonyms (loaded from cache, or rebuilt
    from the built-in defaults when the cache file is absent).

    Raises FileNotFoundError when the FAISS index or metadata cache is
    missing (i.e. training has not been run).
    """
    global encoder, faiss_index, metadata, cross_store_synonyms

    print('\n' + '=' * 80)
    print('πŸ”„ LOADING TRAINED MODEL')
    print('=' * 80 + '\n')

    # Load encoder
    print('πŸ“₯ Loading sentence transformer...')
    encoder = SentenceTransformer(MODEL_NAME)
    print('βœ… Model loaded\n')

    # Load FAISS index (required — produced by the training step)
    print('πŸ“₯ Loading FAISS index...')
    if not FAISS_INDEX_PATH.exists():
        raise FileNotFoundError(f"FAISS index not found: {FAISS_INDEX_PATH}\nPlease run training first!")
    faiss_index = faiss.read_index(str(FAISS_INDEX_PATH))
    print(f"βœ… Index loaded ({faiss_index.ntotal:,} vectors)\n")

    # Load metadata (required — one entry per index vector)
    print('πŸ“₯ Loading metadata...')
    if not METADATA_PATH.exists():
        raise FileNotFoundError(f"Metadata not found: {METADATA_PATH}\nPlease run training first!")
    with open(METADATA_PATH, 'rb') as f:
        metadata = pickle.load(f)
    print(f"βœ… Metadata loaded ({len(metadata):,} categories)\n")

    # Load or build cross-store synonyms (optional — falls back to defaults)
    print('πŸ“₯ Loading cross-store synonyms...')
    if SYN_PATH.exists():
        with open(SYN_PATH, 'rb') as f:
            cross_store_synonyms = pickle.load(f)
        print(f"βœ… Cross-store synonyms loaded ({len(cross_store_synonyms)} terms)\n")
    else:
        print('⚠️ Cross-store synonyms not found, building default set...')
        cross_store_synonyms = build_cross_store_synonyms()
        print(f"βœ… Built {len(cross_store_synonyms)} synonym mappings\n")

    print('=' * 80)
    print('βœ… SERVER READY!')
    print('=' * 80 + '\n')
1040
+
1041
+ # ============================================================================
1042
+ # HTML TEMPLATE (same as provided)
1043
+ # ============================================================================
1044
+
1045
# Single-page UI served at "/" — posts to /classify and renders the JSON
# result client-side. Kept inline so the server is a single self-contained
# file; no templating placeholders are used (rendered verbatim).
HTML_TEMPLATE = r"""
<!DOCTYPE html>
<html>
<head>
<title>🎯 Product Category Classifier</title>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<style>
* { margin: 0; padding: 0; box-sizing: border-box; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
min-height: 100vh;
padding: 20px;
}
.container { max-width: 1200px; margin: 0 auto; }
.header {
text-align: center;
color: white;
margin-bottom: 30px;
}
.header h1 { font-size: 2.5em; margin-bottom: 10px; }
.badge {
background: rgba(255,255,255,0.2);
padding: 8px 20px;
border-radius: 20px;
display: inline-block;
margin: 5px;
font-size: 0.9em;
}
.card {
background: white;
border-radius: 20px;
padding: 30px;
box-shadow: 0 10px 40px rgba(0,0,0,0.2);
}
.success-box {
background: #d4edda;
padding: 15px;
border-radius: 8px;
margin-bottom: 20px;
border-left: 4px solid #28a745;
color: #155724;
}
.form-group { margin-bottom: 20px; }
label {
display: block;
font-weight: 600;
margin-bottom: 8px;
color: #333;
}
input, textarea {
width: 100%;
padding: 12px;
border: 2px solid #e0e0e0;
border-radius: 8px;
font-size: 1em;
}
input:focus, textarea:focus {
outline: none;
border-color: #667eea;
}
textarea { min-height: 80px; resize: vertical; }
button {
width: 100%;
padding: 15px;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border: none;
border-radius: 10px;
font-size: 1.1em;
cursor: pointer;
font-weight: 600;
transition: transform 0.2s;
}
button:hover { transform: translateY(-2px); }
.results { display: none; margin-top: 20px; }
.results.show { display: block; animation: fadeIn 0.5s; }
@keyframes fadeIn {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
.section {
background: #f8f9fa;
padding: 20px;
border-radius: 12px;
margin-bottom: 15px;
border-left: 4px solid #667eea;
}
.section h3 { color: #667eea; margin-bottom: 12px; }
.result-item {
background: white;
padding: 15px;
border-radius: 8px;
margin-bottom: 10px;
border-left: 3px solid #667eea;
}
.tag {
display: inline-block;
background: #667eea;
color: white;
padding: 6px 12px;
border-radius: 15px;
margin: 3px;
font-size: 0.9em;
}
.conf-excellent { background: #4caf50; }
.conf-very { background: #8bc34a; }
.conf-high { background: #cddc39; color: #333; }
.conf-good { background: #ff9800; }
.conf-medium { background: #ff5722; }
.conf-low { background: #9e9e9e; }
.loading { display: none; text-align: center; padding: 20px; }
.loading.show { display: block; }
.spinner {
border: 4px solid #f3f3f3;
border-top: 4px solid #667eea;
border-radius: 50%;
width: 40px;
height: 40px;
animation: spin 1s linear infinite;
margin: 0 auto;
}
@keyframes spin {
0% { transform: rotate(0deg); }
100% { transform: rotate(360deg); }
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>🎯 Product Category Classifier</h1>
<div class="badge">Cross-Store Intelligence</div>
<div class="badge">Auto-Tag Support</div>
<div class="badge">Real-Time</div>
</div>

<div class="card">
<div class="success-box">
<strong>βœ… Cross-Store Synonyms Active!</strong><br>
Understands: washing machine = laundry machine | tv = television | kids = children
</div>

<div class="form-group">
<label>Product Title *</label>
<input type="text" id="title" placeholder="e.g., Washing Machine or Laundry Machine" />
</div>

<div class="form-group">
<label>Description (Optional)</label>
<textarea id="desc" placeholder="Additional details..."></textarea>
</div>

<button onclick="classify()">🎯 Classify Product</button>

<div class="loading" id="loading">
<div class="spinner"></div>
<p style="margin-top: 10px; color: #666;">Analyzing...</p>
</div>

<div class="results" id="results">
<div class="section">
<h3>βœ… Best Match</h3>
<div class="result-item">
<div style="margin-bottom: 10px;">
<strong>Product:</strong> <span id="product"></span>
</div>
<div style="margin-bottom: 10px;">
<strong>Category ID:</strong>
<span id="catId" style="font-size: 1.2em; color: #28a745; font-weight: bold;"></span>
</div>
<div style="margin-bottom: 10px;">
<strong>Final Product:</strong> <span id="finalProd" style="font-weight: 600;"></span>
</div>
<div style="margin-bottom: 10px;">
<strong>Full Path:</strong><br>
<span id="path" style="color: #666; font-size: 0.95em;"></span>
</div>
<div style="margin-bottom: 10px;">
<strong>Confidence:</strong>
<span id="confidence" class="tag"></span>
</div>
<div style="font-size: 0.9em; color: #666;">
<strong>Depth:</strong> <span id="depth"></span> levels |
<strong>Time:</strong> <span id="time"></span>ms
</div>
</div>
</div>

<div class="section">
<h3>πŸ”— Matched Terms (Cross-Store Variations)</h3>
<div id="matchedTerms"></div>
</div>

<div class="section">
<h3>πŸ“‹ Top 5 Alternative Matches</h3>
<div id="alternatives"></div>
</div>
</div>
</div>
</div>

<script>
async function classify() {
const title = document.getElementById('title').value.trim();
const desc = document.getElementById('desc').value.trim();

if (!title) {
alert('Please enter a product title');
return;
}

document.getElementById('loading').classList.add('show');
document.getElementById('results').classList.remove('show');

try {
const response = await fetch('/classify', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ title, description: desc })
});

if (!response.ok) throw new Error('Classification failed');

const data = await response.json();
displayResults(data);
} catch (error) {
alert('Error: ' + error.message);
} finally {
document.getElementById('loading').classList.remove('show');
}
}

function displayResults(data) {
document.getElementById('results').classList.add('show');

document.getElementById('product').textContent = data.product;
document.getElementById('catId').textContent = data.category_id;
document.getElementById('finalProd').textContent = data.final_product;
document.getElementById('path').textContent = data.category_path;
document.getElementById('depth').textContent = data.depth;
document.getElementById('time').textContent = data.processing_time_ms;

const conf = document.getElementById('confidence');
conf.textContent = data.confidence;
const confClass = data.confidence.split(' ')[0].toLowerCase().replace('_', '-');
conf.className = 'tag conf-' + confClass;

const matchedHtml = data.matched_terms.map(t => `<span class="tag">${t}</span>`).join('');
document.getElementById('matchedTerms').innerHTML = matchedHtml;

let altHtml = '';
data.top_5_results.forEach((item, i) => {
const cls = i === 0 ? 'style="background: #e8f5e9;"' : '';
altHtml += `
<div class="result-item" ${cls}>
<strong>${item.rank}.</strong> ${item.final_product}
<span class="tag" style="background: #999;">${item.confidence}%</span>
<div style="font-size: 0.85em; color: #666; margin-top: 5px;">
ID: ${item.category_id}
</div>
</div>
`;
});
document.getElementById('alternatives').innerHTML = altHtml;
}

document.getElementById('title').addEventListener('keypress', function(e) {
if (e.key === 'Enter') classify();
});
</script>
</body>
</html>
"""
1320
+
1321
+ # ============================================================================
1322
+ # FLASK APP
1323
+ # ============================================================================
1324
+
1325
# Flask application instance; routes are registered below via decorators
app = Flask(__name__)
1326
+
1327
+
1328
@app.route('/')
def index():
    """Serve the single-page classifier UI."""
    return render_template_string(HTML_TEMPLATE)
1331
+
1332
+
1333
@app.route('/classify', methods=['POST'])
def classify_route():
    """JSON API endpoint: classify {"title": ..., "description": ...}.

    Returns the classify_product() result dict, 400 when the title is
    missing/empty, or 500 (with the exception message) on failure.
    """
    data = request.get_json(force=True)
    # `or ''` also guards against explicit JSON nulls — .get()'s default is
    # not used for a present null, and None.strip() previously caused a 500
    title = (data.get('title') or '').strip()
    description = (data.get('description') or '').strip()

    if not title:
        return jsonify({'error': 'Title required'}), 400

    try:
        result = classify_product(title, description)
        return jsonify(result)
    except Exception as e:
        # Log full traceback server-side; return only the message to clients
        app.logger.exception('Classification error')
        return jsonify({'error': str(e)}), 500
1348
+
1349
+
1350
@app.route('/health')
def health():
    """Liveness probe: reports model name and loaded artifact sizes."""
    return jsonify({
        'status': 'healthy',
        'categories': len(metadata),
        'cross_store_synonyms': len(cross_store_synonyms),
        'model': MODEL_NAME
    })
1358
+
1359
+
1360
+ # ============================================================================
1361
+ # MAIN
1362
+ # ============================================================================
1363
+
1364
if __name__ == '__main__':
    try:
        # Load encoder/index/metadata into globals before serving requests
        load_server()
        print('\n🌐 Server starting...')
        print(' URL: http://localhost:5000')
        print(' Press CTRL+C to stop\n')
        # Recommended: run with a production server like gunicorn for production use
        app.run(host='0.0.0.0', port=5000, debug=False)
    except FileNotFoundError as e:
        # Raised by load_server() when FAISS index or metadata is missing
        print(f"\n❌ ERROR: {e}")
        print('\nπŸ’‘ Solution: Run training first to create FAISS index and metadata')
    except Exception as e:
        print(f"\n❌ UNEXPECTED ERROR: {e}\n")
1377
+
check.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ πŸ”§ DIAGNOSTIC AND FIX TOOL
3
+ ===========================
4
+ Analyzes your trained model and fixes common issues causing low confidence.
5
+
6
+ Issues it detects and fixes:
7
+ 1. Column name mismatches (Category_ID vs category_id)
8
+ 2. Missing or corrupted tags.json
9
+ 3. Wrong metadata format in cache
10
+ 4. FAISS index mismatch
11
+
12
+ Usage:
13
+ python diagnose_and_fix.py
14
+ """
15
+
16
+ import pickle
17
+ import json
18
+ import pandas as pd
19
+ import numpy as np
20
+ import faiss
21
+ from pathlib import Path
22
+ from sentence_transformers import SentenceTransformer
23
+ import sys
24
+
25
def check_cache_files():
    """Check what files exist in cache.

    Returns a list of issue strings — one per missing *required* file.
    Missing optional files only print a warning and are not reported as
    issues.
    """
    cache_dir = Path('cache')

    print("\n" + "="*80)
    print("πŸ” STEP 1: CHECKING CACHE FILES")
    print("="*80 + "\n")

    # Artifacts the server cannot start without
    required_files = {
        'main_index.faiss': cache_dir / 'main_index.faiss',
        'metadata.pkl': cache_dir / 'metadata.pkl',
        'model_info.json': cache_dir / 'model_info.json',
    }

    # Nice-to-have artifacts (accuracy/calibration helpers)
    optional_files = {
        'parent_embeddings.pkl': cache_dir / 'parent_embeddings.pkl',
        'calibrator.pkl': cache_dir / 'calibrator.pkl',
        'cross_store_synonyms.pkl': cache_dir / 'cross_store_synonyms.pkl',
    }

    issues = []

    print("Required files:")
    for name, path in required_files.items():
        if path.exists():
            size = path.stat().st_size / (1024 * 1024)  # MB
            print(f" βœ… {name} ({size:.2f} MB)")
        else:
            print(f" ❌ {name} - MISSING")
            issues.append(f"Missing required file: {name}")

    print("\nOptional files:")
    for name, path in optional_files.items():
        if path.exists():
            size = path.stat().st_size / (1024 * 1024)
            print(f" βœ… {name} ({size:.2f} MB)")
        else:
            print(f" ⚠️ {name} - not found")

    return issues
65
+
66
+
67
def check_csv_format():
    """Check CSV file format.

    Verifies data/category_only_path.csv exists and carries the category
    id/path columns in either casing convention.

    Returns a list of issue strings (empty when the file looks fine).
    """
    print("\n" + "="*80)
    print("πŸ” STEP 2: CHECKING CSV FORMAT")
    print("="*80 + "\n")

    csv_path = Path('data/category_only_path.csv')

    if not csv_path.exists():
        print("❌ CSV not found at: data/category_only_path.csv")
        return ["CSV file not found"]

    try:
        # Read once — the original read nrows=5 and then re-read the entire
        # file a second time just to count rows
        df = pd.read_csv(csv_path)

        print(f"Columns found: {list(df.columns)}")
        print(f"Total rows: {len(df):,}")

        print("\nFirst 3 rows:")
        print(df.head(3).to_string())

        # Accept either the uppercase or lowercase column naming convention
        if 'Category_ID' in df.columns and 'Category_path' in df.columns:
            print("\nβœ… Column format: Uppercase (Category_ID, Category_path)")
            return []
        elif 'category_id' in df.columns and 'category_path' in df.columns:
            print("\nβœ… Column format: Lowercase (category_id, category_path)")
            return []
        else:
            print("\n❌ Unexpected column names!")
            return ["CSV has wrong column names"]

    except Exception as e:
        print(f"\n❌ Error reading CSV: {e}")
        return [f"CSV read error: {e}"]
102
+
103
+
104
def check_metadata():
    """Check metadata format.

    Loads cache/metadata.pkl, prints a sample entry, and verifies the first
    100 entries carry 'category_id' and 'category_path'. Returns a list of
    issue strings (at most the first 5 field problems).
    """
    print("\n" + "="*80)
    print("πŸ” STEP 3: CHECKING METADATA FORMAT")
    print("="*80 + "\n")

    meta_path = Path('cache/metadata.pkl')

    if not meta_path.exists():
        print("❌ Metadata file not found")
        return ["Metadata missing"]

    try:
        with open(meta_path, 'rb') as f:
            metadata = pickle.load(f)

        print(f"Metadata entries: {len(metadata):,}")

        if metadata:
            sample = metadata[0]
            print(f"\nSample entry:")
            print(f" Keys: {list(sample.keys())}")
            print(f" category_id: {sample.get('category_id', 'MISSING')}")
            print(f" category_path: {sample.get('category_path', 'MISSING')[:50]}...")

            # Check if all entries have required fields
            # (only the first 100 are sampled to keep this fast)
            missing_fields = []
            for i, entry in enumerate(metadata[:100]):
                if 'category_id' not in entry:
                    missing_fields.append(f"Entry {i}: missing category_id")
                if 'category_path' not in entry:
                    missing_fields.append(f"Entry {i}: missing category_path")

            if missing_fields:
                print(f"\n❌ Found {len(missing_fields)} entries with missing fields")
                return missing_fields[:5]  # Return first 5
            else:
                print("\nβœ… All entries have required fields")
                return []
        else:
            print("❌ Metadata is empty!")
            return ["Empty metadata"]

    except Exception as e:
        print(f"❌ Error reading metadata: {e}")
        return [f"Metadata error: {e}"]
150
+
151
+
152
def check_faiss_index():
    """Check FAISS index.

    Confirms the index loads and that its vector count matches the number
    of metadata entries (a mismatch means search results map to the wrong
    categories). Returns a list of issue strings.
    """
    print("\n" + "="*80)
    print("πŸ” STEP 4: CHECKING FAISS INDEX")
    print("="*80 + "\n")

    index_path = Path('cache/main_index.faiss')
    meta_path = Path('cache/metadata.pkl')

    if not index_path.exists():
        print("❌ FAISS index not found")
        return ["FAISS index missing"]

    try:
        index = faiss.read_index(str(index_path))
        print(f"FAISS index vectors: {index.ntotal:,}")
        print(f"Dimension: {index.d}")

        with open(meta_path, 'rb') as f:
            metadata = pickle.load(f)

        print(f"Metadata entries: {len(metadata):,}")

        # Row i of the index must correspond to metadata[i]
        if index.ntotal != len(metadata):
            print(f"\n❌ MISMATCH!")
            print(f" FAISS has {index.ntotal:,} vectors")
            print(f" Metadata has {len(metadata):,} entries")
            return ["FAISS-metadata count mismatch"]
        else:
            print("\nβœ… FAISS and metadata counts match")
            return []

    except Exception as e:
        print(f"❌ Error: {e}")
        return [f"FAISS error: {e}"]
187
+
188
+
189
def check_tags_json():
    """Check tags.json.

    Verifies data/tags.json exists, is non-empty, and averages at least 10
    tags per category (fewer reduces retrieval accuracy). Returns a list of
    issue strings.
    """
    print("\n" + "="*80)
    print("πŸ” STEP 5: CHECKING TAGS.JSON")
    print("="*80 + "\n")

    tags_path = Path('data/tags.json')

    if not tags_path.exists():
        print("⚠️ tags.json not found - this will reduce accuracy!")
        print(" Expected location: data/tags.json")
        return ["tags.json missing"]

    try:
        with open(tags_path, 'r') as f:
            tags = json.load(f)

        print(f"Tags for {len(tags):,} categories")

        if tags:
            # Show one sample mapping for a quick visual sanity check
            # NOTE(review): assumes values are lists of strings — verify
            # against the generator script's output format
            sample_key = list(tags.keys())[0]
            sample_tags = tags[sample_key]

            print(f"\nSample category: {sample_key}")
            print(f"Tags ({len(sample_tags)}): {', '.join(sample_tags[:5])}...")

            # Check average tags per category
            tag_counts = [len(t) for t in tags.values() if isinstance(t, list)]
            avg_tags = sum(tag_counts) / len(tag_counts) if tag_counts else 0

            print(f"\nAverage tags per category: {avg_tags:.1f}")

            if avg_tags < 10:
                print("⚠️ Very few tags - this will reduce accuracy")
                return ["Too few tags per category"]
            else:
                print("βœ… Tags look good")
                return []
        else:
            print("❌ tags.json is empty!")
            return ["Empty tags.json"]

    except Exception as e:
        print(f"❌ Error: {e}")
        return [f"tags.json error: {e}"]
234
+
235
+
236
def test_prediction():
    """Run one end-to-end sample query against the trained index.

    Encodes a fixed test query, searches the FAISS index and prints the
    top-5 hits, then grades the best score. Returns a list of issue strings
    (empty when confidence looks healthy).
    """
    print("\n" + "="*80)
    print("πŸ” STEP 6: TESTING PREDICTION")
    print("="*80 + "\n")

    try:
        print("Loading model...")
        encoder = SentenceTransformer('intfloat/e5-base-v2')

        print("Loading FAISS index...")
        index = faiss.read_index('cache/main_index.faiss')

        print("Loading metadata...")
        with open('cache/metadata.pkl', 'rb') as f:
            metadata = pickle.load(f)

        # e5 models expect the "query: " prefix on search queries
        test_query = "query: built in dishwasher"

        print(f"\nTest query: \"{test_query}\"")
        print("Encoding...")

        query_emb = encoder.encode(test_query, convert_to_numpy=True, normalize_embeddings=True)
        if query_emb.ndim == 1:
            query_emb = query_emb.reshape(1, -1)

        print("Searching...")
        distances, indices = index.search(query_emb.astype('float32'), 5)

        print("\nTop 5 results:")
        for i, idx in enumerate(indices[0]):
            # Fixed: skip FAISS's -1 padding (index smaller than 5 vectors)
            # and out-of-range ids instead of wrongly printing metadata[-1]
            if idx < 0 or idx >= len(metadata):
                continue
            score = distances[0][i]
            meta = metadata[idx]

            print(f"\n{i+1}. Score: {score:.4f}")
            print(f" ID: {meta.get('category_id', 'N/A')}")
            print(f" Path: {meta.get('category_path', 'N/A')[:60]}...")

        best_score = float(distances[0][0])

        if best_score < 0.3:
            print(f"\n❌ VERY LOW CONFIDENCE: {best_score:.4f}")
            print(" This indicates a serious problem with training!")
            return ["Very low prediction scores"]
        elif best_score < 0.5:
            print(f"\n⚠️ LOW CONFIDENCE: {best_score:.4f}")
            print(" Model needs improvement")
            return ["Low prediction scores"]
        else:
            print(f"\nβœ… GOOD CONFIDENCE: {best_score:.4f}")
            return []

    except Exception as e:
        print(f"\n❌ Prediction test failed: {e}")
        import traceback
        traceback.print_exc()
        return [f"Prediction error: {e}"]
295
+
296
+
297
def generate_fix_commands(all_issues):
    """Generate commands to fix issues.

    Prints a summary of *all_issues* (list of strings from the check_*
    functions) and, when any issue implies broken/missing artifacts, the
    exact retraining command sequence. Purely informational — returns None.
    """
    print("\n" + "="*80)
    print("πŸ”§ RECOMMENDED FIXES")
    print("="*80 + "\n")

    if not all_issues:
        print("βœ… No critical issues found!")
        print("\nIf you're still experiencing low confidence:")
        print(" 1. Make sure you're using tags.json")
        print(" 2. Check if validation.csv is being used for calibration")
        print(" 3. Verify CSV has correct column names")
        return

    print("Issues found:")
    for i, issue in enumerate(all_issues, 1):
        print(f" {i}. {issue}")

    print("\n" + "="*80)
    print("FIX STEPS:")
    print("="*80 + "\n")

    # Keyword heuristic: "missing"/"mismatch"/"low" in any issue string
    # means the cached artifacts are unusable and retraining is required
    if any('missing' in issue.lower() or 'mismatch' in issue.lower() or 'low' in issue.lower() for issue in all_issues):
        print("πŸ”„ RE-TRAINING REQUIRED")
        print("\nRun these commands in order:\n")

        print("# Step 1: Generate tags (if missing)")
        print("python generate_hybrid_tags.py data/category_only_path.csv data/tags.json")
        print()

        print("# Step 2: Generate validation data (for calibration)")
        print("python create_validation_data.py auto data/category_only_path.csv 200")
        print()

        print("# Step 3: Train with ALL fixes")
        print("python train_fixed_v2.py data/category_only_path.csv data/tags.json data/validation.csv")
        print()
    else:
        print("βœ… No retraining needed - minor issues only")
336
+
337
+
338
def main():
    """Run every diagnostic check in order and print the fix summary."""
    print("\n" + "="*80)
    print("πŸ”§ DIAGNOSTIC AND FIX TOOL")
    print("="*80)
    print("\nThis tool will analyze your model and identify issues\n")

    # The check order matters for readable output (STEP 1..6 banners)
    checks = (
        check_cache_files,
        check_csv_format,
        check_metadata,
        check_faiss_index,
        check_tags_json,
        test_prediction,
    )

    all_issues = []
    for check in checks:
        all_issues.extend(check())

    # Turn the collected issues into actionable commands
    generate_fix_commands(all_issues)

    print("\n" + "="*80)
    print("πŸ“Š DIAGNOSIS COMPLETE")
    print("="*80)
    print(f"\nTotal issues found: {len(all_issues)}")
    print("\n")
362
+
363
+
364
# Script entry point: run the full diagnostic pass
if __name__ == "__main__":
    main()
fix.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ πŸ”§ AUTOMATIC EMBEDDING & INDEX FIXER
3
+ ====================================
4
+ Fixes common issues causing low confidence scores
5
+
6
+ Usage:
7
+ python fix_embeddings.py normalize # Fix normalization
8
+ python fix_embeddings.py rebuild-index # Rebuild FAISS
9
+ python fix_embeddings.py full-fix # Do everything
10
+ """
11
+
12
+ import numpy as np
13
+ import faiss
14
+ import pickle
15
+ import sys
16
+ from pathlib import Path
17
+ from tqdm import tqdm
18
+ import warnings
19
+ warnings.filterwarnings('ignore')
20
+
21
class EmbeddingFixer:
    """Repair cached embeddings and the FAISS index.

    Addresses the two common causes of low-confidence search scores:
    embedding vectors that are not unit-length, and an index built with a
    metric other than inner product. Destructive operations back up the
    originals into ``cache/backup`` first.
    """

    def __init__(self, cache_dir='cache'):
        # Directory holding embeddings.npy, main_index.faiss and metadata.pkl.
        self.cache_dir = Path(cache_dir)

    def banner(self, text):
        """Print a framed section header."""
        print("\n" + "="*80)
        print(f"πŸ”§ {text}")
        print("="*80 + "\n")

    def backup_files(self):
        """Copy the cache artifacts to cache/backup (missing files are skipped)."""
        self.banner("CREATING BACKUPS")

        backup_dir = self.cache_dir / 'backup'
        backup_dir.mkdir(exist_ok=True)

        files_to_backup = [
            'embeddings.npy',
            'main_index.faiss',
            'metadata.pkl'
        ]

        import shutil  # hoisted: was re-imported on every loop iteration

        for filename in files_to_backup:
            src = self.cache_dir / filename
            if src.exists():
                dst = backup_dir / filename
                shutil.copy2(src, dst)
                # BUG FIX: the original printed the literal "(unknown)"
                # instead of the name of the file that was backed up.
                print(f"βœ… Backed up: {filename}")

        print(f"\nπŸ“ Backups saved to: {backup_dir}")

    def normalize_embeddings(self):
        """L2-normalize embeddings.npy in place. Returns True on success."""
        self.banner("NORMALIZING EMBEDDINGS")

        emb_path = self.cache_dir / 'embeddings.npy'

        if not emb_path.exists():
            print("❌ embeddings.npy not found!")
            return False

        print("Loading embeddings...")
        embeddings = np.load(emb_path)

        print(f"Original shape: {embeddings.shape}")

        # keepdims=True so the norms broadcast row-wise during division.
        norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
        print(f"Mean norm before: {norms.mean():.6f}")
        print(f"Std norm before: {norms.std():.6f}")

        print("\nNormalizing...")
        # Epsilon guards against division by zero for any all-zero row.
        embeddings_normalized = embeddings / (norms + 1e-8)

        norms_after = np.linalg.norm(embeddings_normalized, axis=1)
        print(f"Mean norm after: {norms_after.mean():.6f}")
        print(f"Std norm after: {norms_after.std():.6f}")

        # float32 is what FAISS expects and halves file size vs float64.
        output_path = self.cache_dir / 'embeddings.npy'
        np.save(output_path, embeddings_normalized.astype('float32'))
        print(f"\nβœ… Saved normalized embeddings: {output_path}")

        return True

    def rebuild_faiss_index(self):
        """Rebuild main_index.faiss with the inner-product metric."""
        self.banner("REBUILDING FAISS INDEX")

        emb_path = self.cache_dir / 'embeddings.npy'

        if not emb_path.exists():
            print("❌ embeddings.npy not found!")
            return False

        print("Loading embeddings...")
        embeddings = np.load(emb_path).astype('float32')

        print(f"Shape: {embeddings.shape}")

        # Inner product equals cosine similarity only for unit vectors,
        # so normalize here in case normalize_embeddings() was skipped.
        norms = np.linalg.norm(embeddings, axis=1)
        if abs(norms.mean() - 1.0) > 0.01:
            print("⚠️ Embeddings not normalized, normalizing now...")
            embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
            np.save(emb_path, embeddings)

        dimension = embeddings.shape[1]

        print(f"\nBuilding FAISS index...")
        print(f" Dimension: {dimension}")
        print(f" Vectors: {len(embeddings):,}")
        print(f" Metric: INNER_PRODUCT")

        # Flat (exact, non-quantized) index with inner-product scoring.
        index = faiss.IndexFlatIP(dimension)

        print("\nAdding vectors...")
        index.add(embeddings)

        index_path = self.cache_dir / 'main_index.faiss'
        faiss.write_index(index, str(index_path))

        print(f"\nβœ… Saved FAISS index: {index_path}")
        print(f" Total vectors: {index.ntotal:,}")

        return True

    def verify_fixes(self):
        """Sanity-check normalization, index metric, and a self-match search."""
        self.banner("VERIFYING FIXES")

        try:
            embeddings = np.load(self.cache_dir / 'embeddings.npy')
            norms = np.linalg.norm(embeddings, axis=1)

            print("πŸ“Š Embeddings:")
            print(f" Mean norm: {norms.mean():.6f}")
            print(f" Std norm: {norms.std():.6f}")

            if abs(norms.mean() - 1.0) < 0.01 and norms.std() < 0.01:
                print(" βœ… Properly normalized")
            else:
                print(" ❌ Still not normalized properly")
                return False

            index = faiss.read_index(str(self.cache_dir / 'main_index.faiss'))

            print(f"\nπŸ“Š FAISS Index:")
            print(f" Vectors: {index.ntotal:,}")
            print(f" Dimension: {index.d}")

            metric = index.metric_type
            if metric == faiss.METRIC_INNER_PRODUCT:
                print(" βœ… Using INNER_PRODUCT")
            else:
                print(f" ❌ Wrong metric: {metric}")
                return False

            # Searching with a stored vector should return itself with
            # near-perfect similarity (~1.0 for normalized vectors).
            print("\nπŸ” Testing search...")
            query = embeddings[0:1]
            distances, indices = index.search(query, 5)

            print(f" Top result index: {indices[0][0]}")
            print(f" Top result score: {distances[0][0]:.6f}")

            if distances[0][0] > 0.95:  # Should match itself almost perfectly
                print(" βœ… Search working correctly")
            else:
                print(" ⚠️ Unexpected similarity score")

            print("\nβœ… ALL CHECKS PASSED!")
            return True

        except Exception as e:
            print(f"\n❌ Verification failed: {e}")
            return False

    def full_fix(self):
        """Backup, normalize, rebuild and verify in one pass."""
        self.banner("RUNNING FULL FIX")

        print("This will:")
        print("1. Backup existing files")
        print("2. Normalize embeddings")
        print("3. Rebuild FAISS index")
        print("4. Verify fixes")

        # Give the user a moment to abort with Ctrl-C.
        print("\nStarting in 3 seconds...")
        import time
        time.sleep(3)

        self.backup_files()

        if not self.normalize_embeddings():
            print("\n❌ Failed to normalize embeddings")
            return False

        if not self.rebuild_faiss_index():
            print("\n❌ Failed to rebuild index")
            return False

        if not self.verify_fixes():
            print("\n❌ Fixes did not work properly")
            return False

        print("\n" + "="*80)
        print("βœ… ALL FIXES COMPLETED SUCCESSFULLY!")
        print("="*80)
        print("\nNext steps:")
        print("1. Restart your API server: python api_server.py")
        print("2. Test classification with a known category")
        print("3. Check confidence scores")
        print("\nIf issues persist, run diagnostics:")
        print(" python diagnose_and_fix.py")
        print("="*80 + "\n")

        return True
+
232
+
233
def main():
    """Parse the single CLI argument and dispatch to the matching fixer action."""
    if len(sys.argv) < 2:
        # No command given: print usage and bail out with a failure code.
        print("\n" + "="*80)
        print("πŸ”§ EMBEDDING & INDEX FIXER")
        print("="*80)
        print("\nUsage:")
        print(" python fix_embeddings.py normalize # Fix normalization only")
        print(" python fix_embeddings.py rebuild-index # Rebuild FAISS index")
        print(" python fix_embeddings.py full-fix # Do everything (recommended)")
        print("\nExample:")
        print(" python fix_embeddings.py full-fix")
        print("="*80 + "\n")
        sys.exit(1)

    command = sys.argv[1].lower()
    fixer = EmbeddingFixer()

    if command == 'full-fix':
        fixer.full_fix()
    elif command in ('normalize', 'rebuild-index'):
        # Both single-step commands share the backup/verify bracket.
        fixer.backup_files()
        if command == 'normalize':
            fixer.normalize_embeddings()
        else:
            fixer.rebuild_faiss_index()
        fixer.verify_fixes()
    else:
        print(f"❌ Unknown command: {command}")
        print("Use: normalize, rebuild-index, or full-fix")
        sys.exit(1)


if __name__ == "__main__":
    main()
gradio_app.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Gradio App for Product Category Classification
4
+ Model: intfloat/e5-base-v2 (must match training)
5
+ Requires: pip install gradio sentence-transformers faiss-cpu numpy pickle5
6
+ """
7
+
8
+ import gradio as gr
9
+ from sentence_transformers import SentenceTransformer
10
+ import faiss
11
+ import pickle
12
+ import numpy as np
13
+ import re
14
+ from pathlib import Path
15
+ import time
16
+
17
# ====================================================================
# CONFIG
# ====================================================================
CACHE_DIR = Path("cache")                           # root for cached artifacts
MODEL_NAME = "intfloat/e5-base-v2"                  # must match the training-time encoder
FAISS_INDEX_PATH = CACHE_DIR / "main_index.faiss"   # vector index over categories
METADATA_PATH = CACHE_DIR / "metadata.pkl"          # per-category metadata list
SYN_PATH = CACHE_DIR / "cross_store_synonyms.pkl"   # optional pre-built synonym map

# Module-level singletons; populated once by load_model().
encoder = None
faiss_index = None
metadata = []
cross_store_synonyms = {}
+
31
+ # ====================================================================
32
+ # UTILITIES
33
+ # ====================================================================
34
def clean_text(text: str) -> str:
    """Lowercase *text*, replace punctuation with spaces, collapse whitespace.

    Hyphens and word characters are kept; falsy input yields "".
    """
    if not text:
        return ""
    lowered = str(text).lower()
    depunct = re.sub(r"[^\w\s-]", " ", lowered)
    return re.sub(r"\s+", " ", depunct).strip()
41
+
42
def build_cross_store_synonyms():
    """Build a bidirectional synonym map: term -> set of equivalent terms.

    Each seed entry is expanded so every synonym also maps back to the seed
    term and to its sibling synonyms.

    BUG FIX: the original did ``expanded[term] = set(syns)`` for each seed,
    which clobbered any synonym set already accumulated when *term* had
    appeared earlier as another entry's synonym (e.g. 'laundry machine'
    lost 'washing appliance' inherited from 'washing machine'). Buckets are
    now merged instead of overwritten.
    """
    synonyms = {
        'washing machine': {'laundry machine', 'washer', 'clothes washer', 'washing appliance'},
        'laundry machine': {'washing machine', 'washer', 'clothes washer'},
        'dryer': {'drying machine', 'clothes dryer', 'tumble dryer'},
        'refrigerator': {'fridge', 'cooler', 'ice box', 'cooling appliance'},
        'dishwasher': {'dish washer', 'dish cleaning machine'},
        'microwave': {'microwave oven', 'micro wave'},
        'vacuum': {'vacuum cleaner', 'hoover', 'vac'},
        'tv': {'television', 'telly', 'smart tv', 'display'},
        'laptop': {'notebook', 'portable computer', 'laptop computer'},
        'mobile': {'phone', 'cell phone', 'smartphone', 'cellphone'},
        'tablet': {'ipad', 'tab', 'tablet computer'},
        'headphones': {'headset', 'earphones', 'earbuds', 'ear buds'},
        'speaker': {'audio speaker', 'sound system', 'speakers'},
        'sofa': {'couch', 'settee', 'divan'},
        'wardrobe': {'closet', 'armoire', 'cupboard'},
        'drawer': {'chest of drawers', 'dresser'},
        'pants': {'trousers', 'slacks', 'bottoms'},
        'sweater': {'jumper', 'pullover', 'sweatshirt'},
        'sneakers': {'trainers', 'tennis shoes', 'running shoes'},
        'jacket': {'coat', 'blazer', 'outerwear'},
        'cooker': {'stove', 'range', 'cooking range'},
        'blender': {'mixer', 'food processor', 'liquidizer'},
        'kettle': {'electric kettle', 'water boiler'},
        'stroller': {'pram', 'pushchair', 'buggy', 'baby carriage'},
        'diaper': {'nappy', 'nappies'},
        'pacifier': {'dummy', 'soother'},
        'wrench': {'spanner', 'adjustable wrench'},
        'flashlight': {'torch', 'flash light'},
        'screwdriver': {'screw driver'},
        'tap': {'faucet', 'water tap'},
        'bin': {'trash can', 'garbage can', 'waste bin'},
        'curtain': {'drape', 'window covering'},
        'guillotine': {'paper cutter', 'paper trimmer', 'blade cutter'},
        'trimmer': {'cutter', 'cutting tool', 'edge cutter'},
        'stapler': {'stapling machine', 'staple gun'},
        'magazine': {'periodical', 'journal', 'publication'},
        'comic': {'comic book', 'graphic novel', 'manga'},
        'ebook': {'e-book', 'digital book', 'electronic book'},
        'kids': {'children', 'child', 'childrens', 'youth', 'junior'},
        'women': {'womens', 'ladies', 'female', 'lady'},
        'men': {'mens', 'male', 'gentleman'},
        'baby': {'infant', 'newborn', 'toddler'},
    }

    expanded = {}
    for term, syns in synonyms.items():
        # Merge (not overwrite) so earlier reverse-links are preserved.
        expanded.setdefault(term, set()).update(syns)
        for syn in syns:
            bucket = expanded.setdefault(syn, set())
            bucket.add(term)                 # link back to the seed term
            bucket.update(syns - {syn})      # link to sibling synonyms
    return expanded
97
+
98
def extract_cross_store_terms(text: str):
    """Collect the cleaned text, its tokens, bigrams/trigrams and known synonyms."""
    cleaned = clean_text(text)
    tokens = cleaned.split()

    terms = {cleaned}

    # Single tokens plus their synonym expansions.
    # NOTE(review): the synonym lookup is gated by len > 2, so 2-letter
    # keys such as 'tv' are never expanded here — confirm intent.
    for token in tokens:
        if len(token) > 2:
            terms.add(token)
            if token in cross_store_synonyms:
                terms.update(cross_store_synonyms[token])

    # Bigrams, with synonym expansion when the phrase is a known key.
    for left, right in zip(tokens, tokens[1:]):
        phrase = f"{left} {right}"
        terms.add(phrase)
        if phrase in cross_store_synonyms:
            terms.update(cross_store_synonyms[phrase])

    # Trigrams are added verbatim (no synonym lookup at this length).
    for a, b, c in zip(tokens, tokens[1:], tokens[2:]):
        terms.add(f"{a} {b} {c}")

    return list(terms)
117
+
118
def build_enhanced_query(title, description="", max_synonyms=10):
    """Build the retrieval query: title weighted 3x plus synonym expansions.

    Returns (query_string, first_20_expansion_terms).
    """
    title_clean = clean_text(title)
    desc_clean = clean_text(description)
    expansions = extract_cross_store_terms(f"{title_clean} {desc_clean}")
    # Repeat the title to bias the embedding toward the product name.
    parts = [title_clean, title_clean, title_clean] + expansions[:max_synonyms]
    return ' '.join(parts), expansions[:20]
124
+
125
def encode_query(text: str):
    """Encode *text* into a (1, dim) float32 L2-normalized embedding."""
    vec = encoder.encode(text, convert_to_numpy=True, normalize_embeddings=True)
    # FAISS expects a 2-D float32 batch even for a single query.
    return np.atleast_2d(vec).astype('float32')
130
+
131
+ # ====================================================================
132
+ # CLASSIFICATION
133
+ # ====================================================================
134
def classify_product(title, description="", top_k=5):
    """Classify a product against the FAISS category index.

    Returns a dict with the best category, a confidence label, the matched
    synonym terms, the raw top-k candidates and the processing time.
    """
    start_time = time.time()
    query_text, matched_terms = build_enhanced_query(title, description)
    # NOTE(review): e5 models are documented to expect a "query: " prefix at
    # inference time (the diagnostics script searches with one) — confirm
    # how the index passages were encoded before adding it here.
    query_embedding = encode_query(query_text)
    distances, indices = faiss_index.search(query_embedding, top_k)

    results = []
    for i, idx in enumerate(indices[0]):
        if idx >= len(metadata):
            continue
        meta = metadata[idx]
        # BUG FIX: the index is an IndexFlatIP over L2-normalized vectors,
        # so the returned "distance" already IS the cosine similarity.
        # The old `1 - distance` inverted the confidence scale (good
        # matches reported low confidence).
        similarity = float(distances[0][i])
        confidence_pct = similarity * 100
        final_product = meta.get('levels', [])[-1] if meta.get('levels') else meta['category_path'].split('/')[-1]
        results.append({
            'rank': i+1,
            'category_id': str(meta['category_id']),
            'category_path': meta['category_path'],
            'final_product': final_product,
            'confidence': round(confidence_pct, 2),
            'depth': meta.get('depth', 0)
        })

    if not results:
        return {
            'error': 'No results found',
            'product': title
        }

    # Map the best score onto a human-readable confidence bucket.
    best = results[0]
    conf_pct = best['confidence']
    if conf_pct >= 90:
        conf_level = "EXCELLENT"
    elif conf_pct >= 85:
        conf_level = "VERY HIGH"
    elif conf_pct >= 80:
        conf_level = "HIGH"
    elif conf_pct >= 75:
        conf_level = "GOOD"
    elif conf_pct >= 70:
        conf_level = "MEDIUM"
    else:
        conf_level = "LOW"

    processing_time = (time.time() - start_time) * 1000

    return {
        'product': title,
        'category_id': best['category_id'],
        'category_path': best['category_path'],
        'final_product': best['final_product'],
        'confidence': f"{conf_level} ({conf_pct:.2f}%)",
        'confidence_percent': conf_pct,
        'depth': best['depth'],
        'matched_terms': matched_terms,
        'top_5_results': results,
        'processing_time_ms': round(processing_time, 2)
    }
192
+
193
+ # ====================================================================
194
+ # LOAD MODEL & INDEX
195
+ # ====================================================================
196
def load_model():
    """Populate the module-level encoder, index, metadata and synonym map."""
    global encoder, faiss_index, metadata, cross_store_synonyms

    print("Loading sentence-transformer model...")
    encoder = SentenceTransformer(MODEL_NAME)
    print("Model loaded.")

    print("Loading FAISS index...")
    faiss_index = faiss.read_index(str(FAISS_INDEX_PATH))
    print(f"FAISS index loaded: {faiss_index.ntotal} vectors.")

    print("Loading metadata...")
    metadata = pickle.loads(METADATA_PATH.read_bytes())
    print(f"Metadata loaded: {len(metadata)} categories.")

    # Prefer the pre-built synonym pickle; fall back to the in-code defaults.
    print("Loading cross-store synonyms...")
    if SYN_PATH.exists():
        cross_store_synonyms = pickle.loads(SYN_PATH.read_bytes())
        print(f"Loaded {len(cross_store_synonyms)} synonyms from file.")
    else:
        cross_store_synonyms = build_cross_store_synonyms()
        print(f"Built {len(cross_store_synonyms)} default synonyms.")
219
+
220
+ # ====================================================================
221
+ # GRADIO FUNCTION
222
+ # ====================================================================
223
def classify_gradio(title, description=""):
    """Adapter: map classify_product() output onto the five Gradio fields."""
    result = classify_product(title, description)

    matched = result.get('matched_terms')
    alternatives = ''.join(
        f"{item['rank']}. {item['final_product']} (ID: {item['category_id']}, Confidence: {item['confidence']}%)\n"
        for item in result.get('top_5_results', [])
    )

    return (
        str(result.get('final_product', '')),
        str(result.get('category_path', '')),
        str(result.get('confidence', '')),
        ', '.join(matched) if matched else '',
        alternatives,
    )
233
+
234
+ # ====================================================================
235
+ # MAIN GRADIO APP
236
+ # ====================================================================
237
def main():
    """Load all artifacts, then serve the Gradio classification UI."""
    load_model()

    input_widgets = [
        gr.Textbox(label="Product Title"),
        gr.Textbox(label="Description"),
    ]
    output_widgets = [
        gr.Textbox(label="Predicted Product"),
        gr.Textbox(label="Category Path"),
        gr.Textbox(label="Confidence"),
        gr.Textbox(label="Matched Terms"),
        gr.Textbox(label="Top 5 Alternatives"),
    ]

    iface = gr.Interface(
        fn=classify_gradio,
        inputs=input_widgets,
        outputs=output_widgets,
        title="🎯 Product Category Classifier",
        description="Classify products with full cross-store synonyms and embeddings",
    )
    # Launch with a public shareable link
    iface.launch(share=True)


if __name__ == "__main__":
    main()
miss.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ πŸ”¬ ADVANCED MODEL DIAGNOSTICS & AUTOMATIC FIXES
3
+ ===============================================
4
+ Diagnoses and fixes common issues causing low confidence/accuracy
5
+
6
+ Usage:
7
+ python diagnose_and_fix.py
8
+ """
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ import pickle
13
+ import json
14
+ import faiss
15
+ from pathlib import Path
16
+ from sentence_transformers import SentenceTransformer
17
+ from collections import defaultdict, Counter
18
+ from tqdm import tqdm
19
+ import warnings
20
+ warnings.filterwarnings('ignore')
21
+
22
class ModelDiagnostics:
    """Run a battery of health checks against the cached model artifacts.

    Each check prints its findings and appends structured issue records
    (type/issue/details/fix) to ``self.issues``; a report and fix
    suggestions are produced at the end.
    """

    def __init__(self, cache_dir='cache', data_dir='data'):
        self.cache_dir = Path(cache_dir)   # embeddings / index / metadata
        self.data_dir = Path(data_dir)     # source CSVs
        self.issues = []                   # accumulated issue dicts
        self.fixes_applied = []            # reserved for future auto-fixes

    def banner(self, text):
        """Print a framed section header."""
        print("\n" + "="*80)
        print(f"πŸ” {text}")
        print("="*80 + "\n")

    def check_embedding_normalization(self):
        """Verify every embedding row has (near) unit L2 norm."""
        self.banner("CHECKING EMBEDDING NORMALIZATION")

        try:
            embeddings = np.load(self.cache_dir / 'embeddings.npy')

            norms = np.linalg.norm(embeddings, axis=1)

            print(f"πŸ“Š Embedding Statistics:")
            print(f" Shape: {embeddings.shape}")
            print(f" Mean norm: {norms.mean():.6f}")
            print(f" Std norm: {norms.std():.6f}")
            print(f" Min norm: {norms.min():.6f}")
            print(f" Max norm: {norms.max():.6f}")

            # Should be ~1.0 with near-zero spread if normalized.
            if abs(norms.mean() - 1.0) > 0.01 or norms.std() > 0.01:
                self.issues.append({
                    'type': 'CRITICAL',
                    'issue': 'Embeddings not normalized',
                    'details': f'Mean norm: {norms.mean():.6f} (should be ~1.0)',
                    'fix': 'Re-normalize embeddings'
                })
                print(" ❌ ISSUE: Embeddings are NOT normalized!")
                print(" This causes incorrect similarity scores")
                return False
            else:
                print(" βœ… Embeddings properly normalized")
                return True

        except Exception as e:
            print(f" ❌ Error: {e}")
            return False

    def check_faiss_metric(self):
        """Verify the index was built with METRIC_INNER_PRODUCT."""
        self.banner("CHECKING FAISS INDEX METRIC")

        try:
            index = faiss.read_index(str(self.cache_dir / 'main_index.faiss'))

            metric = index.metric_type

            print(f"πŸ“Š FAISS Index:")
            print(f" Vectors: {index.ntotal:,}")
            print(f" Dimension: {index.d}")
            print(f" Metric type: {metric}")

            if metric == faiss.METRIC_INNER_PRODUCT:
                print(" βœ… Using INNER_PRODUCT (correct for normalized vectors)")
                return True
            elif metric == faiss.METRIC_L2:
                self.issues.append({
                    'type': 'CRITICAL',
                    'issue': 'Wrong FAISS metric',
                    'details': 'Using L2 distance instead of inner product',
                    'fix': 'Rebuild index with METRIC_INNER_PRODUCT'
                })
                print(" ❌ ISSUE: Using L2 distance!")
                print(" Should use INNER_PRODUCT for normalized vectors")
                return False
            else:
                print(f" ⚠️ Unknown metric: {metric}")
                return False

        except Exception as e:
            print(f" ❌ Error: {e}")
            return False

    def check_text_weighting(self):
        """Spot-check one metadata record for auto-tags and structure."""
        self.banner("CHECKING TEXT CONSTRUCTION")

        try:
            with open(self.cache_dir / 'metadata.pkl', 'rb') as f:
                metadata = pickle.load(f)

            # Only the first record is inspected as a representative sample.
            sample = metadata[0]

            print(f"πŸ“Š Sample Category:")
            print(f" ID: {sample.get('category_id')}")
            print(f" Path: {sample.get('category_path')}")
            print(f" Depth: {sample.get('depth')}")
            print(f" Levels: {sample.get('levels')}")

            if 'auto_tags' in sample and sample['auto_tags']:
                print(f" Tags: {len(sample['auto_tags'])} tags")
                print(f" Sample tags: {sample['auto_tags'][:5]}")
                print(" βœ… Auto-tags present")
            else:
                self.issues.append({
                    'type': 'WARNING',
                    'issue': 'Missing auto-tags',
                    'details': 'Categories lack auto-generated tags',
                    'fix': 'Generate tags from category paths'
                })
                print(" ⚠️ No auto-tags found")

            return True

        except Exception as e:
            print(f" ❌ Error: {e}")
            return False

    def test_predictions(self, num_samples=100):
        """Query the index with leaf category names and measure top-1 recall."""
        self.banner("TESTING PREDICTION ACCURACY")

        try:
            print("Loading model and index...")
            encoder = SentenceTransformer('intfloat/e5-base-v2')
            index = faiss.read_index(str(self.cache_dir / 'main_index.faiss'))

            with open(self.cache_dir / 'metadata.pkl', 'rb') as f:
                metadata = pickle.load(f)

            csv_files = list(self.data_dir.glob('*.csv'))
            if not csv_files:
                print(" ❌ No CSV files found in data/")
                return False

            df = pd.read_csv(csv_files[0])

            # Random sample of categories to probe.
            samples = df.sample(min(num_samples, len(df)))

            correct = 0
            confidence_scores = []
            rank_positions = []

            print(f"Testing {len(samples)} random categories...\n")

            for row_idx, row in tqdm(samples.iterrows(), total=len(samples)):
                cat_id = str(row.iloc[0])   # First column
                cat_path = str(row.iloc[1])  # Second column

                # Leaf category (final product name) is used as the query.
                leaf = cat_path.split('/')[-1].strip()

                # e5 models expect the "query: " prefix at inference time.
                query = f"query: {leaf}"

                query_emb = encoder.encode(query, normalize_embeddings=True)
                query_emb = query_emb.reshape(1, -1).astype('float32')

                distances, indices = index.search(query_emb, 10)

                # BUG FIX: the inner loop variable was named `idx`,
                # shadowing the outer iterrows() index; renamed to `hit`.
                found_rank = None
                for rank, hit in enumerate(indices[0]):
                    pred_id = str(metadata[hit]['category_id'])
                    if pred_id == cat_id:
                        found_rank = rank + 1
                        correct += 1
                        confidence_scores.append(float(distances[0][rank]))
                        break

                if found_rank:
                    rank_positions.append(found_rank)
                else:
                    rank_positions.append(11)  # Not in top 10

            accuracy = (correct / len(samples)) * 100
            avg_confidence = np.mean(confidence_scores) if confidence_scores else 0

            print(f"\nπŸ“Š Results:")
            print(f" Accuracy (Top-1): {accuracy:.2f}%")
            print(f" Correct predictions: {correct}/{len(samples)}")
            print(f" Average confidence: {avg_confidence:.4f}")

            if confidence_scores:
                print(f" Min confidence: {min(confidence_scores):.4f}")
                print(f" Max confidence: {max(confidence_scores):.4f}")

            rank_counts = Counter(rank_positions)
            print(f"\n Rank Distribution:")
            for rank in sorted(rank_counts.keys())[:5]:
                count = rank_counts[rank]
                pct = (count / len(samples)) * 100
                print(f" Rank {rank}: {count} ({pct:.1f}%)")

            if accuracy < 70:
                self.issues.append({
                    'type': 'CRITICAL',
                    'issue': 'Low prediction accuracy',
                    'details': f'Only {accuracy:.1f}% accuracy',
                    'fix': 'Retrain with better text weighting'
                })
                print(f"\n ❌ ISSUE: Low accuracy ({accuracy:.1f}%)")
                return False
            elif accuracy < 85:
                self.issues.append({
                    'type': 'WARNING',
                    'issue': 'Moderate accuracy',
                    'details': f'Accuracy: {accuracy:.1f}%',
                    'fix': 'Consider retraining with optimizations'
                })
                print(f"\n ⚠️ Moderate accuracy ({accuracy:.1f}%)")
                return True
            else:
                print(f"\n βœ… Good accuracy ({accuracy:.1f}%)")
                return True

        except Exception as e:
            print(f" ❌ Error: {e}")
            import traceback
            traceback.print_exc()
            return False

    def analyze_category_distribution(self):
        """Summarize category depths and warn on large depth spread."""
        self.banner("ANALYZING CATEGORY STRUCTURE")

        try:
            with open(self.cache_dir / 'metadata.pkl', 'rb') as f:
                metadata = pickle.load(f)

            depths = [m.get('depth', 0) for m in metadata]

            print(f"πŸ“Š Category Structure:")
            print(f" Total categories: {len(metadata):,}")
            print(f" Average depth: {np.mean(depths):.2f}")
            print(f" Min depth: {min(depths)}")
            print(f" Max depth: {max(depths)}")

            depth_counts = Counter(depths)
            print(f"\n Depth Distribution:")
            for depth in sorted(depth_counts.keys())[:8]:
                count = depth_counts[depth]
                pct = (count / len(metadata)) * 100
                print(f" Depth {depth}: {count:,} ({pct:.1f}%)")

            # A wide depth spread can bias similarity toward shallow paths.
            if max(depths) - min(depths) > 5:
                self.issues.append({
                    'type': 'WARNING',
                    'issue': 'Large depth variation',
                    'details': f'Depth ranges from {min(depths)} to {max(depths)}',
                    'fix': 'Consider depth-based weighting'
                })
                print(f"\n ⚠️ Large depth variation detected")

            return True

        except Exception as e:
            print(f" ❌ Error: {e}")
            return False

    def check_duplicate_embeddings(self):
        """Sample embeddings and count near-identical pairs (>0.99 cosine)."""
        self.banner("CHECKING FOR DUPLICATE EMBEDDINGS")

        try:
            embeddings = np.load(self.cache_dir / 'embeddings.npy')

            # Sample check (checking all pairs would be O(n^2) and too slow).
            sample_size = min(1000, len(embeddings))
            sample_indices = np.random.choice(len(embeddings), sample_size, replace=False)
            sample_embs = embeddings[sample_indices]

            similarities = np.dot(sample_embs, sample_embs.T)

            # Zero the diagonal so self-similarity is not counted.
            np.fill_diagonal(similarities, 0)
            high_sim = (similarities > 0.99).sum() // 2  # matrix is symmetric

            print(f"πŸ“Š Duplicate Check (sample of {sample_size}):")
            print(f" Very similar pairs (>0.99): {high_sim}")

            if high_sim > sample_size * 0.05:  # >5% duplicates
                self.issues.append({
                    'type': 'WARNING',
                    'issue': 'Many duplicate embeddings',
                    'details': f'{high_sim} pairs with >0.99 similarity',
                    'fix': 'Check for duplicate categories or improve text diversity'
                })
                print(f" ⚠️ Many near-duplicates detected")
                return False
            else:
                print(f" βœ… Low duplicate rate")
                return True

        except Exception as e:
            print(f" ❌ Error: {e}")
            return False

    def generate_report(self):
        """Print all accumulated issues grouped by severity."""
        self.banner("DIAGNOSTIC REPORT")

        if not self.issues:
            print("βœ… NO ISSUES FOUND!")
            print("\nYour model appears to be properly configured.")
            return

        critical = [i for i in self.issues if i['type'] == 'CRITICAL']
        warnings = [i for i in self.issues if i['type'] == 'WARNING']

        if critical:
            print("πŸ”΄ CRITICAL ISSUES:")
            for i, issue in enumerate(critical, 1):
                print(f"\n{i}. {issue['issue']}")
                print(f" Details: {issue['details']}")
                print(f" Fix: {issue['fix']}")

        if warnings:
            print("\n🟑 WARNINGS:")
            for i, issue in enumerate(warnings, 1):
                print(f"\n{i}. {issue['issue']}")
                print(f" Details: {issue['details']}")
                print(f" Fix: {issue['fix']}")

        print(f"\nπŸ“Š Summary:")
        print(f" Critical issues: {len(critical)}")
        print(f" Warnings: {len(warnings)}")

    def suggest_fixes(self):
        """Print the shell commands matching the issues that were found."""
        self.banner("RECOMMENDED FIXES")

        if not self.issues:
            print("βœ… No fixes needed!")
            return

        print("Run these commands to fix issues:\n")

        critical = [i for i in self.issues if i['type'] == 'CRITICAL']

        if any('normalization' in i['issue'].lower() for i in critical):
            print("1️⃣ Fix embedding normalization:")
            print(" python fix_embeddings.py normalize")
            print()

        if any('faiss' in i['issue'].lower() for i in critical):
            print("2️⃣ Rebuild FAISS index with correct metric:")
            print(" python fix_embeddings.py rebuild-index")
            print()

        if any('accuracy' in i['issue'].lower() for i in critical):
            print("3️⃣ Retrain with improved settings:")
            print(" python train_fixed_v2.py data/categories.csv data/tags.json")
            print()

        if any('tags' in i['issue'].lower() for i in self.issues):
            print("4️⃣ Generate missing tags:")
            print(" python generate_tags.py data/categories.csv")
            print()

    def run_full_diagnostics(self):
        """Run every check in order, then the report and fix suggestions."""
        print("\n" + "="*80)
        print("πŸ”¬ COMPREHENSIVE MODEL DIAGNOSTICS")
        print("="*80)

        self.check_embedding_normalization()
        self.check_faiss_metric()
        self.check_text_weighting()
        self.analyze_category_distribution()
        self.check_duplicate_embeddings()
        self.test_predictions(num_samples=50)

        self.generate_report()
        self.suggest_fixes()

        print("\n" + "="*80)
        print("🎯 DIAGNOSTICS COMPLETE")
        print("="*80 + "\n")
417
+
418
+
419
if __name__ == "__main__":
    # Instantiate with default cache/data dirs and run the full suite.
    ModelDiagnostics().run_full_diagnostics()
path.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ import pandas as pd
4
+ import json
5
+ import re
6
+ from tqdm import tqdm
7
+
8
+
9
class HybridTagsGenerator:
    """Build retrieval-friendly tag lists for hierarchical category paths.

    Tags for each category come from three sources:
      1. hierarchy tags (full path, progressive prefixes, parent/child pairs),
      2. key terms and bigrams extracted from the path text,
      3. templated search-intent phrases for the leaf level.
    """

    def __init__(self):
        # Search intent templates β€” E5-style encoders respond well to natural queries.
        self.search_intents = [
            "buy {item}",
            "best {item}",
            "{item} reviews",
        ]

    def clean(self, text):
        """Lowercase *text*, replace punctuation (except hyphens) with spaces, squeeze whitespace."""
        lowered = str(text).lower()
        no_punct = re.sub(r"[^\w\s-]", " ", lowered)
        return re.sub(r"\s+", " ", no_punct).strip()

    # -------------------------------------------------------
    # 1. Hierarchical tag boosting
    # -------------------------------------------------------
    def make_hierarchy_tags(self, path):
        """Return hierarchy-derived tags for a slash-separated category path.

        The full cleaned path is repeated 8x as a weighting signal.
        NOTE(review): build_tags() de-duplicates, so the 8x repetition only
        survives when this method is used on its own β€” confirm intent.
        """
        levels = [part.strip() for part in path.split("/") if part.strip()]
        cleaned_levels = [self.clean(part) for part in levels]

        # Strong full-path signal (repeated for emphasis).
        tags = [" ".join(cleaned_levels)] * 8

        # Progressive prefixes: "a", "a b", "a b c", ...
        tags += [
            " ".join(cleaned_levels[:depth])
            for depth in range(1, len(cleaned_levels) + 1)
        ]

        # Parent/child reinforcement phrases for the two deepest levels.
        if len(cleaned_levels) >= 2:
            parent, child = cleaned_levels[-2], cleaned_levels[-1]
            tags += [
                f"{parent} {child}",
                f"{child} {parent}",
                f"{child} in {parent}",
                f"{child} category {parent}",
            ]

        return tags

    # -------------------------------------------------------
    # 2. Extract key terms and word combos
    # -------------------------------------------------------
    def extract_terms(self, path):
        """Extract unique key terms: cleaned levels, words longer than 3 chars, leaf-area bigrams."""
        levels = [part.strip() for part in path.split("/") if part.strip()]
        collected = []

        for level in levels:
            cleaned = self.clean(level)
            if cleaned not in collected:
                collected.append(cleaned)

            long_words = [w for w in cleaned.split() if len(w) > 3]
            collected += long_words

            # Bigrams only for the leaf and its parent level.
            if level in levels[-2:]:
                collected += [f"{a} {b}" for a, b in zip(long_words, long_words[1:])]

        # De-duplicate while preserving first-seen order.
        return list(dict.fromkeys(collected))

    # -------------------------------------------------------
    # 3. Build final tag list for ONE category
    # -------------------------------------------------------
    def build_tags(self, category_id, category_path):
        """Assemble the final tag list: deduped, at most six words each, capped at 50.

        *category_id* is accepted for interface symmetry but not used here.
        """
        candidates = list(self.make_hierarchy_tags(category_path))

        # Key terms (capped at 15).
        candidates += self.extract_terms(category_path)[:15]

        # Search-intent phrases for the leaf level (first two templates only).
        leaf = self.clean(category_path.split("/")[-1])
        candidates += [tpl.format(item=leaf) for tpl in self.search_intents[:2]]

        # Clean + dedupe + drop over-long tags + cap at 50.
        seen = set()
        final = []
        for raw in candidates:
            tag = self.clean(raw)
            if tag and tag not in seen and len(tag.split()) <= 6:
                seen.add(tag)
                final.append(tag)

        return final[:50]

    # -------------------------------------------------------
    # 4. Generate tags.json for entire CSV
    # -------------------------------------------------------
    def generate_tags_json(self, csv_path, output="tags.json"):
        """Read a categories CSV and write a {Category_ID: [tags]} JSON file.

        The CSV must contain ``Category_ID`` and ``Category_path`` columns.
        Returns the generated mapping.
        """
        frame = pd.read_csv(csv_path, dtype=str)

        if "Category_ID" not in frame.columns or "Category_path" not in frame.columns:
            raise ValueError("CSV must contain Category_ID, Category_path columns")

        frame = frame.dropna(subset=["Category_path"])

        tags_dict = {}
        for _, record in tqdm(frame.iterrows(), total=len(frame), desc="Building tags"):
            key = str(record["Category_ID"])
            tags_dict[key] = self.build_tags(key, str(record["Category_path"]))

        with open(output, "w", encoding="utf-8") as handle:
            json.dump(tags_dict, handle, indent=2)

        print(f"βœ… DONE: {output} saved.")
        return tags_dict
+
132
+
133
if __name__ == "__main__":
    # CLI entry point: build tags.json from the categories CSV given on the command line.
    import sys
    if len(sys.argv) < 2:
        print("Usage: python build_tags_json.py <categories.csv>")
        sys.exit()

    csv_file = sys.argv[1]
    gen = HybridTagsGenerator()
    gen.generate_tags_json(csv_file, "tags.json")
requirements.txt ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # sentence-transformers==3.3.1
2
+ # torch==2.5.1
3
+ # transformers==4.46.3
4
+ # faiss-gpu==1.9.0.post1
5
+ # pandas==2.2.3
6
+ # numpy==2.0.2
7
+ # fastapi==0.115.6
8
+ # uvicorn==0.32.1
9
+ # gunicorn==23.0.0
10
+ # pydantic==2.10.3
11
+ # joblib==1.4.2
12
+ # psutil==6.1.0
13
+
14
+
15
+ sentence-transformers==3.3.1
16
+ torch==2.5.1
17
+ transformers==4.46.3
18
+ faiss-cpu==1.9.0
19
+ pandas==2.2.3
20
+ numpy==2.0.2
21
+ fastapi==0.115.6
22
+ uvicorn==0.32.1
23
+ gunicorn==23.0.0
24
+ pydantic==2.10.3
25
+ joblib==1.4.2
26
+ psutil==6.1.0
27
+ nltk>=3.8.1
28
+ # Note: faiss-gpu is commented out to avoid compatibility issues on systems without a compatible GPU.
synonyms.py CHANGED
@@ -1,366 +1,854 @@
1
-
2
- """
3
- πŸ€– AI-POWERED SYNONYM MANAGER (Fixed for Windows + GPU)
4
- ========================================================
5
- βœ… Uses e5-base-v2 (768D, memory-efficient)
6
- βœ… Windows + NVIDIA GPU optimized
7
- βœ… Generates cross-store synonyms automatically
8
-
9
- Usage:
10
- python synonym_manager_fixed.py autobuild data/category_id_path_only.csv
11
- python synonym_manager_fixed.py autobuild data/category_id_path_only.csv --fast
12
- """
13
-
14
- import pickle
15
- from pathlib import Path
16
- import json
17
- from collections import defaultdict
18
- from tqdm import tqdm
19
- import warnings
20
- import sys
21
- import os
22
-
23
- warnings.filterwarnings('ignore')
24
- os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
25
-
26
- try:
27
- from nltk.corpus import wordnet
28
- from nltk import download as nltk_download
29
- WORDNET_AVAILABLE = True
30
- except ImportError:
31
- WORDNET_AVAILABLE = False
32
-
33
- try:
34
- from sentence_transformers import SentenceTransformer, util
35
- import torch
36
- TRANSFORMERS_AVAILABLE = True
37
- except ImportError:
38
- TRANSFORMERS_AVAILABLE = False
39
-
40
-
41
class SynonymManager:
    """AI-powered synonym manager.

    Builds and persists a ``{term: [(synonym, confidence, source), ...]}``
    map using two sources: WordNet lemmas and semantic similarity from the
    intfloat/e5-base-v2 sentence-transformer model.
    """

    def __init__(self, cache_dir='cache', fast_mode=False):
        # fast_mode skips WordNet lookups; semantic search still runs if a model loads.
        self.cache_dir = Path(cache_dir)
        self.synonyms_file = self.cache_dir / 'cross_store_synonyms.pkl'
        self.synonyms = {}        # term -> list of (synonym, confidence, source)
        self.model = None         # SentenceTransformer, loaded lazily
        self.device = "cpu"
        self.fast_mode = fast_mode

        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # Pick up any previously saved synonym database.
        if self.synonyms_file.exists():
            self.load_synonyms()

    def load_synonyms(self):
        """Load existing synonyms from the pickle cache.

        Accepts the current format (lists of (syn, conf, source) tuples) and
        legacy formats (lists or sets of plain strings), converting legacy
        entries with a fixed 0.8 confidence.
        NOTE(review): if the first entry's value is an empty list, no branch
        assigns self.synonyms and the loaded data is dropped β€” confirm intended.
        """
        try:
            with open(self.synonyms_file, 'rb') as f:
                loaded = pickle.load(f)

            if loaded and list(loaded.values()):
                # Inspect one value to detect which on-disk format this is.
                first_val = next(iter(loaded.values()))

                if isinstance(first_val, list) and first_val:
                    if isinstance(first_val[0], tuple):
                        # Already in the current (syn, conf, source) format.
                        self.synonyms = loaded
                    else:
                        # Legacy list-of-strings format.
                        self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}
                elif isinstance(first_val, set):
                    # Legacy set-of-strings format.
                    self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}

            print(f"βœ… Loaded {len(self.synonyms):,} synonym entries")
        except Exception as e:
            print(f"❌ Error loading synonyms: {e}")
            self.synonyms = {}

    def save_synonyms(self):
        """Persist synonyms as pickle plus a human-readable JSON mirror.

        Returns True on success, False on any I/O or serialization error.
        """
        try:
            with open(self.synonyms_file, 'wb') as f:
                pickle.dump(self.synonyms, f)

            # Readable companion file for manual inspection.
            json_file = self.cache_dir / 'synonyms_readable.json'
            readable = {
                term: [
                    {'synonym': syn, 'confidence': conf, 'source': src}
                    for syn, conf, src in syns
                ]
                for term, syns in self.synonyms.items()
            }
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(readable, f, indent=2, ensure_ascii=False)

            print(f"βœ… Saved {len(self.synonyms):,} synonym entries")
            return True
        except Exception as e:
            print(f"❌ Error saving synonyms: {e}")
            return False

    def load_transformer_model(self):
        """Load the e5-base-v2 sentence-transformer onto GPU if available.

        Returns True when the model is ready, False otherwise.
        """
        if not TRANSFORMERS_AVAILABLE:
            print("❌ SentenceTransformers not installed!")
            return False

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        if self.device == "cuda":
            print(f"πŸ”₯ NVIDIA GPU detected!")

        model_name = "intfloat/e5-base-v2"
        print(f"\nπŸ€– Loading {model_name}...")

        try:
            self.model = SentenceTransformer(model_name, device=self.device)

            if self.device == "cuda":
                # FP16 halves VRAM use and speeds up encoding on GPU.
                self.model = self.model.half()
                print("⚑ Enabled FP16 precision")

            print("βœ… Model loaded\n")
            return True
        except Exception as e:
            print(f"❌ Failed to load model: {e}")
            return False

    def get_wordnet_synonyms(self, word, limit=10):
        """Return up to *limit* WordNet synonyms as (syn, 0.75, 'wordnet') tuples.

        Returns [] in fast mode, when NLTK is missing, or on any error.
        """
        if self.fast_mode or not WORDNET_AVAILABLE:
            return []

        try:
            # Probe for the corpus; download it on first use.
            try:
                wordnet.synsets('test')
            except:
                nltk_download('wordnet', quiet=True)
                nltk_download('omw-1.4', quiet=True)

            synonyms = []
            # WordNet stores multi-word lemmas with underscores.
            word_clean = word.lower().replace(' ', '_')

            for syn in wordnet.synsets(word_clean):
                for lemma in syn.lemmas():
                    synonym = lemma.name().replace('_', ' ').lower()
                    if synonym != word.lower() and len(synonym) > 2:
                        confidence = 0.75
                        synonyms.append((synonym, confidence, 'wordnet'))
                        if len(synonyms) >= limit:
                            break
                if len(synonyms) >= limit:
                    break

            return synonyms[:limit]
        except Exception:
            return []

    def get_semantic_synonyms(self, term, candidate_pool, threshold=0.70, limit=15):
        """Return candidates semantically similar to *term* via E5 embeddings.

        Confidence is rescaled from the cosine score into roughly [0.60, 0.95].
        Returns [] when no model is loaded, the pool is empty, or on error.
        """
        if not self.model or not candidate_pool:
            return []

        try:
            # E5 models require "query:"/"passage:" prefixes for asymmetric search.
            query = f"query: {term}"
            candidates_prefixed = [f"passage: {c}" for c in candidate_pool]

            term_emb = self.model.encode(query, convert_to_tensor=True, show_progress_bar=False)

            # Batch candidate encoding; larger batches are safe on GPU.
            batch_size = 32 if self.device == "cuda" else 8
            all_embeddings = []

            for i in range(0, len(candidates_prefixed), batch_size):
                batch = candidates_prefixed[i:i + batch_size]
                emb = self.model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
                all_embeddings.append(emb)

            candidate_embs = torch.cat(all_embeddings, dim=0)
            scores = util.cos_sim(term_emb, candidate_embs)[0]

            synonyms = []
            for candidate, score in zip(candidate_pool, scores):
                score_val = float(score)
                if score_val > threshold and candidate.lower() != term.lower():
                    # Linear rescale: score==threshold -> 0.60, score==1.0 -> 0.95.
                    confidence = 0.60 + (score_val - threshold) * 0.35 / (1 - threshold)
                    synonyms.append((candidate, confidence, 'semantic'))

            synonyms.sort(key=lambda x: x[1], reverse=True)
            return synonyms[:limit]

        except Exception as e:
            print(f"⚠️ Semantic error: {e}")
            return []

    def auto_generate_synonyms(self, term, candidate_pool=None, semantic_threshold=0.70, silent=False):
        """Combine WordNet and semantic synonyms, deduped by highest confidence.

        Returns a list of (synonym, confidence, source) sorted by confidence desc.
        """
        all_synonyms = []

        if not silent:
            print(f"\nπŸ” Finding synonyms for: '{term}'")

        if WORDNET_AVAILABLE and not self.fast_mode:
            wn_syns = self.get_wordnet_synonyms(term, limit=10)
            all_synonyms.extend(wn_syns)

        if candidate_pool and self.model:
            sem_syns = self.get_semantic_synonyms(
                term, candidate_pool,
                threshold=semantic_threshold,
                limit=15
            )
            all_synonyms.extend(sem_syns)

        # Dedupe case-insensitively, keeping the highest-confidence entry.
        synonym_map = {}
        for syn, conf, source in all_synonyms:
            syn_lower = syn.lower()
            if syn_lower not in synonym_map or conf > synonym_map[syn_lower][1]:
                synonym_map[syn_lower] = (syn, conf, source)

        final_synonyms = sorted(synonym_map.values(), key=lambda x: x[1], reverse=True)
        return final_synonyms

    def add_synonym_group(self, term, synonyms_with_confidence):
        """Merge (syn, conf, source) tuples into the store, skipping existing synonyms."""
        term_lower = term.lower()
        if term_lower not in self.synonyms:
            self.synonyms[term_lower] = []

        for syn, conf, src in synonyms_with_confidence:
            # Skip case-insensitive duplicates already stored for this term.
            if not any(s[0].lower() == syn.lower() for s in self.synonyms[term_lower]):
                self.synonyms[term_lower].append((syn, conf, src))

    def extract_terms_from_categories(self, csv_path, min_frequency=2):
        """Mine candidate terms (words and two-word phrases) from a category CSV.

        Returns (candidates, term_freq) where candidates are the terms seen at
        least *min_frequency* times. Returns ([], {}) on error.
        Assumes the path column is the second CSV column β€” TODO confirm schema.
        """
        print(f"\nπŸ“‚ Extracting terms from: {csv_path}")

        try:
            import pandas as pd

            df = pd.read_csv(csv_path)
            path_col = df.columns[1] if len(df.columns) > 1 else df.columns[0]
            paths = df[path_col].dropna().astype(str)

            print(f" Processing {len(paths):,} category paths...")

            term_freq = defaultdict(int)

            for path in tqdm(paths, desc="Analyzing paths"):
                levels = path.split('/')

                for level in levels:
                    words = level.lower().split()

                    # Count single alphabetic words longer than 2 chars.
                    for word in words:
                        if len(word) > 2 and word.isalpha():
                            term_freq[word] += 1

                    # Count adjacent two-word phrases of alphabetic words.
                    for i in range(len(words) - 1):
                        if len(words[i]) > 2 and len(words[i+1]) > 2:
                            phrase = f"{words[i]} {words[i+1]}"
                            if phrase.replace(' ', '').isalpha():
                                term_freq[phrase] += 1

            candidates = [
                term for term, freq in term_freq.items()
                if freq >= min_frequency
            ]

            print(f"βœ… Extracted {len(candidates):,} terms (min frequency: {min_frequency})")
            return candidates, term_freq

        except Exception as e:
            print(f"❌ Error extracting terms: {e}")
            import traceback
            traceback.print_exc()
            return [], {}

    def auto_build_from_categories(self, csv_path, top_terms=1000, semantic_threshold=0.70):
        """End-to-end build: mine terms, generate synonyms for the top ones, save.

        Returns True on success, False if no terms could be extracted.
        """
        print("\n" + "="*80)
        print("πŸš€ AUTO-BUILD SYNONYM DATABASE")
        print("="*80)

        # Model load failure is non-fatal: WordNet alone still works.
        if not self.load_transformer_model():
            print("\n⚠️ Continuing with WordNet only")

        all_terms, term_freq = self.extract_terms_from_categories(csv_path)
        if not all_terms:
            print("❌ No terms extracted")
            return False

        # Process only the most frequent terms to bound runtime.
        print(f"\n🎯 Selecting top {top_terms} terms...")
        top_frequent = sorted(term_freq.items(), key=lambda x: x[1], reverse=True)[:top_terms]
        terms_to_process = [term for term, _ in top_frequent]

        print(f"βœ… Selected {len(terms_to_process)} terms")
        print(f"πŸ“Š Top 10: {', '.join(terms_to_process[:10])}")
        print(f"\nπŸ”„ Generating synonyms (threshold={semantic_threshold})...\n")

        stats = {'processed': 0, 'synonyms': 0, 'high_conf': 0}

        for term in tqdm(terms_to_process, desc="Processing"):
            # Skip terms that already have a reasonably full synonym list.
            if term in self.synonyms and len(self.synonyms[term]) >= 10:
                continue

            syns = self.auto_generate_synonyms(
                term,
                candidate_pool=all_terms,
                semantic_threshold=semantic_threshold,
                silent=True
            )

            if syns:
                self.add_synonym_group(term, syns)
                stats['processed'] += 1
                stats['synonyms'] += len(syns)
                stats['high_conf'] += sum(1 for _, c, _ in syns if c >= 0.8)

        print(f"\nβœ… Processed: {stats['processed']:,} terms")
        print(f"βœ… Total synonyms: {stats['synonyms']:,}")
        print(f"βœ… High confidence (β‰₯0.8): {stats['high_conf']:,}")

        self.save_synonyms()

        print("\nπŸŽ‰ AUTO-BUILD COMPLETE!\n")
        return True
327
-
328
-
329
def main():
    """CLI entry point: parse arguments and dispatch the requested command."""
    header = "=" * 80
    print("\n" + header)
    print("πŸ€– AI-POWERED SYNONYM MANAGER")
    print(header + "\n")

    # Optional flag anywhere on the command line.
    fast_mode = '--fast' in sys.argv

    if len(sys.argv) < 2:
        # No command given β€” show usage and bail out.
        for line in (
            "Usage:",
            " python synonym_manager_fixed.py autobuild <csv_file>",
            " python synonym_manager_fixed.py autobuild <csv_file> --fast",
            "\nExample:",
            " python synonym_manager_fixed.py autobuild data/category_id_path_only.csv",
        ):
            print(line)
        return

    command = sys.argv[1].lower()

    if command != 'autobuild':
        print(f"❌ Unknown command: {command}")
        return

    if len(sys.argv) < 3:
        print("❌ CSV file path required")
        return

    csv_path = sys.argv[2]
    if not Path(csv_path).exists():
        print(f"❌ File not found: {csv_path}")
        return

    manager = SynonymManager(fast_mode=fast_mode)
    manager.auto_build_from_categories(csv_path, top_terms=1000)
363
-
364
-
365
- if __name__ == "__main__":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  main()
 
1
+ # """
2
+ # πŸ€– FIXED AI-POWERED SYNONYM MANAGER
3
+ # ====================================
4
+ # βœ… Windows + NVIDIA GPU optimized
5
+ # βœ… Uses e5-base-v2 (lower memory)
6
+ # βœ… Proper error handling
7
+ # βœ… Progress tracking
8
+
9
+ # Usage:
10
+ # python synonym_manager_fixed.py autobuild data/category_id_path_only.csv
11
+ # python synonym_manager_fixed.py autobuild data/category_id_path_only.csv --fast
12
+ # """
13
+
14
+ # import pickle
15
+ # from pathlib import Path
16
+ # import json
17
+ # from collections import defaultdict
18
+ # from tqdm import tqdm
19
+ # import warnings
20
+ # import sys
21
+ # import os
22
+
23
+ # warnings.filterwarnings('ignore')
24
+
25
+ # # Fix CUDA issues on Windows
26
+ # os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
27
+
28
+ # try:
29
+ # from nltk.corpus import wordnet
30
+ # from nltk import download as nltk_download
31
+ # WORDNET_AVAILABLE = True
32
+ # except ImportError:
33
+ # WORDNET_AVAILABLE = False
34
+ # print("⚠️ NLTK not available. Install with: pip install nltk")
35
+
36
+ # try:
37
+ # from sentence_transformers import SentenceTransformer, util
38
+ # import torch
39
+ # TRANSFORMERS_AVAILABLE = True
40
+ # except ImportError:
41
+ # TRANSFORMERS_AVAILABLE = False
42
+ # print("⚠️ SentenceTransformers not available.")
43
+ # print(" Install with: pip install sentence-transformers torch")
44
+
45
+
46
+ # class FixedAISynonymManager:
47
+ # """Fixed AI-powered synonym manager for Windows + NVIDIA GPU"""
48
+
49
+ # def __init__(self, cache_dir='cache', tags_file='data/tags.json', fast_mode=False):
50
+ # self.cache_dir = Path(cache_dir)
51
+ # self.synonyms_file = self.cache_dir / 'cross_store_synonyms.pkl'
52
+ # self.tags_file = Path(tags_file)
53
+ # self.synonyms = {}
54
+ # self.tags_data = {}
55
+ # self.model = None
56
+ # self.device = "cpu"
57
+ # self.fast_mode = fast_mode
58
+
59
+ # # Create cache directory
60
+ # self.cache_dir.mkdir(parents=True, exist_ok=True)
61
+
62
+ # # Load existing data
63
+ # self.load_tags()
64
+ # if self.synonyms_file.exists():
65
+ # self.load_synonyms()
66
+ # else:
67
+ # print("πŸ“ No existing synonyms file. Will create new one.")
68
+
69
+ # def load_tags(self):
70
+ # """Load domain-specific tags (optional)"""
71
+ # if self.tags_file.exists():
72
+ # try:
73
+ # with open(self.tags_file, 'r', encoding='utf-8') as f:
74
+ # self.tags_data = json.load(f)
75
+ # print(f"βœ… Loaded {len(self.tags_data)} tag entries")
76
+ # return True
77
+ # except Exception as e:
78
+ # print(f"⚠️ Could not load tags.json: {e}")
79
+ # else:
80
+ # print(f"ℹ️ tags.json not found (optional)")
81
+ # return False
82
+
83
+ # def load_synonyms(self):
84
+ # """Load existing synonyms with format conversion"""
85
+ # try:
86
+ # with open(self.synonyms_file, 'rb') as f:
87
+ # loaded = pickle.load(f)
88
+
89
+ # # Handle different formats
90
+ # if not loaded:
91
+ # self.synonyms = {}
92
+ # return
93
+
94
+ # # Check format
95
+ # first_val = next(iter(loaded.values()))
96
+
97
+ # if isinstance(first_val, list):
98
+ # if first_val and isinstance(first_val[0], tuple):
99
+ # # New format: [(syn, conf, src), ...]
100
+ # self.synonyms = loaded
101
+ # print(f"βœ… Loaded {len(self.synonyms)} synonym entries (new format)")
102
+ # elif first_val and isinstance(first_val[0], str):
103
+ # # Legacy format: [syn1, syn2, ...]
104
+ # self.synonyms = {
105
+ # k: [(v, 0.8, 'legacy') for v in vals]
106
+ # for k, vals in loaded.items()
107
+ # }
108
+ # print(f"βœ… Converted {len(self.synonyms)} legacy synonym entries")
109
+ # elif isinstance(first_val, set):
110
+ # # Set format
111
+ # self.synonyms = {
112
+ # k: [(v, 0.8, 'legacy') for v in vals]
113
+ # for k, vals in loaded.items()
114
+ # }
115
+ # print(f"βœ… Converted {len(self.synonyms)} set-based entries")
116
+ # else:
117
+ # self.synonyms = {}
118
+ # print(f"⚠️ Unknown synonym format")
119
+
120
+ # except Exception as e:
121
+ # print(f"❌ Error loading synonyms: {e}")
122
+ # self.synonyms = {}
123
+
124
+ # def save_synonyms(self):
125
+ # """Save synonyms in both formats"""
126
+ # try:
127
+ # # Save binary format
128
+ # with open(self.synonyms_file, 'wb') as f:
129
+ # pickle.dump(self.synonyms, f)
130
+
131
+ # # Save readable JSON
132
+ # json_file = self.cache_dir / 'synonyms_readable.json'
133
+ # readable = {}
134
+ # for term, syns in self.synonyms.items():
135
+ # readable[term] = [
136
+ # {'synonym': syn, 'confidence': float(conf), 'source': src}
137
+ # for syn, conf, src in syns
138
+ # ]
139
+
140
+ # with open(json_file, 'w', encoding='utf-8') as f:
141
+ # json.dump(readable, f, indent=2, ensure_ascii=False)
142
+
143
+ # print(f"\nβœ… Saved {len(self.synonyms)} synonym entries")
144
+ # print(f" πŸ“ Binary: {self.synonyms_file}")
145
+ # print(f" πŸ“ JSON: {json_file}")
146
+ # return True
147
+ # except Exception as e:
148
+ # print(f"❌ Error saving synonyms: {e}")
149
+ # return False
150
+
151
+ # def load_transformer_model(self):
152
+ # """Load e5-base-v2 model with GPU support"""
153
+ # if not TRANSFORMERS_AVAILABLE:
154
+ # print("❌ SentenceTransformers not installed!")
155
+ # return False
156
+
157
+ # # Check for CUDA
158
+ # self.device = "cuda" if torch.cuda.is_available() else "cpu"
159
+
160
+ # if self.device == "cuda":
161
+ # print(f"πŸ”₯ NVIDIA GPU detected!")
162
+ # try:
163
+ # gpu_name = torch.cuda.get_device_name(0)
164
+ # vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
165
+ # print(f" GPU: {gpu_name}")
166
+ # print(f" VRAM: {vram_gb:.1f} GB")
167
+ # except:
168
+ # pass
169
+ # else:
170
+ # print("πŸ’» Using CPU (slower)")
171
+
172
+ # # Use e5-base-v2 for better memory efficiency
173
+ # model_name = "intfloat/e5-base-v2"
174
+ # print(f"\nπŸ€– Loading model: {model_name}")
175
+
176
+ # try:
177
+ # self.model = SentenceTransformer(model_name, device=self.device)
178
+ # self.model.max_seq_length = 256
179
+
180
+ # # Use FP16 on GPU for speed
181
+ # if self.device == "cuda":
182
+ # self.model = self.model.half()
183
+ # print("⚑ Enabled FP16 precision")
184
+
185
+ # print("βœ… Model loaded successfully\n")
186
+ # return True
187
+ # except Exception as e:
188
+ # print(f"❌ Failed to load model: {e}")
189
+ # return False
190
+
191
+ # def get_wordnet_synonyms(self, word, limit=10):
192
+ # """Get WordNet synonyms"""
193
+ # if self.fast_mode or not WORDNET_AVAILABLE:
194
+ # return []
195
+
196
+ # try:
197
+ # # Ensure WordNet is downloaded
198
+ # try:
199
+ # wordnet.synsets('test')
200
+ # except:
201
+ # print("πŸ“₯ Downloading WordNet data...")
202
+ # nltk_download('wordnet', quiet=True)
203
+ # nltk_download('omw-1.4', quiet=True)
204
+
205
+ # synonyms = []
206
+ # word_clean = word.lower().replace(' ', '_')
207
+
208
+ # for syn in wordnet.synsets(word_clean):
209
+ # for lemma in syn.lemmas():
210
+ # synonym = lemma.name().replace('_', ' ').lower()
211
+ # if synonym != word.lower() and len(synonym) > 2:
212
+ # confidence = 0.75 # Fixed confidence for WordNet
213
+ # synonyms.append((synonym, confidence, 'wordnet'))
214
+ # if len(synonyms) >= limit:
215
+ # break
216
+ # if len(synonyms) >= limit:
217
+ # break
218
+
219
+ # return synonyms[:limit]
220
+ # except Exception:
221
+ # return []
222
+
223
+ # def get_semantic_synonyms(self, term, candidate_pool, threshold=0.70, limit=15):
224
+ # """Get semantic synonyms using embeddings"""
225
+ # if not self.model or not candidate_pool:
226
+ # return []
227
+
228
+ # try:
229
+ # # E5 model requires query/passage prefixes
230
+ # query = f"query: {term}"
231
+ # candidates_prefixed = [f"passage: {c}" for c in candidate_pool]
232
+
233
+ # # Encode query
234
+ # term_emb = self.model.encode(
235
+ # query,
236
+ # convert_to_tensor=True,
237
+ # show_progress_bar=False
238
+ # )
239
+
240
+ # # Encode candidates in batches
241
+ # batch_size = 32 if self.device == "cuda" else 8
242
+ # all_embeddings = []
243
+
244
+ # for i in range(0, len(candidates_prefixed), batch_size):
245
+ # batch = candidates_prefixed[i:i + batch_size]
246
+ # emb = self.model.encode(
247
+ # batch,
248
+ # convert_to_tensor=True,
249
+ # show_progress_bar=False
250
+ # )
251
+ # all_embeddings.append(emb)
252
+
253
+ # # Concatenate all embeddings
254
+ # candidate_embs = torch.cat(all_embeddings, dim=0)
255
+
256
+ # # Calculate cosine similarity
257
+ # scores = util.cos_sim(term_emb, candidate_embs)[0]
258
+
259
+ # # Filter by threshold
260
+ # synonyms = []
261
+ # for candidate, score in zip(candidate_pool, scores):
262
+ # score_val = float(score)
263
+ # if score_val > threshold and candidate.lower() != term.lower():
264
+ # # Scale confidence between 0.6 and 0.95
265
+ # confidence = 0.60 + (score_val - threshold) * 0.35 / (1 - threshold)
266
+ # synonyms.append((candidate, confidence, 'semantic'))
267
+
268
+ # # Sort by confidence
269
+ # synonyms.sort(key=lambda x: x[1], reverse=True)
270
+ # return synonyms[:limit]
271
+
272
+ # except Exception as e:
273
+ # print(f"⚠️ Semantic error: {e}")
274
+ # return []
275
+
276
+ # def auto_generate_synonyms(self, term, candidate_pool=None,
277
+ # semantic_threshold=0.70, silent=False):
278
+ # """Generate synonyms from multiple sources"""
279
+ # all_synonyms = []
280
+
281
+ # if not silent:
282
+ # print(f"\nπŸ” Finding synonyms for: '{term}'")
283
+
284
+ # # Source 1: WordNet
285
+ # if WORDNET_AVAILABLE and not self.fast_mode:
286
+ # wn_syns = self.get_wordnet_synonyms(term, limit=10)
287
+ # all_synonyms.extend(wn_syns)
288
+
289
+ # # Source 2: Semantic similarity
290
+ # if candidate_pool and self.model:
291
+ # sem_syns = self.get_semantic_synonyms(
292
+ # term, candidate_pool,
293
+ # threshold=semantic_threshold,
294
+ # limit=15
295
+ # )
296
+ # all_synonyms.extend(sem_syns)
297
+
298
+ # # Deduplicate (keep highest confidence)
299
+ # synonym_map = {}
300
+ # for syn, conf, source in all_synonyms:
301
+ # syn_lower = syn.lower()
302
+ # if syn_lower not in synonym_map or conf > synonym_map[syn_lower][1]:
303
+ # synonym_map[syn_lower] = (syn, conf, source)
304
+
305
+ # final_synonyms = sorted(
306
+ # synonym_map.values(),
307
+ # key=lambda x: x[1],
308
+ # reverse=True
309
+ # )
310
+
311
+ # return final_synonyms
312
+
313
+ # def add_synonym_group(self, term, synonyms_with_confidence):
314
+ # """Add synonym group"""
315
+ # term_lower = term.lower()
316
+ # if term_lower not in self.synonyms:
317
+ # self.synonyms[term_lower] = []
318
+
319
+ # for syn, conf, src in synonyms_with_confidence:
320
+ # # Check if already exists
321
+ # if not any(s[0].lower() == syn.lower() for s in self.synonyms[term_lower]):
322
+ # self.synonyms[term_lower].append((syn, conf, src))
323
+
324
+ # def extract_terms_from_categories(self, csv_path, min_frequency=2):
325
+ # """Extract terms from category CSV"""
326
+ # print(f"\nπŸ“‚ Extracting terms from: {csv_path}")
327
+
328
+ # try:
329
+ # import pandas as pd
330
+
331
+ # # Read CSV
332
+ # df = pd.read_csv(csv_path)
333
+
334
+ # # Find path column (usually second column)
335
+ # path_col = df.columns[1] if len(df.columns) > 1 else df.columns[0]
336
+ # paths = df[path_col].dropna().astype(str)
337
+
338
+ # print(f" Processing {len(paths):,} category paths...")
339
+
340
+ # term_freq = defaultdict(int)
341
+
342
+ # for path in tqdm(paths, desc="Analyzing paths"):
343
+ # levels = path.split('/')
344
+
345
+ # for level in levels:
346
+ # words = level.lower().split()
347
+
348
+ # # Single words
349
+ # for word in words:
350
+ # if len(word) > 2 and word.isalpha():
351
+ # term_freq[word] += 1
352
+
353
+ # # Two-word phrases
354
+ # for i in range(len(words) - 1):
355
+ # if len(words[i]) > 2 and len(words[i+1]) > 2:
356
+ # phrase = f"{words[i]} {words[i+1]}"
357
+ # if phrase.replace(' ', '').isalpha():
358
+ # term_freq[phrase] += 1
359
+
360
+ # # Filter by frequency
361
+ # candidates = [
362
+ # term for term, freq in term_freq.items()
363
+ # if freq >= min_frequency
364
+ # ]
365
+
366
+ # print(f"βœ… Extracted {len(candidates):,} terms (min frequency: {min_frequency})")
367
+ # return candidates, term_freq
368
+
369
+ # except Exception as e:
370
+ # print(f"❌ Error extracting terms: {e}")
371
+ # import traceback
372
+ # traceback.print_exc()
373
+ # return [], {}
374
+
375
+ # def auto_build_from_categories(self, csv_path, top_terms=1000,
376
+ # semantic_threshold=0.70):
377
+ # """Auto-build synonym database from categories"""
378
+ # print("\n" + "="*80)
379
+ # print("πŸš€ AUTO-BUILD SYNONYM DATABASE")
380
+ # print("="*80)
381
+
382
+ # # Load model
383
+ # if not self.load_transformer_model():
384
+ # print("\n⚠️ Continuing with WordNet only (limited coverage)")
385
+
386
+ # # Extract terms
387
+ # all_terms, term_freq = self.extract_terms_from_categories(csv_path)
388
+ # if not all_terms:
389
+ # print("❌ No terms extracted")
390
+ # return False
391
+
392
+ # # Select top terms
393
+ # print(f"\n🎯 Selecting top {top_terms} terms...")
394
+ # top_frequent = sorted(
395
+ # term_freq.items(),
396
+ # key=lambda x: x[1],
397
+ # reverse=True
398
+ # )[:top_terms]
399
+ # terms_to_process = [term for term, _ in top_frequent]
400
+
401
+ # print(f"βœ… Selected {len(terms_to_process)} terms")
402
+ # print(f"πŸ“Š Top 10: {', '.join(terms_to_process[:10])}")
403
+ # print(f"\nπŸ”„ Generating synonyms (threshold={semantic_threshold})...\n")
404
+
405
+ # # Process terms
406
+ # stats = {
407
+ # 'processed': 0,
408
+ # 'synonyms': 0,
409
+ # 'high_conf': 0
410
+ # }
411
+
412
+ # for term in tqdm(terms_to_process, desc="Processing"):
413
+ # # Skip if already has enough synonyms
414
+ # if term in self.synonyms and len(self.synonyms[term]) >= 10:
415
+ # continue
416
+
417
+ # # Generate synonyms
418
+ # syns = self.auto_generate_synonyms(
419
+ # term,
420
+ # candidate_pool=all_terms,
421
+ # semantic_threshold=semantic_threshold,
422
+ # silent=True
423
+ # )
424
+
425
+ # if syns:
426
+ # self.add_synonym_group(term, syns)
427
+ # stats['processed'] += 1
428
+ # stats['synonyms'] += len(syns)
429
+ # stats['high_conf'] += sum(1 for _, c, _ in syns if c >= 0.8)
430
+
431
+ # # Print stats
432
+ # print(f"\nβœ… Processed: {stats['processed']:,} terms")
433
+ # print(f"βœ… Total synonyms: {stats['synonyms']:,}")
434
+ # print(f"βœ… High confidence (β‰₯0.8): {stats['high_conf']:,}")
435
+
436
+ # # Save
437
+ # self.save_synonyms()
438
+
439
+ # print("\nπŸŽ‰ AUTO-BUILD COMPLETE!\n")
440
+ # return True
441
+
442
+
443
+ # def main():
444
+ # """Main entry point"""
445
+ # print("\n" + "="*80)
446
+ # print("πŸ€– AI-POWERED SYNONYM MANAGER (Windows + NVIDIA GPU)")
447
+ # print("="*80 + "\n")
448
+
449
+ # # Parse arguments
450
+ # fast_mode = '--fast' in sys.argv
451
+
452
+ # if len(sys.argv) < 2:
453
+ # print("Usage:")
454
+ # print(" python synonym_manager_fixed.py autobuild <csv_file>")
455
+ # print(" python synonym_manager_fixed.py autobuild <csv_file> --fast")
456
+ # print("\nExample:")
457
+ # print(" python synonym_manager_fixed.py autobuild data/category_id_path_only.csv")
458
+ # return
459
+
460
+ # command = sys.argv[1].lower()
461
+
462
+ # if command == 'autobuild':
463
+ # if len(sys.argv) < 3:
464
+ # print("❌ CSV file path required")
465
+ # return
466
+
467
+ # csv_path = sys.argv[2]
468
+
469
+ # if not Path(csv_path).exists():
470
+ # print(f"❌ File not found: {csv_path}")
471
+ # return
472
+
473
+ # # Initialize manager
474
+ # manager = FixedAISynonymManager(fast_mode=fast_mode)
475
+
476
+ # # Run auto-build
477
+ # manager.auto_build_from_categories(csv_path, top_terms=1000)
478
+
479
+ # else:
480
+ # print(f"❌ Unknown command: {command}")
481
+
482
+
483
+ # if __name__ == "__main__":
484
+ # main()
485
+
486
+
487
+ #for cache2
488
+
489
+
490
+ """
491
+ πŸ€– AI-POWERED SYNONYM MANAGER (Fixed for Windows + GPU)
492
+ ========================================================
493
+ βœ… Uses e5-base-v2 (768D, memory-efficient)
494
+ βœ… Windows + NVIDIA GPU optimized
495
+ βœ… Generates cross-store synonyms automatically
496
+
497
+ Usage:
498
+ python synonym_manager_fixed.py autobuild data/category_id_path_only.csv
499
+ python synonym_manager_fixed.py autobuild data/category_id_path_only.csv --fast
500
+ """
501
+
502
+ import pickle
503
+ from pathlib import Path
504
+ import json
505
+ from collections import defaultdict
506
+ from tqdm import tqdm
507
+ import warnings
508
+ import sys
509
+ import os
510
+
511
# Silence noisy library warnings and allow duplicate OpenMP runtimes to
# coexist (a common workaround for the libiomp "already initialized" abort
# when torch and faiss are imported together on Windows).  Must run before
# the heavy ML imports below.
warnings.filterwarnings('ignore')
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
513
+
514
+ try:
515
+ from nltk.corpus import wordnet
516
+ from nltk import download as nltk_download
517
+ WORDNET_AVAILABLE = True
518
+ except ImportError:
519
+ WORDNET_AVAILABLE = False
520
+
521
+ try:
522
+ from sentence_transformers import SentenceTransformer, util
523
+ import torch
524
+ TRANSFORMERS_AVAILABLE = True
525
+ except ImportError:
526
+ TRANSFORMERS_AVAILABLE = False
527
+
528
+
529
class SynonymManager:
    """AI-powered synonym manager.

    Builds a cross-store synonym database from two sources:

    * WordNet lemma lookups (fixed confidence 0.75), and
    * semantic nearest-neighbours from an E5 sentence-transformer
      (confidence derived from cosine similarity).

    Synonyms are held in ``self.synonyms`` as
    ``{term_lowercase: [(synonym, confidence, source), ...]}`` and persisted
    to ``<cache_dir>/cross_store_synonyms.pkl`` plus a human-readable JSON.
    """

    def __init__(self, cache_dir='cache', fast_mode=False):
        """
        Args:
            cache_dir: directory holding the pickle/JSON synonym files.
            fast_mode: when True, skip the (slow) WordNet lookups entirely.
        """
        self.cache_dir = Path(cache_dir)
        self.synonyms_file = self.cache_dir / 'cross_store_synonyms.pkl'
        self.synonyms = {}      # term -> list of (synonym, confidence, source)
        self.model = None       # SentenceTransformer, set by load_transformer_model()
        self.device = "cpu"
        self.fast_mode = fast_mode

        self.cache_dir.mkdir(parents=True, exist_ok=True)

        if self.synonyms_file.exists():
            self.load_synonyms()

    def load_synonyms(self):
        """Load synonyms from disk, upgrading legacy on-disk formats.

        Older pickles stored plain lists or sets of strings; these are
        wrapped as ``(synonym, 0.8, 'legacy')`` tuples so downstream code
        can always rely on the (synonym, confidence, source) shape.
        On any error the in-memory store is reset to empty.
        """
        try:
            with open(self.synonyms_file, 'rb') as f:
                loaded = pickle.load(f)

            if loaded:
                # Inspect a single value to detect which on-disk format this is.
                first_val = next(iter(loaded.values()))

                if isinstance(first_val, list) and first_val:
                    if isinstance(first_val[0], tuple):
                        # Current format: values are already tuples.
                        self.synonyms = loaded
                    else:
                        self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}
                elif isinstance(first_val, set):
                    self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}

            print(f"βœ… Loaded {len(self.synonyms):,} synonym entries")
        except Exception as e:
            print(f"❌ Error loading synonyms: {e}")
            self.synonyms = {}

    def save_synonyms(self):
        """Persist synonyms as a pickle plus a readable JSON sidecar.

        Returns:
            True on success, False if either file failed to write.
        """
        try:
            with open(self.synonyms_file, 'wb') as f:
                pickle.dump(self.synonyms, f)

            # Mirror to JSON so humans can inspect/diff the database.
            json_file = self.cache_dir / 'synonyms_readable.json'
            readable = {
                term: [
                    {'synonym': syn, 'confidence': conf, 'source': src}
                    for syn, conf, src in syns
                ]
                for term, syns in self.synonyms.items()
            }
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(readable, f, indent=2, ensure_ascii=False)

            print(f"βœ… Saved {len(self.synonyms):,} synonym entries")
            return True
        except Exception as e:
            print(f"❌ Error saving synonyms: {e}")
            return False

    def load_transformer_model(self):
        """Load the intfloat/e5-base-v2 encoder onto GPU if available.

        Uses FP16 weights on CUDA to conserve VRAM.

        Returns:
            True when the model is ready, False otherwise.
        """
        if not TRANSFORMERS_AVAILABLE:
            print("❌ SentenceTransformers not installed!")
            return False

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        if self.device == "cuda":
            print("πŸ”₯ NVIDIA GPU detected!")

        model_name = "intfloat/e5-base-v2"
        print(f"\nπŸ€– Loading {model_name}...")

        try:
            self.model = SentenceTransformer(model_name, device=self.device)

            if self.device == "cuda":
                self.model = self.model.half()
                print("⚑ Enabled FP16 precision")

            print("βœ… Model loaded\n")
            return True
        except Exception as e:
            print(f"❌ Failed to load model: {e}")
            return False

    def get_wordnet_synonyms(self, word, limit=10):
        """Collect up to *limit* WordNet lemma synonyms for *word*.

        Multi-word terms are joined with underscores for the synset lookup.
        Returns a list of (synonym, 0.75, 'wordnet') tuples; empty in
        fast mode, when WordNet is unavailable, or on any lookup error.
        """
        if self.fast_mode or not WORDNET_AVAILABLE:
            return []

        try:
            try:
                # Probe the corpus; NLTK raises LookupError when the
                # wordnet data has not been downloaded yet.
                wordnet.synsets('test')
            except LookupError:
                nltk_download('wordnet', quiet=True)
                nltk_download('omw-1.4', quiet=True)

            synonyms = []
            word_clean = word.lower().replace(' ', '_')

            for syn in wordnet.synsets(word_clean):
                for lemma in syn.lemmas():
                    synonym = lemma.name().replace('_', ' ').lower()
                    # Skip the word itself and very short lemmas.
                    if synonym != word.lower() and len(synonym) > 2:
                        confidence = 0.75
                        synonyms.append((synonym, confidence, 'wordnet'))
                        if len(synonyms) >= limit:
                            break
                if len(synonyms) >= limit:
                    break

            return synonyms[:limit]
        except Exception:
            return []

    def get_semantic_synonyms(self, term, candidate_pool, threshold=0.70, limit=15):
        """Rank *candidate_pool* terms by E5 cosine similarity to *term*.

        Uses the E5 "query:"/"passage:" prefix convention. Candidates above
        *threshold* are returned as (candidate, confidence, 'semantic')
        tuples, where confidence maps the similarity range
        [threshold, 1.0] linearly onto [0.60, 0.95].
        """
        if not self.model or not candidate_pool:
            return []

        try:
            query = f"query: {term}"
            candidates_prefixed = [f"passage: {c}" for c in candidate_pool]

            term_emb = self.model.encode(query, convert_to_tensor=True, show_progress_bar=False)

            # Smaller batches on CPU keep memory usage reasonable.
            batch_size = 32 if self.device == "cuda" else 8
            all_embeddings = []

            for i in range(0, len(candidates_prefixed), batch_size):
                batch = candidates_prefixed[i:i + batch_size]
                emb = self.model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
                all_embeddings.append(emb)

            candidate_embs = torch.cat(all_embeddings, dim=0)
            scores = util.cos_sim(term_emb, candidate_embs)[0]

            synonyms = []
            for candidate, score in zip(candidate_pool, scores):
                score_val = float(score)
                if score_val > threshold and candidate.lower() != term.lower():
                    confidence = 0.60 + (score_val - threshold) * 0.35 / (1 - threshold)
                    synonyms.append((candidate, confidence, 'semantic'))

            synonyms.sort(key=lambda x: x[1], reverse=True)
            return synonyms[:limit]

        except Exception as e:
            print(f"⚠️ Semantic error: {e}")
            return []

    def auto_generate_synonyms(self, term, candidate_pool=None, semantic_threshold=0.70, silent=False):
        """Merge WordNet and semantic synonyms for *term*.

        Duplicates (case-insensitive) are collapsed, keeping the highest
        confidence entry; the result is sorted by confidence descending.
        """
        all_synonyms = []

        if not silent:
            print(f"\nπŸ” Finding synonyms for: '{term}'")

        if WORDNET_AVAILABLE and not self.fast_mode:
            wn_syns = self.get_wordnet_synonyms(term, limit=10)
            all_synonyms.extend(wn_syns)

        if candidate_pool and self.model:
            sem_syns = self.get_semantic_synonyms(
                term, candidate_pool,
                threshold=semantic_threshold,
                limit=15
            )
            all_synonyms.extend(sem_syns)

        # Deduplicate case-insensitively, preferring the higher confidence.
        synonym_map = {}
        for syn, conf, source in all_synonyms:
            syn_lower = syn.lower()
            if syn_lower not in synonym_map or conf > synonym_map[syn_lower][1]:
                synonym_map[syn_lower] = (syn, conf, source)

        final_synonyms = sorted(synonym_map.values(), key=lambda x: x[1], reverse=True)
        return final_synonyms

    def add_synonym_group(self, term, synonyms_with_confidence):
        """Append new (synonym, confidence, source) tuples under *term*.

        The term key is lowercased; synonyms already present for the term
        (case-insensitive match) are not added again.
        """
        term_lower = term.lower()
        if term_lower not in self.synonyms:
            self.synonyms[term_lower] = []

        for syn, conf, src in synonyms_with_confidence:
            if not any(s[0].lower() == syn.lower() for s in self.synonyms[term_lower]):
                self.synonyms[term_lower].append((syn, conf, src))

    def extract_terms_from_categories(self, csv_path, min_frequency=2):
        """Mine candidate terms (unigrams + bigrams) from a category CSV.

        Uses the second column as the category path when present,
        splitting each path on '/' into levels and each level into words.

        Returns:
            (candidates, term_freq): list of terms seen at least
            *min_frequency* times, and the full frequency dict.
            ([], {}) on any error.
        """
        print(f"\nπŸ“‚ Extracting terms from: {csv_path}")

        try:
            # Lazy import: pandas is only needed for this build step.
            import pandas as pd

            df = pd.read_csv(csv_path)
            path_col = df.columns[1] if len(df.columns) > 1 else df.columns[0]
            paths = df[path_col].dropna().astype(str)

            print(f" Processing {len(paths):,} category paths...")

            term_freq = defaultdict(int)

            for path in tqdm(paths, desc="Analyzing paths"):
                levels = path.split('/')

                for level in levels:
                    words = level.lower().split()

                    # Count single alphabetic words longer than 2 chars.
                    for word in words:
                        if len(word) > 2 and word.isalpha():
                            term_freq[word] += 1

                    # Count adjacent-word bigrams of the same quality.
                    for i in range(len(words) - 1):
                        if len(words[i]) > 2 and len(words[i+1]) > 2:
                            phrase = f"{words[i]} {words[i+1]}"
                            if phrase.replace(' ', '').isalpha():
                                term_freq[phrase] += 1

            candidates = [
                term for term, freq in term_freq.items()
                if freq >= min_frequency
            ]

            print(f"βœ… Extracted {len(candidates):,} terms (min frequency: {min_frequency})")
            return candidates, term_freq

        except Exception as e:
            print(f"❌ Error extracting terms: {e}")
            import traceback
            traceback.print_exc()
            return [], {}

    def auto_build_from_categories(self, csv_path, top_terms=1000, semantic_threshold=0.70):
        """Auto-build the synonym database from a category CSV.

        Loads the encoder (falling back to WordNet-only on failure),
        extracts terms, processes the *top_terms* most frequent ones, and
        saves the resulting database.

        Returns:
            True on completion, False when no terms could be extracted.
        """
        print("\n" + "="*80)
        print("πŸš€ AUTO-BUILD SYNONYM DATABASE")
        print("="*80)

        if not self.load_transformer_model():
            print("\n⚠️ Continuing with WordNet only")

        all_terms, term_freq = self.extract_terms_from_categories(csv_path)
        if not all_terms:
            print("❌ No terms extracted")
            return False

        print(f"\n🎯 Selecting top {top_terms} terms...")
        top_frequent = sorted(term_freq.items(), key=lambda x: x[1], reverse=True)[:top_terms]
        terms_to_process = [term for term, _ in top_frequent]

        print(f"βœ… Selected {len(terms_to_process)} terms")
        print(f"πŸ“Š Top 10: {', '.join(terms_to_process[:10])}")
        print(f"\nπŸ”„ Generating synonyms (threshold={semantic_threshold})...\n")

        stats = {'processed': 0, 'synonyms': 0, 'high_conf': 0}

        for term in tqdm(terms_to_process, desc="Processing"):
            # Skip terms that already have a healthy synonym list.
            if term in self.synonyms and len(self.synonyms[term]) >= 10:
                continue

            syns = self.auto_generate_synonyms(
                term,
                candidate_pool=all_terms,
                semantic_threshold=semantic_threshold,
                silent=True
            )

            if syns:
                self.add_synonym_group(term, syns)
                stats['processed'] += 1
                stats['synonyms'] += len(syns)
                stats['high_conf'] += sum(1 for _, c, _ in syns if c >= 0.8)

        print(f"\nβœ… Processed: {stats['processed']:,} terms")
        print(f"βœ… Total synonyms: {stats['synonyms']:,}")
        print(f"βœ… High confidence (β‰₯0.8): {stats['high_conf']:,}")

        self.save_synonyms()

        print("\nπŸŽ‰ AUTO-BUILD COMPLETE!\n")
        return True
815
+
816
+
817
def main():
    """Command-line entry point: parse argv and run the autobuild command."""
    banner = "=" * 80
    print("\n" + banner)
    print("πŸ€– AI-POWERED SYNONYM MANAGER")
    print(banner + "\n")

    fast_mode = '--fast' in sys.argv

    # No command given: show usage and bail out.
    if len(sys.argv) < 2:
        print("Usage:")
        print(" python synonym_manager_fixed.py autobuild <csv_file>")
        print(" python synonym_manager_fixed.py autobuild <csv_file> --fast")
        print("\nExample:")
        print(" python synonym_manager_fixed.py autobuild data/category_id_path_only.csv")
        return

    command = sys.argv[1].lower()

    # Guard clauses: unknown command, missing argument, missing file.
    if command != 'autobuild':
        print(f"❌ Unknown command: {command}")
        return

    if len(sys.argv) < 3:
        print("❌ CSV file path required")
        return

    csv_path = sys.argv[2]
    if not Path(csv_path).exists():
        print(f"❌ File not found: {csv_path}")
        return

    SynonymManager(fast_mode=fast_mode).auto_build_from_categories(csv_path, top_terms=1000)


if __name__ == "__main__":
    main()
train_products.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ train.py
4
+ Build normalized embeddings + FAISS index for category catalog,
5
+ build parent embeddings, save synonyms from tags.json and optionally
6
+ train a LightGBM classifier and a simple confidence calibrator.
7
+
8
+ Assumptions / Files:
9
+ - categories CSV: category_only_path.csv (Category_ID,Category_path,Final_Category)
10
+ - optional: data/tags.json (map category_id -> list of phrases)
11
+ - optional: validation.csv (columns: product_title,category_id) used for calibrator / classifier
12
+
13
+ Outputs to ./cache:
14
+ - main_index.faiss
15
+ - metadata.pkl
16
+ - parent_embeddings.pkl
17
+ - cross_store_synonyms.pkl
18
+ - model_info.json
19
+ - calibrator.pkl (if validation exists)
20
+ - classifier.pkl (if --train-classifier used)
21
+ """
22
+
23
+ import argparse
24
+ import json
25
+ import os
26
+ import pickle
27
+ from pathlib import Path
28
+ from typing import List, Dict
29
+
30
+ import numpy as np
31
+ import pandas as pd
32
+ from tqdm import tqdm
33
+
34
+ # sentence-transformers + faiss
35
+ from sentence_transformers import SentenceTransformer
36
+ import faiss
37
+
38
+ # sklearn for calibrator and simple preprocessing
39
+ from sklearn.linear_model import LogisticRegression
40
+ from sklearn.preprocessing import StandardScaler
41
+ from sklearn.model_selection import train_test_split
42
+
43
+ # optional LightGBM (install if you plan to train classifier)
44
+ try:
45
+ import importlib
46
+ lgb = importlib.import_module("lightgbm")
47
+ LGB_AVAILABLE = True
48
+ except Exception:
49
+ lgb = None
50
+ LGB_AVAILABLE = False
51
+
52
+ CACHE_DIR = Path("cache")
53
+ CACHE_DIR.mkdir(exist_ok=True, parents=True)
54
+
55
+ DEFAULT_BATCH_SIZE_CPU = 256
56
+ DEFAULT_BATCH_SIZE_GPU = 16
57
+
58
+
59
def normalize_path_sep(path: str) -> str:
    """Normalize a category path to ' > '-separated levels.

    Slashes are treated as separators, empty segments are dropped, and
    each level is stripped of surrounding whitespace. Non-string input
    yields an empty string.
    """
    if not isinstance(path, str):
        return ""
    with_arrows = path.strip().replace("/", " > ")
    segments = (part.strip() for part in with_arrows.split(">"))
    return " > ".join(segment for segment in segments if segment)
66
+
67
+
68
def path_to_levels(path: str) -> List[str]:
    """Split a raw category path into its cleaned, non-empty level names."""
    normalized = normalize_path_sep(path)
    stripped = (part.strip() for part in normalized.split(" > "))
    return [level for level in stripped if level]
71
+
72
+
73
def safe_pickle_save(obj, p: Path):
    """Serialize *obj* with pickle into the file at *p* (overwrites)."""
    fh = open(p, "wb")
    try:
        pickle.dump(obj, fh)
    finally:
        fh.close()
76
+
77
+
78
def build_encoder(model_name: str, use_cuda: bool):
    """Load a SentenceTransformer encoder, optionally on CUDA with FP16.

    The FP16 conversion is best-effort: any failure (e.g. torch missing)
    silently leaves the model in full precision.
    """
    device = "cuda" if use_cuda else "cpu"
    print(f"Loading encoder: {model_name} on {device}")
    encoder = SentenceTransformer(model_name, device=device)

    if not use_cuda:
        return encoder

    try:
        import torch  # needed so .half() has a backend to work with
        encoder = encoder.half()
        print("Using FP16 on GPU to conserve VRAM.")
    except Exception:
        pass
    return encoder
90
+
91
+
92
def encode_texts(model: SentenceTransformer, texts: List[str], use_cuda: bool) -> np.ndarray:
    """Embed *texts* batch-by-batch into L2-normalized float32 vectors.

    Batches are smaller on GPU to stay within limited VRAM; progress is
    reported via tqdm.
    """
    batch_size = DEFAULT_BATCH_SIZE_GPU if use_cuda else DEFAULT_BATCH_SIZE_CPU
    print(f"Encoding {len(texts):,} texts in batches of {batch_size} ...")
    chunks = []
    for start in tqdm(range(0, len(texts), batch_size)):
        vecs = model.encode(
            texts[start:start + batch_size],
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        # A single-text batch may come back 1-D; promote it to a row matrix.
        if vecs.ndim == 1:
            vecs = vecs.reshape(1, -1)
        chunks.append(vecs.astype("float32"))
    embeddings = np.vstack(chunks)
    print("Final embeddings shape:", embeddings.shape)
    return embeddings
105
+
106
+
107
def build_faiss_index(np_emb: np.ndarray, use_gpu: bool = False):
    """Build a FAISS inner-product index over already-normalized vectors.

    With normalized embeddings, inner product equals cosine similarity.
    GPU conversion is best-effort and falls back to CPU on failure.
    """
    dim = np_emb.shape[1]
    device_label = 'GPU' if use_gpu else 'CPU'
    print(f"Building IndexFlatIP (d={dim}) on {device_label}")
    index = faiss.IndexFlatIP(dim)

    if use_gpu:
        try:
            gpu_resources = faiss.StandardGpuResources()
            index = faiss.index_cpu_to_gpu(gpu_resources, 0, index)
            print("Converted FAISS index to GPU")
        except Exception as e:
            print("GPU conversion failed; using CPU index:", e)

    index.add(np_emb)
    print("Index ntotal:", index.ntotal)
    return index
121
+
122
+
123
def make_parent_embeddings(metadata: List[Dict], embeddings: np.ndarray) -> Dict[str, np.ndarray]:
    """
    Build an embedding for every internal (non-leaf) node of the category tree.

    Every strict prefix of each category path contributes a parent key
    ("A > B > C" contributes "A" and "A > B"); each parent's vector is the
    L2-normalized average of the embeddings of all leaf categories beneath
    it. Used for hierarchical boosting during inference.

    Args:
        metadata: per-category dicts, each holding a "levels" list of path parts.
        embeddings: (len(metadata), dim) matrix, row-aligned with metadata.

    Returns:
        Mapping of parent path string -> normalized float32 vector.
    """
    sum_map: Dict[str, np.ndarray] = {}
    count_map: Dict[str, int] = {}
    dim = embeddings.shape[1]

    for i, meta in enumerate(metadata):
        levels = meta.get("levels", [])
        # range(1, len(levels)) deliberately excludes the full (leaf) path.
        for depth in range(1, len(levels)):
            parent = " > ".join(levels[:depth])
            if not parent:
                continue
            if parent not in sum_map:
                sum_map[parent] = np.zeros(dim, dtype="float32")
                count_map[parent] = 0
            sum_map[parent] += embeddings[i]
            count_map[parent] += 1

    # Average each accumulated sum and L2-normalize it.
    final: Dict[str, np.ndarray] = {}
    for parent, vec in sum_map.items():
        avg = vec / float(count_map[parent])
        # Epsilon guards against a zero-norm average.
        nrm = np.linalg.norm(avg) + 1e-12
        final[parent] = (avg / nrm).astype("float32")
    return final
150
+
151
+
152
def load_tags_json(path: Path) -> Dict[str, List[str]]:
    """Read a tags.json mapping of category_id -> phrase list.

    Keys and all phrase values are coerced to plain strings. Returns an
    empty dict when the file is missing or cannot be parsed.
    """
    if not path.exists():
        return {}
    try:
        raw = json.loads(path.read_text(encoding="utf-8"))
        # Normalize: keys and every phrase become strings.
        return {str(key): [str(item) for item in values] for key, values in raw.items()}
    except Exception as e:
        print("Failed to load tags.json:", e)
        return {}
163
+
164
+
165
def train_calibrator(encoder, metadata, faiss_index, val_path: Path, model_name: str, use_cuda: bool):
    """
    Build a simple calibrator mapping raw cosine similarity of (product -> true category emb)
    to a probability. Uses sklearn LogisticRegression on one feature (raw_score).
    Expects validation.csv with columns product_title,category_id

    Args:
        encoder: SentenceTransformer used to embed product titles.
        metadata: category dicts; each must carry a temporary "_embedding"
            vector (attached by attach_embeddings_to_metadata) or the row
            is skipped.
        faiss_index: unused in the body; kept in the signature.
        val_path: path to the validation CSV.
        model_name: unused in the body; kept in the signature.
        use_cuda: forwarded to encode_texts for batch sizing.

    Returns:
        {"calibrator": LogisticRegression, "scaler": StandardScaler} on
        success, or None when the CSV lacks the required columns or yields
        no usable examples.
    """
    print("Training calibrator using:", val_path)
    df = pd.read_csv(val_path, dtype=str, keep_default_na=False)
    if "product_title" not in df.columns or "category_id" not in df.columns:
        print("validation.csv must have 'product_title' and 'category_id' columns. Skipping calibrator.")
        return None

    examples = []
    labels = []
    # Build a mapping category_id -> embedding (from metadata)
    id_to_idx = {m["category_id"]: i for i, m in enumerate(metadata)}

    # prepare product embeddings in batches
    titles = df["product_title"].astype(str).tolist()
    prod_embs = encode_texts(encoder, [f"query: {t}" for t in titles], use_cuda=use_cuda)

    # NOTE(review): prod_embs[i] assumes df has the default RangeIndex so
    # iterrows() index equals the positional row -- true for a fresh
    # read_csv, but would break if df were ever filtered beforehand.
    for i, row in df.iterrows():
        cid = str(row["category_id"]).strip()
        if cid not in id_to_idx:
            # not in catalog, skip sample
            continue
        cat_idx = id_to_idx[cid]
        cat_emb = metadata[cat_idx].get("_embedding")  # we will attach embeddings later temporarily
        if cat_emb is None:
            continue
        q_emb = prod_embs[i].reshape(1, -1).astype("float32")
        raw = float(np.dot(q_emb, cat_emb.reshape(-1, 1))[0][0])  # cosine because normalized
        # positive
        examples.append([raw])
        labels.append(1)

        # generate few negatives by sampling other categories
        # sample up to 2 random negatives
        negs = 2
        for _ in range(negs):
            # NOTE(review): import inside the loop is a repeated no-op after
            # the first call; hoisting it to module level would be cleaner.
            import random
            rand_idx = random.randrange(len(metadata))
            if rand_idx == cat_idx:
                continue
            neg_emb = metadata[rand_idx].get("_embedding")
            if neg_emb is None:
                continue
            raw_neg = float(np.dot(q_emb, neg_emb.reshape(-1, 1))[0][0])
            examples.append([raw_neg])
            labels.append(0)

    if not examples:
        print("No examples for calibrator (maybe category ids mismatch). Skipping.")
        return None

    # Fit a one-feature logistic regression on the standardized raw scores.
    X = np.array(examples, dtype="float32")
    y = np.array(labels, dtype="int8")
    scaler = StandardScaler()
    Xs = scaler.fit_transform(X)
    clf = LogisticRegression(max_iter=200)
    clf.fit(Xs, y)
    print("Calibrator trained (logistic regression on raw cosine).")
    return {"calibrator": clf, "scaler": scaler}
228
+
229
+
230
def attach_embeddings_to_metadata(metadata: List[Dict], embeddings: np.ndarray):
    """Stash each embedding row on its metadata dict under "_embedding".

    Rows are matched by position; metadata and embeddings must be aligned.
    """
    for idx in range(len(metadata)):
        metadata[idx]["_embedding"] = embeddings[idx]
233
+
234
+
235
def detach_embeddings_from_metadata(metadata: List[Dict]):
    """Strip the temporary "_embedding" key from every metadata dict."""
    for meta in metadata:
        meta.pop("_embedding", None)
239
+
240
+
241
def main():
    """End-to-end build: read categories CSV, embed, index, save artifacts.

    Pipeline:
      1. parse CLI args and load the category CSV (the first two columns
         are treated as category id and category path),
      2. encode one canonical text per category, build parent embeddings,
      3. write the FAISS index plus pickled metadata/parent embeddings and
         model_info.json to cache/,
      4. optionally save tags.json as cross_store_synonyms.pkl, train a
         confidence calibrator and a LightGBM classifier from
         validation.csv,
      5. optionally prune cache/ down to the core artifacts.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--csv", required=True, help="categories CSV (Category_ID,Category_path,Final_Category)")
    parser.add_argument("--model", default="intfloat/e5-base-v2", help="embedding model")
    parser.add_argument("--gpu", action="store_true", help="use GPU for encoding if available (careful with 4GB)")
    parser.add_argument("--clean-cache", action="store_true", help="delete other cache files after build")
    parser.add_argument("--train-classifier", action="store_true", help="train LightGBM classifier on validation.csv (optional)")
    parser.add_argument("--validation", default="data/validation.csv", help="validation CSV used for calibrator / classifier")
    parser.add_argument("--tags", default="data/tags.json", help="tags.json path (optional)")
    args = parser.parse_args()

    csv_path = Path(args.csv)
    if not csv_path.exists():
        raise SystemExit("CSV not found: " + str(csv_path))

    print("Reading CSV:", csv_path)
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    if df.shape[1] < 2:
        raise SystemExit("CSV must have at least 2 columns: Category_ID, Category_path")

    # columns: positional -- first is the id, second is the path
    cols = list(df.columns)
    cid_col, path_col = cols[0], cols[1]
    print("Using columns:", cid_col, path_col)

    # Build one metadata record and one canonical encoding text per row.
    metadata = []
    texts_for_encoding = []
    for idx, row in df.iterrows():
        cid = str(row[cid_col]).strip()
        raw_path = str(row[path_col]).strip()
        norm_path = normalize_path_sep(raw_path)
        levels = path_to_levels(norm_path)
        final = levels[-1] if levels else norm_path or cid
        # include both path and final in canonical text to encode
        text = f"category: {norm_path}. leaf: {final}."
        metadata.append({
            "category_id": cid,
            "category_path": norm_path,
            "final": final,
            "levels": levels,
            "depth": len(levels)
        })
        texts_for_encoding.append(text)

    print(f"Prepared {len(metadata):,} metadata entries")

    # encoder
    use_cuda = args.gpu
    encoder = build_encoder(args.model, use_cuda=use_cuda)

    # encode categories (normalized float32 vectors)
    cat_embeddings = encode_texts(encoder, texts_for_encoding, use_cuda=use_cuda)

    # Attach embeddings temporarily for calibrator builder
    attach_embeddings_to_metadata(metadata, cat_embeddings)

    # parent embeddings (averaged per internal tree node)
    parent_emb = make_parent_embeddings(metadata, cat_embeddings)
    print(f"Built {len(parent_emb):,} parent embeddings")

    # Build CPU FAISS index (IP on normalized vectors -> cosine)
    index = build_faiss_index(cat_embeddings, use_gpu=False)

    # save index (FAISS CPU index)
    faiss_path = CACHE_DIR / "main_index.faiss"
    faiss.write_index(index, str(faiss_path))
    print("Saved FAISS index:", faiss_path)

    # save metadata (we will strip embeddings before saving to reduce pickle size)
    detach_embeddings_from_metadata(metadata)
    meta_path = CACHE_DIR / "metadata.pkl"
    safe_pickle_save(metadata, meta_path)
    print("Saved metadata:", meta_path)

    # save parent embeddings
    parent_path = CACHE_DIR / "parent_embeddings.pkl"
    safe_pickle_save(parent_emb, parent_path)
    print("Saved parent embeddings:", parent_path)

    # save model_info (consumed by the inference side to pick the encoder)
    info = {
        "model_name": args.model,
        "num_categories": len(metadata),
        "embedding_dim": cat_embeddings.shape[1]
    }
    with open(CACHE_DIR / "model_info.json", "w", encoding="utf-8") as f:
        json.dump(info, f, indent=2)
    print("Saved model_info.json")

    # store tags.json -> cross_store_synonyms (just preserve structure)
    tags = load_tags_json(Path(args.tags))
    if tags:
        syn_p = CACHE_DIR / "cross_store_synonyms.pkl"
        safe_pickle_save(tags, syn_p)
        print("Saved cross_store_synonyms.pkl from tags.json (size: %d)" % len(tags))

    # calibrator: use validation.csv if exists
    val_path = Path(args.validation)
    calibrator_obj = None
    if val_path.exists():
        # we need embeddings attached again for calibrator training
        attach_embeddings_to_metadata(metadata, cat_embeddings)
        calibrator_obj = train_calibrator(encoder, metadata, index, val_path, args.model, use_cuda=use_cuda)
        detach_embeddings_from_metadata(metadata)
        if calibrator_obj:
            safe_pickle_save(calibrator_obj, CACHE_DIR / "calibrator.pkl")
            print("Saved calibrator.pkl")

    # optional LightGBM classifier
    if args.train_classifier:
        if not LGB_AVAILABLE:
            print("LightGBM not available. Install lightgbm to train classifier.")
        else:
            val_path2 = Path(args.validation)
            if not val_path2.exists():
                print("validation.csv required to train classifier. Skipping classifier training.")
            else:
                # create training set from validation.csv
                dfv = pd.read_csv(val_path2, dtype=str, keep_default_na=False)
                if "product_title" not in dfv.columns or "category_id" not in dfv.columns:
                    print("validation.csv must contain product_title and category_id. Skipping classifier.")
                else:
                    # encode product titles
                    prod_texts = [f"query: {t}" for t in dfv["product_title"].astype(str).tolist()]
                    prod_embs = encode_texts(encoder, prod_texts, use_cuda=use_cuda)
                    # map category ids to numeric labels
                    cat_to_label = {m["category_id"]: i for i, m in enumerate(metadata)}
                    labels = []
                    rows = []
                    # NOTE(review): prod_embs[i] assumes dfv keeps its default
                    # RangeIndex (true for a fresh read_csv) -- confirm if
                    # this CSV is ever pre-filtered.
                    for i, row in dfv.iterrows():
                        cid = row["category_id"]
                        if cid not in cat_to_label:
                            continue
                        labels.append(cat_to_label[cid])
                        rows.append(prod_embs[i])
                    if len(rows) < 50:
                        print("Not enough training rows for classifier. Need >=50. Skipping.")
                    else:
                        X = np.vstack(rows)
                        y = np.array(labels, dtype=np.int32)
                        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
                        lgb_train = lgb.Dataset(X_train, label=y_train)
                        lgb_eval = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
                        params = {
                            "objective": "multiclass",
                            "num_class": int(max(y) + 1),
                            "metric": "multi_logloss",
                            "verbosity": -1,
                            "num_threads": 4,
                            "learning_rate": 0.1,
                            "num_leaves": 31
                        }
                        print("Training LightGBM classifier (may take time)...")
                        # NOTE(review): the early_stopping_rounds keyword was
                        # removed in LightGBM 4.x (use callbacks=[lgb.early_stopping(30)]);
                        # confirm the installed version supports this call.
                        gbm = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval], early_stopping_rounds=30, num_boost_round=500)
                        # save classifier and mapping
                        clf_path = CACHE_DIR / "classifier.pkl"
                        safe_pickle_save({"model": gbm, "cat_to_label": cat_to_label, "label_to_cat": {v: k for k, v in cat_to_label.items()}}, clf_path)
                        print("Saved classifier.pkl")

    # cleanup if asked: keep only the core artifacts in cache/
    if args.clean_cache:
        keep = {"main_index.faiss", "metadata.pkl", "model_info.json", "parent_embeddings.pkl", "cross_store_synonyms.pkl"}
        if calibrator_obj:
            keep.add("calibrator.pkl")
        # remove everything else in cache
        removed = []
        for p in CACHE_DIR.iterdir():
            if p.name in keep:
                continue
            try:
                p.unlink()
                removed.append(p.name)
            except Exception:
                pass
        if removed:
            print("Removed cache files:", removed)

    print("DONE. Index + data saved to cache/")

if __name__ == "__main__":
    main()
validation_data.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ πŸ“Š VALIDATION DATA CREATOR
3
+ ===========================
4
+ Helper script to create validation CSV for confidence calibration.
5
+
6
+ Two modes:
7
+ 1. Sample from existing categories (automated)
8
+ 2. Manual entry (interactive)
9
+
10
+ Output format:
11
+ product_title,true_category_id
12
+ "Oxygen Sensor Tool",12345
13
+ "Hydraulic Oil Additive",67890
14
+
15
+ Usage:
16
+ # Automated sampling:
17
+ python create_validation_data.py auto data/category_id_path_only.csv
18
+
19
+ # Manual entry:
20
+ python create_validation_data.py manual
21
+ """
22
+
23
+ import pandas as pd
24
+ import sys
25
+ from pathlib import Path
26
+ import random
27
+
28
+
29
def sample_from_categories(csv_path, num_samples=100, output_file='data/validation.csv'):
    """
    Automatically create validation data by sampling from categories
    and generating product titles based on category paths.

    Args:
        csv_path: CSV whose first two columns are category_id and category_path.
        num_samples: Maximum number of validation rows to generate
            (capped at the number of available categories).
        output_file: Destination CSV path; parent directories are created.

    Returns:
        True on success, False if the input CSV has fewer than 2 columns.
    """
    print("\n" + "="*80)
    print("πŸ“Š AUTO-GENERATING VALIDATION DATA")
    print("="*80 + "\n")

    # Load categories
    print(f"Loading: {csv_path}")
    df = pd.read_csv(csv_path)

    if len(df.columns) < 2:
        print("❌ CSV must have at least 2 columns (category_id, category_path)")
        return False

    df.columns = ['category_id', 'category_path'] + list(df.columns[2:])
    df = df.dropna(subset=['category_path'])

    print(f"βœ… Loaded {len(df):,} categories\n")

    # Sample categories (fixed seed so the same rows come back every run)
    sample_size = min(num_samples, len(df))
    sampled = df.sample(n=sample_size, random_state=42)

    print(f"πŸ“ Generating {sample_size} validation entries...\n")

    # FIX: the original used the unseeded global `random.choice`, so the
    # generated titles differed on every run even though row sampling was
    # seeded. A local seeded RNG makes the whole file reproducible without
    # touching global random state.
    rng = random.Random(42)

    validation_data = []

    # itertuples avoids the per-row Series construction cost of iterrows
    for row in sampled.itertuples(index=False):
        cat_id = str(row.category_id)
        cat_path = str(row.category_path)

        # Use the last (up to) 3 levels of the category path as the title;
        # slicing handles short paths, so no length branching is needed.
        levels = cat_path.split('/')
        title_parts = levels[-3:]

        # Clean and combine
        title = ' '.join(part.strip() for part in title_parts).strip()

        # Add some variation so titles don't all mirror the raw path
        variations = [
            title,
            f"{title} kit",
            f"{title} tool",
            f"{title} set",
            f"professional {title}",
            f"{title} replacement",
        ]

        validation_data.append({
            'product_title': rng.choice(variations),
            'true_category_id': cat_id,
        })

    # Create DataFrame
    val_df = pd.DataFrame(validation_data)

    # Save (create parent dirs if missing)
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    val_df.to_csv(output_path, index=False)

    print(f"βœ… Created validation file: {output_path}")
    print(f"   Entries: {len(val_df):,}")

    # Show samples
    print("\nπŸ“ Sample entries:")
    for i, row in val_df.head(5).iterrows():
        print(f"   {i+1}. \"{row['product_title']}\" β†’ {row['true_category_id']}")

    print("\n" + "="*80)
    print("βœ… VALIDATION DATA CREATED!")
    print("="*80)
    print(f"\nNext step: Train with calibration")
    print(f"   python train_fixed_v2.py data/category_id_path_only.csv data/tags.json {output_path}")
    print("="*80 + "\n")

    return True
119
+
120
+
121
def manual_entry(output_file='data/validation_manual.csv'):
    """
    Interactively collect (product title, category ID) pairs from stdin
    and save them as a validation CSV.

    Entry continues until the user presses CTRL+C; rows with an empty
    title or category ID are rejected and re-prompted.

    Returns:
        True if at least one entry was saved, False otherwise.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("πŸ“ MANUAL VALIDATION DATA ENTRY")
    print(banner)
    print("\nEnter product titles and their correct category IDs.")
    print("Press CTRL+C when done.\n")

    entries = []

    try:
        while True:
            print(f"\n--- Entry #{len(entries) + 1} ---")

            title = input("Product title: ").strip()
            if not title:
                print("⚠️ Title cannot be empty")
                continue

            cat_id = input("Category ID: ").strip()
            if not cat_id:
                print("⚠️ Category ID cannot be empty")
                continue

            entries.append({'product_title': title, 'true_category_id': cat_id})
            print(f"βœ… Added: \"{title}\" β†’ {cat_id}")
    except KeyboardInterrupt:
        # CTRL+C is the intended way to finish entry, not an error condition.
        print("\n\nπŸ“Š Entry complete!")

    if not entries:
        print("❌ No entries created")
        return False

    val_df = pd.DataFrame(entries)

    # Persist, creating parent directories on demand
    destination = Path(output_file)
    destination.parent.mkdir(parents=True, exist_ok=True)
    val_df.to_csv(destination, index=False)

    print(f"\nβœ… Created validation file: {destination}")
    print(f"   Entries: {len(val_df):,}")

    print("\n" + banner)
    print("βœ… VALIDATION DATA CREATED!")
    print(banner)
    print(f"\nNext step: Train with calibration")
    print(f"   python train_fixed_v2.py data/category_id_path_only.csv data/tags.json {destination}")
    print(banner + "\n")

    return True
181
+
182
+
183
def verify_validation_file(validation_csv, categories_csv):
    """
    Verify that every validation row references a known category ID.

    Args:
        validation_csv: CSV with columns product_title, true_category_id.
        categories_csv: CSV whose first two columns are category_id and
            category_path.

    Returns:
        True if all validation entries reference valid category IDs,
        False if any entry is invalid or either CSV is malformed.
    """
    print("\n" + "="*80)
    print("πŸ” VERIFYING VALIDATION DATA")
    print("="*80 + "\n")

    # Load validation data
    print(f"Loading validation: {validation_csv}")
    val_df = pd.read_csv(validation_csv)

    if 'product_title' not in val_df.columns or 'true_category_id' not in val_df.columns:
        print("❌ Validation CSV must have: product_title, true_category_id")
        return False

    print(f"βœ… Loaded {len(val_df):,} validation entries\n")

    # Load categories
    print(f"Loading categories: {categories_csv}")
    cat_df = pd.read_csv(categories_csv)

    # FIX: guard against a malformed categories file; the original crashed
    # on the column rename when fewer than 2 columns were present.
    if len(cat_df.columns) < 2:
        print("❌ Categories CSV must have at least 2 columns (category_id, category_path)")
        return False

    cat_df.columns = ['category_id', 'category_path'] + list(cat_df.columns[2:])

    valid_ids = set(cat_df['category_id'].astype(str))
    print(f"βœ… Loaded {len(valid_ids):,} valid category IDs\n")

    # Verify — vectorized membership test instead of a per-row Python loop;
    # only the offending rows are iterated for reporting.
    print("Checking validation entries...")
    invalid_mask = ~val_df['true_category_id'].astype(str).isin(valid_ids)
    invalid_count = int(invalid_mask.sum())

    for _, row in val_df[invalid_mask].iterrows():
        print(f"❌ Invalid ID: {row['true_category_id']} for \"{row['product_title']}\"")

    if invalid_count == 0:
        print("βœ… All validation entries are valid!")
    else:
        print(f"\n⚠️ Found {invalid_count} invalid entries")

    # Summary
    print("\n" + "="*80)
    print("πŸ“Š VALIDATION DATA SUMMARY")
    print("="*80)
    print(f"Total entries: {len(val_df):,}")
    print(f"Valid entries: {len(val_df) - invalid_count:,}")
    print(f"Invalid entries: {invalid_count}")
    print("="*80 + "\n")

    return invalid_count == 0
236
+
237
+
238
def _print_creator_usage():
    """Print CLI usage and worked examples for the validation data creator."""
    print("Usage:")
    print("   python create_validation_data.py auto <csv_path> [num_samples] [output_file]")
    print("   python create_validation_data.py manual [output_file]")
    print("   python create_validation_data.py verify <validation_csv> <categories_csv>")
    print("\nExamples:")
    print("   # Auto-generate 100 samples:")
    print("   python create_validation_data.py auto data/category_id_path_only.csv")
    print()
    print("   # Auto-generate 200 samples:")
    print("   python create_validation_data.py auto data/category_id_path_only.csv 200")
    print()
    print("   # Manual entry:")
    print("   python create_validation_data.py manual")
    print()
    print("   # Verify validation file:")
    print("   python create_validation_data.py verify data/validation.csv data/category_id_path_only.csv")
    print()


def main():
    """Entry point: dispatch to auto / manual / verify based on sys.argv."""
    banner = "=" * 80
    print("\n" + banner)
    print("πŸ“Š VALIDATION DATA CREATOR")
    print(banner + "\n")

    args = sys.argv[1:]
    if not args:
        _print_creator_usage()
        return

    mode = args[0].lower()

    if mode == 'auto':
        if len(args) < 2:
            print("❌ CSV path required for auto mode")
            print("   python create_validation_data.py auto data/category_id_path_only.csv")
            return

        csv_path = args[1]
        num_samples = int(args[2]) if len(args) > 2 else 100
        output_file = args[3] if len(args) > 3 else 'data/validation.csv'

        if not Path(csv_path).exists():
            print(f"❌ File not found: {csv_path}")
            return

        sample_from_categories(csv_path, num_samples, output_file)

    elif mode == 'manual':
        output_file = args[1] if len(args) > 1 else 'data/validation_manual.csv'
        manual_entry(output_file)

    elif mode == 'verify':
        if len(args) < 3:
            print("❌ Both validation CSV and categories CSV required")
            print("   python create_validation_data.py verify data/validation.csv data/category_id_path_only.csv")
            return

        validation_csv, categories_csv = args[1], args[2]

        # Report the first missing input (same order as before: validation,
        # then categories) and bail out.
        for required in (validation_csv, categories_csv):
            if not Path(required).exists():
                print(f"❌ File not found: {required}")
                return

        verify_validation_file(validation_csv, categories_csv)

    else:
        print(f"❌ Unknown mode: {mode}")
        print("   Use: auto, manual, or verify")
308
+
309
+ if __name__ == "__main__":
310
+ main()