zen-vton committed on
Commit
ad9b761
·
verified ·
1 Parent(s): dbddda1

Update synonyms.py

Browse files
Files changed (1) hide show
  1. synonyms.py +365 -853
synonyms.py CHANGED
@@ -1,854 +1,366 @@
1
- # """
2
- # 🤖 FIXED AI-POWERED SYNONYM MANAGER
3
- # ====================================
4
- # ✅ Windows + NVIDIA GPU optimized
5
- # ✅ Uses e5-base-v2 (lower memory)
6
- # Proper error handling
7
- # Progress tracking
8
-
9
- # Usage:
10
- # python synonym_manager_fixed.py autobuild data/category_id_path_only.csv
11
- # python synonym_manager_fixed.py autobuild data/category_id_path_only.csv --fast
12
- # """
13
-
14
- # import pickle
15
- # from pathlib import Path
16
- # import json
17
- # from collections import defaultdict
18
- # from tqdm import tqdm
19
- # import warnings
20
- # import sys
21
- # import os
22
-
23
- # warnings.filterwarnings('ignore')
24
-
25
- # # Fix CUDA issues on Windows
26
- # os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
27
-
28
- # try:
29
- # from nltk.corpus import wordnet
30
- # from nltk import download as nltk_download
31
- # WORDNET_AVAILABLE = True
32
- # except ImportError:
33
- # WORDNET_AVAILABLE = False
34
- # print("⚠️ NLTK not available. Install with: pip install nltk")
35
-
36
- # try:
37
- # from sentence_transformers import SentenceTransformer, util
38
- # import torch
39
- # TRANSFORMERS_AVAILABLE = True
40
- # except ImportError:
41
- # TRANSFORMERS_AVAILABLE = False
42
- # print("⚠️ SentenceTransformers not available.")
43
- # print(" Install with: pip install sentence-transformers torch")
44
-
45
-
46
- # class FixedAISynonymManager:
47
- # """Fixed AI-powered synonym manager for Windows + NVIDIA GPU"""
48
-
49
- # def __init__(self, cache_dir='cache', tags_file='data/tags.json', fast_mode=False):
50
- # self.cache_dir = Path(cache_dir)
51
- # self.synonyms_file = self.cache_dir / 'cross_store_synonyms.pkl'
52
- # self.tags_file = Path(tags_file)
53
- # self.synonyms = {}
54
- # self.tags_data = {}
55
- # self.model = None
56
- # self.device = "cpu"
57
- # self.fast_mode = fast_mode
58
-
59
- # # Create cache directory
60
- # self.cache_dir.mkdir(parents=True, exist_ok=True)
61
-
62
- # # Load existing data
63
- # self.load_tags()
64
- # if self.synonyms_file.exists():
65
- # self.load_synonyms()
66
- # else:
67
- # print("📝 No existing synonyms file. Will create new one.")
68
-
69
- # def load_tags(self):
70
- # """Load domain-specific tags (optional)"""
71
- # if self.tags_file.exists():
72
- # try:
73
- # with open(self.tags_file, 'r', encoding='utf-8') as f:
74
- # self.tags_data = json.load(f)
75
- # print(f"✅ Loaded {len(self.tags_data)} tag entries")
76
- # return True
77
- # except Exception as e:
78
- # print(f"⚠️ Could not load tags.json: {e}")
79
- # else:
80
- # print(f"ℹ️ tags.json not found (optional)")
81
- # return False
82
-
83
- # def load_synonyms(self):
84
- # """Load existing synonyms with format conversion"""
85
- # try:
86
- # with open(self.synonyms_file, 'rb') as f:
87
- # loaded = pickle.load(f)
88
-
89
- # # Handle different formats
90
- # if not loaded:
91
- # self.synonyms = {}
92
- # return
93
-
94
- # # Check format
95
- # first_val = next(iter(loaded.values()))
96
-
97
- # if isinstance(first_val, list):
98
- # if first_val and isinstance(first_val[0], tuple):
99
- # # New format: [(syn, conf, src), ...]
100
- # self.synonyms = loaded
101
- # print(f"✅ Loaded {len(self.synonyms)} synonym entries (new format)")
102
- # elif first_val and isinstance(first_val[0], str):
103
- # # Legacy format: [syn1, syn2, ...]
104
- # self.synonyms = {
105
- # k: [(v, 0.8, 'legacy') for v in vals]
106
- # for k, vals in loaded.items()
107
- # }
108
- # print(f" Converted {len(self.synonyms)} legacy synonym entries")
109
- # elif isinstance(first_val, set):
110
- # # Set format
111
- # self.synonyms = {
112
- # k: [(v, 0.8, 'legacy') for v in vals]
113
- # for k, vals in loaded.items()
114
- # }
115
- # print(f"✅ Converted {len(self.synonyms)} set-based entries")
116
- # else:
117
- # self.synonyms = {}
118
- # print(f"⚠️ Unknown synonym format")
119
-
120
- # except Exception as e:
121
- # print(f" Error loading synonyms: {e}")
122
- # self.synonyms = {}
123
-
124
- # def save_synonyms(self):
125
- # """Save synonyms in both formats"""
126
- # try:
127
- # # Save binary format
128
- # with open(self.synonyms_file, 'wb') as f:
129
- # pickle.dump(self.synonyms, f)
130
-
131
- # # Save readable JSON
132
- # json_file = self.cache_dir / 'synonyms_readable.json'
133
- # readable = {}
134
- # for term, syns in self.synonyms.items():
135
- # readable[term] = [
136
- # {'synonym': syn, 'confidence': float(conf), 'source': src}
137
- # for syn, conf, src in syns
138
- # ]
139
-
140
- # with open(json_file, 'w', encoding='utf-8') as f:
141
- # json.dump(readable, f, indent=2, ensure_ascii=False)
142
-
143
- # print(f"\n✅ Saved {len(self.synonyms)} synonym entries")
144
- # print(f" 📁 Binary: {self.synonyms_file}")
145
- # print(f" 📁 JSON: {json_file}")
146
- # return True
147
- # except Exception as e:
148
- # print(f"❌ Error saving synonyms: {e}")
149
- # return False
150
-
151
- # def load_transformer_model(self):
152
- # """Load e5-base-v2 model with GPU support"""
153
- # if not TRANSFORMERS_AVAILABLE:
154
- # print("❌ SentenceTransformers not installed!")
155
- # return False
156
-
157
- # # Check for CUDA
158
- # self.device = "cuda" if torch.cuda.is_available() else "cpu"
159
-
160
- # if self.device == "cuda":
161
- # print(f"🔥 NVIDIA GPU detected!")
162
- # try:
163
- # gpu_name = torch.cuda.get_device_name(0)
164
- # vram_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
165
- # print(f" GPU: {gpu_name}")
166
- # print(f" VRAM: {vram_gb:.1f} GB")
167
- # except:
168
- # pass
169
- # else:
170
- # print("💻 Using CPU (slower)")
171
-
172
- # # Use e5-base-v2 for better memory efficiency
173
- # model_name = "intfloat/e5-base-v2"
174
- # print(f"\n🤖 Loading model: {model_name}")
175
-
176
- # try:
177
- # self.model = SentenceTransformer(model_name, device=self.device)
178
- # self.model.max_seq_length = 256
179
-
180
- # # Use FP16 on GPU for speed
181
- # if self.device == "cuda":
182
- # self.model = self.model.half()
183
- # print("⚡ Enabled FP16 precision")
184
-
185
- # print("✅ Model loaded successfully\n")
186
- # return True
187
- # except Exception as e:
188
- # print(f"❌ Failed to load model: {e}")
189
- # return False
190
-
191
- # def get_wordnet_synonyms(self, word, limit=10):
192
- # """Get WordNet synonyms"""
193
- # if self.fast_mode or not WORDNET_AVAILABLE:
194
- # return []
195
-
196
- # try:
197
- # # Ensure WordNet is downloaded
198
- # try:
199
- # wordnet.synsets('test')
200
- # except:
201
- # print("📥 Downloading WordNet data...")
202
- # nltk_download('wordnet', quiet=True)
203
- # nltk_download('omw-1.4', quiet=True)
204
-
205
- # synonyms = []
206
- # word_clean = word.lower().replace(' ', '_')
207
-
208
- # for syn in wordnet.synsets(word_clean):
209
- # for lemma in syn.lemmas():
210
- # synonym = lemma.name().replace('_', ' ').lower()
211
- # if synonym != word.lower() and len(synonym) > 2:
212
- # confidence = 0.75 # Fixed confidence for WordNet
213
- # synonyms.append((synonym, confidence, 'wordnet'))
214
- # if len(synonyms) >= limit:
215
- # break
216
- # if len(synonyms) >= limit:
217
- # break
218
-
219
- # return synonyms[:limit]
220
- # except Exception:
221
- # return []
222
-
223
- # def get_semantic_synonyms(self, term, candidate_pool, threshold=0.70, limit=15):
224
- # """Get semantic synonyms using embeddings"""
225
- # if not self.model or not candidate_pool:
226
- # return []
227
-
228
- # try:
229
- # # E5 model requires query/passage prefixes
230
- # query = f"query: {term}"
231
- # candidates_prefixed = [f"passage: {c}" for c in candidate_pool]
232
-
233
- # # Encode query
234
- # term_emb = self.model.encode(
235
- # query,
236
- # convert_to_tensor=True,
237
- # show_progress_bar=False
238
- # )
239
-
240
- # # Encode candidates in batches
241
- # batch_size = 32 if self.device == "cuda" else 8
242
- # all_embeddings = []
243
-
244
- # for i in range(0, len(candidates_prefixed), batch_size):
245
- # batch = candidates_prefixed[i:i + batch_size]
246
- # emb = self.model.encode(
247
- # batch,
248
- # convert_to_tensor=True,
249
- # show_progress_bar=False
250
- # )
251
- # all_embeddings.append(emb)
252
-
253
- # # Concatenate all embeddings
254
- # candidate_embs = torch.cat(all_embeddings, dim=0)
255
-
256
- # # Calculate cosine similarity
257
- # scores = util.cos_sim(term_emb, candidate_embs)[0]
258
-
259
- # # Filter by threshold
260
- # synonyms = []
261
- # for candidate, score in zip(candidate_pool, scores):
262
- # score_val = float(score)
263
- # if score_val > threshold and candidate.lower() != term.lower():
264
- # # Scale confidence between 0.6 and 0.95
265
- # confidence = 0.60 + (score_val - threshold) * 0.35 / (1 - threshold)
266
- # synonyms.append((candidate, confidence, 'semantic'))
267
-
268
- # # Sort by confidence
269
- # synonyms.sort(key=lambda x: x[1], reverse=True)
270
- # return synonyms[:limit]
271
-
272
- # except Exception as e:
273
- # print(f"⚠️ Semantic error: {e}")
274
- # return []
275
-
276
- # def auto_generate_synonyms(self, term, candidate_pool=None,
277
- # semantic_threshold=0.70, silent=False):
278
- # """Generate synonyms from multiple sources"""
279
- # all_synonyms = []
280
-
281
- # if not silent:
282
- # print(f"\n🔍 Finding synonyms for: '{term}'")
283
-
284
- # # Source 1: WordNet
285
- # if WORDNET_AVAILABLE and not self.fast_mode:
286
- # wn_syns = self.get_wordnet_synonyms(term, limit=10)
287
- # all_synonyms.extend(wn_syns)
288
-
289
- # # Source 2: Semantic similarity
290
- # if candidate_pool and self.model:
291
- # sem_syns = self.get_semantic_synonyms(
292
- # term, candidate_pool,
293
- # threshold=semantic_threshold,
294
- # limit=15
295
- # )
296
- # all_synonyms.extend(sem_syns)
297
-
298
- # # Deduplicate (keep highest confidence)
299
- # synonym_map = {}
300
- # for syn, conf, source in all_synonyms:
301
- # syn_lower = syn.lower()
302
- # if syn_lower not in synonym_map or conf > synonym_map[syn_lower][1]:
303
- # synonym_map[syn_lower] = (syn, conf, source)
304
-
305
- # final_synonyms = sorted(
306
- # synonym_map.values(),
307
- # key=lambda x: x[1],
308
- # reverse=True
309
- # )
310
-
311
- # return final_synonyms
312
-
313
- # def add_synonym_group(self, term, synonyms_with_confidence):
314
- # """Add synonym group"""
315
- # term_lower = term.lower()
316
- # if term_lower not in self.synonyms:
317
- # self.synonyms[term_lower] = []
318
-
319
- # for syn, conf, src in synonyms_with_confidence:
320
- # # Check if already exists
321
- # if not any(s[0].lower() == syn.lower() for s in self.synonyms[term_lower]):
322
- # self.synonyms[term_lower].append((syn, conf, src))
323
-
324
- # def extract_terms_from_categories(self, csv_path, min_frequency=2):
325
- # """Extract terms from category CSV"""
326
- # print(f"\n📂 Extracting terms from: {csv_path}")
327
-
328
- # try:
329
- # import pandas as pd
330
-
331
- # # Read CSV
332
- # df = pd.read_csv(csv_path)
333
-
334
- # # Find path column (usually second column)
335
- # path_col = df.columns[1] if len(df.columns) > 1 else df.columns[0]
336
- # paths = df[path_col].dropna().astype(str)
337
-
338
- # print(f" Processing {len(paths):,} category paths...")
339
-
340
- # term_freq = defaultdict(int)
341
-
342
- # for path in tqdm(paths, desc="Analyzing paths"):
343
- # levels = path.split('/')
344
-
345
- # for level in levels:
346
- # words = level.lower().split()
347
-
348
- # # Single words
349
- # for word in words:
350
- # if len(word) > 2 and word.isalpha():
351
- # term_freq[word] += 1
352
-
353
- # # Two-word phrases
354
- # for i in range(len(words) - 1):
355
- # if len(words[i]) > 2 and len(words[i+1]) > 2:
356
- # phrase = f"{words[i]} {words[i+1]}"
357
- # if phrase.replace(' ', '').isalpha():
358
- # term_freq[phrase] += 1
359
-
360
- # # Filter by frequency
361
- # candidates = [
362
- # term for term, freq in term_freq.items()
363
- # if freq >= min_frequency
364
- # ]
365
-
366
- # print(f"✅ Extracted {len(candidates):,} terms (min frequency: {min_frequency})")
367
- # return candidates, term_freq
368
-
369
- # except Exception as e:
370
- # print(f"❌ Error extracting terms: {e}")
371
- # import traceback
372
- # traceback.print_exc()
373
- # return [], {}
374
-
375
- # def auto_build_from_categories(self, csv_path, top_terms=1000,
376
- # semantic_threshold=0.70):
377
- # """Auto-build synonym database from categories"""
378
- # print("\n" + "="*80)
379
- # print("🚀 AUTO-BUILD SYNONYM DATABASE")
380
- # print("="*80)
381
-
382
- # # Load model
383
- # if not self.load_transformer_model():
384
- # print("\n⚠️ Continuing with WordNet only (limited coverage)")
385
-
386
- # # Extract terms
387
- # all_terms, term_freq = self.extract_terms_from_categories(csv_path)
388
- # if not all_terms:
389
- # print("❌ No terms extracted")
390
- # return False
391
-
392
- # # Select top terms
393
- # print(f"\n🎯 Selecting top {top_terms} terms...")
394
- # top_frequent = sorted(
395
- # term_freq.items(),
396
- # key=lambda x: x[1],
397
- # reverse=True
398
- # )[:top_terms]
399
- # terms_to_process = [term for term, _ in top_frequent]
400
-
401
- # print(f"✅ Selected {len(terms_to_process)} terms")
402
- # print(f"📊 Top 10: {', '.join(terms_to_process[:10])}")
403
- # print(f"\n🔄 Generating synonyms (threshold={semantic_threshold})...\n")
404
-
405
- # # Process terms
406
- # stats = {
407
- # 'processed': 0,
408
- # 'synonyms': 0,
409
- # 'high_conf': 0
410
- # }
411
-
412
- # for term in tqdm(terms_to_process, desc="Processing"):
413
- # # Skip if already has enough synonyms
414
- # if term in self.synonyms and len(self.synonyms[term]) >= 10:
415
- # continue
416
-
417
- # # Generate synonyms
418
- # syns = self.auto_generate_synonyms(
419
- # term,
420
- # candidate_pool=all_terms,
421
- # semantic_threshold=semantic_threshold,
422
- # silent=True
423
- # )
424
-
425
- # if syns:
426
- # self.add_synonym_group(term, syns)
427
- # stats['processed'] += 1
428
- # stats['synonyms'] += len(syns)
429
- # stats['high_conf'] += sum(1 for _, c, _ in syns if c >= 0.8)
430
-
431
- # # Print stats
432
- # print(f"\n✅ Processed: {stats['processed']:,} terms")
433
- # print(f"✅ Total synonyms: {stats['synonyms']:,}")
434
- # print(f"✅ High confidence (≥0.8): {stats['high_conf']:,}")
435
-
436
- # # Save
437
- # self.save_synonyms()
438
-
439
- # print("\n🎉 AUTO-BUILD COMPLETE!\n")
440
- # return True
441
-
442
-
443
- # def main():
444
- # """Main entry point"""
445
- # print("\n" + "="*80)
446
- # print("🤖 AI-POWERED SYNONYM MANAGER (Windows + NVIDIA GPU)")
447
- # print("="*80 + "\n")
448
-
449
- # # Parse arguments
450
- # fast_mode = '--fast' in sys.argv
451
-
452
- # if len(sys.argv) < 2:
453
- # print("Usage:")
454
- # print(" python synonym_manager_fixed.py autobuild <csv_file>")
455
- # print(" python synonym_manager_fixed.py autobuild <csv_file> --fast")
456
- # print("\nExample:")
457
- # print(" python synonym_manager_fixed.py autobuild data/category_id_path_only.csv")
458
- # return
459
-
460
- # command = sys.argv[1].lower()
461
-
462
- # if command == 'autobuild':
463
- # if len(sys.argv) < 3:
464
- # print("❌ CSV file path required")
465
- # return
466
-
467
- # csv_path = sys.argv[2]
468
-
469
- # if not Path(csv_path).exists():
470
- # print(f"❌ File not found: {csv_path}")
471
- # return
472
-
473
- # # Initialize manager
474
- # manager = FixedAISynonymManager(fast_mode=fast_mode)
475
-
476
- # # Run auto-build
477
- # manager.auto_build_from_categories(csv_path, top_terms=1000)
478
-
479
- # else:
480
- # print(f"❌ Unknown command: {command}")
481
-
482
-
483
- # if __name__ == "__main__":
484
- # main()
485
-
486
-
487
- #for cache2
488
-
489
-
490
- """
491
- 🤖 AI-POWERED SYNONYM MANAGER (Fixed for Windows + GPU)
492
- ========================================================
493
- ✅ Uses e5-base-v2 (768D, memory-efficient)
494
- ✅ Windows + NVIDIA GPU optimized
495
- ✅ Generates cross-store synonyms automatically
496
-
497
- Usage:
498
- python synonym_manager_fixed.py autobuild data/category_id_path_only.csv
499
- python synonym_manager_fixed.py autobuild data/category_id_path_only.csv --fast
500
- """
501
-
502
- import pickle
503
- from pathlib import Path
504
- import json
505
- from collections import defaultdict
506
- from tqdm import tqdm
507
- import warnings
508
- import sys
509
- import os
510
-
511
- warnings.filterwarnings('ignore')
512
- os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
513
-
514
- try:
515
- from nltk.corpus import wordnet
516
- from nltk import download as nltk_download
517
- WORDNET_AVAILABLE = True
518
- except ImportError:
519
- WORDNET_AVAILABLE = False
520
-
521
- try:
522
- from sentence_transformers import SentenceTransformer, util
523
- import torch
524
- TRANSFORMERS_AVAILABLE = True
525
- except ImportError:
526
- TRANSFORMERS_AVAILABLE = False
527
-
528
-
529
- class SynonymManager:
530
- """AI-powered synonym manager"""
531
-
532
- def __init__(self, cache_dir='cache', fast_mode=False):
533
- self.cache_dir = Path(cache_dir)
534
- self.synonyms_file = self.cache_dir / 'cross_store_synonyms.pkl'
535
- self.synonyms = {}
536
- self.model = None
537
- self.device = "cpu"
538
- self.fast_mode = fast_mode
539
-
540
- self.cache_dir.mkdir(parents=True, exist_ok=True)
541
-
542
- if self.synonyms_file.exists():
543
- self.load_synonyms()
544
-
545
- def load_synonyms(self):
546
- """Load existing synonyms"""
547
- try:
548
- with open(self.synonyms_file, 'rb') as f:
549
- loaded = pickle.load(f)
550
-
551
- if loaded and list(loaded.values()):
552
- first_val = next(iter(loaded.values()))
553
-
554
- if isinstance(first_val, list) and first_val:
555
- if isinstance(first_val[0], tuple):
556
- self.synonyms = loaded
557
- else:
558
- self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}
559
- elif isinstance(first_val, set):
560
- self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}
561
-
562
- print(f"✅ Loaded {len(self.synonyms):,} synonym entries")
563
- except Exception as e:
564
- print(f"❌ Error loading synonyms: {e}")
565
- self.synonyms = {}
566
-
567
- def save_synonyms(self):
568
- """Save synonyms"""
569
- try:
570
- with open(self.synonyms_file, 'wb') as f:
571
- pickle.dump(self.synonyms, f)
572
-
573
- json_file = self.cache_dir / 'synonyms_readable.json'
574
- readable = {
575
- term: [
576
- {'synonym': syn, 'confidence': conf, 'source': src}
577
- for syn, conf, src in syns
578
- ]
579
- for term, syns in self.synonyms.items()
580
- }
581
- with open(json_file, 'w', encoding='utf-8') as f:
582
- json.dump(readable, f, indent=2, ensure_ascii=False)
583
-
584
- print(f"✅ Saved {len(self.synonyms):,} synonym entries")
585
- return True
586
- except Exception as e:
587
- print(f"❌ Error saving synonyms: {e}")
588
- return False
589
-
590
- def load_transformer_model(self):
591
- """Load e5-base-v2 model"""
592
- if not TRANSFORMERS_AVAILABLE:
593
- print("❌ SentenceTransformers not installed!")
594
- return False
595
-
596
- self.device = "cuda" if torch.cuda.is_available() else "cpu"
597
-
598
- if self.device == "cuda":
599
- print(f"🔥 NVIDIA GPU detected!")
600
-
601
- model_name = "intfloat/e5-base-v2"
602
- print(f"\n🤖 Loading {model_name}...")
603
-
604
- try:
605
- self.model = SentenceTransformer(model_name, device=self.device)
606
-
607
- if self.device == "cuda":
608
- self.model = self.model.half()
609
- print("⚡ Enabled FP16 precision")
610
-
611
- print("✅ Model loaded\n")
612
- return True
613
- except Exception as e:
614
- print(f"❌ Failed to load model: {e}")
615
- return False
616
-
617
- def get_wordnet_synonyms(self, word, limit=10):
618
- """Get WordNet synonyms"""
619
- if self.fast_mode or not WORDNET_AVAILABLE:
620
- return []
621
-
622
- try:
623
- try:
624
- wordnet.synsets('test')
625
- except:
626
- nltk_download('wordnet', quiet=True)
627
- nltk_download('omw-1.4', quiet=True)
628
-
629
- synonyms = []
630
- word_clean = word.lower().replace(' ', '_')
631
-
632
- for syn in wordnet.synsets(word_clean):
633
- for lemma in syn.lemmas():
634
- synonym = lemma.name().replace('_', ' ').lower()
635
- if synonym != word.lower() and len(synonym) > 2:
636
- confidence = 0.75
637
- synonyms.append((synonym, confidence, 'wordnet'))
638
- if len(synonyms) >= limit:
639
- break
640
- if len(synonyms) >= limit:
641
- break
642
-
643
- return synonyms[:limit]
644
- except Exception:
645
- return []
646
-
647
- def get_semantic_synonyms(self, term, candidate_pool, threshold=0.70, limit=15):
648
- """Get semantic synonyms using E5"""
649
- if not self.model or not candidate_pool:
650
- return []
651
-
652
- try:
653
- query = f"query: {term}"
654
- candidates_prefixed = [f"passage: {c}" for c in candidate_pool]
655
-
656
- term_emb = self.model.encode(query, convert_to_tensor=True, show_progress_bar=False)
657
-
658
- batch_size = 32 if self.device == "cuda" else 8
659
- all_embeddings = []
660
-
661
- for i in range(0, len(candidates_prefixed), batch_size):
662
- batch = candidates_prefixed[i:i + batch_size]
663
- emb = self.model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
664
- all_embeddings.append(emb)
665
-
666
- candidate_embs = torch.cat(all_embeddings, dim=0)
667
- scores = util.cos_sim(term_emb, candidate_embs)[0]
668
-
669
- synonyms = []
670
- for candidate, score in zip(candidate_pool, scores):
671
- score_val = float(score)
672
- if score_val > threshold and candidate.lower() != term.lower():
673
- confidence = 0.60 + (score_val - threshold) * 0.35 / (1 - threshold)
674
- synonyms.append((candidate, confidence, 'semantic'))
675
-
676
- synonyms.sort(key=lambda x: x[1], reverse=True)
677
- return synonyms[:limit]
678
-
679
- except Exception as e:
680
- print(f"⚠️ Semantic error: {e}")
681
- return []
682
-
683
- def auto_generate_synonyms(self, term, candidate_pool=None, semantic_threshold=0.70, silent=False):
684
- """Generate synonyms from multiple sources"""
685
- all_synonyms = []
686
-
687
- if not silent:
688
- print(f"\n🔍 Finding synonyms for: '{term}'")
689
-
690
- if WORDNET_AVAILABLE and not self.fast_mode:
691
- wn_syns = self.get_wordnet_synonyms(term, limit=10)
692
- all_synonyms.extend(wn_syns)
693
-
694
- if candidate_pool and self.model:
695
- sem_syns = self.get_semantic_synonyms(
696
- term, candidate_pool,
697
- threshold=semantic_threshold,
698
- limit=15
699
- )
700
- all_synonyms.extend(sem_syns)
701
-
702
- synonym_map = {}
703
- for syn, conf, source in all_synonyms:
704
- syn_lower = syn.lower()
705
- if syn_lower not in synonym_map or conf > synonym_map[syn_lower][1]:
706
- synonym_map[syn_lower] = (syn, conf, source)
707
-
708
- final_synonyms = sorted(synonym_map.values(), key=lambda x: x[1], reverse=True)
709
- return final_synonyms
710
-
711
- def add_synonym_group(self, term, synonyms_with_confidence):
712
- """Add synonym group"""
713
- term_lower = term.lower()
714
- if term_lower not in self.synonyms:
715
- self.synonyms[term_lower] = []
716
-
717
- for syn, conf, src in synonyms_with_confidence:
718
- if not any(s[0].lower() == syn.lower() for s in self.synonyms[term_lower]):
719
- self.synonyms[term_lower].append((syn, conf, src))
720
-
721
- def extract_terms_from_categories(self, csv_path, min_frequency=2):
722
- """Extract terms from category CSV"""
723
- print(f"\n📂 Extracting terms from: {csv_path}")
724
-
725
- try:
726
- import pandas as pd
727
-
728
- df = pd.read_csv(csv_path)
729
- path_col = df.columns[1] if len(df.columns) > 1 else df.columns[0]
730
- paths = df[path_col].dropna().astype(str)
731
-
732
- print(f" Processing {len(paths):,} category paths...")
733
-
734
- term_freq = defaultdict(int)
735
-
736
- for path in tqdm(paths, desc="Analyzing paths"):
737
- levels = path.split('/')
738
-
739
- for level in levels:
740
- words = level.lower().split()
741
-
742
- for word in words:
743
- if len(word) > 2 and word.isalpha():
744
- term_freq[word] += 1
745
-
746
- for i in range(len(words) - 1):
747
- if len(words[i]) > 2 and len(words[i+1]) > 2:
748
- phrase = f"{words[i]} {words[i+1]}"
749
- if phrase.replace(' ', '').isalpha():
750
- term_freq[phrase] += 1
751
-
752
- candidates = [
753
- term for term, freq in term_freq.items()
754
- if freq >= min_frequency
755
- ]
756
-
757
- print(f"✅ Extracted {len(candidates):,} terms (min frequency: {min_frequency})")
758
- return candidates, term_freq
759
-
760
- except Exception as e:
761
- print(f"❌ Error extracting terms: {e}")
762
- import traceback
763
- traceback.print_exc()
764
- return [], {}
765
-
766
- def auto_build_from_categories(self, csv_path, top_terms=1000, semantic_threshold=0.70):
767
- """Auto-build synonym database"""
768
- print("\n" + "="*80)
769
- print("🚀 AUTO-BUILD SYNONYM DATABASE")
770
- print("="*80)
771
-
772
- if not self.load_transformer_model():
773
- print("\n⚠️ Continuing with WordNet only")
774
-
775
- all_terms, term_freq = self.extract_terms_from_categories(csv_path)
776
- if not all_terms:
777
- print("❌ No terms extracted")
778
- return False
779
-
780
- print(f"\n🎯 Selecting top {top_terms} terms...")
781
- top_frequent = sorted(term_freq.items(), key=lambda x: x[1], reverse=True)[:top_terms]
782
- terms_to_process = [term for term, _ in top_frequent]
783
-
784
- print(f"✅ Selected {len(terms_to_process)} terms")
785
- print(f"📊 Top 10: {', '.join(terms_to_process[:10])}")
786
- print(f"\n🔄 Generating synonyms (threshold={semantic_threshold})...\n")
787
-
788
- stats = {'processed': 0, 'synonyms': 0, 'high_conf': 0}
789
-
790
- for term in tqdm(terms_to_process, desc="Processing"):
791
- if term in self.synonyms and len(self.synonyms[term]) >= 10:
792
- continue
793
-
794
- syns = self.auto_generate_synonyms(
795
- term,
796
- candidate_pool=all_terms,
797
- semantic_threshold=semantic_threshold,
798
- silent=True
799
- )
800
-
801
- if syns:
802
- self.add_synonym_group(term, syns)
803
- stats['processed'] += 1
804
- stats['synonyms'] += len(syns)
805
- stats['high_conf'] += sum(1 for _, c, _ in syns if c >= 0.8)
806
-
807
- print(f"\n✅ Processed: {stats['processed']:,} terms")
808
- print(f"✅ Total synonyms: {stats['synonyms']:,}")
809
- print(f"✅ High confidence (≥0.8): {stats['high_conf']:,}")
810
-
811
- self.save_synonyms()
812
-
813
- print("\n🎉 AUTO-BUILD COMPLETE!\n")
814
- return True
815
-
816
-
817
- def main():
818
- """Main entry point"""
819
- print("\n" + "="*80)
820
- print("🤖 AI-POWERED SYNONYM MANAGER")
821
- print("="*80 + "\n")
822
-
823
- fast_mode = '--fast' in sys.argv
824
-
825
- if len(sys.argv) < 2:
826
- print("Usage:")
827
- print(" python synonym_manager_fixed.py autobuild <csv_file>")
828
- print(" python synonym_manager_fixed.py autobuild <csv_file> --fast")
829
- print("\nExample:")
830
- print(" python synonym_manager_fixed.py autobuild data/category_id_path_only.csv")
831
- return
832
-
833
- command = sys.argv[1].lower()
834
-
835
- if command == 'autobuild':
836
- if len(sys.argv) < 3:
837
- print("❌ CSV file path required")
838
- return
839
-
840
- csv_path = sys.argv[2]
841
-
842
- if not Path(csv_path).exists():
843
- print(f"❌ File not found: {csv_path}")
844
- return
845
-
846
- manager = SynonymManager(fast_mode=fast_mode)
847
- manager.auto_build_from_categories(csv_path, top_terms=1000)
848
-
849
- else:
850
- print(f"❌ Unknown command: {command}")
851
-
852
-
853
- if __name__ == "__main__":
854
  main()
 
1
+
2
+ """
3
+ 🤖 AI-POWERED SYNONYM MANAGER (Fixed for Windows + GPU)
4
+ ========================================================
5
+ ✅ Uses e5-base-v2 (768D, memory-efficient)
6
+ Windows + NVIDIA GPU optimized
7
+ Generates cross-store synonyms automatically
8
+
9
+ Usage:
10
+ python synonym_manager_fixed.py autobuild data/category_id_path_only.csv
11
+ python synonym_manager_fixed.py autobuild data/category_id_path_only.csv --fast
12
+ """
13
+
14
+ import pickle
15
+ from pathlib import Path
16
+ import json
17
+ from collections import defaultdict
18
+ from tqdm import tqdm
19
+ import warnings
20
+ import sys
21
+ import os
22
+
23
+ warnings.filterwarnings('ignore')
24
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
25
+
26
+ try:
27
+ from nltk.corpus import wordnet
28
+ from nltk import download as nltk_download
29
+ WORDNET_AVAILABLE = True
30
+ except ImportError:
31
+ WORDNET_AVAILABLE = False
32
+
33
+ try:
34
+ from sentence_transformers import SentenceTransformer, util
35
+ import torch
36
+ TRANSFORMERS_AVAILABLE = True
37
+ except ImportError:
38
+ TRANSFORMERS_AVAILABLE = False
39
+
40
+
41
+ class SynonymManager:
42
+ """AI-powered synonym manager"""
43
+
44
+ def __init__(self, cache_dir='cache', fast_mode=False):
45
+ self.cache_dir = Path(cache_dir)
46
+ self.synonyms_file = self.cache_dir / 'cross_store_synonyms.pkl'
47
+ self.synonyms = {}
48
+ self.model = None
49
+ self.device = "cpu"
50
+ self.fast_mode = fast_mode
51
+
52
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
53
+
54
+ if self.synonyms_file.exists():
55
+ self.load_synonyms()
56
+
57
+ def load_synonyms(self):
58
+ """Load existing synonyms"""
59
+ try:
60
+ with open(self.synonyms_file, 'rb') as f:
61
+ loaded = pickle.load(f)
62
+
63
+ if loaded and list(loaded.values()):
64
+ first_val = next(iter(loaded.values()))
65
+
66
+ if isinstance(first_val, list) and first_val:
67
+ if isinstance(first_val[0], tuple):
68
+ self.synonyms = loaded
69
+ else:
70
+ self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}
71
+ elif isinstance(first_val, set):
72
+ self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}
73
+
74
+ print(f"✅ Loaded {len(self.synonyms):,} synonym entries")
75
+ except Exception as e:
76
+ print(f"❌ Error loading synonyms: {e}")
77
+ self.synonyms = {}
78
+
79
+ def save_synonyms(self):
80
+ """Save synonyms"""
81
+ try:
82
+ with open(self.synonyms_file, 'wb') as f:
83
+ pickle.dump(self.synonyms, f)
84
+
85
+ json_file = self.cache_dir / 'synonyms_readable.json'
86
+ readable = {
87
+ term: [
88
+ {'synonym': syn, 'confidence': conf, 'source': src}
89
+ for syn, conf, src in syns
90
+ ]
91
+ for term, syns in self.synonyms.items()
92
+ }
93
+ with open(json_file, 'w', encoding='utf-8') as f:
94
+ json.dump(readable, f, indent=2, ensure_ascii=False)
95
+
96
+ print(f"✅ Saved {len(self.synonyms):,} synonym entries")
97
+ return True
98
+ except Exception as e:
99
+ print(f"❌ Error saving synonyms: {e}")
100
+ return False
101
+
102
+ def load_transformer_model(self):
103
+ """Load e5-base-v2 model"""
104
+ if not TRANSFORMERS_AVAILABLE:
105
+ print("❌ SentenceTransformers not installed!")
106
+ return False
107
+
108
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
109
+
110
+ if self.device == "cuda":
111
+ print(f"🔥 NVIDIA GPU detected!")
112
+
113
+ model_name = "intfloat/e5-base-v2"
114
+ print(f"\n🤖 Loading {model_name}...")
115
+
116
+ try:
117
+ self.model = SentenceTransformer(model_name, device=self.device)
118
+
119
+ if self.device == "cuda":
120
+ self.model = self.model.half()
121
+ print(" Enabled FP16 precision")
122
+
123
+ print("✅ Model loaded\n")
124
+ return True
125
+ except Exception as e:
126
+ print(f"❌ Failed to load model: {e}")
127
+ return False
128
+
129
def get_wordnet_synonyms(self, word, limit=10):
    """Return up to ``limit`` distinct WordNet synonyms for ``word``.

    Args:
        word: Term to look up. Spaces are converted to underscores for the
            WordNet query.
        limit: Maximum number of synonyms to return; values <= 0 yield an
            empty list.

    Returns:
        list[tuple[str, float, str]]: ``(synonym, 0.75, 'wordnet')`` tuples in
        the order WordNet yields them. Empty when fast mode is on, NLTK is
        unavailable, ``limit`` is not positive, or the lookup fails.
    """
    if self.fast_mode or not WORDNET_AVAILABLE:
        return []
    if limit <= 0:
        return []

    try:
        try:
            # Probe the corpus; NLTK raises LookupError when it is not
            # installed locally, in which case it is downloaded on demand.
            wordnet.synsets('test')
        except LookupError:
            nltk_download('wordnet', quiet=True)
            nltk_download('omw-1.4', quiet=True)

        target = word.lower()
        lookup_key = target.replace(' ', '_')

        synonyms = []
        # The same lemma is commonly shared by several synsets; track what
        # has been emitted so duplicates do not consume the limit.
        emitted = set()

        for synset in wordnet.synsets(lookup_key):
            for lemma in synset.lemmas():
                candidate = lemma.name().replace('_', ' ').lower()
                if candidate == target or len(candidate) <= 2:
                    continue
                if candidate in emitted:
                    continue
                emitted.add(candidate)
                synonyms.append((candidate, 0.75, 'wordnet'))
                if len(synonyms) >= limit:
                    return synonyms

        return synonyms
    except Exception:
        # Best-effort source: any lookup failure simply yields no synonyms.
        return []
158
+
159
def get_semantic_synonyms(self, term, candidate_pool, threshold=0.70, limit=15):
    """Rank candidate terms by embedding similarity to ``term``.

    The term is encoded with a ``query: `` prefix and every candidate with a
    ``passage: `` prefix; candidates whose cosine similarity is strictly above
    ``threshold`` (and that differ from ``term`` ignoring case) are kept.

    Candidate embeddings are cached on the instance and reused while the
    model object and the pool contents stay the same, so calling this once
    per term over one shared pool encodes the pool only once.

    Args:
        term: Term to find synonyms for.
        candidate_pool: Sequence of candidate strings.
        threshold: Minimum cosine similarity (exclusive).
        limit: Maximum number of results.

    Returns:
        list[tuple[str, float, str]]: ``(candidate, confidence, 'semantic')``
        sorted by descending confidence. Confidence is 0.60 at the threshold
        and rises linearly to 0.95 at similarity 1.0. Empty when no model is
        loaded, the pool is empty, or encoding fails.
    """
    if not self.model or not candidate_pool:
        return []

    try:
        pool = tuple(candidate_pool)

        cached = getattr(self, '_candidate_embedding_cache', None)
        if cached is not None and cached[0] is self.model and cached[1] == pool:
            candidate_embs = cached[2]
        else:
            batch_size = 32 if self.device == "cuda" else 8
            prefixed = [f"passage: {c}" for c in pool]
            chunks = [
                self.model.encode(
                    prefixed[start:start + batch_size],
                    convert_to_tensor=True,
                    show_progress_bar=False,
                )
                for start in range(0, len(prefixed), batch_size)
            ]
            candidate_embs = torch.cat(chunks, dim=0)
            # Keyed on the model object and the pool contents so a reloaded
            # model or a different pool is re-encoded.
            self._candidate_embedding_cache = (self.model, pool, candidate_embs)

        term_emb = self.model.encode(
            f"query: {term}", convert_to_tensor=True, show_progress_bar=False
        )
        scores = util.cos_sim(term_emb, candidate_embs)[0]

        # Width of the similarity range mapped onto 0.60..0.95; zero or
        # negative when threshold >= 1, which would otherwise divide by zero.
        span = 1 - threshold
        term_key = term.lower()

        synonyms = []
        for candidate, score in zip(pool, scores):
            score_val = float(score)
            if score_val > threshold and candidate.lower() != term_key:
                if span > 0:
                    confidence = 0.60 + (score_val - threshold) * 0.35 / span
                else:
                    confidence = 0.95
                synonyms.append((candidate, confidence, 'semantic'))

        synonyms.sort(key=lambda x: x[1], reverse=True)
        return synonyms[:limit]

    except Exception as e:
        print(f"⚠️ Semantic error: {e}")
        return []
194
+
195
def auto_generate_synonyms(self, term, candidate_pool=None, semantic_threshold=0.70, silent=False):
    """Collect synonyms for ``term`` from WordNet and the embedding model.

    WordNet is consulted when NLTK is available and fast mode is off; the
    semantic source is consulted when a candidate pool is given and a model
    is loaded. Results are de-duplicated case-insensitively, keeping the
    entry with the highest confidence for each synonym.

    Args:
        term: Term to find synonyms for.
        candidate_pool: Optional candidate strings for the semantic search.
        semantic_threshold: Similarity threshold passed to the semantic search.
        silent: When False, print a progress line for the term.

    Returns:
        list[tuple[str, float, str]]: ``(synonym, confidence, source)`` tuples
        sorted by descending confidence.
    """
    if not silent:
        print(f"\n🔍 Finding synonyms for: '{term}'")

    gathered = []

    if WORDNET_AVAILABLE and not self.fast_mode:
        gathered.extend(self.get_wordnet_synonyms(term, limit=10))

    if candidate_pool and self.model:
        gathered.extend(
            self.get_semantic_synonyms(
                term,
                candidate_pool,
                threshold=semantic_threshold,
                limit=15,
            )
        )

    # Lower-cased synonym -> best (synonym, confidence, source) seen so far.
    best_by_key = {}
    for syn, conf, source in gathered:
        key = syn.lower()
        incumbent = best_by_key.get(key)
        if incumbent is None or conf > incumbent[1]:
            best_by_key[key] = (syn, conf, source)

    ranked = list(best_by_key.values())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked
222
+
223
def add_synonym_group(self, term, synonyms_with_confidence):
    """Merge synonyms for ``term`` into ``self.synonyms``.

    The term is stored under its lower-cased form; an entry is created even
    when no synonyms are supplied. A synonym is skipped when one with the
    same lower-cased text is already stored for the term, so the first
    occurrence wins.

    Args:
        term: Term the synonyms belong to.
        synonyms_with_confidence: Iterable of ``(synonym, confidence, source)``.
    """
    entries = self.synonyms.setdefault(term.lower(), [])

    # Set of already-stored synonyms, so each duplicate check is a lookup
    # instead of a rescan of the stored list.
    known = {existing[0].lower() for existing in entries}

    for syn, conf, src in synonyms_with_confidence:
        key = syn.lower()
        if key in known:
            continue
        known.add(key)
        entries.append((syn, conf, src))
232
+
233
def extract_terms_from_categories(self, csv_path, min_frequency=2):
    """Count words and two-word phrases found in category paths of a CSV.

    Paths are read from the second CSV column (or the only column when there
    is just one), split on ``/`` into levels, lower-cased and split on
    whitespace. Purely alphabetic words longer than two characters are
    counted, as are adjacent word pairs where both words are longer than two
    characters and alphabetic.

    Args:
        csv_path: Path of the category CSV file.
        min_frequency: Minimum count for a term to be returned as a candidate.

    Returns:
        tuple: ``(candidates, term_freq)`` where ``candidates`` lists terms
        seen at least ``min_frequency`` times and ``term_freq`` maps every
        counted term to its count. ``([], {})`` when anything fails.
    """
    print(f"\n📂 Extracting terms from: {csv_path}")

    try:
        import pandas as pd

        frame = pd.read_csv(csv_path)
        column_names = frame.columns
        if len(column_names) > 1:
            path_col = column_names[1]
        else:
            path_col = column_names[0]
        paths = frame[path_col].dropna().astype(str)

        print(f" Processing {len(paths):,} category paths...")

        term_freq = defaultdict(int)

        for path in tqdm(paths, desc="Analyzing paths"):
            for level in path.split('/'):
                words = level.lower().split()

                # Single words first, then phrases, so insertion order into
                # term_freq is unchanged.
                for token in words:
                    if len(token) > 2 and token.isalpha():
                        term_freq[token] += 1

                for left, right in zip(words, words[1:]):
                    if len(left) > 2 and len(right) > 2:
                        phrase = f"{left} {right}"
                        if phrase.replace(' ', '').isalpha():
                            term_freq[phrase] += 1

        candidates = []
        for name, count in term_freq.items():
            if count >= min_frequency:
                candidates.append(name)

        print(f"✅ Extracted {len(candidates):,} terms (min frequency: {min_frequency})")
        return candidates, term_freq

    except Exception as e:
        print(f" Error extracting terms: {e}")
        import traceback
        traceback.print_exc()
        return [], {}
277
+
278
def auto_build_from_categories(self, csv_path, top_terms=1000, semantic_threshold=0.70):
    """Build the synonym database from a category CSV and save it.

    Steps: try to load the embedding model (continuing without it on
    failure), extract terms from the CSV, pick the ``top_terms`` most frequent
    terms that passed the extraction frequency filter, generate synonyms for
    each using the full extracted term list as the candidate pool, then save.

    Terms that already have at least 10 stored synonyms are skipped.

    Args:
        csv_path: Path of the category CSV file.
        top_terms: Number of most frequent terms to process.
        semantic_threshold: Similarity threshold for semantic synonyms.

    Returns:
        bool: True when terms were processed and the result was saved; False
        when no terms could be extracted or saving failed.
    """
    print("\n" + "="*80)
    print("🚀 AUTO-BUILD SYNONYM DATABASE")
    print("="*80)

    if not self.load_transformer_model():
        print("\n⚠️ Continuing with WordNet only")

    all_terms, term_freq = self.extract_terms_from_categories(csv_path)
    if not all_terms:
        print("❌ No terms extracted")
        return False

    print(f"\n🎯 Selecting top {top_terms} terms...")
    # Rank only the terms that passed the extraction frequency filter;
    # term_freq also contains the rarer terms that filter rejected.
    eligible = set(all_terms)
    ranked = sorted(
        ((name, count) for name, count in term_freq.items() if name in eligible),
        key=lambda x: x[1],
        reverse=True,
    )
    terms_to_process = [name for name, _ in ranked[:top_terms]]

    print(f"✅ Selected {len(terms_to_process)} terms")
    print(f"📊 Top 10: {', '.join(terms_to_process[:10])}")
    print(f"\n🔄 Generating synonyms (threshold={semantic_threshold})...\n")

    stats = {'processed': 0, 'synonyms': 0, 'high_conf': 0}

    for term in tqdm(terms_to_process, desc="Processing"):
        # Skip terms that are already well covered.
        if term in self.synonyms and len(self.synonyms[term]) >= 10:
            continue

        syns = self.auto_generate_synonyms(
            term,
            candidate_pool=all_terms,
            semantic_threshold=semantic_threshold,
            silent=True
        )

        if syns:
            self.add_synonym_group(term, syns)
            stats['processed'] += 1
            stats['synonyms'] += len(syns)
            stats['high_conf'] += sum(1 for _, c, _ in syns if c >= 0.8)

    print(f"\n✅ Processed: {stats['processed']:,} terms")
    print(f"✅ Total synonyms: {stats['synonyms']:,}")
    print(f"✅ High confidence (0.8): {stats['high_conf']:,}")

    # save_synonyms() reports failure through its return value; do not claim
    # success when nothing was persisted.
    if not self.save_synonyms():
        print("\n❌ AUTO-BUILD FINISHED BUT SYNONYMS WERE NOT SAVED\n")
        return False

    print("\n🎉 AUTO-BUILD COMPLETE!\n")
    return True
327
+
328
+
329
def main():
    """Command-line entry point.

    Supported invocation: ``autobuild <csv_file> [--fast]``. The ``--fast``
    flag is accepted in any position and enables fast mode on the manager.
    Prints usage when no command is given and an error message for a missing
    CSV argument, a non-existent file, or an unknown command.
    """
    print("\n" + "="*80)
    print("🤖 AI-POWERED SYNONYM MANAGER")
    print("="*80 + "\n")

    cli_args = sys.argv[1:]
    fast_mode = '--fast' in cli_args
    # Separate the flag from positional arguments so its position on the
    # command line cannot be mistaken for the command or the CSV path.
    positional = [arg for arg in cli_args if arg != '--fast']

    if not positional:
        print("Usage:")
        print(" python synonym_manager_fixed.py autobuild <csv_file>")
        print(" python synonym_manager_fixed.py autobuild <csv_file> --fast")
        print("\nExample:")
        print(" python synonym_manager_fixed.py autobuild data/category_id_path_only.csv")
        return

    command = positional[0].lower()

    if command == 'autobuild':
        if len(positional) < 2:
            print("❌ CSV file path required")
            return

        csv_path = positional[1]

        if not Path(csv_path).exists():
            print(f"❌ File not found: {csv_path}")
            return

        manager = SynonymManager(fast_mode=fast_mode)
        manager.auto_build_from_categories(csv_path, top_terms=1000)

    else:
        print(f"❌ Unknown command: {command}")
363
+
364
+
365
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()