# prediction/synonyms.py
# (uploaded by zen-vton — "Upload 11 files", commit 1fccc5c; the hosting-page
# residue above was converted to comments so the module parses)
# NOTE: A fully commented-out earlier draft of this module (a near-duplicate
# of the code below, plus a tags.json loader) previously lived here. It was
# removed for readability; recover it from version control if needed.
"""
πŸ€– AI-POWERED SYNONYM MANAGER (Fixed for Windows + GPU)
========================================================
βœ… Uses e5-base-v2 (768D, memory-efficient)
βœ… Windows + NVIDIA GPU optimized
βœ… Generates cross-store synonyms automatically
Usage:
python synonym_manager_fixed.py autobuild data/category_id_path_only.csv
python synonym_manager_fixed.py autobuild data/category_id_path_only.csv --fast
"""
import pickle
from pathlib import Path
import json
from collections import defaultdict
from tqdm import tqdm
import warnings
import sys
import os
# Silence noisy library warnings (transformers/nltk emit many at import time).
warnings.filterwarnings('ignore')
# Windows workaround (per the original author's notes): tolerate duplicate
# OpenMP runtimes, which otherwise abort torch on some setups.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
# Optional dependency: NLTK/WordNet for lexical synonyms.
try:
    from nltk.corpus import wordnet
    from nltk import download as nltk_download
    WORDNET_AVAILABLE = True
except ImportError:
    WORDNET_AVAILABLE = False
# Optional dependency: sentence-transformers + torch for semantic synonyms.
try:
    from sentence_transformers import SentenceTransformer, util
    import torch
    TRANSFORMERS_AVAILABLE = True
except ImportError:
    TRANSFORMERS_AVAILABLE = False
class SynonymManager:
    """AI-powered synonym manager.

    Builds a cross-store synonym dictionary from two sources:

    * WordNet (lexical synonyms, fixed 0.75 confidence)
    * ``intfloat/e5-base-v2`` sentence embeddings (semantic similarity)

    Synonyms are kept as ``{term: [(synonym, confidence, source), ...]}`` and
    persisted to ``<cache_dir>/cross_store_synonyms.pkl`` plus a
    human-readable JSON mirror.
    """

    def __init__(self, cache_dir='cache', fast_mode=False):
        """Create the manager and load any previously cached synonyms.

        Args:
            cache_dir: Directory holding the pickle/JSON synonym caches.
            fast_mode: When True, skip the slower WordNet lookups.
        """
        self.cache_dir = Path(cache_dir)
        self.synonyms_file = self.cache_dir / 'cross_store_synonyms.pkl'
        self.synonyms = {}           # {term: [(synonym, confidence, source), ...]}
        self.model = None            # SentenceTransformer, loaded lazily
        self.device = "cpu"          # switched to "cuda" when a GPU is available
        self.fast_mode = fast_mode
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        if self.synonyms_file.exists():
            self.load_synonyms()

    def load_synonyms(self):
        """Load cached synonyms, converting legacy list/set formats in place.

        NOTE(security): ``pickle.load`` can execute arbitrary code during
        deserialization; only load cache files produced by this tool.
        """
        try:
            with open(self.synonyms_file, 'rb') as f:
                loaded = pickle.load(f)
            if loaded:
                # Sniff the on-disk format from the first value.
                first_val = next(iter(loaded.values()))
                if isinstance(first_val, list) and first_val:
                    if isinstance(first_val[0], tuple):
                        # Current format: [(syn, conf, src), ...]
                        self.synonyms = loaded
                    else:
                        # Legacy list format: [syn1, syn2, ...]
                        self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}
                elif isinstance(first_val, set):
                    # Legacy set format: {syn1, syn2, ...}
                    self.synonyms = {k: [(v, 0.8, 'legacy') for v in vals] for k, vals in loaded.items()}
                else:
                    # Fix: an unrecognized format used to be reported silently
                    # as "Loaded 0 entries"; warn explicitly instead.
                    print("⚠️ Unknown synonym format")
                print(f"βœ… Loaded {len(self.synonyms):,} synonym entries")
        except Exception as e:
            print(f"❌ Error loading synonyms: {e}")
            self.synonyms = {}

    def save_synonyms(self):
        """Persist synonyms as pickle plus readable JSON; return True on success."""
        try:
            with open(self.synonyms_file, 'wb') as f:
                pickle.dump(self.synonyms, f)
            json_file = self.cache_dir / 'synonyms_readable.json'
            readable = {
                term: [
                    # Fix: coerce confidence with float() so non-JSON-native
                    # numerics (e.g. tensor scalars from the semantic scorer)
                    # serialize cleanly.
                    {'synonym': syn, 'confidence': float(conf), 'source': src}
                    for syn, conf, src in syns
                ]
                for term, syns in self.synonyms.items()
            }
            with open(json_file, 'w', encoding='utf-8') as f:
                json.dump(readable, f, indent=2, ensure_ascii=False)
            print(f"βœ… Saved {len(self.synonyms):,} synonym entries")
            return True
        except Exception as e:
            print(f"❌ Error saving synonyms: {e}")
            return False

    def load_transformer_model(self):
        """Load intfloat/e5-base-v2 onto GPU (FP16) or CPU.

        Returns:
            True when the model is ready, False otherwise (missing dependency
            or load failure).
        """
        if not TRANSFORMERS_AVAILABLE:
            print("❌ SentenceTransformers not installed!")
            return False
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        if self.device == "cuda":
            print(f"πŸ”₯ NVIDIA GPU detected!")
        model_name = "intfloat/e5-base-v2"
        print(f"\nπŸ€– Loading {model_name}...")
        try:
            self.model = SentenceTransformer(model_name, device=self.device)
            if self.device == "cuda":
                # FP16 halves VRAM use and speeds up GPU encoding.
                self.model = self.model.half()
                print("⚑ Enabled FP16 precision")
            print("βœ… Model loaded\n")
            return True
        except Exception as e:
            print(f"❌ Failed to load model: {e}")
            return False

    def get_wordnet_synonyms(self, word, limit=10):
        """Return up to ``limit`` WordNet synonyms as (syn, 0.75, 'wordnet') tuples.

        Best-effort: returns [] in fast mode, when NLTK is unavailable, or on
        any WordNet failure.
        """
        if self.fast_mode or not WORDNET_AVAILABLE:
            return []
        try:
            try:
                # Probe for the corpus; NLTK raises LookupError when the data
                # has not been downloaded yet.
                wordnet.synsets('test')
            except LookupError:  # fix: was a bare ``except:``
                nltk_download('wordnet', quiet=True)
                nltk_download('omw-1.4', quiet=True)
            synonyms = []
            word_clean = word.lower().replace(' ', '_')
            for syn in wordnet.synsets(word_clean):
                for lemma in syn.lemmas():
                    synonym = lemma.name().replace('_', ' ').lower()
                    if synonym != word.lower() and len(synonym) > 2:
                        confidence = 0.75  # fixed confidence for WordNet hits
                        synonyms.append((synonym, confidence, 'wordnet'))
                    if len(synonyms) >= limit:
                        break
                if len(synonyms) >= limit:
                    break
            return synonyms[:limit]
        except Exception:
            # Deliberate best-effort: any WordNet error just yields no synonyms.
            return []

    def get_semantic_synonyms(self, term, candidate_pool, threshold=0.70, limit=15):
        """Return candidates semantically similar to ``term`` via E5 embeddings.

        Args:
            term: Query term.
            candidate_pool: Candidate strings to score against ``term``.
            threshold: Minimum cosine similarity to accept.
            limit: Maximum number of results.

        Returns:
            Up to ``limit`` (synonym, confidence, 'semantic') tuples, sorted by
            confidence descending; [] when no model is loaded or on error.
        """
        if not self.model or not candidate_pool:
            return []
        try:
            # E5 models expect "query:" / "passage:" prefixes.
            query = f"query: {term}"
            candidates_prefixed = [f"passage: {c}" for c in candidate_pool]
            term_emb = self.model.encode(query, convert_to_tensor=True, show_progress_bar=False)
            # Encode candidates in batches to bound memory use.
            batch_size = 32 if self.device == "cuda" else 8
            all_embeddings = []
            for i in range(0, len(candidates_prefixed), batch_size):
                batch = candidates_prefixed[i:i + batch_size]
                emb = self.model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
                all_embeddings.append(emb)
            candidate_embs = torch.cat(all_embeddings, dim=0)
            scores = util.cos_sim(term_emb, candidate_embs)[0]
            synonyms = []
            for candidate, score in zip(candidate_pool, scores):
                score_val = float(score)
                if score_val > threshold and candidate.lower() != term.lower():
                    # Map similarity in (threshold, 1.0] linearly onto (0.60, 0.95].
                    confidence = 0.60 + (score_val - threshold) * 0.35 / (1 - threshold)
                    synonyms.append((candidate, confidence, 'semantic'))
            synonyms.sort(key=lambda x: x[1], reverse=True)
            return synonyms[:limit]
        except Exception as e:
            print(f"⚠️ Semantic error: {e}")
            return []

    def auto_generate_synonyms(self, term, candidate_pool=None, semantic_threshold=0.70, silent=False):
        """Generate synonyms for ``term`` from all available sources.

        Merges WordNet and semantic results, de-duplicating case-insensitively
        and keeping the highest-confidence entry per synonym.

        Returns:
            List of (synonym, confidence, source) tuples, confidence-descending.
        """
        all_synonyms = []
        if not silent:
            print(f"\nπŸ” Finding synonyms for: '{term}'")
        if WORDNET_AVAILABLE and not self.fast_mode:
            wn_syns = self.get_wordnet_synonyms(term, limit=10)
            all_synonyms.extend(wn_syns)
        if candidate_pool and self.model:
            sem_syns = self.get_semantic_synonyms(
                term, candidate_pool,
                threshold=semantic_threshold,
                limit=15
            )
            all_synonyms.extend(sem_syns)
        # De-duplicate case-insensitively, keeping the highest confidence.
        synonym_map = {}
        for syn, conf, source in all_synonyms:
            syn_lower = syn.lower()
            if syn_lower not in synonym_map or conf > synonym_map[syn_lower][1]:
                synonym_map[syn_lower] = (syn, conf, source)
        final_synonyms = sorted(synonym_map.values(), key=lambda x: x[1], reverse=True)
        return final_synonyms

    def add_synonym_group(self, term, synonyms_with_confidence):
        """Merge (syn, conf, src) tuples into ``self.synonyms[term.lower()]``.

        Existing synonyms (matched case-insensitively) are left untouched.
        """
        term_lower = term.lower()
        if term_lower not in self.synonyms:
            self.synonyms[term_lower] = []
        for syn, conf, src in synonyms_with_confidence:
            if not any(s[0].lower() == syn.lower() for s in self.synonyms[term_lower]):
                self.synonyms[term_lower].append((syn, conf, src))

    def extract_terms_from_categories(self, csv_path, min_frequency=2):
        """Extract candidate terms (words and bigrams) from a category CSV.

        The CSV is assumed to carry '/'-separated category paths in its second
        column (falling back to the first) — TODO confirm against the data.

        Returns:
            (candidates, term_freq): terms seen at least ``min_frequency``
            times, and the full frequency map; ([], {}) on error.
        """
        print(f"\nπŸ“‚ Extracting terms from: {csv_path}")
        try:
            import pandas as pd
            df = pd.read_csv(csv_path)
            path_col = df.columns[1] if len(df.columns) > 1 else df.columns[0]
            paths = df[path_col].dropna().astype(str)
            print(f" Processing {len(paths):,} category paths...")
            term_freq = defaultdict(int)
            for path in tqdm(paths, desc="Analyzing paths"):
                levels = path.split('/')
                for level in levels:
                    words = level.lower().split()
                    # Single alphabetic words longer than 2 characters.
                    for word in words:
                        if len(word) > 2 and word.isalpha():
                            term_freq[word] += 1
                    # Adjacent two-word phrases, both words alphabetic.
                    for i in range(len(words) - 1):
                        if len(words[i]) > 2 and len(words[i+1]) > 2:
                            phrase = f"{words[i]} {words[i+1]}"
                            if phrase.replace(' ', '').isalpha():
                                term_freq[phrase] += 1
            candidates = [
                term for term, freq in term_freq.items()
                if freq >= min_frequency
            ]
            print(f"βœ… Extracted {len(candidates):,} terms (min frequency: {min_frequency})")
            return candidates, term_freq
        except Exception as e:
            print(f"❌ Error extracting terms: {e}")
            import traceback
            traceback.print_exc()
            return [], {}

    def auto_build_from_categories(self, csv_path, top_terms=1000, semantic_threshold=0.70):
        """Auto-build the synonym database from a category CSV.

        Loads the embedding model (falling back to WordNet-only), extracts
        terms, generates synonyms for the ``top_terms`` most frequent ones,
        and saves the result.

        Returns:
            True on success, False when no terms could be extracted.
        """
        print("\n" + "="*80)
        print("πŸš€ AUTO-BUILD SYNONYM DATABASE")
        print("="*80)
        if not self.load_transformer_model():
            print("\n⚠️ Continuing with WordNet only")
        all_terms, term_freq = self.extract_terms_from_categories(csv_path)
        if not all_terms:
            print("❌ No terms extracted")
            return False
        print(f"\n🎯 Selecting top {top_terms} terms...")
        top_frequent = sorted(term_freq.items(), key=lambda x: x[1], reverse=True)[:top_terms]
        terms_to_process = [term for term, _ in top_frequent]
        print(f"βœ… Selected {len(terms_to_process)} terms")
        print(f"πŸ“Š Top 10: {', '.join(terms_to_process[:10])}")
        print(f"\nπŸ”„ Generating synonyms (threshold={semantic_threshold})...\n")
        stats = {'processed': 0, 'synonyms': 0, 'high_conf': 0}
        for term in tqdm(terms_to_process, desc="Processing"):
            # Skip terms that already have a healthy synonym list.
            if term in self.synonyms and len(self.synonyms[term]) >= 10:
                continue
            syns = self.auto_generate_synonyms(
                term,
                candidate_pool=all_terms,
                semantic_threshold=semantic_threshold,
                silent=True
            )
            if syns:
                self.add_synonym_group(term, syns)
                stats['processed'] += 1
                stats['synonyms'] += len(syns)
                stats['high_conf'] += sum(1 for _, c, _ in syns if c >= 0.8)
        print(f"\nβœ… Processed: {stats['processed']:,} terms")
        print(f"βœ… Total synonyms: {stats['synonyms']:,}")
        print(f"βœ… High confidence (β‰₯0.8): {stats['high_conf']:,}")
        self.save_synonyms()
        print("\nπŸŽ‰ AUTO-BUILD COMPLETE!\n")
        return True
def main():
    """CLI entry point: parse ``sys.argv`` and dispatch the requested command."""
    banner = "=" * 80
    print("\n" + banner)
    print("πŸ€– AI-POWERED SYNONYM MANAGER")
    print(banner + "\n")
    use_fast = '--fast' in sys.argv
    # Guard: no command at all -> show usage and stop.
    if len(sys.argv) < 2:
        print("Usage:")
        print(" python synonym_manager_fixed.py autobuild <csv_file>")
        print(" python synonym_manager_fixed.py autobuild <csv_file> --fast")
        print("\nExample:")
        print(" python synonym_manager_fixed.py autobuild data/category_id_path_only.csv")
        return
    cmd = sys.argv[1].lower()
    # Guard: only 'autobuild' is supported.
    if cmd != 'autobuild':
        print(f"❌ Unknown command: {cmd}")
        return
    # Guard: 'autobuild' needs a CSV path that actually exists.
    if len(sys.argv) < 3:
        print("❌ CSV file path required")
        return
    csv_file = sys.argv[2]
    if not Path(csv_file).exists():
        print(f"❌ File not found: {csv_file}")
        return
    SynonymManager(fast_mode=use_fast).auto_build_from_categories(csv_file, top_terms=1000)
if __name__ == "__main__":
main()