"""Build search-oriented tag lists for category paths and save them as JSON.

Each category path is a "/"-separated hierarchy such as
"Electronics/Mobile Phones". For every category the generator combines
hierarchy phrases, key terms and a few search-intent phrases into one
de-duplicated, length-limited tag list.
"""

import json
import re
import sys
from typing import Dict, List

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so degrade to plain iteration
    # instead of failing when tqdm is not installed.
    def tqdm(iterable, *_args, **_kwargs):
        """Return *iterable* unchanged (stand-in for a missing tqdm)."""
        return iterable


# Compiled once: these run for every level and every tag of every category.
_NON_WORD_RE = re.compile(r"[^\w\s-]")
_WHITESPACE_RE = re.compile(r"\s+")


class HybridTagsGenerator:
    """Generate tag lists for "/"-separated category paths."""

    FULL_PATH_REPEAT = 8   # copies of the full-path phrase in the hierarchy tags
    MIN_WORD_LENGTH = 4    # shorter words are not emitted as standalone terms
    MAX_KEY_TERMS = 15     # key terms taken per category
    MAX_INTENTS = 2        # leading search-intent patterns applied to the leaf
    MAX_TAG_WORDS = 6      # tags with more words than this are dropped
    MAX_TAGS = 50          # upper bound on tags returned per category

    def __init__(self):
        # Search intent patterns (E5 likes real text)
        self.search_intents = [
            "buy {item}",
            "best {item}",
            "{item} reviews",
        ]

    def clean(self, text) -> str:
        """Normalize *text* for use as a tag.

        Lower-cases the string form of *text*, replaces every character that
        is not a word character, whitespace or hyphen with a space, and
        collapses whitespace runs into single spaces.
        """
        text = str(text).lower()
        text = _NON_WORD_RE.sub(" ", text)
        return _WHITESPACE_RE.sub(" ", text).strip()

    def _clean_levels(self, path) -> List[str]:
        """Return the cleaned, non-empty levels of a "/"-separated path.

        Levels that are empty or clean to nothing (a trailing "/", or a
        punctuation-only segment such as "&") are dropped so that no caller
        builds tags around an empty string.
        """
        cleaned = (self.clean(part) for part in str(path).split("/"))
        return [level for level in cleaned if level]

    # -------------------------------------------------------
    # 1. Hierarchical tag boosting
    # -------------------------------------------------------
    def make_hierarchy_tags(self, path) -> List[str]:
        """Return hierarchy-based tags for *path*, duplicates included.

        The list holds, in order: the full-path phrase repeated
        ``FULL_PATH_REPEAT`` times, every prefix of the hierarchy, and (for
        paths with at least two levels) four parent/child phrasings.
        Returns an empty list when *path* has no usable level.

        NOTE: ``build_tags`` de-duplicates its tags, so the repetition is
        only visible to callers that use this method directly.
        """
        levels = self._clean_levels(path)
        if not levels:
            return []

        # Strong full-path signal
        tags = [" ".join(levels)] * self.FULL_PATH_REPEAT

        # Progressive hierarchy: "a", "a b", "a b c", ...
        tags.extend(
            " ".join(levels[:depth]) for depth in range(1, len(levels) + 1)
        )

        # Parent-child reinforcement
        if len(levels) >= 2:
            parent, child = levels[-2], levels[-1]
            tags.extend([
                f"{parent} {child}",
                f"{child} {parent}",
                f"{child} in {parent}",
                f"{child} category {parent}",
            ])

        return tags

    # -------------------------------------------------------
    # 2. Extract key terms and word combos
    # -------------------------------------------------------
    def extract_terms(self, path) -> List[str]:
        """Return the unique key terms of *path* in first-seen order.

        For every level this yields the cleaned level itself and each of its
        words with at least ``MIN_WORD_LENGTH`` characters. The last two
        levels (parent and leaf) additionally contribute bigrams of
        neighbouring kept words.
        """
        levels = self._clean_levels(path)
        # Select parent and leaf by position; matching by value would also
        # pick up an earlier ancestor that happens to share their name.
        first_bigram_depth = len(levels) - 2

        terms = []
        for depth, level in enumerate(levels):
            terms.append(level)
            words = [
                word for word in level.split()
                if len(word) >= self.MIN_WORD_LENGTH
            ]
            terms.extend(words)
            if depth >= first_bigram_depth:
                terms.extend(
                    f"{left} {right}" for left, right in zip(words, words[1:])
                )

        # Remove duplicates, keep order
        return list(dict.fromkeys(terms))

    # -------------------------------------------------------
    # 3. Build final tag list for ONE category
    # -------------------------------------------------------
    def build_tags(self, category_id, category_path) -> List[str]:
        """Return the final tag list for a single category.

        Combines hierarchy tags, up to ``MAX_KEY_TERMS`` key terms and the
        first ``MAX_INTENTS`` search-intent phrases for the leaf level, then
        cleans and de-duplicates them, drops tags longer than
        ``MAX_TAG_WORDS`` words and keeps at most ``MAX_TAGS`` tags.

        *category_id* is accepted for interface compatibility; it does not
        influence the result.
        """
        # Hierarchy tags
        tags = self.make_hierarchy_tags(category_path)

        # Key terms
        tags.extend(self.extract_terms(category_path)[: self.MAX_KEY_TERMS])

        # Search intent (for leaf level). The leaf is the last *usable*
        # level, so a trailing "/" no longer yields bare "buy" / "best" tags.
        levels = self._clean_levels(category_path)
        if levels:
            leaf = levels[-1]
            tags.extend(
                pattern.format(item=leaf)
                for pattern in self.search_intents[: self.MAX_INTENTS]
            )

        # Clean + dedupe + limit
        seen = set()
        final = []
        for tag in tags:
            cleaned = self.clean(tag)
            if not cleaned or cleaned in seen:
                continue
            if len(cleaned.split()) > self.MAX_TAG_WORDS:
                continue
            seen.add(cleaned)
            final.append(cleaned)

        return final[: self.MAX_TAGS]

    # -------------------------------------------------------
    # 4. Generate tags.json for entire CSV
    # -------------------------------------------------------
    def generate_tags_json(self, csv_path, output="tags.json") -> Dict[str, List[str]]:
        """Build tags for every category in a CSV file and write them as JSON.

        Args:
            csv_path: CSV file with ``Category_ID`` and ``Category_path``
                columns; all values are read as strings.
            output: Path of the JSON file to write.

        Returns:
            Mapping of category id to its tag list. If an id occurs more
            than once, the last row wins.

        Raises:
            ValueError: If either required column is missing.
        """
        df = pd.read_csv(csv_path, dtype=str)

        required = ["Category_ID", "Category_path"]
        if any(column not in df.columns for column in required):
            raise ValueError("CSV must contain Category_ID, Category_path columns")

        # Drop rows missing either field: a missing id would otherwise be
        # stringified to the bogus key "nan".
        df = df.dropna(subset=required)

        tags_dict = {}
        rows = zip(df["Category_ID"], df["Category_path"])
        for cid, cpath in tqdm(rows, total=len(df), desc="Building tags"):
            cid = str(cid)
            tags_dict[cid] = self.build_tags(cid, str(cpath))

        with open(output, "w", encoding="utf-8") as f:
            json.dump(tags_dict, f, indent=2)

        print(f"✅ DONE: {output} saved.")
        return tags_dict


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # A string argument is written to stderr and exits with status 1.
        sys.exit("Usage: python build_tags_json.py <csv_path>")

    csv_file = sys.argv[1]
    gen = HybridTagsGenerator()
    gen.generate_tags_json(csv_file, "tags.json")