Spaces:
No application file
No application file
| import pandas as pd | |
| import json | |
| import re | |
| from tqdm import tqdm | |
class HybridTagsGenerator:
    """Build search-oriented tag lists from slash-separated category paths.

    A category path looks like ``"Electronics / Phones / Smart Phones"``.
    For each path the generator produces hierarchy phrases, key terms and
    a couple of search-intent phrases, then cleans, dedupes and caps them.
    """

    def __init__(self):
        # Search intent patterns (E5 likes real text).
        # NOTE: build_tags only uses the first two patterns.
        self.search_intents = [
            "buy {item}",
            "best {item}",
            "{item} reviews",
        ]

    def clean(self, text):
        """Lowercase *text*, replace punctuation with spaces, squeeze whitespace.

        Word characters, whitespace and hyphens are kept; everything else
        becomes a space before runs of whitespace are collapsed.
        """
        text = str(text).lower()
        text = re.sub(r"[^\w\s-]", " ", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def _clean_levels(self, path):
        """Split *path* on "/" and return the cleaned, non-empty levels.

        Levels that are blank, or that consist only of punctuation and so
        clean down to an empty string, are dropped. This keeps every
        method in agreement about what the parent and leaf levels are.
        """
        cleaned = (self.clean(part) for part in str(path).split("/"))
        return [level for level in cleaned if level]

    # -------------------------------------------------------
    # 1. Hierarchical tag boosting
    # -------------------------------------------------------
    def make_hierarchy_tags(self, path):
        """Return hierarchy phrases for *path*.

        The result contains the full path repeated eight times, every
        progressive prefix of the path, and four parent/child phrasings
        when the path has at least two levels. An empty path yields an
        empty list instead of a run of empty strings.
        """
        levels = self._clean_levels(path)
        if not levels:
            return []

        tags = []

        # Strong full-path signal. build_tags dedupes its output, so the
        # repetition is only visible to callers using this method directly.
        full = " ".join(levels)
        tags.extend([full] * 8)

        # Progressive hierarchy: "a", "a b", "a b c", ...
        for depth in range(1, len(levels) + 1):
            tags.append(" ".join(levels[:depth]))

        # Parent-child reinforcement
        if len(levels) >= 2:
            parent, child = levels[-2], levels[-1]
            tags.extend([
                f"{parent} {child}",
                f"{child} {parent}",
                f"{child} in {parent}",
                f"{child} category {parent}",
            ])
        return tags

    # -------------------------------------------------------
    # 2. Extract key terms and word combos
    # -------------------------------------------------------
    def extract_terms(self, path):
        """Return unique key terms for *path*, in first-seen order.

        Each level contributes its cleaned name and its words longer than
        three characters. The last two levels (parent and leaf) also
        contribute pairs of consecutive kept words.
        """
        levels = self._clean_levels(path)
        first_bigram_level = len(levels) - 2
        terms = []
        for index, level in enumerate(levels):
            terms.append(level)
            words = [w for w in level.split() if len(w) > 3]
            terms.extend(words)
            # Bigrams for leaf and parent only, chosen by position so that
            # an earlier level sharing a name with them is not included.
            if index >= first_bigram_level:
                for left, right in zip(words, words[1:]):
                    terms.append(f"{left} {right}")
        # Remove duplicates, keep order
        return list(dict.fromkeys(terms))

    # -------------------------------------------------------
    # 3. Build final tag list for ONE category
    # -------------------------------------------------------
    def build_tags(self, category_id, category_path):
        """Return the final tag list (at most 50 entries) for one category.

        *category_id* is accepted for interface compatibility and is not
        used when building the tags. Tags are cleaned and deduplicated,
        and any tag longer than six words is dropped.
        """
        tags = []

        # Hierarchy tags
        tags.extend(self.make_hierarchy_tags(category_path))

        # Key terms
        tags.extend(self.extract_terms(category_path)[:15])

        # Search intent (for leaf level). The leaf is the last non-empty
        # level, so a trailing "/" no longer yields bare "buy" / "best".
        levels = self._clean_levels(category_path)
        if levels:
            leaf = levels[-1]
            for pattern in self.search_intents[:2]:
                tags.append(pattern.format(item=leaf))

        # Clean + dedupe + limit
        seen = set()
        final = []
        for tag in tags:
            cleaned = self.clean(tag)
            if cleaned and cleaned not in seen and len(cleaned.split()) <= 6:
                seen.add(cleaned)
                final.append(cleaned)
        return final[:50]

    # -------------------------------------------------------
    # 4. Generate tags.json for entire CSV
    # -------------------------------------------------------
    def generate_tags_json(self, csv_path, output="tags.json"):
        """Read categories from *csv_path* and write their tags to *output*.

        The CSV must have ``Category_ID`` and ``Category_path`` columns.
        Rows missing either value are skipped; a missing ID would
        otherwise be stored under the literal key "nan".

        Returns the dict mapping category ID to its tag list.
        Raises ValueError if a required column is absent.
        """
        df = pd.read_csv(csv_path, dtype=str)
        required = ["Category_ID", "Category_path"]
        if any(column not in df.columns for column in required):
            raise ValueError("CSV must contain Category_ID, Category_path columns")
        df = df.dropna(subset=required)

        tags_dict = {}
        rows = zip(df["Category_ID"], df["Category_path"])
        for cid, cpath in tqdm(rows, total=len(df), desc="Building tags"):
            cid = str(cid)
            tags_dict[cid] = self.build_tags(cid, str(cpath))

        with open(output, "w", encoding="utf-8") as f:
            json.dump(tags_dict, f, indent=2)
        print(f"✅ DONE: {output} saved.")
        return tags_dict
if __name__ == "__main__":
    # Script entry point: build tags.json from the CSV named on the command line.
    import sys

    if len(sys.argv) < 2:
        # A usage error goes to stderr and exits non-zero so that shells
        # and pipelines can detect the failed invocation.
        print("Usage: python build_tags_json.py <categories.csv>", file=sys.stderr)
        sys.exit(1)

    csv_file = sys.argv[1]
    gen = HybridTagsGenerator()
    gen.generate_tags_json(csv_file, "tags.json")