Spaces:
No application file
No application file
File size: 4,479 Bytes
1fccc5c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
import pandas as pd
import json
import re
from tqdm import tqdm
class HybridTagsGenerator:
    """Build de-duplicated tag lists for slash-separated category paths.

    A category path is a string such as ``"Electronics/Phones"``. Each
    non-empty, whitespace-stripped segment between ``/`` characters is treated
    as one hierarchy level; the last level is the leaf.
    """

    # Characters that are not word characters, whitespace or hyphens are
    # replaced by a space; runs of whitespace are then collapsed.
    _NON_WORD_RE = re.compile(r"[^\w\s-]")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        # Search intent patterns (original note: "E5 likes real text").
        # Only the first two patterns are used by build_tags.
        self.search_intents = [
            "buy {item}",
            "best {item}",
            "{item} reviews",
        ]

    @staticmethod
    def _split_levels(path):
        """Return the non-empty, stripped ``/``-separated segments of *path*."""
        stripped = (part.strip() for part in path.split("/"))
        return [part for part in stripped if part]

    def clean(self, text):
        """Return *text* lower-cased with punctuation and extra spaces removed.

        The value is converted with ``str()``, lower-cased, every character
        that is not a word character, whitespace or ``-`` is replaced by a
        space, whitespace runs are collapsed to one space and the result is
        stripped.
        """
        lowered = str(text).lower()
        spaced = self._NON_WORD_RE.sub(" ", lowered)
        return self._WHITESPACE_RE.sub(" ", spaced).strip()

    # -------------------------------------------------------
    # 1. Hierarchical tag boosting
    # -------------------------------------------------------
    def make_hierarchy_tags(self, path):
        """Return hierarchy-derived tags for *path* (duplicates included).

        The list contains, in order:

        * the full cleaned path repeated 8 times,
        * every cleaned prefix of the path (first level, first two, ...),
        * when there are at least two levels, four parent/child phrases
          built from the last two levels.

        Note that build_tags de-duplicates its output, so the repetition
        only survives for callers that use this method directly.
        """
        cleaned_levels = [self.clean(level) for level in self._split_levels(path)]

        # Strong full-path signal.
        tags = [" ".join(cleaned_levels)] * 8

        # Progressive hierarchy: "a", "a b", "a b c", ...
        tags.extend(
            " ".join(cleaned_levels[:depth])
            for depth in range(1, len(cleaned_levels) + 1)
        )

        # Parent-child reinforcement.
        if len(cleaned_levels) >= 2:
            parent, child = cleaned_levels[-2], cleaned_levels[-1]
            tags.extend([
                f"{parent} {child}",
                f"{child} {parent}",
                f"{child} in {parent}",
                f"{child} category {parent}",
            ])
        return tags

    # -------------------------------------------------------
    # 2. Extract key terms and word combos
    # -------------------------------------------------------
    def extract_terms(self, path):
        """Return unique key terms for *path*, in first-seen order.

        For every level this yields the cleaned level text and each of its
        words longer than three characters. For levels whose raw text equals
        one of the last two levels, adjacent pairs of those long words are
        added as bigrams.
        """
        levels = self._split_levels(path)
        tail_levels = levels[-2:]
        terms = []
        for level in levels:
            cleaned = self.clean(level)
            # Duplicates are dropped by the order-preserving pass below, so
            # no membership check is needed here.
            terms.append(cleaned)
            long_words = [word for word in cleaned.split() if len(word) > 3]
            terms.extend(long_words)
            # Bigrams for leaf and parent (matched by level text).
            if level in tail_levels:
                terms.extend(
                    f"{first} {second}"
                    for first, second in zip(long_words, long_words[1:])
                )
        # Remove duplicates, keep order.
        return list(dict.fromkeys(terms))

    # -------------------------------------------------------
    # 3. Build final tag list for ONE category
    # -------------------------------------------------------
    def build_tags(self, category_id, category_path):
        """Return the final tag list (at most 50 entries) for one category.

        Args:
            category_id: Identifier of the category. It is accepted for
                interface compatibility and is not used in the computation.
            category_path: Slash-separated category path.

        Returns:
            Cleaned, unique tags in insertion order: hierarchy tags, then up
            to 15 key terms, then search-intent phrases for the leaf. Empty
            tags and tags of more than six words are dropped.
        """
        candidates = list(self.make_hierarchy_tags(category_path))
        candidates.extend(self.extract_terms(category_path)[:15])

        # Search intent (for leaf level). The leaf is taken from the filtered
        # levels so that a trailing "/" or an empty path does not produce an
        # empty leaf, which would otherwise emit bare "buy" / "best" tags.
        levels = self._split_levels(category_path)
        leaf = self.clean(levels[-1]) if levels else ""
        if leaf:
            for pattern in self.search_intents[:2]:
                candidates.append(pattern.format(item=leaf))

        # Clean + dedupe + limit.
        seen = set()
        final = []
        for candidate in candidates:
            tag = self.clean(candidate)
            if not tag or tag in seen or len(tag.split()) > 6:
                continue
            seen.add(tag)
            final.append(tag)
        return final[:50]

    # -------------------------------------------------------
    # 4. Generate tags.json for entire CSV
    # -------------------------------------------------------
    def generate_tags_json(self, csv_path, output="tags.json"):
        """Build tags for every row of a CSV file and write them as JSON.

        Args:
            csv_path: Path of a CSV file with ``Category_ID`` and
                ``Category_path`` columns.
            output: Path of the JSON file to write.

        Returns:
            Dict mapping each category id (as a string) to its tag list. If
            an id occurs more than once, the last row wins.

        Raises:
            ValueError: If either required column is missing.
        """
        df = pd.read_csv(csv_path, dtype=str)
        required = ("Category_ID", "Category_path")
        if any(column not in df.columns for column in required):
            raise ValueError("CSV must contain Category_ID, Category_path columns")

        # Rows missing either value are skipped. A missing id would otherwise
        # be stringified and stored under the literal key "nan".
        df = df.dropna(subset=["Category_ID", "Category_path"])

        tags_dict = {}
        rows = zip(df["Category_ID"], df["Category_path"])
        for raw_id, raw_path in tqdm(rows, total=len(df), desc="Building tags"):
            cid = str(raw_id)
            tags_dict[cid] = self.build_tags(cid, str(raw_path))

        with open(output, "w", encoding="utf-8") as f:
            json.dump(tags_dict, f, indent=2)
        print(f"✅ DONE: {output} saved.")
        return tags_dict
if __name__ == "__main__":
    # Command-line entry point: read the CSV named by the first argument and
    # write the generated tags to "tags.json" in the current directory.
    import sys

    if len(sys.argv) < 2:
        # A string argument makes sys.exit print the message to stderr and
        # exit with status 1, so a usage error is not reported as success.
        sys.exit("Usage: python build_tags_json.py <categories.csv>")

    csv_file = sys.argv[1]
    gen = HybridTagsGenerator()
    gen.generate_tags_json(csv_file, "tags.json")
|