File size: 4,479 Bytes
1fccc5c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142


import pandas as pd
import json
import re
from tqdm import tqdm


class HybridTagsGenerator:
    """Build search-oriented tag lists from slash-separated category paths.

    A category path such as ``"Electronics/Phones"`` is split on ``/`` into
    levels; every level is normalised with :meth:`clean` and levels that are
    empty after normalisation are ignored everywhere, so leading/trailing
    slashes and punctuation-only segments never produce tags.
    """

    def __init__(self):
        # Search intent patterns (E5 likes real text)
        # NOTE: build_tags currently uses only the first two patterns.
        self.search_intents = [
            "buy {item}",
            "best {item}",
            "{item} reviews",
        ]

    def clean(self, text):
        """Return *text* lower-cased, with punctuation replaced by spaces.

        Word characters, whitespace and hyphens are kept; every other
        character becomes a space, then whitespace runs are collapsed to a
        single space and the result is stripped.
        """
        text = str(text).lower()
        text = re.sub(r"[^\w\s-]", " ", text)
        return re.sub(r"\s+", " ", text).strip()

    def _clean_levels(self, path):
        """Split *path* on ``/`` and return its non-empty cleaned levels."""
        cleaned = (self.clean(level) for level in path.split("/"))
        return [level for level in cleaned if level]

    # -------------------------------------------------------
    # 1. Hierarchical tag boosting
    # -------------------------------------------------------
    def make_hierarchy_tags(self, path):
        """Return hierarchy-derived tags for *path* (duplicates included).

        The list contains, in order: the full cleaned path repeated 8 times,
        each progressively longer prefix of the path, and - when there are at
        least two levels - four parent/child phrasings of the last two
        levels. A path with no usable levels yields an empty list.
        """
        levels = self._clean_levels(path)
        if not levels:
            return []

        # Strong full-path signal. The repetition only matters to callers
        # that consume this list directly; build_tags de-duplicates it.
        tags = [" ".join(levels)] * 8

        # Progressive hierarchy: "a", "a b", "a b c", ...
        tags.extend(
            " ".join(levels[:depth]) for depth in range(1, len(levels) + 1)
        )

        # Parent-child reinforcement
        if len(levels) >= 2:
            parent, child = levels[-2], levels[-1]
            tags.extend([
                f"{parent} {child}",
                f"{child} {parent}",
                f"{child} in {parent}",
                f"{child} category {parent}",
            ])

        return tags

    # -------------------------------------------------------
    # 2. Extract key terms and word combos
    # -------------------------------------------------------
    def extract_terms(self, path):
        """Return unique key terms for *path*, in first-seen order.

        For every level this adds the cleaned level itself and each of its
        words longer than three characters. For the last two levels (leaf
        and parent, selected by position) it also adds bigrams of adjacent
        long words.
        """
        levels = self._clean_levels(path)
        # Index of the parent level; bigrams apply from here to the leaf.
        first_bigram_level = max(len(levels) - 2, 0)
        terms = []

        for index, level in enumerate(levels):
            terms.append(level)

            words = [w for w in level.split() if len(w) > 3]
            terms.extend(words)

            # bigrams for leaf and parent
            if index >= first_bigram_level:
                terms.extend(f"{a} {b}" for a, b in zip(words, words[1:]))

        # Remove duplicates, keep order
        return list(dict.fromkeys(terms))

    # -------------------------------------------------------
    # 3. Build final tag list for ONE category
    # -------------------------------------------------------
    def build_tags(self, category_id, category_path):
        """Return the final, de-duplicated tag list for one category.

        Args:
            category_id: Identifier of the category. Not used when building
                the tags; kept so callers can pass the row as-is.
            category_path: Slash-separated category path.

        Returns:
            At most 50 unique cleaned tags, each at most six words long, in
            this order: hierarchy tags, up to 15 key terms, then the first
            two search-intent phrases for the leaf level. Empty when the
            path has no usable levels.
        """
        # Hierarchy tags
        tags = list(self.make_hierarchy_tags(category_path))

        # Key terms
        tags.extend(self.extract_terms(category_path)[:15])

        # Search intent (for leaf level). The leaf is the last non-empty
        # level, so a trailing "/" does not produce bare "buy"/"best" tags.
        levels = self._clean_levels(category_path)
        if levels:
            leaf = levels[-1]
            tags.extend(
                pattern.format(item=leaf) for pattern in self.search_intents[:2]
            )

        # Clean + dedupe + limit
        seen = set()
        final = []
        for tag in tags:
            cleaned = self.clean(tag)
            if cleaned and cleaned not in seen and len(cleaned.split()) <= 6:
                seen.add(cleaned)
                final.append(cleaned)

        return final[:50]

    # -------------------------------------------------------
    # 4. Generate tags.json for entire CSV
    # -------------------------------------------------------
    def generate_tags_json(self, csv_path, output="tags.json"):
        """Build tags for every category in a CSV and write them as JSON.

        Args:
            csv_path: CSV file with ``Category_ID`` and ``Category_path``
                columns (all columns are read as strings).
            output: Path of the JSON file to write.

        Returns:
            Dict mapping each category id to its tag list; the same mapping
            is written to *output*. If an id occurs more than once, the last
            row wins.

        Raises:
            ValueError: If a required column is missing from the CSV.
        """
        df = pd.read_csv(csv_path, dtype=str)

        required = ["Category_ID", "Category_path"]
        if any(column not in df.columns for column in required):
            raise ValueError("CSV must contain Category_ID, Category_path columns")

        # Rows without an id would otherwise all collapse onto the key "nan".
        df = df.dropna(subset=required)

        tags_dict = {}
        rows = zip(df["Category_ID"], df["Category_path"])

        for cid, cpath in tqdm(rows, total=len(df), desc="Building tags"):
            cid = str(cid)
            tags_dict[cid] = self.build_tags(cid, str(cpath))

        with open(output, "w", encoding="utf-8") as f:
            json.dump(tags_dict, f, indent=2)

        print(f"✅ DONE: {output} saved.")
        return tags_dict


if __name__ == "__main__":
    # Command-line entry point: read the CSV given as the first argument and
    # write the generated tags to "tags.json" in the current directory.
    import sys

    if len(sys.argv) < 2:
        # A missing argument is a usage error: report it on stderr and exit
        # with a non-zero status so shells and scripts can detect the failure.
        print("Usage: python build_tags_json.py <categories.csv>", file=sys.stderr)
        sys.exit(1)

    csv_file = sys.argv[1]
    gen = HybridTagsGenerator()
    gen.generate_tags_json(csv_file, "tags.json")