Spaces:

zen-vton
/

prediction

No application file

File size: 4,479 Bytes

1fccc5c



import pandas as pd
import json
import re
from tqdm import tqdm


class HybridTagsGenerator:
    """Build lists of text tags from slash-separated category paths.

    A category path such as ``"Electronics/Mobile Phones"`` is split on
    ``"/"`` into levels. Every level is normalised with :meth:`clean`, and
    levels that are empty after cleaning are ignored. Tags are assembled
    from the hierarchy itself, from individual words and bigrams, and from
    a few search-intent phrases built around the leaf level.
    """

    # How many times make_hierarchy_tags repeats the full-path tag.
    # NOTE: build_tags de-duplicates its output, so the repetition is only
    # visible to callers that use make_hierarchy_tags directly.
    FULL_PATH_BOOST = 8
    # Words shorter than this are not emitted as standalone terms.
    MIN_WORD_LENGTH = 4
    # Maximum number of extracted terms that build_tags takes over.
    MAX_TERMS = 15
    # Number of search-intent patterns (from the front of the list) used.
    MAX_SEARCH_INTENTS = 2
    # Tags with more words than this are dropped by build_tags.
    MAX_TAG_WORDS = 6
    # Upper bound on the number of tags returned by build_tags.
    MAX_TAGS = 50

    # Compiled once; clean() runs several times per category.
    _NON_WORD_RE = re.compile(r"[^\w\s-]")
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(self):
        # Search intent patterns (E5 likes real text)
        self.search_intents = [
            "buy {item}",
            "best {item}",
            "{item} reviews",
        ]

    def clean(self, text):
        """Return ``text`` lower-cased with punctuation and extra spaces removed.

        Characters other than word characters, whitespace and ``-`` are
        replaced by a space; runs of whitespace are then collapsed to a
        single space and the result is stripped.

        Args:
            text: Any value; it is converted with ``str()`` first.

        Returns:
            The normalised string (possibly empty).
        """
        text = str(text).lower()
        text = self._NON_WORD_RE.sub(" ", text)
        return self._WHITESPACE_RE.sub(" ", text).strip()

    def _split_levels(self, path):
        """Return the cleaned, non-empty levels of a slash-separated path."""
        cleaned = (self.clean(part) for part in path.split("/"))
        # Dropping levels that are empty *after* cleaning keeps punctuation-only
        # segments and trailing slashes from producing blank or dangling tags.
        return [level for level in cleaned if level]

    # -------------------------------------------------------
    # 1. Hierarchical tag boosting
    # -------------------------------------------------------
    def make_hierarchy_tags(self, path):
        """Build tags that describe the position of a category in its tree.

        The result contains, in order: the full path repeated
        ``FULL_PATH_BOOST`` times, every prefix of the path (root first),
        and - when there are at least two levels - four phrasings that
        combine the leaf with its direct parent.

        Args:
            path: Slash-separated category path.

        Returns:
            A list of tag strings (duplicates included); empty when the
            path has no usable level.
        """
        levels = self._split_levels(path)
        if not levels:
            return []

        # Strong full-path signal
        full = " ".join(levels)
        tags = [full] * self.FULL_PATH_BOOST

        # Progressive hierarchy: "a", "a b", "a b c", ...
        tags.extend(
            " ".join(levels[:depth]) for depth in range(1, len(levels) + 1)
        )

        # Parent-child reinforcement
        if len(levels) >= 2:
            parent, child = levels[-2:]
            tags.extend([
                f"{parent} {child}",
                f"{child} {parent}",
                f"{child} in {parent}",
                f"{child} category {parent}",
            ])

        return tags

    # -------------------------------------------------------
    # 2. Extract key terms and word combos
    # -------------------------------------------------------
    def extract_terms(self, path):
        """Extract level names, longer words and bigrams from a path.

        For each level the cleaned level name is emitted, followed by its
        words of at least ``MIN_WORD_LENGTH`` characters. For the last two
        levels (parent and leaf) adjacent pairs of those words are emitted
        as bigrams as well.

        Args:
            path: Slash-separated category path.

        Returns:
            The terms in first-seen order without duplicates.
        """
        levels = self._split_levels(path)
        # Bigrams are limited to the parent and leaf, selected by position so
        # that an ancestor sharing a name with them is not treated as one.
        first_bigram_level = max(len(levels) - 2, 0)
        terms = []

        for index, level in enumerate(levels):
            terms.append(level)

            words = [w for w in level.split() if len(w) >= self.MIN_WORD_LENGTH]
            terms.extend(words)

            if index >= first_bigram_level:
                terms.extend(
                    f"{left} {right}" for left, right in zip(words, words[1:])
                )

        # Remove duplicates, keep order
        return list(dict.fromkeys(terms))

    # -------------------------------------------------------
    # 3. Build final tag list for ONE category
    # -------------------------------------------------------
    def build_tags(self, category_id, category_path):
        """Build the final, de-duplicated tag list for one category.

        Combines the hierarchy tags, up to ``MAX_TERMS`` extracted terms and
        the first ``MAX_SEARCH_INTENTS`` search-intent phrases for the leaf
        level. Each tag is cleaned; empty tags, repeats and tags with more
        than ``MAX_TAG_WORDS`` words are discarded.

        Args:
            category_id: Identifier of the category. Not used for building
                the tags; kept for interface compatibility.
            category_path: Slash-separated category path.

        Returns:
            At most ``MAX_TAGS`` tags in first-seen order; empty when the
            path has no usable level.
        """
        levels = self._split_levels(category_path)

        # Hierarchy tags
        tags = list(self.make_hierarchy_tags(category_path))

        # Key terms
        tags.extend(self.extract_terms(category_path)[:self.MAX_TERMS])

        # Search intent (for leaf level). The leaf is the last *non-empty*
        # level, so "a/b/" yields "buy b" rather than a bare "buy".
        if levels:
            leaf = levels[-1]
            tags.extend(
                pattern.format(item=leaf)
                for pattern in self.search_intents[:self.MAX_SEARCH_INTENTS]
            )

        # Clean + dedupe + limit
        seen = set()
        final = []

        for tag in tags:
            cleaned = self.clean(tag)
            if (
                cleaned
                and cleaned not in seen
                and len(cleaned.split()) <= self.MAX_TAG_WORDS
            ):
                seen.add(cleaned)
                final.append(cleaned)

        return final[:self.MAX_TAGS]

    # -------------------------------------------------------
    # 4. Generate tags.json for entire CSV
    # -------------------------------------------------------
    def generate_tags_json(self, csv_path, output="tags.json"):
        """Build tags for every row of a CSV and write them to a JSON file.

        The CSV is read with all columns as strings. Rows without a
        ``Category_ID`` or ``Category_path`` value are skipped. If an ID
        occurs more than once, the last row wins.

        Args:
            csv_path: Path of the input CSV; must have the columns
                ``Category_ID`` and ``Category_path``.
            output: Path of the JSON file to write.

        Returns:
            A dict mapping each category ID to its list of tags (the same
            data that is written to ``output``).

        Raises:
            ValueError: If a required column is missing.
        """
        df = pd.read_csv(csv_path, dtype=str)

        if "Category_ID" not in df.columns or "Category_path" not in df.columns:
            raise ValueError("CSV must contain Category_ID, Category_path columns")

        # A missing ID would otherwise be stringified to "nan" and every such
        # row would collapse into one bogus "nan" entry.
        df = df.dropna(subset=["Category_ID", "Category_path"])

        tags_dict = {}

        # Only two columns are needed, so zip them instead of materialising a
        # Series per row with iterrows().
        rows = zip(df["Category_ID"], df["Category_path"])
        for cid, cpath in tqdm(rows, total=len(df), desc="Building tags"):
            cid = str(cid)
            tags_dict[cid] = self.build_tags(cid, str(cpath))

        with open(output, "w", encoding="utf-8") as f:
            json.dump(tags_dict, f, indent=2)

        print(f"✅ DONE: {output} saved.")
        return tags_dict


if __name__ == "__main__":
    import sys

    # Command-line entry point: the categories CSV is the only argument and
    # the result is always written to "tags.json" in the working directory.
    if len(sys.argv) < 2:
        # Passing a string makes sys.exit print it to stderr and exit with
        # status 1, so a missing argument is reported as a failure (a bare
        # sys.exit() would exit with status 0, i.e. success).
        sys.exit("Usage: python build_tags_json.py <categories.csv>")

    csv_file = sys.argv[1]
    gen = HybridTagsGenerator()
    gen.generate_tags_json(csv_file, "tags.json")