# Page header captured with this file (not program code):
#   Spaces: Sid9797 / Look-Buy-AI
#   Sleeping
#   File size: 4,884 Bytes
#   333b839

from pathlib import Path
import re
import pandas as pd

# Directory two levels above this file; the data paths below hang off it.
BASE_DIR = Path(__file__).resolve().parent.parent
# Input parquet read by main().
IN_PATH = BASE_DIR.joinpath("data", "processed", "products_10k.parquet")
# Output parquet written by main().
OUT_PATH = BASE_DIR.joinpath("data", "processed", "products_10k_enriched.parquet")


# Colour names recognised by extract_color(). Both spellings of grey are
# listed, and "rose gold" is the only multi-word entry.
COLORS = {
    # neutrals and metallics
    "black", "white", "gray", "grey", "silver", "gold", "rose gold",
    "beige", "tan", "cream", "brown",
    # see-through
    "clear", "transparent",
    # hues
    "red", "rose", "pink", "orange", "yellow", "green",
    "blue", "navy", "purple", "violet",
}

# Lightweight keyword rules for category guessing (good enough for
# submission; to be refined later). Each entry is (category, keywords).
# Order matters: guess_category() returns the category of the first rule
# that has a keyword present in the text.
CATEGORY_RULES = [
    (
        "phone case",
        ["case", "iphone", "samsung", "galaxy", "pixel", "cover"],
    ),
    (
        "earrings",
        ["earring", "stud", "hoop", "piercing"],
    ),
    (
        "necklace",
        ["necklace", "pendant", "chain"],
    ),
    (
        "ring",
        ["ring", "band"],
    ),
    (
        "watch",
        ["watch", "smartwatch"],
    ),
    (
        "laptop accessory",
        ["laptop", "macbook", "notebook", "keyboard", "mouse", "trackpad"],
    ),
    (
        "kitchen tool",
        ["kitchen", "peeler", "spatula", "knife", "cookware", "pan", "pot", "utensil", "mop"],
    ),
    (
        "grocery",
        ["organic", "snack", "scone", "tofu", "chicken", "food", "drink", "beverage"],
    ),
    (
        "beauty",
        ["shampoo", "conditioner", "serum", "lotion", "cream", "makeup", "perfume"],
    ),
    (
        "home",
        ["chair", "table", "sofa", "lamp", "bedding", "pillow", "curtain"],
    ),
    (
        "tools",
        ["cutter", "pliers", "wrench", "screwdriver", "drill"],
    ),
    (
        "clothing",
        ["shirt", "hoodie", "jacket", "jeans", "dress", "pants", "shoes", "sneaker"],
    ),
]


# Tokens dropped by extract_keywords(): common function words plus
# packaging / unit words.
STOPWORDS = {
    # articles, conjunctions, prepositions, demonstratives
    "the", "a", "an", "and", "or",
    "with", "for", "of", "to", "in", "on", "by", "from",
    "this", "that", "these", "those",
    # listing filler
    "new", "set", "pack", "pcs", "piece", "pieces", "count",
    # units
    "inch", "inches", "cm", "mm", "oz", "lbs", "lb",
}


def clean_text(s: str) -> str:
    """Normalise *s* for keyword matching.

    The text is lower-cased, every character other than ``a-z``, ``0-9``,
    whitespace and ``-`` is replaced by a space, whitespace runs are
    collapsed to one space, and the result is stripped. A falsy input
    (``None`` or ``""``) yields ``""``.
    """
    lowered = s.lower() if s else ""
    # Replace rather than delete, so words separated only by punctuation
    # do not get glued together.
    spaced = re.sub(r"[^a-z0-9\s\-]", " ", lowered)
    return re.sub(r"\s+", " ", spaced).strip()


def extract_color(text: str):
    """Return the colour named in *text*, or ``None`` if none is found.

    The text is normalised with ``clean_text`` first. ``"rose gold"`` wins
    outright whenever that phrase is present. Otherwise the whole-word
    colour from ``COLORS`` that appears earliest in the text is returned.

    ``COLORS`` is a set, so its iteration order is not stable between
    interpreter runs. Returning the first hit found while iterating it would
    make the result for multi-colour texts vary from run to run; ranking the
    matches by position keeps the output reproducible.
    """
    t = clean_text(text)
    # check multiword first
    if "rose gold" in t:
        return "rose gold"

    best = None
    for c in COLORS:
        m = re.search(rf"\b{re.escape(c)}\b", t)
        if m is None:
            continue
        # Earliest position wins; ties go to the longer name, then to
        # alphabetical order, so the choice never depends on set order.
        rank = (m.start(), -len(c), c)
        if best is None or rank < best:
            best = rank
    return best[2] if best is not None else None


def guess_category(text: str):
    """Return the category of the first rule matching *text*, else ``"other"``.

    The text is normalised with ``clean_text``. Rules in ``CATEGORY_RULES``
    are tried in order, and a rule matches when any of its keywords occurs
    as a whole word.
    """
    cleaned = clean_text(text)
    matching = (
        label
        for label, words in CATEGORY_RULES
        if any(re.search(rf"\b{re.escape(word)}\b", cleaned) for word in words)
    )
    # The generator is lazy, so evaluation stops at the first matching rule.
    return next(matching, "other")


def extract_brand(title: str):
    """Guess the brand from the first token of *title*.

    The first whitespace-separated token is reduced to letters, digits,
    ``-`` and ``&``. It is returned when it is at least two characters long
    and either is one of a few known brand names (compared
    case-insensitively) or starts with an uppercase ASCII letter. In every
    other case, including an empty or ``None`` title, the result is ``None``.
    """
    stripped = (title or "").strip()
    if not stripped:
        return None

    leading_token = stripped.split()[0]
    candidate = re.sub(r"[^A-Za-z0-9\-&]", "", leading_token)
    if len(candidate) < 2:
        return None

    # Known brands are accepted even when not capitalised (e.g. "365").
    known_brands = {"amazonbasics", "amazon", "igd", "igi", "rivet", "essentials", "365"}
    is_known = candidate.lower() in known_brands
    # Otherwise an uppercase first letter is taken as brand-like.
    looks_like_brand = re.match(r"^[A-Z][A-Za-z0-9\-&]+$", candidate) is not None

    return candidate if (is_known or looks_like_brand) else None


def extract_keywords(text: str):
    """Return up to 40 distinct keywords from *text*, in first-seen order.

    The text is normalised with ``clean_text`` and split on spaces. Tokens
    shorter than three characters or listed in ``STOPWORDS`` are dropped.
    """
    candidates = (
        token
        for token in clean_text(text).split()
        if len(token) >= 3 and token not in STOPWORDS
    )
    # dict keys keep insertion order, so this removes duplicates while
    # preserving the order of first occurrence.
    unique_tokens = list(dict.fromkeys(candidates))
    return unique_tokens[:40]


def _as_text(value) -> str:
    """Return *value* as text, mapping missing or falsy values to ``""``.

    Missing scalars (``None``, ``NaN``, ``NaT``, ``pd.NA``) become the empty
    string rather than the literal ``"nan"`` / ``"<NA>"``, which would
    otherwise end up in the keywords and the index text.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value or "")


def main():
    """Enrich the product table and write it to ``OUT_PATH``.

    Reads the parquet file at ``IN_PATH`` and derives ``brand``,
    ``category``, ``color``, ``keywords`` and ``text_for_index`` for each
    row from its title and caption. The enriched frame is written to
    ``OUT_PATH`` and a short summary is printed.

    Raises:
        FileNotFoundError: if ``IN_PATH`` does not exist.
    """
    if not IN_PATH.exists():
        raise FileNotFoundError(IN_PATH)

    df = pd.read_parquet(IN_PATH)

    # Ensure caption exists
    if "caption" not in df.columns:
        df["caption"] = ""

    # A missing title column is treated as all-empty titles.
    titles = df["title"] if "title" in df.columns else [""] * len(df)

    brands, cats, cols, keywords, text_index = [], [], [], [], []

    # Only two columns are needed, so walk them directly instead of
    # building a Series per row with iterrows().
    for raw_title, raw_caption in zip(titles, df["caption"]):
        title = _as_text(raw_title)
        caption = _as_text(raw_caption)
        joined = f"{title} {caption}"

        b = extract_brand(title)
        c = extract_color(joined)
        cat = guess_category(joined)
        kw = extract_keywords(joined)

        brands.append(b)
        cols.append(c)
        cats.append(cat)
        keywords.append(kw)

        # Single searchable string: raw text plus the derived attributes.
        parts = [title, caption]
        if b:
            parts.append(f"brand {b}")
        if cat:
            parts.append(f"category {cat}")
        if c:
            parts.append(f"color {c}")
        parts.append("keywords " + " ".join(kw))
        text_index.append(" | ".join([p for p in parts if p]))

    df["brand"] = brands
    df["category"] = cats
    df["color"] = cols
    df["keywords"] = keywords
    df["text_for_index"] = text_index

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(OUT_PATH, index=False)

    print("Saved:", OUT_PATH)
    print("Rows:", len(df))
    print("Category distribution (top 10):")
    print(df["category"].value_counts().head(10))
    print("Brand nulls:", int(df["brand"].isna().sum()))
    print("Color nulls:", int(df["color"].isna().sum()))


# Run the enrichment only when executed as a script, not when imported.
if __name__ == "__main__":
    main()