Spaces:
Sleeping
Sleeping
| from pathlib import Path | |
| import re | |
| import pandas as pd | |
# Project root: two directory levels above this file.
BASE_DIR = Path(__file__).resolve().parent.parent

# Input product table and the enriched table written next to it.
IN_PATH = BASE_DIR.joinpath("data", "processed", "products_10k.parquet")
OUT_PATH = BASE_DIR.joinpath("data", "processed", "products_10k_enriched.parquet")
# Colour names recognised when scanning product text (all lower-case).
COLORS = {
    # neutrals and metallics
    "black", "white", "silver", "gold", "gray", "grey",
    # primary / bright hues
    "blue", "navy", "red", "green", "yellow", "orange",
    "pink", "purple", "violet",
    # earth tones
    "brown", "beige", "tan", "cream",
    # see-through finishes
    "clear", "transparent",
    # rose and its two-word variant
    "rose", "rose gold",
}
# Super lightweight category rules (good enough for submission; we'll refine later).
# Each entry is (category label, trigger keywords). Order matters: rules are
# listed from first to last priority.
CATEGORY_RULES = [
    (
        "phone case",
        ["case", "iphone", "samsung", "galaxy", "pixel", "cover"],
    ),
    (
        "earrings",
        ["earring", "stud", "hoop", "piercing"],
    ),
    (
        "necklace",
        ["necklace", "pendant", "chain"],
    ),
    (
        "ring",
        ["ring", "band"],
    ),
    (
        "watch",
        ["watch", "smartwatch"],
    ),
    (
        "laptop accessory",
        ["laptop", "macbook", "notebook", "keyboard", "mouse", "trackpad"],
    ),
    (
        "kitchen tool",
        ["kitchen", "peeler", "spatula", "knife", "cookware", "pan", "pot", "utensil", "mop"],
    ),
    (
        "grocery",
        ["organic", "snack", "scone", "tofu", "chicken", "food", "drink", "beverage"],
    ),
    (
        "beauty",
        ["shampoo", "conditioner", "serum", "lotion", "cream", "makeup", "perfume"],
    ),
    (
        "home",
        ["chair", "table", "sofa", "lamp", "bedding", "pillow", "curtain"],
    ),
    (
        "tools",
        ["cutter", "pliers", "wrench", "screwdriver", "drill"],
    ),
    (
        "clothing",
        ["shirt", "hoodie", "jacket", "jeans", "dress", "pants", "shoes", "sneaker"],
    ),
]
# Tokens dropped from keyword lists: function words plus packaging/unit terms.
STOPWORDS = {
    # articles, conjunctions, prepositions, demonstratives
    "the", "a", "an", "and", "or", "with", "for", "of", "to", "in", "on",
    "by", "from", "this", "that", "these", "those",
    # packaging / quantity words
    "new", "set", "pack", "pcs", "piece", "pieces", "count",
    # units of measure
    "inch", "inches", "cm", "mm", "oz", "lbs", "lb",
}
def clean_text(s: str) -> str:
    """Normalise free text for keyword matching.

    Lower-cases the input, replaces every character other than ``a-z``,
    ``0-9``, whitespace and ``-`` with a space, collapses whitespace runs to
    a single space, and trims both ends. A falsy input (``None`` or ``""``)
    yields ``""``.
    """
    lowered = s.lower() if s else ""
    allowed_only = re.sub(r"[^a-z0-9\s\-]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", allowed_only)
    return single_spaced.strip()
def extract_color(text: str):
    """Return the colour named in *text*, or ``None`` when none is found.

    The text is normalised with ``clean_text`` first. The two-word colour
    "rose gold" takes priority over every single-word colour. Otherwise the
    colour from ``COLORS`` that is mentioned earliest in the text wins.
    """
    t = clean_text(text)

    # Multi-word colour first. Word boundaries stop "primrose gold" (and
    # similar) from being reported as "rose gold".
    if re.search(r"\brose gold\b", t):
        return "rose gold"

    # COLORS is a set, so its iteration order is not stable between runs.
    # Scan every colour and keep the earliest mention so that the result for
    # a text naming several colours is deterministic.
    earliest = None  # (match start, colour)
    for c in COLORS:
        m = re.search(rf"\b{re.escape(c)}\b", t)
        if m is None:
            continue
        candidate = (m.start(), c)
        if earliest is None or candidate < earliest:
            earliest = candidate

    return earliest[1] if earliest is not None else None
def _keyword_pattern(keyword: str) -> str:
    """Return a regex fragment matching *keyword* and its regular plural."""
    # Sibilant endings pluralise with "es" (watch -> watches, dress -> dresses);
    # everything else takes a plain "s" (earring -> earrings).
    suffix = "es" if keyword.endswith(("s", "x", "z", "ch", "sh")) else "s"
    return rf"{re.escape(keyword)}(?:{suffix})?"


def guess_category(text: str):
    """Return the first category in ``CATEGORY_RULES`` whose keyword is in *text*.

    The text is normalised with ``clean_text``. A keyword matches as a whole
    word, in singular or regular plural form, so "Diamond Earrings" is
    recognised by the "earring" keyword. Rules are tried in list order and
    the first hit wins. Returns ``"other"`` when no rule matches.
    """
    t = clean_text(text)
    for cat, keys in CATEGORY_RULES:
        if not keys:
            # An empty alternation would match at every word boundary.
            continue
        # One search per category: any keyword hit selects the category, so
        # the order of keywords inside a rule does not affect the result.
        alternatives = "|".join(_keyword_pattern(k) for k in keys)
        if re.search(rf"\b(?:{alternatives})\b", t):
            return cat
    return "other"
def extract_brand(title: str):
    """Guess the brand from the first word of *title*.

    Heuristic: brands often lead the title ("AmazonBasics", "IGI", ...). The
    first whitespace-separated word is stripped of everything except
    letters, digits, ``-`` and ``&``. It is returned when it is at least two
    characters long and either is one of a few known brand tokens
    (case-insensitive) or starts with an upper-case ASCII letter.
    Otherwise, and for empty or ``None`` titles, ``None`` is returned.
    """
    words = (title or "").strip().split()
    if not words:
        return None

    candidate = re.sub(r"[^A-Za-z0-9\-&]", "", words[0])
    if len(candidate) < 2:
        return None

    # Known brand tokens accepted regardless of capitalisation.
    known_brands = {"amazonbasics", "amazon", "igd", "igi", "rivet", "essentials", "365"}
    is_known = candidate.lower() in known_brands
    # Brand-like shape: leading capital followed by brand-safe characters.
    is_capitalized = re.match(r"^[A-Z][A-Za-z0-9\-&]+$", candidate) is not None

    return candidate if (is_known or is_capitalized) else None
def extract_keywords(text: str):
    """Return up to 40 distinct keywords from *text*, in first-seen order.

    The text is normalised with ``clean_text`` and split on spaces. Tokens
    shorter than three characters or listed in ``STOPWORDS`` are dropped.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    distinct = dict.fromkeys(
        word
        for word in clean_text(text).split()
        if len(word) >= 3 and word not in STOPWORDS
    )
    return list(distinct)[:40]
def _text_or_empty(value) -> str:
    """Return a cell value as text, mapping missing or falsy cells to ``""``."""
    # None / NaN / pd.NA must not become the literal text "None" or "nan",
    # and pd.NA cannot be used in a boolean ``or`` expression at all.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value or "")


def main():
    """Enrich the product table and write it to ``OUT_PATH``.

    Reads the parquet file at ``IN_PATH`` and derives, for every row, a
    brand (from the title), a colour, a category and a keyword list (from
    title plus caption), plus a combined ``text_for_index`` string. The
    results are stored in the columns ``brand``, ``category``, ``color``,
    ``keywords`` and ``text_for_index``. The table is written to
    ``OUT_PATH`` and a short summary is printed.

    Raises:
        FileNotFoundError: if ``IN_PATH`` does not exist.
    """
    if not IN_PATH.exists():
        raise FileNotFoundError(IN_PATH)

    df = pd.read_parquet(IN_PATH)

    # Ensure caption exists
    if "caption" not in df.columns:
        df["caption"] = ""

    # A missing title column is treated as an empty title for every row.
    titles = df["title"] if "title" in df.columns else [""] * len(df)

    brands, cats, cols, keywords, text_index = [], [], [], [], []
    # Iterate the two columns directly instead of building a Series per row.
    for raw_title, raw_caption in zip(titles, df["caption"]):
        title = _text_or_empty(raw_title)
        caption = _text_or_empty(raw_caption)
        joined = f"{title} {caption}"

        b = extract_brand(title)
        c = extract_color(joined)
        cat = guess_category(joined)
        kw = extract_keywords(joined)

        brands.append(b)
        cols.append(c)
        cats.append(cat)
        keywords.append(kw)

        # Build the search text from the non-empty parts.
        parts = [title, caption]
        if b:
            parts.append(f"brand {b}")
        if cat:
            parts.append(f"category {cat}")
        if c:
            parts.append(f"color {c}")
        parts.append("keywords " + " ".join(kw))
        text_index.append(" | ".join([p for p in parts if p]))

    df["brand"] = brands
    df["category"] = cats
    df["color"] = cols
    df["keywords"] = keywords
    df["text_for_index"] = text_index

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(OUT_PATH, index=False)

    print("Saved:", OUT_PATH)
    print("Rows:", len(df))
    print("Category distribution (top 10):")
    print(df["category"].value_counts().head(10))
    print("Brand nulls:", int(df["brand"].isna().sum()))
    print("Color nulls:", int(df["color"].isna().sum()))
# Run the enrichment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()