# Look-Buy-AI / scripts /12_build_metadata_fields.py
# Siddhesh Patil
# Initial commit: RAG-based AI assistant with env-based API key handling
# 333b839
from pathlib import Path
import re
import pandas as pd
# Directory two levels above this file; all data paths hang off it.
BASE_DIR = Path(__file__).resolve().parent.parent
# Input table read by main().
IN_PATH = BASE_DIR.joinpath("data", "processed", "products_10k.parquet")
# Enriched table written by main().
OUT_PATH = BASE_DIR.joinpath("data", "processed", "products_10k_enriched.parquet")
# Colour names recognised by extract_color(). All entries are lowercase;
# "rose gold" is the only multi-word name.
COLORS = {
    "black",
    "white",
    "silver",
    "gold",
    "gray",
    "grey",
    "blue",
    "navy",
    "red",
    "green",
    "yellow",
    "orange",
    "pink",
    "purple",
    "violet",
    "brown",
    "beige",
    "tan",
    "cream",
    "clear",
    "transparent",
    "rose",
    "rose gold",
}
# Lightweight keyword rules for guess_category(): (category, trigger words).
# Rules are tried top to bottom and the first rule with a matching word wins,
# so the order of this list is significant. Intentionally rough; to be refined.
CATEGORY_RULES = [
    ("phone case", ["case", "iphone", "samsung", "galaxy", "pixel", "cover"]),
    ("earrings", ["earring", "stud", "hoop", "piercing"]),
    ("necklace", ["necklace", "pendant", "chain"]),
    ("ring", ["ring", "band"]),
    ("watch", ["watch", "smartwatch"]),
    (
        "laptop accessory",
        ["laptop", "macbook", "notebook", "keyboard", "mouse", "trackpad"],
    ),
    (
        "kitchen tool",
        ["kitchen", "peeler", "spatula", "knife", "cookware", "pan", "pot", "utensil", "mop"],
    ),
    (
        "grocery",
        ["organic", "snack", "scone", "tofu", "chicken", "food", "drink", "beverage"],
    ),
    (
        "beauty",
        ["shampoo", "conditioner", "serum", "lotion", "cream", "makeup", "perfume"],
    ),
    ("home", ["chair", "table", "sofa", "lamp", "bedding", "pillow", "curtain"]),
    ("tools", ["cutter", "pliers", "wrench", "screwdriver", "drill"]),
    (
        "clothing",
        ["shirt", "hoodie", "jacket", "jeans", "dress", "pants", "shoes", "sneaker"],
    ),
]
# Tokens dropped by extract_keywords(): common English function words plus
# packaging / unit-of-measure words.
STOPWORDS = {
    # function words
    "the", "a", "an", "and", "or", "with", "for", "of", "to", "in", "on",
    "by", "from", "this", "that", "these", "those",
    # packaging and quantity words
    "new", "set", "pack", "pcs", "piece", "pieces", "count",
    # units
    "inch", "inches", "cm", "mm", "oz", "lbs", "lb",
}
def clean_text(s: str) -> str:
    """Normalise free text for keyword matching.

    The input is lowercased, every character other than ``a-z``, ``0-9``,
    whitespace and ``-`` is replaced by a space, and runs of whitespace are
    collapsed to a single space with the ends trimmed.

    Args:
        s: Text to normalise. Any falsy value (``None``, ``""``) is treated
            as empty.

    Returns:
        The normalised string, possibly empty.
    """
    lowered = s.lower() if s else ""
    # Non-ASCII letters and punctuation become separators, not joined text.
    separated = re.sub(r"[^a-z0-9\s\-]", " ", lowered)
    collapsed = re.sub(r"\s+", " ", separated)
    return collapsed.strip()
def extract_color(text: str):
    """Return the colour name from ``COLORS`` mentioned in *text*, if any.

    Multi-word colours (e.g. "rose gold") take precedence over single-word
    ones so that "rose gold" is never reported as just "rose" or "gold".
    Among single-word colours the one appearing first in the text is
    returned, which makes the result independent of set iteration order.

    Args:
        text: Raw product text; it is normalised with ``clean_text`` first.

    Returns:
        The matching entry of ``COLORS``, or ``None`` when no colour word
        is present.
    """
    t = clean_text(text)
    if not t:
        return None

    # Sort so the scan order never depends on string hashing.
    phrases = sorted((c for c in COLORS if " " in c), key=lambda c: (-len(c), c))
    singles = sorted((c for c in COLORS if " " not in c), key=lambda c: (-len(c), c))

    for phrase in phrases:
        # clean_text keeps hyphens, so accept "rose-gold" as well as
        # "rose gold"; word boundaries reject hits like "primrose golden".
        words = [re.escape(w) for w in phrase.split()]
        if re.search(r"\b" + r"[ \-]".join(words) + r"\b", t):
            return phrase

    if not singles:
        return None
    # A single alternation yields the leftmost colour word in the text.
    found = re.search(r"\b(?:" + "|".join(re.escape(c) for c in singles) + r")\b", t)
    return found.group(0) if found else None
def guess_category(text: str):
    """Guess a coarse product category from *text* using ``CATEGORY_RULES``.

    Rules are tried in list order and the first rule with a trigger word
    present in the normalised text wins. A trigger word matches as a whole
    word, optionally followed by a plural ``s``/``es`` so that the key
    "earring" also matches "earrings" and "watch" matches "watches".

    Args:
        text: Raw product text; it is normalised with ``clean_text`` first.

    Returns:
        The category name of the first matching rule, or ``"other"`` when
        no rule matches.
    """
    t = clean_text(text)
    if not t:
        return "other"
    for cat, keys in CATEGORY_RULES:
        if not keys:
            # An empty alternation would match at every word boundary.
            continue
        alternation = "|".join(re.escape(k) for k in keys)
        if re.search(rf"\b(?:{alternation})(?:s|es)?\b", t):
            return cat
    return "other"
def extract_brand(title: str):
    """Heuristically pull a brand name from the first word of *title*.

    The first whitespace-separated token is stripped of every character
    other than letters, digits, ``-`` and ``&``. It is accepted as a brand
    when it is at least two characters long and either appears in a small
    list of known brand tokens (compared case-insensitively) or starts with
    an uppercase ASCII letter.

    Args:
        title: Product title. Falsy or whitespace-only titles give ``None``.

    Returns:
        The cleaned first token in its original casing, or ``None`` when it
        does not look like a brand.
    """
    stripped = (title or "").strip()
    if not stripped:
        return None

    leading_token = stripped.split(maxsplit=1)[0]
    candidate = re.sub(r"[^A-Za-z0-9\-&]", "", leading_token)
    if len(candidate) < 2:
        return None

    # Known brand tokens that would otherwise fail the capitalisation test.
    known_brands = {"amazonbasics", "amazon", "igd", "igi", "rivet", "essentials", "365"}
    is_known = candidate.lower() in known_brands
    looks_capitalised = re.match(r"^[A-Z][A-Za-z0-9\-&]+$", candidate) is not None

    return candidate if (is_known or looks_capitalised) else None
def extract_keywords(text: str):
    """Return up to 40 distinct keyword tokens from *text*.

    The text is normalised with ``clean_text`` and split on spaces; tokens
    shorter than three characters or listed in ``STOPWORDS`` are dropped.
    Duplicates are removed while keeping first-seen order.

    Args:
        text: Raw product text.

    Returns:
        A list of at most 40 unique tokens in order of first appearance.
    """
    candidates = (
        token
        for token in clean_text(text).split()
        if token not in STOPWORDS and len(token) >= 3
    )
    # dict keys keep insertion order, giving a stable de-duplication.
    unique_in_order = list(dict.fromkeys(candidates))
    return unique_in_order[:40]
def _as_text(value) -> str:
    """Return a cell value as text, mapping None/NaN/NA (and falsy values) to ""."""
    # NaN is truthy and pd.NA cannot be truth-tested, so check for missing
    # values explicitly instead of relying on `value or ""`.
    if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
        return ""
    return str(value or "")


def main():
    """Enrich the product table with heuristic metadata and save it.

    Reads the parquet file at ``IN_PATH``, derives ``brand``, ``category``,
    ``color``, ``keywords`` and ``text_for_index`` for every row from its
    ``title`` and ``caption``, writes the result to ``OUT_PATH`` and prints
    a short summary. Missing titles/captions are treated as empty strings.

    Raises:
        FileNotFoundError: If ``IN_PATH`` does not exist.
    """
    if not IN_PATH.exists():
        raise FileNotFoundError(IN_PATH)
    df = pd.read_parquet(IN_PATH)

    # Ensure caption exists
    if "caption" not in df.columns:
        df["caption"] = ""

    # A table without a title column is processed as if every title were empty.
    titles = df["title"] if "title" in df.columns else [""] * len(df)

    brands, cats, cols, keywords, text_index = [], [], [], [], []
    # Iterate the two columns directly: avoids building a Series per row.
    for raw_title, raw_caption in zip(titles, df["caption"]):
        title = _as_text(raw_title)
        caption = _as_text(raw_caption)
        joined = f"{title} {caption}"

        b = extract_brand(title)
        c = extract_color(joined)
        cat = guess_category(joined)
        kw = extract_keywords(joined)

        brands.append(b)
        cols.append(c)
        cats.append(cat)
        keywords.append(kw)

        # Flat text blob combining the raw fields with the derived metadata.
        parts = [title, caption]
        if b:
            parts.append(f"brand {b}")
        if cat:
            parts.append(f"category {cat}")
        if c:
            parts.append(f"color {c}")
        parts.append("keywords " + " ".join(kw))
        text_index.append(" | ".join([p for p in parts if p]))

    df["brand"] = brands
    df["category"] = cats
    df["color"] = cols
    df["keywords"] = keywords
    df["text_for_index"] = text_index

    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(OUT_PATH, index=False)

    print("Saved:", OUT_PATH)
    print("Rows:", len(df))
    print("Category distribution (top 10):")
    print(df["category"].value_counts().head(10))
    print("Brand nulls:", int(df["brand"].isna().sum()))
    print("Color nulls:", int(df["color"].isna().sum()))
# Run the enrichment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()