Spaces:

Sid9797
/

Look-Buy-AI

Sleeping

Look-Buy-AI / scripts /12_build_metadata_fields.py

Siddhesh Patil

Initial commit: RAG-based AI assistant with env-based API key handling

333b839 4 months ago

4.88 kB

	from pathlib import Path
	import re
	import pandas as pd

	# Project root: two directory levels above this script's resolved location.
	BASE_DIR = Path(__file__).resolve().parents[1]
	# Input table read by main() and the enriched table it writes back out.
	IN_PATH = BASE_DIR.joinpath("data", "processed", "products_10k.parquet")
	OUT_PATH = BASE_DIR.joinpath("data", "processed", "products_10k_enriched.parquet")


	# Colour names recognised by extract_color(). Matching is done on lowercased
	# text, so every entry is lowercase; "rose gold" is the only multi-word entry.
	COLORS = {
	    # neutrals and metallics
	    "black", "white", "silver", "gold", "gray", "grey",
	    "brown", "beige", "tan", "cream",
	    # hues
	    "blue", "navy", "red", "green", "yellow", "orange",
	    "pink", "purple", "violet", "rose", "rose gold",
	    # see-through finishes
	    "clear", "transparent",
	}

	# Lightweight keyword rules used by guess_category(): each entry is
	# (category label, trigger keywords). Rules are tried top to bottom and the
	# first rule with a matching keyword wins, so the order here is significant.
	# Intentionally simple for now; expected to be refined later.
	CATEGORY_RULES = [
	    (
	        "phone case",
	        ["case", "iphone", "samsung", "galaxy", "pixel", "cover"],
	    ),
	    (
	        "earrings",
	        ["earring", "stud", "hoop", "piercing"],
	    ),
	    (
	        "necklace",
	        ["necklace", "pendant", "chain"],
	    ),
	    (
	        "ring",
	        ["ring", "band"],
	    ),
	    (
	        "watch",
	        ["watch", "smartwatch"],
	    ),
	    (
	        "laptop accessory",
	        ["laptop", "macbook", "notebook", "keyboard", "mouse", "trackpad"],
	    ),
	    (
	        "kitchen tool",
	        ["kitchen", "peeler", "spatula", "knife", "cookware", "pan", "pot", "utensil", "mop"],
	    ),
	    (
	        "grocery",
	        ["organic", "snack", "scone", "tofu", "chicken", "food", "drink", "beverage"],
	    ),
	    (
	        "beauty",
	        ["shampoo", "conditioner", "serum", "lotion", "cream", "makeup", "perfume"],
	    ),
	    (
	        "home",
	        ["chair", "table", "sofa", "lamp", "bedding", "pillow", "curtain"],
	    ),
	    (
	        "tools",
	        ["cutter", "pliers", "wrench", "screwdriver", "drill"],
	    ),
	    (
	        "clothing",
	        ["shirt", "hoodie", "jacket", "jeans", "dress", "pants", "shoes", "sneaker"],
	    ),
	]


	# Tokens dropped by extract_keywords(): common function words plus
	# packaging/unit terms that appear in the product text.
	STOPWORDS = {
	    # articles, conjunctions, prepositions, demonstratives
	    "the", "a", "an", "and", "or", "with", "for", "of", "to", "in", "on",
	    "by", "from", "this", "that", "these", "those",
	    # listing filler
	    "new", "set", "pack", "pcs", "piece", "pieces", "count",
	    # units of measure
	    "inch", "inches", "cm", "mm", "oz", "lbs", "lb",
	}


	def clean_text(s: str) -> str:
	    """Normalise free text for keyword matching.

	    The text is lowercased, every character other than ``a-z``, ``0-9``,
	    whitespace and ``-`` is replaced by a space, and runs of whitespace are
	    collapsed to a single space with the ends trimmed.

	    Args:
	        s: Raw text. Any falsy value (``None``, ``""``) is treated as empty.

	    Returns:
	        The normalised string; ``""`` when there is nothing left.
	    """
	    lowered = s.lower() if s else ""
	    # Punctuation becomes a space rather than vanishing so that neighbouring
	    # words stay separated.
	    spaced = re.sub(r"[^a-z0-9\s\-]", " ", lowered)
	    return re.sub(r"\s+", " ", spaced).strip()


	def extract_color(text: str):
	    """Return the first colour named in ``text``, or ``None`` if there is none.

	    The text is normalised with ``clean_text`` and searched for whole-word
	    occurrences of the names in ``COLORS``. The phrase "rose gold" is checked
	    first so it is not reported as plain "rose" or "gold".

	    Args:
	        text: Free text such as a product title and caption.

	    Returns:
	        The matching colour name, or ``None`` when no colour is found. When
	        several colours occur, the one that appears earliest in the text is
	        returned.
	    """
	    t = clean_text(text)
	    # Check the multi-word colour before its single-word components.
	    if "rose gold" in t:
	        return "rose gold"

	    # COLORS is a set of strings, and its iteration order can differ from one
	    # interpreter run to the next (string hash randomisation). Choosing by
	    # position in the text, with the name as a tie-break, keeps the result
	    # reproducible across runs.
	    earliest = None  # (match start, colour name)
	    for c in COLORS:
	        found = re.search(rf"\b{re.escape(c)}\b", t)
	        if found is None:
	            continue
	        candidate = (found.start(), c)
	        if earliest is None or candidate < earliest:
	            earliest = candidate

	    return earliest[1] if earliest is not None else None


	def guess_category(text: str):
	    """Assign a coarse category to ``text`` using ``CATEGORY_RULES``.

	    The text is normalised with ``clean_text``; rules are then tried in their
	    listed order, and the first rule with a keyword present as a whole word
	    decides the category.

	    Args:
	        text: Free text such as a product title and caption.

	    Returns:
	        The label of the first matching rule, or ``"other"`` if none match.
	    """
	    normalized = clean_text(text)

	    def mentions(keyword):
	        # Whole-word match only, so "ring" does not fire on "earring".
	        return re.search(rf"\b{re.escape(keyword)}\b", normalized) is not None

	    matching_labels = (
	        label
	        for label, keywords in CATEGORY_RULES
	        if any(mentions(keyword) for keyword in keywords)
	    )
	    return next(matching_labels, "other")


	def extract_brand(title: str):
	    """Guess a brand from the first word of a product title.

	    Heuristic: brands often lead the title ("AmazonBasics", "IGI", ...). The
	    first whitespace-separated word is stripped of everything except
	    letters, digits, ``-`` and ``&``. It is accepted as the brand when it is
	    one of a few known names (compared case-insensitively) or when it starts
	    with an uppercase ASCII letter.

	    Args:
	        title: Product title. ``None`` or an empty string is treated as blank.

	    Returns:
	        The cleaned first word in its original casing, or ``None`` when the
	        title is blank, the cleaned word is shorter than two characters, or
	        it does not look like a brand.
	    """
	    words = (title or "").split()
	    if not words:
	        return None

	    candidate = re.sub(r"[^A-Za-z0-9\-&]", "", words[0])
	    if len(candidate) < 2:
	        return None

	    # Known names are accepted regardless of how the title capitalises them.
	    is_known = candidate.lower() in {
	        "amazonbasics", "amazon", "igd", "igi", "rivet", "essentials", "365",
	    }
	    # Otherwise require a leading capital letter as a weak "proper noun" signal.
	    looks_like_brand = re.match(r"^[A-Z][A-Za-z0-9\-&]+$", candidate) is not None

	    return candidate if (is_known or looks_like_brand) else None


	def extract_keywords(text: str):
	    """Extract up to 40 distinct keywords from ``text``.

	    The text is normalised with ``clean_text`` and split on spaces. Tokens
	    shorter than three characters or listed in ``STOPWORDS`` are dropped,
	    and repeated tokens are kept only at their first position.

	    Args:
	        text: Free text such as a product title and caption.

	    Returns:
	        A list of at most 40 unique tokens in order of first appearance.
	    """
	    candidates = (
	        token
	        for token in clean_text(text).split()
	        if token not in STOPWORDS and len(token) >= 3
	    )
	    # dict keys keep insertion order, which gives an order-preserving de-dup.
	    unique_tokens = list(dict.fromkeys(candidates))
	    return unique_tokens[:40]


	def _cell_text(value) -> str:
	    """Return a cell value as text, mapping missing or falsy values to ""."""
	    # NaN is truthy, so a plain `value or ""` would turn a missing cell into
	    # the literal text "nan"; pd.NA cannot be truth-tested at all.
	    if pd.api.types.is_scalar(value) and pd.isna(value):
	        return ""
	    return str(value or "")


	def _build_index_text(title, caption, brand, category, color, keywords) -> str:
	    """Join a row's text and derived fields into one " | "-separated string."""
	    parts = [title, caption]
	    if brand:
	        parts.append(f"brand {brand}")
	    if category:
	        parts.append(f"category {category}")
	    if color:
	        parts.append(f"color {color}")
	    parts.append("keywords " + " ".join(keywords))
	    # Plain pipe separator: a backslash before "|" is not a valid escape
	    # sequence and would put a stray backslash into the text.
	    return " | ".join(part for part in parts if part)


	def main():
	    """Enrich the product table with brand, category, color and keyword fields.

	    Reads ``IN_PATH``, derives ``brand`` (from the title) and ``category``,
	    ``color`` and ``keywords`` (from title plus caption), builds a combined
	    ``text_for_index`` string per row, writes the result to ``OUT_PATH`` and
	    prints a short summary.

	    Raises:
	        FileNotFoundError: If ``IN_PATH`` does not exist.
	    """
	    if not IN_PATH.exists():
	        raise FileNotFoundError(IN_PATH)

	    df = pd.read_parquet(IN_PATH)

	    # Ensure caption exists
	    if "caption" not in df.columns:
	        df["caption"] = ""

	    # A missing title column is treated as all-blank titles without adding
	    # a new column to the output.
	    titles = df["title"] if "title" in df.columns else [""] * len(df)

	    brands, cats, cols, keywords, text_index = [], [], [], [], []

	    for raw_title, raw_caption in zip(titles, df["caption"]):
	        title = _cell_text(raw_title)
	        caption = _cell_text(raw_caption)
	        joined = f"{title} {caption}"

	        b = extract_brand(title)
	        c = extract_color(joined)
	        cat = guess_category(joined)
	        kw = extract_keywords(joined)

	        brands.append(b)
	        cols.append(c)
	        cats.append(cat)
	        keywords.append(kw)
	        text_index.append(_build_index_text(title, caption, b, cat, c, kw))

	    df["brand"] = brands
	    df["category"] = cats
	    df["color"] = cols
	    df["keywords"] = keywords
	    df["text_for_index"] = text_index

	    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
	    df.to_parquet(OUT_PATH, index=False)

	    print("Saved:", OUT_PATH)
	    print("Rows:", len(df))
	    print("Category distribution (top 10):")
	    print(df["category"].value_counts().head(10))
	    print("Brand nulls:", int(df["brand"].isna().sum()))
	    print("Color nulls:", int(df["color"].isna().sum()))


	# Run the enrichment only when this file is executed as a script, not when
	# it is imported.
	if __name__ == "__main__":
	    main()