"""Scrape product data from an Amalfa product page URL."""

import json
import re
from typing import Any
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

# Browser-like headers sent with the page request.
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-IN,en;q=0.9",
}
_REQUEST_TIMEOUT = 15  # seconds
_MAX_IMAGES = 10  # cap on the number of image URLs returned
_MAX_IMG_TAGS_SCANNED = 20  # cap on <img> tags inspected in the gallery fallback

# Both patterns require at least one digit, so punctuation alone ("," or
# "rs,") can never be reported as a price.  Commas are only accepted between
# digit groups, which also keeps a trailing comma out of the match.
_NUMBER = r"\d+(?:,\d+)*(?:\.\d{2})?"
_CURRENCY_PRICE_RE = re.compile(r"(?:Rs\.?|₹)\s*(" + _NUMBER + r")", re.I)
_BARE_NUMBER_RE = re.compile(_NUMBER)

# CSS selectors tried in order when looking for a price element.
_PRICE_SELECTORS = (
    "[class*='price']",
    ".product__price",
    "[data-product-price]",
    ".price-item",
)
_IMAGE_SELECTOR = "img[src*='cdn.shopify'], img[data-src*='shopify'], img[src*='amalfa']"
_DESCRIPTION_CLASS_RE = re.compile(
    r"description|product-description|product__description", re.I
)

# Ordered (category, pattern) pairs; the first pattern found in the URL path
# wins.  "ring" must not be preceded by a letter so that slugs containing
# "string", "spring" or "offering" are not classified as rings.
_CATEGORY_PATTERNS = (
    ("Earrings", re.compile(r"earring")),
    ("Necklaces", re.compile(r"necklace|pendant|choker")),
    ("Rings", re.compile(r"(?<![a-z])ring")),
    ("Bracelets", re.compile(r"bracelet|bangle")),
    ("Anklets", re.compile(r"anklet")),
)
_DEFAULT_CATEGORY = "Jewellery"


def _clean_text(s: str) -> str:
    """Collapse every run of whitespace in *s* to a single space.

    Leading and trailing whitespace is removed; falsy input yields "".
    """
    if not s:
        return ""
    return " ".join(s.split())


def _as_text(value: Any) -> str:
    """Return *value* unchanged when it is a string, otherwise ""."""
    return value if isinstance(value, str) else ""


def _extract_price_from_text(text: str) -> str:
    """Find the first price in *text*, e.g. ``Rs 1,299`` or ``₹1299``.

    A number prefixed by ``Rs``/``Rs.``/``₹`` is preferred and returned with
    its prefix.  Failing that, the first bare number is returned.  Returns ""
    when *text* contains no digits.
    """
    if not text:
        return ""
    match = _CURRENCY_PRICE_RE.search(text)
    if match is None:
        match = _BARE_NUMBER_RE.search(text)
    return match.group(0) if match else ""


def _is_jsonld_product(node: dict) -> bool:
    """Tell whether a JSON-LD node declares ``@type`` Product."""
    node_type = node.get("@type")
    if isinstance(node_type, list):  # "@type" may list several types
        return "Product" in node_type
    return node_type == "Product"


def _jsonld_products(data: Any) -> list:
    """Collect Product nodes from a parsed JSON-LD document.

    Handles a single top-level object, a top-level array, and objects that
    wrap their nodes in an ``@graph`` array.
    """
    top_level = data if isinstance(data, list) else [data]
    products = []
    for node in top_level:
        if not isinstance(node, dict):
            continue
        graph = node.get("@graph")
        candidates = [node] + (graph if isinstance(graph, list) else [])
        products.extend(
            c for c in candidates if isinstance(c, dict) and _is_jsonld_product(c)
        )
    return products


def _jsonld_price(offers: Any) -> str:
    """Return the price of the first offer that carries one, else "".

    *offers* may be a single offer object or a list of them; entries that are
    not objects, or whose price is missing/null/blank, are skipped.
    """
    candidates = offers if isinstance(offers, list) else [offers]
    for offer in candidates:
        if not isinstance(offer, dict):
            continue
        price = offer.get("price")
        if price is None:
            continue
        price_text = str(price).strip()
        if price_text:
            return price_text
    return ""


def _jsonld_images(image: Any) -> list:
    """Return up to ``_MAX_IMAGES`` image URLs from a JSON-LD ``image`` value.

    The value may be a URL string, an image object with a ``url`` or
    ``contentUrl`` key, or a list mixing both.
    """
    entries = image if isinstance(image, list) else [image]
    urls = []
    for entry in entries:
        if isinstance(entry, dict):
            entry = entry.get("url") or entry.get("contentUrl")
        candidate = _as_text(entry).strip()
        if candidate:
            urls.append(candidate)
        if len(urls) >= _MAX_IMAGES:
            break
    return urls


def _apply_jsonld(soup: BeautifulSoup, product: dict) -> None:
    """Fill *product* from ``application/ld+json`` Product nodes (step 1).

    Later nodes may overwrite earlier values, but only with non-empty data.
    Scanning stops as soon as both a name and a price are known.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "{}")
        except (json.JSONDecodeError, TypeError, RecursionError):
            continue  # malformed block: best effort, try the next one
        for node in _jsonld_products(data):
            name = _clean_text(_as_text(node.get("name")))
            description = _clean_text(_as_text(node.get("description")))
            price = _jsonld_price(node.get("offers"))
            images = _jsonld_images(node.get("image"))
            if name:
                product["product_name"] = name
            if description:
                product["description"] = description
            if price:
                product["price"] = price
            if images:
                product["product_images"] = ", ".join(images)
            if product["product_name"] and product["price"]:
                return


def _meta_content(soup: BeautifulSoup, attrs: dict) -> str:
    """Return the stripped ``content`` of the first matching <meta>, or ""."""
    tag = soup.find("meta", attrs=attrs)
    if tag is None:
        return ""
    return _as_text(tag.get("content")).strip()


def _apply_meta_tags(soup: BeautifulSoup, product: dict) -> None:
    """Fill still-empty fields of *product* from meta tags (step 2)."""
    if not product["product_name"]:
        title = _meta_content(soup, {"property": "og:title"})
        # Titles commonly look like "Product name | Store"; keep the first part.
        product["product_name"] = _clean_text(title.split("|")[0])
    if not product["description"]:
        description = _meta_content(
            soup, {"property": "og:description"}
        ) or _meta_content(soup, {"name": "description"})
        product["description"] = _clean_text(description)
    if not product["product_images"]:
        product["product_images"] = _meta_content(soup, {"property": "og:image"})


def _price_from_dom(soup: BeautifulSoup) -> str:
    """Look for a price in known price elements, then in the whole page text."""
    for selector in _PRICE_SELECTORS:
        element = soup.select_one(selector)
        if element is None:
            continue
        price = _extract_price_from_text(element.get_text())
        if price:
            return price
    return _extract_price_from_text(soup.get_text())


def _gallery_images(soup: BeautifulSoup, scheme: str) -> list:
    """Return up to ``_MAX_IMAGES`` unique absolute gallery image URLs.

    Query strings are dropped.  Protocol-relative sources ("//host/path") are
    completed with *scheme*, the scheme of the page being scraped.
    """
    urls = []
    for img in soup.select(_IMAGE_SELECTOR)[:_MAX_IMG_TAGS_SCANNED]:
        if len(urls) >= _MAX_IMAGES:
            break
        src = _as_text(img.get("data-src") or img.get("src"))
        src = src.split("?")[0].strip()
        if src.startswith("//"):
            src = f"{scheme}:{src}"
        if src.startswith("http") and src not in urls:
            urls.append(src)
    return urls


def _apply_dom_fallbacks(soup: BeautifulSoup, product: dict, scheme: str) -> None:
    """Fill still-empty fields of *product* from the page body (step 3)."""
    if not product["product_name"]:
        heading = soup.find("h1")
        if heading is not None:
            product["product_name"] = _clean_text(heading.get_text())
    if not product["price"]:
        product["price"] = _price_from_dom(soup)
    if not product["description"]:
        block = soup.find("div", class_=_DESCRIPTION_CLASS_RE)
        if block is not None:
            product["description"] = _clean_text(block.get_text())
    if not product["product_images"]:
        product["product_images"] = ", ".join(_gallery_images(soup, scheme))


def _infer_category(path: str) -> str:
    """Guess the category from a URL path such as ``/collections/earrings/...``.

    Returns ``"Jewellery"`` when no known keyword appears in the path.
    """
    lowered = (path or "").lower()
    for category, pattern in _CATEGORY_PATTERNS:
        if pattern.search(lowered):
            return category
    return _DEFAULT_CATEGORY


def scrape_product(url: str) -> dict[str, Any]:
    """Fetch an Amalfa product page and extract its product data.

    Fields are filled from the most structured source first: JSON-LD Product
    data, then Open Graph / meta tags, then the visible page (first <h1>,
    price elements, description block, gallery images).  A later source only
    fills fields that are still empty.

    Args:
        url: Absolute URL of the product page (scheme and host required).

    Returns:
        A dict with the string fields ``product_name``, ``description``,
        ``price``, ``offers``, ``product_images`` (comma-separated URLs, at
        most 10), ``brand`` (always "Amalfa") and ``category`` (inferred from
        the URL path, "Jewellery" by default).  The strategy fields
        ``target_audience``, ``competitors`` and ``psychological_triggers``
        are left as "" and ``show_product`` as None, for AI / user input.

    Raises:
        ValueError: If *url* has no scheme or no host.
        requests.RequestException: If the request fails; this includes
            ``requests.HTTPError`` for a non-success status code.
    """
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        raise ValueError(f"Invalid URL: {url}")

    resp = requests.get(url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    product: dict[str, Any] = {
        "product_name": "",
        "description": "",
        "price": "",
        "offers": "",
        "product_images": "",
        "brand": "Amalfa",
        "category": "",
        "target_audience": "",
        "competitors": "",
        "psychological_triggers": "",
        "show_product": None,
    }

    _apply_jsonld(soup, product)  # 1. JSON-LD (Shopify and many stores)
    _apply_meta_tags(soup, product)  # 2. og:title / og:description / og:image
    _apply_dom_fallbacks(soup, product, parsed.scheme)  # 3. page body
    product["category"] = _infer_category(parsed.path)
    return product