| """ |
| Scrape product data from an Amalfa product page URL. |
| """ |
| import json |
| import re |
| from typing import Any |
| from urllib.parse import urlparse |
|
|
| import requests |
| from bs4 import BeautifulSoup |
|
|
|
|
| def _clean_text(s: str) -> str: |
| if not s: |
| return "" |
| return " ".join(s.split()).strip() |
|
|
|
|
| def _extract_price_from_text(text: str) -> str: |
| """Find first price like Rs 1,299 or ₹1299.""" |
| if not text: |
| return "" |
| |
| m = re.search(r"(?:Rs\.?|₹)\s*([\d,]+(?:\.\d{2})?)", text, re.I) |
| if m: |
| return m.group(0).strip() |
| m = re.search(r"[\d,]+(?:\.\d{2})?", text) |
| if m: |
| return m.group(0) |
| return "" |
|
|
|
|
def scrape_product(url: str) -> dict[str, Any]:
    """
    Fetch an Amalfa product page and extract structured product data.

    Returns a dict with keys: product_name, description, price, offers,
    product_images (comma-separated URLs, max 10), brand, category, plus
    strategy fields (target_audience, competitors, psychological_triggers)
    and show_product, which are intentionally left empty/None for AI or
    the user to fill in later.

    Extraction cascade, most reliable first:
      1. JSON-LD <script type="application/ld+json"> Product nodes
      2. OpenGraph / meta tags
      3. Visible-markup heuristics (h1, price-ish CSS classes, shopify imgs)
      4. URL path keywords for the category

    Raises:
        ValueError: if *url* is missing a scheme or host.
        requests.RequestException (incl. HTTPError): on network failures
            or a non-2xx response.
    """
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        raise ValueError(f"Invalid URL: {url}")

    # Browser-like headers: some storefronts reject the default requests UA.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-IN,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    product: dict[str, Any] = {
        "product_name": "",
        "description": "",
        "price": "",
        "offers": "",
        "product_images": "",
        "brand": "Amalfa",
        "category": "",
        "target_audience": "",
        "competitors": "",
        "psychological_triggers": "",
        "show_product": None,
    }

    # 1) JSON-LD. A script tag may hold a single object, a top-level list,
    #    or an @graph wrapper -- accept all three (a dict-only check misses
    #    the list/@graph payloads that Shopify themes commonly emit).
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "{}")
        except (json.JSONDecodeError, TypeError):
            continue  # malformed or empty JSON-LD block; try the next one
        if isinstance(data, dict) and isinstance(data.get("@graph"), list):
            nodes = data["@graph"]
        elif isinstance(data, list):
            nodes = data
        else:
            nodes = [data]
        for node in nodes:
            if not isinstance(node, dict) or node.get("@type") != "Product":
                continue
            product["product_name"] = _clean_text(node.get("name") or "")
            product["description"] = _clean_text(node.get("description") or "")
            offers = node.get("offers")
            if isinstance(offers, dict):
                product["price"] = str(offers.get("price", ""))
            elif isinstance(offers, list) and offers and isinstance(offers[0], dict):
                product["price"] = str(offers[0].get("price", ""))
            if node.get("image"):
                imgs = node["image"] if isinstance(node["image"], list) else [node["image"]]
                # Cap at 10 URLs to keep the field a manageable size.
                product["product_images"] = ", ".join(str(u).strip() for u in imgs[:10] if u)
            if product["product_name"] and product["price"]:
                break
        if product["product_name"] and product["price"]:
            break  # got the essentials; skip remaining scripts

    # 2) OpenGraph / meta-tag fallbacks for whatever JSON-LD didn't supply.
    if not product["product_name"]:
        meta = soup.find("meta", property="og:title")
        if meta and meta.get("content"):
            # og:title often looks like "Product Name | Store" -- keep the
            # part before the first pipe.
            product["product_name"] = _clean_text(meta["content"].split("|")[0].strip())
    if not product["description"]:
        meta = soup.find("meta", property="og:description") or soup.find("meta", attrs={"name": "description"})
        if meta and meta.get("content"):
            product["description"] = _clean_text(meta["content"])
    if not product["product_images"]:
        meta = soup.find("meta", property="og:image")
        if meta and meta.get("content"):
            product["product_images"] = meta["content"].strip()

    # 3) Visible-markup heuristics.
    if not product["product_name"]:
        h1 = soup.find("h1")
        if h1:
            product["product_name"] = _clean_text(h1.get_text())

    if not product["price"]:
        # Try progressively broader price-ish selectors; first hit wins.
        for sel in ["[class*='price']", ".product__price", "[data-product-price]", ".price-item"]:
            el = soup.select_one(sel)
            if el:
                product["price"] = _extract_price_from_text(el.get_text())
                if product["price"]:
                    break
        if not product["price"]:
            # Last resort: scan the whole page text for anything price-like.
            product["price"] = _extract_price_from_text(soup.get_text())

    if not product["description"]:
        desc_el = (
            soup.find("div", class_=re.compile(r"description|product-description|product__description", re.I))
            or soup.find("meta", attrs={"name": "description"})
        )
        if desc_el:
            # desc_el is either a tag (use its text) or a <meta> (use content).
            product["description"] = _clean_text(desc_el.get_text() if hasattr(desc_el, "get_text") else (desc_el.get("content") or ""))

    if not product["product_images"]:
        # Collect up to 10 unique, absolute image URLs (query strings dropped
        # so size variants of the same image dedupe to one entry).
        seen = set()
        for img in soup.select("img[src*='cdn.shopify'], img[data-src*='shopify'], img[src*='amalfa']")[:20]:
            if len(seen) >= 10:
                break
            src = (img.get("data-src") or img.get("src") or "").split("?")[0].strip()
            if src and src.startswith("http") and src not in seen:
                seen.add(src)
                product["product_images"] = (product["product_images"] + ", " + src).strip(", ")

    # 4) Category from URL path keywords. Order matters: "earring" must be
    #    tested before "ring", which would otherwise match it as a substring.
    path = (parsed.path or "").lower()
    if "earring" in path:
        product["category"] = product["category"] or "Earrings"
    elif "necklace" in path or "pendant" in path or "choker" in path:
        product["category"] = product["category"] or "Necklaces"
    elif "ring" in path:
        product["category"] = product["category"] or "Rings"
    elif "bracelet" in path or "bangle" in path:
        product["category"] = product["category"] or "Bracelets"
    elif "anklet" in path:
        product["category"] = product["category"] or "Anklets"

    if not product["category"]:
        product["category"] = "Jewellery"

    return product
|
|