# Commit 76c3397 (sushilideaclan01): Add product scraping functionality and AI concept filling
"""
Scrape product data from an Amalfa product page URL.
"""
import json
import re
from typing import Any
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
def _clean_text(s: str) -> str:
if not s:
return ""
return " ".join(s.split()).strip()
def _extract_price_from_text(text: str) -> str:
"""Find first price like Rs 1,299 or ₹1299."""
if not text:
return ""
# Rs 1,299.00 or ₹1,299 or Rs. 1299
m = re.search(r"(?:Rs\.?|₹)\s*([\d,]+(?:\.\d{2})?)", text, re.I)
if m:
return m.group(0).strip()
m = re.search(r"[\d,]+(?:\.\d{2})?", text)
if m:
return m.group(0)
return ""
def scrape_product(url: str) -> dict[str, Any]:
    """
    Fetch an Amalfa product page and extract product_name, description, price,
    offers, product_images, brand, category.

    Strategy fields (target_audience, competitors, psychological_triggers) and
    show_product are left empty for AI / user to fill downstream.

    Extraction is a best-effort cascade, each stage filling only fields still
    empty: (1) JSON-LD Product blocks, (2) og:/meta tags, (3) raw-HTML
    fallbacks (h1, price selectors, description divs, gallery <img> tags),
    then category inference from the URL path.

    Args:
        url: Absolute product-page URL (scheme + host required).

    Returns:
        dict with keys product_name, description, price, offers,
        product_images (comma-separated URLs), brand, category,
        target_audience, competitors, psychological_triggers, show_product.

    Raises:
        ValueError: if *url* lacks a scheme or host.
        requests.HTTPError: on a non-2xx response (via raise_for_status).
        requests.RequestException: on connection/timeout failures.
    """
    # Fail fast on malformed URLs before making a network call.
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        raise ValueError(f"Invalid URL: {url}")
    # Browser-like headers: many storefronts reject the default requests UA.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-IN,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()
    html = resp.text
    soup = BeautifulSoup(html, "html.parser")
    # Result skeleton. Brand is hard-coded to "Amalfa" (single-store scraper);
    # strategy fields stay empty for AI / user input.
    product: dict[str, Any] = {
        "product_name": "",
        "description": "",
        "price": "",
        "offers": "",
        "product_images": "",
        "brand": "Amalfa",
        "category": "",
        "target_audience": "",
        "competitors": "",
        "psychological_triggers": "",
        "show_product": None,
    }
    # 1. JSON-LD (Shopify and many stores)
    # NOTE(review): only a top-level dict with @type == "Product" is handled;
    # JSON-LD wrapped in a list or an @graph array is skipped — confirm this
    # matches Amalfa's markup.
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            # script.string is None for empty/fragmented tags; "{}" keeps
            # json.loads from raising TypeError on None.
            data = json.loads(script.string or "{}")
            if isinstance(data, dict) and data.get("@type") == "Product":
                product["product_name"] = _clean_text(data.get("name") or "")
                product["description"] = _clean_text(data.get("description") or "")
                # "offers" may be a single Offer object or a list of them.
                if data.get("offers") and isinstance(data["offers"], dict):
                    product["price"] = str(data["offers"].get("price", ""))
                elif isinstance(data.get("offers"), list) and data["offers"]:
                    product["price"] = str(data["offers"][0].get("price", ""))
                if data.get("image"):
                    # "image" may be a single URL or a list; normalize to list.
                    imgs = data["image"] if isinstance(data["image"], list) else [data["image"]]
                    # Collect up to 10 image URLs (product gallery)
                    product["product_images"] = ", ".join(str(u).strip() for u in imgs[:10] if u)
                # Stop at the first JSON-LD block that yielded both name and price.
                if product["product_name"] and product["price"]:
                    break
        except (json.JSONDecodeError, TypeError):
            # Malformed JSON-LD: skip this block, try the next one.
            continue
    # 2. Meta tags (og:title, og:description, og:image) — fill only what
    # JSON-LD left empty.
    if not product["product_name"]:
        meta = soup.find("meta", property="og:title")
        if meta and meta.get("content"):
            # og:title often appends the store name after "|"; keep the
            # product-name part only.
            product["product_name"] = _clean_text(meta["content"].split("|")[0].strip())
    if not product["description"]:
        meta = soup.find("meta", property="og:description") or soup.find("meta", attrs={"name": "description"})
        if meta and meta.get("content"):
            product["description"] = _clean_text(meta["content"])
    if not product["product_images"]:
        meta = soup.find("meta", property="og:image")
        if meta and meta.get("content"):
            product["product_images"] = meta["content"].strip()
    # 3. Fallback: H1, price in body, description section
    if not product["product_name"]:
        h1 = soup.find("h1")
        if h1:
            product["product_name"] = _clean_text(h1.get_text())
    if not product["price"]:
        # Common Shopify / Amalfa price classes
        for sel in ["[class*='price']", ".product__price", "[data-product-price]", ".price-item"]:
            el = soup.select_one(sel)
            if el:
                product["price"] = _extract_price_from_text(el.get_text())
                if product["price"]:
                    break
        # Last resort: scan the whole page text for anything price-shaped.
        if not product["price"]:
            product["price"] = _extract_price_from_text(soup.get_text())
    if not product["description"]:
        desc_el = (
            soup.find("div", class_=re.compile(r"description|product-description|product__description", re.I))
            or soup.find("meta", attrs={"name": "description"})
        )
        if desc_el:
            # desc_el is either a Tag (use its text) or a <meta> (use content=).
            product["description"] = _clean_text(desc_el.get_text() if hasattr(desc_el, "get_text") else (desc_el.get("content") or ""))
    if not product["product_images"]:
        # Product gallery images: collect up to 10 URLs (no break after first)
        seen = set()
        for img in soup.select("img[src*='cdn.shopify'], img[data-src*='shopify'], img[src*='amalfa']")[:20]:
            if len(seen) >= 10:
                break
            # Prefer lazy-load data-src; drop ?query (Shopify size params)
            # so dedup works on the base URL.
            src = (img.get("data-src") or img.get("src") or "").split("?")[0].strip()
            if src and src.startswith("http") and src not in seen:
                seen.add(src)
                # .strip(", ") trims the leading ", " produced on the first
                # append; URLs never end in ',' or ' ' so nothing real is lost.
                product["product_images"] = (product["product_images"] + ", " + src).strip(", ")
    # Infer category from URL path (e.g. /collections/earrings/...) or leave for AI
    # NOTE: "ring" is checked after "earring" on purpose — "earring" contains
    # "ring", so order matters in this elif chain.
    path = (parsed.path or "").lower()
    if "earring" in path:
        product["category"] = product["category"] or "Earrings"
    elif "necklace" in path or "pendant" in path or "choker" in path:
        product["category"] = product["category"] or "Necklaces"
    elif "ring" in path:
        product["category"] = product["category"] or "Rings"
    elif "bracelet" in path or "bangle" in path:
        product["category"] = product["category"] or "Bracelets"
    elif "anklet" in path:
        product["category"] = product["category"] or "Anklets"
    # Generic default so downstream consumers always get a category.
    if not product["category"]:
        product["category"] = "Jewellery"
    return product