Spaces:

userIdc2024
/

ProductShowcaseStudio

Running

ProductShowcaseStudio / backend /api /scraper.py

2f9de1b 2 months ago

7.24 kB

	"""
	Scrape product data from an Amalfa product page URL.
	"""

	import json
	import re
	from typing import Any
	from urllib.parse import urlparse

	import requests
	from bs4 import BeautifulSoup


	def _clean_text(s: str) -> str:
	if not s:
	return ""
	return " ".join(s.split()).strip()


	def _extract_price_from_text(text: str) -> str:
	"""Find first price like Rs 1,299 or ₹1299."""
	if not text:
	return ""
	m = re.search(r"(?:Rs\.?\|₹)\s*([\d,]+(?:\.\d{2})?)", text, re.I)
	if m:
	return m.group(0).strip()
	m = re.search(r"[\d,]+(?:\.\d{2})?", text)
	if m:
	return m.group(0)
	return ""


	def scrape_product(url: str) -> dict[str, Any]:
	    """
	    Fetch a product page and extract the fields that can be scraped from it.

	    Sources are consulted in this order, each one only filling fields that
	    are still empty: JSON-LD ``Product`` data, Open Graph / ``<meta>`` tags,
	    the first ``<h1>``, price-like DOM elements and, for the price only, the
	    whole page text. If the URL path contains ``products/<handle>``, the
	    site's ``/products/<handle>.json`` endpoint is queried as well; when it
	    yields images they replace whatever the page itself provided.

	    Args:
	        url: Absolute product page URL (scheme and host are required).

	    Returns:
	        A dict whose ``product_name``, ``description``, ``price``,
	        ``product_images`` (up to 9 URLs joined by ``", "``) and ``category``
	        (guessed from the URL path, default ``"Jewellery"``) are filled where
	        possible. ``offers``, ``brand``, ``target_audience``, ``competitors``
	        and ``psychological_triggers`` are always returned as ``""`` and
	        ``show_product`` as ``None``; this function never populates them.

	    Raises:
	        ValueError: If *url* has no scheme or no host.
	        requests.RequestException: If the page request fails or the server
	            answers with an HTTP error status.
	    """
	    parsed = urlparse(url)
	    if not parsed.scheme or not parsed.netloc:
	        raise ValueError(f"Invalid URL: {url}")

	    headers = {
	        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
	        # "*/*" is the catch-all media range; a bare "/" is not a valid one.
	        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	        "Accept-Language": "en-IN,en;q=0.9",
	    }
	    resp = requests.get(url, headers=headers, timeout=15)
	    resp.raise_for_status()
	    soup = BeautifulSoup(resp.text, "html.parser")

	    product: dict[str, Any] = {
	        "product_name": "",
	        "description": "",
	        "price": "",
	        "offers": "",
	        "product_images": "",
	        "brand": "",
	        "category": "",
	        "target_audience": "",
	        "competitors": "",
	        "psychological_triggers": "",
	        "show_product": None,
	    }

	    _fill_from_json_ld(soup, product)
	    _fill_from_meta_tags(soup, product)
	    _fill_from_dom(soup, product)

	    # The product JSON carries the full image list, so it wins over the page.
	    json_images = _fetch_product_json_images(
	        f"{parsed.scheme}://{parsed.netloc}", parsed.path or "", headers
	    )
	    if json_images:
	        product["product_images"] = ", ".join(json_images)

	    product["category"] = _category_from_path(parsed.path or "") or "Jewellery"

	    # Log scraped data for verification (especially product images).
	    logged_images = [u.strip() for u in (product.get("product_images") or "").split(",") if u.strip()]
	    print(
	        "[scraper] product_name=%r category=%r | product_images count=%d | urls=%s"
	        % (product.get("product_name"), product.get("category"), len(logged_images), logged_images)
	    )

	    return product


	# Upper bound on image URLs kept from any single source.
	_MAX_PRODUCT_IMAGES = 9


	def _json_ld_products(payload: Any) -> list[dict[str, Any]]:
	    """Return the ``Product`` nodes of a parsed JSON-LD payload (dict, ``@graph`` or list)."""
	    if isinstance(payload, dict):
	        graph = payload.get("@graph")
	        nodes = [payload] + (graph if isinstance(graph, list) else [])
	    elif isinstance(payload, list):
	        nodes = payload
	    else:
	        return []
	    found = []
	    for node in nodes:
	        if not isinstance(node, dict):
	            continue
	        node_type = node.get("@type")
	        # "@type" may be a single name or a list of names.
	        type_names = node_type if isinstance(node_type, list) else [node_type]
	        if "Product" in type_names:
	            found.append(node)
	    return found


	def _json_ld_text(node: dict[str, Any], key: str) -> str:
	    """Return ``node[key]`` whitespace-normalised, or "" when it is missing or not a string."""
	    value = node.get(key)
	    return _clean_text(value) if isinstance(value, str) else ""


	def _json_ld_price(offers: Any) -> str:
	    """Return the ``price`` of a JSON-LD ``offers`` value (dict, or first item of a list), else ""."""
	    if isinstance(offers, list):
	        offers = offers[0] if offers else None
	    if not isinstance(offers, dict):
	        return ""
	    price = offers.get("price")
	    # A JSON null must not turn into the truthy string "None".
	    return "" if price is None else str(price).strip()


	def _json_ld_images(image: Any) -> list[str]:
	    """Return image URLs from a JSON-LD ``image`` value: a string, a dict with a URL, or a list of those."""
	    candidates = image if isinstance(image, list) else [image]
	    urls = []
	    for candidate in candidates:
	        if isinstance(candidate, dict):
	            candidate = candidate.get("url") or candidate.get("contentUrl")
	        if isinstance(candidate, str) and candidate.strip():
	            urls.append(candidate.strip())
	    return urls[:_MAX_PRODUCT_IMAGES]


	def _fill_from_json_ld(soup: "BeautifulSoup", product: dict[str, Any]) -> None:
	    """Copy name, description, price and images from JSON-LD ``Product`` data into *product*."""
	    for script in soup.find_all("script", type="application/ld+json"):
	        try:
	            payload = json.loads(script.string or "{}")
	        except (json.JSONDecodeError, TypeError):
	            continue  # unparsable block; a later one may still be usable
	        for node in _json_ld_products(payload):
	            # Only non-empty values are written, so a sparse node cannot
	            # blank out data collected from an earlier one.
	            name = _json_ld_text(node, "name")
	            if name:
	                product["product_name"] = name
	            description = _json_ld_text(node, "description")
	            if description:
	                product["description"] = description
	            price = _json_ld_price(node.get("offers"))
	            if price:
	                product["price"] = price
	            images = _json_ld_images(node.get("image"))
	            if images:
	                product["product_images"] = ", ".join(images)
	            if product["product_name"] and product["price"]:
	                return


	def _meta_content(soup: "BeautifulSoup", attr: str, value: str) -> str:
	    """Return the stripped ``content`` of the first ``<meta attr="value">`` tag, or ""."""
	    tag = soup.find("meta", attrs={attr: value})
	    content = tag.get("content") if tag is not None else None
	    return content.strip() if isinstance(content, str) else ""


	def _fill_from_meta_tags(soup: "BeautifulSoup", product: dict[str, Any]) -> None:
	    """Fill a still-empty name, description and image from Open Graph / ``<meta>`` tags."""
	    if not product["product_name"]:
	        # Keep only the text before the first "|" (presumably a site-name suffix).
	        title = _meta_content(soup, "property", "og:title")
	        product["product_name"] = _clean_text(title.split("|")[0])
	    if not product["description"]:
	        # Try each tag in turn so an empty og:description does not hide a
	        # usable <meta name="description">.
	        for attr, value in (("property", "og:description"), ("name", "description")):
	            description = _clean_text(_meta_content(soup, attr, value))
	            if description:
	                product["description"] = description
	                break
	    if not product["product_images"]:
	        product["product_images"] = _meta_content(soup, "property", "og:image")


	def _fill_from_dom(soup: "BeautifulSoup", product: dict[str, Any]) -> None:
	    """Fill a still-empty name, price and description from visible page elements."""
	    if not product["product_name"]:
	        heading = soup.find("h1")
	        if heading is not None:
	            product["product_name"] = _clean_text(heading.get_text())

	    if not product["price"]:
	        for selector in ("[class*='price']", ".product__price", "[data-product-price]", ".price-item"):
	            element = soup.select_one(selector)
	            if element is None:
	                continue
	            product["price"] = _extract_price_from_text(element.get_text())
	            if product["price"]:
	                break
	        if not product["price"]:
	            # Last resort: first price-like token anywhere in the page text.
	            product["price"] = _extract_price_from_text(soup.get_text())

	    if not product["description"]:
	        description_block = soup.find(
	            "div",
	            class_=re.compile(r"description|product-description|product__description", re.I),
	        )
	        if description_block is not None:
	            product["description"] = _clean_text(description_block.get_text())


	def _fetch_product_json_images(origin: str, path: str, headers: dict[str, str]) -> list[str]:
	    """
	    Return image URLs from ``<origin>/products/<handle>.json``, or [] if unavailable.

	    The handle is the path segment following ``products``. This is a
	    best-effort lookup: request or decoding failures are logged and yield []
	    instead of raising.
	    """
	    segments = [segment for segment in path.split("/") if segment]
	    if "products" not in segments:
	        return []
	    handle_index = segments.index("products") + 1
	    if handle_index >= len(segments):
	        return []
	    json_url = f"{origin}/products/{segments[handle_index]}.json"

	    try:
	        response = requests.get(json_url, headers={**headers, "Accept": "application/json"}, timeout=10)
	        if not response.ok:
	            return []
	        payload = response.json()
	    except (requests.RequestException, ValueError) as exc:
	        print("[scraper] product JSON unavailable (%s): %s" % (json_url, exc))
	        return []

	    if not isinstance(payload, dict):
	        return []
	    # The product object is either the root or wrapped as {"product": {...}}.
	    wrapped = payload.get("product")
	    details = wrapped if isinstance(wrapped, dict) else payload
	    images = details.get("images")
	    if not isinstance(images, list):
	        return []

	    urls: list[str] = []
	    for image in images[:_MAX_PRODUCT_IMAGES]:
	        src = image.get("src") if isinstance(image, dict) else image
	        if not isinstance(src, str):
	            continue
	        src = src.strip()
	        if src.startswith("//"):
	            src = "https:" + src  # protocol-relative URL
	        if src.startswith("http") and src not in urls:
	            urls.append(src)
	    return urls


	def _category_from_path(path: str) -> str:
	    """Return the category whose keyword occurs in the URL *path*, or "" if none does."""
	    lowered = path.lower()
	    # Order matters: "earring" contains "ring", so it has to be tested first.
	    keyword_rules = (
	        (("earring",), "Earrings"),
	        (("necklace", "pendant", "choker"), "Necklaces"),
	        (("ring",), "Rings"),
	        (("bracelet", "bangle"), "Bracelets"),
	        (("anklet",), "Anklets"),
	    )
	    for keywords, category in keyword_rules:
	        if any(keyword in lowered for keyword in keywords):
	            return category
	    return ""