# Commit 76c3397 (sushilideaclan01): Add product scraping functionality and AI concept filling
"""
Scrape product data from an Amalfa product page URL.
"""
import json
import re
from typing import Any
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
def _clean_text(s: str) -> str:
if not s:
return ""
return " ".join(s.split()).strip()
def _extract_price_from_text(text: str) -> str:
"""Find first price like Rs 1,299 or ₹1299."""
if not text:
return ""
# Rs 1,299.00 or ₹1,299 or Rs. 1299
m = re.search(r"(?:Rs\.?|₹)\s*([\d,]+(?:\.\d{2})?)", text, re.I)
if m:
return m.group(0).strip()
m = re.search(r"[\d,]+(?:\.\d{2})?", text)
if m:
return m.group(0)
return ""
def scrape_product(url: str) -> dict[str, Any]:
    """
    Fetch an Amalfa product page and extract product_name, description, price,
    offers, product_images, brand, category.

    Strategy fields (target_audience, competitors, psychological_triggers) and
    show_product are left empty for AI / user to fill downstream.

    Extraction is a best-effort cascade, each stage filling only fields still
    empty: (1) JSON-LD Product blocks, (2) og:/meta tags, (3) raw-HTML
    fallbacks (h1, price selectors, description divs, gallery <img> tags),
    then category inference from the URL path.

    Args:
        url: Absolute product-page URL (scheme + host required).

    Returns:
        dict with keys product_name, description, price, offers,
        product_images (comma-separated URLs), brand, category,
        target_audience, competitors, psychological_triggers, show_product.

    Raises:
        ValueError: if *url* lacks a scheme or host.
        requests.HTTPError: on a non-2xx response (via raise_for_status).
        requests.RequestException: on connection/timeout failures.
    """
    # Fail fast on malformed URLs before making a network call.
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        raise ValueError(f"Invalid URL: {url}")
    # Browser-like headers: many storefronts reject the default requests UA.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-IN,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()
    html = resp.text
    soup = BeautifulSoup(html, "html.parser")
    # Result skeleton. Brand is hard-coded to "Amalfa" (single-store scraper);
    # strategy fields stay empty for AI / user input.
    product: dict[str, Any] = {
        "product_name": "",
        "description": "",
        "price": "",
        "offers": "",
        "product_images": "",
        "brand": "Amalfa",
        "category": "",
        "target_audience": "",
        "competitors": "",
        "psychological_triggers": "",
        "show_product": None,
    }
    # 1. JSON-LD (Shopify and many stores)
    # NOTE(review): only a top-level dict with @type == "Product" is handled;
    # JSON-LD wrapped in a list or an @graph array is skipped — confirm this
    # matches Amalfa's markup.
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            # script.string is None for empty/fragmented tags; "{}" keeps
            # json.loads from raising TypeError on None.
            data = json.loads(script.string or "{}")
            if isinstance(data, dict) and data.get("@type") == "Product":
                product["product_name"] = _clean_text(data.get("name") or "")
                product["description"] = _clean_text(data.get("description") or "")
                # "offers" may be a single Offer object or a list of them.
                if data.get("offers") and isinstance(data["offers"], dict):
                    product["price"] = str(data["offers"].get("price", ""))
                elif isinstance(data.get("offers"), list) and data["offers"]:
                    product["price"] = str(data["offers"][0].get("price", ""))
                if data.get("image"):
                    # "image" may be a single URL or a list; normalize to list.
                    imgs = data["image"] if isinstance(data["image"], list) else [data["image"]]
                    # Collect up to 10 image URLs (product gallery)
                    product["product_images"] = ", ".join(str(u).strip() for u in imgs[:10] if u)
                # Stop at the first JSON-LD block that yielded both name and price.
                if product["product_name"] and product["price"]:
                    break
        except (json.JSONDecodeError, TypeError):
            # Malformed JSON-LD: skip this block, try the next one.
            continue
    # 2. Meta tags (og:title, og:description, og:image) — fill only what
    # JSON-LD left empty.
    if not product["product_name"]:
        meta = soup.find("meta", property="og:title")
        if meta and meta.get("content"):
            # og:title often appends the store name after "|"; keep the
            # product-name part only.
            product["product_name"] = _clean_text(meta["content"].split("|")[0].strip())
    if not product["description"]:
        meta = soup.find("meta", property="og:description") or soup.find("meta", attrs={"name": "description"})
        if meta and meta.get("content"):
            product["description"] = _clean_text(meta["content"])
    if not product["product_images"]:
        meta = soup.find("meta", property="og:image")
        if meta and meta.get("content"):
            product["product_images"] = meta["content"].strip()
    # 3. Fallback: H1, price in body, description section
    if not product["product_name"]:
        h1 = soup.find("h1")
        if h1:
            product["product_name"] = _clean_text(h1.get_text())
    if not product["price"]:
        # Common Shopify / Amalfa price classes
        for sel in ["[class*='price']", ".product__price", "[data-product-price]", ".price-item"]:
            el = soup.select_one(sel)
            if el:
                product["price"] = _extract_price_from_text(el.get_text())
                if product["price"]:
                    break
        # Last resort: scan the whole page text for anything price-shaped.
        if not product["price"]:
            product["price"] = _extract_price_from_text(soup.get_text())
    if not product["description"]:
        desc_el = (
            soup.find("div", class_=re.compile(r"description|product-description|product__description", re.I))
            or soup.find("meta", attrs={"name": "description"})
        )
        if desc_el:
            # desc_el is either a Tag (use its text) or a <meta> (use content=).
            product["description"] = _clean_text(desc_el.get_text() if hasattr(desc_el, "get_text") else (desc_el.get("content") or ""))
    if not product["product_images"]:
        # Product gallery images: collect up to 10 URLs (no break after first)
        seen = set()
        for img in soup.select("img[src*='cdn.shopify'], img[data-src*='shopify'], img[src*='amalfa']")[:20]:
            if len(seen) >= 10:
                break
            # Prefer lazy-load data-src; drop ?query (Shopify size params)
            # so dedup works on the base URL.
            src = (img.get("data-src") or img.get("src") or "").split("?")[0].strip()
            if src and src.startswith("http") and src not in seen:
                seen.add(src)
                # .strip(", ") trims the leading ", " produced on the first
                # append; URLs never end in ',' or ' ' so nothing real is lost.
                product["product_images"] = (product["product_images"] + ", " + src).strip(", ")
    # Infer category from URL path (e.g. /collections/earrings/...) or leave for AI
    # NOTE: "ring" is checked after "earring" on purpose — "earring" contains
    # "ring", so order matters in this elif chain.
    path = (parsed.path or "").lower()
    if "earring" in path:
        product["category"] = product["category"] or "Earrings"
    elif "necklace" in path or "pendant" in path or "choker" in path:
        product["category"] = product["category"] or "Necklaces"
    elif "ring" in path:
        product["category"] = product["category"] or "Rings"
    elif "bracelet" in path or "bangle" in path:
        product["category"] = product["category"] or "Bracelets"
    elif "anklet" in path:
        product["category"] = product["category"] or "Anklets"
    # Generic default so downstream consumers always get a category.
    if not product["category"]:
        product["category"] = "Jewellery"
    return product