sushilideaclan01's picture
.
2f9de1b
Raw
History Blame Contribute Delete
7.24 kB
"""
Scrape product data from an Amalfa product page URL.
"""
import json
import re
from typing import Any
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
def _clean_text(s: str) -> str:
if not s:
return ""
return " ".join(s.split()).strip()
def _extract_price_from_text(text: str) -> str:
"""Find first price like Rs 1,299 or ₹1299."""
if not text:
return ""
m = re.search(r"(?:Rs\.?|₹)\s*([\d,]+(?:\.\d{2})?)", text, re.I)
if m:
return m.group(0).strip()
m = re.search(r"[\d,]+(?:\.\d{2})?", text)
if m:
return m.group(0)
return ""
_MAX_IMAGES = 9  # cap on how many image URLs are kept per product

# (category label, URL keywords), checked in order. "earring" must be tested
# before "ring" because the latter is a substring of the former.
_CATEGORY_KEYWORDS = (
    ("Earrings", ("earring",)),
    ("Necklaces", ("necklace", "pendant", "choker")),
    ("Rings", ("ring",)),
    ("Bracelets", ("bracelet", "bangle")),
    ("Anklets", ("anklet",)),
)


def _image_url(value: Any) -> str:
    """Return an image reference as a URL string (``""`` if none can be derived).

    Accepts a plain string, or a dict carrying the URL under ``url``,
    ``contentUrl`` or ``src``. Protocol-relative ``//host/...`` values get an
    ``https:`` scheme.
    """
    if isinstance(value, dict):
        value = value.get("url") or value.get("contentUrl") or value.get("src")
    if not isinstance(value, str):
        return ""
    url = value.strip()
    return "https:" + url if url.startswith("//") else url


def _json_ld_text(value: Any) -> str:
    """Return *value* whitespace-normalised when it is a string, else ``""``."""
    return _clean_text(value) if isinstance(value, str) else ""


def _json_ld_products(data: Any) -> list[dict[str, Any]]:
    """Collect every node typed ``Product`` from a decoded JSON-LD payload.

    Handles a single object, a list of objects, list-valued ``@type`` and
    nodes nested under ``@graph``. Anything else yields an empty list.
    """
    if isinstance(data, list):
        return [node for item in data for node in _json_ld_products(item)]
    if not isinstance(data, dict):
        return []
    node_type = data.get("@type")
    types = node_type if isinstance(node_type, list) else [node_type]
    found = [data] if "Product" in types else []
    return found + _json_ld_products(data.get("@graph"))


def _offer_price(offers: Any) -> str:
    """Return the price of the (first) offer as a string, or ``""`` if absent."""
    if isinstance(offers, list):
        offers = offers[0] if offers else None
    if not isinstance(offers, dict):
        return ""
    price = offers.get("price")
    if price is None:
        # Aggregate offers publish a range instead of a single price.
        price = offers.get("lowPrice")
    # An explicit null must not turn into the truthy string "None".
    return "" if price is None else str(price).strip()


def _brand_name(brand: Any) -> str:
    """Return the brand as text; *brand* may be a string or a dict with ``name``."""
    if isinstance(brand, dict):
        brand = brand.get("name")
    return _json_ld_text(brand)


def _apply_json_ld(soup: Any, product: dict[str, Any]) -> None:
    """Fill name, description, price, images and brand from JSON-LD Product nodes.

    Stops as soon as both a name and a price are known. A node never blanks
    out a value that an earlier node supplied.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(script.string or "{}")
        except (json.JSONDecodeError, TypeError):
            continue  # malformed structured data: try the next script tag
        for node in _json_ld_products(payload):
            product["product_name"] = (
                _json_ld_text(node.get("name")) or product["product_name"]
            )
            product["description"] = (
                _json_ld_text(node.get("description")) or product["description"]
            )
            product["price"] = _offer_price(node.get("offers")) or product["price"]
            product["brand"] = product["brand"] or _brand_name(node.get("brand"))
            images = node.get("image")
            if images:
                candidates = images if isinstance(images, list) else [images]
                urls = [u for u in map(_image_url, candidates[:_MAX_IMAGES]) if u]
                if urls:
                    product["product_images"] = ", ".join(urls)
            if product["product_name"] and product["price"]:
                return


def _apply_html_fallbacks(soup: Any, product: dict[str, Any]) -> None:
    """Fill fields that are still empty from meta tags and page markup."""
    if not product["product_name"]:
        meta = soup.find("meta", property="og:title")
        if meta and meta.get("content"):
            # Keep only the part before the first "|" of the og:title value.
            product["product_name"] = _clean_text(meta["content"].split("|")[0].strip())
    if not product["description"]:
        meta = soup.find("meta", property="og:description") or soup.find(
            "meta", attrs={"name": "description"}
        )
        if meta and meta.get("content"):
            product["description"] = _clean_text(meta["content"])
    if not product["product_images"]:
        meta = soup.find("meta", property="og:image")
        if meta and meta.get("content"):
            product["product_images"] = _image_url(meta["content"])
    if not product["product_name"]:
        h1 = soup.find("h1")
        if h1:
            product["product_name"] = _clean_text(h1.get_text())
    if not product["price"]:
        for sel in ["[class*='price']", ".product__price", "[data-product-price]", ".price-item"]:
            el = soup.select_one(sel)
            if el:
                product["price"] = _extract_price_from_text(el.get_text())
                if product["price"]:
                    break
    if not product["price"]:
        # Last resort: scan the text of the whole page.
        product["price"] = _extract_price_from_text(soup.get_text())
    if not product["description"]:
        # The description meta tags were already tried above, so only the
        # on-page description container is left to check.
        desc_el = soup.find(
            "div",
            class_=re.compile(r"description|product-description|product__description", re.I),
        )
        if desc_el:
            product["description"] = _clean_text(desc_el.get_text())


def _fetch_shopify_image_urls(parsed: Any, headers: dict[str, str]) -> list[str]:
    """Return up to ``_MAX_IMAGES`` image URLs from ``/products/<handle>.json``.

    *parsed* is the ``urlparse`` result of the product page URL. Best effort:
    a URL without a ``products/<handle>`` path, a network error, an error
    status or an unexpected payload all yield ``[]``.
    """
    segments = [seg for seg in (parsed.path or "").split("/") if seg]
    if "products" not in segments:
        return []
    # Locate the handle right after "products" wherever that segment sits, so
    # paths such as /collections/<name>/products/<handle> are covered too.
    idx = segments.index("products")
    if idx + 1 >= len(segments):
        return []
    handle = segments[idx + 1]
    product_json_url = f"{parsed.scheme}://{parsed.netloc}/products/{handle}.json"
    try:
        r = requests.get(
            product_json_url, headers={**headers, "Accept": "application/json"}, timeout=10
        )
        if not r.ok:
            return []
        data = r.json()
    except (requests.RequestException, ValueError):
        return []
    if not isinstance(data, dict):
        return []
    # Root is the product object itself, or wrapped as {"product": {...}}.
    prod = data.get("product") if isinstance(data.get("product"), dict) else data
    images = prod.get("images")
    if not isinstance(images, list):
        return []
    urls: list[str] = []
    for img in images[:_MAX_IMAGES]:
        u = _image_url(img)
        if u.startswith("http") and u not in urls:
            urls.append(u)
    return urls


def _category_from_path(path: str) -> str:
    """Guess the jewellery category from keywords in the URL path."""
    lowered = path.lower()
    for label, keywords in _CATEGORY_KEYWORDS:
        if any(word in lowered for word in keywords):
            return label
    return "Jewellery"


def scrape_product(url: str) -> dict[str, Any]:
    """Fetch a product page and extract its basic product data.

    Sources are tried in order: JSON-LD ``Product`` data, then Open Graph /
    meta tags and page markup. When the URL path contains
    ``products/<handle>``, the image list from ``/products/<handle>.json``
    replaces any images found earlier. ``category`` is guessed from keywords
    in the URL path and defaults to ``"Jewellery"``.

    Args:
        url: Absolute URL of the product page.

    Returns:
        A dict with ``product_name``, ``description``, ``price``,
        ``product_images`` (comma-separated URLs), ``brand`` and ``category``
        filled where found. ``offers``, ``target_audience``, ``competitors``
        and ``psychological_triggers`` are left as ``""`` and ``show_product``
        as ``None`` for the caller to fill in.

    Raises:
        ValueError: If *url* has no scheme or host.
        requests.RequestException: If the page request fails or returns an
            error status.
    """
    parsed = urlparse(url)
    if not parsed.scheme or not parsed.netloc:
        raise ValueError(f"Invalid URL: {url}")
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-IN,en;q=0.9",
    }
    resp = requests.get(url, headers=headers, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")
    product: dict[str, Any] = {
        "product_name": "",
        "description": "",
        "price": "",
        "offers": "",
        "product_images": "",
        "brand": "",
        "category": "",
        "target_audience": "",
        "competitors": "",
        "psychological_triggers": "",
        "show_product": None,
    }
    _apply_json_ld(soup, product)
    _apply_html_fallbacks(soup, product)
    # The product JSON carries the full image list, so it takes precedence
    # over images found in the page itself.
    shopify_images = _fetch_shopify_image_urls(parsed, headers)
    if shopify_images:
        product["product_images"] = ", ".join(shopify_images)
    product["category"] = product["category"] or _category_from_path(parsed.path or "")
    # Log scraped data for verification (especially product images)
    _images = [u.strip() for u in (product.get("product_images") or "").split(",") if u.strip()]
    print(
        "[scraper] product_name=%r category=%r | product_images count=%d | urls=%s"
        % (product.get("product_name"), product.get("category"), len(_images), _images)
    )
    return product