Spaces:

rnyx
/

ecom-qa-bert_f

Sleeping

App Files Files Community

ecom-qa-bert_f / src /scraper.py

rnyx

Upload 2 files

202ae51 verified about 1 month ago

raw

history blame contribute delete

6.77 kB

	"""
	FILE 3: src/scraper.py — Web Scraper
	======================================
	IMPORTED BY: app.py (called by /api/scrape route)
	IMPORTS: requests, beautifulsoup4

	Functions:
	scrape_url(url) → fetches page, extracts product text, returns dict

	Supports: Amazon (.in/.com), Flipkart, any generic webpage.
	Returns combined "context" string ready for BERT QA.
	"""

	import re
	import logging
	import requests
	from bs4 import BeautifulSoup

	# Module-level logger named after this module.
	logger = logging.getLogger(__name__)

	# Browser-like request headers passed to requests.get() by scrape_url().
	HEADERS = {
	    "User-Agent": (
	        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	        "AppleWebKit/537.36 (KHTML, like Gecko) "
	        "Chrome/122.0.0.0 Safari/537.36"
	    ),
	    "Accept-Language": "en-US,en;q=0.9",
	    # The catch-all media range is "*/*"; a bare "/" is not a valid range.
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	}


	def scrape_url(url: str) -> dict:
	    """
	    Scrape a product page and return structured data.

	    Called by: app.py → api_scrape route
	    Input: URL string (a missing http:// or https:// scheme is filled in
	    with https://)
	    Returns on success: {
	        "title": "Samsung Galaxy S24 Ultra",
	        "context": "Product: Samsung Galaxy... Features: 6.8-inch...",
	        "features": "...",
	        "description": "...",
	        "specs": "...",
	        "source": "amazon" | "flipkart" | "generic",
	        "char_count": 1847,
	        "warning": "..." (optional, if extraction was poor)
	    }
	    Returns on failure: {"error": "..."} — request errors are reported in
	    the dict instead of being raised.

	    The "context" field is what gets sent to model.py for QA.
	    """
	    # Local import so this block does not depend on the file's import header.
	    from urllib.parse import urlparse

	    url = (url or "").strip()
	    # Only a real scheme prefix counts: a bare host such as "httpbin.org"
	    # also starts with "http" but still needs a scheme, and "HTTP://" is valid.
	    if not url.lower().startswith(("http://", "https://")):
	        url = "https://" + url

	    # NOTE(review): this fetches whatever URL the caller supplies; if the
	    # route is exposed to untrusted users, consider restricting target hosts.
	    try:
	        logger.info("Scraping: %s", url)
	        resp = requests.get(url, headers=HEADERS, timeout=15)
	        resp.raise_for_status()
	    except requests.exceptions.ConnectionError:
	        return {"error": f"Cannot connect to {url}. Check the URL."}
	    except requests.exceptions.Timeout:
	        return {"error": "Request timed out (15s). Try again."}
	    except requests.exceptions.HTTPError:
	        # resp is bound here: only raise_for_status() raises HTTPError.
	        return {"error": f"HTTP {resp.status_code}. Site may block scrapers."}
	    except Exception as e:
	        return {"error": str(e)}

	    soup = BeautifulSoup(resp.text, "html.parser")
	    # Drop non-content elements before any text extraction.
	    for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
	        tag.decompose()

	    # Choose the extractor from the hostname of the final (post-redirect)
	    # URL, so a path or query that merely mentions a shop name does not
	    # select the wrong parser.
	    host = (urlparse(resp.url or url).hostname or "").lower()
	    if "amazon" in host:
	        data = _amazon(soup)
	        data["source"] = "amazon"
	    elif "flipkart" in host:
	        data = _flipkart(soup)
	        data["source"] = "flipkart"
	    else:
	        data = _generic(soup)
	        data["source"] = "generic"

	    # Build combined context for QA
	    parts = []
	    if data.get("title"):
	        parts.append(f"Product: {data['title']}.")
	    if data.get("features"):
	        parts.append(f"Features: {data['features']}")
	    if data.get("description"):
	        parts.append(f"Description: {data['description']}")
	    if data.get("specs"):
	        parts.append(f"Specifications: {data['specs']}")

	    # Collapse every whitespace run to a single space.
	    context = re.sub(r"\s+", " ", " ".join(parts)).strip()
	    data["context"] = context
	    data["char_count"] = len(context)

	    if len(context) < 50:
	        data["warning"] = (
	            "Very little text extracted. The site may block scrapers or use "
	            "heavy JavaScript. Try pasting text manually in Text mode."
	        )

	    logger.info(
	        "Scraped [%s]: %s... (%d chars)",
	        data["source"],
	        data.get("title", "?")[:50],
	        len(context),
	    )
	    return data


	def _amazon(soup):
	    """Extract from Amazon product pages.

	    Looks up the element ids and table classes hard-coded below in the
	    parsed page and returns a dict with the keys "title", "features",
	    "description" and "specs". Each value is a string, left empty when the
	    corresponding element is not found. Specs are rendered as
	    "key: value" entries joined by " | ".
	    """
	    d = {"title": "", "features": "", "description": "", "specs": ""}

	    tag = soup.find("span", {"id": "productTitle"})
	    if tag:
	        d["title"] = tag.get_text(strip=True)

	    feat = soup.find("div", {"id": "feature-bullets"})
	    if feat:
	        bullets = (li.get_text(strip=True) for li in feat.find_all("li"))
	        d["features"] = " ".join(text for text in bullets if text)

	    desc = soup.find("div", {"id": "productDescription"})
	    if desc:
	        d["description"] = desc.get_text(strip=True)
	    else:
	        # Fall back to the first ten paragraphs / list items of the "aplus" block.
	        aplus = soup.find("div", {"id": "aplus"})
	        if aplus:
	            d["description"] = " ".join(
	                p.get_text(strip=True) for p in aplus.find_all(["p", "li"])[:10]
	            )

	    specs = []
	    seen = set()

	    def _collect_rows(table):
	        # Append each non-empty "header: value" row once, keeping first-seen order.
	        for row in table.find_all("tr"):
	            th, td = row.find("th"), row.find("td")
	            if not (th and td):
	                continue
	            k, v = th.get_text(strip=True), td.get_text(strip=True)
	            entry = f"{k}: {v}"
	            if k and v and entry not in seen:
	                seen.add(entry)
	                specs.append(entry)

	    # Raw string with a real alternation: an escaped "\|" would only match a
	    # literal pipe character, so neither class name would ever be found.
	    for table in soup.find_all("table", class_=re.compile(r"prodDetTable|a-keyvalue")):
	        _collect_rows(table)

	    # The tech-spec table may already have been matched by class above;
	    # the shared seen-set keeps its rows from being listed twice.
	    detail = soup.find("table", {"id": "productDetails_techSpec_section_1"})
	    if detail:
	        _collect_rows(detail)

	    d["specs"] = " | ".join(specs)
	    return d


	def _flipkart(soup):
	    """Extract from Flipkart product pages.

	    Uses the CSS selectors and class-name patterns hard-coded below and
	    returns a dict with the keys "title", "features", "description" and
	    "specs". Each value is a string, left empty when nothing matches.
	    Specs are rendered as "key: value" entries joined by " | ".
	    """
	    d = {"title": "", "features": "", "description": "", "specs": ""}

	    # Try the title selectors in order and keep the first one that matches.
	    for sel in ["span.VU-ZEz", "span.B_NuCI", "h1 span"]:
	        tag = soup.select_one(sel)
	        if tag:
	            d["title"] = tag.get_text(strip=True)
	            break

	    # The class patterns are raw strings with a real alternation: an escaped
	    # "\|" would only match a literal pipe and never any of these class names.
	    highlights = soup.find_all("li", class_=re.compile(r"_21Ahn-|col-12"))
	    if highlights:
	        # Only the first 15 matching list items are used.
	        d["features"] = " ".join(h.get_text(strip=True) for h in highlights[:15])

	    desc = soup.find("div", class_=re.compile(r"_1mXcCf|_1AN87F"))
	    if desc:
	        d["description"] = desc.get_text(strip=True)

	    specs = []
	    for table in soup.find_all("table", class_=re.compile(r"_14cfVK|_1s_Smc")):
	        for row in table.find_all("tr"):
	            cells = row.find_all("td")
	            if len(cells) < 2:
	                continue
	            # First cell is used as the key, second as the value.
	            k, v = cells[0].get_text(strip=True), cells[1].get_text(strip=True)
	            if k and v:
	                specs.append(f"{k}: {v}")
	    d["specs"] = " | ".join(specs)
	    return d


	def _generic(soup):
	    """Fallback extractor for pages that get no site-specific handling.

	    The title is the first <h1> text, else the <title> text, else "".
	    The description is up to 25 distinct text chunks, each longer than
	    30 characters, taken in document order from p/li/td/span/div elements
	    and joined with single spaces. "features" and "specs" stay empty.
	    """
	    heading = soup.find("h1")
	    if heading:
	        page_title = heading.get_text(strip=True)
	    elif soup.title:
	        page_title = soup.title.get_text(strip=True)
	    else:
	        page_title = ""

	    chunks = []
	    already_seen = set()
	    # NOTE(review): a container and its children are both candidates here,
	    # so nested text can presumably show up more than once — confirm intent.
	    for node in soup.find_all(["p", "li", "td", "span", "div"]):
	        if len(chunks) >= 25:
	            break
	        text = node.get_text(strip=True)
	        if len(text) <= 30 or text in already_seen:
	            continue
	        already_seen.add(text)
	        chunks.append(text)

	    return {
	        "title": page_title,
	        "features": "",
	        "description": " ".join(chunks),
	        "specs": "",
	    }