Spaces:

rnyx
/

ecom-qa

Sleeping

App Files Files Community

ecom-qa / src /scraper /scraper.py

rnyx

Initial deploy

f48b219 about 1 month ago

Raw

History Blame Contribute Delete

7.45 kB

	"""
	src/scraper/scraper.py
	Fix #3 — 3-tier scraping with caching:
	Tier 1: Playwright (headless Chromium + stealth) — handles JS-heavy pages
	Tier 2: requests + BeautifulSoup with rotated headers — fast, for simple pages
	Tier 3: Graceful failure — tells user to paste text manually
	Also implements URL-level SQLite caching (Fix #3 cache requirement).
	"""

	import re
	import time
	import logging
	import hashlib
	from urllib.parse import urlparse

	import requests
	from bs4 import BeautifulSoup

	logger = logging.getLogger(__name__)

	# How long to keep a cached page (seconds)
	CACHE_TTL_SECONDS = 60 * 60 * 6 # 6 hours

	# Realistic browser headers to reduce bot detection
	HEADERS_POOL = [
	{
	"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	"AppleWebKit/537.36 (KHTML, like Gecko) "
	"Chrome/124.0.0.0 Safari/537.36",
	"Accept-Language": "en-US,en;q=0.9",
	"Accept-Encoding": "gzip, deflate, br",
	"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,/;q=0.8",
	"Connection": "keep-alive",
	},
	{
	"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) "
	"AppleWebKit/605.1.15 (KHTML, like Gecko) "
	"Version/17.4.1 Safari/605.1.15",
	"Accept-Language": "en-GB,en;q=0.9",
	"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8",
	},
	]

	# CSS selectors for product content on known sites
	SITE_SELECTORS = {
	"amazon": ["#productTitle", "#feature-bullets", "#productDescription",
	"#aplus", ".a-expander-content", "#customerReviews",
	"[data-component-type='s-customer-reviews']"],
	"flipkart": ["._1AtVbE", ".G4BRas", "._1mXcCf", "._3nPA3R",
	"._3dtsli", ".t-ZTKy", "._2kHMtA"],
	"default": ["main", "article", "#content", ".product", ".description",
	"[itemprop='description']", "body"],
	}


	def _url_hash(url: str) -> str:
	return hashlib.sha256(url.encode()).hexdigest()[:32]


	def _detect_site(url: str) -> str:
	domain = urlparse(url).netloc.lower()
	if "amazon" in domain:
	return "amazon"
	if "flipkart" in domain:
	return "flipkart"
	return "default"


	def _clean_text(text: str) -> str:
	"""Remove excessive whitespace and non-printable characters."""
	text = re.sub(r"[\r\n\t]+", " ", text)
	text = re.sub(r" {2,}", " ", text)
	return text.strip()


	def _extract_from_soup(soup: BeautifulSoup, site: str) -> str:
	"""Extract product text using site-specific selectors."""
	selectors = SITE_SELECTORS.get(site, SITE_SELECTORS["default"])
	parts = []
	for sel in selectors:
	for el in soup.select(sel):
	t = el.get_text(separator=" ").strip()
	if t:
	parts.append(t)

	if not parts:
	# last resort: all paragraph text
	parts = [p.get_text() for p in soup.find_all("p") if len(p.get_text()) > 30]

	return _clean_text(" ".join(parts))


	class Scraper:
	def __init__(self, db=None):
	self._db = db

	# ── Cache helpers ─────────────────────────────────────────────────────────
	def _get_cached(self, url: str):
	if self._db is None:
	return None
	return self._db.get_cached_page(url)

	def _save_cache(self, url: str, text: str):
	if self._db:
	self._db.cache_page(url, text)

	# ── Tier 1: Playwright ────────────────────────────────────────────────────
	def _scrape_playwright(self, url: str) -> str \| None:
	try:
	from playwright.sync_api import sync_playwright
	except ImportError:
	logger.info("Playwright not installed — skipping Tier 1")
	return None

	logger.info("Tier 1 scraping (Playwright): %s", url)
	try:
	with sync_playwright() as p:
	browser = p.chromium.launch(headless=True, args=[
	"--no-sandbox", "--disable-setuid-sandbox",
	"--disable-blink-features=AutomationControlled",
	])
	ctx = browser.new_context(
	user_agent=HEADERS_POOL[0]["User-Agent"],
	viewport={"width": 1280, "height": 720},
	locale="en-US",
	)
	# Stealth: hide navigator.webdriver
	ctx.add_init_script("""
	Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
	Object.defineProperty(navigator, 'plugins', {get: () => [1,2,3]});
	""")
	page = ctx.new_page()
	page.goto(url, timeout=20_000, wait_until="domcontentloaded")
	time.sleep(2) # let lazy content load
	html = page.content()
	browser.close()

	soup = BeautifulSoup(html, "html.parser")
	text = _extract_from_soup(soup, _detect_site(url))
	return text if len(text) > 200 else None

	except Exception as e:
	logger.warning("Playwright scraping failed: %s", e)
	return None

	# ── Tier 2: requests + BeautifulSoup ─────────────────────────────────────
	def _scrape_requests(self, url: str) -> str \| None:
	logger.info("Tier 2 scraping (requests): %s", url)
	import random
	headers = random.choice(HEADERS_POOL)
	try:
	resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True)
	resp.raise_for_status()
	soup = BeautifulSoup(resp.text, "html.parser")
	# Remove script / style noise
	for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
	tag.decompose()
	text = _extract_from_soup(soup, _detect_site(url))
	return text if len(text) > 200 else None
	except Exception as e:
	logger.warning("requests scraping failed: %s", e)
	return None

	# ── Public interface ──────────────────────────────────────────────────────
	def scrape(self, url: str) -> tuple[str, str]:
	"""
	Returns (text, source_tier) where source_tier is one of:
	'cache' \| 'playwright' \| 'requests' \| None
	Raises ValueError if all tiers fail.
	"""
	# Check cache first
	cached = self._get_cached(url)
	if cached:
	logger.info("Cache hit for %s", url)
	return cached, "cache"

	# Tier 1
	text = self._scrape_playwright(url)
	if text:
	self._save_cache(url, text)
	return text, "playwright"

	# Tier 2
	text = self._scrape_requests(url)
	if text:
	self._save_cache(url, text)
	return text, "requests"

	# Tier 3: fail gracefully
	raise ValueError(
	"Could not scrape this URL automatically. "
	"Please copy and paste the product description text manually."
	)