# ecom-qa-bert-v2 / src/scraper.py
# Commit ba70fd1 (rnyx) — v3: Multi-site support (Amazon, Flipkart, Myntra, Meesho, TataCliq, Nykaa)
"""
Unified scraper.
Calls the right parser based on the URL via the site registry. Tries
ScraperAPI first (residential proxy → bypasses most blocks), falls back
to Playwright (headless browser), then to plain requests as last resort.
Site parsers all return the same ProductData shape — see src/sites/_base.py.
"""
import logging
import random
import re
from typing import Optional
from bs4 import BeautifulSoup
from . import config
from .sites import find_parser, generic
from .sites._base import clean
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Desktop browser User-Agent strings (Chrome on Windows, Safari on macOS,
# Chrome on Linux). The Playwright and plain-requests fetchers below pick one
# at random for each fetch via random.choice().
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/17.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
]
def _first_successful_fetch(
    url: str,
) -> tuple[Optional[str], Optional[str], Optional[str]]:
    """Run the fetchers in priority order and stop at the first usable HTML.

    Returns ``(html, method, None)`` on success, where *method* is one of
    ``"scraperapi"``, ``"playwright"`` or ``"requests"``; returns
    ``(None, None, last_error)`` when every enabled stage failed.
    """
    # Enabled-checks are callables so a later stage's config flag is only
    # read if the earlier stages did not already produce HTML.
    stages = (
        ("ScraperAPI", "scraperapi",
         lambda: config.SCRAPERAPI_ENABLED, _fetch_scraperapi),
        ("Playwright", "playwright",
         lambda: config.PLAYWRIGHT_ENABLED, _fetch_playwright),
        ("Requests", "requests",
         lambda: True, _fetch_requests),
    )

    last_error: Optional[str] = None
    for label, method, is_enabled, fetch in stages:
        if not is_enabled():
            continue
        try:
            html, err = fetch(url)
        except Exception as exc:
            # Boundary handler: one crashing fetcher must not stop the
            # pipeline from trying the next one.
            last_error = f"{label} crashed: {exc}"
            logger.exception(last_error)
            continue
        if html:
            return html, method, None
        # A fetcher may hand back no HTML *and* no reason; never let that
        # erase the error message with None.
        last_error = err or f"{label} returned no HTML"
        logger.warning("%s failed: %s", label, last_error)

    return None, None, last_error


def _build_context(data: dict) -> str:
    """Join the non-empty product fields into the labelled QA context text."""
    labelled_fields = (
        ("title", "Product: {}."),
        ("features", "Features: {}"),
        ("description", "Description: {}"),
        ("specs", "Specifications: {}"),
        ("materials", "Materials: {}"),
        ("sizes", "Available sizes: {}"),
        ("return_policy", "Return policy: {}"),
        ("rating_text", "Rating: {}"),
    )
    parts = [
        template.format(data[key])
        for key, template in labelled_fields
        if data.get(key)
    ]
    return clean(" ".join(parts), limit=20000)


def scrape_url(url: str) -> dict:
    """
    Fetch a product page and extract its data plus a QA context string.

    Scraping pipeline:
      1. ScraperAPI (residential proxy)
      2. Playwright (headless Chromium)
      3. Plain requests (last resort)
    The first stage that returns non-empty HTML wins. The parser registered
    for the URL (or the generic parser when none matches) then extracts the
    product data from that HTML.

    Args:
        url: Product page URL. Surrounding whitespace is stripped and
            ``https://`` is prepended when no http(s) scheme is present.

    Returns:
        On failure, a dict with an ``"error"`` message (and
        ``"scraper_used": "none"`` when all fetchers failed). On success, the
        parser's dict extended with ``"context"``, ``"char_count"``,
        ``"scraper_used"``, ``"reviews"``, ``"review_count"``, and optionally
        ``"warning"`` (very little text extracted) and ``"parse_error"``
        (site parser crashed and the generic parser was used instead).
    """
    # Strip before the emptiness check so a whitespace-only value is rejected
    # instead of being turned into the bogus URL "https://".
    url = (url or "").strip()
    if not url:
        return {"error": "URL is required."}
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    html, method, last_error = _first_successful_fetch(url)
    if not html:
        return {
            "error": f"All scrapers failed. Last: {last_error or 'unknown'}",
            "scraper_used": "none",
        }

    # ── Parse with the right site parser ──
    soup = BeautifulSoup(html, "html.parser")
    # Drop non-content tags before any parser sees the tree.
    for tag in soup(["script", "style", "noscript", "iframe"]):
        tag.decompose()

    parser = find_parser(url)
    if parser is None:
        parser = generic.parse
        logger.info(f"No site parser for {url} — using generic")
    else:
        logger.info(f"Using {parser.__module__} for {url}")

    try:
        data = parser(soup).to_dict()
    except Exception as e:
        logger.exception("Parser crashed; falling back to generic")
        data = generic.parse(soup).to_dict()
        data["parse_error"] = str(e)

    # ── Build the QA context (everything BERT can search over) ──
    context = _build_context(data)
    # `or []` also covers a parser that reports reviews as None.
    reviews = data.get("reviews") or []

    data["context"] = context
    data["char_count"] = len(context)
    data["scraper_used"] = method
    data["reviews"] = reviews
    data["review_count"] = len(reviews)

    if len(context) < 50 and not reviews:
        data["warning"] = (
            f"Very little usable text was extracted with {method}. "
            "The site may have served a CAPTCHA or blocked content. "
            "Try the URL again, or paste the product description manually "
            "in Text mode."
        )

    # Logging must never be the thing that fails a successful scrape:
    # tolerate a missing source and a missing/None title.
    logger.info(
        "Scraped [%s via %s] title=%r chars=%d reviews=%d",
        data.get("source", "?"),
        method,
        str(data.get("title") or "?")[:60],
        len(context),
        data["review_count"],
    )
    return data
# ────────────────────────────────────────────────────────────────────
# Fetchers
# ────────────────────────────────────────────────────────────────────
def _fetch_scraperapi(url: str) -> tuple[Optional[str], Optional[str]]:
    """Fetch *url* through the ScraperAPI HTTP endpoint.

    Returns (html, None) on success, (None, error_str) on failure.
    Network errors, HTTP status >= 400 and an empty body are all reported
    as failures rather than raised.
    """
    import requests
    from urllib.parse import quote_plus, urlencode

    api_key = config.SCRAPERAPI_KEY
    if not api_key:
        return None, "ScraperAPI key not configured."

    params = {
        "api_key": api_key,
        "url": url,
        "country_code": "in",
        "render": "true" if config.SCRAPERAPI_RENDER_JS else "false",
        "keep_headers": "false",
    }
    request_url = f"https://api.scraperapi.com?{urlencode(params)}"
    timeout = config.SCRAPERAPI_TIMEOUT

    try:
        logger.info(f"ScraperAPI fetching: {url}")
        resp = requests.get(request_url, timeout=timeout)
    except requests.exceptions.Timeout:
        return None, f"ScraperAPI timeout ({timeout}s)"
    except requests.exceptions.RequestException as e:
        # The request URL carries the API key in its query string, and the
        # exception text can include that URL. This message is returned to
        # the caller, so scrub the key (raw and URL-encoded) first.
        detail = str(e)
        for secret in (quote_plus(str(api_key)), str(api_key)):
            detail = detail.replace(secret, "***")
        return None, f"ScraperAPI network: {detail}"

    # Statuses that get a dedicated explanation; any other 4xx/5xx is
    # reported generically below.
    known_failures = {
        401: "ScraperAPI key rejected (401)",
        403: "ScraperAPI denied (403) — out of credits?",
        429: "ScraperAPI rate limited (429)",
    }
    status = resp.status_code
    if status in known_failures:
        return None, known_failures[status]
    if status >= 400:
        return None, f"ScraperAPI HTTP {status}"

    html = resp.text or ""
    if not html.strip():
        return None, "ScraperAPI returned empty body"
    return html, None
def _fetch_playwright(url: str) -> tuple[Optional[str], Optional[str]]:
try:
from playwright.sync_api import sync_playwright, TimeoutError as PWTimeoutError
except ImportError:
return None, "Playwright not installed"
with sync_playwright() as pw:
try:
browser = pw.chromium.launch(
headless=config.PLAYWRIGHT_HEADLESS,
args=[
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-blink-features=AutomationControlled",
],
)
except Exception as e:
return None, f"Browser launch failed: {e}"
try:
context = browser.new_context(
user_agent=random.choice(USER_AGENTS),
viewport={"width": 1280, "height": 900},
locale="en-US",
)
context.route(
"**/*",
lambda route: (
route.abort()
if route.request.resource_type in {"image", "media", "font"}
else route.continue_()
),
)
page = context.new_page()
page.set_default_timeout(config.PLAYWRIGHT_TIMEOUT_MS)
try:
page.goto(url, wait_until="domcontentloaded",
timeout=config.PLAYWRIGHT_TIMEOUT_MS)
except PWTimeoutError:
return None, f"Page load timed out ({config.PLAYWRIGHT_TIMEOUT_MS//1000}s)"
head = page.content()[:5000].lower()
if any(p in head for p in [
"enter the characters you see below",
"type the characters",
"automated access",
]):
return None, "Site served a CAPTCHA"
try:
page.wait_for_load_state("networkidle", timeout=8000)
except PWTimeoutError:
pass
# Auto-scroll for lazy-loaded reviews (helpful on Flipkart, Myntra)
try:
for _ in range(3):
page.mouse.wheel(0, 1500)
page.wait_for_timeout(400)
except Exception:
pass
html = page.content()
finally:
try:
context.close()
except Exception:
pass
browser.close()
return html, None
def _fetch_requests(url: str) -> tuple[Optional[str], Optional[str]]:
    """Plain requests — last-resort fallback.

    Returns (html, None) on success, (None, error_str) on failure. Timeouts,
    other request errors, HTTP status >= 400 and an empty body are reported
    as failures rather than raised.
    """
    import requests

    headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Connection": "keep-alive",
    }

    try:
        resp = requests.get(url, headers=headers,
                            timeout=config.SCRAPE_TIMEOUT)
    except requests.exceptions.Timeout:
        return None, "Request timeout"
    except requests.exceptions.RequestException as e:
        return None, f"Request error: {e}"

    status = resp.status_code
    if status == 403:
        return None, "Site blocked the request (HTTP 403)"
    if status >= 400:
        return None, f"HTTP {status}"

    html = resp.text or ""
    # An empty body is a failure, not a success with no error: returning
    # ("", None) would leave the caller with neither HTML nor a reason.
    if not html.strip():
        return None, "Site returned an empty body"
    return html, None