| """ |
| src/scraper/scraper.py |
| Fix #3 — 3-tier scraping with caching: |
| Tier 1: Playwright (headless Chromium + stealth) — handles JS-heavy pages |
| Tier 2: requests + BeautifulSoup with rotated headers — fast, for simple pages |
| Tier 3: Graceful failure — tells user to paste text manually |
| Also implements URL-level SQLite caching (Fix #3 cache requirement). |
| """ |
|
|
| import re |
| import time |
| import logging |
| import hashlib |
| from urllib.parse import urlparse |
|
|
| import requests |
| from bs4 import BeautifulSoup |
|
|
| logger = logging.getLogger(__name__) |
|
|
| |
| CACHE_TTL_SECONDS = 60 * 60 * 6 |
|
|
| |
| HEADERS_POOL = [ |
| { |
| "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " |
| "AppleWebKit/537.36 (KHTML, like Gecko) " |
| "Chrome/124.0.0.0 Safari/537.36", |
| "Accept-Language": "en-US,en;q=0.9", |
| "Accept-Encoding": "gzip, deflate, br", |
| "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", |
| "Connection": "keep-alive", |
| }, |
| { |
| "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) " |
| "AppleWebKit/605.1.15 (KHTML, like Gecko) " |
| "Version/17.4.1 Safari/605.1.15", |
| "Accept-Language": "en-GB,en;q=0.9", |
| "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", |
| }, |
| ] |
|
|
| |
| SITE_SELECTORS = { |
| "amazon": ["#productTitle", "#feature-bullets", "#productDescription", |
| "#aplus", ".a-expander-content", "#customerReviews", |
| "[data-component-type='s-customer-reviews']"], |
| "flipkart": ["._1AtVbE", ".G4BRas", "._1mXcCf", "._3nPA3R", |
| "._3dtsli", ".t-ZTKy", "._2kHMtA"], |
| "default": ["main", "article", "#content", ".product", ".description", |
| "[itemprop='description']", "body"], |
| } |
|
|
|
|
| def _url_hash(url: str) -> str: |
| return hashlib.sha256(url.encode()).hexdigest()[:32] |
|
|
|
|
| def _detect_site(url: str) -> str: |
| domain = urlparse(url).netloc.lower() |
| if "amazon" in domain: |
| return "amazon" |
| if "flipkart" in domain: |
| return "flipkart" |
| return "default" |
|
|
|
|
| def _clean_text(text: str) -> str: |
| """Remove excessive whitespace and non-printable characters.""" |
| text = re.sub(r"[\r\n\t]+", " ", text) |
| text = re.sub(r" {2,}", " ", text) |
| return text.strip() |
|
|
|
|
| def _extract_from_soup(soup: BeautifulSoup, site: str) -> str: |
| """Extract product text using site-specific selectors.""" |
| selectors = SITE_SELECTORS.get(site, SITE_SELECTORS["default"]) |
| parts = [] |
| for sel in selectors: |
| for el in soup.select(sel): |
| t = el.get_text(separator=" ").strip() |
| if t: |
| parts.append(t) |
|
|
| if not parts: |
| |
| parts = [p.get_text() for p in soup.find_all("p") if len(p.get_text()) > 30] |
|
|
| return _clean_text(" ".join(parts)) |
|
|
|
|
| class Scraper: |
| def __init__(self, db=None): |
| self._db = db |
|
|
| |
| def _get_cached(self, url: str): |
| if self._db is None: |
| return None |
| return self._db.get_cached_page(url) |
|
|
| def _save_cache(self, url: str, text: str): |
| if self._db: |
| self._db.cache_page(url, text) |
|
|
| |
| def _scrape_playwright(self, url: str) -> str | None: |
| try: |
| from playwright.sync_api import sync_playwright |
| except ImportError: |
| logger.info("Playwright not installed — skipping Tier 1") |
| return None |
|
|
| logger.info("Tier 1 scraping (Playwright): %s", url) |
| try: |
| with sync_playwright() as p: |
| browser = p.chromium.launch(headless=True, args=[ |
| "--no-sandbox", "--disable-setuid-sandbox", |
| "--disable-blink-features=AutomationControlled", |
| ]) |
| ctx = browser.new_context( |
| user_agent=HEADERS_POOL[0]["User-Agent"], |
| viewport={"width": 1280, "height": 720}, |
| locale="en-US", |
| ) |
| |
| ctx.add_init_script(""" |
| Object.defineProperty(navigator, 'webdriver', {get: () => undefined}); |
| Object.defineProperty(navigator, 'plugins', {get: () => [1,2,3]}); |
| """) |
| page = ctx.new_page() |
| page.goto(url, timeout=20_000, wait_until="domcontentloaded") |
| time.sleep(2) |
| html = page.content() |
| browser.close() |
|
|
| soup = BeautifulSoup(html, "html.parser") |
| text = _extract_from_soup(soup, _detect_site(url)) |
| return text if len(text) > 200 else None |
|
|
| except Exception as e: |
| logger.warning("Playwright scraping failed: %s", e) |
| return None |
|
|
| |
| def _scrape_requests(self, url: str) -> str | None: |
| logger.info("Tier 2 scraping (requests): %s", url) |
| import random |
| headers = random.choice(HEADERS_POOL) |
| try: |
| resp = requests.get(url, headers=headers, timeout=15, allow_redirects=True) |
| resp.raise_for_status() |
| soup = BeautifulSoup(resp.text, "html.parser") |
| |
| for tag in soup(["script", "style", "nav", "footer", "header", "aside"]): |
| tag.decompose() |
| text = _extract_from_soup(soup, _detect_site(url)) |
| return text if len(text) > 200 else None |
| except Exception as e: |
| logger.warning("requests scraping failed: %s", e) |
| return None |
|
|
| |
| def scrape(self, url: str) -> tuple[str, str]: |
| """ |
| Returns (text, source_tier) where source_tier is one of: |
| 'cache' | 'playwright' | 'requests' | None |
| Raises ValueError if all tiers fail. |
| """ |
| |
| cached = self._get_cached(url) |
| if cached: |
| logger.info("Cache hit for %s", url) |
| return cached, "cache" |
|
|
| |
| text = self._scrape_playwright(url) |
| if text: |
| self._save_cache(url, text) |
| return text, "playwright" |
|
|
| |
| text = self._scrape_requests(url) |
| if text: |
| self._save_cache(url, text) |
| return text, "requests" |
|
|
| |
| raise ValueError( |
| "Could not scrape this URL automatically. " |
| "Please copy and paste the product description text manually." |
| ) |
|
|