""" FILE 3: src/scraper.py — Web Scraper ====================================== IMPORTED BY: app.py (called by /api/scrape route) IMPORTS: requests, beautifulsoup4 Functions: scrape_url(url) → fetches page, extracts product text, returns dict Supports: Amazon (.in/.com), Flipkart, any generic webpage. Returns combined "context" string ready for BERT QA. """ import re import logging import requests from bs4 import BeautifulSoup logger = logging.getLogger(__name__) HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/122.0.0.0 Safari/537.36" ), "Accept-Language": "en-US,en;q=0.9", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", } def scrape_url(url: str) -> dict: """ Scrape a product page and return structured data. Called by: app.py → api_scrape route Input: URL string Returns: { "title": "Samsung Galaxy S24 Ultra", "context": "Product: Samsung Galaxy... Features: 6.8-inch...", "features": "...", "description": "...", "specs": "...", "source": "amazon" | "flipkart" | "generic", "char_count": 1847, "warning": "..." (optional, if extraction was poor) } The "context" field is what gets sent to model.py for QA. """ if not url.startswith("http"): url = "https://" + url try: logger.info(f"Scraping: {url}") resp = requests.get(url, headers=HEADERS, timeout=15) resp.raise_for_status() except requests.exceptions.ConnectionError: return {"error": f"Cannot connect to {url}. Check the URL."} except requests.exceptions.Timeout: return {"error": "Request timed out (15s). Try again."} except requests.exceptions.HTTPError: return {"error": f"HTTP {resp.status_code}. Site may block scrapers."} except Exception as e: return {"error": str(e)} soup = BeautifulSoup(resp.text, "html.parser") for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]): tag.decompose() url_lower = url.lower() if "amazon" in url_lower: data = _amazon(soup) data["source"] = "amazon" elif "flipkart" in url_lower: data = _flipkart(soup) data["source"] = "flipkart" else: data = _generic(soup) data["source"] = "generic" # Build combined context for QA parts = [] if data.get("title"): parts.append(f"Product: {data['title']}.") if data.get("features"): parts.append(f"Features: {data['features']}") if data.get("description"): parts.append(f"Description: {data['description']}") if data.get("specs"): parts.append(f"Specifications: {data['specs']}") context = re.sub(r'\s+', ' ', " ".join(parts)).strip() data["context"] = context data["char_count"] = len(context) if len(context) < 50: data["warning"] = ( "Very little text extracted. The site may block scrapers or use " "heavy JavaScript. Try pasting text manually in Text mode." ) logger.info(f"Scraped [{data['source']}]: {data.get('title', '?')[:50]}... 
({len(context)} chars)") return data def _amazon(soup): """Extract from Amazon product pages.""" d = {"title": "", "features": "", "description": "", "specs": ""} tag = soup.find("span", {"id": "productTitle"}) if tag: d["title"] = tag.get_text(strip=True) feat = soup.find("div", {"id": "feature-bullets"}) if feat: d["features"] = " ".join( li.get_text(strip=True) for li in feat.find_all("li") if li.get_text(strip=True) ) desc = soup.find("div", {"id": "productDescription"}) if desc: d["description"] = desc.get_text(strip=True) else: aplus = soup.find("div", {"id": "aplus"}) if aplus: d["description"] = " ".join( p.get_text(strip=True) for p in aplus.find_all(["p", "li"])[:10] ) specs = [] for table in soup.find_all("table", class_=re.compile("prodDetTable|a-keyvalue")): for row in table.find_all("tr"): th, td = row.find("th"), row.find("td") if th and td: k, v = th.get_text(strip=True), td.get_text(strip=True) if k and v: specs.append(f"{k}: {v}") detail = soup.find("table", {"id": "productDetails_techSpec_section_1"}) if detail: for row in detail.find_all("tr"): th, td = row.find("th"), row.find("td") if th and td: k, v = th.get_text(strip=True), td.get_text(strip=True) entry = f"{k}: {v}" if k and v and entry not in specs: specs.append(entry) d["specs"] = " | ".join(specs) return d def _flipkart(soup): """Extract from Flipkart product pages.""" d = {"title": "", "features": "", "description": "", "specs": ""} for sel in ["span.VU-ZEz", "span.B_NuCI", "h1 span"]: tag = soup.select_one(sel) if tag: d["title"] = tag.get_text(strip=True) break highlights = soup.find_all("li", class_=re.compile("_21Ahn-|col-12")) if highlights: d["features"] = " ".join(h.get_text(strip=True) for h in highlights[:15]) desc = soup.find("div", class_=re.compile("_1mXcCf|_1AN87F")) if desc: d["description"] = desc.get_text(strip=True) specs = [] for table in soup.find_all("table", class_=re.compile("_14cfVK|_1s_Smc")): for row in table.find_all("tr"): cells = row.find_all("td") if len(cells) >= 2: k, v = cells[0].get_text(strip=True), cells[1].get_text(strip=True) if k and v: specs.append(f"{k}: {v}") d["specs"] = " | ".join(specs) return d def _generic(soup): """Fallback for any webpage.""" d = {"title": "", "features": "", "description": "", "specs": ""} h1 = soup.find("h1") d["title"] = h1.get_text(strip=True) if h1 else (soup.title.get_text(strip=True) if soup.title else "") seen, texts = set(), [] for tag in soup.find_all(["p", "li", "td", "span", "div"]): t = tag.get_text(strip=True) if t and len(t) > 30 and t not in seen: seen.add(t) texts.append(t) if len(texts) >= 25: break d["description"] = " ".join(texts) return d
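

# ---------------------------------------------------------------------------
# Minimal smoke test (a sketch, not part of the app's request path). With no
# arguments it exercises _generic() offline against a made-up "Acme Widget"
# HTML snippet — a hypothetical sample, not a real page — so no network is
# needed. Pass a URL as argv[1] to run the full scrape_url() path instead.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)

    if len(sys.argv) > 1:
        # Live scrape: the result depends on the target site and may be
        # blocked or JavaScript-rendered, in which case "warning" is set.
        result = scrape_url(sys.argv[1])
    else:
        # Offline check of the generic extractor on an inline fake page.
        html = (
            "<html><head><title>Acme Widget</title></head><body>"
            "<h1>Acme Widget Pro</h1>"
            "<p>The Acme Widget Pro is a durable, lightweight widget "
            "built for everyday use around the home and office.</p>"
            "<li>Weighs only 150 grams and fits in any standard pocket.</li>"
            "</body></html>"
        )
        result = _generic(BeautifulSoup(html, "html.parser"))
        result["source"] = "generic"

    for key in ("title", "source", "char_count", "warning", "error"):
        if key in result:
            print(f"{key}: {result[key]}")
    # scrape_url() sets "context"; the offline branch only has "description".
    print("text:", result.get("context", result.get("description", ""))[:200])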