| """ | |
| FILE 3: src/scraper.py — Web Scraper | |
| ====================================== | |
| IMPORTED BY: app.py (called by /api/scrape route) | |
| IMPORTS: requests, beautifulsoup4 | |
| Functions: | |
| scrape_url(url) → fetches page, extracts product text, returns dict | |
| Supports: Amazon (.in/.com), Flipkart, any generic webpage. | |
| Returns combined "context" string ready for BERT QA. | |
| """ | |
# Standard library
import re
import logging

# Third-party (see module docstring: requests, beautifulsoup4)
import requests
from bs4 import BeautifulSoup

# Module-level logger, configured by the application entry point.
logger = logging.getLogger(__name__)

# Browser-like request headers: many retail sites refuse or degrade
# responses for the default python-requests User-Agent.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
def scrape_url(url: str) -> dict:
    """
    Scrape a product page and return structured data.

    Called by: app.py → api_scrape route

    Args:
        url: Page URL; "https://" is prepended when no scheme is given.

    Returns:
        On success: {
            "title": "Samsung Galaxy S24 Ultra",
            "context": "Product: Samsung Galaxy... Features: 6.8-inch...",
            "features": "...",
            "description": "...",
            "specs": "...",
            "source": "amazon" | "flipkart" | "generic",
            "char_count": 1847,
            "warning": "..." (optional, if extraction was poor)
        }
        On failure: {"error": "<human-readable message>"}

    The "context" field is what gets sent to model.py for QA.
    """
    # Bugfix: the old check `url.startswith("http")` wrongly accepted
    # scheme-less hosts that merely begin with "http" (e.g. "httpbin.org"),
    # leaving them without a scheme. Require an explicit scheme instead.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    try:
        logger.info("Scraping: %s", url)  # lazy %-args, not f-string
        resp = requests.get(url, headers=HEADERS, timeout=15)
        resp.raise_for_status()
    except requests.exceptions.ConnectionError:
        return {"error": f"Cannot connect to {url}. Check the URL."}
    except requests.exceptions.Timeout:
        return {"error": "Request timed out (15s). Try again."}
    except requests.exceptions.HTTPError:
        # raise_for_status() only runs after resp is bound, so resp is safe here.
        return {"error": f"HTTP {resp.status_code}. Site may block scrapers."}
    except Exception as e:  # boundary: surface anything else to the caller
        return {"error": str(e)}

    soup = BeautifulSoup(resp.text, "html.parser")
    # Drop non-content tags before text extraction.
    for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
        tag.decompose()

    # Dispatch on the host hint in the URL.
    url_lower = url.lower()
    if "amazon" in url_lower:
        data = _amazon(soup)
        data["source"] = "amazon"
    elif "flipkart" in url_lower:
        data = _flipkart(soup)
        data["source"] = "flipkart"
    else:
        data = _generic(soup)
        data["source"] = "generic"

    # Build combined context for QA: labelled sections, whitespace collapsed.
    parts = []
    if data.get("title"):
        parts.append(f"Product: {data['title']}.")
    if data.get("features"):
        parts.append(f"Features: {data['features']}")
    if data.get("description"):
        parts.append(f"Description: {data['description']}")
    if data.get("specs"):
        parts.append(f"Specifications: {data['specs']}")
    context = re.sub(r'\s+', ' ', " ".join(parts)).strip()

    data["context"] = context
    data["char_count"] = len(context)
    if len(context) < 50:
        # Almost nothing extracted — likely bot-blocking or a JS-rendered page.
        data["warning"] = (
            "Very little text extracted. The site may block scrapers or use "
            "heavy JavaScript. Try pasting text manually in Text mode."
        )
    logger.info(
        "Scraped [%s]: %s... (%d chars)",
        data["source"], data.get("title", "?")[:50], len(context),
    )
    return data
| def _amazon(soup): | |
| """Extract from Amazon product pages.""" | |
| d = {"title": "", "features": "", "description": "", "specs": ""} | |
| tag = soup.find("span", {"id": "productTitle"}) | |
| if tag: | |
| d["title"] = tag.get_text(strip=True) | |
| feat = soup.find("div", {"id": "feature-bullets"}) | |
| if feat: | |
| d["features"] = " ".join( | |
| li.get_text(strip=True) for li in feat.find_all("li") if li.get_text(strip=True) | |
| ) | |
| desc = soup.find("div", {"id": "productDescription"}) | |
| if desc: | |
| d["description"] = desc.get_text(strip=True) | |
| else: | |
| aplus = soup.find("div", {"id": "aplus"}) | |
| if aplus: | |
| d["description"] = " ".join( | |
| p.get_text(strip=True) for p in aplus.find_all(["p", "li"])[:10] | |
| ) | |
| specs = [] | |
| for table in soup.find_all("table", class_=re.compile("prodDetTable|a-keyvalue")): | |
| for row in table.find_all("tr"): | |
| th, td = row.find("th"), row.find("td") | |
| if th and td: | |
| k, v = th.get_text(strip=True), td.get_text(strip=True) | |
| if k and v: | |
| specs.append(f"{k}: {v}") | |
| detail = soup.find("table", {"id": "productDetails_techSpec_section_1"}) | |
| if detail: | |
| for row in detail.find_all("tr"): | |
| th, td = row.find("th"), row.find("td") | |
| if th and td: | |
| k, v = th.get_text(strip=True), td.get_text(strip=True) | |
| entry = f"{k}: {v}" | |
| if k and v and entry not in specs: | |
| specs.append(entry) | |
| d["specs"] = " | ".join(specs) | |
| return d | |
| def _flipkart(soup): | |
| """Extract from Flipkart product pages.""" | |
| d = {"title": "", "features": "", "description": "", "specs": ""} | |
| for sel in ["span.VU-ZEz", "span.B_NuCI", "h1 span"]: | |
| tag = soup.select_one(sel) | |
| if tag: | |
| d["title"] = tag.get_text(strip=True) | |
| break | |
| highlights = soup.find_all("li", class_=re.compile("_21Ahn-|col-12")) | |
| if highlights: | |
| d["features"] = " ".join(h.get_text(strip=True) for h in highlights[:15]) | |
| desc = soup.find("div", class_=re.compile("_1mXcCf|_1AN87F")) | |
| if desc: | |
| d["description"] = desc.get_text(strip=True) | |
| specs = [] | |
| for table in soup.find_all("table", class_=re.compile("_14cfVK|_1s_Smc")): | |
| for row in table.find_all("tr"): | |
| cells = row.find_all("td") | |
| if len(cells) >= 2: | |
| k, v = cells[0].get_text(strip=True), cells[1].get_text(strip=True) | |
| if k and v: | |
| specs.append(f"{k}: {v}") | |
| d["specs"] = " | ".join(specs) | |
| return d | |
| def _generic(soup): | |
| """Fallback for any webpage.""" | |
| d = {"title": "", "features": "", "description": "", "specs": ""} | |
| h1 = soup.find("h1") | |
| d["title"] = h1.get_text(strip=True) if h1 else (soup.title.get_text(strip=True) if soup.title else "") | |
| seen, texts = set(), [] | |
| for tag in soup.find_all(["p", "li", "td", "span", "div"]): | |
| t = tag.get_text(strip=True) | |
| if t and len(t) > 30 and t not in seen: | |
| seen.add(t) | |
| texts.append(t) | |
| if len(texts) >= 25: | |
| break | |
| d["description"] = " ".join(texts) | |
| return d | |