ecom-qa-bert_f / src /scraper.py
rnyx's picture
Upload 2 files
202ae51 verified
"""
FILE 3: src/scraper.py — Web Scraper
======================================
IMPORTED BY: app.py (called by /api/scrape route)
IMPORTS: requests, beautifulsoup4
Functions:
    scrape_url(url) → fetches the page, extracts product text, and returns a dict.

Supports: Amazon (.in/.com), Flipkart, and any generic webpage.
The returned dict includes a combined "context" string ready for BERT QA.
"""
import re
import logging
import requests
from bs4 import BeautifulSoup
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Request headers passed to requests.get() by scrape_url(). The User-Agent is
# a desktop Chrome string, assembled from its three space-separated parts.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/122.0.0.0 Safari/537.36",
        )
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
def scrape_url(url: str) -> dict:
    """Scrape a product page and return structured data.

    Called by: app.py → api_scrape route (per the module header).

    Args:
        url: Page address. Surrounding whitespace is ignored and "https://"
            is prepended when no http(s) scheme is present.

    Returns:
        On success, a dict of the form::

            {
                "title": "Samsung Galaxy S24 Ultra",
                "context": "Product: Samsung Galaxy... Features: 6.8-inch...",
                "features": "...",
                "description": "...",
                "specs": "...",
                "source": "amazon" | "flipkart" | "generic",
                "char_count": 1847,
                "warning": "..."   # only when fewer than 50 chars were extracted
            }

        On failure (bad input, network error, HTTP error status), a dict
        with a single "error" key holding a human-readable message. This
        function does not raise for request failures.

    The "context" field is what gets sent to model.py for QA.
    """
    # Reject missing input up front instead of sending a request to "https://".
    if not isinstance(url, str) or not url.strip():
        return {"error": "No URL provided."}

    url = url.strip()
    # Compare against full scheme prefixes, case-insensitively, so that
    # "HTTPS://x" is left alone and a host such as "httpbin.org" still gets
    # a scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    try:
        logger.info("Scraping: %s", url)
        resp = requests.get(url, headers=HEADERS, timeout=15)
        resp.raise_for_status()
    except requests.exceptions.Timeout:
        # Checked before ConnectionError: a connect timeout is an instance of
        # both classes and should be reported as a timeout.
        return {"error": "Request timed out (15s). Try again."}
    except requests.exceptions.ConnectionError:
        return {"error": f"Cannot connect to {url}. Check the URL."}
    except requests.exceptions.HTTPError:
        # `resp` is always bound here: only raise_for_status() raises this.
        return {"error": f"HTTP {resp.status_code}. Site may block scrapers."}
    except Exception as e:
        logger.warning("Scrape failed for %s: %s", url, e)
        return {"error": str(e)}

    # Parse the raw bytes rather than resp.text: when the server declares no
    # charset, requests decodes text/* as ISO-8859-1, which garbles UTF-8
    # pages. BeautifulSoup can instead sniff the encoding from the document.
    # A charset that IS declared in the header is passed along as a hint.
    content_type = resp.headers.get("Content-Type", "")
    declared = resp.encoding if "charset=" in content_type.lower() else None
    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared)

    # Drop elements that never carry product text.
    for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
        tag.decompose()

    # Classify by the final URL so redirects (e.g. short links) are honoured.
    extractors = {"amazon": _amazon, "flipkart": _flipkart, "generic": _generic}
    source = _detect_source(resp.url or url)
    data = extractors[source](soup)
    data["source"] = source

    context = _build_context(data)
    data["context"] = context
    data["char_count"] = len(context)
    if len(context) < 50:
        data["warning"] = (
            "Very little text extracted. The site may block scrapers or use "
            "heavy JavaScript. Try pasting text manually in Text mode."
        )

    logger.info(
        "Scraped [%s]: %s... (%d chars)",
        source,
        (data.get("title") or "?")[:50],
        len(context),
    )
    return data


def _detect_source(url: str) -> str:
    """Classify a URL as "amazon", "flipkart" or "generic" by its hostname.

    Only the dot-separated labels of the hostname are inspected, so a page
    on another site that merely mentions a store name in its path or query
    string (e.g. ``example.com/amazon-deals``) is treated as generic.
    """
    from urllib.parse import urlparse  # local import keeps this block self-contained

    try:
        hostname = urlparse(url).hostname or ""
    except ValueError:
        # Malformed netloc (e.g. a broken IPv6 literal): fall back to generic.
        return "generic"

    labels = hostname.split(".")
    if "amazon" in labels:
        return "amazon"
    if "flipkart" in labels:
        return "flipkart"
    return "generic"


def _build_context(data: dict) -> str:
    """Join the non-empty extracted fields into one whitespace-normalised string."""
    sections = (
        ("title", "Product: {}."),
        ("features", "Features: {}"),
        ("description", "Description: {}"),
        ("specs", "Specifications: {}"),
    )
    parts = [
        template.format(data[key]) for key, template in sections if data.get(key)
    ]
    return re.sub(r"\s+", " ", " ".join(parts)).strip()
def _amazon(soup):
    """Extract title, feature bullets, description and specs from an Amazon page.

    Args:
        soup: Parsed page (a BeautifulSoup document).

    Returns:
        dict with string values under "title", "features", "description"
        and "specs". A field whose element is not found stays "".
    """
    d = {"title": "", "features": "", "description": "", "specs": ""}

    def text_of(node):
        # A space separator keeps words from adjacent child elements apart;
        # get_text(strip=True) alone would fuse "<p>a</p><p>b</p>" into "ab".
        return node.get_text(" ", strip=True)

    title = soup.find("span", {"id": "productTitle"})
    if title:
        d["title"] = text_of(title)

    bullets = soup.find("div", {"id": "feature-bullets"})
    if bullets:
        items = (text_of(li) for li in bullets.find_all("li"))
        d["features"] = " ".join(item for item in items if item)

    desc = soup.find("div", {"id": "productDescription"})
    if desc:
        d["description"] = text_of(desc)
    else:
        # Fall back to the first 10 paragraphs / list items of the "aplus" block.
        aplus = soup.find("div", {"id": "aplus"})
        if aplus:
            chunks = (text_of(node) for node in aplus.find_all(["p", "li"])[:10])
            d["description"] = " ".join(chunk for chunk in chunks if chunk)

    # Spec rows come from tables matched by class plus the tech-spec table
    # matched by id. The same table (or row) can be reached both ways, so
    # entries are de-duplicated while keeping first-seen order.
    tables = list(
        soup.find_all("table", class_=re.compile("prodDetTable|a-keyvalue"))
    )
    tech = soup.find("table", {"id": "productDetails_techSpec_section_1"})
    if tech is not None:
        tables.append(tech)

    specs = {}  # dict used as an insertion-ordered set
    for table in tables:
        for row in table.find_all("tr"):
            th, td = row.find("th"), row.find("td")
            if th and td:
                k, v = text_of(th), text_of(td)
                if k and v:
                    specs[f"{k}: {v}"] = None
    d["specs"] = " | ".join(specs)
    return d
def _flipkart(soup):
    """Extract title, highlights, description and specs from a Flipkart page.

    Args:
        soup: Parsed page (a BeautifulSoup document).

    Returns:
        dict with string values under "title", "features", "description"
        and "specs". A field whose element is not found stays "".
    """
    d = {"title": "", "features": "", "description": "", "specs": ""}

    # Text is always extracted with a " " separator: get_text(strip=True)
    # alone joins the text of adjacent child elements with nothing between
    # them, fusing words together.

    # Title: the first selector that matches wins.
    for sel in ["span.VU-ZEz", "span.B_NuCI", "h1 span"]:
        tag = soup.select_one(sel)
        if tag:
            d["title"] = tag.get_text(" ", strip=True)
            break

    # Highlights: at most the first 15 matching list items, skipping empty ones.
    highlights = soup.find_all("li", class_=re.compile("_21Ahn-|col-12"))
    if highlights:
        items = (h.get_text(" ", strip=True) for h in highlights[:15])
        d["features"] = " ".join(item for item in items if item)

    desc = soup.find("div", class_=re.compile("_1mXcCf|_1AN87F"))
    if desc:
        d["description"] = desc.get_text(" ", strip=True)

    # Specs: the first two cells of each row are read as key and value.
    specs = []
    for table in soup.find_all("table", class_=re.compile("_14cfVK|_1s_Smc")):
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 2:
                k = cells[0].get_text(" ", strip=True)
                v = cells[1].get_text(" ", strip=True)
                if k and v:
                    specs.append(f"{k}: {v}")
    d["specs"] = " | ".join(specs)
    return d
def _generic(soup):
    """Fallback extractor for any webpage.

    Args:
        soup: Parsed page (a BeautifulSoup document).

    Returns:
        dict with string values under "title", "features", "description"
        and "specs". Only "title" and "description" are filled here;
        "features" and "specs" stay "".
    """
    d = {"title": "", "features": "", "description": "", "specs": ""}

    # Title: prefer the first <h1>; use <title> when there is no <h1> or it
    # has no text.
    h1 = soup.find("h1")
    title = h1.get_text(" ", strip=True) if h1 else ""
    if not title and soup.title:
        title = soup.title.get_text(" ", strip=True)
    d["title"] = title

    candidates = ["p", "li", "td", "span", "div"]
    blocks = ["p", "li", "td", "div"]

    seen = set()       # texts already collected (exact-match de-duplication)
    collected = set()  # id() of elements whose full text was collected
    texts = []
    for tag in soup.find_all(candidates):
        # A wrapper's text is the concatenation of everything inside it.
        # Collecting it AND its inner blocks would repeat the same content,
        # so wrappers are skipped and their inner blocks are visited on
        # their own. Text placed directly in a wrapper, outside any inner
        # candidate element, is therefore not collected.
        if tag.find(blocks) is not None:
            continue
        # Skip elements (e.g. a <span> inside a <p>) whose text is already
        # part of a collected ancestor.
        if any(id(ancestor) in collected for ancestor in tag.parents):
            continue
        # " " separator keeps words from adjacent inline children apart.
        t = tag.get_text(" ", strip=True)
        if len(t) > 30 and t not in seen:
            seen.add(t)
            collected.add(id(tag))
            texts.append(t)
            if len(texts) >= 25:
                break
    d["description"] = " ".join(texts)
    return d