Spaces:
Sleeping
Sleeping
File size: 6,770 Bytes
"""
FILE 3: src/scraper.py — Web Scraper
======================================
IMPORTED BY: app.py (called by the /api/scrape route)
IMPORTS: requests, beautifulsoup4

Functions:
    scrape_url(url) → fetches the page, extracts product text, returns a dict

Supports: Amazon (.in/.com), Flipkart, and any generic webpage.
Returns a combined "context" string ready for BERT QA.
"""
import re
import logging
import requests
from bs4 import BeautifulSoup
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Request headers sent with every scrape. They present the client as a
# desktop Chrome browser instead of the default python-requests client.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
def _normalize_url(url):
    """Strip *url* and prepend ``https://`` unless it already has an HTTP(S) scheme."""
    url = url.strip()
    # Compare the full scheme case-insensitively: a bare "http" prefix would
    # also match hosts such as "httpbin.org" and miss "HTTP://...".
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url
    return url


def _detect_source(url):
    """Classify *url* as "amazon", "flipkart" or "generic" from its hostname labels."""
    from urllib.parse import urlparse

    try:
        host = urlparse(url).hostname or ""
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. a broken IPv6 literal).
        return "generic"
    labels = host.lower().split(".")
    # Match whole labels so "amazon" in a path/query or in "amazonaws" does
    # not select the Amazon extractor.
    for site in ("amazon", "flipkart"):
        if site in labels:
            return site
    return "generic"


def scrape_url(url: str) -> dict:
    """
    Scrape a product page and return structured data.

    Called by: app.py → api_scrape route

    Input: URL string (``https://`` is assumed when no http/https scheme
    is given).

    Returns on success: {
        "title": "Samsung Galaxy S24 Ultra",
        "context": "Product: Samsung Galaxy... Features: 6.8-inch...",
        "features": "...",
        "description": "...",
        "specs": "...",
        "source": "amazon" | "flipkart" | "generic",
        "char_count": 1847,
        "warning": "..." (optional, if extraction was poor)
    }
    Returns on failure: {"error": "<message>"} — no exception is raised
    for network or HTTP errors.

    The "context" field is what gets sent to model.py for QA.
    """
    if not isinstance(url, str) or not url.strip():
        return {"error": "No URL provided."}
    url = _normalize_url(url)

    logger.info("Scraping: %s", url)
    try:
        resp = requests.get(url, headers=HEADERS, timeout=15)
        resp.raise_for_status()
    except requests.exceptions.Timeout:
        # Checked before ConnectionError: a connect timeout derives from both
        # classes and should be reported as a timeout.
        return {"error": "Request timed out (15s). Try again."}
    except requests.exceptions.ConnectionError:
        return {"error": f"Cannot connect to {url}. Check the URL."}
    except requests.exceptions.HTTPError as exc:
        # Read the status from the exception so this handler never depends on
        # `resp` having been bound.
        status = exc.response.status_code if exc.response is not None else "error"
        return {"error": f"HTTP {status}. Site may block scrapers."}
    except Exception as exc:
        return {"error": str(exc)}

    # Parse the raw bytes. Only trust the HTTP-level encoding when the server
    # actually declared a charset; otherwise requests falls back to ISO-8859-1
    # for text/* and would garble UTF-8 pages, so let the parser sniff it.
    content_type = resp.headers.get("Content-Type", "").lower()
    declared_encoding = resp.encoding if "charset" in content_type else None
    soup = BeautifulSoup(resp.content, "html.parser", from_encoding=declared_encoding)
    for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
        tag.decompose()

    # Use the final URL so redirects (e.g. short links) resolve to the real site.
    source = _detect_source(resp.url or url)
    extractors = {"amazon": _amazon, "flipkart": _flipkart, "generic": _generic}
    data = extractors[source](soup)
    data["source"] = source

    # Build combined context for QA
    sections = (
        ("title", "Product: {}."),
        ("features", "Features: {}"),
        ("description", "Description: {}"),
        ("specs", "Specifications: {}"),
    )
    parts = [template.format(data[key]) for key, template in sections if data.get(key)]
    context = re.sub(r"\s+", " ", " ".join(parts)).strip()
    data["context"] = context
    data["char_count"] = len(context)

    if len(context) < 50:
        data["warning"] = (
            "Very little text extracted. The site may block scrapers or use "
            "heavy JavaScript. Try pasting text manually in Text mode."
        )

    logger.info(
        "Scraped [%s]: %s... (%d chars)",
        source,
        (data.get("title") or "?")[:50],
        len(context),
    )
    return data
def _amazon(soup):
    """
    Extract from Amazon product pages.

    Input: parsed BeautifulSoup document.
    Returns: dict with string values for "title", "features",
    "description" and "specs"; a field stays "" when its element is
    missing. "specs" is a " | "-joined list of unique "key: value" rows.
    """

    def text_of(node):
        # An explicit separator keeps words from adjacent child tags apart;
        # get_text(strip=True) alone joins them with nothing in between.
        return node.get_text(" ", strip=True)

    d = {"title": "", "features": "", "description": "", "specs": ""}

    title = soup.find("span", {"id": "productTitle"})
    if title is not None:
        d["title"] = text_of(title)

    bullets = soup.find("div", {"id": "feature-bullets"})
    if bullets is not None:
        bullet_texts = (text_of(li) for li in bullets.find_all("li"))
        d["features"] = " ".join(t for t in bullet_texts if t)

    desc = soup.find("div", {"id": "productDescription"})
    if desc is not None:
        d["description"] = text_of(desc)
    if not d["description"]:
        # Fall back to the "aplus" block when the plain description is
        # missing or empty; only its first 10 <p>/<li> elements are used.
        aplus = soup.find("div", {"id": "aplus"})
        if aplus is not None:
            fragments = (text_of(el) for el in aplus.find_all(["p", "li"])[:10])
            d["description"] = " ".join(t for t in fragments if t)

    tables = list(soup.find_all("table", class_=re.compile(r"prodDetTable|a-keyvalue")))
    tech = soup.find("table", {"id": "productDetails_techSpec_section_1"})
    if tech is not None:
        tables.append(tech)

    # Dict keys give ordered de-duplication, so a row that appears in more
    # than one matching table (or in a nested one) is listed once.
    specs = {}
    for table in tables:
        for row in table.find_all("tr"):
            th, td = row.find("th"), row.find("td")
            if th is None or td is None:
                continue
            k, v = text_of(th), text_of(td)
            if k and v:
                specs.setdefault(f"{k}: {v}", None)
    d["specs"] = " | ".join(specs)
    return d
def _flipkart(soup):
    """
    Extract from Flipkart product pages.

    Input: parsed BeautifulSoup document.
    Returns: dict with string values for "title", "features",
    "description" and "specs"; a field stays "" when nothing matched.
    Elements are located by Flipkart's generated CSS class names, so
    several alternatives are tried for each field.
    """

    def text_of(node):
        # An explicit separator keeps words from adjacent child tags apart;
        # get_text(strip=True) alone joins them with nothing in between.
        return node.get_text(" ", strip=True)

    d = {"title": "", "features": "", "description": "", "specs": ""}

    # Take the first selector that yields non-empty text, not merely the
    # first one that matches an element.
    for sel in ["span.VU-ZEz", "span.B_NuCI", "h1 span"]:
        tag = soup.select_one(sel)
        if tag is not None:
            title = text_of(tag)
            if title:
                d["title"] = title
                break

    highlights = soup.find_all("li", class_=re.compile(r"_21Ahn-|col-12"))
    if highlights:
        # Only the first 15 matching list items are considered.
        highlight_texts = (text_of(h) for h in highlights[:15])
        d["features"] = " ".join(t for t in highlight_texts if t)

    desc = soup.find("div", class_=re.compile(r"_1mXcCf|_1AN87F"))
    if desc is not None:
        d["description"] = text_of(desc)

    specs = []
    for table in soup.find_all("table", class_=re.compile(r"_14cfVK|_1s_Smc")):
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 2:
                continue
            # First cell is used as the key, second as the value.
            k, v = text_of(cells[0]), text_of(cells[1])
            if k and v:
                specs.append(f"{k}: {v}")
    d["specs"] = " | ".join(specs)
    return d
def _generic(soup):
    """
    Fallback for any webpage.

    Input: parsed BeautifulSoup document.
    Returns: dict with string values for "title", "features",
    "description" and "specs". "title" is the first <h1> text, or the
    <title> text when there is no usable <h1>. "description" is up to 25
    distinct text snippets (each longer than 30 characters) joined by
    spaces; "features" and "specs" are always "".

    Snippets come from leaf-level elements only: a candidate that contains
    another p/li/td/div is treated as a wrapper and skipped, and an element
    nested inside an already-visited leaf is skipped too. Without this, a
    page-wide <div> would contribute the entire page text and its children
    would then repeat it. Text placed directly inside a wrapper (outside
    any child element) is therefore not collected.
    """

    def text_of(node):
        # An explicit separator keeps words from adjacent child tags apart;
        # get_text(strip=True) alone joins them with nothing in between.
        return node.get_text(" ", strip=True)

    d = {"title": "", "features": "", "description": "", "specs": ""}

    h1 = soup.find("h1")
    title = text_of(h1) if h1 is not None else ""
    if not title and soup.title is not None:
        title = text_of(soup.title)
    d["title"] = title

    block_tags = ["p", "li", "td", "div"]
    seen_texts = set()
    visited_leaves = set()  # id() of leaf elements already examined
    snippets = []
    for tag in soup.find_all(block_tags + ["span"]):
        if tag.find(block_tags) is not None:
            continue  # wrapper: its nested blocks are visited on their own
        if any(id(parent) in visited_leaves for parent in tag.parents):
            continue  # inline child of a leaf whose text already covers it
        visited_leaves.add(id(tag))
        text = text_of(tag)
        if len(text) > 30 and text not in seen_texts:
            seen_texts.add(text)
            snippets.append(text)
            if len(snippets) >= 25:
                break
    d["description"] = " ".join(snippets)
    return d
|