# Spaces: rnyx / ecom-qa-bert_f (Sleeping)
#
# File size: 6,770 Bytes
#
# 202ae51

"""

FILE 3: src/scraper.py — Web Scraper

======================================

IMPORTED BY: app.py (called by /api/scrape route)

IMPORTS:     requests, beautifulsoup4



Functions:

  scrape_url(url)     → fetches page, extracts product text, returns dict



Supports: Amazon (.in/.com), Flipkart, any generic webpage.

Returns combined "context" string ready for BERT QA.

"""

import re
import logging
import requests
from bs4 import BeautifulSoup

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Request headers sent with every page fetch in scrape_url().
# The User-Agent is a desktop Chrome-on-Windows string; presumably this is
# to make the request look like a regular browser — confirm with the author.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    # Prefer English content.
    "Accept-Language": "en-US,en;q=0.9",
    # Prefer HTML/XHTML, accept anything else at lower priority.
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}


def scrape_url(url: str, *, timeout: float = 15) -> dict:
    """Scrape a product page and return structured data for QA.

    Called by: app.py → api_scrape route.

    Args:
        url: Page address. Surrounding whitespace is ignored, and
            "https://" is prepended when the value does not already start
            with "http://" or "https://" (compared case-insensitively).
        timeout: Seconds to wait for the server before giving up
            (default 15, the previous hard-coded value).

    Returns:
        On success, a dict such as::

            {
                "title": "Samsung Galaxy S24 Ultra",
                "features": "...",
                "description": "...",
                "specs": "...",
                "source": "amazon" | "flipkart" | "generic",
                "context": "Product: Samsung Galaxy... Features: 6.8-inch...",
                "char_count": 1847,
                "warning": "..."   # only when < 50 chars were extracted
            }

        "context" is the non-empty fields joined into a single
        whitespace-normalised string; it is what gets sent to model.py
        for QA. On failure the dict holds a single "error" key instead;
        network and HTTP errors are reported that way, not raised.
    """
    # Local import keeps this function self-contained; the module header
    # does not import urllib.
    from urllib.parse import urlparse

    url = url.strip() if isinstance(url, str) else ""
    if not url:
        return {"error": "No URL provided."}

    # Check the full scheme, case-insensitively: "HTTP://x" must be left
    # alone, while a bare host such as "httpbin.org" still needs a scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    # NOTE(review): `url` comes from an API caller and is fetched as-is, so
    # this can reach internal hosts (SSRF). Consider an allow-list upstream.
    try:
        logger.info("Scraping: %s", url)
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
    except requests.exceptions.ConnectionError:
        return {"error": f"Cannot connect to {url}. Check the URL."}
    except requests.exceptions.Timeout:
        return {"error": f"Request timed out ({timeout}s). Try again."}
    except requests.exceptions.HTTPError as exc:
        # Take the status from the exception so this handler never relies
        # on `resp` having been bound.
        status = getattr(exc.response, "status_code", "error")
        return {"error": f"HTTP {status}. Site may block scrapers."}
    except Exception as exc:
        # Boundary handler: anything else (invalid URL, too many redirects,
        # ...) is reported to the caller rather than crashing the route.
        logger.warning("Scrape failed for %s: %s", url, exc)
        return {"error": str(exc)}

    soup = BeautifulSoup(resp.text, "html.parser")
    # Drop non-content elements so their text never reaches the extractors.
    for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
        tag.decompose()

    # Pick the extractor from the host of the page actually served (after
    # redirects), not from the whole URL: a path or query string that merely
    # mentions "amazon" must not select the Amazon extractor.
    host = urlparse(resp.url or url).hostname or ""
    if "amazon" in host:
        source, extract = "amazon", _amazon
    elif "flipkart" in host:
        source, extract = "flipkart", _flipkart
    else:
        source, extract = "generic", _generic
    data = extract(soup)
    data["source"] = source

    # Build the combined context for QA from whichever fields were found.
    labelled_fields = (
        ("title", "Product: {}."),
        ("features", "Features: {}"),
        ("description", "Description: {}"),
        ("specs", "Specifications: {}"),
    )
    parts = [
        template.format(data[key])
        for key, template in labelled_fields
        if data.get(key)
    ]

    context = re.sub(r"\s+", " ", " ".join(parts)).strip()
    data["context"] = context
    data["char_count"] = len(context)

    if len(context) < 50:
        data["warning"] = (
            "Very little text extracted. The site may block scrapers or use "
            "heavy JavaScript. Try pasting text manually in Text mode."
        )

    logger.info(
        "Scraped [%s]: %s... (%d chars)",
        data["source"],
        data.get("title", "?")[:50],
        len(context),
    )
    return data


def _amazon(soup):
    """Extract title, features, description and specs from an Amazon page.

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        dict with string values under "title", "features", "description"
        and "specs". A field is "" when nothing was found for it. Specs are
        "key: value" entries joined by " | ", without repeats.
    """
    d = {"title": "", "features": "", "description": "", "specs": ""}

    def text_of(node):
        # The " " separator keeps text from adjacent child tags apart;
        # get_text(strip=True) alone joins them with nothing in between.
        return node.get_text(" ", strip=True)

    title_tag = soup.find("span", {"id": "productTitle"})
    if title_tag is not None:
        d["title"] = text_of(title_tag)

    bullets = soup.find("div", {"id": "feature-bullets"})
    if bullets is not None:
        bullet_texts = (text_of(li) for li in bullets.find_all("li"))
        d["features"] = " ".join(t for t in bullet_texts if t)

    desc = soup.find("div", {"id": "productDescription"})
    if desc is not None:
        d["description"] = text_of(desc)
    if not d["description"]:
        # Fall back to the "aplus" block when the plain description is
        # missing or present but empty; only its first 10 <p>/<li> are used.
        aplus = soup.find("div", {"id": "aplus"})
        if aplus is not None:
            aplus_texts = (text_of(p) for p in aplus.find_all(["p", "li"])[:10])
            d["description"] = " ".join(t for t in aplus_texts if t)

    # Spec tables: every table whose class matches, plus the tech-spec table
    # looked up by id (it may or may not also match by class).
    spec_tables = list(
        soup.find_all("table", class_=re.compile(r"prodDetTable|a-keyvalue"))
    )
    tech_table = soup.find("table", {"id": "productDetails_techSpec_section_1"})
    if tech_table is not None:
        spec_tables.append(tech_table)

    specs = []
    seen = set()  # entries already emitted, so repeated rows appear once
    for table in spec_tables:
        for row in table.find_all("tr"):
            th, td = row.find("th"), row.find("td")
            if th is None or td is None:
                continue
            key, value = text_of(th), text_of(td)
            if not (key and value):
                continue
            entry = f"{key}: {value}"
            if entry not in seen:
                seen.add(entry)
                specs.append(entry)

    d["specs"] = " | ".join(specs)
    return d


def _flipkart(soup):
    """Extract title, features, description and specs from a Flipkart page.

    The CSS class names used below are hard-coded; presumably they mirror
    Flipkart's generated markup and may need updating when the site changes.

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        dict with string values under "title", "features", "description"
        and "specs". A field is "" when nothing was found for it. Specs are
        "key: value" entries joined by " | ".
    """
    d = {"title": "", "features": "", "description": "", "specs": ""}

    def text_of(node):
        # The " " separator keeps text from adjacent child tags apart;
        # get_text(strip=True) alone joins them with nothing in between.
        return node.get_text(" ", strip=True)

    # Title: first selector that matches wins.
    for selector in ("span.VU-ZEz", "span.B_NuCI", "h1 span"):
        title_tag = soup.select_one(selector)
        if title_tag is not None:
            d["title"] = text_of(title_tag)
            break

    # Highlights: at most the first 15 matching list items, empty ones dropped.
    highlights = soup.find_all("li", class_=re.compile(r"_21Ahn-|col-12"))
    if highlights:
        highlight_texts = (text_of(item) for item in highlights[:15])
        d["features"] = " ".join(t for t in highlight_texts if t)

    desc = soup.find("div", class_=re.compile(r"_1mXcCf|_1AN87F"))
    if desc is not None:
        d["description"] = text_of(desc)

    # Specs: the first two cells of each row are treated as key and value.
    specs = []
    for table in soup.find_all("table", class_=re.compile(r"_14cfVK|_1s_Smc")):
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 2:
                continue
            key, value = text_of(cells[0]), text_of(cells[1])
            if key and value:
                specs.append(f"{key}: {value}")
    d["specs"] = " | ".join(specs)
    return d


def _generic(soup):
    """Fallback extractor for any webpage.

    Args:
        soup: Parsed page (BeautifulSoup document).

    Returns:
        dict with string values under "title", "features", "description"
        and "specs". Only "title" (first non-empty of <h1>, <title>) and
        "description" (up to 25 distinct text chunks longer than 30
        characters, joined by spaces) are filled; the others stay "".
    """
    d = {"title": "", "features": "", "description": "", "specs": ""}

    # Prefer the first <h1>; use <title> when there is no <h1> or it is empty.
    for candidate in (soup.find("h1"), soup.title):
        if candidate is None:
            continue
        title_text = candidate.get_text(" ", strip=True)
        if title_text:
            d["title"] = title_text
            break

    seen_texts = set()   # exact chunk texts already collected
    collected_ids = set()  # id() of elements whose text was collected
    chunks = []
    for tag in soup.find_all(["p", "li", "td", "span", "div"]):
        # find_all yields a container before its children. Once a container
        # has been collected, its descendants' text is already inside that
        # chunk, so collecting them too would only repeat content.
        if any(id(ancestor) in collected_ids for ancestor in tag.parents):
            continue
        # " " separator keeps text from adjacent child tags apart.
        text = tag.get_text(" ", strip=True)
        if len(text) <= 30 or text in seen_texts:
            continue
        seen_texts.add(text)
        collected_ids.add(id(tag))
        chunks.append(text)
        if len(chunks) >= 25:
            break
    d["description"] = " ".join(chunks)
    return d