from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, FileResponse
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io
import re

app = FastAPI(title="Universal Web Scraper API")

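# Permissive CORS so the API can be called from any origin (e.g. the bundled frontend)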
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

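# Serve the frontend assets from the container's /code/static directory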
app.mount("/static", StaticFiles(directory="/code/static"), name="static")

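# Serve the single-page frontend at the root URL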
@app.get("/")
async def read_root():
    return FileResponse("/code/static/index.html")

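# Request body: the target URL and the scraping mode ("table", "links", or "content"; defaults to "table")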
class ScrapeRequest(BaseModel):
    url: str
    mode: str = "table"

def scrape_table(soup: BeautifulSoup):
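    # Parse the page's largest HTML table into a DataFrame, using its first row as the header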
    tables = soup.find_all("table")
    if not tables:
        raise HTTPException(status_code=400, detail="No table found on page")
    
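    # Prefer the table with the most rows; pages often contain small layout tables as well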
    table = max(tables, key=lambda t: len(t.find_all("tr")))
    
    headers = []
    header_row = table.find("tr")
    if header_row:
        for th in header_row.find_all(["th", "td"]):
            headers.append(th.get_text(strip=True))
    if not headers:
        # No header cells found: fall back to generic column names sized to the widest row
        all_rows = table.find_all("tr")
        if not all_rows:
            raise HTTPException(status_code=400, detail="Empty table")
        cols = max(len(r.find_all("td")) for r in all_rows)
        if cols == 0:
            raise HTTPException(status_code=400, detail="Empty table")
        headers = [f"col_{i+1}" for i in range(cols)]
    
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = tr.find_all("td")
        if not cells:
            continue
        row = [c.get_text(strip=True) for c in cells]
        if len(row) < len(headers):
            row += [""] * (len(headers) - len(row))
        elif len(row) > len(headers):
            row = row[:len(headers)]
        rows.append(row)
    
    df = pd.DataFrame(rows, columns=headers)
    return df

def scrape_links(soup: BeautifulSoup):
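    # Collect the text and href of every anchor tag on the page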
    links = []
    for a in soup.find_all("a"):
        text = a.get_text(strip=True)
        href = a.get("href", "")
        if not href:
            continue
        links.append({"text": text, "href": href})
    if not links:
        raise HTTPException(status_code=400, detail="No links found")
    df = pd.DataFrame(links)
    return df

def scrape_all_content(soup: BeautifulSoup):
    # Extract only meaningful content (titles, prices, descriptions, links), skipping page chrome
    data = []
    
    # Remove unwanted elements (navigation, scripts, styles, ads)
    for tag in soup(["script", "style", "nav", "header", "footer", "aside", "iframe"]):
        tag.decompose()
    
    # Try to find product/article containers first (common e-commerce patterns)
    product_containers = soup.find_all(
        attrs={
            "class": re.compile(r"product|item|card|listing|article", re.I)
        }
    )
    
    # If we find product containers, extract from them
    if product_containers and len(product_containers) > 5:
        for container in product_containers[:100]:  # Limit to first 100 items
            # Extract title/name
            title_elem = container.find(["h1", "h2", "h3", "h4", "a"], 
                                       attrs={"class": re.compile(r"title|name|heading", re.I)})
            title = title_elem.get_text(strip=True) if title_elem else ""
            
            # Extract price
            price_elem = container.find(attrs={"class": re.compile(r"price|cost|amount", re.I)})
            price = price_elem.get_text(strip=True) if price_elem else ""
            
            # Extract description
            desc_elem = container.find(["p", "div"], 
                                      attrs={"class": re.compile(r"desc|detail|summary", re.I)})
            description = desc_elem.get_text(strip=True)[:200] if desc_elem else ""
            
            # Extract link
            link_elem = container.find("a", href=True)
            link = link_elem["href"] if link_elem else ""
            
            if title or price:  # Only add if we have meaningful data
                data.append({
                    "Title": title[:200],
                    "Price": price[:50],
                    "Description": description,
                    "Link": link[:300]
                })
    
    # Fallback: If no product containers found, extract main content
    else:
        # Look for main content area
        main_content = soup.find(["main", "article", "div"], 
                                attrs={"id": re.compile(r"main|content|primary", re.I)}) or soup
        
        # Extract headings and associated content
        for heading in main_content.find_all(["h1", "h2", "h3"]):
            heading_text = heading.get_text(strip=True)
            if len(heading_text) > 5:  # Skip very short headings
                # Get next sibling paragraph or div
                content = ""
                next_elem = heading.find_next_sibling(["p", "div", "ul"])
                if next_elem:
                    content = next_elem.get_text(strip=True)[:300]
                
                data.append({
                    "Title": heading_text[:200],
                    "Price": "",
                    "Description": content,
                    "Link": ""
                })
    
    if not data:
        raise HTTPException(status_code=400, detail="No meaningful content found on page. Try 'Tables' or 'Links' mode instead.")
    
    # Remove exact duplicates
    df = pd.DataFrame(data)
    df = df.drop_duplicates(subset=["Title"], keep="first")
    
    return df

@app.post("/scrape")
def scrape_to_excel(req: ScrapeRequest):
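    # Fetch the requested URL, scrape it in the chosen mode, and return the result as an .xlsx download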
    try:
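        # Use a browser-like User-Agent and a 15-second timeout when fetching the page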
        request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        resp = requests.get(req.url, headers=request_headers, timeout=15)
    except Exception:
        raise HTTPException(status_code=400, detail="Could not fetch URL")
    
    if resp.status_code != 200:
        raise HTTPException(status_code=400, detail=f"Bad status code: {resp.status_code}")
    
    soup = BeautifulSoup(resp.text, "html.parser")
    
    if req.mode == "table":
        df = scrape_table(soup)
    elif req.mode == "links":
        df = scrape_links(soup)
    elif req.mode == "content":
        df = scrape_all_content(soup)
    else:
        raise HTTPException(status_code=400, detail="Unsupported mode")
    
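    # Write the DataFrame to an in-memory Excel workbook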
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="data")
    output.seek(0)
    
    headers = {"Content-Disposition": 'attachment; filename="scraped_data.xlsx"'}
    
    return StreamingResponse(
        output,
        media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        headers=headers,
    )