from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, StreamingResponse
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io

app = FastAPI(title="Simple Web Scraper API")

# Enable CORS for the frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files
app.mount("/static", StaticFiles(directory="/code/static"), name="static")


# Root route serves the frontend page
@app.get("/")
async def read_root():
    return FileResponse("/code/static/index.html")


class ScrapeRequest(BaseModel):
    url: str
    mode: str = "table"


def scrape_table(soup: BeautifulSoup) -> pd.DataFrame:
    tables = soup.find_all("table")
    if not tables:
        raise HTTPException(status_code=400, detail="No table found on page")

    # Pick the table with the most rows -- most likely the main content table
    table = max(tables, key=lambda t: len(t.find_all("tr")))

    # Treat the first row as the header row
    headers = []
    header_row = table.find("tr")
    if header_row:
        headers = [cell.get_text(strip=True) for cell in header_row.find_all(["th", "td"])]

    # Fall back to generated column names if no header cells were found
    if not headers:
        if not header_row:
            raise HTTPException(status_code=400, detail="Empty table")
        cols = len(header_row.find_all("td"))
        headers = [f"col_{i + 1}" for i in range(cols)]

    # Collect data rows, padding or trimming each to match the header width
    rows = []
    for tr in table.find_all("tr")[1:]:
        cells = tr.find_all("td")
        if not cells:
            continue
        row = [c.get_text(strip=True) for c in cells]
        if len(row) < len(headers):
            row += [""] * (len(headers) - len(row))
        elif len(row) > len(headers):
            row = row[: len(headers)]
        rows.append(row)

    return pd.DataFrame(rows, columns=headers)


def scrape_links(soup: BeautifulSoup) -> pd.DataFrame:
    # Keep only anchors that actually carry a non-empty href
    links = [
        {"text": a.get_text(strip=True), "href": a["href"]}
        for a in soup.find_all("a", href=True)
        if a["href"]
    ]
    if not links:
        raise HTTPException(status_code=400, detail="No links found")
    return pd.DataFrame(links)


@app.post("/scrape")
def scrape_to_excel(req: ScrapeRequest):
    # Send a browser-like User-Agent so basic bot filters don't reject the request
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    try:
        resp = requests.get(req.url, headers=request_headers, timeout=15)
    except requests.RequestException:
        raise HTTPException(status_code=400, detail="Could not fetch URL")
    if resp.status_code != 200:
        raise HTTPException(status_code=400, detail=f"Bad status code: {resp.status_code}")

    soup = BeautifulSoup(resp.text, "html.parser")

    if req.mode == "table":
        df = scrape_table(soup)
    elif req.mode == "links":
        df = scrape_links(soup)
    else:
        raise HTTPException(status_code=400, detail="Unsupported mode")

    # Write the DataFrame into an in-memory Excel workbook and stream it back
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="data")
    output.seek(0)

    response_headers = {"Content-Disposition": 'attachment; filename="scraped_data.xlsx"'}
    return StreamingResponse(
        output,
        media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        headers=response_headers,
    )
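
# --- Example usage --------------------------------------------------------
# A minimal sketch of how this service might be run and exercised. It assumes
# uvicorn is installed (the usual ASGI server for FastAPI) and that the module
# is saved as main.py; the host, port, and target URL below are illustrative,
# not part of the app above.
#
# Start the server:
#
#     uvicorn main:app --host 0.0.0.0 --port 8000
#
# Then, from a client, request a link scrape and save the returned workbook:
#
#     import requests
#
#     resp = requests.post(
#         "http://localhost:8000/scrape",
#         json={"url": "https://example.com", "mode": "links"},
#         timeout=30,
#     )
#     resp.raise_for_status()
#     with open("scraped_data.xlsx", "wb") as f:
#         f.write(resp.content)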