from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, FileResponse
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io
app = FastAPI(title="Simple Web Scraper API")
# Enable CORS for frontend
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Mount static files
app.mount("/static", StaticFiles(directory="/code/static"), name="static")
# Root route to serve the index.html
@app.get("/")
async def read_root():
    return FileResponse("/code/static/index.html")
class ScrapeRequest(BaseModel):
url: str
mode: str = "table"
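# Illustrative request payloads for the /scrape endpoint below (example.com is
# just a placeholder URL, not part of the app):
#   {"url": "https://example.com", "mode": "table"}  -> Excel of the largest table
#   {"url": "https://example.com", "mode": "links"}  -> Excel of all anchor text/href pairs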
def scrape_table(soup: BeautifulSoup):
tables = soup.find_all("table")
if not tables:
raise HTTPException(status_code=400, detail="No table found on page")
# Find the largest table (most likely to be the main content table)
table = max(tables, key=lambda t: len(t.find_all("tr")))
headers = []
header_row = table.find("tr")
if header_row:
for th in header_row.find_all(["th", "td"]):
headers.append(th.get_text(strip=True))
    if not headers:
        # No header cells were found; fall back to generic column names sized
        # to the widest row so the padding/truncation below lines up.
        all_rows = table.find_all("tr")
        cols = max((len(tr.find_all(["td", "th"])) for tr in all_rows), default=0)
        if cols == 0:
            raise HTTPException(status_code=400, detail="Empty table")
        headers = [f"col_{i+1}" for i in range(cols)]
rows = []
for tr in table.find_all("tr")[1:]:
cells = tr.find_all("td")
if not cells:
continue
row = [c.get_text(strip=True) for c in cells]
        # Pad short rows / truncate long ones so every row matches the header width
        if len(row) < len(headers):
            row += [""] * (len(headers) - len(row))
        elif len(row) > len(headers):
            row = row[: len(headers)]
rows.append(row)
df = pd.DataFrame(rows, columns=headers)
return df
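# Sketch of scrape_table on a minimal, made-up document (illustration only):
#   soup = BeautifulSoup("<table><tr><th>a</th></tr><tr><td>1</td></tr></table>", "html.parser")
#   scrape_table(soup)  # -> DataFrame with column "a" and a single row ["1"]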
def scrape_links(soup: BeautifulSoup):
links = []
for a in soup.find_all("a"):
text = a.get_text(strip=True)
href = a.get("href", "")
if not href:
continue
links.append({"text": text, "href": href})
if not links:
raise HTTPException(status_code=400, detail="No links found")
df = pd.DataFrame(links)
return df
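# Sketch of scrape_links on a minimal, made-up document (illustration only):
#   soup = BeautifulSoup('<a href="/about">About</a>', "html.parser")
#   scrape_links(soup)  # -> DataFrame with one row: text="About", href="/about"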
@app.post("/scrape")
def scrape_to_excel(req: ScrapeRequest):
try:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
resp = requests.get(req.url, headers=headers, timeout=15)
    except requests.RequestException:
raise HTTPException(status_code=400, detail="Could not fetch URL")
if resp.status_code != 200:
raise HTTPException(status_code=400, detail=f"Bad status code: {resp.status_code}")
soup = BeautifulSoup(resp.text, "html.parser")
if req.mode == "table":
df = scrape_table(soup)
elif req.mode == "links":
df = scrape_links(soup)
else:
raise HTTPException(status_code=400, detail="Unsupported mode")
    # Serialize the DataFrame into an in-memory Excel workbook
    output = io.BytesIO()
with pd.ExcelWriter(output, engine="openpyxl") as writer:
df.to_excel(writer, index=False, sheet_name="data")
output.seek(0)
    # Send the workbook back as a file download (renamed to avoid shadowing
    # the request headers above)
    download_headers = {"Content-Disposition": 'attachment; filename="scraped_data.xlsx"'}
    return StreamingResponse(
        output,
        media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        headers=download_headers,
    )
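# Convenience entry point for running locally; a sketch, assuming uvicorn is
# installed and that port 7860 (a common hosted-Spaces default) suits your setup.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)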