from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, StreamingResponse
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import pandas as pd
import io
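# Assumed dependencies (not pinned in the source): fastapi, uvicorn,
# requests, beautifulsoup4, pandas, and openpyxl (the Excel engine used below).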
app = FastAPI(title="Simple Web Scraper API")
# Enable CORS for frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Mount static files
app.mount("/static", StaticFiles(directory="/code/static"), name="static")
# Root route to serve the index.html
@app.get("/")
async def read_root():
    return FileResponse("/code/static/index.html")
class ScrapeRequest(BaseModel):
    url: str
    mode: str = "table"
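# Example request body (illustrative values, not from the source):
#   {"url": "https://example.com/page-with-table", "mode": "table"}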
def scrape_table(soup: BeautifulSoup):
    tables = soup.find_all("table")
    if not tables:
        raise HTTPException(status_code=400, detail="No table found on page")
    # Find the largest table (most likely to be the main content table)
    table = max(tables, key=lambda t: len(t.find_all("tr")))
    # Take column names from the first row (th or td cells)
    headers = []
    header_row = table.find("tr")
    if header_row:
        for th in header_row.find_all(["th", "td"]):
            headers.append(th.get_text(strip=True))
    if not headers:
        if not header_row:
            raise HTTPException(status_code=400, detail="Empty table")
        # No usable header cells; fall back to generic column names
        cols = len(header_row.find_all("td"))
        headers = [f"col_{i+1}" for i in range(cols)]
    rows = []
    for tr in table.find_all("tr")[1:]:  # skip the header row
        cells = tr.find_all("td")
        if not cells:
            continue
        row = [c.get_text(strip=True) for c in cells]
        # Pad or truncate so every row matches the header width
        if len(row) < len(headers):
            row += [""] * (len(headers) - len(row))
        elif len(row) > len(headers):
            row = row[: len(headers)]
        rows.append(row)
    df = pd.DataFrame(rows, columns=headers)
    return df
def scrape_links(soup: BeautifulSoup):
    links = []
    for a in soup.find_all("a"):
        text = a.get_text(strip=True)
        href = a.get("href", "")
        if not href:
            continue
        links.append({"text": text, "href": href})
    if not links:
        raise HTTPException(status_code=400, detail="No links found")
    df = pd.DataFrame(links)
    return df
# Scrape endpoint: fetch the URL, extract data, return an .xlsx download.
# NOTE: the "/scrape" route path is assumed here; adjust it to match the frontend.
@app.post("/scrape")
def scrape_to_excel(req: ScrapeRequest):
    try:
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
        resp = requests.get(req.url, headers=headers, timeout=15)
    except Exception:
        raise HTTPException(status_code=400, detail="Could not fetch URL")
    if resp.status_code != 200:
        raise HTTPException(status_code=400, detail=f"Bad status code: {resp.status_code}")
    soup = BeautifulSoup(resp.text, "html.parser")
    if req.mode == "table":
        df = scrape_table(soup)
    elif req.mode == "links":
        df = scrape_links(soup)
    else:
        raise HTTPException(status_code=400, detail="Unsupported mode")
    # Write the DataFrame to an in-memory Excel workbook
    output = io.BytesIO()
    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        df.to_excel(writer, index=False, sheet_name="data")
    output.seek(0)  # rewind so StreamingResponse reads from the start
    headers = {"Content-Disposition": 'attachment; filename="scraped_data.xlsx"'}
    return StreamingResponse(
        output,
        media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        headers=headers,
    )
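# Usage sketch (assumptions: file saved as main.py, uvicorn installed;
# port 7860 is the Hugging Face Spaces default, but any free port works):
#   uvicorn main:app --host 0.0.0.0 --port 7860
# Then download a spreadsheet, e.g. with curl:
#   curl -X POST http://localhost:7860/scrape \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.com", "mode": "links"}' \
#        -o scraped_data.xlsx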