amalsp committed
Commit 0fbd166 · verified · 1 Parent(s): 5dee047

Create backend/main.py

Files changed (1)
  1. backend/main.py +100 -0
backend/main.py ADDED
@@ -0,0 +1,100 @@
+ from fastapi import FastAPI, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import StreamingResponse
+ from pydantic import BaseModel
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import io
+
+ app = FastAPI(title="Simple Web Scraper API")
+
+ # Enable CORS for frontend
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],  # permissive for development; tighten in production
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ class ScrapeRequest(BaseModel):
+     url: str
+     mode: str = "table"  # "table" or "links"
+
+ def scrape_table(soup: BeautifulSoup):
+     table = soup.find("table")  # only the first table on the page is scraped
+     if table is None:
+         raise HTTPException(status_code=400, detail="No table found on page")
+
+     headers = []
+     header_row = table.find("tr")  # treat the first row as the header
+     if header_row:
+         for th in header_row.find_all(["th", "td"]):
+             headers.append(th.get_text(strip=True))
+     if not headers:  # no usable header cells: synthesize column names
+         first_data_row = table.find("tr")
+         if not first_data_row:
+             raise HTTPException(status_code=400, detail="Empty table")
+         cols = len(first_data_row.find_all("td"))
+         headers = [f"col_{i+1}" for i in range(cols)]
+
+     rows = []
+     for tr in table.find_all("tr")[1:]:  # skip the header row
+         cells = tr.find_all("td")
+         if not cells:
+             continue
+         row = [c.get_text(strip=True) for c in cells]
+         if len(row) < len(headers):  # pad short rows with empty strings
+             row += [""] * (len(headers) - len(row))
+         elif len(row) > len(headers):  # truncate rows that are too long
+             row = row[: len(headers)]
+         rows.append(row)
+
+     df = pd.DataFrame(rows, columns=headers)
+     return df
+
+ def scrape_links(soup: BeautifulSoup):
+     links = []
+     for a in soup.find_all("a"):
+         text = a.get_text(strip=True)
+         href = a.get("href", "")
+         if not href:  # skip anchors without a target
+             continue
+         links.append({"text": text, "href": href})
+     if not links:
+         raise HTTPException(status_code=400, detail="No links found")
+     df = pd.DataFrame(links)
+     return df
+
+ @app.post("/scrape")
+ def scrape_to_excel(req: ScrapeRequest):
+     try:
+         resp = requests.get(req.url, timeout=15)
+     except requests.RequestException:  # DNS errors, timeouts, invalid URLs
+         raise HTTPException(status_code=400, detail="Could not fetch URL")
+
+     if resp.status_code != 200:
+         raise HTTPException(status_code=400, detail=f"Bad status code: {resp.status_code}")
+
+     soup = BeautifulSoup(resp.text, "html.parser")
+
+     if req.mode == "table":
+         df = scrape_table(soup)
+     elif req.mode == "links":
+         df = scrape_links(soup)
+     else:
+         raise HTTPException(status_code=400, detail="Unsupported mode")
+
+     output = io.BytesIO()  # build the .xlsx workbook in memory
+     with pd.ExcelWriter(output, engine="openpyxl") as writer:
+         df.to_excel(writer, index=False, sheet_name="data")
+     output.seek(0)
+
+     headers = {"Content-Disposition": 'attachment; filename="scraped_data.xlsx"'}
+
+     return StreamingResponse(
+         output,
+         media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+         headers=headers,
+     )
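
For reference, a minimal client sketch for exercising the /scrape endpoint. The server address, uvicorn invocation, and target URL below are assumptions, not part of this commit:

# Minimal client sketch. Assumes the server was started with
#   uvicorn backend.main:app --reload
# and is listening on http://localhost:8000; the target URL is a placeholder.
import requests

resp = requests.post(
    "http://localhost:8000/scrape",
    json={"url": "https://example.com", "mode": "links"},  # or "mode": "table"
    timeout=30,
)
resp.raise_for_status()

# The endpoint streams back an .xlsx attachment; write the raw bytes to disk.
with open("scraped_data.xlsx", "wb") as f:
    f.write(resp.content)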