amalsp committed
Commit 82f5373 · verified · 1 Parent(s): 26500d5

Create main.py

Files changed (1)
  main.py +146 -0
main.py ADDED
@@ -0,0 +1,146 @@
+ from fastapi import FastAPI, HTTPException
+ from fastapi.staticfiles import StaticFiles
+ from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import StreamingResponse, FileResponse
+ from pydantic import BaseModel
+ import requests
+ from bs4 import BeautifulSoup
+ import pandas as pd
+ import io
+
+ app = FastAPI(title="Universal Web Scraper API")
+
+ # Wide-open CORS for development; restrict allow_origins in production.
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Serve the bundled frontend from /code/static.
+ app.mount("/static", StaticFiles(directory="/code/static"), name="static")
+
+ @app.get("/")
+ async def read_root():
+     return FileResponse("/code/static/index.html")
+
+ class ScrapeRequest(BaseModel):
+     url: str
+     mode: str = "table"  # one of: "table", "links", "content"
+
+ def scrape_table(soup: BeautifulSoup):
+     tables = soup.find_all("table")
+     if not tables:
+         raise HTTPException(status_code=400, detail="No table found on page")
+
+     # Pick the table with the most rows, assuming it is the main one.
+     table = max(tables, key=lambda t: len(t.find_all("tr")))
+
+     # Use the first row as the header row.
+     headers = []
+     header_row = table.find("tr")
+     if header_row:
+         for th in header_row.find_all(["th", "td"]):
+             headers.append(th.get_text(strip=True))
+     if not headers:
+         first_data_row = table.find("tr")
+         if not first_data_row:
+             raise HTTPException(status_code=400, detail="Empty table")
+         cols = len(first_data_row.find_all("td"))
+         headers = [f"col_{i+1}" for i in range(cols)]
+
+     # Collect data rows, padding or trimming each to the header width.
+     rows = []
+     for tr in table.find_all("tr")[1:]:
+         cells = tr.find_all("td")
+         if not cells:
+             continue
+         row = [c.get_text(strip=True) for c in cells]
+         if len(row) < len(headers):
+             row += [""] * (len(headers) - len(row))
+         elif len(row) > len(headers):
+             row = row[:len(headers)]
+         rows.append(row)
+
+     df = pd.DataFrame(rows, columns=headers)
+     return df
+
+ def scrape_links(soup: BeautifulSoup):
+     # Collect every anchor that has an href, with its visible text.
+     links = []
+     for a in soup.find_all("a"):
+         text = a.get_text(strip=True)
+         href = a.get("href", "")
+         if not href:
+             continue
+         links.append({"text": text, "href": href})
+     if not links:
+         raise HTTPException(status_code=400, detail="No links found")
+     df = pd.DataFrame(links)
+     return df
+
+ def scrape_all_content(soup: BeautifulSoup):
+     # Extract all visible text content from the page.
+     data = []
+
+     # Walk the common text-bearing tags.
+     for element in soup.find_all(["div", "span", "p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "td", "th"]):
+         text = element.get_text(strip=True)
+         if text and len(text) > 2:  # only include meaningful text
+             # Record the element's classes and id for context.
+             classes = " ".join(element.get("class", []))
+             elem_id = element.get("id", "")
+             elem_type = element.name
+
+             data.append({
+                 "Type": elem_type,
+                 "Content": text[:500],  # limit to 500 chars per element
+                 "Class": classes[:100] if classes else "",
+                 "ID": elem_id[:50] if elem_id else ""
+             })
+
+     if not data:
+         raise HTTPException(status_code=400, detail="No content found on page")
+
+     # Drop duplicate content: nested tags repeat their children's text.
+     seen = set()
+     unique_data = []
+     for item in data:
+         if item["Content"] not in seen:
+             seen.add(item["Content"])
+             unique_data.append(item)
+
+     df = pd.DataFrame(unique_data)
+     return df
+
+ @app.post("/scrape")
+ def scrape_to_excel(req: ScrapeRequest):
+     try:
+         # A browser-like User-Agent avoids some trivial bot blocking.
+         request_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+         resp = requests.get(req.url, headers=request_headers, timeout=15)
+     except requests.RequestException:
+         raise HTTPException(status_code=400, detail="Could not fetch URL")
+
+     if resp.status_code != 200:
+         raise HTTPException(status_code=400, detail=f"Bad status code: {resp.status_code}")
+
+     soup = BeautifulSoup(resp.text, "html.parser")
+
+     if req.mode == "table":
+         df = scrape_table(soup)
+     elif req.mode == "links":
+         df = scrape_links(soup)
+     elif req.mode == "content":
+         df = scrape_all_content(soup)
+     else:
+         raise HTTPException(status_code=400, detail="Unsupported mode")
+
+     # Write the DataFrame to an in-memory Excel workbook.
+     output = io.BytesIO()
+     with pd.ExcelWriter(output, engine="openpyxl") as writer:
+         df.to_excel(writer, index=False, sheet_name="data")
+     output.seek(0)
+
+     response_headers = {"Content-Disposition": 'attachment; filename="scraped_data.xlsx"'}
+
+     return StreamingResponse(
+         output,
+         media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+         headers=response_headers,
+     )
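
A minimal client sketch for the /scrape endpoint added above, assuming the app is served at http://localhost:8000 (the host and port are assumptions, not part of this commit):

import requests

# Hypothetical base URL; adjust to wherever uvicorn serves main:app.
resp = requests.post(
    "http://localhost:8000/scrape",
    json={"url": "https://example.com", "mode": "table"},
    timeout=60,
)
resp.raise_for_status()

# The endpoint streams back an .xlsx attachment; write the bytes to disk.
with open("scraped_data.xlsx", "wb") as f:
    f.write(resp.content)

The "mode" field accepts "table", "links", or "content", matching the three scraper functions above.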