Spaces:
Sleeping
Sleeping
Update backend/main.py
Browse files
- backend/main.py +7 -3
backend/main.py
CHANGED
|
@@ -33,9 +33,12 @@ class ScrapeRequest(BaseModel):
|
|
| 33 |
mode: str = "table"
|
| 34 |
|
| 35 |
def scrape_table(soup: BeautifulSoup):
|
| 36 |
-
|
| 37 |
-
|
| 38 |
raise HTTPException(status_code=400, detail="No table found on page")
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
headers = []
|
| 41 |
header_row = table.find("tr")
|
|
@@ -80,7 +83,8 @@ def scrape_links(soup: BeautifulSoup):
|
|
| 80 |
@app.post("/scrape")
|
| 81 |
def scrape_to_excel(req: ScrapeRequest):
|
| 82 |
try:
|
| 83 |
-
|
|
|
|
| 84 |
except Exception:
|
| 85 |
raise HTTPException(status_code=400, detail="Could not fetch URL")
|
| 86 |
|
|
|
|
| 33 |
mode: str = "table"
|
| 34 |
|
| 35 |
def scrape_table(soup: BeautifulSoup):
|
| 36 |
+
tables = soup.find_all("table")
|
| 37 |
+
if not tables:
|
| 38 |
raise HTTPException(status_code=400, detail="No table found on page")
|
| 39 |
+
|
| 40 |
+
# Find the largest table (most likely to be the main content table)
|
| 41 |
+
table = max(tables, key=lambda t: len(t.find_all("tr")))
|
| 42 |
|
| 43 |
headers = []
|
| 44 |
header_row = table.find("tr")
|
|
|
|
| 83 |
@app.post("/scrape")
|
| 84 |
def scrape_to_excel(req: ScrapeRequest):
|
| 85 |
try:
|
| 86 |
+
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
|
| 87 |
+
resp = requests.get(req.url, headers=headers, timeout=15)
|
| 88 |
except Exception:
|
| 89 |
raise HTTPException(status_code=400, detail="Could not fetch URL")
|
| 90 |
|