Spaces:

amalsp
/

web-scraper-app

Sleeping

amalsp committed 7 days ago

Commit

c50dc4c

verified ·

1 Parent(s): a9a2fef

Update backend/main.py

Files changed (1) hide show

backend/main.py CHANGED Viewed

@@ -33,9 +33,12 @@ class ScrapeRequest(BaseModel):
     mode: str = "table"
 def scrape_table(soup: BeautifulSoup):
-    table = soup.find("table")
-    if table is None:
         raise HTTPException(status_code=400, detail="No table found on page")
     headers = []
     header_row = table.find("tr")
@@ -80,7 +83,8 @@ def scrape_links(soup: BeautifulSoup):
 @app.post("/scrape")
 def scrape_to_excel(req: ScrapeRequest):
     try:
-        resp = requests.get(req.url, timeout=15)
     except Exception:
         raise HTTPException(status_code=400, detail="Could not fetch URL")

     mode: str = "table"
 def scrape_table(soup: BeautifulSoup):
+    tables = soup.find_all("table")
+        if not tables:
         raise HTTPException(status_code=400, detail="No table found on page")
+    # Find the largest table (most likely to be the main content table)
+    table = max(tables, key=lambda t: len(t.find_all("tr")))
     headers = []
     header_row = table.find("tr")
 @app.post("/scrape")
 def scrape_to_excel(req: ScrapeRequest):
     try:
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+        resp = requests.get(req.url, headers=headers, timeout=15)
     except Exception:
         raise HTTPException(status_code=400, detail="Could not fetch URL")