amalsp committed on
Commit
c50dc4c
·
verified ·
1 Parent(s): a9a2fef

Update backend/main.py

Browse files
Files changed (1) hide show
  1. backend/main.py +7 -3
backend/main.py CHANGED
@@ -33,9 +33,12 @@ class ScrapeRequest(BaseModel):
33
  mode: str = "table"
34
 
35
  def scrape_table(soup: BeautifulSoup):
36
- table = soup.find("table")
37
- if table is None:
38
  raise HTTPException(status_code=400, detail="No table found on page")
 
 
 
39
 
40
  headers = []
41
  header_row = table.find("tr")
@@ -80,7 +83,8 @@ def scrape_links(soup: BeautifulSoup):
80
  @app.post("/scrape")
81
  def scrape_to_excel(req: ScrapeRequest):
82
  try:
83
- resp = requests.get(req.url, timeout=15)
 
84
  except Exception:
85
  raise HTTPException(status_code=400, detail="Could not fetch URL")
86
 
 
33
  mode: str = "table"
34
 
35
  def scrape_table(soup: BeautifulSoup):
36
+ tables = soup.find_all("table")
37
+ if not tables:
38
  raise HTTPException(status_code=400, detail="No table found on page")
39
+
40
+ # Find the largest table (most likely to be the main content table)
41
+ table = max(tables, key=lambda t: len(t.find_all("tr")))
42
 
43
  headers = []
44
  header_row = table.find("tr")
 
83
  @app.post("/scrape")
84
  def scrape_to_excel(req: ScrapeRequest):
85
  try:
86
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
87
+ resp = requests.get(req.url, headers=headers, timeout=15)
88
  except Exception:
89
  raise HTTPException(status_code=400, detail="Could not fetch URL")
90