Create main.py
main.py ADDED
@@ -0,0 +1,146 @@
+from fastapi import FastAPI, HTTPException
+from fastapi.staticfiles import StaticFiles
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse, FileResponse
+from pydantic import BaseModel
+import requests
+from bs4 import BeautifulSoup
+import pandas as pd
+import io
+import re
+
+app = FastAPI(title="Universal Web Scraper API")
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+app.mount("/static", StaticFiles(directory="/code/static"), name="static")
+
+@app.get("/")
+async def read_root():
+    return FileResponse("/code/static/index.html")
+
+class ScrapeRequest(BaseModel):
+    url: str
+    mode: str = "table"
+
+def scrape_table(soup: BeautifulSoup):
+    tables = soup.find_all("table")
+    if not tables:
+        raise HTTPException(status_code=400, detail="No table found on page")
+
+    table = max(tables, key=lambda t: len(t.find_all("tr")))
+
+    headers = []
+    header_row = table.find("tr")
+    if header_row:
+        for th in header_row.find_all(["th", "td"]):
+            headers.append(th.get_text(strip=True))
+    if not headers:
+        first_data_row = table.find("tr")
+        if not first_data_row:
+            raise HTTPException(status_code=400, detail="Empty table")
+        cols = len(first_data_row.find_all("td"))
+        headers = [f"col_{i+1}" for i in range(cols)]
+
+    rows = []
+    for tr in table.find_all("tr")[1:]:
+        cells = tr.find_all("td")
+        if not cells:
+            continue
+        row = [c.get_text(strip=True) for c in cells]
+        if len(row) < len(headers):
+            row += [""] * (len(headers) - len(row))
+        elif len(row) > len(headers):
+            row = row[:len(headers)]
+        rows.append(row)
+
+    df = pd.DataFrame(rows, columns=headers)
+    return df
+
+def scrape_links(soup: BeautifulSoup):
+    links = []
+    for a in soup.find_all("a"):
+        text = a.get_text(strip=True)
+        href = a.get("href", "")
+        if not href:
+            continue
+        links.append({"text": text, "href": href})
+    if not links:
+        raise HTTPException(status_code=400, detail="No links found")
+    df = pd.DataFrame(links)
+    return df
+
+def scrape_all_content(soup: BeautifulSoup):
+    # Extract ALL visible text content from the page
+    data = []
+
+    # Get all divs, spans, and p tags with text
+    for element in soup.find_all(["div", "span", "p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "td", "th"]):
+        text = element.get_text(strip=True)
+        if text and len(text) > 2:  # Only include meaningful text
+            # Get element classes and id for context
+            classes = " ".join(element.get("class", []))
+            elem_id = element.get("id", "")
+            elem_type = element.name
+
+            data.append({
+                "Type": elem_type,
+                "Content": text[:500],  # Limit to 500 chars per element
+                "Class": classes[:100] if classes else "",
+                "ID": elem_id[:50] if elem_id else ""
+            })
+
+    if not data:
+        raise HTTPException(status_code=400, detail="No content found on page")
+
+    # Remove duplicate content
+    seen = set()
+    unique_data = []
+    for item in data:
+        if item["Content"] not in seen:
+            seen.add(item["Content"])
+            unique_data.append(item)
+
+    df = pd.DataFrame(unique_data)
+    return df
+
+@app.post("/scrape")
+def scrape_to_excel(req: ScrapeRequest):
+    try:
+        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
+        resp = requests.get(req.url, headers=headers, timeout=15)
+    except Exception:
+        raise HTTPException(status_code=400, detail="Could not fetch URL")
+
+    if resp.status_code != 200:
+        raise HTTPException(status_code=400, detail=f"Bad status code: {resp.status_code}")
+
+    soup = BeautifulSoup(resp.text, "html.parser")
+
+    if req.mode == "table":
+        df = scrape_table(soup)
+    elif req.mode == "links":
+        df = scrape_links(soup)
+    elif req.mode == "content":
+        df = scrape_all_content(soup)
+    else:
+        raise HTTPException(status_code=400, detail="Unsupported mode")
+
+    output = io.BytesIO()
+    with pd.ExcelWriter(output, engine="openpyxl") as writer:
+        df.to_excel(writer, index=False, sheet_name="data")
+    output.seek(0)
+
+    headers = {"Content-Disposition": 'attachment; filename="scraped_data.xlsx"'}
+
+    return StreamingResponse(
+        output,
+        media_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        headers=headers,
+    )
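
For reference, a minimal client sketch for exercising the /scrape endpoint once the Space is running. The base URL and the target page here are assumptions (7860 is the conventional port for a Docker-based Space), not values this commit specifies:

import requests

BASE_URL = "http://localhost:7860"  # assumed host/port; adjust to the running Space

payload = {
    "url": "https://example.com/some-page-with-a-table",  # hypothetical target URL
    "mode": "table",  # the endpoint also accepts "links" and "content"
}

resp = requests.post(f"{BASE_URL}/scrape", json=payload, timeout=60)
resp.raise_for_status()

# The endpoint streams back an .xlsx workbook; persist the raw bytes.
with open("scraped_data.xlsx", "wb") as f:
    f.write(resp.content)

Besides FastAPI itself, the app expects requests, beautifulsoup4, pandas, and openpyxl to be installed, since pd.ExcelWriter is invoked with engine="openpyxl"; the Space's requirements file is not part of this commit.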