Spaces:

mibrahimzia
/

Parse-AI

Sleeping

App Files Files Community

mibrahimzia committed on Oct 15, 2025

Commit

0f94f61

verified ·

1 Parent(s): 8d00c99

Update scraper.py

Browse files

Files changed (1) hide show

scraper.py +88 -8

scraper.py CHANGED Viewed

@@ -1,12 +1,92 @@
 import requests
 from bs4 import BeautifulSoup
-def scrape_website(url: str):
-    """Scrape title and text content from a webpage."""
-    headers = {"User-Agent": "Mozilla/5.0"}
-    resp = requests.get(url, headers=headers, timeout=10)
-    soup = BeautifulSoup(resp.text, "html.parser")
-    title = soup.title.string if soup.title else "No title"
-    paragraphs = " ".join([p.get_text() for p in soup.find_all("p")])
-    return {"title": title.strip(), "content": paragraphs.strip()}

+# scraper.py
 import requests
 from bs4 import BeautifulSoup
+import time
+from typing import Dict, List
+from urllib.parse import urljoin
+# Playwright import (sync)
+# Playwright is treated as optional: if importing it fails for any reason,
+# PLAYWRIGHT_AVAILABLE is set to False so the rest of the module can check
+# the flag instead of failing at import time.
+try:
+    from playwright.sync_api import sync_playwright
+    PLAYWRIGHT_AVAILABLE = True
+except Exception:
+    PLAYWRIGHT_AVAILABLE = False
+# HTTP headers shared by the requests-based and the Playwright-based fetch;
+# the User-Agent identifies this scraper to the remote server.
+HEADERS = {
+    "User-Agent": "ai-scraper-bot/1.0 (+https://example.com)"
+}
+def _extract_basic(url: str, html: str) -> Dict:
+    """Parse an HTML document into a flat summary dict.
+
+    Args:
+        url: Address the markup came from. It is echoed back in the result
+            and used as the base when resolving link targets.
+        html: Markup to parse.
+
+    Returns:
+        A dict with these keys:
+          - ``url``: the ``url`` argument, unchanged.
+          - ``title``: stripped ``<title>`` text, or an empty string when
+            the page has no title or the title has no single string.
+          - ``headings``: text of at most 30 ``h1``/``h2``/``h3`` tags.
+          - ``links``: at most 200 ``{"text", "href"}`` dicts, with each
+            ``href`` joined against ``url``.
+          - ``text``: at most 200 non-empty paragraphs separated by blank
+            lines, or the text of the whole page when no paragraph has
+            any content.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    # ``.string`` is None when <title> is empty or wraps nested tags.
+    title = soup.title.string.strip() if soup.title and soup.title.string else ""
+    headings = [h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])][:30]
+    # Drop empty <p> tags before joining. Otherwise a page whose paragraphs
+    # are all blank yields a truthy separator-only string, and the
+    # whole-page fallback below is never reached.
+    paragraphs = [
+        para
+        for para in (p.get_text(strip=True) for p in soup.find_all("p"))
+        if para
+    ][:200]
+    links = [
+        {"text": a.get_text(strip=True), "href": urljoin(url, a["href"])}
+        for a in soup.find_all("a", href=True)[:200]
+    ]
+    full_text = "\n\n".join(paragraphs) or soup.get_text(separator="\n", strip=True)
+    return {
+        "url": url,
+        "title": title,
+        "headings": headings,
+        "links": links,
+        "text": full_text.strip()
+    }
+def scrape_with_requests(url: str, timeout: int = 12) -> Dict:
+    """Fetch ``url`` with a plain HTTP GET and extract its content.
+
+    No JavaScript is executed; the raw response body is parsed as-is.
+
+    Args:
+        url: Page to download.
+        timeout: Value passed to ``requests.get`` as its ``timeout``.
+
+    Returns:
+        The dict produced by ``_extract_basic`` for the response body.
+
+    Raises:
+        requests.RequestException: If the request fails, or if
+            ``raise_for_status`` rejects the response status.
+    """
+    resp = requests.get(url, headers=HEADERS, timeout=timeout)
+    resp.raise_for_status()
+    # When a text response declares no charset, requests decodes it as
+    # ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
+    # from the body in that case.
+    if "charset" not in resp.headers.get("Content-Type", "").lower():
+        resp.encoding = resp.apparent_encoding
+    return _extract_basic(url, resp.text)
+def scrape_with_playwright(url: str, timeout: int = 25) -> Dict:
+    """Render ``url`` in headless Chromium and extract its content.
+
+    Intended for JavaScript-heavy pages; returns the same structure as
+    ``scrape_with_requests``.
+
+    Args:
+        url: Page to load.
+        timeout: Navigation timeout in seconds (converted to milliseconds
+            for ``page.goto``).
+
+    Returns:
+        The dict produced by ``_extract_basic`` for the rendered markup.
+
+    Raises:
+        RuntimeError: If Playwright could not be imported.
+    """
+    if not PLAYWRIGHT_AVAILABLE:
+        raise RuntimeError("Playwright not available in this environment.")
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+        try:
+            page = browser.new_page(extra_http_headers=HEADERS)
+            page.goto(url, timeout=timeout * 1000)
+            # Give client-side scripts a moment to populate the DOM. The
+            # Playwright wait is used instead of time.sleep so the sync
+            # API keeps processing browser events while waiting.
+            page.wait_for_timeout(1200)
+            html = page.content()
+        finally:
+            # Close the browser even when navigation or rendering fails.
+            browser.close()
+    return _extract_basic(url, html)
+def scrape(url: str, force_render: bool = False) -> Dict:
+    """Scrape ``url``, rendering with Playwright only when it helps.
+
+    Strategy:
+      - Fetch with requests + BeautifulSoup first.
+      - If that fails, fall back to Playwright when it is available.
+      - If it succeeds but yields fewer than 500 characters of text, or
+        ``force_render`` is set, also try Playwright and keep whichever
+        result has more text.
+
+    Args:
+        url: Page to scrape.
+        force_render: Attempt a Playwright render even when the plain
+            fetch already returned enough text.
+
+    Returns:
+        The dict produced by ``_extract_basic`` for the chosen result.
+
+    Raises:
+        RuntimeError: If both the requests fetch and the Playwright
+            fallback fail.
+        Exception: Whatever the requests fetch raised, when Playwright is
+            not available as a fallback.
+    """
+    try:
+        data = scrape_with_requests(url)
+    except Exception as e:
+        # requests failed -> Playwright is the only remaining option.
+        if not PLAYWRIGHT_AVAILABLE:
+            raise
+        try:
+            return scrape_with_playwright(url)
+        except Exception as e2:
+            raise RuntimeError(
+                f"Both requests and playwright scrapes failed: {e}, {e2}"
+            ) from e2
+
+    text_len = len(data.get("text", "") or "")
+    if PLAYWRIGHT_AVAILABLE and (force_render or text_len < 500):
+        try:
+            rendered = scrape_with_playwright(url)
+        except Exception:
+            # Rendering is a best-effort upgrade here: the plain fetch
+            # already succeeded, so keep its result rather than failing
+            # the whole scrape (or launching the browser a second time).
+            return data
+        # Prefer the rendered page only when it is actually richer.
+        if len(rendered.get("text", "") or "") > text_len:
+            return rendered
+    return data
+if __name__ == "__main__":
+    # Manual smoke test: scrape a known page and show its title.
+    print("Quick test (requests).")
+    result = scrape("https://example.com")
+    print(result["title"])