# scraper.py
import requests
from bs4 import BeautifulSoup
import time
from typing import Dict, List
from urllib.parse import urljoin
# Playwright import (sync). Optional dependency: record availability in a
# flag instead of failing at import time, so the requests-only path still works.
try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_AVAILABLE = True
except Exception:
    PLAYWRIGHT_AVAILABLE = False

# Default HTTP headers sent with every fetch; identifies the bot and a
# contact URL, as polite-crawler convention suggests.
HEADERS = {
    "User-Agent": "ai-scraper-bot/1.0 (+https://example.com)"
}
def _extract_basic(url: str, html: str) -> Dict:
    """Parse *html* and return a dict with url, title, headings, links, text.

    Headings are capped at 30, paragraphs and links at 200 entries each.
    Relative link hrefs are resolved against *url*.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Title may be absent or empty; normalize to "".
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    else:
        title = ""

    headings = [node.get_text(strip=True) for node in soup.find_all(["h1", "h2", "h3"])][:30]
    paragraphs = [node.get_text(strip=True) for node in soup.find_all("p")][:200]

    links = [
        {
            "text": anchor.get_text(strip=True),
            "href": urljoin(url, anchor["href"]),
        }
        for anchor in soup.find_all("a", href=True)[:200]
    ]

    # Prefer paragraph text; fall back to the whole-page text dump when
    # the document has no <p> content.
    full_text = "\n\n".join(paragraphs) or soup.get_text(separator="\n", strip=True)

    return {
        "url": url,
        "title": title,
        "headings": headings,
        "links": links,
        "text": full_text.strip(),
    }
def scrape_with_requests(url: str, timeout: int = 12) -> Dict:
    """Fetch *url* with plain HTTP (no JS rendering) and extract its content.

    Raises requests.HTTPError for non-2xx responses, and whatever
    requests raises for connection/timeout failures.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return _extract_basic(url, response.text)
def scrape_with_playwright(url: str, timeout: int = 25) -> Dict:
    """Render *url* in headless Chromium and extract the same structure
    as scrape_with_requests().

    Parameters:
        url: page to load.
        timeout: navigation timeout in seconds (Playwright wants ms,
            hence the * 1000 below).

    Raises:
        RuntimeError: if Playwright could not be imported.
    """
    if not PLAYWRIGHT_AVAILABLE:
        raise RuntimeError("Playwright not available in this environment.")
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(extra_http_headers=HEADERS)
            page.goto(url, timeout=timeout * 1000)
            # Give client-side scripts a moment to populate the DOM.
            time.sleep(1.2)
            html = page.content()
        finally:
            # Always release the browser: the original code skipped
            # close() when goto()/content() raised, leaking the
            # Chromium process on every navigation error.
            browser.close()
    return _extract_basic(url, html)
def scrape(url: str, force_render: bool = False) -> Dict:
    """
    High-level scrape function:
    - Try requests+BS first.
    - If content looks too small (< 500 chars, likely JS-rendered) or
      force_render=True, fall back to Playwright and keep whichever
      result has more text.
    - If requests itself fails, try Playwright as the sole fetcher.

    Raises:
        RuntimeError: when both fetch strategies fail.
    """
    # Keep the try body minimal: the original wrapped the Playwright
    # fallback too, so a render failure after a *successful* requests
    # fetch was misrouted into the "requests failed" handler, retried
    # Playwright a second time, and threw away the good data.
    try:
        data = scrape_with_requests(url)
    except Exception as e:
        # requests failed outright -> try playwright if available
        if PLAYWRIGHT_AVAILABLE:
            try:
                return scrape_with_playwright(url)
            except Exception as e2:
                # Chain the cause so the original traceback survives.
                raise RuntimeError(
                    f"Both requests and playwright scrapes failed: {e}, {e2}"
                ) from e2
        raise

    text_len = len(data.get("text", "") or "")
    if (force_render or text_len < 500) and PLAYWRIGHT_AVAILABLE:
        # Best-effort re-fetch with a real browser; we already hold a
        # usable result, so a render failure should not discard it.
        try:
            data2 = scrape_with_playwright(url)
        except Exception:
            return data
        # prefer rendered text if richer
        if len(data2.get("text", "")) > text_len:
            return data2
    return data
if __name__ == "__main__":
    # Smoke test against a small static page (requests path only).
    print("Quick test (requests).")
    result = scrape("https://example.com")
    print(result["title"])