# scraper.py
"""Scrape a URL with requests + BeautifulSoup, falling back to Playwright.

The cheap path (plain HTTP GET) is tried first; when the page looks
JS-rendered (very little extracted text) or the caller forces it, the
page is re-fetched through a headless Chromium via Playwright.
"""

import time
from typing import Dict, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Playwright import (sync) -- optional dependency; absence is tolerated
# and recorded in PLAYWRIGHT_AVAILABLE so callers can degrade gracefully.
try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_AVAILABLE = True
except Exception:
    PLAYWRIGHT_AVAILABLE = False

HEADERS = {
    "User-Agent": "ai-scraper-bot/1.0 (+https://example.com)"
}


def _extract_basic(url: str, html: str) -> Dict:
    """Parse *html* and return a dict with url, title, headings, links, text.

    Counts are capped (30 headings, 200 paragraphs, 200 links) to keep
    the payload bounded on very large pages.
    """
    soup = BeautifulSoup(html, "html.parser")
    title = soup.title.string.strip() if soup.title and soup.title.string else ""
    # collect headings, links, and paragraphs
    headings = [h.get_text(strip=True) for h in soup.find_all(["h1", "h2", "h3"])][:30]
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")][:200]
    links: List[Dict] = []
    for a in soup.find_all("a", href=True)[:200]:
        href = a["href"]
        links.append({
            "text": a.get_text(strip=True),
            # resolve relative hrefs against the page URL
            "href": urljoin(url, href)
        })
    # fall back to whole-page text when the page has no <p> elements
    full_text = "\n\n".join(paragraphs) or soup.get_text(separator="\n", strip=True)
    return {
        "url": url,
        "title": title,
        "headings": headings,
        "links": links,
        "text": full_text.strip()
    }


def scrape_with_requests(url: str, timeout: int = 12) -> Dict:
    """Try a simple requests + BeautifulSoup scrape first.

    Raises requests.HTTPError (via raise_for_status) on non-2xx responses,
    and requests.RequestException on connection/timeout failures.
    """
    resp = requests.get(url, headers=HEADERS, timeout=timeout)
    resp.raise_for_status()
    return _extract_basic(url, resp.text)


def scrape_with_playwright(url: str, timeout: int = 25) -> Dict:
    """Use Playwright to render JS-heavy pages and return the same structure.

    Raises RuntimeError when Playwright is not installed. *timeout* is in
    seconds (converted to ms for page.goto).
    """
    if not PLAYWRIGHT_AVAILABLE:
        raise RuntimeError("Playwright not available in this environment.")
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(extra_http_headers=HEADERS)
            page.goto(url, timeout=timeout * 1000)
            # wait a bit for content
            time.sleep(1.2)
            html = page.content()
        finally:
            # BUG FIX: close the browser even when goto()/content() raises;
            # previously a navigation timeout leaked a Chromium process
            # on every failed scrape.
            browser.close()
    return _extract_basic(url, html)


def scrape(url: str, force_render: bool = False) -> Dict:
    """
    High-level scrape function:
    - Try requests+BS first.
    - If content looks too small or force_render=True, fallback to Playwright.
    """
    try:
        data = scrape_with_requests(url)
        # if lightweight response or very little text, try playwright
        text_len = len(data.get("text", "") or "")
        if force_render or text_len < 500:
            # fallback to playwright if available
            if PLAYWRIGHT_AVAILABLE:
                data2 = scrape_with_playwright(url)
                # prefer rendered text if richer
                if len(data2.get("text", "")) > text_len:
                    return data2
        # else return what we have
        return data
    except Exception as e:
        # requests failed -> try playwright
        if PLAYWRIGHT_AVAILABLE:
            try:
                return scrape_with_playwright(url)
            except Exception as e2:
                # BUG FIX: chain the cause ("from e2") so the original
                # tracebacks are preserved instead of being discarded.
                raise RuntimeError(
                    f"Both requests and playwright scrapes failed: {e}, {e2}"
                ) from e2
        raise


if __name__ == "__main__":
    # quick local test
    print("Quick test (requests).")
    print(scrape("https://example.com")["title"])