# scraper.py
import requests
from bs4 import BeautifulSoup
import time
from typing import Dict, List
from urllib.parse import urljoin
# Playwright import (sync). Optional dependency: record availability in a
# flag instead of failing at import time, so the requests-only path still works.
try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_AVAILABLE = True
except Exception:
    PLAYWRIGHT_AVAILABLE = False

# Default HTTP headers sent with every fetch; identifies the bot and a
# contact URL, as polite-crawler convention suggests.
HEADERS = {
    "User-Agent": "ai-scraper-bot/1.0 (+https://example.com)"
}
def _extract_basic(url: str, html: str) -> Dict:
    """Parse *html* and return a dict with url, title, headings, links, text.

    Headings are capped at 30, paragraphs and links at 200 entries each.
    Relative link hrefs are resolved against *url*.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Title may be absent or empty; normalize to "".
    if soup.title and soup.title.string:
        title = soup.title.string.strip()
    else:
        title = ""

    headings = [node.get_text(strip=True) for node in soup.find_all(["h1", "h2", "h3"])][:30]
    paragraphs = [node.get_text(strip=True) for node in soup.find_all("p")][:200]

    links = [
        {
            "text": anchor.get_text(strip=True),
            "href": urljoin(url, anchor["href"]),
        }
        for anchor in soup.find_all("a", href=True)[:200]
    ]

    # Prefer paragraph text; fall back to the whole-page text dump when
    # the document has no <p> content.
    full_text = "\n\n".join(paragraphs) or soup.get_text(separator="\n", strip=True)

    return {
        "url": url,
        "title": title,
        "headings": headings,
        "links": links,
        "text": full_text.strip(),
    }
def scrape_with_requests(url: str, timeout: int = 12) -> Dict:
    """Fetch *url* with plain HTTP (no JS rendering) and extract its content.

    Raises requests.HTTPError for non-2xx responses, and whatever
    requests raises for connection/timeout failures.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    response.raise_for_status()
    return _extract_basic(url, response.text)
def scrape_with_playwright(url: str, timeout: int = 25) -> Dict:
    """Render *url* in headless Chromium and extract the same structure
    as scrape_with_requests().

    Parameters:
        url: page to load.
        timeout: navigation timeout in seconds (Playwright wants ms,
            hence the * 1000 below).

    Raises:
        RuntimeError: if Playwright could not be imported.
    """
    if not PLAYWRIGHT_AVAILABLE:
        raise RuntimeError("Playwright not available in this environment.")
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            page = browser.new_page(extra_http_headers=HEADERS)
            page.goto(url, timeout=timeout * 1000)
            # Give client-side scripts a moment to populate the DOM.
            time.sleep(1.2)
            html = page.content()
        finally:
            # Always release the browser: the original code skipped
            # close() when goto()/content() raised, leaking the
            # Chromium process on every navigation error.
            browser.close()
    return _extract_basic(url, html)
def scrape(url: str, force_render: bool = False) -> Dict:
    """
    High-level scrape function:
    - Try requests+BS first.
    - If content looks too small (< 500 chars, likely JS-rendered) or
      force_render=True, fall back to Playwright and keep whichever
      result has more text.
    - If requests itself fails, try Playwright as the sole fetcher.

    Raises:
        RuntimeError: when both fetch strategies fail.
    """
    # Keep the try body minimal: the original wrapped the Playwright
    # fallback too, so a render failure after a *successful* requests
    # fetch was misrouted into the "requests failed" handler, retried
    # Playwright a second time, and threw away the good data.
    try:
        data = scrape_with_requests(url)
    except Exception as e:
        # requests failed outright -> try playwright if available
        if PLAYWRIGHT_AVAILABLE:
            try:
                return scrape_with_playwright(url)
            except Exception as e2:
                # Chain the cause so the original traceback survives.
                raise RuntimeError(
                    f"Both requests and playwright scrapes failed: {e}, {e2}"
                ) from e2
        raise

    text_len = len(data.get("text", "") or "")
    if (force_render or text_len < 500) and PLAYWRIGHT_AVAILABLE:
        # Best-effort re-fetch with a real browser; we already hold a
        # usable result, so a render failure should not discard it.
        try:
            data2 = scrape_with_playwright(url)
        except Exception:
            return data
        # prefer rendered text if richer
        if len(data2.get("text", "")) > text_len:
            return data2
    return data
if __name__ == "__main__":
    # Smoke test against a small static page (requests path only).
    print("Quick test (requests).")
    result = scrape("https://example.com")
    print(result["title"])