Spaces:

akashe
/

really-deep-research

Sleeping

File size: 3,474 Bytes

a559920

# playwright_tool.py
from typing import Dict, Optional
from playwright.async_api import async_playwright
import re
import html
import time

try:
    # if you have the same decorator you used for serper
    from agents import function_tool  # or wherever your decorator is
except Exception:
    # fallback no-op if you call it directly
    def function_tool(fn): return fn

def _collapse_ws(s: str) -> str:
    s = html.unescape(s or "")
    s = re.sub(r"\r\n|\r", "\n", s)
    s = re.sub(r"[ \t\f\v]+", " ", s)
    s = re.sub(r"\n[ \t]+", "\n", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

@function_tool
async def playwright_web_read(
    url: str,
    wait_selector: Optional[str] = None,
    render_js: bool = True,
    timeout_ms: int = 120000,
    max_chars: int = 200_000,
    user_agent: Optional[str] = None,
) -> Dict[str, object]:
    """
    Fetch visible page text using Playwright (Chromium, headless).
    Args:
      url: The URL to visit.
      wait_selector: CSS selector to wait for (optional).
      render_js: If False, disable JS for faster loads on static pages.
      timeout_ms: Overall nav+wait timeout.
      max_chars: Truncate returned text to avoid huge payloads.
      user_agent: Optional UA string.
    Returns:
      { "title", "final_url", "status", "text", "elapsed_ms" }
    """
    t0 = time.time()
    title = ""
    final_url = url
    status = 0
    text = ""

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context_kwargs = {}
            if user_agent:
                context_kwargs["user_agent"] = user_agent
            if not render_js:
                context_kwargs["java_script_enabled"] = False

            context = await browser.new_context(**context_kwargs)
            page = await context.new_page()

            # Conservative wait_until to get dynamic content when render_js=True
            wait_until = "networkidle" if render_js else "domcontentloaded"
            resp = await page.goto(url, wait_until=wait_until, timeout=timeout_ms)
            if resp:
                status = resp.status or 0
                final_url = page.url

            if wait_selector:
                try:
                    await page.wait_for_selector(wait_selector, timeout=timeout_ms)
                except Exception:
                    pass  # don't fail just because selector not found

            try:
                title = await page.title() or ""
            except Exception:
                title = ""

            # Prefer visible text; fall back to body textContent.
            try:
                # inner_text("body") respects visibility better than content()
                text = await page.inner_text("body", timeout=2000)
            except Exception:
                try:
                    text = await page.evaluate("document.body ? document.body.innerText : ''") or ""
                except Exception:
                    text = ""

            text = _collapse_ws(text)
            if len(text) > max_chars:
                text = text[:max_chars]

            return {
                "title": title,
                "final_url": final_url,
                "status": status,
                "text": text,
                "elapsed_ms": int((time.time() - t0) * 1000),
            }
        finally:
            try:
                await browser.close()
            except Exception:
                pass