# Source: really-deep-research / tools / playwright_tool.py
# Author: akashe
# Commit: a559920 ("Initial Deployment")
# playwright_tool.py
from typing import Dict, Optional
from playwright.async_api import async_playwright
import re
import html
import time
try:
    # if you have the same decorator you used for serper
    from agents import function_tool  # or wherever your decorator is
except Exception:
    # Deliberately broad: any failure to import the real decorator falls back
    # to a no-op, so the tool functions stay callable directly.
    def function_tool(fn=None, **_options):
        """No-op stand-in for the real ``function_tool`` decorator.

        Supports both the bare form (``@function_tool``) and the
        parameterised form (``@function_tool(...)``); any keyword options are
        ignored and the decorated function is returned unchanged.
        """
        if fn is None:
            # Called as ``@function_tool(...)``: hand back an identity decorator.
            return lambda wrapped: wrapped
        return fn
def _collapse_ws(s: str) -> str:
s = html.unescape(s or "")
s = re.sub(r"\r\n|\r", "\n", s)
s = re.sub(r"[ \t\f\v]+", " ", s)
s = re.sub(r"\n[ \t]+", "\n", s)
s = re.sub(r"\n{3,}", "\n\n", s)
return s.strip()
@function_tool
async def playwright_web_read(
    url: str,
    wait_selector: Optional[str] = None,
    render_js: bool = True,
    timeout_ms: int = 120000,
    max_chars: int = 200_000,
    user_agent: Optional[str] = None,
) -> Dict[str, object]:
    """
    Fetch visible page text using Playwright (Chromium, headless).

    Args:
        url: The URL to visit.
        wait_selector: CSS selector to wait for (optional). A selector that
            never appears is ignored rather than treated as an error.
        render_js: If False, disable JS for faster loads on static pages.
        timeout_ms: Overall nav+wait timeout in milliseconds. The selector
            wait only gets the time that navigation left over.
        max_chars: Truncate returned text to avoid huge payloads.
        user_agent: Optional UA string.

    Returns:
        { "title", "final_url", "status", "text", "elapsed_ms" }
        "status" is 0 when navigation produced no response object.

    Raises:
        Errors from launching the browser or from navigation (including a
        navigation timeout) are not caught here and propagate to the caller.
    """
    # Monotonic clock: elapsed time must not be affected by wall-clock jumps.
    started = time.monotonic()
    status = 0

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context_kwargs = {}
            if user_agent:
                context_kwargs["user_agent"] = user_agent
            if not render_js:
                context_kwargs["java_script_enabled"] = False
            context = await browser.new_context(**context_kwargs)
            page = await context.new_page()

            # Conservative wait_until to get dynamic content when render_js=True
            wait_until = "networkidle" if render_js else "domcontentloaded"
            nav_started = time.monotonic()
            resp = await page.goto(url, wait_until=wait_until, timeout=timeout_ms)
            if resp:
                status = resp.status or 0
            final_url = page.url

            if wait_selector:
                selector_timeout = timeout_ms
                if timeout_ms > 0:
                    # Spend only what is left of the shared nav+wait budget.
                    # Floor at 1 ms so we never pass 0, which Playwright
                    # documents as "timeout disabled".
                    spent_ms = (time.monotonic() - nav_started) * 1000
                    selector_timeout = max(1.0, timeout_ms - spent_ms)
                try:
                    await page.wait_for_selector(wait_selector, timeout=selector_timeout)
                except Exception:
                    pass  # don't fail just because selector not found

            try:
                title = await page.title() or ""
            except Exception:
                title = ""

            # Prefer Playwright's visible-text lookup; fall back to reading
            # body.innerText via script evaluation.
            try:
                # inner_text("body") respects visibility better than content()
                text = await page.inner_text("body", timeout=2000)
            except Exception:
                try:
                    text = await page.evaluate("document.body ? document.body.innerText : ''") or ""
                except Exception:
                    text = ""

            text = _collapse_ws(text)
            # Clamp so a negative max_chars cannot slice from the end of the text.
            text = text[:max(0, max_chars)]

            return {
                "title": title,
                "final_url": final_url,
                "status": status,
                "text": text,
                "elapsed_ms": int((time.monotonic() - started) * 1000),
            }
        finally:
            # Best-effort cleanup: a failing close must not mask the result
            # or the original exception.
            try:
                await browser.close()
            except Exception:
                pass