Spaces:

akashe
/

really-deep-research

Sleeping

App Files Files Community

really-deep-research / tools /playwright_tool.py

akashe

Initial Deployment

a559920 4 months ago

raw

history blame contribute delete

3.47 kB

	# playwright_tool.py
	from typing import Dict, Optional
	from playwright.async_api import async_playwright
	import re
	import html
	import time

	try:
	    # Reuse the same decorator that was used for the serper tool, if present.
	    from agents import function_tool  # or wherever your decorator is
	except Exception:
	    # The decorator could not be imported: fall back to a pass-through so
	    # the tool below can still be called directly as a plain coroutine.
	    def function_tool(fn):
	        """Return ``fn`` unchanged (no-op stand-in for the real decorator)."""
	        return fn

	def _collapse_ws(s: str) -> str:
	s = html.unescape(s or "")
	s = re.sub(r"\r\n\|\r", "\n", s)
	s = re.sub(r"[ \t\f\v]+", " ", s)
	s = re.sub(r"\n[ \t]+", "\n", s)
	s = re.sub(r"\n{3,}", "\n\n", s)
	return s.strip()

	@function_tool
	async def playwright_web_read(
	    url: str,
	    wait_selector: Optional[str] = None,
	    render_js: bool = True,
	    timeout_ms: int = 120000,
	    max_chars: int = 200_000,
	    user_agent: Optional[str] = None,
	) -> Dict[str, object]:
	    """
	    Fetch visible page text using Playwright (Chromium, headless).

	    Args:
	        url: The URL to visit.
	        wait_selector: CSS selector to wait for (optional). If the wait
	            fails, the page is still read as-is.
	        render_js: If False, disable JS for faster loads on static pages.
	        timeout_ms: Overall nav+wait timeout in milliseconds. The selector
	            wait only receives what navigation left of this budget.
	            Values <= 0 are handed to Playwright unchanged.
	        max_chars: Truncate returned text to avoid huge payloads. Negative
	            values are treated as 0.
	        user_agent: Optional UA string.

	    Returns:
	        { "title", "final_url", "status", "text", "elapsed_ms" }
	        "status" is 0 when navigation yields no response object; "text" is
	        passed through _collapse_ws and cut to max_chars; "elapsed_ms"
	        covers the whole call, including browser start-up.

	    Raises:
	        Errors from launching the browser, creating the context/page, or
	        page.goto (e.g. a navigation timeout) are not caught here and
	        propagate to the caller.
	    """
	    # Monotonic clock so elapsed/budget arithmetic is immune to wall-clock
	    # adjustments while the page loads.
	    t0 = time.monotonic()
	    title = ""
	    final_url = url
	    status = 0
	    text = ""

	    async with async_playwright() as p:
	        browser = await p.chromium.launch(headless=True)
	        try:
	            context_kwargs = {}
	            if user_agent:
	                context_kwargs["user_agent"] = user_agent
	            if not render_js:
	                context_kwargs["java_script_enabled"] = False

	            context = await browser.new_context(**context_kwargs)
	            page = await context.new_page()

	            # Conservative wait_until to get dynamic content when render_js=True
	            wait_until = "networkidle" if render_js else "domcontentloaded"
	            nav_started = time.monotonic()
	            resp = await page.goto(url, wait_until=wait_until, timeout=timeout_ms)
	            if resp:
	                status = resp.status or 0
	            # page.url is valid even when goto() returned no response, and
	            # reflects any redirects that happened during navigation.
	            final_url = page.url

	            if wait_selector:
	                if timeout_ms > 0:
	                    # Spend only what is left of the shared budget. Playwright
	                    # documents timeout=0 as "no timeout", so keep at least
	                    # 1 ms instead of rounding an exhausted budget to zero.
	                    spent_ms = (time.monotonic() - nav_started) * 1000
	                    selector_timeout = max(1.0, timeout_ms - spent_ms)
	                else:
	                    selector_timeout = timeout_ms
	                try:
	                    await page.wait_for_selector(wait_selector, timeout=selector_timeout)
	                except Exception:
	                    # Deliberate best effort: don't fail just because the
	                    # selector was not found; read whatever did load.
	                    pass

	            try:
	                title = await page.title() or ""
	            except Exception:
	                title = ""

	            # Prefer Playwright's inner_text("body"); if that fails (it waits
	            # at most 2 s for <body>), fall back to reading innerText via JS,
	            # which also copes with a missing body.
	            try:
	                text = await page.inner_text("body", timeout=2000)
	            except Exception:
	                try:
	                    text = await page.evaluate("document.body ? document.body.innerText : ''") or ""
	                except Exception:
	                    text = ""

	            # Clamp the limit: a negative slice bound would silently drop
	            # characters from the end instead of capping the length.
	            text = _collapse_ws(text)[: max(0, max_chars)]

	            return {
	                "title": title,
	                "final_url": final_url,
	                "status": status,
	                "text": text,
	                "elapsed_ms": int((time.monotonic() - t0) * 1000),
	            }
	        finally:
	            # A failure while closing must not mask the result or the
	            # exception that is already propagating.
	            try:
	                await browser.close()
	            except Exception:
	                pass