Spaces:
Sleeping
Sleeping
| # playwright_tool.py | |
| from typing import Dict, Optional | |
| from playwright.async_api import async_playwright | |
| import re | |
| import html | |
| import time | |
try:
    # Reuse the project's tool decorator (the same one used for the serper
    # tool); adjust the import path to wherever that decorator lives.
    from agents import function_tool
except Exception:
    # Deliberately broad: whatever goes wrong while importing the decorator,
    # fall back to a no-op so this module still works when called directly.
    def function_tool(fn=None, *_args, **_kwargs):
        """No-op stand-in for the tool decorator.

        Works both bare (``@function_tool``) and parameterized
        (``@function_tool(...)``); in either form the decorated function is
        returned unchanged and any options are ignored.
        """
        if callable(fn):
            # Bare usage: the decorated function was passed in directly.
            return fn
        # Parameterized usage: hand back an identity decorator.
        return lambda wrapped: wrapped
| def _collapse_ws(s: str) -> str: | |
| s = html.unescape(s or "") | |
| s = re.sub(r"\r\n|\r", "\n", s) | |
| s = re.sub(r"[ \t\f\v]+", " ", s) | |
| s = re.sub(r"\n[ \t]+", "\n", s) | |
| s = re.sub(r"\n{3,}", "\n\n", s) | |
| return s.strip() | |
async def playwright_web_read(
    url: str,
    wait_selector: Optional[str] = None,
    render_js: bool = True,
    timeout_ms: int = 120000,
    max_chars: int = 200_000,
    user_agent: Optional[str] = None,
) -> Dict[str, object]:
    """Fetch the visible text of a web page using headless Chromium (Playwright).

    Args:
        url: The URL to visit.
        wait_selector: Optional CSS selector to wait for after navigation.
            Failing to find it is not an error; whatever loaded is returned.
        render_js: If False, JavaScript is disabled in the browser context
            (faster for static pages) and no network-idle wait is performed.
        timeout_ms: Overall budget, in milliseconds, shared by navigation,
            the network-idle wait and the selector wait. A value of 0 is
            passed through to Playwright unchanged (Playwright treats 0 as
            "no timeout").
        max_chars: Returned text is truncated to this many characters.
        user_agent: Optional User-Agent string for the browser context.

    Returns:
        A dict with keys "title", "final_url", "status" (HTTP status of the
        main response, 0 if unknown), "text" (whitespace-normalized visible
        text) and "elapsed_ms" (total time spent in this call).

    Raises:
        Errors from launching the browser or from the navigation itself
        (for example a navigation timeout) propagate to the caller; the
        later waits and the title/text extraction are best-effort.
    """
    # Function-scope import: the module header only imports async_playwright.
    from playwright.async_api import Error as PlaywrightError

    # Monotonic clock: immune to wall-clock adjustments while we are waiting.
    started = time.monotonic()
    title = ""
    final_url = url
    status = 0
    text = ""

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context_kwargs = {}
            if user_agent:
                context_kwargs["user_agent"] = user_agent
            if not render_js:
                context_kwargs["java_script_enabled"] = False
            context = await browser.new_context(**context_kwargs)
            page = await context.new_page()

            # One budget shared by every wait below, so that the total really
            # is bounded by timeout_ms instead of timeout_ms per step.
            nav_started = time.monotonic()

            def remaining_ms() -> float:
                """Return what is left of the timeout_ms budget."""
                if timeout_ms <= 0:
                    # Caller opted out of timeouts; pass the value through.
                    return timeout_ms
                spent_ms = (time.monotonic() - nav_started) * 1000
                # Never return 0 for an exhausted budget: Playwright would
                # read that as "wait forever".
                return max(timeout_ms - spent_ms, 1.0)

            resp = await page.goto(
                url, wait_until="domcontentloaded", timeout=timeout_ms
            )
            if resp:
                status = resp.status or 0

            if render_js:
                # Give dynamic content a chance to load, but do not fail the
                # whole read when the page never becomes network-idle (long
                # polling, analytics beacons, ...): keep what has loaded.
                try:
                    await page.wait_for_load_state(
                        "networkidle", timeout=remaining_ms()
                    )
                except PlaywrightError:
                    pass

            if wait_selector:
                try:
                    await page.wait_for_selector(
                        wait_selector, timeout=remaining_ms()
                    )
                except PlaywrightError:
                    pass  # don't fail just because the selector was not found

            # Read the URL after the waits so client-side redirects are seen.
            final_url = page.url

            try:
                title = await page.title() or ""
            except PlaywrightError:
                title = ""

            # Prefer the locator-based inner_text (waits briefly for <body>);
            # fall back to reading document.body.innerText directly.
            try:
                text = await page.inner_text("body", timeout=2000)
            except PlaywrightError:
                try:
                    text = await page.evaluate(
                        "document.body ? document.body.innerText : ''"
                    ) or ""
                except PlaywrightError:
                    text = ""

            text = _collapse_ws(text)
            if len(text) > max_chars:
                text = text[:max_chars]

            return {
                "title": title,
                "final_url": final_url,
                "status": status,
                "text": text,
                "elapsed_ms": int((time.monotonic() - started) * 1000),
            }
        finally:
            # Best-effort cleanup: a failure to close must never mask the
            # result or the original exception, so this stays broad.
            try:
                await browser.close()
            except Exception:
                pass