import asyncio
import base64
import json
import logging
from typing import Any, Awaitable, Callable, Dict, List, Optional

from langchain_core.tools import tool
from patchright.async_api import async_playwright, Browser, BrowserContext, Page

logger = logging.getLogger(__name__)


# Global state for a persistent browser session.
# NOTE: In a production API with multiple workers this should be managed
# per session/request. The tool interface does not easily pass a session id,
# so we default to a process-wide singleton suited to single-user/CLI usage.
class BrowserManager:
    """Process-wide holder of one playwright driver, browser, context and page.

    All state lives in class attributes; the class is never instantiated.
    """

    _instance = None
    _playwright = None
    _browser = None
    _context = None
    _page = None
    # Serializes creation and teardown of the shared browser objects.
    _lock = asyncio.Lock()

    @classmethod
    async def get_page(cls, headless: bool = True) -> Page:
        """Return the shared page, lazily creating whatever is missing.

        Args:
            headless: Passed to ``chromium.launch``. Only takes effect on the
                call that actually launches the browser; later calls reuse
                the already-running browser regardless of this value.

        Returns:
            The shared ``Page``. If the cached page has been closed, a new
            page is opened in the existing context.
        """
        async with cls._lock:
            if cls._playwright is None:
                cls._playwright = await async_playwright().start()

            if cls._browser is None:
                # Standard playwright launch is used for tools to ensure
                # full interactivity.
                cls._browser = await cls._playwright.chromium.launch(headless=headless)

            if cls._context is None:
                cls._context = await cls._browser.new_context(
                    viewport={'width': 1280, 'height': 800},
                    user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
                )

            # A closed page would fail on every subsequent tool call, so
            # replace it instead of handing it out again.
            if cls._page is None or cls._page.is_closed():
                cls._page = await cls._context.new_page()

            return cls._page

    @classmethod
    async def close(cls):
        """Close the context, browser and playwright driver and reset state.

        The cached references are cleared before teardown starts, so a
        failure while closing one object cannot leave stale objects cached,
        and the remaining objects are still closed. Any exception raised
        during teardown propagates to the caller.
        """
        async with cls._lock:
            context, browser, playwright = cls._context, cls._browser, cls._playwright
            cls._context = None
            cls._page = None
            cls._browser = None
            cls._playwright = None

            try:
                if context:
                    await context.close()
            finally:
                try:
                    if browser:
                        await browser.close()
                finally:
                    if playwright:
                        await playwright.stop()


# Helper to run async code synchronously for tool execution
def run_sync(coro):
    """Run ``coro`` to completion on the current event loop and return its result.

    The current loop is reused (rather than creating a fresh one per call)
    so that the browser objects cached in ``BrowserManager`` are driven by
    the same loop across tool calls.

    If the loop is already running (e.g. inside FastAPI) a blocking
    ``run_until_complete`` is normally not allowed; ``nest_asyncio`` is
    applied to permit re-entrant use. This is a limitation of the tools
    being synchronous functions that must return a string immediately.

    Raises:
        ImportError: If the loop is already running and ``nest_asyncio`` is
            not installed.
    """
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    if loop.is_running():
        try:
            import nest_asyncio
        except ImportError:
            # Avoid a "coroutine was never awaited" warning on the way out.
            coro.close()
            raise
        nest_asyncio.apply()

    return loop.run_until_complete(coro)


def _run_page_action(action: Callable[[Page], Awaitable[str]]) -> str:
    """Run ``action`` against the shared page, mapping failures to an error string.

    Args:
        action: Async callable that receives the shared page and returns the
            tool's result string.

    Returns:
        The action's result, or ``"Error: <message>"`` if acquiring the page
        or running the action raised an ``Exception``.
    """
    async def _runner() -> str:
        try:
            page = await BrowserManager.get_page()
            return await action(page)
        except Exception as e:
            # Tools report failures to the agent as text instead of raising.
            logger.warning("Browser tool action failed: %s", e)
            return f"Error: {str(e)}"

    return run_sync(_runner())


# NOTE: The docstrings of the @tool functions below are kept short and
# unchanged on purpose: they may be used at runtime as the tool descriptions.
# Additional documentation is written as comments. The `use_persistent`
# parameter is accepted by every tool but not read by any of them.

@tool
def browse_and_extract(url: str, selector: str = "body", use_persistent: bool = True) -> str:
    """Browse to a URL and extract text content from the specified CSS selector."""
    async def _action(page: Page) -> str:
        await page.goto(url, wait_until="domcontentloaded")

        # Give a specific selector up to 5s to appear; this is best-effort,
        # the query below decides whether the element is really there.
        if selector != "body":
            try:
                await page.wait_for_selector(selector, timeout=5000)
            except Exception as e:
                logger.debug("Selector %r did not appear in time: %s", selector, e)

        element = await page.query_selector(selector)
        if element:
            return await element.inner_text()
        return f"Element '{selector}' not found."

    return _run_page_action(_action)


@tool
def click_element(url: str, selector: str, use_persistent: bool = True) -> str:
    """Click an element on the page identified by the CSS selector. URL is ignored if persistent session is active."""
    async def _action(page: Page) -> str:
        # Only navigate when the shared page has not been used yet; otherwise
        # stay on whatever page the session is currently on.
        if page.url == "about:blank" and url:
            await page.goto(url)

        await page.click(selector)
        return "Clicked element."

    return _run_page_action(_action)


@tool
def fill_field(url: str, selector: str, text: str, use_persistent: bool = True) -> str:
    """Fill a text field or form element identified by the CSS selector with the provided text."""
    # `url` is not used: the field is filled on the current page.
    async def _action(page: Page) -> str:
        await page.fill(selector, text)
        return f"Filled '{selector}' with text."

    return _run_page_action(_action)


@tool
def execute_javascript(url: str, script: str, use_persistent: bool = True) -> str:
    """Execute custom JavaScript on the page and return the result."""
    # `url` is not used: the script runs on the current page. The script is
    # executed as given, so it must come from a trusted caller.
    async def _action(page: Page) -> str:
        result = await page.evaluate(script)
        return str(result)

    return _run_page_action(_action)


@tool
def get_cookies(url: str, use_persistent: bool = True) -> str:
    """Get all cookies for the current domain in JSON format."""
    # `url` is not used: cookies are read from the shared page's context.
    async def _action(page: Page) -> str:
        cookies = await page.context.cookies()
        return json.dumps(cookies)

    return _run_page_action(_action)


@tool
def set_cookies(url: str, cookies_json: str, use_persistent: bool = True) -> str:
    """Set cookies on the page from a JSON string."""
    # `url` is not used. Invalid JSON is reported as an "Error: ..." string.
    async def _action(page: Page) -> str:
        cookies = json.loads(cookies_json)
        await page.context.add_cookies(cookies)
        return "Cookies set."

    return _run_page_action(_action)


@tool
def scroll_page(url: str, direction: str = "bottom", pixels: float = 500, use_persistent: bool = True) -> str:
    """Scroll the page in a specified direction ('bottom', 'top', 'down', 'up')."""
    # `url` is not used: the current page is scrolled. `pixels` only applies
    # to the relative directions 'down' and 'up'.
    async def _action(page: Page) -> str:
        if direction == "bottom":
            script = "window.scrollTo(0, document.body.scrollHeight)"
        elif direction == "top":
            script = "window.scrollTo(0, 0)"
        elif direction == "down":
            script = f"window.scrollBy(0, {pixels})"
        elif direction == "up":
            # Negate in Python: formatting "-{pixels}" with a negative value
            # would yield "--500", which is a JavaScript syntax error.
            script = f"window.scrollBy(0, {-pixels})"
        else:
            # Do not claim success when nothing was scrolled.
            return (
                f"Error: Unknown direction '{direction}'. "
                "Expected 'bottom', 'top', 'down' or 'up'."
            )

        await page.evaluate(script)
        return f"Scrolled {direction}."

    return _run_page_action(_action)


@tool
def take_screenshot(url: str, full_page: bool = False, use_persistent: bool = True) -> str:
    """Take a screenshot of the current page and return base64 string."""
    # `url` is not used. Only the first 100 characters of the base64 data
    # are included in the returned string.
    async def _action(page: Page) -> str:
        screenshot = await page.screenshot(full_page=full_page)
        return f"Screenshot captured (base64): {base64.b64encode(screenshot).decode()[:100]}..."

    return _run_page_action(_action)


@tool
def get_html_source(url: str, use_persistent: bool = True) -> str:
    """Get the full HTML source code of the current page."""
    # `url` is not used: the source of the current page is returned.
    async def _action(page: Page) -> str:
        return await page.content()

    return _run_page_action(_action)


@tool
def get_page_info(url: str, use_persistent: bool = True) -> str:
    """Get comprehensive page information including title, URL."""
    # `url` is not used. The result is a JSON object with the page title,
    # the current URL and the first 500 characters of the HTML.
    async def _action(page: Page) -> str:
        return json.dumps({
            "title": await page.title(),
            "url": page.url,
            "content_preview": (await page.content())[:500]
        })

    return _run_page_action(_action)


@tool
def wait_for_element(url: str, selector: str, timeout: float = 10, use_persistent: bool = True) -> str:
    """Wait for an element matching the CSS selector to appear on the page."""
    # `url` is not used. `timeout` is given in seconds and converted to the
    # milliseconds expected by `wait_for_selector`.
    async def _action(page: Page) -> str:
        await page.wait_for_selector(selector, timeout=timeout * 1000)
        return f"Element '{selector}' appeared."

    return _run_page_action(_action)


@tool
def task_complete(reason: str) -> str:
    """Call this tool when you have successfully completed the task and have the final data or answer."""
    return f"TASK COMPLETE: {reason}"


@tool
def agent_reflection(thought: str, adaptation_plan: str) -> str:
    """Call this tool to reflect on your progress, especially after an error or unexpected result.
    Explain what you've learned and how you're adapting your strategy."""
    return f"REFLECTION: {thought}\nADAPTATION PLAN: {adaptation_plan}"


def get_all_browser_tools():
    """Returns a list of all browser automation tools."""
    return [
        browse_and_extract,
        click_element,
        fill_field,
        execute_javascript,
        get_cookies,
        set_cookies,
        scroll_page,
        take_screenshot,
        get_html_source,
        get_page_info,
        wait_for_element,
        task_complete,
        agent_reflection,
    ]