"""Demonstration of a Planner-Actor-Validator browser automation agent.

The LLM and browser interactions in this file are mocked: the planner and
validator return canned answers and the agent reports a canned browser state.
"""
import json
import time
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional, Tuple

# Assuming these libraries are available for browser automation and LLM
# interaction:
# from playwright.sync_api import sync_playwright
# import openai  # or another LLM API client


class Status(Enum):
    """Outcome of a sub-goal, an action, or a whole task."""

    SUCCESS = "success"
    FAILURE = "failure"
    PENDING = "pending"


@dataclass
class SubGoal:
    """One step of a plan.

    ``status`` starts as ``Status.PENDING``; ``SkyvernAgent.run_task`` sets it
    to ``SUCCESS`` or ``FAILURE`` once the step has been attempted.
    """

    id: str
    description: str
    expected_state: str
    status: Status = Status.PENDING


@dataclass
class TaskResult:
    """Result of an action or of a whole task run."""

    status: Status
    output_data: Optional[Dict] = None
    error_message: Optional[str] = None

    def to_dict(self) -> Dict:
        """Return a plain-dict view of the result.

        ``status`` is replaced by its string value because ``json.dumps``
        cannot serialize ``Enum`` members.
        """
        return {
            "status": self.status.value,
            "output_data": self.output_data,
            "error_message": self.error_message,
        }


class PlannerModule:
    """
    Planner Module: Breaks down user commands into executable sub-goals.
    """

    def __init__(self, llm_client):
        self.llm_client = llm_client
        self.working_memory = {
            "current_state": "",
            "completed_subgoals": [],
            "pending_subgoals": [],
        }

    def create_plan(self, user_command: str) -> List[SubGoal]:
        """
        Build a step-by-step plan for the user command.

        The LLM call is not wired up yet, so a fixed three-step mock plan is
        returned. A copy of the plan is stored in
        ``working_memory["pending_subgoals"]``.
        """
        # TODO: send this prompt to the LLM; it is unused while the plan is mocked.
        prompt = f"""
        Convert the following user command into discrete browser automation steps:
        "{user_command}"

        Return a JSON array of steps with these keys:
        - id: unique identifier for the step
        - description: detailed description of the action to take
        - expected_state: what the page should look like after completion

        Example format:
        [
            {{
                "id": "1",
                "description": "Navigate to https://example.com",
                "expected_state": "Homepage with logo visible"
            }},
            {{
                "id": "2",
                "description": "Click on the 'Login' button",
                "expected_state": "Login form appears with username and password fields"
            }}
        ]
        """
        # response = self.llm_client.generate_response(prompt)

        # For demonstration, returning a mock plan
        mock_plan = [
            SubGoal(id="1", description="Navigate to website", expected_state="Page loaded"),
            SubGoal(id="2", description="Click sign-in button", expected_state="Login form visible"),
            SubGoal(id="3", description="Enter credentials", expected_state="User logged in"),
        ]
        # Store a copy so callers mutating the returned list do not silently
        # change the planner's working memory.
        self.working_memory["pending_subgoals"] = list(mock_plan)
        return mock_plan

    def replan(self, failed_subgoal: SubGoal, error_reason: str) -> List[SubGoal]:
        """
        Produce alternative steps after a sub-goal failed.

        The LLM call is not wired up yet, so the same two mock steps are
        returned whatever the failed sub-goal or reason was.
        """
        # TODO: send this prompt to the LLM; it is unused while the replan is mocked.
        prompt = f"""
        The following step failed: "{failed_subgoal.description}"
        Reason: "{error_reason}"
        Generate alternative steps to achieve the same goal.
        """
        # response = self.llm_client.generate_response(prompt)

        # For demonstration, returning a mock replan
        mock_replan = [
            SubGoal(
                id="2a",
                description="Click alternative sign-in button",
                expected_state="Login form visible",
            ),
            SubGoal(
                id="2b",
                description="Wait for 5 seconds and retry click",
                expected_state="Login form visible",
            ),
        ]
        return mock_replan


class ActorModule:
    """
    Actor Module: Executes browser actions based on sub-goals.
    """

    def __init__(self, browser_controller):
        self.browser = browser_controller

    def execute_action(self, subgoal: SubGoal) -> TaskResult:
        """
        Perform the browser action described by the sub-goal.

        The action is chosen by keyword ("navigate", "click", "enter") in the
        lower-cased description. Returns a ``TaskResult`` with
        ``Status.SUCCESS`` when the browser calls complete, and
        ``Status.FAILURE`` with an ``error_message`` when no browser is
        configured, the description matches no known action, or a browser
        call raises.
        """
        if self.browser is None:
            return TaskResult(
                status=Status.FAILURE,
                error_message="No browser controller configured",
            )

        action = subgoal.description.lower()
        try:
            if "navigate" in action:
                # The target is taken to be the last whitespace-separated
                # token; split() (not split(" ")) ignores trailing spaces.
                url = subgoal.description.split()[-1]
                self.browser.navigate(url)
            elif "click" in action:
                # Placeholder target: the demo always clicks "Sign In".
                element = self.browser.find_element_by_text("Sign In")
                self.browser.click(element)
            elif "enter" in action:
                # Placeholder field and value used by the demo.
                input_field = self.browser.find_element_by_label("username")
                self.browser.type(input_field, "user@example.com")
            else:
                # Nothing was executed, so reporting success would be wrong.
                return TaskResult(
                    status=Status.FAILURE,
                    error_message=f"Unsupported action: {subgoal.description!r}",
                )
        except Exception as e:
            # Any browser error is reported as a failed step so the caller
            # can replan instead of crashing.
            return TaskResult(status=Status.FAILURE, error_message=str(e))
        return TaskResult(status=Status.SUCCESS)


class ValidatorModule:
    """
    Validator Module: Verifies if actions were successful.
    """

    def __init__(self, llm_client):
        self.llm_client = llm_client

    def validate(self, subgoal: SubGoal, browser_state: Dict) -> Tuple[Status, str]:
        """
        Decide whether the sub-goal was achieved in the given browser state.

        Returns a ``(status, message)`` tuple. The LLM call is not wired up
        yet: the mock reports success only for the sub-goal whose id is "2".
        """
        # TODO: send this prompt to the LLM; it is unused while validation is mocked.
        prompt = f"""
        Goal: {subgoal.description}
        Expected State: {subgoal.expected_state}
        Current State: {json.dumps(browser_state)}

        Has the goal been successfully achieved? Respond with YES or NO followed by reason.
        """
        # response = self.llm_client.generate_response(prompt)

        # For demonstration, returning mock validation
        if subgoal.id == "2":
            return (Status.SUCCESS, "Login form is visible")
        return (Status.FAILURE, "Element not found or page not loaded as expected")


class SkyvernAgent:
    """
    Main automation agent implementing the Planner-Actor-Validator loop.
    """

    def __init__(self, llm_client=None, browser_controller=None):
        """
        Wire up the planner, actor and validator.

        Both arguments default to ``None``, which matches the original
        no-argument constructor. Real clients could be created like:
            llm_client = openai.Client(api_key="YOUR_API_KEY")
            browser_controller = sync_playwright()
        """
        self.planner = PlannerModule(llm_client=llm_client)
        self.actor = ActorModule(browser_controller=browser_controller)
        self.validator = ValidatorModule(llm_client=llm_client)
        self.settings = {}

    def run_task(self, user_command: str, settings: Dict) -> TaskResult:
        """
        Run the plan / act / validate loop for a user command.

        Each iteration pops the next sub-goal, executes it and validates it.
        A failed action or validation marks the sub-goal ``FAILURE`` and
        prepends the planner's alternative steps. The loop stops when the plan
        is empty or ``settings["max_steps"]`` (default 100) steps were taken.

        Returns ``Status.SUCCESS`` only when the plan is empty and the last
        attempted step succeeded; otherwise ``Status.FAILURE`` with an
        ``error_message``. ``output_data`` holds the most recent extraction
        when ``settings`` contains ``"data_schema"``, else an empty dict.
        """
        self.settings = settings
        memory = self.planner.working_memory

        # Work on a private copy and register it as the pending list; it is
        # only mutated in place below, so working memory stays in sync.
        plan = list(self.planner.create_plan(user_command))
        memory["pending_subgoals"] = plan

        output_data: Dict = {}
        max_steps = settings.get("max_steps", 100)
        step_count = 0
        last_step_failed = False
        last_error: Optional[str] = None

        while plan and step_count < max_steps:
            current_subgoal = plan.pop(0)
            step_count += 1

            # Actor executes the action
            result = self.actor.execute_action(current_subgoal)
            if result.status == Status.SUCCESS:
                # Validator checks if the action was successful
                browser_state = self._get_browser_state()
                validation_status, message = self.validator.validate(
                    current_subgoal, browser_state
                )
                if validation_status == Status.SUCCESS:
                    current_subgoal.status = Status.SUCCESS
                    memory["completed_subgoals"].append(current_subgoal)
                    # Extract data if defined in schema
                    if "data_schema" in settings:
                        output_data = self._extract_data(
                            browser_state, settings["data_schema"]
                        )
                    last_step_failed = False
                    last_error = None
                    continue
                last_error = message
            else:
                last_error = result.error_message

            # Action or validation failed: ask for alternatives and try them
            # before the rest of the plan.
            last_step_failed = True
            current_subgoal.status = Status.FAILURE
            plan[:0] = self.planner.replan(current_subgoal, last_error)

        if not plan and not last_step_failed:
            return TaskResult(status=Status.SUCCESS, output_data=output_data)

        if plan:
            error_message = (
                f"Stopped after {step_count} step(s) with "
                f"{len(plan)} sub-goal(s) still pending"
            )
            if last_error:
                error_message += f"; last error: {last_error}"
        else:
            # The last step failed and replanning produced no alternatives.
            error_message = last_error or "Last step failed and no alternative steps were produced"
        return TaskResult(
            status=Status.FAILURE,
            output_data=output_data,
            error_message=error_message,
        )

    def _get_browser_state(self) -> Dict:
        """
        Return the current browser state (mocked: fixed url/html/screenshot).
        """
        return {
            "url": "https://example.com",
            "html": "...",
            "screenshot": "base64-encoded-screenshot",
        }

    def _extract_data(self, browser_state: Dict, schema: Dict) -> Dict:
        """
        Extract data from the browser state according to the schema.

        The LLM call is not wired up yet, so a fixed mock dict is returned.
        """
        # TODO: send this prompt to an LLM client (the agent does not hold one
        # itself yet) and return json.loads(response).
        prompt = f"""
        Extract data from the following browser state according to this schema:
        Schema: {json.dumps(schema)}
        State: {json.dumps(browser_state)}

        Return a JSON object with extracted data.
        """
        return {"extracted": "data"}  # Mock response


def main() -> None:
    """Run the example task and print its result as JSON."""
    agent = SkyvernAgent()
    command = "Add a product to the cart"
    settings = {
        "webhook_url": "https://your-webhook-url.com",
        "proxy_type": "residential",
        "session_id": "session_12345",
        "two_factor_id": "2fa_67890",
        "http_headers": {"User-Agent": "Custom Browser"},
        "publish_workflow": True,
        "max_steps": 50,
        "data_schema": {"product_name": "string", "price": "number"},
        "max_scrolls": 5,
    }
    result = agent.run_task(command, settings)
    # result.__dict__ holds a Status enum, which json.dumps rejects.
    print(json.dumps(result.to_dict(), indent=2))


# Example usage
if __name__ == "__main__":
    main()