# frankystew1's picture
# Upload app.py with huggingface_hub
# afc1327 verified
import json
import time
from typing import Dict, List, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
# Assuming these libraries are available for browser automation and LLM interaction
# from playwright.sync_api import sync_playwright
# import openai # or another LLM API client
# Define enums for status tracking
class Status(str, Enum):
    """Outcome marker shared by sub-goals, actions, validations and tasks.

    The ``str`` mix-in makes every member a real string, so structures that
    contain a ``Status`` can be passed straight to ``json.dumps`` (a plain
    ``Enum`` member raises ``TypeError`` there). Members still compare equal
    to each other exactly as before and keep their ``.value``.
    """

    SUCCESS = "success"
    FAILURE = "failure"
    PENDING = "pending"
# Data classes for structured data handling
@dataclass
class SubGoal:
    """A single step of a browser-automation plan.

    Attributes:
        id: Identifier of the step within its plan.
        description: Description of the action to take.
        expected_state: What the page should look like after completion.
        status: Progress marker for the step; defaults to ``Status.PENDING``.
    """

    id: str
    description: str
    expected_state: str
    status: Status = Status.PENDING
@dataclass
class TaskResult:
    """Result of executing one action or a whole task.

    Attributes:
        status: Whether the action/task succeeded or failed.
        output_data: Optional payload produced by the run (``None`` if absent).
        error_message: Optional human-readable failure reason (``None`` if absent).
    """

    status: Status
    output_data: Optional[Dict] = None
    error_message: Optional[str] = None
# Core modules of the agent
class PlannerModule:
    """
    Planner Module: Breaks down user commands into executable sub-goals.

    The LLM calls are still commented out, so both ``create_plan`` and
    ``replan`` currently return fixed mock plans.
    """

    def __init__(self, llm_client):
        """
        Store the LLM client and start with an empty working memory.

        Args:
            llm_client: Client intended for LLM queries. It is only stored;
                no method in this class calls it yet.
        """
        self.llm_client = llm_client
        self.working_memory = {
            "current_state": "",
            "completed_subgoals": [],
            "pending_subgoals": [],
        }

    def create_plan(self, user_command: str) -> List[SubGoal]:
        """
        Uses LLM to generate a step-by-step plan from the user command.

        Args:
            user_command: The natural-language command to plan for. It is only
                interpolated into the (currently unused) prompt.

        Returns:
            A list of sub-goals. The very same list object is also stored in
            ``working_memory["pending_subgoals"]``, so in-place changes made
            by the caller are visible there.
        """
        # Prompt for the real LLM call below; unused while the mock is returned.
        prompt = f"""
        Convert the following user command into discrete browser automation steps:
        "{user_command}"
        Return a JSON array of steps with these keys:
        - id: unique identifier for the step
        - description: detailed description of the action to take
        - expected_state: what the page should look like after completion
        Example format:
        [
        {{
        "id": "1",
        "description": "Navigate to https://example.com",
        "expected_state": "Homepage with logo visible"
        }},
        {{
        "id": "2",
        "description": "Click on the 'Login' button",
        "expected_state": "Login form appears with username and password fields"
        }}
        ]
        """
        # response = self.llm_client.generate_response(prompt)
        # For demonstration, returning a mock plan
        mock_plan = [
            SubGoal(id="1", description="Navigate to website", expected_state="Page loaded"),
            SubGoal(id="2", description="Click sign-in button", expected_state="Login form visible"),
            SubGoal(id="3", description="Enter credentials", expected_state="User logged in"),
        ]
        self.working_memory["pending_subgoals"] = mock_plan
        return mock_plan

    def replan(self, failed_subgoal: SubGoal, error_reason: str) -> List[SubGoal]:
        """
        Re-generates plan based on failure feedback from Validator.

        Args:
            failed_subgoal: The step that could not be completed.
            error_reason: Why it failed, as reported by the actor or validator.

        Returns:
            Replacement sub-goals for the failed step. Their ids are the
            failed step's id with an ``a``/``b`` suffix, so repeated replans
            do not hand out the same id twice.
        """
        # Prompt for the real LLM call below; unused while the mock is returned.
        prompt = f"""
        The following step failed: "{failed_subgoal.description}"
        Reason: "{error_reason}"
        Generate alternative steps to achieve the same goal.
        """
        # response = self.llm_client.generate_response(prompt)
        # For demonstration, returning a mock replan
        parent_id = failed_subgoal.id
        mock_replan = [
            SubGoal(
                id=f"{parent_id}a",
                description="Click alternative sign-in button",
                expected_state="Login form visible",
            ),
            SubGoal(
                id=f"{parent_id}b",
                description="Wait for 5 seconds and retry click",
                expected_state="Login form visible",
            ),
        ]
        return mock_replan
class ActorModule:
    """
    Actor Module: Executes browser actions based on sub-goals.
    """

    def __init__(self, browser_controller):
        """
        Store the browser controller used to carry out actions.

        Args:
            browser_controller: Object providing ``navigate``,
                ``find_element_by_text``, ``find_element_by_label``,
                ``click`` and ``type``. May be ``None``, in which case every
                action fails.
        """
        self.browser = browser_controller

    def execute_action(self, subgoal: SubGoal) -> TaskResult:
        """
        Performs the specified action in the browser.

        The action is chosen by keyword in the sub-goal description
        (case-insensitive), checked in this order: ``navigate``, ``click``,
        ``enter``. The click target ("Sign In") and the typed credentials are
        fixed placeholders.

        Args:
            subgoal: The step to execute.

        Returns:
            ``TaskResult`` with ``Status.SUCCESS`` when the browser calls
            complete, otherwise ``Status.FAILURE`` with ``error_message`` set.
            A description matching no known keyword is reported as a failure
            rather than as a silent success.
        """
        if self.browser is None:
            return TaskResult(
                status=Status.FAILURE,
                error_message="No browser controller configured",
            )
        try:
            # Parse subgoal description and perform corresponding action
            action = subgoal.description.lower()
            if "navigate" in action:
                # The target is the last whitespace-separated token; split()
                # (unlike split(" ")) ignores trailing whitespace.
                url = subgoal.description.split()[-1]
                self.browser.navigate(url)
            elif "click" in action:
                element = self.browser.find_element_by_text("Sign In")
                self.browser.click(element)
            elif "enter" in action:
                input_field = self.browser.find_element_by_label("username")
                self.browser.type(input_field, "user@example.com")
            else:
                return TaskResult(
                    status=Status.FAILURE,
                    error_message=f"Unsupported action: {subgoal.description!r}",
                )
        except Exception as e:
            # Boundary with the browser driver: any error becomes a failed
            # result so the caller can replan instead of crashing.
            return TaskResult(status=Status.FAILURE, error_message=str(e))
        return TaskResult(status=Status.SUCCESS)
class ValidatorModule:
    """
    Validator Module: Verifies if actions were successful.
    """

    def __init__(self, llm_client):
        """
        Store the LLM client.

        Args:
            llm_client: Client intended for LLM queries. It is only stored;
                the call that would use it is still commented out.
        """
        self.llm_client = llm_client

    def validate(self, subgoal: SubGoal, browser_state: Dict) -> Tuple[Status, str]:
        """
        Compares current browser state with expected state using LLM.

        The LLM call is not wired in yet: the mock below accepts only the
        sub-goal whose id is ``"2"`` and rejects every other one.

        Args:
            subgoal: The step whose outcome is being checked.
            browser_state: Snapshot of the browser. It must be JSON-serializable
                because it is embedded in the prompt with ``json.dumps``.

        Returns:
            A ``(status, message)`` tuple with the validation status and an
            explanatory message.
        """
        # Prompt for the real LLM call below; unused while the mock is returned.
        prompt = f"""
        Goal: {subgoal.description}
        Expected State: {subgoal.expected_state}
        Current State: {json.dumps(browser_state)}
        Has the goal been successfully achieved? Respond with YES or NO followed by reason.
        """
        # response = self.llm_client.generate_response(prompt)
        # For demonstration, returning mock validation
        if subgoal.id == "2":
            return (Status.SUCCESS, "Login form is visible")
        return (Status.FAILURE, "Element not found or page not loaded as expected")
# Main automation agent using planner-actor-validator loop
class SkyvernAgent:
    """
    Main automation agent implementing the Planner-Actor-Validator loop.
    """

    def __init__(self):
        """
        Create the planner, actor and validator.

        No LLM client or browser controller is wired in yet, so all three
        modules are constructed with ``None``.
        """
        # self.llm_client = openai.Client(api_key="YOUR_API_KEY")
        # self.browser_controller = sync_playwright()
        self.planner = PlannerModule(llm_client=None)
        self.actor = ActorModule(browser_controller=None)
        self.validator = ValidatorModule(llm_client=None)
        self.settings = {}

    def run_task(self, user_command: str, settings: Dict) -> TaskResult:
        """
        Main automation loop that coordinates all modules.

        Each iteration takes the next sub-goal, lets the actor execute it and,
        if that worked, lets the validator confirm it. A failure at either
        stage asks the planner for replacement steps, which are put in front
        of the remaining plan.

        Args:
            user_command: The natural-language task to perform.
            settings: Run options. Only ``"max_steps"`` (default 100) and
                ``"data_schema"`` are read here. ``None`` is treated as empty.

        Returns:
            ``TaskResult`` with ``Status.SUCCESS`` only when every sub-goal
            was executed and validated. Otherwise ``Status.FAILURE`` with
            ``error_message`` describing why the run stopped. ``output_data``
            holds the most recent extraction, or ``{}`` if none happened.
        """
        if settings is None:
            settings = {}
        self.settings = settings
        memory = self.planner.working_memory
        plan = self.planner.create_plan(user_command)
        # Keep the planner's view of what is pending tied to the list we pop
        # from; it is re-assigned below whenever replanning rebinds `plan`.
        memory["pending_subgoals"] = plan
        output_data = {}
        last_error = None  # reason for the most recent unresolved failure
        step_count = 0
        max_steps = settings.get("max_steps", 100)
        while plan and step_count < max_steps:
            current_subgoal = plan.pop(0)
            step_count += 1
            # Actor executes the action
            result = self.actor.execute_action(current_subgoal)
            if result.status == Status.SUCCESS:
                # Validator checks if the action was successful
                browser_state = self._get_browser_state()
                validation_status, message = self.validator.validate(current_subgoal, browser_state)
                if validation_status == Status.SUCCESS:
                    current_subgoal.status = Status.SUCCESS
                    memory["completed_subgoals"].append(current_subgoal)
                    last_error = None
                    # Extract data if defined in schema
                    if "data_schema" in settings:
                        output_data = self._extract_data(browser_state, settings["data_schema"])
                    continue
                failure_reason = message
            else:
                failure_reason = result.error_message
            # Replanning when the action or its validation fails
            current_subgoal.status = Status.FAILURE
            last_error = failure_reason or "Sub-goal failed without an error message"
            new_plan = self.planner.replan(current_subgoal, failure_reason)
            plan = new_plan + plan  # Prepend new steps to existing plan
            memory["pending_subgoals"] = plan
        # Return final result
        if plan:
            # Step budget exhausted before the plan was finished.
            error_message = (
                f"Stopped after {step_count} step(s) with "
                f"{len(plan)} sub-goal(s) still pending"
            )
            if last_error is not None:
                error_message = f"{error_message}; last error: {last_error}"
        elif last_error is not None:
            # Nothing left to try, but the last sub-goal failed and replanning
            # offered no alternative: an empty plan alone is not success.
            error_message = last_error
        else:
            return TaskResult(status=Status.SUCCESS, output_data=output_data)
        return TaskResult(
            status=Status.FAILURE,
            output_data=output_data,
            error_message=error_message,
        )

    def _get_browser_state(self) -> Dict:
        """
        Captures current browser state (screenshot, HTML, URL, etc.)

        Returns:
            A dict with ``"url"``, ``"html"`` and ``"screenshot"`` keys. The
            values are fixed placeholders; no browser is queried.
        """
        return {
            "url": "https://example.com",
            "html": "<html>...</html>",
            "screenshot": "base64-encoded-screenshot",
        }

    def _extract_data(self, browser_state: Dict, schema: Dict) -> Dict:
        """
        Extracts data based on provided schema.

        Args:
            browser_state: Snapshot of the browser; must be JSON-serializable.
            schema: Description of the fields to extract; must be
                JSON-serializable.

        Returns:
            The extracted data. Currently a fixed mock dict, because the LLM
            call is still commented out.
        """
        # Using LLM to extract structured data according to schema
        # (prompt is unused while the mock response below is returned)
        prompt = f"""
        Extract data from the following browser state according to this schema:
        Schema: {json.dumps(schema)}
        State: {json.dumps(browser_state)}
        Return a JSON object with extracted data.
        """
        # response = self.llm.generate_response(prompt)
        # return json.loads(response)
        return {"extracted": "data"}  # Mock response
# Example usage
if __name__ == "__main__":
    # Demo run: build the agent, execute one command and print the result.
    agent = SkyvernAgent()
    command = "Add a product to the cart"
    settings = {
        "webhook_url": "https://your-webhook-url.com",
        "proxy_type": "residential",
        "session_id": "session_12345",
        "two_factor_id": "2fa_67890",
        "http_headers": {"User-Agent": "Custom Browser"},
        "publish_workflow": True,
        "max_steps": 50,
        "data_schema": {"product_name": "string", "price": "number"},
        "max_scrolls": 5,
    }
    result = agent.run_task(command, settings)
    # json.dumps cannot encode a plain Enum member, so emit the status by its
    # value; the other fields are copied through unchanged and in order.
    printable = {**result.__dict__, "status": result.status.value}
    print(json.dumps(printable, indent=2))