Josedcape committed on
Commit
7ee4a95
·
verified ·
1 Parent(s): 6e051ea

Upload 5 files

Browse files
src/utils/__init__.py ADDED
File without changes
src/utils/agent_state.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+ class AgentState:
4
+ _instance = None
5
+
6
+ def __init__(self):
7
+ if not hasattr(self, '_stop_requested'):
8
+ self._stop_requested = asyncio.Event()
9
+ self.last_valid_state = None # store the last valid browser state
10
+
11
+ def __new__(cls):
12
+ if cls._instance is None:
13
+ cls._instance = super(AgentState, cls).__new__(cls)
14
+ return cls._instance
15
+
16
+ def request_stop(self):
17
+ self._stop_requested.set()
18
+
19
+ def clear_stop(self):
20
+ self._stop_requested.clear()
21
+ self.last_valid_state = None
22
+
23
+ def is_stop_requested(self):
24
+ return self._stop_requested.is_set()
25
+
26
+ def set_last_valid_state(self, state):
27
+ self.last_valid_state = state
28
+
29
+ def get_last_valid_state(self):
30
+ return self.last_valid_state
src/utils/default_config_settings.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pickle
3
+ import uuid
4
+ import gradio as gr
5
+
6
+
7
+ def default_config():
8
+ """Prepare the default configuration"""
9
+ return {
10
+ "agent_type": "custom",
11
+ "max_steps": 100,
12
+ "max_actions_per_step": 10,
13
+ "use_vision": True,
14
+ "tool_call_in_content": True,
15
+ "llm_provider": "openai",
16
+ "llm_model_name": "gpt-4o",
17
+ "llm_temperature": 1.0,
18
+ "llm_base_url": "",
19
+ "llm_api_key": "",
20
+ "use_own_browser": os.getenv("CHROME_PERSISTENT_SESSION", "false").lower() == "true",
21
+ "keep_browser_open": False,
22
+ "headless": False,
23
+ "disable_security": True,
24
+ "enable_recording": True,
25
+ "window_w": 1280,
26
+ "window_h": 1100,
27
+ "save_recording_path": "./tmp/record_videos",
28
+ "save_trace_path": "./tmp/traces",
29
+ "save_agent_history_path": "./tmp/agent_history",
30
+ "task": "go to google.com and type 'OpenAI' click search and give me the first url",
31
+ }
32
+
33
+
34
+ def load_config_from_file(config_file):
35
+ """Load settings from a UUID.pkl file."""
36
+ try:
37
+ with open(config_file, 'rb') as f:
38
+ settings = pickle.load(f)
39
+ return settings
40
+ except Exception as e:
41
+ return f"Error loading configuration: {str(e)}"
42
+
43
+
44
+ def save_config_to_file(settings, save_dir="./tmp/webui_settings"):
45
+ """Save the current settings to a UUID.pkl file with a UUID name."""
46
+ os.makedirs(save_dir, exist_ok=True)
47
+ config_file = os.path.join(save_dir, f"{uuid.uuid4()}.pkl")
48
+ with open(config_file, 'wb') as f:
49
+ pickle.dump(settings, f)
50
+ return f"Configuration saved to {config_file}"
51
+
52
+
53
+ def save_current_config(*args):
54
+ current_config = {
55
+ "agent_type": args[0],
56
+ "max_steps": args[1],
57
+ "max_actions_per_step": args[2],
58
+ "use_vision": args[3],
59
+ "tool_call_in_content": args[4],
60
+ "llm_provider": args[5],
61
+ "llm_model_name": args[6],
62
+ "llm_temperature": args[7],
63
+ "llm_base_url": args[8],
64
+ "llm_api_key": args[9],
65
+ "use_own_browser": args[10],
66
+ "keep_browser_open": args[11],
67
+ "headless": args[12],
68
+ "disable_security": args[13],
69
+ "enable_recording": args[14],
70
+ "window_w": args[15],
71
+ "window_h": args[16],
72
+ "save_recording_path": args[17],
73
+ "save_trace_path": args[18],
74
+ "save_agent_history_path": args[19],
75
+ "task": args[20],
76
+ }
77
+ return save_config_to_file(current_config)
78
+
79
+
80
+ def update_ui_from_config(config_file):
81
+ if config_file is not None:
82
+ loaded_config = load_config_from_file(config_file.name)
83
+ if isinstance(loaded_config, dict):
84
+ return (
85
+ gr.update(value=loaded_config.get("agent_type", "custom")),
86
+ gr.update(value=loaded_config.get("max_steps", 100)),
87
+ gr.update(value=loaded_config.get("max_actions_per_step", 10)),
88
+ gr.update(value=loaded_config.get("use_vision", True)),
89
+ gr.update(value=loaded_config.get("tool_call_in_content", True)),
90
+ gr.update(value=loaded_config.get("llm_provider", "openai")),
91
+ gr.update(value=loaded_config.get("llm_model_name", "gpt-4o")),
92
+ gr.update(value=loaded_config.get("llm_temperature", 1.0)),
93
+ gr.update(value=loaded_config.get("llm_base_url", "")),
94
+ gr.update(value=loaded_config.get("llm_api_key", "")),
95
+ gr.update(value=loaded_config.get("use_own_browser", False)),
96
+ gr.update(value=loaded_config.get("keep_browser_open", False)),
97
+ gr.update(value=loaded_config.get("headless", False)),
98
+ gr.update(value=loaded_config.get("disable_security", True)),
99
+ gr.update(value=loaded_config.get("enable_recording", True)),
100
+ gr.update(value=loaded_config.get("window_w", 1280)),
101
+ gr.update(value=loaded_config.get("window_h", 1100)),
102
+ gr.update(value=loaded_config.get("save_recording_path", "./tmp/record_videos")),
103
+ gr.update(value=loaded_config.get("save_trace_path", "./tmp/traces")),
104
+ gr.update(value=loaded_config.get("save_agent_history_path", "./tmp/agent_history")),
105
+ gr.update(value=loaded_config.get("task", "")),
106
+ "Configuration loaded successfully."
107
+ )
108
+ else:
109
+ return (
110
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
111
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
112
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
113
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
114
+ gr.update(), "Error: Invalid configuration file."
115
+ )
116
+ return (
117
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
118
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
119
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
120
+ gr.update(), gr.update(), gr.update(), gr.update(), gr.update(),
121
+ gr.update(), "No file selected."
122
+ )
src/utils/llm.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import pdb
3
+ from langchain_openai import ChatOpenAI
4
+ from langchain_core.globals import get_llm_cache
5
+ from langchain_core.language_models.base import (
6
+ BaseLanguageModel,
7
+ LangSmithParams,
8
+ LanguageModelInput,
9
+ )
10
+ from langchain_core.load import dumpd, dumps
11
+ from langchain_core.messages import (
12
+ AIMessage,
13
+ SystemMessage,
14
+ AnyMessage,
15
+ BaseMessage,
16
+ BaseMessageChunk,
17
+ HumanMessage,
18
+ convert_to_messages,
19
+ message_chunk_to_message,
20
+ )
21
+ from langchain_core.outputs import (
22
+ ChatGeneration,
23
+ ChatGenerationChunk,
24
+ ChatResult,
25
+ LLMResult,
26
+ RunInfo,
27
+ )
28
+ from langchain_core.output_parsers.base import OutputParserLike
29
+ from langchain_core.runnables import Runnable, RunnableConfig
30
+ from langchain_core.tools import BaseTool
31
+
32
+ from typing import (
33
+ TYPE_CHECKING,
34
+ Any,
35
+ Callable,
36
+ Literal,
37
+ Optional,
38
+ Union,
39
+ cast,
40
+ )
41
+
42
class DeepSeekR1ChatOpenAI(ChatOpenAI):
    """ChatOpenAI variant for DeepSeek's R1 ("deepseek-reasoner") model.

    The langchain OpenAI path does not surface the model's extra
    `reasoning_content` field, so requests go through a raw
    `openai.OpenAI` client and are repackaged as AIMessages carrying
    both the answer and the reasoning trace.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        # Raw client kept alongside langchain's internals so the
        # response's reasoning_content is accessible.
        self.client = OpenAI(
            base_url=kwargs.get("base_url"),
            api_key=kwargs.get("api_key")
        )

    @staticmethod
    def _to_chat_messages(input: LanguageModelInput) -> list:
        """Convert langchain messages into OpenAI chat-format dicts."""
        message_history = []
        for input_ in input:
            if isinstance(input_, SystemMessage):
                role = "system"
            elif isinstance(input_, AIMessage):
                role = "assistant"
            else:
                # HumanMessage and anything else is treated as user input.
                role = "user"
            message_history.append({"role": role, "content": input_.content})
        return message_history

    def _complete(self, input: LanguageModelInput) -> AIMessage:
        """Run one chat completion and wrap the reply (plus reasoning) in an AIMessage."""
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=self._to_chat_messages(input)
        )
        # reasoning_content is DeepSeek-specific — assumed present on
        # reasoner replies; TODO confirm against the DeepSeek API docs.
        reasoning_content = response.choices[0].message.reasoning_content
        content = response.choices[0].message.content
        return AIMessage(content=content, reasoning_content=reasoning_content)

    async def ainvoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        # BUG FIX: the original referenced an undefined name `messages`
        # (NameError on every call); it now shares the sync request path.
        # NOTE: the underlying HTTP call is synchronous and will block
        # the event loop for its duration.
        return self._complete(input)

    def invoke(
        self,
        input: LanguageModelInput,
        config: Optional[RunnableConfig] = None,
        *,
        stop: Optional[list[str]] = None,
        **kwargs: Any,
    ) -> AIMessage:
        return self._complete(input)
src/utils/utils.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+ import time
4
+ from pathlib import Path
5
+ from typing import Dict, Optional
6
+
7
+ from langchain_anthropic import ChatAnthropic
8
+ from langchain_google_genai import ChatGoogleGenerativeAI
9
+ from langchain_ollama import ChatOllama
10
+ from langchain_openai import AzureChatOpenAI, ChatOpenAI
11
+ import gradio as gr
12
+
13
+ from .llm import DeepSeekR1ChatOpenAI
14
+
15
+ def get_llm_model(provider: str, **kwargs):
16
+ """
17
+ 获取LLM 模型
18
+ :param provider: 模型类型
19
+ :param kwargs:
20
+ :return:
21
+ """
22
+ if provider == "anthropic":
23
+ if not kwargs.get("base_url", ""):
24
+ base_url = "https://api.anthropic.com"
25
+ else:
26
+ base_url = kwargs.get("base_url")
27
+
28
+ if not kwargs.get("api_key", ""):
29
+ api_key = os.getenv("ANTHROPIC_API_KEY", "")
30
+ else:
31
+ api_key = kwargs.get("api_key")
32
+
33
+ return ChatAnthropic(
34
+ model_name=kwargs.get("model_name", "claude-3-5-sonnet-20240620"),
35
+ temperature=kwargs.get("temperature", 0.0),
36
+ base_url=base_url,
37
+ api_key=api_key,
38
+ )
39
+ elif provider == "openai":
40
+ if not kwargs.get("base_url", ""):
41
+ base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
42
+ else:
43
+ base_url = kwargs.get("base_url")
44
+
45
+ if not kwargs.get("api_key", ""):
46
+ api_key = os.getenv("OPENAI_API_KEY", "")
47
+ else:
48
+ api_key = kwargs.get("api_key")
49
+
50
+ return ChatOpenAI(
51
+ model=kwargs.get("model_name", "gpt-4o"),
52
+ temperature=kwargs.get("temperature", 0.0),
53
+ base_url=base_url,
54
+ api_key=api_key,
55
+ )
56
+ elif provider == "deepseek":
57
+ if not kwargs.get("base_url", ""):
58
+ base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
59
+ else:
60
+ base_url = kwargs.get("base_url")
61
+
62
+ if not kwargs.get("api_key", ""):
63
+ api_key = os.getenv("DEEPSEEK_API_KEY", "")
64
+ else:
65
+ api_key = kwargs.get("api_key")
66
+
67
+ if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
68
+ return DeepSeekR1ChatOpenAI(
69
+ model=kwargs.get("model_name", "deepseek-reasoner"),
70
+ temperature=kwargs.get("temperature", 0.0),
71
+ base_url=base_url,
72
+ api_key=api_key,
73
+ )
74
+ else:
75
+ return ChatOpenAI(
76
+ model=kwargs.get("model_name", "deepseek-chat"),
77
+ temperature=kwargs.get("temperature", 0.0),
78
+ base_url=base_url,
79
+ api_key=api_key,
80
+ )
81
+ elif provider == "gemini":
82
+ if not kwargs.get("api_key", ""):
83
+ api_key = os.getenv("GOOGLE_API_KEY", "")
84
+ else:
85
+ api_key = kwargs.get("api_key")
86
+ return ChatGoogleGenerativeAI(
87
+ model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
88
+ temperature=kwargs.get("temperature", 0.0),
89
+ google_api_key=api_key,
90
+ )
91
+ elif provider == "ollama":
92
+ return ChatOllama(
93
+ model=kwargs.get("model_name", "qwen2.5:7b"),
94
+ temperature=kwargs.get("temperature", 0.0),
95
+ num_ctx=kwargs.get("num_ctx", 32000),
96
+ base_url=kwargs.get("base_url", "http://localhost:11434"),
97
+ )
98
+ elif provider == "azure_openai":
99
+ if not kwargs.get("base_url", ""):
100
+ base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
101
+ else:
102
+ base_url = kwargs.get("base_url")
103
+ if not kwargs.get("api_key", ""):
104
+ api_key = os.getenv("AZURE_OPENAI_API_KEY", "")
105
+ else:
106
+ api_key = kwargs.get("api_key")
107
+ return AzureChatOpenAI(
108
+ model=kwargs.get("model_name", "gpt-4o"),
109
+ temperature=kwargs.get("temperature", 0.0),
110
+ api_version="2024-05-01-preview",
111
+ azure_endpoint=base_url,
112
+ api_key=api_key,
113
+ )
114
+ else:
115
+ raise ValueError(f"Unsupported provider: {provider}")
116
+
117
# Predefined model names for common providers.
# Maps provider id -> selectable model names; the first entry in each list
# is used as the dropdown's default selection.
model_names = {
    "anthropic": ["claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
    "openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"],
    "deepseek": ["deepseek-chat", "deepseek-reasoner"],
    "gemini": ["gemini-2.0-flash-exp", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest", "gemini-1.5-flash-8b-latest", "gemini-2.0-flash-thinking-exp-1219" ],
    "ollama": ["qwen2.5:7b", "llama2:7b"],
    "azure_openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo"]
}
126
+
127
+ # Callback to update the model name dropdown based on the selected provider
128
+ def update_model_dropdown(llm_provider, api_key=None, base_url=None):
129
+ """
130
+ Update the model name dropdown with predefined models for the selected provider.
131
+ """
132
+ # Use API keys from .env if not provided
133
+ if not api_key:
134
+ api_key = os.getenv(f"{llm_provider.upper()}_API_KEY", "")
135
+ if not base_url:
136
+ base_url = os.getenv(f"{llm_provider.upper()}_BASE_URL", "")
137
+
138
+ # Use predefined models for the selected provider
139
+ if llm_provider in model_names:
140
+ return gr.Dropdown(choices=model_names[llm_provider], value=model_names[llm_provider][0], interactive=True)
141
+ else:
142
+ return gr.Dropdown(choices=[], value="", interactive=True, allow_custom_value=True)
143
+
144
+ def encode_image(img_path):
145
+ if not img_path:
146
+ return None
147
+ with open(img_path, "rb") as fin:
148
+ image_data = base64.b64encode(fin.read()).decode("utf-8")
149
+ return image_data
150
+
151
+
152
def get_latest_files(directory: str, file_types: tuple = ('.webm', '.zip')) -> Dict[str, Optional[str]]:
    """Return the newest file of each extension under *directory*.

    :param directory: root directory to search recursively; created (and
        an all-None result returned) when it does not exist yet
    :param file_types: extensions to look for (default: recording/trace
        types).  FIX: the original used a mutable list as the default
        argument; an immutable tuple avoids that pitfall.
    :return: mapping of extension -> newest matching path, or None
    """
    latest_files: Dict[str, Optional[str]] = {ext: None for ext in file_types}

    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        return latest_files

    for file_type in file_types:
        try:
            matches = list(Path(directory).rglob(f"*{file_type}"))
            if matches:
                latest = max(matches, key=lambda p: p.stat().st_mtime)
                # Skip files modified within the last second — they may
                # still be being written (e.g. an in-progress recording).
                if time.time() - latest.stat().st_mtime > 1.0:
                    latest_files[file_type] = str(latest)
        except Exception as e:
            # Best-effort: a scan failure for one extension should not
            # abort the search for the others.
            print(f"Error getting latest {file_type} file: {e}")

    return latest_files
172
+ async def capture_screenshot(browser_context):
173
+ """Capture and encode a screenshot"""
174
+ # Extract the Playwright browser instance
175
+ playwright_browser = browser_context.browser.playwright_browser # Ensure this is correct.
176
+
177
+ # Check if the browser instance is valid and if an existing context can be reused
178
+ if playwright_browser and playwright_browser.contexts:
179
+ playwright_context = playwright_browser.contexts[0]
180
+ else:
181
+ return None
182
+
183
+ # Access pages in the context
184
+ pages = None
185
+ if playwright_context:
186
+ pages = playwright_context.pages
187
+
188
+ # Use an existing page or create a new one if none exist
189
+ if pages:
190
+ active_page = pages[0]
191
+ for page in pages:
192
+ if page.url != "about:blank":
193
+ active_page = page
194
+ else:
195
+ return None
196
+
197
+ # Take screenshot
198
+ try:
199
+ screenshot = await active_page.screenshot(
200
+ type='jpeg',
201
+ quality=75,
202
+ scale="css"
203
+ )
204
+ encoded = base64.b64encode(screenshot).decode('utf-8')
205
+ return encoded
206
+ except Exception as e:
207
+ return None