muhammadmaazuddin committed on
Commit
16a46a4
·
1 Parent(s): 4de915b

feat: working on css computed styles on elements

Browse files
.gemini/settings.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mcpServers": {
3
+ "context7": {
4
+ "httpUrl": "https://mcp.context7.com/mcp",
5
+ "headers": {
6
+ "CONTEXT7_API_KEY": "${CONTEXT7_API_KEY}",
7
+ "Accept": "application/json, text/event-stream"
8
+ }
9
+ }
10
+ }
11
+ }
pyproject.toml CHANGED
@@ -6,7 +6,7 @@ readme = "README.md"
6
  requires-python = ">=3.11"
7
  dependencies = [
8
  "beautifulsoup4>=4.13.5",
9
- "browser-use>=0.8.0",
10
  "ddgs>=9.5.5",
11
  "duckduckgo-search>=8.1.1",
12
  "fal-client>=0.7.0",
 
6
  requires-python = ">=3.11"
7
  dependencies = [
8
  "beautifulsoup4>=4.13.5",
9
+ "browser-use>=0.9.1",
10
  "ddgs>=9.5.5",
11
  "duckduckgo-search>=8.1.1",
12
  "fal-client>=0.7.0",
src/_agents.py CHANGED
@@ -166,96 +166,11 @@ You are a Browser Agent that must locate, visually verify, and capture a screens
166
  User's query: Take screenshot of header
167
  """
168
 
 
 
169
  task="""
170
- You are an advanced Browser Agent specializing in precise element identification and screenshot capture using a multi-strategy approach.
171
-
172
- ### Element Discovery and Screenshot Process:
173
-
174
- 1. INITIALIZATION PHASE
175
- - Ensure page is fully loaded
176
- - Handle any popups/cookie notices
177
- - Wait for dynamic content to stabilize
178
-
179
- 2. ELEMENT DISCOVERY PHASE (Multi-Strategy)
180
-
181
- Strategy A: Accessibility-First Search
182
- - Extract page's accessibility tree
183
- - Use semantic matching to find elements matching query
184
- - Generate precise element locator based on:
185
- • ARIA roles and labels
186
- • Semantic HTML structure
187
- • Unique identifiers or data attributes
188
-
189
- Strategy B: Visual Search (Fallback)
190
- - Capture full page screenshot
191
- - Use vision AI to identify target region
192
- - Convert visual coordinates to DOM element
193
- - Generate reliable element locator
194
-
195
- 3. ELEMENT VERIFICATION PHASE
196
- - Take preliminary element screenshot
197
- - Verify accuracy using visual confirmation:
198
- ```
199
- Query: "Does this element match: {user_query}?"
200
- Expected Response:
201
- {
202
- confidence: number (0-100),
203
- reasoning: string,
204
- matches_criteria: boolean
205
- }
206
- ```
207
- - Success Criteria:
208
- • Confidence score > 80%
209
- • Element boundaries exactly match intent
210
- • No missing or extra content
211
- - If verification fails:
212
- • Try alternate strategy
213
- • Refine element locator
214
- • Log failure reason
215
-
216
- 4. SCREENSHOT CAPTURE PHASE
217
- - Scroll element into viewport
218
- - Add temporary highlight for visual confirmation:
219
- ```js
220
- (elementLocator) => {
221
- const el = document.querySelector(elementLocator);
222
- if (el) {
223
- el.style.outline = '2px solid #007FFF';
224
- el.style.backgroundColor = 'rgba(0, 127, 255, 0.1)';
225
- return true;
226
- }
227
- return false;
228
- }
229
- ```
230
- - Capture element screenshot with padding
231
- - Remove highlighting:
232
- ```js
233
- (elementLocator) => {
234
- const el = document.querySelector(elementLocator);
235
- if (el) {
236
- el.style.outline = '';
237
- el.style.backgroundColor = '';
238
- return true;
239
- }
240
- return false;
241
- }
242
- ```
243
- - Save with metadata (timestamp, query, coordinates)
244
-
245
- ### Critical Rules:
246
- 1. Always attempt Strategy A (Accessibility) before falling back to Strategy B (Visual)
247
- 2. Require explicit verification before screenshot capture
248
- 3. Maintain clean DOM state - remove all temporary highlights
249
- 4. Log each phase with detailed status and timing
250
- 5. Handle failures gracefully with clear error reporting
251
-
252
- ### Error Recovery:
253
- - If Strategy A fails: Fall back to Strategy B
254
- - If verification fails: Retry with refined locator
255
- - If both strategies fail: Report detailed failure analysis
256
- - Max 3 retry attempts per strategy
257
-
258
- Current task: Find and screenshot the header.
259
  """
260
 
261
 
@@ -338,7 +253,7 @@ async def run_search() -> None:
338
  print(f"✅ Browser started successfully")
339
 
340
  # Use the already opened tab and navigate if needed
341
- target_url = "https://github.com/pricing"
342
  print(f'🌐 Navigating to {target_url} in the first tab...')
343
  page = await browser.get_current_page()
344
  await page.goto(target_url)
@@ -350,7 +265,8 @@ async def run_search() -> None:
350
  print('🔄 Creating Browser Agent with pre-navigated browser...')
351
  browser_agent = AgentBrowser(
352
  task=task,
353
- llm=get_model("browser_agent_openrouter:google/gemini-2.5-flash"),
 
354
  use_vision=True,
355
  generate_gif=False,
356
  max_failures=3,
 
166
  User's query: Take screenshot of header
167
  """
168
 
169
+ # specializing in precise element identification and screenshot capture using a multi-strategy approach
170
+ #First, scroll to the bottom of the page to ensure all content is loaded, then scroll back to the top. After that
171
  task="""
172
+ You are an advanced Browser Agent .
173
+ Task : Extract colors from the webpage and return a Colors object.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  """
175
 
176
 
 
253
  print(f"✅ Browser started successfully")
254
 
255
  # Use the already opened tab and navigate if needed
256
+ target_url = "http://denovers.com/"
257
  print(f'🌐 Navigating to {target_url} in the first tab...')
258
  page = await browser.get_current_page()
259
  await page.goto(target_url)
 
265
  print('🔄 Creating Browser Agent with pre-navigated browser...')
266
  browser_agent = AgentBrowser(
267
  task=task,
268
+ # llm=get_model("browser_agent_openrouter:google/gemini-2.5-flash"),
269
+ llm=get_model("llm_browser_google"),
270
  use_vision=True,
271
  generate_gif=False,
272
  max_failures=3,
src/agent_dir/browser_agent.py CHANGED
@@ -8,7 +8,7 @@ import base64
8
  import asyncio
9
  from PIL import Image
10
  from datetime import datetime
11
- from typing import Optional, List
12
  from urllib.parse import urlparse
13
  from pydantic import BaseModel, Field, conint
14
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
@@ -16,7 +16,8 @@ from browser_use import Agent as AgentBrowser, ChatGoogle, ChatOpenAI as ChatOpe
16
  from browser_use.browser import BrowserSession, BrowserProfile
17
  from utils.chrome_playwright import start_chrome_with_debug_port, connect_playwright_to_cdp
18
  from browser_use.actor.element import Element as Element_
19
-
 
20
  # Model definitions for browser interaction
21
  class PageVisited(BaseModel):
22
  url: str
@@ -107,6 +108,338 @@ class VerifyElementVisualParams(BaseModel):
107
 
108
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
 
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  import asyncio
9
  from PIL import Image
10
  from datetime import datetime
11
+ from typing import Optional, List, Literal
12
  from urllib.parse import urlparse
13
  from pydantic import BaseModel, Field, conint
14
  from playwright.async_api import TimeoutError as PlaywrightTimeoutError
 
16
  from browser_use.browser import BrowserSession, BrowserProfile
17
  from utils.chrome_playwright import start_chrome_with_debug_port, connect_playwright_to_cdp
18
  from browser_use.actor.element import Element as Element_
19
+ from browser_use.dom.serializer.serializer import DOMTreeSerializer
20
+ import re
21
  # Model definitions for browser interaction
22
  class PageVisited(BaseModel):
23
  url: str
 
108
 
109
 
110
 
111
class ColorElementHint(BaseModel):
    # One agent-supplied hint describing an element that is expected to carry
    # a brand color. Documented with comments (not a docstring) so the
    # generated schema keeps exactly the field descriptions below.

    # Visible text used to find the element in the serialized DOM.
    text: str = Field(
        description="Text content of element (e.g., 'Get Started', 'Sign Up')",
    )
    # Candidate HTML tag names for the element.
    tags: List[str] = Field(
        description="Possible HTML tags (e.g., ['button', 'a'])",
    )
    # Which brand-color slot the agent expects this element to represent.
    priority: Literal["primary", "secondary", "accent"] = Field(
        description="Color priority level",
    )
115
+
116
+
117
+
118
class PossibleColorThemeData(BaseModel):
    # Parameter model for the color-extraction action: the element hints the
    # agent collected, plus optional extra tag names to look at.

    # Hints for elements the agent believes show brand colors.
    elements_to_find: List[ColorElementHint] = Field(description="List of elements identified by agent that likely have brand colors")
    # Extra tag names to consider; None when the agent supplied none.
    additional_tag_patterns: Optional[List[str]] = Field(description="Additional tags agent thinks should be checked (e.g., ['span', 'div'])", default=None)
126
+
127
+
128
def build_search_strategy(params: "PossibleColorThemeData"):
    """Convert the agent-supplied hints into a search strategy dict.

    Args:
        params: Object exposing ``elements_to_find`` (items with ``text``,
            ``tags`` and ``priority`` attributes) and an optional
            ``additional_tag_patterns`` list.

    Returns:
        dict with four keys:
            ``base_selectors``: fixed list of ``{'tag', 'role'}`` selectors
                that are always searched.
            ``text_matches``: hint texts, in the order given.
            ``priority_map``: hint text -> priority (the last hint wins when
                two hints share the same text).
            ``agent_tags``: every tag named by the hints followed by the
                additional patterns, without duplicates, in first-seen order.
    """
    # Hardcoded selectors that are searched regardless of the agent hints.
    # Built per call so callers may mutate the returned list safely.
    base_selectors = [
        {'tag': 'a', 'role': None},
        {'tag': 'button', 'role': None},
        {'tag': 'div', 'role': 'button'},
        {'tag': 'span', 'role': 'button'},
        {'tag': 'input', 'role': 'submit'},
        {'tag': 'h1', 'role': None},
        {'tag': 'h2', 'role': None},
        {'tag': 'h3', 'role': None},
        {'tag': 'h4', 'role': None},
        {'tag': 'h5', 'role': None},
        {'tag': 'h6', 'role': None},
        {'tag': 'p', 'role': None},
        {'tag': 'span', 'role': None},
        {'tag': 'div', 'role': None},
    ]

    hints = params.elements_to_find
    hinted_tags = [tag for elem in hints for tag in elem.tags]
    extra_tags = params.additional_tag_patterns or []

    return {
        'base_selectors': base_selectors,
        'text_matches': [elem.text for elem in hints],
        'priority_map': {elem.text: elem.priority for elem in hints},
        # dict.fromkeys de-duplicates across BOTH sources while keeping a
        # stable first-seen order; de-duplicating with set() before appending
        # the extra tags let duplicates back in and randomized the order.
        'agent_tags': list(dict.fromkeys([*hinted_tags, *extra_tags])),
    }
173
+
174
+
175
+
176
+
177
# Matches one indexed element line of the serialized DOM text:
# "[<index>]<tag attributes>". Compiled once instead of once per line.
_INDEXED_ELEMENT_RE = re.compile(r'\s*\[(\d+)\]<(\w+)([^>]*)>')


def _match_indexed_elements(llm_representation, search_strategy):
    """Select candidate elements from the serialized DOM text.

    Each non-empty line of ``llm_representation`` that looks like
    ``[index]<tag attrs>`` is checked against the agent text hints and the
    base selectors of ``search_strategy``. The line that follows it is taken
    as the element's text when it does not start with ``[``.

    Returns:
        list of dicts (``index``, ``tag``, ``text``, ``source``, ``priority``
        and, for agent matches, ``matched_text``) with at most ONE entry per
        element index. An agent text match takes precedence over a base
        selector match because only the agent match carries a priority.
    """
    stripped = (line.strip(" \t\r\n\f\v") for line in llm_representation.split('\n'))
    lines = [line for line in stripped if line]

    matches_by_index = {}
    for position, line in enumerate(lines):
        parsed = _INDEXED_ELEMENT_RE.match(line)
        if not parsed:
            continue

        element_index = int(parsed.group(1))
        tag = parsed.group(2)
        attributes = parsed.group(3)

        # Text content is expected on the line right after the element line.
        text_content = ''
        if position + 1 < len(lines):
            next_line = lines[position + 1].strip()
            if not next_line.startswith('['):
                text_content = next_line

        # Agent hints (higher priority). Empty hint texts are skipped:
        # "" is a substring of every string and would match every element.
        lowered_text = text_content.lower()
        matched_text = next(
            (
                hint
                for hint in search_strategy['text_matches']
                if hint and hint.lower() in lowered_text
            ),
            None,
        )
        if matched_text is not None:
            matches_by_index[element_index] = {
                'index': element_index,
                'tag': tag,
                'text': text_content,
                'source': 'agent',
                'priority': search_strategy['priority_map'].get(matched_text),
                'matched_text': matched_text,
            }
            continue

        # Base selectors: the tag must match and, when a role is required,
        # the attribute text must contain role="<role>".
        base_hit = any(
            tag == base['tag']
            and (base['role'] is None or f'role="{base["role"]}"' in attributes)
            for base in search_strategy['base_selectors']
        )
        if base_hit:
            # setdefault: never replace an agent match recorded for this index.
            matches_by_index.setdefault(element_index, {
                'index': element_index,
                'tag': tag,
                'text': text_content,
                'source': 'base',
                'priority': None,
            })

    return list(matches_by_index.values())


@tools.action(
    description="""Extracts the complete color system from the current webpage for brand guidelines.

    This action identifies and extracts brand colors by analyzing interactive elements
    (buttons, links, CTAs) and their styling. It combines hardcoded element patterns
    with AI-identified color hints to find primary, secondary, and accent brand colors.

    Process:
    1. Takes agent-provided hints about elements with brand colors (text + tags)
    2. Searches DOM using both base selectors and agent hints
    3. Extracts computed colors from matching elements
    4. Scores and ranks colors by prominence and relevance

    Args:
        params (PossibleColorThemeData): Contains:
            - elements_to_find: List of elements agent identified (text, tags, priority)
            - additional_tag_patterns: Extra tags to search (optional)
        browser_session (BrowserSession): The active browser session

    Returns:
        dict: {
            "primary": {"hex": "#...", "score": float, "examples": [...]},
            "secondary": {"hex": "#...", "score": float, "examples": [...]},
            "accent": {"hex": "#...", "score": float, "examples": [...]},
            "all_colors": [...],  # Top 10 ranked colors
            "error": None or error message
        } """,
    param_model=PossibleColorThemeData,
)
async def extract_color_system(params, browser_session: BrowserSession):
    """Fetch CSS style data over CDP for elements that may carry brand colors.

    Work in progress: the style data is fetched and logged for every matched
    element, but no colors are derived yet and the action returns an empty
    string.

    Args:
        params: Agent hints (validated against ``PossibleColorThemeData`` by
            the action decorator's ``param_model``).
        browser_session: Session whose current page is inspected.

    Returns:
        ``""`` (placeholder until color scoring is implemented).
    """
    print("Extracting color system from the website...--------------------")
    print(params)

    page = await browser_session.get_current_page()

    # Enable the CSS domain and request the document before any node/style
    # queries are issued on this session.
    await page._ensure_session()
    await page._client.send.CSS.enable(session_id=page._session_id)
    await page._client.send.DOM.getDocument(
        params={'depth': 1},  # depth: 1 is usually enough to get the root document
        session_id=page._session_id,
    )

    dom_service = page.dom_service
    enhanced_dom_tree = await dom_service.get_dom_tree(target_id=page._target_id)
    serialized_dom_state, _ = DOMTreeSerializer(
        enhanced_dom_tree, None, paint_order_filtering=True
    ).serialize_accessible_elements()
    llm_representation = serialized_dom_state.llm_representation()

    search_strategy = build_search_strategy(params)
    print(search_strategy)

    matching_indices = _match_indexed_elements(llm_representation, search_strategy)
    print(matching_indices)

    for match in matching_indices:
        element_index = match['index']

        # Only elements present in the selector map can be resolved to a node.
        if element_index not in serialized_dom_state.selector_map:
            continue
        element_info = serialized_dom_state.selector_map[element_index]

        try:
            # Backend node ids have to be pushed to the frontend to obtain the
            # live nodeId that the CSS domain calls expect.
            pushed_nodes = await page._client.send.DOM.pushNodesByBackendIdsToFrontend(
                params={'backendNodeIds': [element_info.backend_node_id]},
                session_id=page._session_id,
            )
            working_node_ids = pushed_nodes.get('nodeIds', [])

            if not working_node_ids or working_node_ids[0] == 0:
                print(f"❌ ERROR: Node with BackendNodeId {element_info.backend_node_id} could not be found in the current DOM tree.")
                continue

            working_node_id = working_node_ids[0]
            print(f"✅ Successfully resolved live NodeId: {working_node_id}")

            # The three style queries are independent, so run them concurrently.
            computed_style, matched_styles, platform_fonts = await asyncio.gather(
                page._client.send.CSS.getComputedStyleForNode(
                    params={'nodeId': working_node_id},
                    session_id=page._session_id,
                ),
                page._client.send.CSS.getMatchedStylesForNode(
                    params={'nodeId': working_node_id},
                    session_id=page._session_id,
                ),
                page._client.send.CSS.getPlatformFontsForNode(
                    params={'nodeId': working_node_id},
                    session_id=page._session_id,
                ),
            )
            print(matched_styles.keys(), platform_fonts)
        except Exception as e:
            # Best effort: a failing element must not abort the whole scan.
            print(f"❌ ERROR during CDP call for node {element_index}: {e}")
            continue

    # TODO: derive hex colors (background, text, border) plus element size and
    # position from the computed style of each match, rank them, and return
    # the resulting color system instead of this placeholder.
    return ""
src/model.py CHANGED
@@ -1,6 +1,6 @@
1
  # LLM client initialization moved from _agents.py
2
  import os
3
- from browser_use import ChatGoogle, ChatOpenAI as ChatOpenAIBrowserUse, ChatOpenAI
4
 
5
  from agents import OpenAIChatCompletionsModel, AsyncOpenAI
6
  from dotenv import load_dotenv, find_dotenv
@@ -19,16 +19,21 @@ DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1"
19
  GROK_BASE_URL = "https://api.x.ai/v1"
20
  GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
21
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
22
-
23
 
24
 
25
  openrouter_client = AsyncOpenAI(base_url=OPENROUTER_BASE_URL, api_key=openrouter_api_key)
26
  # deepseek_client = AsyncOpenAI(base_url=DEEPSEEK_BASE_URL, api_key=deepseek_api_key)
27
  # grok_client = AsyncOpenAI(base_url=GROK_BASE_URL, api_key=grok_api_key)
28
- gemini_client = AsyncOpenAI(base_url=GEMINI_BASE_URL, api_key=google_api_key)
29
  openai_client = AsyncOpenAI(api_key=openai_api_key)
30
 
31
 
 
 
 
 
 
32
  def get_model(model_name: str) -> ChatGoogle | ChatOpenAIBrowserUse | OpenAIChatCompletionsModel | str:
33
  if model_name.startswith("openrouter:"):
34
  # Use the text after ':' as the model name
@@ -51,8 +56,6 @@ def get_model(model_name: str) -> ChatGoogle | ChatOpenAIBrowserUse | OpenAIChat
51
  # return OpenAIChatCompletionsModel(model=model_name, openai_client=grok_client)
52
  elif "gpt" in model_name:
53
  return OpenAIChatCompletionsModel(model=model_name, openai_client=openai_client)
54
- elif "gemini" in model_name:
55
- return OpenAIChatCompletionsModel(model=model_name, openai_client=gemini_client)
56
  else:
57
  return model_name
58
 
 
1
  # LLM client initialization moved from _agents.py
2
  import os
3
+ from browser_use import ChatGoogle, ChatOpenAI as ChatOpenAIBrowserUse
4
 
5
  from agents import OpenAIChatCompletionsModel, AsyncOpenAI
6
  from dotenv import load_dotenv, find_dotenv
 
19
  GROK_BASE_URL = "https://api.x.ai/v1"
20
  GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
21
  OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
22
+ # QWEN_BASE_URL = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1'
23
 
24
 
25
  openrouter_client = AsyncOpenAI(base_url=OPENROUTER_BASE_URL, api_key=openrouter_api_key)
26
  # deepseek_client = AsyncOpenAI(base_url=DEEPSEEK_BASE_URL, api_key=deepseek_api_key)
27
  # grok_client = AsyncOpenAI(base_url=GROK_BASE_URL, api_key=grok_api_key)
28
+ # gemini_client = AsyncOpenAI(base_url=GEMINI_BASE_URL, api_key=google_api_key)
29
  openai_client = AsyncOpenAI(api_key=openai_api_key)
30
 
31
 
32
+
33
+ # llm = ChatOpenAI(model='qwen-vl-max', api_key=api_key, base_url=base_url)
34
+
35
+
36
+
37
  def get_model(model_name: str) -> ChatGoogle | ChatOpenAIBrowserUse | OpenAIChatCompletionsModel | str:
38
  if model_name.startswith("openrouter:"):
39
  # Use the text after ':' as the model name
 
56
  # return OpenAIChatCompletionsModel(model=model_name, openai_client=grok_client)
57
  elif "gpt" in model_name:
58
  return OpenAIChatCompletionsModel(model=model_name, openai_client=openai_client)
 
 
59
  else:
60
  return model_name
61
 
uv.lock CHANGED
The diff for this file is too large to render. See raw diff