Spaces:

muhammadmaazuddin
/

smgp

No application file

App Files Files Community

muhammadmaazuddin committed on Oct 26, 2025

Commit

16a46a4

1 Parent(s): 4de915b

feat: working on css computed styles on elements

Browse files

Files changed (6) hide show

.gemini/settings.json +11 -0
pyproject.toml +1 -1
src/_agents.py +7 -91
src/agent_dir/browser_agent.py +335 -2
src/model.py +8 -5
uv.lock +0 -0

.gemini/settings.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "mcpServers": {
+    "context7": {
+      "httpUrl": "https://mcp.context7.com/mcp",
+      "headers": {
+        "CONTEXT7_API_KEY": "${CONTEXT7_API_KEY}",
+        "Accept": "application/json, text/event-stream"
+      }
+    }
+  }
+}

pyproject.toml CHANGED Viewed

@@ -6,7 +6,7 @@ readme = "README.md"
 requires-python = ">=3.11"
 dependencies = [
     "beautifulsoup4>=4.13.5",
-    "browser-use>=0.8.0",
     "ddgs>=9.5.5",
     "duckduckgo-search>=8.1.1",
     "fal-client>=0.7.0",

 requires-python = ">=3.11"
 dependencies = [
     "beautifulsoup4>=4.13.5",
+    "browser-use>=0.9.1",
     "ddgs>=9.5.5",
     "duckduckgo-search>=8.1.1",
     "fal-client>=0.7.0",

src/_agents.py CHANGED Viewed

@@ -166,96 +166,11 @@ You are a Browser Agent that must locate, visually verify, and capture a screens
 User's query: Take screenshot of header
 """
 task="""
-You are an advanced Browser Agent specializing in precise element identification and screenshot capture using a multi-strategy approach.
-### Element Discovery and Screenshot Process:
-1. INITIALIZATION PHASE
-   - Ensure page is fully loaded
-   - Handle any popups/cookie notices
-   - Wait for dynamic content to stabilize
-2. ELEMENT DISCOVERY PHASE (Multi-Strategy)
-   Strategy A: Accessibility-First Search
-   - Extract page's accessibility tree
-   - Use semantic matching to find elements matching query
-   - Generate precise element locator based on:
-     • ARIA roles and labels
-     • Semantic HTML structure
-     • Unique identifiers or data attributes
-   Strategy B: Visual Search (Fallback)
-   - Capture full page screenshot
-   - Use vision AI to identify target region
-   - Convert visual coordinates to DOM element
-   - Generate reliable element locator
-3. ELEMENT VERIFICATION PHASE
-   - Take preliminary element screenshot
-   - Verify accuracy using visual confirmation:
-     ```
-     Query: "Does this element match: {user_query}?"
-     Expected Response:
-     {
-       confidence: number (0-100),
-       reasoning: string,
-       matches_criteria: boolean
-     }
-     ```
-   - Success Criteria:
-     • Confidence score > 80%
-     • Element boundaries exactly match intent
-     • No missing or extra content
-   - If verification fails:
-     • Try alternate strategy
-     • Refine element locator
-     • Log failure reason
-4. SCREENSHOT CAPTURE PHASE
-   - Scroll element into viewport
-   - Add temporary highlight for visual confirmation:
-     ```js
-     (elementLocator) => {
-         const el = document.querySelector(elementLocator);
-         if (el) {
-             el.style.outline = '2px solid #007FFF';
-             el.style.backgroundColor = 'rgba(0, 127, 255, 0.1)';
-             return true;
-         }
-         return false;
-     }
-     ```
-   - Capture element screenshot with padding
-   - Remove highlighting:
-     ```js
-     (elementLocator) => {
-         const el = document.querySelector(elementLocator);
-         if (el) {
-             el.style.outline = '';
-             el.style.backgroundColor = '';
-             return true;
-         }
-         return false;
-     }
-     ```
-   - Save with metadata (timestamp, query, coordinates)
-### Critical Rules:
-1. Always attempt Strategy A (Accessibility) before falling back to Strategy B (Visual)
-2. Require explicit verification before screenshot capture
-3. Maintain clean DOM state - remove all temporary highlights
-4. Log each phase with detailed status and timing
-5. Handle failures gracefully with clear error reporting
-### Error Recovery:
-- If Strategy A fails: Fall back to Strategy B
-- If verification fails: Retry with refined locator
-- If both strategies fail: Report detailed failure analysis
-- Max 3 retry attempts per strategy
-Current task: Find and screenshot the header.
 """
@@ -338,7 +253,7 @@ async def run_search() -> None:
         print(f"✅ Browser started successfully")
         # Use the already opened tab and navigate if needed
-        target_url = "https://github.com/pricing"
         print(f'🌐 Navigating to {target_url} in the first tab...')
         page = await browser.get_current_page()
         await page.goto(target_url)
@@ -350,7 +265,8 @@ async def run_search() -> None:
         print('🔄 Creating Browser Agent with pre-navigated browser...')
         browser_agent = AgentBrowser(
             task=task,
-            llm=get_model("browser_agent_openrouter:google/gemini-2.5-flash"),
             use_vision=True,
             generate_gif=False,
             max_failures=3,

 User's query: Take screenshot of header
 """
+# specializing in precise element identification and screenshot capture using a multi-strategy approach
+#First, scroll to the bottom of the page to ensure all content is loaded, then scroll back to the top. After that
 task="""
+You are an advanced Browser Agent .
+Task : Extract colors from the webpage and return a Colors object.
 """
         print(f"✅ Browser started successfully")
         # Use the already opened tab and navigate if needed
+        target_url = "http://denovers.com/"
         print(f'🌐 Navigating to {target_url} in the first tab...')
         page = await browser.get_current_page()
         await page.goto(target_url)
         print('🔄 Creating Browser Agent with pre-navigated browser...')
         browser_agent = AgentBrowser(
             task=task,
+            # llm=get_model("browser_agent_openrouter:google/gemini-2.5-flash"),
+            llm=get_model("llm_browser_google"),
             use_vision=True,
             generate_gif=False,
             max_failures=3,

src/agent_dir/browser_agent.py CHANGED Viewed

@@ -8,7 +8,7 @@ import base64
 import asyncio
 from PIL import Image
 from datetime import datetime
-from typing import Optional, List
 from urllib.parse import urlparse
 from pydantic import BaseModel, Field, conint
 from playwright.async_api import TimeoutError as PlaywrightTimeoutError
@@ -16,7 +16,8 @@ from browser_use import Agent as AgentBrowser, ChatGoogle, ChatOpenAI as ChatOpe
 from browser_use.browser import BrowserSession, BrowserProfile
 from utils.chrome_playwright import start_chrome_with_debug_port, connect_playwright_to_cdp
 from browser_use.actor.element import Element as Element_
 # Model definitions for browser interaction
 class PageVisited(BaseModel):
     url: str
@@ -107,6 +108,338 @@ class VerifyElementVisualParams(BaseModel):

 import asyncio
 from PIL import Image
 from datetime import datetime
+from typing import Optional, List, Literal
 from urllib.parse import urlparse
 from pydantic import BaseModel, Field, conint
 from playwright.async_api import TimeoutError as PlaywrightTimeoutError
 from browser_use.browser import BrowserSession, BrowserProfile
 from utils.chrome_playwright import start_chrome_with_debug_port, connect_playwright_to_cdp
 from browser_use.actor.element import Element as Element_
+from browser_use.dom.serializer.serializer import DOMTreeSerializer
+import re
 # Model definitions for browser interaction
 class PageVisited(BaseModel):
     url: str
class ColorElementHint(BaseModel):
    # One element the agent believes is styled with a brand colour.

    # Visible text of the element; used for substring matching elsewhere.
    text: str = Field(
        description="Text content of element (e.g., 'Get Started', 'Sign Up')",
    )
    # Candidate HTML tag names for the element.
    tags: List[str] = Field(
        description="Possible HTML tags (e.g., ['button', 'a'])",
    )
    # Which tier of the colour system the element is expected to represent.
    priority: Literal["primary", "secondary", "accent"] = Field(
        description="Color priority level",
    )
class PossibleColorThemeData(BaseModel):
    # Parameter model for the colour-extraction action: the agent's hints
    # about where brand colours are likely to be found.

    # Elements the agent singled out (text, tags and priority per element).
    elements_to_find: List[ColorElementHint] = Field(
        description="List of elements identified by agent that likely have brand colors",
    )
    # Optional extra tag names to consider; None when the agent has none.
    additional_tag_patterns: Optional[List[str]] = Field(
        description="Additional tags agent thinks should be checked (e.g., ['span', 'div'])",
        default=None,
    )
def build_search_strategy(params: "PossibleColorThemeData"):
    """Convert the agent-supplied colour hints into a DOM search strategy.

    Args:
        params: Object exposing ``elements_to_find`` (items with ``text``,
            ``tags`` and ``priority`` attributes) and an optional
            ``additional_tag_patterns`` list.

    Returns:
        dict with four keys:
            - ``base_selectors``: fixed list of ``{'tag', 'role'}`` selectors
              that are always searched.
            - ``text_matches``: non-blank hint texts, de-duplicated, in the
              order the agent supplied them.
            - ``priority_map``: hint text -> priority (last hint wins when a
              text is repeated).
            - ``agent_tags``: de-duplicated tag names from the hints followed
              by any additional tag patterns, in first-seen order.
    """
    # Built per call so callers can mutate the returned list safely.
    # NOTE(review): 'submit' is an input *type*, not an ARIA role, so the
    # ('input', 'submit') selector only matches inputs that literally carry
    # role=submit -- confirm the intent.
    base_selectors = [
        {'tag': tag, 'role': role}
        for tag, role in (
            ('a', None),
            ('button', None),
            ('div', 'button'),
            ('span', 'button'),
            ('input', 'submit'),
            ('h1', None),
            ('h2', None),
            ('h3', None),
            ('h4', None),
            ('h5', None),
            ('h6', None),
            ('p', None),
            ('span', None),
            ('div', None),
        )
    ]

    # A blank hint would match every element when used as a substring test,
    # so only hints with real text take part in text matching.
    text_hints = [
        elem for elem in params.elements_to_find
        if elem.text and elem.text.strip()
    ]

    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a set would reorder the tags between runs).
    agent_tags = dict.fromkeys(
        tag
        for elem in params.elements_to_find
        for tag in elem.tags
    )
    agent_tags.update(dict.fromkeys(params.additional_tag_patterns or ()))

    return {
        'base_selectors': base_selectors,
        'text_matches': list(dict.fromkeys(elem.text for elem in text_hints)),
        'priority_map': {elem.text: elem.priority for elem in text_hints},
        'agent_tags': list(agent_tags),
    }
# Matches a serialized element line of the form ``[index]<tag attributes>``.
_ELEMENT_LINE_RE = re.compile(r'\s*\[(\d+)\]<(\w+)([^>]*)>')

# Computed-style properties recorded for every matched element.
# NOTE(review): assumes computed styles expose border colours as the four
# longhand properties -- confirm against the CDP response.
_COLOR_PROPERTY_NAMES = (
    'color',
    'background-color',
    'border-top-color',
    'border-right-color',
    'border-bottom-color',
    'border-left-color',
)


def _declares_role(attributes, role):
    """Return True when the serialized attribute string declares ``role``.

    Both ``role="button"`` and ``role=button`` are accepted so the check does
    not depend on whether the serializer quotes attribute values.
    """
    return re.search(rf'\brole=["\']?{re.escape(role)}(?![\w-])', attributes) is not None


def _match_candidate_elements(lines, search_strategy):
    """Pick the elements worth inspecting from the serialized DOM lines.

    Args:
        lines: Non-empty, stripped lines of the serialized DOM.
        search_strategy: dict produced by ``build_search_strategy``.

    Returns:
        list of dicts (``index``, ``tag``, ``text``, ``source``, ``priority``
        and, for agent matches, ``matched_text``), one per element index.
        An agent text match takes precedence over a base-selector match so
        no element is queued twice.
    """
    base_selectors = search_strategy['base_selectors']
    priority_map = search_strategy['priority_map']
    # An empty hint is a substring of every text, so it is ignored.
    hints = [
        (hint, hint.lower())
        for hint in search_strategy['text_matches']
        if hint and hint.strip()
    ]

    candidates = {}
    for position, line in enumerate(lines):
        found = _ELEMENT_LINE_RE.match(line)
        if not found:
            continue
        element_index = int(found.group(1))
        tag = found.group(2)
        attributes = found.group(3)

        # The element's text, when present, is the following line unless that
        # line is itself another indexed element.
        text_content = ''
        if position + 1 < len(lines):
            following = lines[position + 1].strip()
            if not following.startswith('['):
                text_content = following

        lowered_text = text_content.lower()
        matched_hint = next(
            (hint for hint, lowered_hint in hints if lowered_hint in lowered_text),
            None,
        )
        if matched_hint is not None:
            candidates[element_index] = {
                'index': element_index,
                'tag': tag,
                'text': text_content,
                'source': 'agent',
                'priority': priority_map.get(matched_hint),
                'matched_text': matched_hint,
            }
            continue

        matches_base = any(
            tag == base['tag']
            and (base['role'] is None or _declares_role(attributes, base['role']))
            for base in base_selectors
        )
        if matches_base:
            candidates.setdefault(element_index, {
                'index': element_index,
                'tag': tag,
                'text': text_content,
                'source': 'base',
                'priority': None,
            })
    return list(candidates.values())


def _color_properties(computed_style_response):
    """Extract the colour-related entries from a computed-style response.

    Expects the ``{'computedStyle': [{'name': ..., 'value': ...}, ...]}``
    shape and returns ``{property name: raw CSS value}`` for the names in
    ``_COLOR_PROPERTY_NAMES``.
    """
    return {
        prop['name']: prop.get('value')
        for prop in computed_style_response.get('computedStyle', [])
        if prop.get('name') in _COLOR_PROPERTY_NAMES
    }


@tools.action(
    description="""Extracts the complete color system from the current webpage for brand guidelines.
    This action identifies and extracts brand colors by analyzing interactive elements
    (buttons, links, CTAs) and their styling. It combines hardcoded element patterns
    with AI-identified color hints to find primary, secondary, and accent brand colors.
    Process:
    1. Takes agent-provided hints about elements with brand colors (text + tags)
    2. Searches DOM using both base selectors and agent hints
    3. Extracts computed colors from matching elements
    4. Returns the raw computed colors per element (ranking is left to the caller)
    Args:
        params (PossibleColorThemeData): Contains:
            - elements_to_find: List of elements agent identified (text, tags, priority)
            - additional_tag_patterns: Extra tags to search (optional)
        browser_session (BrowserSession): The active browser session
    Returns:
        str: JSON object {
            "elements": [{"index": int, "tag": str, "text": str, "source": "base" or "agent",
                          "priority": str or null, "colors": {"color": "...", "background-color": "...", ...}}],
            "error": null
        } """,
    param_model=PossibleColorThemeData,
)
async def extract_color_system(params, browser_session: BrowserSession):
    """Collect the computed colours of candidate brand elements on the page.

    Args:
        params: Agent hints (validated against ``PossibleColorThemeData``).
        browser_session: Active browser session whose current page is read.

    Returns:
        JSON string with an ``elements`` list (one record per element whose
        styles could be read) and an ``error`` field, currently always null.
        Elements whose node cannot be resolved or queried are skipped.
    """
    import json  # local import: only this action needs it

    page = await browser_session.get_current_page()
    await page._ensure_session()
    await page._client.send.CSS.enable(session_id=page._session_id)
    # Request the root document (depth 1) before resolving backend node ids
    # to live node ids further down.
    await page._client.send.DOM.getDocument(
        params={'depth': 1},
        session_id=page._session_id,
    )

    enhanced_dom_tree = await page.dom_service.get_dom_tree(target_id=page._target_id)
    serialized_dom_state, _ = DOMTreeSerializer(
        enhanced_dom_tree, None, paint_order_filtering=True
    ).serialize_accessible_elements()

    lines = [
        stripped
        for stripped in (
            raw.strip() for raw in serialized_dom_state.llm_representation().split('\n')
        )
        if stripped
    ]
    candidates = _match_candidate_elements(lines, build_search_strategy(params))

    color_data = []
    for candidate in candidates:
        element_index = candidate['index']
        if element_index not in serialized_dom_state.selector_map:
            continue
        element_info = serialized_dom_state.selector_map[element_index]
        try:
            pushed_nodes = await page._client.send.DOM.pushNodesByBackendIdsToFrontend(
                params={'backendNodeIds': [element_info.backend_node_id]},
                session_id=page._session_id,
            )
            node_ids = pushed_nodes.get('nodeIds', [])
            # A node id of 0 means the backend node could not be resolved.
            if not node_ids or node_ids[0] == 0:
                print(f"❌ ERROR: Node with BackendNodeId {element_info.backend_node_id} could not be found in the current DOM tree.")
                continue
            computed_style = await page._client.send.CSS.getComputedStyleForNode(
                params={'nodeId': node_ids[0]},
                session_id=page._session_id,
            )
        except Exception as e:
            # Best effort per element: one failing node must not abort the
            # whole extraction, so report it and move on.
            print(f"❌ ERROR during CDP call for node {element_index}: {e}")
            continue

        color_data.append({
            'index': element_index,
            'tag': candidate['tag'],
            'text': candidate['text'],
            'source': candidate['source'],
            'priority': candidate['priority'],
            'colors': _color_properties(computed_style),
        })

    return json.dumps({'elements': color_data, 'error': None})

src/model.py CHANGED Viewed

@@ -1,6 +1,6 @@
 # LLM client initialization moved from _agents.py
 import os
-from browser_use import ChatGoogle, ChatOpenAI as ChatOpenAIBrowserUse, ChatOpenAI
 from agents import OpenAIChatCompletionsModel, AsyncOpenAI
 from dotenv import load_dotenv, find_dotenv
@@ -19,16 +19,21 @@ DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1"
 GROK_BASE_URL = "https://api.x.ai/v1"
 GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
 openrouter_client = AsyncOpenAI(base_url=OPENROUTER_BASE_URL, api_key=openrouter_api_key)
 # deepseek_client = AsyncOpenAI(base_url=DEEPSEEK_BASE_URL, api_key=deepseek_api_key)
 # grok_client = AsyncOpenAI(base_url=GROK_BASE_URL, api_key=grok_api_key)
-gemini_client = AsyncOpenAI(base_url=GEMINI_BASE_URL, api_key=google_api_key)
 openai_client = AsyncOpenAI(api_key=openai_api_key)
 def get_model(model_name: str) -> ChatGoogle | ChatOpenAIBrowserUse | OpenAIChatCompletionsModel | str:
     if model_name.startswith("openrouter:"):
         # Use the text after ':' as the model name
@@ -51,8 +56,6 @@ def get_model(model_name: str) -> ChatGoogle | ChatOpenAIBrowserUse | OpenAIChat
     #     return OpenAIChatCompletionsModel(model=model_name, openai_client=grok_client)
     elif "gpt" in model_name:
         return OpenAIChatCompletionsModel(model=model_name, openai_client=openai_client)
-    elif "gemini" in model_name:
-        return OpenAIChatCompletionsModel(model=model_name, openai_client=gemini_client)
     else:
         return model_name

 # LLM client initialization moved from _agents.py
 import os
+from browser_use import ChatGoogle, ChatOpenAI as ChatOpenAIBrowserUse
 from agents import OpenAIChatCompletionsModel, AsyncOpenAI
 from dotenv import load_dotenv, find_dotenv
 GROK_BASE_URL = "https://api.x.ai/v1"
 GEMINI_BASE_URL = "https://generativelanguage.googleapis.com/v1beta/openai/"
 OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
+# QWEN_BASE_URL = 'https://dashscope-intl.aliyuncs.com/compatible-mode/v1'
 openrouter_client = AsyncOpenAI(base_url=OPENROUTER_BASE_URL, api_key=openrouter_api_key)
 # deepseek_client = AsyncOpenAI(base_url=DEEPSEEK_BASE_URL, api_key=deepseek_api_key)
 # grok_client = AsyncOpenAI(base_url=GROK_BASE_URL, api_key=grok_api_key)
+# gemini_client = AsyncOpenAI(base_url=GEMINI_BASE_URL, api_key=google_api_key)
 openai_client = AsyncOpenAI(api_key=openai_api_key)
+# llm = ChatOpenAI(model='qwen-vl-max', api_key=api_key, base_url=base_url)
 def get_model(model_name: str) -> ChatGoogle | ChatOpenAIBrowserUse | OpenAIChatCompletionsModel | str:
     if model_name.startswith("openrouter:"):
         # Use the text after ':' as the model name
     #     return OpenAIChatCompletionsModel(model=model_name, openai_client=grok_client)
     elif "gpt" in model_name:
         return OpenAIChatCompletionsModel(model=model_name, openai_client=openai_client)
     else:
         return model_name

uv.lock CHANGED Viewed

The diff for this file is too large to render. See raw diff