# smgp / src / _agents.py
# Last commit: "feat working on colors" (75538a9), by muhammadmaazuddin
#type: ignore
from agent_dir.content_agent import content_agent
from agent_dir.media_agent import media_agent, client, post_schema
from agent_dir.web_inspector_agent import WebInspectorAgent
from agent_dir.browser_agent import (
tools,
# ElementScreenshotParams,
# PageVisited,
# WebsiteInfo,
# ContentInfo,
# Colors,
# Typography,
# ButtonStyles,
# HeadingStyles,
# Components,
# DesignSystem,
# Screenshot,
BrowserAgentOutput,
)
# Core imports
import os
import sys
import time
import json
import logging
import asyncio
import aiohttp
import requests
import base64
from datetime import datetime
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
import signal
# Browser setup imports
from browser_use import Agent as AgentBrowser, ChatGoogle, ChatOpenAI as ChatOpenAIBrowserUse
from browser_use.browser import BrowserSession, BrowserProfile
from utils.chrome_playwright import start_chrome_with_debug_port, connect_playwright_to_cdp
# Initialize LLM clients for browser usage
from model import get_model
# Task templates
# Legacy prompt (v1): full content + brand-identity extraction for a fixed
# GitHub pricing query. It is an f-string, so today's date is interpolated
# once, when this module is imported. run_search() in this file does not use it.
task_old_1 = f"""
You are a Browser Intelligence Agent specialized in extracting website content and brand identity assets.
Your goal is to visit the given website URL and return a structured, comprehensive extraction.
Follow these steps strictly:
1. Website Navigation:
- Open the provided URL.
- If a user query is provided, search across multiple related internal pages (navigation links, relevant subpages) that may contain information about the query.
- If no query is provided, focus on the landing page only.
2. Content Extraction:
- If a query is provided:
• Extract and summarize text relevant to the query from all visited pages.
• Provide a coherent summary that highlights key points across pages.
- If no query:
• Extract the full visible text from the landing page.
3. Brand & Design Extraction:
- Identify and extract the brand's visual identity, including:
• Primary and secondary colors (hex codes).
• Extended color palette if available.
• Typography (fonts, weights, styles).
• Design system or style guide elements.
• Social media brand kit details (logos, icons, button styles, heading styles).
4. Screenshots (via custom tools):
- Capture screenshots of **topic-related content** (e.g., pricing tables, signup buttons, hero sections if the query is "pricing plans").
- Capture screenshots of **brand identity elements** (e.g., color swatches, typography samples, buttons, logos, icons, headings).
- Save screenshots with clear, descriptive filenames (e.g., `pricing_table.png`, `signup_button.png`, `primary_colors.png`, `typography_styles.png`).
5. Output:
- Return the extracted content, brand identity data, and screenshot metadata in a clean and structured JSON format.
- Do not include free text or commentary outside the JSON.
Today is {datetime.now().strftime('%Y-%m-%d')}
User's query: Go to https://github.com/pricing and extract content and brand identity assets and screenshots for linkedin post, Topic is pricing plans.
"""
# Legacy prompt (v2): selector discovery -> highlight/verify via `execute_js`
# -> element screenshot, for a fixed GitHub pricing query.
# run_search() in this file does not use it.
task_old_2 = """
###Selector Discovery, Verification & Screenshot Instructions
When identifying selectors for taking elements or sections screenshots:
Verify each selector's element or section, then capture its screenshot immediately after successful verification.
1. **Analyze** the HTML DOM structure of the page to identify potential selectors for the target elements or sections based on the query.
2. **Generate** a list of possible selectors that could uniquely identify each target element.
3. **Locate the Target Section or Element:**
- Identify the element or section that visually and contextually matches the target.
- Focus on the most relevant container or element that directly represents the intended target — not its parent or unrelated siblings.
4. For each candidate selector:
- Use the `"execute_js"` tool to verify that the selector matches exactly the target.
- **Highlight** the matched element by injecting a visible red border (`2px solid red`) or a temporary background color.
5. **Validate the Finalized Selector Against the Query:**
- Once a selector is finalized, confirm that it accurately represents the element or section described in the query.
- Ensure it precisely corresponds to the query intent and does not include unrelated, broader, or nested regions.
6. **Remove injected visual styles or modifications** from the DOM to restore the page to its original state before proceeding to the next selector.
7. **After verification**, immediately **capture a screenshot** of the verified element or section.
8. Continue this process until **all target selectors** have been verified and their screenshots captured.
After successful verification, remove all injected visual styles or temporary DOM modifications.
User's query: Go to https://github.com/pricing and take screenshot of header and pricing details
"""
# Legacy prompt (v3): find -> highlight -> visually verify -> screenshot flow
# built around the `find_element_by_prompt` family of tools.
# run_search() in this file does not use it.
task_old_3 = """
You are a Browser Agent that must locate, visually verify, and capture a screenshot of a webpage section or element based on a natural language query.
### Steps to Follow
1. **Understand the Query**
- Interpret the user's intent (e.g., "header", "footer", "main hero section", "signup form").
- (Optional) gather page context if needed via `extract_content`.
2. **Find the Element**
- Primary: `find_element_by_prompt(query)`
- Fallback / extra probes: use page methods like `get_elements_by_css_selector` or `query_selector` if `find_element_by_prompt` is ambiguous.
3. **Get Element Details**
- Retrieve coordinates and size with `get_bounding_box(selector)`.
- Inspect returned element metadata (id, classes, backend_node_id) from `find_element_by_prompt`.
4. **Highlight for Verification**
- Scroll into view and outline the element using `highlight_element(selector_or_obj)`.
5. **Visually Verify**
- Take a temporary screenshot of the highlighted region with `element_screenshot_clip(clip)` (or `element_screenshot(selectors=[selector])`).
- Ask the visual verifier to confirm with `verify_element_visual(query, screenshot_path)`.
- If verification fails: refine and retry by re-calling `find_element_by_prompt` (or exploring parent/child/sibling via `get_elements_by_css_selector`) — repeat Steps 3–5.
6. **Capture Final Screenshot**
- After verification, capture final image with `element_screenshot({ "selectors": [verified_selector], "highlight": False, "padding": 10 })`.
- Remove temporary highlight (call `highlight_element({"selector": verified_selector, "remove": True})` or similar).
7. **Return Results**
- Return structured output containing: `selector` (from `find_element_by_prompt` / derived), `bounding_box` (from `get_bounding_box`), `screenshot_path` (from `element_screenshot`), and `confidence` (derived from `verify_element_visual`).
### Rules (enforced by the flow)
- Always visually verify before finalizing: use `verify_element_visual`.
- Ensure element is scrolled into view (use `highlight_element`).
- Prefer precise selectors (id, `data-*`, unique class) returned or implied by `find_element_by_prompt`.
- If verification fails, retry up to 3 times by re-invoking `find_element_by_prompt` and refining selectors.
User's query: Go to https://github.com/pricing and take screenshot of header and pricing details
"""
# Legacy prompt (v4): shortened find-and-verify flow that assumes the page is
# already loaded. run_search() in this file does not use it.
task_old_4="""
You are a Browser Agent that must locate, visually verify, and capture a screenshot of a webpage section or element based on a natural language query.
### Steps to Follow
1. **Understand the Query**
- Interpret the user's intent (e.g., "header", "footer", "main hero section", "signup form").
- The page is already loaded, so you don't need to navigate to any URL.
2. **Find the Element**
- Primary: `find_element_by_prompt`
- Pass a detailed natural language description of the element to find, including its visual appearance, position, and any visible text it contains (e.g., 'the login button with the text Sign In').
3. **Visually Verify**
- After finding the element, visually confirm that the correct element was found before proceeding.
User's query: Take screenshot of header
"""
# Browser agent task for extracting color systems.
# This is the prompt run_search() hands to the browser agent; it tells the
# agent to call the `extract_color_system` tool with element hints and then
# to sanity-check the primary/secondary/accent results.
colors_extract_task = """
Extract and verify the complete color system from this webpage.
## Process:
### 1. Scroll & Identify Elements
- Scroll the page to view all sections (header, hero, CTAs, footer)
- Identify the MOST VISUALLY DISTINCT elements for each color category
### 2. Extract Colors with Hints
Call `extract_color_system` with element hints for ALL color types you can identify:
```
extract_color_system({
"elements_to_find": [
# MANDATORY: Brand Colors (3 required)
{"text": "Get Started", "tags": ["button", "a"], "priority": "primary"},
{"text": "Learn More", "tags": ["button"], "priority": "secondary"},
{"text": "New", "tags": ["span", "div"], "priority": "accent"},
# OPTIONAL: Background Color (improve accuracy if hinted)
{"text": "", "tags": ["body", "header", "main"], "priority": "background"},
# OPTIONAL: Text Colors (improve accuracy if hinted)
{"text": "Main Heading", "tags": ["h1", "h2"], "priority": "text-heading"},
{"text": "Body paragraph text", "tags": ["p"], "priority": "text-body"},
{"text": "Subtle caption", "tags": ["small", "span"], "priority": "text-subtle"}
]
})
```
**Priority Types:**
**MANDATORY (must verify):**
- `primary` = Main brand color (brightest CTA, most eye-catching button)
- `secondary` = Supporting color (less prominent actions, links)
- `accent` = Highlight color (small accents, badges, status indicators)
**OPTIONAL (auto-detected with fallback, hints improve accuracy):**
- `background` = Page background color (body, header, main sections)
- `text-heading` = Main heading text color (h1, h2)
- `text-body` = Body paragraph text color (p, span)
- `text-subtle` = Subtle/muted text color (small, captions)
**Tips for Better Results:**
- **Brand Colors (mandatory)**: Use EXACT text from interactive elements (buttons, links)
- **Background (optional)**: Leave text="" for container elements (body, header, main)
- **Text Colors (optional)**: Use sample text content from headings/paragraphs
- Focus on DISTINCT colors (not gray/white/black for brand colors)
- 3-7 hints total is optimal (3 mandatory brand + up to 4 optional background/text)
### 3. Verify Extraction
After extraction, verify the results:
**MANDATORY Checks:**
- ✓ Primary should be the most prominent brand color (main CTA background/color)
- ✓ Primary should NOT be a page background (#1b1f23, #ffffff, etc.)
- ✓ Secondary and accent should be visually distinct from primary
- ✓ All 3 mandatory colors (primary/secondary/accent) must be present
**OPTIONAL Checks (if auto-detected):**
- ✓ Background should be the main page container color
- ✓ Text hierarchy should show heading/body/subtle text colors
- ✓ Check "source" field: "agent-hint" (you provided it) or "auto-detected" (tool found it)
**If mandatory colors are incorrect:**
- Re-call extract_color_system with better element examples for primary/secondary/accent
- Focus on the brightest, most colorful interactive elements
- Avoid selecting text-only or container elements for brand colors
**Optional colors will auto-detect with fallback if not hinted.**
Execute the extraction and verification now.
"""
# Browser started by run_search(); kept at module level so that
# shutdown_browser() can reach it from a signal handler or the __main__ guard.
browser_instance = None


def shutdown_browser(*args):
    """Stop the module-level browser (if any) and clear the reference.

    Used in two ways: registered below as the SIGINT/SIGTERM handler, in
    which case it receives ``(signum, frame)``, and called with no arguments
    as a last-chance cleanup step.

    Args:
        *args: Empty for a direct call; ``(signum, frame)`` when invoked by
            the ``signal`` module.

    Raises:
        KeyboardInterrupt: When invoked as the SIGINT handler, so the
            interrupt still terminates the program instead of being
            swallowed.
        SystemExit: When invoked as the handler for any other signal
            (exit status ``128 + signum``).
    """
    global browser_instance

    browser = browser_instance
    if browser is not None:
        try:
            asyncio.get_running_loop()
            loop_running = True
        except RuntimeError:
            loop_running = False

        if loop_running:
            # asyncio.run() refuses to start inside a running loop, and
            # calling browser.stop() here would only create a coroutine that
            # is never awaited. Leave the stop to code running on that loop.
            print('ℹ️ Event loop is running; browser shutdown is left to the running coroutine')
        else:
            # Clear the reference first so a second call cannot stop the
            # same browser twice.
            browser_instance = None
            try:
                asyncio.run(browser.stop())
                print('✅ Browser stopped via signal handler')
            except Exception as e:
                print(f'⚠️ Error stopping browser via signal handler: {type(e).__name__}: {e}')

    if args:
        # Installing a custom handler replaces Python's default SIGINT
        # behaviour; re-create it so Ctrl+C / SIGTERM still end the process
        # and pending ``finally`` blocks get a chance to run.
        signum = args[0]
        if signum == signal.SIGINT:
            raise KeyboardInterrupt
        raise SystemExit(128 + int(signum))


signal.signal(signal.SIGINT, shutdown_browser)
signal.signal(signal.SIGTERM, shutdown_browser)
def _report_environment() -> None:
    """Print which relevant packages import cleanly and which API-key variables are set."""
    import importlib

    try:
        for package in ('browser_use', 'playwright', 'aiohttp'):
            try:
                mod = importlib.import_module(package)
                print(f"✅ {package} is installed: {getattr(mod, '__version__', 'unknown version')}")
            except ImportError:
                print(f"❌ {package} is NOT installed")
    except Exception as e:
        # A package that fails during import for some other reason must not
        # abort the run; this is a diagnostic only.
        print(f"Error checking packages: {e}")

    # Only presence is reported; the values themselves are never printed.
    for key in ('google_api_key', 'OPENROUTER_API_KEY'):
        if os.environ.get(key):
            print(f"✅ {key} environment variable is set")
        else:
            print(f"❌ {key} environment variable is NOT set")


async def _report_cdp_status() -> None:
    """Print whether a Chrome DevTools endpoint still answers on localhost:9222."""
    try:
        print("Checking if Chrome CDP is still accessible...")
        async with aiohttp.ClientSession() as session:
            async with session.get(
                'http://localhost:9222/json/version',
                timeout=aiohttp.ClientTimeout(total=1),
            ) as response:
                if response.status == 200:
                    print('⚠️ WARNING: Chrome with CDP is still running after cleanup!')
                else:
                    print('✅ Chrome CDP no longer accessible (status code != 200)')
    except Exception:
        # Any failure to connect is reported as "no longer accessible".
        print('✅ Chrome CDP no longer accessible (connection failed)')


async def run_search(
    *,
    target_url: str = "https://github.com/pricing",
    max_steps: int = 15,
) -> None:
    """Start a browser, open ``target_url`` and run the color-extraction agent.

    Steps: print environment diagnostics, start a ``Browser`` with a local,
    non-headless profile, navigate the current page to ``target_url``, run a
    browser agent with ``colors_extract_task``, then stop the browser and
    probe the CDP port. The started browser is published in the module-level
    ``browser_instance`` so ``shutdown_browser()`` can reach it, and the
    reference is cleared again once the browser has been stopped here.

    Args:
        target_url: Page to open before the agent starts.
        max_steps: Step budget passed to the agent's ``run()``.

    Raises:
        Exception: Any error from browser setup, navigation or the agent run
            is logged and re-raised; cleanup still runs.
    """
    global browser_instance
    import traceback

    print('====================================================')
    print('Starting run_search() function')
    print('====================================================')
    _report_environment()

    browser = None
    try:
        # Import Browser from browser_use
        from browser_use import Browser

        # Create browser profile
        print('🔄 Creating browser profile...')
        browser_profile = BrowserProfile(
            is_local=True,
            headless=False,
            launch_args=[
                '--no-first-run',
                '--no-default-browser-check',
                '--disable-extensions',
                '--disable-background-networking',
                '--disable-background-timer-throttling',
                '--disable-backgrounding-occluded-windows',
                '--disable-popup-blocking',
                '--disable-renderer-backgrounding',
                '--force-color-profile=srgb',
                '--metrics-recording-only',
                '--mute-audio',
            ],
        )

        # Create and start the browser
        print('🔄 Creating Browser instance...')
        browser = Browser(browser_profile=browser_profile)
        browser_instance = browser
        print('🚀 Starting browser...')
        await browser.start()
        print("✅ Browser started successfully")

        # Use the already opened tab and navigate if needed
        print(f'🌐 Navigating to {target_url} in the first tab...')
        page = await browser.get_current_page()
        await page.goto(target_url)
        print(f"✅ Page loaded successfully: {target_url}")

        # Optional: Wait a moment for page to fully load
        await asyncio.sleep(2)

        # Build the Browser Agent using the browser instance
        print('🔄 Creating Browser Agent with pre-navigated browser...')
        browser_agent = AgentBrowser(
            task=colors_extract_task,
            # llm=get_model("browser_agent_openrouter:google/gemini-2.5-flash"),
            llm=get_model("llm_browser_google"),
            use_vision=True,
            generate_gif=False,
            max_failures=3,
            file_system_path="./browser_agent_data",
            tools=tools,
            # output_model_schema=BrowserAgentOutput,  # ⚠️ TEMPORARILY DISABLED for testing color extraction
            browser=browser,  # Pass the Browser instance instead of BrowserSession
        )
        print('✅ Browser Agent created with pre-navigated browser')

        print('🚀 Running browser agent...')
        try:
            print(f"Starting browser agent.run() with max_steps={max_steps}")
            history = await browser_agent.run(max_steps=max_steps)
            print("-------------Agent run completed---------------")
            print("Steps executed:", len(history.steps) if hasattr(history, 'steps') else "Unknown")
            print("-------------Final result---------------")
            # print(history.final_result)
        except Exception as run_error:
            print(f'❌ Error during browser agent run: {type(run_error).__name__}: {run_error}')
            print("Detailed traceback:")
            traceback.print_exc()
            raise
    except Exception as e:
        print(f'❌ Error: {e}')
        raise
    finally:
        # Clean up resources in proper order
        print('🧹 Cleaning up resources...')
        try:
            if browser:
                print(f"Attempting to stop browser: {browser}")
                await browser.stop()
                print('✅ Stopped browser')
                # Drop the global reference so a later shutdown_browser()
                # call does not try to stop the same browser again.
                if browser_instance is browser:
                    browser_instance = None
            else:
                print('ℹ️ No browser was created')
        except Exception as e:
            # Best effort: a failed stop must not mask the original error.
            print(f'⚠️ Error stopping browser: {type(e).__name__}: {e}')
            traceback.print_exc()

        # Check if Chrome is still running via CDP
        await _report_cdp_status()
        print('✅ All cleanup complete')
# Script entry point: run the search once on a fresh event loop.
if __name__ == "__main__":
    try:
        asyncio.run(run_search())
    finally:
        # Runs whether run_search() returned or raised; gives the browser a
        # final stop attempt through the module-level handle.
        shutdown_browser()
# NOTE(review): the original indentation was lost; this is assumed to be a
# module-level debug marker that prints on every import/run — confirm.
print('_agents file')