diff --git a/BrowsingAgent/.DS_Store b/BrowsingAgent/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..187740f5cc0056d6b39d48e49f714143e54e929d Binary files /dev/null and b/BrowsingAgent/.DS_Store differ diff --git a/BrowsingAgent/BrowsingAgent.py b/BrowsingAgent/BrowsingAgent.py new file mode 100644 index 0000000000000000000000000000000000000000..023807dc2f4c2825aba1a7836b00eae44f10df47 --- /dev/null +++ b/BrowsingAgent/BrowsingAgent.py @@ -0,0 +1,184 @@ +import json +import re +import logging +from agency_swarm.agents import Agent +from typing_extensions import override +import base64 +from .tools.SearchAndScrape import SearchAndScrape +from selenium.webdriver.common.by import By +from selenium.webdriver.support.select import Select +from .tools.util import highlight_elements_with_labels, get_web_driver, set_web_driver +from agency_swarm.tools.oai import FileSearch + + +class BrowsingAgent(Agent): + SCREENSHOT_FILE_NAME = "screenshot.jpg" + + def __init__(self, selenium_config=None, **kwargs): + from .tools.util.selenium import set_selenium_config + super().__init__( + name="BrowsingAgent", + description="This agent is designed to perform web searches and navigate web pages.", + instructions=""" + I am a browsing agent that can: + 1. Perform Google searches + 2. Navigate web pages + 3. Take screenshots + 4. Highlight and interact with page elements + + Use my search capabilities to find information and my navigation tools to explore web pages. 
+ """, + files_folder="./files", + schemas_folder="./schemas", + tools=[SearchAndScrape], + tools_folder="./tools", + temperature=0, + max_prompt_tokens=16000, + model="groq/llama-3.3-70b-versatile", + **kwargs + ) + if selenium_config is not None: + set_selenium_config(selenium_config) + + self.prev_message = "" + + @override + def response_validator(self, message): + from .tools.util.selenium import get_web_driver, set_web_driver + from .tools.util import highlight_elements_with_labels, remove_highlight_and_labels + from selenium.webdriver.common.by import By + from selenium.webdriver.support.select import Select + + # Filter out everything in square brackets + filtered_message = re.sub(r'\[.*?\]', '', message).strip() + + if filtered_message and self.prev_message == filtered_message: + raise ValueError("Do not repeat yourself. If you are stuck, try a different approach or search in google for the page you are looking for directly.") + + self.prev_message = filtered_message + + if "[send screenshot]" in message.lower(): + wd = get_web_driver() + remove_highlight_and_labels(wd) + self.take_screenshot() + response_text = "Here is the screenshot of the current web page:" + + elif '[highlight clickable elements]' in message.lower(): + wd = get_web_driver() + highlight_elements_with_labels(wd, 'a, button, div[onclick], div[role="button"], div[tabindex], ' + 'span[onclick], span[role="button"], span[tabindex]') + self._shared_state.set("elements_highlighted", 'a, button, div[onclick], div[role="button"], div[tabindex], ' + 'span[onclick], span[role="button"], span[tabindex]') + + self.take_screenshot() + + all_elements = wd.find_elements(By.CSS_SELECTOR, '.highlighted-element') + + all_element_texts = [element.text for element in all_elements] + + element_texts_json = {} + for i, element_text in enumerate(all_element_texts): + element_texts_json[str(i + 1)] = self.remove_unicode(element_text) + + element_texts_json = {k: v for k, v in element_texts_json.items() if v} + 
+ element_texts_formatted = ", ".join([f"{k}: {v}" for k, v in element_texts_json.items()]) + + response_text = ("Here is the screenshot of the current web page with highlighted clickable elements. \n\n" + "Texts of the elements are: " + element_texts_formatted + ".\n\n" + "Elements without text are not shown, but are available on screenshot. \n" + "Please make sure to analyze the screenshot to find the clickable element you need to click on.") + + elif '[highlight text fields]' in message.lower(): + wd = get_web_driver() + highlight_elements_with_labels(wd, 'input, textarea') + self._shared_state.set("elements_highlighted", "input, textarea") + + self.take_screenshot() + + all_elements = wd.find_elements(By.CSS_SELECTOR, '.highlighted-element') + + all_element_texts = [element.text for element in all_elements] + + element_texts_json = {} + for i, element_text in enumerate(all_element_texts): + element_texts_json[str(i + 1)] = self.remove_unicode(element_text) + + element_texts_formatted = ", ".join([f"{k}: {v}" for k, v in element_texts_json.items()]) + + response_text = ("Here is the screenshot of the current web page with highlighted text fields: \n" + "Texts of the elements are: " + element_texts_formatted + ".\n" + "Please make sure to analyze the screenshot to find the text field you need to fill.") + + elif '[highlight dropdowns]' in message.lower(): + wd = get_web_driver() + highlight_elements_with_labels(wd, 'select') + self._shared_state.set("elements_highlighted", "select") + + self.take_screenshot() + + all_elements = wd.find_elements(By.CSS_SELECTOR, '.highlighted-element') + + all_selector_values = {} + + i = 0 + for element in all_elements: + select = Select(element) + options = select.options + selector_values = {} + for j, option in enumerate(options): + selector_values[str(j)] = option.text + if j > 10: + break + all_selector_values[str(i + 1)] = selector_values + + all_selector_values = {k: v for k, v in all_selector_values.items() if v} + 
all_selector_values_formatted = ", ".join([f"{k}: {v}" for k, v in all_selector_values.items()]) + + response_text = ("Here is the screenshot with highlighted dropdowns. \n" + "Selector values are: " + all_selector_values_formatted + ".\n" + "Please make sure to analyze the screenshot to find the dropdown you need to select.") + + else: + return message + + set_web_driver(wd) + content = self.create_response_content(response_text) + raise ValueError(content) + + def take_screenshot(self): + from .tools.util.selenium import get_web_driver + from .tools.util import get_b64_screenshot + wd = get_web_driver() + screenshot = get_b64_screenshot(wd) + screenshot_data = base64.b64decode(screenshot) + with open(self.SCREENSHOT_FILE_NAME, "wb") as screenshot_file: + screenshot_file.write(screenshot_data) + + def create_response_content(self, response_text): + with open(self.SCREENSHOT_FILE_NAME, "rb") as file: + file_id = self.client.files.create( + file=file, + purpose="vision", + ).id + + content = [ + {"type": "text", "text": response_text}, + { + "type": "image_file", + "image_file": {"file_id": file_id} + } + ] + return content + + # Function to check for Unicode escape sequences + def remove_unicode(self, data): + return re.sub(r'[^\x00-\x7F]+', '', data) + + def run_search_and_scrape(self, query): + """Run the SearchAndScrape tool and process the results.""" + tool = SearchAndScrape(query=query) + result = tool.run() + logging.info(f"Search and Scrape result: {result}") + return result + diff --git a/BrowsingAgent/__init__.py b/BrowsingAgent/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c8568f27896e02c3628f63892988e4414e13dc82 --- /dev/null +++ b/BrowsingAgent/__init__.py @@ -0,0 +1 @@ +from .BrowsingAgent import BrowsingAgent \ No newline at end of file diff --git a/BrowsingAgent/__pycache__/BrowsingAgent.cpython-311.pyc b/BrowsingAgent/__pycache__/BrowsingAgent.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..c9929fe0b734a087a5f3429a26d6b3ce2fc7b8dc Binary files /dev/null and b/BrowsingAgent/__pycache__/BrowsingAgent.cpython-311.pyc differ diff --git a/BrowsingAgent/__pycache__/BrowsingAgent.cpython-313.pyc b/BrowsingAgent/__pycache__/BrowsingAgent.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7cdf50b91cea34fc8bb7f5241ad87167588c1681 Binary files /dev/null and b/BrowsingAgent/__pycache__/BrowsingAgent.cpython-313.pyc differ diff --git a/BrowsingAgent/__pycache__/__init__.cpython-311.pyc b/BrowsingAgent/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48dfed1501360c1e5fe96bc8fbb2eefa86bdf6d1 Binary files /dev/null and b/BrowsingAgent/__pycache__/__init__.cpython-311.pyc differ diff --git a/BrowsingAgent/__pycache__/__init__.cpython-313.pyc b/BrowsingAgent/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1107027367ec828561070e9cba7732e3424676f2 Binary files /dev/null and b/BrowsingAgent/__pycache__/__init__.cpython-313.pyc differ diff --git a/BrowsingAgent/__pycache__/ma.cpython-311.pyc b/BrowsingAgent/__pycache__/ma.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..967c8046ca99e2d28f65fd870ceb505dcc1c458c Binary files /dev/null and b/BrowsingAgent/__pycache__/ma.cpython-311.pyc differ diff --git a/BrowsingAgent/instructions.md b/BrowsingAgent/instructions.md new file mode 100644 index 0000000000000000000000000000000000000000..dbd5986c7a6e587baebff9c6fd0d641f84eb9707 --- /dev/null +++ b/BrowsingAgent/instructions.md @@ -0,0 +1,23 @@ +# Browsing Agent Instructions + +As an advanced browsing agent, you are equipped with specialized tools to navigate and search the web effectively. Your primary objective is to fulfill the user's requests by efficiently utilizing these tools. + +### Primary Instructions: + +1. 
**Search and Scrape**: Use the SearchAndScrape tool to perform Google searches and scrape content using Firecrawl. +2. **Navigating to New Pages**: Always use the `ClickElement` tool to open links when navigating to a new web page from the current source. +3. **Single Page Interaction**: You can only open and interact with one web page at a time. +4. **Requesting Screenshots**: Before using tools that interact with the web page, ask the user to send you the appropriate screenshot. + +### Commands to Request Screenshots: + +- **'[send screenshot]'**: Sends the current browsing window as an image. +- **'[highlight clickable elements]'**: Highlights all clickable elements on the current web page. +- **'[highlight text fields]'**: Highlights all text fields on the current web page. +- **'[highlight dropdowns]'**: Highlights all dropdowns on the current web page. + +### Important Reminders: + +- Only open and interact with one web page at a time. +- Use the SearchAndScrape tool for efficient web searching and content extraction. +- Complete your interactions with the current web page before proceeding to a different source. 
diff --git a/BrowsingAgent/ma.py b/BrowsingAgent/ma.py new file mode 100644 index 0000000000000000000000000000000000000000..b81d2d220cd68560930da4ed9e19b5c4cfb5a3ae --- /dev/null +++ b/BrowsingAgent/ma.py @@ -0,0 +1,522 @@ +import logging +from datetime import datetime +from firecrawl import FirecrawlApp +import os +import time +import google.generativeai as genai +import requests # Import requests for making API calls +from googlesearch import search # Add this import at the top +import json + +# Initialize logging +logging.basicConfig(level=logging.DEBUG) + +# Initialize Firecrawl +FIRECRAWL_API_KEY = "fc-5fadfeae30314d4ea8a3d9afaa75c493" +firecrawl_app = FirecrawlApp(api_key=FIRECRAWL_API_KEY) +logging.info("Firecrawl initialized") + +# Initialize Gemini +GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '') +if GOOGLE_API_KEY: + genai.configure(api_key=GOOGLE_API_KEY) + model = genai.GenerativeModel('gemini-1.5-flash') + logging.info("Gemini initialized") +else: + logging.warning("No Gemini API key found") + +def perform_search(query, use_custom_api=True): + """ + Perform search with fallback mechanism + First tries Custom Search API, then falls back to googlesearch package + """ + try: + if use_custom_api: + # Try Custom Search API first + api_key = "AIzaSyAxeLlJ6vZxOl-TblUJg_dInBS3vNxaFVY" + search_engine_id = "37793b12975da4e35" + url = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={search_engine_id}&q={query}&num=2" + + response = requests.get(url) + if response.status_code == 200: + search_results = response.json().get('items', []) + if search_results: + return [item['link'] for item in search_results] + logging.warning("Custom Search API failed, falling back to googlesearch") + + # Fallback to googlesearch package + logging.info("Using googlesearch package") + return list(search(query, num_results=2, lang="en")) + + except Exception as e: + logging.error(f"Search error: {str(e)}") + return [] + +def scrape_with_retry(url, max_retries=3, 
timeout=15): + """Helper function to scrape URL with retry logic and improved timeout handling""" + # List of problematic domains that often timeout + problematic_domains = [ + 'sparktoro.com', + 'j-jdis.com', + 'linkedin.com', + 'facebook.com', + 'twitter.com', + 'reddit.com', + '.pdf' + ] + + # Skip problematic URLs immediately + if any(domain in url.lower() for domain in problematic_domains): + logging.info(f"Skipping known problematic URL: {url}") + return None + + for attempt in range(max_retries): + try: + # Use shorter timeout for initial attempts + current_timeout = timeout * (attempt + 1) # Increase timeout with each retry + + logging.info(f"Attempting to scrape {url} (timeout: {current_timeout}s)") + + # Add timeout and rate limiting parameters + response = firecrawl_app.scrape_url( + url=url, + params={ + 'formats': ['markdown'], + 'timeout': current_timeout, + 'wait': True, # Enable rate limiting + 'max_retries': 2 # Internal retries + } + ) + + if response and response.get('markdown'): + content = response.get('markdown') + if len(content.strip()) > 200: # Verify content quality + logging.info(f"Successfully scraped {url}") + return content + else: + logging.warning(f"Content too short from {url}") + return None + + except Exception as e: + error_msg = str(e).lower() + wait_time = (attempt + 1) * 5 # Reduced wait times + + if "timeout" in error_msg or "408" in error_msg: + if attempt < max_retries - 1: + logging.warning(f"Timeout error for {url}, attempt {attempt + 1}") + logging.info(f"Waiting {wait_time}s before retry...") + time.sleep(wait_time) + continue + else: + logging.error(f"Final timeout for {url} after {max_retries} attempts") + break + + elif "429" in error_msg: # Rate limit + logging.info(f"Rate limit hit, waiting {wait_time}s...") + time.sleep(wait_time) + continue + + else: + logging.error(f"Error scraping {url}: {error_msg}") + break + + time.sleep(1) # Reduced basic delay + + return None + +def get_trends_data(query): + """Get market 
trends data with improved error handling""" + try: + if not query: + logging.error("No query provided") + return generate_fallback_response("Unknown Business") + + logging.info(f"\n{'='*50}\nGathering trends data for: {query}\n{'='*50}") + + # Define search queries + search_queries = [ + # Market Overview + f"{query} market size revenue statistics analysis", + + # Industry Trends + f"{query} industry trends growth forecast analysis", + + # Competition Analysis + f"{query} market share competitive landscape analysis", + + # Technology & Innovation + f"{query} technology innovation disruption analysis", + + # Future Outlook + f"{query} market future outlook predictions analysis" + ] + + scraped_content = [] + use_custom_api = True + successful_scrapes = 0 + min_required_content = 2 + max_attempts_per_url = 2 + + for search_query in search_queries: + if successful_scrapes >= min_required_content: + break + + try: + logging.info(f"\nSearching for: {search_query}") + search_results = perform_search(search_query, use_custom_api) + + if not search_results and use_custom_api: + use_custom_api = False + search_results = perform_search(search_query, use_custom_api=False) + + if search_results: + attempts = 0 + for url in search_results: + if successful_scrapes >= min_required_content or attempts >= max_attempts_per_url: + break + + content = scrape_with_retry(url, timeout=15) # Reduced initial timeout + if content: + scraped_content.append({ + 'url': url, + 'domain': extract_domain(url), + 'section': 'Market Trends', + 'date': datetime.now().strftime("%Y-%m-%d"), + 'content': content[:2000] + }) + successful_scrapes += 1 + attempts += 1 + + time.sleep(1) # Reduced delay between queries + + except Exception as e: + logging.error(f"Error in search for query '{search_query}': {str(e)}") + continue + + if not scraped_content: + logging.warning("No content scraped, returning fallback response") + return generate_fallback_response(query) + + try: + result = 
process_scraped_content(scraped_content, query) + + # Save analysis to file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = os.path.join('gemini_outputs', f'market_trends_{timestamp}.txt') + + with open(output_file, 'w', encoding='utf-8') as f: + f.write(f"Market Trends Analysis for: {query}\n") + f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write("="*50 + "\n\n") + f.write(json.dumps(result, indent=2)) + f.write("\n\nData Sources:\n") + for source in scraped_content: + f.write(f"- {source['domain']} ({source['date']})\n") + + return result + + except Exception as e: + logging.error(f"Error processing content: {str(e)}") + return generate_fallback_response(query) + + except Exception as e: + logging.error(f"Error during market trends analysis: {str(e)}") + return generate_fallback_response(query) + +def process_scraped_content(scraped_content, query): + try: + # Generate analysis using the scraped content + analysis = generate_analysis(scraped_content, query) + + # Structure the response + result = { + "market_size_growth": { + "total_market_value": extract_bullet_points(analysis, "Market Size"), + "market_segments": extract_bullet_points(analysis, "Market Segments"), + "regional_distribution": extract_bullet_points(analysis, "Regional Distribution") + }, + "competitive_landscape": { + "market_leaders": extract_bullet_points(analysis, "Market Leaders"), + "market_differentiators": extract_bullet_points(analysis, "Market Differentiators"), + "industry_dynamics": extract_bullet_points(analysis, "Industry Dynamics") + }, + "consumer_analysis": { + "segments": extract_bullet_points(analysis, "Consumer Segments"), + "behavior_patterns": extract_bullet_points(analysis, "Behavior Patterns"), + "pain_points": extract_bullet_points(analysis, "Pain Points") + }, + "metrics": extract_metrics(scraped_content), + "sources": [{ + 'url': item['url'], + 'domain': item['domain'], + 'section': item['section'], + 'date': 
item['date'] + } for item in scraped_content] + } + + return result + except Exception as e: + logging.error(f"Error processing scraped content: {str(e)}") + return generate_fallback_response(query) + +def extract_domain(url): + """Extract domain name from URL""" + try: + from urllib.parse import urlparse + domain = urlparse(url).netloc + return domain.replace('www.', '') + except: + return url + +def generate_fallback_response(query): + """Generate fallback response when analysis fails""" + return { + "market_size_growth": { + "total_market_value": [f"Market size analysis for {query} pending (Inferred)"], + "market_segments": ["Market segmentation analysis needed (Inferred)"], + "regional_distribution": ["Regional analysis to be conducted (Inferred)"] + }, + "competitive_landscape": { + "market_leaders": ["Market leader analysis pending (Inferred)"], + "market_differentiators": ["Differentiator analysis needed (Inferred)"], + "industry_dynamics": ["Industry dynamics to be evaluated (Inferred)"] + }, + "consumer_analysis": { + "segments": ["Consumer segmentation pending (Inferred)"], + "behavior_patterns": ["Behavior analysis needed (Inferred)"], + "pain_points": ["Pain point identification required (Inferred)"] + }, + "metrics": {}, + "sources": [] + } + +def process_analysis(analysis, scraped_content): + """Process and structure the analysis for frontend consumption""" + result = { + "market_size_growth": { + "total_market_value": [], + "market_segments": [], + "regional_distribution": [], + "growth_drivers": [] + }, + "competitive_landscape": { + "market_leaders": [], + "market_differentiators": [], + "industry_dynamics": [], + "entry_barriers": [] + }, + "consumer_analysis": { + "segments": [], + "behavior_patterns": [], + "pain_points": [], + "decision_factors": [] + }, + "technology_innovation": { + "current_trends": [], + "emerging_tech": [], + "digital_impact": [], + "innovation_opportunities": [] + }, + "regulatory_environment": { + "key_regulations": [], 
+ "compliance_requirements": [], + "environmental_impact": [], + "sustainability": [] + }, + "future_outlook": { + "growth_forecast": [], + "opportunities": [], + "challenges": [], + "evolution_scenarios": [] + }, + "strategic_recommendations": { + "entry_strategies": [], + "product_development": [], + "tech_investments": [], + "risk_mitigation": [] + }, + "metrics": extract_metrics(scraped_content), + "sources": [] + } + + # Extract sections using more specific patterns + for section in result.keys(): + if section != "metrics" and section != "sources": + for subsection in result[section].keys(): + result[section][subsection] = extract_bullet_points(analysis, subsection.replace('_', ' ').title()) + + return result + +def extract_metrics(scraped_content): + """Extract and structure metrics from scraped content""" + metrics = { + "market_share": {}, + "growth_rates": {}, + "revenue": {} + } + + for item in scraped_content: + if 'metrics' in item: + # Process market share + for i, share in enumerate(item['metrics'].get('market_share', [])): + try: + value = float(share) + metrics['market_share'][f'Company {i+1}'] = value + except ValueError: + continue + + # Process growth rates + for i, rate in enumerate(item['metrics'].get('growth_rates', [])): + try: + value = float(rate) + metrics['growth_rates'][f'Period {i+1}'] = value + except ValueError: + continue + + # Process revenue figures + for i, amount in enumerate(item['metrics'].get('money', [])): + try: + value = float(amount) + metrics['revenue'][f'Entity {i+1}'] = value + except ValueError: + continue + + return metrics + +def extract_bullet_points(text, section_name): + """Extract bullet points from a specific section""" + try: + lines = [] + in_section = False + + for line in text.split('\n'): + if section_name in line: + in_section = True + continue + elif any(s in line for s in [ + "Market Size", "Market Segments", "Regional Distribution", + "Market Leaders", "Market Differentiators", "Industry Dynamics", + 
"Consumer Segments", "Behavior Patterns", "Pain Points", + "Current Trends", "Emerging Technologies", "Growth Forecast", + "Opportunities", "Challenges" + ]): + in_section = False + elif in_section and line.strip().startswith('•'): + cleaned_line = line.strip('• ').strip() + if cleaned_line and not cleaned_line.endswith(':'): + lines.append(cleaned_line) + + return lines if lines else [f"Analysis for {section_name} pending (Inferred)"] + + except Exception as e: + logging.error(f"Error extracting bullet points for {section_name}: {str(e)}") + return [f"Error extracting {section_name} data (Inferred)"] + +def generate_analysis(scraped_content, query): + """Generate market trends analysis using Gemini""" + try: + # Prepare content for analysis + content_text = "\n\n".join([item['content'] for item in scraped_content]) + + # Create the analysis prompt + analysis_prompt = f""" + Task: Analyze the provided content to create a detailed market trends analysis for {query}. + + Content to analyze: + {content_text} + + Please provide a structured analysis covering these exact sections: + + Market Size & Growth: + Market Size: + • [Provide market size estimates with specific numbers where available] + • [Include year-over-year growth rates] + + Market Segments: + • [Identify key market segments] + • [Provide segment-wise breakdown] + + Regional Distribution: + • [Analyze geographical distribution] + • [Identify key markets and growth regions] + + Competitive Landscape: + Market Leaders: + • [List top companies and their market positions] + • [Include market share data where available] + + Market Differentiators: + • [Identify key competitive advantages] + • [Analyze unique selling propositions] + + Industry Dynamics: + • [Analyze industry trends and changes] + • [Identify market drivers and challenges] + + Consumer Analysis: + Consumer Segments: + • [Identify key customer segments] + • [Analyze segment characteristics] + + Behavior Patterns: + • [Analyze purchasing patterns] 
+ • [Identify decision factors] + + Pain Points: + • [List key customer challenges] + • [Identify unmet needs] + + Technology & Innovation: + Current Trends: + • [Identify current technology trends] + • [Analyze adoption rates] + + Emerging Technologies: + • [List emerging technologies] + • [Assess potential impact] + + Future Outlook: + Growth Forecast: + • [Provide growth projections] + • [Identify growth drivers] + + Opportunities: + • [List market opportunities] + • [Identify potential areas for expansion] + + Challenges: + • [Identify market challenges] + • [List potential risks] + + Format each point with specific data where available. + Mark inferences with (Inferred). + Prioritize insights based on confidence and impact. + """ + + # Generate analysis using Gemini + response = model.generate_content(analysis_prompt) + if not response or not response.text: + raise Exception("No response from Gemini") + + analysis = response.text + + # Save raw analysis to file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + raw_output_file = os.path.join('gemini_outputs', f'market_trends_raw_{timestamp}.txt') + + with open(raw_output_file, 'w', encoding='utf-8') as f: + f.write(f"Raw Market Trends Analysis for: {query}\n") + f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") + f.write("="*50 + "\n\n") + f.write("Input Content:\n") + f.write("-"*30 + "\n") + f.write(content_text[:1000] + "...\n\n") + f.write("Generated Analysis:\n") + f.write("-"*30 + "\n") + f.write(analysis) + + return analysis + + except Exception as e: + logging.error(f"Error generating analysis: {str(e)}") + raise \ No newline at end of file diff --git a/BrowsingAgent/requirements.txt b/BrowsingAgent/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e59d3fe55833b680a4156e20d67242a64fe9672 --- /dev/null +++ b/BrowsingAgent/requirements.txt @@ -0,0 +1,5 @@ +selenium +webdriver-manager +selenium_stealth +googlesearch-python +firecrawl \ No 
newline at end of file diff --git a/BrowsingAgent/tools/ClickElement.py b/BrowsingAgent/tools/ClickElement.py new file mode 100644 index 0000000000000000000000000000000000000000..651c19bf55ece223862e0674a2b3411c519bb47b --- /dev/null +++ b/BrowsingAgent/tools/ClickElement.py @@ -0,0 +1,59 @@ +import time + +from pydantic import Field +from selenium.webdriver.common.by import By + +from agency_swarm.tools import BaseTool +from .util import get_web_driver, set_web_driver +from .util.highlights import remove_highlight_and_labels + + +class ClickElement(BaseTool): + """ + This tool clicks on an element on the current web page based on its number. + + Before using this tool make sure to highlight clickable elements on the page by outputting '[highlight clickable elements]' message. + """ + element_number: int = Field( + ..., + description="The number of the element to click on. The element numbers are displayed on the page after highlighting elements.", + ) + + def run(self): + wd = get_web_driver() + + if 'button' not in self._shared_state.get("elements_highlighted", ""): + raise ValueError("Please highlight clickable elements on the page first by outputting '[highlight clickable elements]' message. You must output just the message without calling the tool first, so the user can respond with the screenshot.") + + all_elements = wd.find_elements(By.CSS_SELECTOR, '.highlighted-element') + + # iterate through all elements with a number in the text + try: + element_text = all_elements[self.element_number - 1].text + element_text = element_text.strip() if element_text else "" + # Subtract 1 because sequence numbers start at 1, but list indices start at 0 + try: + all_elements[self.element_number - 1].click() + except Exception as e: + if "element click intercepted" in str(e).lower(): + wd.execute_script("arguments[0].click();", all_elements[self.element_number - 1]) + else: + raise e + + time.sleep(3) + + result = f"Clicked on element {self.element_number}. 
Text on clicked element: '{element_text}'. Current URL is {wd.current_url} To further analyze the page, output '[send screenshot]' command." + except IndexError: + result = "Element number is invalid. Please try again with a valid element number." + except Exception as e: + result = str(e) + + wd = remove_highlight_and_labels(wd) + + wd.execute_script("document.body.style.zoom='1.5'") + + set_web_driver(wd) + + self._shared_state.set("elements_highlighted", "") + + return result \ No newline at end of file diff --git a/BrowsingAgent/tools/ExportFile.py b/BrowsingAgent/tools/ExportFile.py new file mode 100644 index 0000000000000000000000000000000000000000..26abc90ecd0bec512d38734e7c9045700508ec57 --- /dev/null +++ b/BrowsingAgent/tools/ExportFile.py @@ -0,0 +1,45 @@ +import base64 +import os + +from agency_swarm.tools import BaseTool +from .util import get_web_driver + + +class ExportFile(BaseTool): + """This tool converts the current full web page into a file and returns its file_id. You can then send this file id back to the user for further processing.""" + + def run(self): + wd = get_web_driver() + from agency_swarm import get_openai_client + client = get_openai_client() + + # Define the parameters for the PDF + params = { + 'landscape': False, + 'displayHeaderFooter': False, + 'printBackground': True, + 'preferCSSPageSize': True, + } + + # Execute the command to print to PDF + result = wd.execute_cdp_cmd('Page.printToPDF', params) + pdf = result['data'] + + pdf_bytes = base64.b64decode(pdf) + + # Save the PDF to a file + with open("exported_file.pdf", "wb") as f: + f.write(pdf_bytes) + + file_id = client.files.create(file=open("exported_file.pdf", "rb"), purpose="assistants",).id + + self._shared_state.set("file_id", file_id) + + return "Success. File exported with id: `" + file_id + "` You can now send this file id back to the user." 
+ + +if __name__ == "__main__": + wd = get_web_driver() + wd.get("https://www.google.com") + tool = ExportFile() + tool.run() diff --git a/BrowsingAgent/tools/GoBack.py b/BrowsingAgent/tools/GoBack.py new file mode 100644 index 0000000000000000000000000000000000000000..5a5a7ec5e23d3f03bddfba94a56620803702a58a --- /dev/null +++ b/BrowsingAgent/tools/GoBack.py @@ -0,0 +1,22 @@ +import time + +from agency_swarm.tools import BaseTool + +from .util.selenium import get_web_driver, set_web_driver + + +class GoBack(BaseTool): + """W + This tool allows you to go back 1 page in the browser history. Use it in case of a mistake or if a page shows you unexpected content. + """ + + def run(self): + wd = get_web_driver() + + wd.back() + + time.sleep(3) + + set_web_driver(wd) + + return "Success. Went back 1 page. Current URL is: " + wd.current_url diff --git a/BrowsingAgent/tools/ReadURL.py b/BrowsingAgent/tools/ReadURL.py new file mode 100644 index 0000000000000000000000000000000000000000..38b6c8dc3a4d73b08412b24c5b56dca16872c00a --- /dev/null +++ b/BrowsingAgent/tools/ReadURL.py @@ -0,0 +1,44 @@ +import time + +from pydantic import Field + +from agency_swarm.tools import BaseTool +from .util.selenium import get_web_driver, set_web_driver + + +class ReadURL(BaseTool): + """ +This tool reads a single URL and opens it in your current browser window. For each new source, either navigate directly to a URL that you believe contains the answer to the user's question or perform a Google search (e.g., 'https://google.com/search?q=search') if necessary. + +If you are unsure of the direct URL, do not guess. Instead, use the ClickElement tool to click on links that might contain the desired information on the current web page. + +Note: This tool only supports opening one URL at a time. The previous URL will be closed when you open a new one. 
+ """ + chain_of_thought: str = Field( + ..., description="Think step-by-step about where you need to navigate next to find the necessary information.", + exclude=True + ) + url: str = Field( + ..., description="URL of the webpage.", examples=["https://google.com/search?q=search"] + ) + + class ToolConfig: + one_call_at_a_time: bool = True + + def run(self): + wd = get_web_driver() + + wd.get(self.url) + + time.sleep(2) + + set_web_driver(wd) + + self._shared_state.set("elements_highlighted", "") + + return "Current URL is: " + wd.current_url + "\n" + "Please output '[send screenshot]' next to analyze the current web page or '[highlight clickable elements]' for further navigation." + + +if __name__ == "__main__": + tool = ReadURL(url="https://google.com") + print(tool.run()) \ No newline at end of file diff --git a/BrowsingAgent/tools/Scroll.py b/BrowsingAgent/tools/Scroll.py new file mode 100644 index 0000000000000000000000000000000000000000..d8e9637a4f4104916971a4193feea6f232b561e0 --- /dev/null +++ b/BrowsingAgent/tools/Scroll.py @@ -0,0 +1,53 @@ +from typing import Literal + +from pydantic import Field + +from agency_swarm.tools import BaseTool +from .util.selenium import get_web_driver, set_web_driver + + +class Scroll(BaseTool): + """ + This tool allows you to scroll the current web page up or down by 1 screen height. + """ + direction: Literal["up", "down"] = Field( + ..., description="Direction to scroll." 
class Scroll(BaseTool):
    """
    This tool allows you to scroll the current web page up or down by 1 screen height.
    """
    direction: Literal["up", "down"] = Field(
        ..., description="Direction to scroll."
    )

    def run(self):
        driver = get_web_driver()

        viewport_height = driver.get_window_size()['height']

        # Current CSS zoom, e.g. "1.5" or "150%"; defaults to "1" when unset.
        zoom_raw = driver.execute_script("return document.body.style.zoom || '1';")
        if '%' in zoom_raw:
            zoom = float(zoom_raw.strip('%')) / 100
        else:
            zoom = float(zoom_raw)

        # One "screen" in document coordinates shrinks as the page is zoomed in.
        step = viewport_height / zoom

        offset = driver.execute_script("return window.pageYOffset;")
        page_height = driver.execute_script("return document.body.scrollHeight;")

        result = ""

        if self.direction == "up":
            if offset == 0:
                # Already at the top; nothing to do.
                result = "Reached the top of the page. Cannot scroll up any further.\n"
            else:
                driver.execute_script(f"window.scrollBy(0, -{step});")
                result = "Scrolled up by 1 screen height. Make sure to output '[send screenshot]' command to analyze the page after scrolling."

        elif self.direction == "down":
            if offset + step >= page_height:
                # Next step would run past the document end.
                result = "Reached the bottom of the page. Cannot scroll down any further.\n"
            else:
                driver.execute_script(f"window.scrollBy(0, {step});")
                result = "Scrolled down by 1 screen height. Make sure to output '[send screenshot]' command to analyze the page after scrolling."

        set_web_driver(driver)

        return result
+ """ + + query: str = Field( + ..., + description="The search query to look up", + examples=["best restaurants in New York", "how to learn python"] + ) + + def run(self): + """ + Performs a Google search and returns the search results + """ + try: + # Use the stop parameter to limit results + search_results = search(self.query, stop=5, lang="en") + + # Convert generator to list + results = list(search_results) + + return json.dumps({ + "success": True, + "message": f"Found {len(results)} results for query: {self.query}", + "results": results + }) + + except Exception as e: + logging.error(f"Search error: {str(e)}") + return json.dumps({ + "success": False, + "error": str(e) + }) \ No newline at end of file diff --git a/BrowsingAgent/tools/SelectDropdown.py b/BrowsingAgent/tools/SelectDropdown.py new file mode 100644 index 0000000000000000000000000000000000000000..0a7ad371be883ed286084e9c31e18759268a44f1 --- /dev/null +++ b/BrowsingAgent/tools/SelectDropdown.py @@ -0,0 +1,58 @@ +from typing import Dict +from pydantic import Field, model_validator +from selenium.webdriver.common.by import By +from selenium.webdriver.support.select import Select + +from agency_swarm.tools import BaseTool +from .util import get_web_driver, set_web_driver +from .util.highlights import remove_highlight_and_labels + + +class SelectDropdown(BaseTool): + """ + This tool selects an option in a dropdown on the current web page based on the description of that element and which option to select. + + Before using this tool make sure to highlight dropdown elements on the page by outputting '[highlight dropdowns]' message. 
+ """ + + key_value_pairs: Dict[str, str] = Field(..., + description="A dictionary where the key is the sequence number of the dropdown element and the value is the index of the option to select.", + examples=[{"1": 0, "2": 1}, {"3": 2}] + ) + + @model_validator(mode='before') + @classmethod + def check_key_value_pairs(cls, data): + if not data.get('key_value_pairs'): + raise ValueError( + "key_value_pairs is required. Example format: " + "key_value_pairs={'1': 0, '2': 1}" + ) + return data + + def run(self): + wd = get_web_driver() + + if 'select' not in self._shared_state.get("elements_highlighted", ""): + raise ValueError("Please highlight dropdown elements on the page first by outputting '[highlight dropdowns]' message. You must output just the message without calling the tool first, so the user can respond with the screenshot.") + + all_elements = wd.find_elements(By.CSS_SELECTOR, '.highlighted-element') + + try: + for key, value in self.key_value_pairs.items(): + key = int(key) + element = all_elements[key - 1] + + select = Select(element) + + # Select the first option (index 0) + select.select_by_index(int(value)) + result = f"Success. Option is selected in the dropdown. To further analyze the page, output '[send screenshot]' command." 
class SendKeys(BaseTool):
    """
    This tool sends keys into input fields on the current webpage based on the description of that element and what needs to be typed. It then clicks "Enter" on the last element to submit the form. You do not need to tell it to press "Enter"; it will do that automatically.

    Before using this tool make sure to highlight the input elements on the page by outputting '[highlight text fields]' message.
    """
    elements_and_texts: Dict[int, str] = Field(
        ...,
        description="A dictionary where the key is the element number and the value is the text to be typed.",
        examples=[
            {52: "johndoe@gmail.com", 53: "password123"},
            {3: "John Doe", 4: "123 Main St"},
        ]
    )

    @model_validator(mode='before')
    @classmethod
    def check_elements_and_texts(cls, data):
        # Fail fast with a usage example when no mapping was provided.
        if not data.get('elements_and_texts'):
            raise ValueError(
                "elements_and_texts is required. Example format: "
                "elements_and_texts={1: 'John Doe', 2: '123 Main St'}"
            )
        return data

    def run(self):
        wd = get_web_driver()
        if 'input' not in self._shared_state.get("elements_highlighted", ""):
            raise ValueError("Please highlight input elements on the page first by outputting '[highlight text fields]' message. You must output just the message without calling the tool first, so the user can respond with the screenshot.")

        all_elements = wd.find_elements(By.CSS_SELECTOR, '.highlighted-element')

        last_position = len(self.elements_and_texts) - 1
        try:
            for position, (key, value) in enumerate(self.elements_and_texts.items()):
                element = all_elements[int(key) - 1]

                # Best-effort clearing of any pre-existing text; some widgets
                # reject programmatic clears, so failures are ignored.
                try:
                    element.click()
                    element.send_keys(Keys.CONTROL + "a")  # Select all text in input
                    element.send_keys(Keys.DELETE)
                    element.clear()
                except Exception:
                    pass

                element.send_keys(value)

                # Submit by pressing Enter on the final field only.
                if position == last_position:
                    element.send_keys(Keys.RETURN)
                    time.sleep(3)
            result = f"Sent input to element and pressed Enter. Current URL is {wd.current_url} To further analyze the page, output '[send screenshot]' command."
        except Exception as e:
            result = str(e)

        remove_highlight_and_labels(wd)

        set_web_driver(wd)

        return result
+ """ + + def run(self): + wd = get_web_driver() + + try: + WebDriverWait(wd, 10).until( + frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[@title='reCAPTCHA']")) + ) + + element = WebDriverWait(wd, 3).until( + presence_of_element_located((By.ID, "recaptcha-anchor")) + ) + except Exception as e: + return "Could not find captcha checkbox" + + try: + # Scroll the element into view + wd.execute_script("arguments[0].scrollIntoView(true);", element) + time.sleep(1) # Give some time for the scrolling to complete + + # Click the element using JavaScript + wd.execute_script("arguments[0].click();", element) + except Exception as e: + return f"Could not click captcha checkbox: {str(e)}" + + try: + # Now check if the reCAPTCHA is checked + WebDriverWait(wd, 3).until( + lambda d: d.find_element(By.CLASS_NAME, "recaptcha-checkbox").get_attribute( + "aria-checked") == "true" + ) + + return "Success" + except Exception as e: + pass + + wd.switch_to.default_content() + + client = get_openai_client() + + WebDriverWait(wd, 10).until( + frame_to_be_available_and_switch_to_it( + (By.XPATH, "//iframe[@title='recaptcha challenge expires in two minutes']")) + ) + + time.sleep(2) + + attempts = 0 + while attempts < 5: + tiles = wd.find_elements(By.CLASS_NAME, "rc-imageselect-tile") + + # filter out tiles with rc-imageselect-dynamic-selected class + tiles = [tile for tile in tiles if + not tile.get_attribute("class").endswith("rc-imageselect-dynamic-selected")] + + image_content = [] + i = 0 + for tile in tiles: + i += 1 + screenshot = get_b64_screenshot(wd, tile) + + image_content.append( + { + "type": "text", + "text": f"Image {i}:", + } + ) + image_content.append( + { + "type": "image_url", + "image_url": + { + "url": f"data:image/jpeg;base64,{screenshot}", + "detail": "high", + } + }, + ) + # highlight all titles with rc-imageselect-tile class but not with rc-imageselect-dynamic-selected + # wd = highlight_elements_with_labels(wd, 
'td.rc-imageselect-tile:not(.rc-imageselect-dynamic-selected)') + + # screenshot = get_b64_screenshot(wd, wd.find_element(By.ID, "rc-imageselect")) + + task_text = wd.find_element(By.CLASS_NAME, "rc-imageselect-instructions").text.strip().replace("\n", + " ") + + continuous_task = 'once there are none left' in task_text.lower() + + task_text = task_text.replace("Click verify", "Output 0") + task_text = task_text.replace("click skip", "Output 0") + task_text = task_text.replace("once", "if") + task_text = task_text.replace("none left", "none") + task_text = task_text.replace("all", "only") + task_text = task_text.replace("squares", "images") + + additional_info = "" + if len(tiles) > 9: + additional_info = ("Keep in mind that all images are a part of a bigger image " + "from left to right, and top to bottom. The grid is 4x4. ") + + messages = [ + { + "role": "system", + "content": f"""You are an advanced AI designed to support users with visual impairments. + User will provide you with {i} images numbered from 1 to {i}. Your task is to output + the numbers of the images that contain the requested object, or at least some part of the requested + object. {additional_info}If there are no individual images that satisfy this condition, output 0. + """.replace("\n", ""), + }, + { + "role": "user", + "content": [ + *image_content, + { + "type": "text", + "text": f"{task_text}. Only output numbers separated by commas and nothing else. " + f"Output 0 if there are none." 
+ } + ] + }] + + response = client.chat.completions.create( + model="gpt-4o", + messages=messages, + max_tokens=1024, + temperature=0.0, + ) + + message = response.choices[0].message + message_text = message.content + + # check if 0 is in the message + if "0" in message_text and "10" not in message_text: + # Find the button by its ID + verify_button = wd.find_element(By.ID, "recaptcha-verify-button") + + verify_button_text = verify_button.text + + # Click the button + wd.execute_script("arguments[0].click();", verify_button) + + time.sleep(1) + + try: + if self.verify_checkbox(wd): + return "Success. Captcha solved." + except Exception as e: + print('Not checked') + pass + + else: + numbers = [int(s.strip()) for s in message_text.split(",") if s.strip().isdigit()] + + # Click the tiles based on the provided numbers + for number in numbers: + wd.execute_script("arguments[0].click();", tiles[number - 1]) + time.sleep(0.5) + + time.sleep(3) + + if not continuous_task: + # Find the button by its ID + verify_button = wd.find_element(By.ID, "recaptcha-verify-button") + + verify_button_text = verify_button.text + + # Click the button + wd.execute_script("arguments[0].click();", verify_button) + + try: + if self.verify_checkbox(wd): + return "Success. Captcha solved." + except Exception as e: + pass + else: + continue + + if "verify" in verify_button_text.lower(): + attempts += 1 + + wd = remove_highlight_and_labels(wd) + + wd.switch_to.default_content() + + # close captcha + try: + element = WebDriverWait(wd, 3).until( + presence_of_element_located((By.XPATH, "//iframe[@title='reCAPTCHA']")) + ) + + wd.execute_script(f"document.elementFromPoint({element.location['x']}, {element.location['y']-10}).click();") + except Exception as e: + print(e) + pass + + return "Could not solve captcha." 
    def verify_checkbox(self, wd):
        """Check whether the reCAPTCHA checkbox is now ticked.

        Switches into the checkbox iframe and waits for aria-checked == true.
        Returns True on success; on timeout/failure, switches the driver back
        into the challenge iframe and returns False.
        """
        wd.switch_to.default_content()

        try:
            WebDriverWait(wd, 10).until(
                frame_to_be_available_and_switch_to_it((By.XPATH, "//iframe[@title='reCAPTCHA']"))
            )

            WebDriverWait(wd, 5).until(
                lambda d: d.find_element(By.CLASS_NAME, "recaptcha-checkbox").get_attribute(
                    "aria-checked") == "true"
            )

            return True
        except Exception as e:
            # Not checked yet: re-enter the image-challenge iframe so the
            # caller can keep working on it.
            wd.switch_to.default_content()

            WebDriverWait(wd, 10).until(
                frame_to_be_available_and_switch_to_it(
                    (By.XPATH, "//iframe[@title='recaptcha challenge expires in two minutes']"))
            )

            return False
class WebPageSummarizer(BaseTool):
    """
    This tool summarizes the content of the current web page, extracting the main points and providing a concise summary.
    """

    def run(self):
        from agency_swarm import get_openai_client

        driver = get_web_driver()
        client = get_openai_client()

        body_text = driver.find_element(By.TAG_NAME, "body").text

        # Keep only the first 10000 whitespace-separated words so the prompt
        # stays within the model's context window.
        body_text = " ".join(body_text.split()[:10000])

        completion = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "Your task is to summarize the content of the provided webpage. The summary should be concise and informative, capturing the main points and takeaways of the page."},
                {"role": "user", "content": "Summarize the content of the following webpage:\n\n" + body_text},
            ],
            temperature=0.0,
        )

        return completion.choices[0].message.content
100644 index 0000000000000000000000000000000000000000..dfde34ff638770e059e2cec825115c79d90656e1 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/ExportFile.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/ExportFile.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/ExportFile.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7507f0ba3f63eebe7f914e0e591ec09b604b1802 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/ExportFile.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/GoBack.cpython-311.pyc b/BrowsingAgent/tools/__pycache__/GoBack.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9cb40a0c13408bb717681d3580d07aa85c00ce4f Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/GoBack.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/GoBack.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/GoBack.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd6d5861822238fdb2aaf0f16f362dc4f5903370 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/GoBack.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/ReadURL.cpython-311.pyc b/BrowsingAgent/tools/__pycache__/ReadURL.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0636e9ca4c7d776650e92e417fce1586eb757453 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/ReadURL.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/ReadURL.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/ReadURL.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bb3c4772bfa63f6cd595476f1bcb8545d64f1907 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/ReadURL.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/Scroll.cpython-311.pyc b/BrowsingAgent/tools/__pycache__/Scroll.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..b76c9d26ed45ba6b651284300ab68a895a6986e0 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/Scroll.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/Scroll.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/Scroll.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc0be3bd0c1ba9048d49aec0dd6283a1214ce379 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/Scroll.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/SearchAndScrape.cpython-311.pyc b/BrowsingAgent/tools/__pycache__/SearchAndScrape.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6fe9aadb36e38814e995c5959747f3e2dad66d4 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/SearchAndScrape.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/SearchAndScrape.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/SearchAndScrape.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d76f0e9c25198343479898dae5a6cbda18239f25 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/SearchAndScrape.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/SelectDropdown.cpython-311.pyc b/BrowsingAgent/tools/__pycache__/SelectDropdown.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..94993b4298e178e18646a253b9c4aea779a9d80e Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/SelectDropdown.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/SelectDropdown.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/SelectDropdown.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c68e02520a78360a0adcc023c68399711c000675 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/SelectDropdown.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/SendKeys.cpython-311.pyc 
b/BrowsingAgent/tools/__pycache__/SendKeys.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d36801ec4a68be7f5f704ba68407f13b5f51075 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/SendKeys.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/SendKeys.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/SendKeys.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..af0b0a158c58387a6652103d9dd861360831c992 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/SendKeys.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/SolveCaptcha.cpython-311.pyc b/BrowsingAgent/tools/__pycache__/SolveCaptcha.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06553ec2279f2646403a3a54ca3742f8ba19faf3 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/SolveCaptcha.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/SolveCaptcha.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/SolveCaptcha.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..51b824c0fcffb9041f95a8cd7da982c8b28af6ee Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/SolveCaptcha.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/WebPageSummarizer.cpython-311.pyc b/BrowsingAgent/tools/__pycache__/WebPageSummarizer.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..37646fc77406376a4d83db587137019bac99fa48 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/WebPageSummarizer.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/WebPageSummarizer.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/WebPageSummarizer.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..582ec97e9774e083f732df8ae03446b9f2f72a6d Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/WebPageSummarizer.cpython-313.pyc differ 
diff --git a/BrowsingAgent/tools/__pycache__/__init__.cpython-311.pyc b/BrowsingAgent/tools/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..605ab05c3bf8f575b9989cd51e0d91f207df06ea Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/__init__.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/__pycache__/__init__.cpython-313.pyc b/BrowsingAgent/tools/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ab42d9285595d0cf071cd45741e5e679cdc0e61 Binary files /dev/null and b/BrowsingAgent/tools/__pycache__/__init__.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/util/__init__.py b/BrowsingAgent/tools/util/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f8634adc4e34da111010fe924bb440ca0521322f --- /dev/null +++ b/BrowsingAgent/tools/util/__init__.py @@ -0,0 +1,3 @@ +from .get_b64_screenshot import get_b64_screenshot +from .selenium import get_web_driver, set_web_driver +from .highlights import remove_highlight_and_labels, highlight_elements_with_labels diff --git a/BrowsingAgent/tools/util/__pycache__/__init__.cpython-311.pyc b/BrowsingAgent/tools/util/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..566db9400a00a8dc56c2e0e474cdba543d1d6aa9 Binary files /dev/null and b/BrowsingAgent/tools/util/__pycache__/__init__.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/util/__pycache__/__init__.cpython-313.pyc b/BrowsingAgent/tools/util/__pycache__/__init__.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5ee209741043be5d53649aaf136613fe5fb69a2f Binary files /dev/null and b/BrowsingAgent/tools/util/__pycache__/__init__.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/util/__pycache__/get_b64_screenshot.cpython-311.pyc b/BrowsingAgent/tools/util/__pycache__/get_b64_screenshot.cpython-311.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..425c2d0cd4c02d34364a20719af8b079139df7e2 Binary files /dev/null and b/BrowsingAgent/tools/util/__pycache__/get_b64_screenshot.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/util/__pycache__/get_b64_screenshot.cpython-313.pyc b/BrowsingAgent/tools/util/__pycache__/get_b64_screenshot.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c9e2f2977b8dd9277e692f315db8a341ebae9a77 Binary files /dev/null and b/BrowsingAgent/tools/util/__pycache__/get_b64_screenshot.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/util/__pycache__/highlights.cpython-311.pyc b/BrowsingAgent/tools/util/__pycache__/highlights.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c7b1cb16896de0709ee0c12b6c32a5c46abb0613 Binary files /dev/null and b/BrowsingAgent/tools/util/__pycache__/highlights.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/util/__pycache__/highlights.cpython-313.pyc b/BrowsingAgent/tools/util/__pycache__/highlights.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cbe4c3b5da11893dec7b2421507a9e8e131f787 Binary files /dev/null and b/BrowsingAgent/tools/util/__pycache__/highlights.cpython-313.pyc differ diff --git a/BrowsingAgent/tools/util/__pycache__/selenium.cpython-311.pyc b/BrowsingAgent/tools/util/__pycache__/selenium.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4d06294bf78c0e657470dd344ee20c6dfb5f53ee Binary files /dev/null and b/BrowsingAgent/tools/util/__pycache__/selenium.cpython-311.pyc differ diff --git a/BrowsingAgent/tools/util/__pycache__/selenium.cpython-313.pyc b/BrowsingAgent/tools/util/__pycache__/selenium.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1b0d7c35a99dbcab6fbbfa9c167457ea3e31a141 Binary files /dev/null and b/BrowsingAgent/tools/util/__pycache__/selenium.cpython-313.pyc differ diff --git 
def get_b64_screenshot(wd, element=None):
    """Return a base64-encoded screenshot.

    Captures `element` when one is given (truthy); otherwise captures the
    whole viewport via the driver.
    """
    return element.screenshot_as_base64 if element else wd.get_screenshot_as_base64()
+ """ + script = f""" + // Helper function to check if an element is visible + function isElementVisible(element) {{ + var rect = element.getBoundingClientRect(); + if (rect.width <= 0 || rect.height <= 0 || + rect.top >= (window.innerHeight || document.documentElement.clientHeight) || + rect.bottom <= 0 || + rect.left >= (window.innerWidth || document.documentElement.clientWidth) || + rect.right <= 0) {{ + return false; + }} + // Check if any parent element is hidden, which would hide this element as well + var parent = element; + while (parent) {{ + var style = window.getComputedStyle(parent); + if (style.display === 'none' || style.visibility === 'hidden') {{ + return false; + }} + parent = parent.parentElement; + }} + return true; + }} + + // Remove previous labels and styles if they exist + document.querySelectorAll('.highlight-label').forEach(function(label) {{ + label.remove(); + }}); + document.querySelectorAll('.highlighted-element').forEach(function(element) {{ + element.classList.remove('highlighted-element'); + element.removeAttribute('data-highlighted'); + }}); + + // Inject custom style for highlighting elements + var styleElement = document.getElementById('highlight-style'); + if (!styleElement) {{ + styleElement = document.createElement('style'); + styleElement.id = 'highlight-style'; + document.head.appendChild(styleElement); + }} + styleElement.textContent = ` + .highlighted-element {{ + border: 2px solid red !important; + position: relative; + box-sizing: border-box; + }} + .highlight-label {{ + position: absolute; + z-index: 2147483647; + background: yellow; + color: black; + font-size: 25px; + padding: 3px 5px; + border: 1px solid black; + border-radius: 3px; + white-space: nowrap; + box-shadow: 0px 0px 2px #000; + top: -25px; + left: 0; + display: none; + }} + `; + + // Function to create and append a label to the body + function createAndAdjustLabel(element, index) {{ + if (!isElementVisible(element)) return; + + 
element.classList.add('highlighted-element'); + var label = document.createElement('div'); + label.className = 'highlight-label'; + label.textContent = index.toString(); + label.style.display = 'block'; // Make the label visible + + // Calculate label position + var rect = element.getBoundingClientRect(); + var top = rect.top + window.scrollY - 25; // Position label above the element + var left = rect.left + window.scrollX; + + label.style.top = top + 'px'; + label.style.left = left + 'px'; + + document.body.appendChild(label); // Append the label to the body + }} + + // Select all clickable elements and apply the styles + var allElements = document.querySelectorAll('{selector}'); + var index = 1; + allElements.forEach(function(element) {{ + // Check if the element is not already highlighted and is visible + if (!element.dataset.highlighted && isElementVisible(element)) {{ + element.dataset.highlighted = 'true'; + createAndAdjustLabel(element, index++); + }} + }}); + """ + + driver.execute_script(script) + + return driver + + +def remove_highlight_and_labels(driver): + """ + This function removes all red borders and labels from the webpage elements, + reversing the changes made by the highlight functions using Selenium WebDriver. + + :param driver: Instance of Selenium WebDriver. 
+ """ + selector = ('a, button, input, textarea, div[onclick], div[role="button"], div[tabindex], span[onclick], ' + 'span[role="button"], span[tabindex]') + script = f""" + // Remove all labels + document.querySelectorAll('.highlight-label').forEach(function(label) {{ + label.remove(); + }}); + + // Remove the added style for red borders + var highlightStyle = document.getElementById('highlight-style'); + if (highlightStyle) {{ + highlightStyle.remove(); + }} + + // Remove inline styles added by highlighting function + document.querySelectorAll('{selector}').forEach(function(element) {{ + element.style.border = ''; + }}); + """ + + driver.execute_script(script) + + return driver \ No newline at end of file diff --git a/BrowsingAgent/tools/util/selenium.py b/BrowsingAgent/tools/util/selenium.py new file mode 100644 index 0000000000000000000000000000000000000000..dff68849d1599c7c4adda7c29deef1ebe3bfbf77 --- /dev/null +++ b/BrowsingAgent/tools/util/selenium.py @@ -0,0 +1,154 @@ +import os + +wd = None + +selenium_config = { + "chrome_profile_path": None, + "headless": True, + "full_page_screenshot": True, +} + + +def get_web_driver(): + print("Initializing WebDriver...") + try: + from selenium import webdriver + from selenium.webdriver.chrome.service import Service as ChromeService + print("Selenium imported successfully.") + except ImportError: + print("Selenium not installed. Please install it with pip install selenium") + raise ImportError + + try: + from webdriver_manager.chrome import ChromeDriverManager + print("webdriver_manager imported successfully.") + except ImportError: + print("webdriver_manager not installed. Please install it with pip install webdriver-manager") + raise ImportError + + try: + from selenium_stealth import stealth + print("selenium_stealth imported successfully.") + except ImportError: + print("selenium_stealth not installed. 
Please install it with pip install selenium-stealth") + raise ImportError + + global wd, selenium_config + + if wd: + print("Returning existing WebDriver instance.") + return wd + + chrome_profile_path = selenium_config.get("chrome_profile_path", None) + profile_directory = None + user_data_dir = None + if isinstance(chrome_profile_path, str) and os.path.exists(chrome_profile_path): + profile_directory = os.path.split(chrome_profile_path)[-1].strip("\\").rstrip("/") + user_data_dir = os.path.split(chrome_profile_path)[0].strip("\\").rstrip("/") + print(f"Using Chrome profile: {profile_directory}") + print(f"Using Chrome user data dir: {user_data_dir}") + print(f"Using Chrome profile path: {chrome_profile_path}") + + chrome_options = webdriver.ChromeOptions() + print("ChromeOptions initialized.") + + chrome_driver_path = "/usr/bin/chromedriver" + if not os.path.exists(chrome_driver_path): + print("ChromeDriver not found at /usr/bin/chromedriver. Installing using webdriver_manager.") + chrome_driver_path = ChromeDriverManager().install() + else: + print(f"ChromeDriver found at {chrome_driver_path}.") + + if selenium_config.get("headless", False): + chrome_options.add_argument('--headless') + print("Headless mode enabled.") + if selenium_config.get("full_page_screenshot", False): + chrome_options.add_argument("--start-maximized") + print("Full page screenshot mode enabled.") + else: + chrome_options.add_argument("--window-size=1920,1080") + print("Window size set to 1920,1080.") + + chrome_options.add_argument("--no-sandbox") + chrome_options.add_argument("--disable-gpu") + chrome_options.add_argument("--disable-dev-shm-usage") + chrome_options.add_argument("--remote-debugging-port=9222") + chrome_options.add_argument("--disable-extensions") + chrome_options.add_argument("--disable-popup-blocking") + chrome_options.add_argument("--ignore-certificate-errors") + chrome_options.add_argument("--disable-blink-features=AutomationControlled") + 
chrome_options.add_argument("--disable-web-security") + chrome_options.add_argument("--allow-running-insecure-content") + chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"]) + chrome_options.add_experimental_option("useAutomationExtension", False) + print("Chrome options configured.") + + if user_data_dir and profile_directory: + chrome_options.add_argument(f"user-data-dir={user_data_dir}") + chrome_options.add_argument(f"profile-directory={profile_directory}") + print(f"Using user data dir: {user_data_dir} and profile directory: {profile_directory}") + + try: + wd = webdriver.Chrome(service=ChromeService(chrome_driver_path), options=chrome_options) + print("WebDriver initialized successfully.") + if wd.capabilities['chrome']['userDataDir']: + print(f"Profile path in use: {wd.capabilities['chrome']['userDataDir']}") + except Exception as e: + print(f"Error initializing WebDriver: {e}") + raise e + + if not selenium_config.get("chrome_profile_path", None): + stealth( + wd, + languages=["en-US", "en"], + vendor="Google Inc.", + platform="Win32", + webgl_vendor="Intel Inc.", + renderer="Intel Iris OpenGL Engine", + fix_hairline=True, + ) + print("Stealth mode configured.") + + wd.implicitly_wait(3) + print("Implicit wait set to 3 seconds.") + + return wd + + +def set_web_driver(new_wd): + # remove all popups + js_script = """ + var popUpSelectors = ['modal', 'popup', 'overlay', 'dialog']; // Add more selectors that are commonly used for pop-ups + popUpSelectors.forEach(function(selector) { + var elements = document.querySelectorAll(selector); + elements.forEach(function(element) { + // You can choose to hide or remove; here we're removing the element + element.parentNode.removeChild(element); + }); + }); + """ + + new_wd.execute_script(js_script) + + # Close LinkedIn specific popups + if "linkedin.com" in new_wd.current_url: + linkedin_js_script = """ + var linkedinSelectors = ['div.msg-overlay-list-bubble', 
'div.ml4.msg-overlay-list-bubble__tablet-height']; + linkedinSelectors.forEach(function(selector) { + var elements = document.querySelectorAll(selector); + elements.forEach(function(element) { + element.parentNode.removeChild(element); + }); + }); + """ + new_wd.execute_script(linkedin_js_script) + + new_wd.execute_script("document.body.style.zoom='1.2'") + + global wd + wd = new_wd + + +def set_selenium_config(config): + global selenium_config + selenium_config = config diff --git a/agency.py b/agency.py new file mode 100644 index 0000000000000000000000000000000000000000..194230f0040ab1b64b6be119bbec9da9ea249a3b --- /dev/null +++ b/agency.py @@ -0,0 +1,41 @@ +import os +import logging +from agency_swarm import Agency, set_openai_client +from ValidationAgent import ValidationAgent +from BrowsingAgent import BrowsingAgent +from MarketInsightsCEO import MarketInsightsCEO +from dotenv import load_dotenv +from openai import OpenAI +from astra_assistants import patch + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') + +# Load environment variables +load_dotenv() + +# Set OpenAI API key +openai_api_key = os.getenv("OPENAI_API_KEY") +if not openai_api_key: + raise ValueError("OPENAI_API_KEY not found in environment variables") + +client = patch(OpenAI()) +set_openai_client(client) + +# Create instances of agents +ceo = MarketInsightsCEO() +scraper = BrowsingAgent() +validator = ValidationAgent() + +agency = Agency([ceo, scraper, validator, [ceo, scraper], + [ceo, validator], + [scraper, validator]], + shared_instructions='./agency_manifesto.md', + max_prompt_tokens=25000, + temperature=0.3, + ) + +if __name__ == '__main__': + # Example of initiating a market research task + ceo.initiate_market_research(scraper, "ecommerce market trends india") + agency.demo_gradio() diff --git a/agency_manifesto.md b/agency_manifesto.md new file mode 100644 index 
0000000000000000000000000000000000000000..84764ebe669fd2c6db9cf906c9ee4b1e01fb9ccf --- /dev/null +++ b/agency_manifesto.md @@ -0,0 +1 @@ +The MarketInsightsAgency's mission is to autonomously gather, validate, and produce comprehensive reports on market trends, focusing on specific tools, technologies, or domains. This agency leverages the googlesearch package for searching and Firecrawl for web scraping to collect and analyze data. The agency aims to provide accurate, complete, and timely reports while ensuring compliance with legal and ethical guidelines for web scraping. Key performance indicators include data accuracy, report completeness, generation time, and compliance with guidelines. \ No newline at end of file