HF_Agents_Final_Project

Runtime error

App Files Files Community

Yago Bolivar committed on May 12, 2025

Commit

c467d81

1 Parent(s): ada4787

feat: implement WebBrowser class for fetching and parsing web content with error handling

Browse files

Files changed (2) hide show

src/web_browsing_tool.py +98 -0
tests/test_web_browser.py +163 -0

src/web_browsing_tool.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import requests
+from bs4 import BeautifulSoup
+class WebBrowser:
+    """
+    A simple web browser tool to fetch and parse content from URLs.
+    Failures are never raised to the caller: browse() reports them as a
+    string that starts with "Error:".
+    """
+    def __init__(self, user_agent="GAIA-Agent/1.0", timeout=15):
+        """
+        Initializes the web browser with a user agent.
+        Args:
+            user_agent (str): The User-Agent string to use for requests.
+            timeout (float): Seconds passed to requests.get() as its timeout.
+                Defaults to 15, the value that was previously hard-coded.
+        """
+        self.headers = {"User-Agent": user_agent}
+        self.timeout = timeout
+    def browse(self, url: str) -> str:
+        """
+        Fetches the content of a web page and extracts its text.
+        Args:
+            url (str): The URL of the web page to browse. It must start with
+                http:// or https://.
+        Returns:
+            str: The extracted text content of the web page (one cleaned,
+                 non-empty line per text fragment), or an error message
+                 starting with "Error:" if validation, fetching or parsing fails.
+        """
+        # A non-string value (e.g. None) has no startswith(); report it as an
+        # invalid URL instead of letting AttributeError escape this method.
+        if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
+            return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"
+        try:
+            response = requests.get(url, headers=self.headers, timeout=self.timeout)
+            response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)
+            # Use BeautifulSoup to parse the HTML content
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Remove script and style elements so their source is not returned as page text
+            for script_or_style in soup(["script", "style"]):
+                script_or_style.decompose()
+            text_from_soup = soup.get_text(separator='\n', strip=True)
+            # str.split() with no argument trims the line and collapses runs of
+            # whitespace; a blank line yields an empty word list and is dropped.
+            word_lists = (line.split() for line in text_from_soup.splitlines())
+            text = '\n'.join(' '.join(words) for words in word_lists if words)
+            if not text:
+                return f"Error: No text content found at {url}."
+            return text
+        except requests.exceptions.HTTPError as http_err:
+            return f"Error: HTTP error occurred while fetching {url}: {http_err}"
+        # Timeout is checked before ConnectionError because ConnectTimeout
+        # inherits from both and should be reported as a timeout.
+        except requests.exceptions.Timeout as timeout_err:
+            return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
+        except requests.exceptions.ConnectionError as conn_err:
+            return f"Error: Connection error occurred while fetching {url}: {conn_err}"
+        except requests.exceptions.RequestException as req_err:
+            return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
+        except Exception as e:
+            # Last-resort boundary: this tool's contract is to return an error string, not raise.
+            return f"Error: An unexpected error occurred during parsing of {url}: {e}"
+if __name__ == '__main__':
+    # Manual demo: performs real HTTP requests and prints what browse() returns.
+    # In a real agent the URL would come from the task or from a search step;
+    # e.g. for "How many studio albums were published by Mercedes Sosa...",
+    # the agent would first have to find the relevant Wikipedia URL.
+    demo_browser = WebBrowser()
+    wikipedia_url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
+    print(f"--- Browsing: {wikipedia_url} ---")
+    page_text = demo_browser.browse(wikipedia_url)
+    # Error messages are shown in full; successful output is cut to 1000 characters for brevity.
+    if not page_text.startswith("Error:") and len(page_text) > 1000:
+        print(page_text[:1000] + "...")
+    else:
+        print(page_text)
+    # The remaining examples are expected to come back as "Error: ..." strings.
+    failure_examples = (
+        ("\n--- Example with a non-existent page ---", "http://example.com/nonexistentpage12345.html"),
+        ("\n--- Example with an invalid URL format ---", "www.google.com"),
+    )
+    for heading, example_url in failure_examples:
+        print(heading)
+        print(demo_browser.browse(example_url))

tests/test_web_browser.py ADDED Viewed

	@@ -0,0 +1,163 @@

+import unittest
+from unittest.mock import patch, MagicMock
+import requests # Import requests for its exception types
+import os
+import sys
+# Add the parent directory to sys.path to find the src module
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+# Adjust the import path based on your project structure
+# If web_browsing_tool.py is in a 'src' directory:
+from src.web_browsing_tool import WebBrowser
+# If web_browser.py is in the same directory as app.py (and tools are in a 'tools' subdir):
+# from tools.web_browser import WebBrowser
+class TestWebBrowser(unittest.TestCase):
+    # Unit tests for WebBrowser.browse(); requests.get is patched in every
+    # test that would otherwise reach the network.
+    def setUp(self):
+        self.browser = WebBrowser(user_agent="TestAgent/1.0")
+    @staticmethod
+    def _html_response(body):
+        # Build a mocked HTTP 200 response whose .content is `body` and whose
+        # raise_for_status() is a no-op.
+        fake_response = MagicMock()
+        fake_response.status_code = 200
+        fake_response.content = body
+        fake_response.raise_for_status = MagicMock()
+        return fake_response
+    @patch('src.web_browsing_tool.requests.get')
+    def test_browse_successful_fetch_and_parse(self, mock_get):
+        # Title and paragraph text are returned; the <script> body is dropped.
+        mock_get.return_value = self._html_response(
+            b"<html><head><title>Test Page</title></head><body><p>Hello World!</p><script>alert('test');</script></body></html>"
+        )
+        target = "http://example.com/testpage"
+        extracted = self.browser.browse(target)
+        mock_get.assert_called_once_with(target, headers={"User-Agent": "TestAgent/1.0"}, timeout=15)
+        self.assertEqual(extracted, "Test Page\nHello World!")
+    @patch('src.web_browsing_tool.requests.get')
+    def test_browse_http_error(self, mock_get):
+        # requests.get itself raises the HTTPError here.
+        mock_get.side_effect = requests.exceptions.HTTPError("404 Client Error: Not Found for url")
+        target = "http://example.com/notfound"
+        message = self.browser.browse(target)
+        mock_get.assert_called_once_with(target, headers={"User-Agent": "TestAgent/1.0"}, timeout=15)
+        self.assertTrue(message.startswith("Error: HTTP error occurred"))
+        self.assertIn("404 Client Error", message)
+    @patch('src.web_browsing_tool.requests.get')
+    def test_browse_connection_error(self, mock_get):
+        mock_get.side_effect = requests.exceptions.ConnectionError("Connection refused")
+        message = self.browser.browse("http://example.com/unreachable")
+        self.assertTrue(message.startswith("Error: Connection error occurred"))
+        self.assertIn("Connection refused", message)
+    @patch('src.web_browsing_tool.requests.get')
+    def test_browse_timeout_error(self, mock_get):
+        mock_get.side_effect = requests.exceptions.Timeout("Request timed out")
+        message = self.browser.browse("http://example.com/slowresponse")
+        self.assertTrue(message.startswith("Error: Timeout occurred"))
+        self.assertIn("Request timed out", message)
+    @patch('src.web_browsing_tool.requests.get')
+    def test_browse_generic_request_exception(self, mock_get):
+        mock_get.side_effect = requests.exceptions.RequestException("Some other request error")
+        message = self.browser.browse("http://example.com/othererror")
+        self.assertTrue(message.startswith("Error: An unexpected error occurred while fetching"))
+        self.assertIn("Some other request error", message)
+    def test_browse_invalid_url_format(self):
+        # No scheme (http:// or https://), so browse() must reject it without any request.
+        message = self.browser.browse("www.example.com")
+        self.assertEqual(message, "Error: Invalid URL format. URL must start with http:// or https://. Received: www.example.com")
+    @patch('src.web_browsing_tool.requests.get')
+    def test_browse_no_text_content(self, mock_get):
+        # The page contains only <script> and <style>, so nothing is left after cleaning.
+        mock_get.return_value = self._html_response(
+            b"<html><head><script>var x=1;</script></head><body><style>.body {color:red;}</style></body></html>"
+        )
+        target = "http://example.com/notext"
+        message = self.browser.browse(target)
+        self.assertEqual(message, f"Error: No text content found at {target}.")
+    @patch('src.web_browsing_tool.requests.get')
+    def test_browse_strips_extra_whitespace_and_newlines(self, mock_get):
+        mock_get.return_value = self._html_response(
+            b"<html><body><p>Line 1</p>  <p>Line  2</p>\n\n<p>Line\n3</p><div><span>Text</span></div></body></html>"
+        )
+        extracted = self.browser.browse("http://example.com/whitespace")
+        self.assertEqual(extracted, "Line 1\nLine 2\nLine\n3\nText")
+    @patch('src.web_browsing_tool.requests.get')
+    def test_browse_for_question_answering_scenario_mercedes_sosa(self, mock_get):
+        """
+        Tests if the browser can extract relevant text for a question
+        similar to the Mercedes Sosa studio albums count.
+        """
+        # The page is written as a regular string and encoded to bytes below.
+        mock_html_content_str = """
+        <html>
+            <head><title>Mercedes Sosa Discography</title></head>
+            <body>
+                <h1>Mercedes Sosa</h1>
+                <h2>Studio Albums</h2>
+                <ul>
+                    <li>1999 - Misa Criolla</li>
+                    <li>2002 - Acústico</li>
+                    <li>2005 - Corazón libre</li>
+                    <li>2009 - Cantora 1</li>
+                    <li>2011 - Canto para caminar</li>
+                </ul>
+                <h2>Live Albums</h2>
+                <ul>
+                    <li>2000 - Live in Concert</li>
+                </ul>
+            </body>
+        </html>
+        """
+        mock_get.return_value = self._html_response(mock_html_content_str.encode('utf-8'))
+        extracted = self.browser.browse("http://example.com/mercedes_sosa_discography")
+        # Key information (page title, both section headings, every list entry)
+        # must be present, so no section is missed.
+        expected_fragments = (
+            "Mercedes Sosa Discography",  # From title
+            "Studio Albums",
+            "1999 - Misa Criolla",
+            "2002 - Acústico",
+            "2005 - Corazón libre",
+            "2009 - Cantora 1",
+            "2011 - Canto para caminar",
+            "Live Albums",
+            "2000 - Live in Concert",
+        )
+        for fragment in expected_fragments:
+            self.assertIn(fragment, extracted)
+        # A further step (outside this tool's direct responsibility but for agent context)
+        # would be to pass the extracted text to an LLM with the question:
+        # "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?"
+        # The LLM should be able to parse the structured list and count "Acústico", "Corazón libre", "Cantora 1" -> 3.
+# Run the unittest test runner when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()