# Spaces: Runtime error
# Yago Bolivar
# feat: implement WebBrowser class for fetching and parsing web content with error handling
# c467d81
import os
import sys
import unittest
from unittest.mock import MagicMock, patch

import requests  # Imported for its exception types

# Add the parent directory of this file's directory to sys.path so that the
# `src` package can be found when the tests are run from the tests folder.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Adjust the import path based on your project structure.
# If the tool module lives in a 'src' directory as web_browsing_tool.py:
from src.web_browsing_tool import WebBrowser  # noqa: E402
# If the tool module lives in a 'tools' subdirectory as web_browser.py instead:
# from tools.web_browser import WebBrowser
class TestWebBrowser(unittest.TestCase):
    """Unit tests for ``WebBrowser.browse`` with the HTTP layer mocked out.

    Every test that needs a network response receives ``mock_get`` from a
    ``@patch`` decorator, so no real HTTP request is made.
    """

    # Patch target for the HTTP call.
    # NOTE(review): assumes WebBrowser fetches pages by calling
    # ``requests.get(...)`` through ``import requests`` -- confirm against
    # src/web_browsing_tool.py and adjust here if it uses a Session or
    # ``from requests import get``.
    REQUESTS_GET = "requests.get"
    USER_AGENT = "TestAgent/1.0"

    def setUp(self):
        """Create a fresh browser with a fixed user agent for each test."""
        self.browser = WebBrowser(user_agent=self.USER_AGENT)

    @staticmethod
    def _html_response(content):
        """Return a mocked 200 response whose ``content`` is *content* (bytes)."""
        response = MagicMock()
        response.status_code = 200
        response.content = content
        # Replace raise_for_status explicitly so it never raises.
        response.raise_for_status = MagicMock()
        return response

    @patch(REQUESTS_GET)
    def test_browse_successful_fetch_and_parse(self, mock_get):
        """Title and body text are extracted; script content is dropped."""
        mock_get.return_value = self._html_response(
            b"<html><head><title>Test Page</title></head><body><p>Hello World!</p><script>alert('test');</script></body></html>"
        )
        url = "http://example.com/testpage"

        result = self.browser.browse(url)

        mock_get.assert_called_once_with(
            url, headers={"User-Agent": self.USER_AGENT}, timeout=15
        )
        self.assertEqual(result, "Test Page\nHello World!")

    @patch(REQUESTS_GET)
    def test_browse_http_error(self, mock_get):
        """An HTTPError from the fetch is reported as an error string."""
        mock_get.side_effect = requests.exceptions.HTTPError(
            "404 Client Error: Not Found for url"
        )
        url = "http://example.com/notfound"

        result = self.browser.browse(url)

        mock_get.assert_called_once_with(
            url, headers={"User-Agent": self.USER_AGENT}, timeout=15
        )
        self.assertTrue(result.startswith("Error: HTTP error occurred"))
        self.assertIn("404 Client Error", result)

    @patch(REQUESTS_GET)
    def test_browse_connection_error(self, mock_get):
        """A ConnectionError from the fetch is reported as an error string."""
        mock_get.side_effect = requests.exceptions.ConnectionError("Connection refused")
        url = "http://example.com/unreachable"

        result = self.browser.browse(url)

        self.assertTrue(result.startswith("Error: Connection error occurred"))
        self.assertIn("Connection refused", result)

    @patch(REQUESTS_GET)
    def test_browse_timeout_error(self, mock_get):
        """A Timeout from the fetch is reported as an error string."""
        mock_get.side_effect = requests.exceptions.Timeout("Request timed out")
        url = "http://example.com/slowresponse"

        result = self.browser.browse(url)

        self.assertTrue(result.startswith("Error: Timeout occurred"))
        self.assertIn("Request timed out", result)

    @patch(REQUESTS_GET)
    def test_browse_generic_request_exception(self, mock_get):
        """Any other RequestException is reported as an unexpected error."""
        mock_get.side_effect = requests.exceptions.RequestException(
            "Some other request error"
        )
        url = "http://example.com/othererror"

        result = self.browser.browse(url)

        self.assertTrue(
            result.startswith("Error: An unexpected error occurred while fetching")
        )
        self.assertIn("Some other request error", result)

    def test_browse_invalid_url_format(self):
        """A URL without an http(s) scheme is rejected with an error string."""
        url = "www.example.com"  # Missing http:// or https://

        result = self.browser.browse(url)

        self.assertEqual(
            result,
            "Error: Invalid URL format. URL must start with http:// or https://. Received: www.example.com",
        )

    @patch(REQUESTS_GET)
    def test_browse_no_text_content(self, mock_get):
        """A page containing only script/style yields a 'no text' error."""
        mock_get.return_value = self._html_response(
            b"<html><head><script>var x=1;</script></head><body><style>.body {color:red;}</style></body></html>"
        )
        url = "http://example.com/notext"

        result = self.browser.browse(url)

        self.assertEqual(result, f"Error: No text content found at {url}.")

    @patch(REQUESTS_GET)
    def test_browse_strips_extra_whitespace_and_newlines(self, mock_get):
        """Extracted text is normalized to one trimmed chunk per line."""
        mock_get.return_value = self._html_response(
            b"<html><body><p>Line 1</p> <p>Line 2</p>\n\n<p>Line\n3</p><div><span>Text</span></div></body></html>"
        )
        url = "http://example.com/whitespace"

        result = self.browser.browse(url)

        expected_text = "Line 1\nLine 2\nLine\n3\nText"
        self.assertEqual(result, expected_text)

    @patch(REQUESTS_GET)
    def test_browse_for_question_answering_scenario_mercedes_sosa(self, mock_get):
        """
        Tests if the browser can extract relevant text for a question
        similar to the Mercedes Sosa studio albums count.
        """
        # Use a regular string for HTML content; the accented titles also
        # exercise the UTF-8 encode/decode path.
        mock_html_content_str = """
        <html>
        <head><title>Mercedes Sosa Discography</title></head>
        <body>
        <h1>Mercedes Sosa</h1>
        <h2>Studio Albums</h2>
        <ul>
        <li>1999 - Misa Criolla</li>
        <li>2002 - Acústico</li>
        <li>2005 - Corazón libre</li>
        <li>2009 - Cantora 1</li>
        <li>2011 - Canto para caminar</li>
        </ul>
        <h2>Live Albums</h2>
        <ul>
        <li>2000 - Live in Concert</li>
        </ul>
        </body>
        </html>
        """
        # Encode the string to bytes for the content
        mock_get.return_value = self._html_response(
            mock_html_content_str.encode("utf-8")
        )
        url = "http://example.com/mercedes_sosa_discography"

        result = self.browser.browse(url)

        # Assert that key information is present in the extracted text
        self.assertIn("Mercedes Sosa Discography", result)  # From title
        self.assertIn("Studio Albums", result)
        self.assertIn("1999 - Misa Criolla", result)
        self.assertIn("2002 - Acústico", result)
        self.assertIn("2005 - Corazón libre", result)
        self.assertIn("2009 - Cantora 1", result)
        self.assertIn("2011 - Canto para caminar", result)
        # Ensure it doesn't just grab everything indiscriminately or miss sections
        self.assertIn("Live Albums", result)
        self.assertIn("2000 - Live in Concert", result)
        # A further step (outside this tool's direct responsibility but for agent context)
        # would be to pass this 'result' to an LLM with the question:
        # "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?"
        # The LLM should be able to parse the structured list and count
        # "Acústico", "Corazón libre", "Cantora 1" -> 3.
# Run the test suite only when this file is executed directly, not on import.
if __name__ == '__main__':
    unittest.main()