Yago Bolivar committed on
Commit
c467d81
1 Parent(s): ada4787

feat: implement WebBrowser class for fetching and parsing web content with error handling

Browse files
Files changed (2) hide show
  1. src/web_browsing_tool.py +98 -0
  2. tests/test_web_browser.py +163 -0
src/web_browsing_tool.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
class WebBrowser:
    """
    A simple web browser tool to fetch and parse content from URLs.

    Failures are never raised to the caller: ``browse`` reports every
    problem as a string that starts with ``"Error:"``.
    """

    def __init__(self, user_agent="GAIA-Agent/1.0", timeout=15):
        """
        Initializes the web browser with a user agent and a request timeout.

        Args:
            user_agent (str): The User-Agent string to use for requests.
            timeout (float): Seconds to wait for the server before giving up.
                Defaults to 15, the value that was previously hard-coded.
        """
        self.headers = {"User-Agent": user_agent}
        self.timeout = timeout

    def browse(self, url: str) -> str:
        """
        Fetches the content of a web page and extracts its text.

        Args:
            url (str): The URL of the web page to browse. Must start with
                ``http://`` or ``https://``.

        Returns:
            str: The extracted text content of the web page (one non-empty,
            whitespace-normalized line per text fragment), or an error
            message starting with ``"Error:"`` if validation, fetching or
            parsing fails.
        """
        # Validate before touching the network. The isinstance check keeps a
        # non-string argument (e.g. None) on the "return an error message"
        # path instead of raising AttributeError from .startswith().
        if not isinstance(url, str) or not url.startswith(('http://', 'https://')):
            return f"Error: Invalid URL format. URL must start with http:// or https://. Received: {url}"

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()  # Raises an HTTPError for bad responses (4XX or 5XX)

            # Hand the raw bytes to BeautifulSoup so it can pick the encoding itself.
            soup = BeautifulSoup(response.content, 'html.parser')

            # Script and style bodies are not human-readable page text.
            for script_or_style in soup(["script", "style"]):
                script_or_style.decompose()

            text_from_soup = soup.get_text(separator='\n', strip=True)

            # Drop blank lines and collapse runs of whitespace inside each line.
            cleaned_lines = [
                ' '.join(line.split())
                for line in text_from_soup.splitlines()
                if line.strip()
            ]
            text = '\n'.join(cleaned_lines)

            if not text:
                return f"Error: No text content found at {url}."

            return text

        except requests.exceptions.HTTPError as http_err:
            return f"Error: HTTP error occurred while fetching {url}: {http_err}"
        # Timeout is checked before ConnectionError because requests'
        # ConnectTimeout derives from both; this order reports it as a timeout.
        except requests.exceptions.Timeout as timeout_err:
            return f"Error: Timeout occurred while fetching {url}: {timeout_err}"
        except requests.exceptions.ConnectionError as conn_err:
            return f"Error: Connection error occurred while fetching {url}: {conn_err}"
        except requests.exceptions.RequestException as req_err:
            return f"Error: An unexpected error occurred while fetching {url}: {req_err}"
        except Exception as e:
            # Deliberate catch-all: this tool's contract is to return an error
            # string rather than propagate parsing failures to the caller.
            return f"Error: An unexpected error occurred during parsing of {url}: {e}"
71
+
72
if __name__ == '__main__':
    # Manual smoke test: performs real HTTP requests and prints the results.
    # In a real agent the URL would come from the task or from a search step;
    # for a question such as "How many studio albums were published by
    # Mercedes Sosa...", the agent would first have to locate the page below.
    demo_browser = WebBrowser()

    wikipedia_url = "https://en.wikipedia.org/wiki/Mercedes_Sosa"
    print(f"--- Browsing: {wikipedia_url} ---")
    page_text = demo_browser.browse(wikipedia_url)
    preview_limit = 1000
    # Error messages are shown in full; successful output is cut to a preview.
    if not page_text.startswith("Error:") and len(page_text) > preview_limit:
        page_text = page_text[:preview_limit] + "..."
    print(page_text)

    # The remaining demos exercise the failure paths and print whatever comes back.
    failure_demos = (
        ("\n--- Example with a non-existent page ---",
         "http://example.com/nonexistentpage12345.html"),
        ("\n--- Example with an invalid URL format ---",
         "www.google.com"),
    )
    for heading, demo_url in failure_demos:
        print(heading)
        print(demo_browser.browse(demo_url))
tests/test_web_browser.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from unittest.mock import patch, MagicMock
3
+ import requests # Import requests for its exception types
4
+
5
+ import os
6
+ import sys
7
+
8
+ # Add the parent directory to sys.path to find the src module
9
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
10
+
11
+ # Adjust the import path based on your project structure
12
+ # If web_browsing_tool.py is in a 'src' directory:
13
+ from src.web_browsing_tool import WebBrowser
14
+
15
+ # If web_browser.py is in the same directory as app.py (and tools are in a 'tools' subdir):
16
+ # from tools.web_browser import WebBrowser
17
+
18
class TestWebBrowser(unittest.TestCase):
    """Unit tests for WebBrowser.browse with requests.get patched out (no network access)."""

    def setUp(self):
        self.browser = WebBrowser(user_agent="TestAgent/1.0")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_successful_fetch_and_parse(self, mock_get):
        # Mock the response from requests.get
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.content = b"<html><head><title>Test Page</title></head><body><p>Hello World!</p><script>alert('test');</script></body></html>"
        mock_response.raise_for_status = MagicMock()  # Ensure this doesn't raise an error
        mock_get.return_value = mock_response

        url = "http://example.com/testpage"
        result = self.browser.browse(url)

        mock_get.assert_called_once_with(url, headers={"User-Agent": "TestAgent/1.0"}, timeout=15)
        # The <script> body must be gone; title and paragraph remain, one per line.
        self.assertEqual(result, "Test Page\nHello World!")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_http_error(self, mock_get):
        # requests.get returns normally for 4XX/5XX responses; the HTTPError
        # comes from raise_for_status(), so raise it there to exercise the
        # same path as a real failing request.
        mock_response = MagicMock()
        mock_response.status_code = 404
        mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError(
            "404 Client Error: Not Found for url"
        )
        mock_get.return_value = mock_response

        url = "http://example.com/notfound"
        result = self.browser.browse(url)

        mock_get.assert_called_once_with(url, headers={"User-Agent": "TestAgent/1.0"}, timeout=15)
        mock_response.raise_for_status.assert_called_once_with()
        self.assertTrue(result.startswith("Error: HTTP error occurred"))
        self.assertIn("404 Client Error", result)

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_connection_error(self, mock_get):
        mock_get.side_effect = requests.exceptions.ConnectionError("Connection refused")

        url = "http://example.com/unreachable"
        result = self.browser.browse(url)
        self.assertTrue(result.startswith("Error: Connection error occurred"))
        self.assertIn("Connection refused", result)

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_timeout_error(self, mock_get):
        mock_get.side_effect = requests.exceptions.Timeout("Request timed out")

        url = "http://example.com/slowresponse"
        result = self.browser.browse(url)
        self.assertTrue(result.startswith("Error: Timeout occurred"))
        self.assertIn("Request timed out", result)

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_generic_request_exception(self, mock_get):
        mock_get.side_effect = requests.exceptions.RequestException("Some other request error")

        url = "http://example.com/othererror"
        result = self.browser.browse(url)
        self.assertTrue(result.startswith("Error: An unexpected error occurred while fetching"))
        self.assertIn("Some other request error", result)

    def test_browse_invalid_url_format(self):
        url = "www.example.com"  # Missing http:// or https://
        result = self.browser.browse(url)
        self.assertEqual(result, "Error: Invalid URL format. URL must start with http:// or https://. Received: www.example.com")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_no_text_content(self, mock_get):
        # A page consisting solely of <script>/<style> has no extractable text.
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.content = b"<html><head><script>var x=1;</script></head><body><style>.body {color:red;}</style></body></html>"
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value = mock_response

        url = "http://example.com/notext"
        result = self.browser.browse(url)
        self.assertEqual(result, f"Error: No text content found at {url}.")

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_strips_extra_whitespace_and_newlines(self, mock_get):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.content = b"<html><body><p>Line 1</p> <p>Line 2</p>\n\n<p>Line\n3</p><div><span>Text</span></div></body></html>"
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value = mock_response

        url = "http://example.com/whitespace"
        result = self.browser.browse(url)
        # A newline inside a single text node ("Line\n3") is kept as a line break.
        expected_text = "Line 1\nLine 2\nLine\n3\nText"
        self.assertEqual(result, expected_text)

    @patch('src.web_browsing_tool.requests.get')
    def test_browse_for_question_answering_scenario_mercedes_sosa(self, mock_get):
        """
        Tests if the browser can extract relevant text for a question
        similar to the Mercedes Sosa studio albums count.
        """
        # Use a regular string for HTML content; the accented titles check
        # that non-ASCII text survives the UTF-8 round trip.
        mock_html_content_str = """
        <html>
        <head><title>Mercedes Sosa Discography</title></head>
        <body>
        <h1>Mercedes Sosa</h1>
        <h2>Studio Albums</h2>
        <ul>
        <li>1999 - Misa Criolla</li>
        <li>2002 - Acústico</li>
        <li>2005 - Corazón libre</li>
        <li>2009 - Cantora 1</li>
        <li>2011 - Canto para caminar</li>
        </ul>
        <h2>Live Albums</h2>
        <ul>
        <li>2000 - Live in Concert</li>
        </ul>
        </body>
        </html>
        """
        mock_response = MagicMock()
        mock_response.status_code = 200
        # Encode the string to bytes for the content
        mock_response.content = mock_html_content_str.encode('utf-8')
        mock_response.raise_for_status = MagicMock()
        mock_get.return_value = mock_response

        url = "http://example.com/mercedes_sosa_discography"
        result = self.browser.browse(url)

        # Assert that key information is present in the extracted text
        self.assertIn("Mercedes Sosa Discography", result)  # From title
        self.assertIn("Studio Albums", result)
        self.assertIn("1999 - Misa Criolla", result)
        self.assertIn("2002 - Acústico", result)
        self.assertIn("2005 - Corazón libre", result)
        self.assertIn("2009 - Cantora 1", result)
        self.assertIn("2011 - Canto para caminar", result)

        # Ensure it doesn't just grab everything indiscriminately or miss sections
        self.assertIn("Live Albums", result)
        self.assertIn("2000 - Live in Concert", result)

        # A further step (outside this tool's direct responsibility but for agent context)
        # would be to pass this 'result' to an LLM with the question:
        # "How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)?"
        # The LLM should be able to parse the structured list and count "Acústico", "Corazón libre", "Cantora 1" -> 3.
161
+
162
if __name__ == '__main__':
    # Run the whole suite when this file is executed directly.
    unittest.main()