| | |
| |
|
| | import requests |
| |
|
| | from autogpt.commands.web_requests import scrape_text |
| |
|
| | """ |
| | Code Analysis |
| | |
| | Objective: |
| | The objective of the "scrape_text" function is to scrape the text content from |
| | a given URL and return it as a string, after removing any unwanted HTML tags and scripts. |
| | |
| | Inputs: |
| | - url: a string representing the URL of the webpage to be scraped. |
| | |
| | Flow: |
| | 1. Send a GET request to the given URL using the requests library and the user agent header from the config file. |
| | 2. Check if the response contains an HTTP error. If it does, return an error message. |
| | 3. Use BeautifulSoup to parse the HTML content of the response and remove all script and style tags. |
| | 4. Get the text content of the remaining HTML using the get_text() method of BeautifulSoup. |
| | 5. Split the text into lines and then into chunks, removing any extra whitespace. |
| | 6. Join the chunks into a single string with newline characters between them. |
| | 7. Return the cleaned text. |
| | |
| | Outputs: |
| | - A string representing the cleaned text content of the webpage. |
| | |
| | Additional aspects: |
| | - The function uses the requests library and BeautifulSoup to handle the HTTP request and HTML parsing, respectively. |
| | - The function removes script and style tags from the HTML to avoid including unwanted content in the text output. |
| | - The function uses a generator expression to split the text into lines and chunks, which can improve performance for large amounts of text. |
| | """ |
| |
|
| |
|
class TestScrapeText:
    """Tests for ``scrape_text`` with ``requests.Session.get`` patched.

    Every test replaces ``requests.Session.get`` through the ``mocker``
    fixture, so no real HTTP request is made.
    """

    def test_scrape_text_with_valid_url(self, mocker):
        """A 200 page holding one styled paragraph yields that paragraph's text."""
        expected_text = "This is some sample text"
        page = f"<html><body><div><p style='color: blue;'>{expected_text}</p></div></body></html>"
        fake_response = mocker.Mock(status_code=200, text=page)
        mocker.patch("requests.Session.get", return_value=fake_response)

        scraped = scrape_text("http://www.example.com")

        assert scraped == expected_text

    def test_invalid_url(self, mocker):
        """A ``RequestException`` raised by the GET is reported as an error string."""
        mocker.patch(
            "requests.Session.get",
            side_effect=requests.exceptions.RequestException,
        )

        outcome = scrape_text("http://www.invalidurl.com")

        assert "Error:" in outcome

    def test_no_text(self, mocker):
        """A 200 page with an empty body yields an empty string."""
        fake_response = mocker.Mock(
            status_code=200,
            text="<html><body></body></html>",
        )
        mocker.patch("requests.Session.get", return_value=fake_response)

        scraped = scrape_text("http://www.example.com")

        assert scraped == ""

    def test_http_error(self, mocker):
        """A 404 status code is reported as an HTTP error message."""
        not_found = mocker.Mock()
        not_found.status_code = 404
        mocker.patch("requests.Session.get", return_value=not_found)

        outcome = scrape_text("https://www.example.com")

        assert outcome == "Error: HTTP 404 error"

    def test_scrape_text_with_html_tags(self, mocker):
        """Inline markup such as ``<b>`` is dropped while its text is kept."""
        fake_response = mocker.Mock(
            status_code=200,
            text="<html><body><p>This is <b>bold</b> text.</p></body></html>",
        )
        mocker.patch("requests.Session.get", return_value=fake_response)

        scraped = scrape_text("https://www.example.com")

        assert scraped == "This is bold text."
| |
|