Spaces:
No application file
No application file
| import pytest | |
| import responses | |
| from bs4 import BeautifulSoup | |
def test_load_data_gets_by_selectors_and_ignored_tags(selectee, ignored_tag, loader, mocked_responses, mocker):
    """Check the content the loader extracts from a single child page.

    ``selectee`` is an HTML template containing an ``{ignored_tag}``
    placeholder. The expected content is the text of its ``<h2>`` and ``<p>``
    elements joined by a single space.

    NOTE(review): ``selectee`` and ``ignored_tag`` are presumably supplied by
    parametrization that is not visible in this excerpt -- confirm against the
    full file.
    """
    quickstart_url = "https://docs.embedchain.ai/quickstart"
    selectee = selectee.format(ignored_tag=ignored_tag)

    # Child page: the rendered selectee wrapped in a minimal HTML document.
    quickstart_page = """
<!DOCTYPE html>
<html lang="en">
<body>
{selectee}
</body>
</html>
""".format(selectee=selectee)
    mocked_responses.get(quickstart_url, body=quickstart_page, status=200, content_type="text/html")

    # Root page: links to the child page only.
    url = "https://docs.embedchain.ai/"
    root_page = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/quickstart">Quickstart</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=root_page, status=200, content_type="text/html")

    # Pin the hash so the returned doc_id is predictable.
    doc_id = "mocked_hash"
    sha256_patch = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    sha256_patch.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)

    parsed_selectee = BeautifulSoup(selectee, "html.parser")
    heading_text = parsed_selectee.select_one("h2").get_text()
    paragraph_text = parsed_selectee.select_one("p").get_text()
    expected_record = {
        "content": " ".join([heading_text, paragraph_text]),
        "meta_data": {"url": "https://docs.embedchain.ai/quickstart"},
    }

    assert result["doc_id"] == doc_id
    assert result["data"] == [expected_record]
def test_load_data_gets_child_links_recursively(loader, mocked_responses, mocker):
    """Check that every page linked from the root page is loaded.

    The root page links to ``/quickstart`` and ``/introduction``. Each child
    page links back to the root and to itself. The result must contain exactly
    one record per child page.
    """
    child_url = "https://docs.embedchain.ai/quickstart"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/">..</a></li>
<li><a href="/quickstart">.</a></li>
</body>
</html>
"""
    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")

    child_url = "https://docs.embedchain.ai/introduction"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/">..</a></li>
<li><a href="/introduction">.</a></li>
</body>
</html>
"""
    mocked_responses.get(child_url, body=html_body, status=200, content_type="text/html")

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/quickstart">Quickstart</a></li>
<li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    # Pin the hash so the returned doc_id is predictable.
    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)

    assert result["doc_id"] == doc_id
    expected_data = [
        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/quickstart"}},
        {"content": "..\n.", "meta_data": {"url": "https://docs.embedchain.ai/introduction"}},
    ]

    # Compare order-insensitively but exactly. A bare subset check
    # (`all(item in expected_data ...)`) would pass vacuously if the loader
    # returned no records, or only one of the two pages.
    def by_url(record):
        return record["meta_data"]["url"]

    assert sorted(result["data"], key=by_url) == sorted(expected_data, key=by_url)
def test_load_data_fails_to_fetch_website(loader, mocked_responses, mocker):
    """Check that a child page answering 404 contributes no data.

    The root page is served normally and links to ``/introduction``, which is
    mocked to return 404. The loader is expected to return the (mocked)
    doc_id together with an empty data list.
    """
    child_url = "https://docs.embedchain.ai/introduction"
    mocked_responses.get(child_url, status=404)

    url = "https://docs.embedchain.ai/"
    html_body = """
<!DOCTYPE html>
<html lang="en">
<body>
<li><a href="/introduction">Introduction</a></li>
</body>
</html>
"""
    mocked_responses.get(url, body=html_body, status=200, content_type="text/html")

    # Pin the hash so the returned doc_id is predictable.
    mock_sha256 = mocker.patch("embedchain.loaders.docs_site_loader.hashlib.sha256")
    doc_id = "mocked_hash"
    mock_sha256.return_value.hexdigest.return_value = doc_id

    result = loader.load_data(url)

    # Compare by value, not identity: `is` on strings only works when both
    # sides happen to be the very same object.
    assert result["doc_id"] == doc_id
    assert result["data"] == []
@pytest.fixture
def loader():
    """Provide a new ``DocsSiteLoader`` instance to the requesting test.

    Registered as a pytest fixture because the tests in this module request
    ``loader`` as a function argument.
    """
    # The import is kept inside the fixture rather than at module level.
    from embedchain.loaders.docs_site_loader import DocsSiteLoader

    return DocsSiteLoader()
@pytest.fixture
def mocked_responses():
    """Yield an active ``responses.RequestsMock`` for registering fake HTTP replies.

    Registered as a pytest fixture because the tests in this module request
    ``mocked_responses`` as a function argument. The mock is entered before
    the test body runs and exited when the test finishes.
    """
    with responses.RequestsMock() as rsps:
        yield rsps