| | import pytest |
| | import requests_mock |
| | from bs4 import BeautifulSoup |
| |
|
| | from ankigen_core.crawler import WebCrawler |
| |
|
# Domain used by every mocked response in this module.
BASE_URL = "http://example.com"
# Same-domain page the crawler is expected to follow at depth 1.
SUB_PAGE_URL = f"{BASE_URL}/subpage"
# Off-domain page the crawler must never fetch.
EXTERNAL_URL = "http://anotherdomain.com"
| |
|
| |
|
@pytest.fixture
def crawler_fixture():
    """Plain crawler rooted at BASE_URL, limited to one level of depth."""
    crawler = WebCrawler(start_url=BASE_URL, max_depth=1)
    return crawler
| |
|
| |
|
@pytest.fixture
def crawler_with_patterns_fixture():
    """Crawler restricted to /docs/ URLs, with the /docs/v1/ subtree excluded."""
    include = [r"http://example\.com/docs/.*"]
    exclude = [r"http://example\.com/docs/v1/.*"]
    return WebCrawler(
        start_url=BASE_URL,
        max_depth=1,
        include_patterns=include,
        exclude_patterns=exclude,
    )
| |
|
| |
|
| | |
| |
|
| |
|
def test_is_valid_url_valid(crawler_fixture):
    """Same-domain HTTP URLs pass validation."""
    for path in ("/page1", "/another/page"):
        assert crawler_fixture._is_valid_url(f"{BASE_URL}{path}")
| |
|
| |
|
def test_is_valid_url_different_domain(crawler_fixture):
    """URLs on a foreign domain are rejected."""
    foreign = "http://otherdomain.com/page"
    assert not crawler_fixture._is_valid_url(foreign)
| |
|
| |
|
def test_is_valid_url_different_scheme(crawler_fixture):
    """Non-HTTP(S) schemes such as ftp: and mailto: are rejected."""
    for bad_scheme_url in ("ftp://example.com/page", "mailto:user@example.com"):
        assert not crawler_fixture._is_valid_url(bad_scheme_url)
| |
|
| |
|
def test_is_valid_url_malformed(crawler_fixture):
    """Malformed URLs (typo'd scheme, missing host) are rejected."""
    for malformed in ("htp://example.com/page", "http:///page"):
        assert not crawler_fixture._is_valid_url(malformed)
| |
|
| |
|
def test_is_valid_url_include_patterns_match(crawler_with_patterns_fixture):
    """URLs under /docs/ satisfy the include pattern."""
    for path in ("/docs/page1", "/docs/topic/subtopic"):
        assert crawler_with_patterns_fixture._is_valid_url(f"{BASE_URL}{path}")
| |
|
| |
|
def test_is_valid_url_include_patterns_no_match(crawler_with_patterns_fixture):
    """URLs outside /docs/ fail the include pattern."""
    blog_url = f"{BASE_URL}/blog/page1"
    assert not crawler_with_patterns_fixture._is_valid_url(blog_url)
| |
|
| |
|
def test_is_valid_url_exclude_patterns_match(crawler_with_patterns_fixture):
    """URLs under /docs/v1/ are rejected by the exclude pattern even though
    they also match the include pattern."""
    v1_url = f"{BASE_URL}/docs/v1/page1"
    assert not crawler_with_patterns_fixture._is_valid_url(v1_url)
| |
|
| |
|
def test_is_valid_url_exclude_patterns_no_match(crawler_with_patterns_fixture):
    """/docs/ URLs outside the excluded v1 subtree remain valid."""
    v2_url = f"{BASE_URL}/docs/v2/page1"
    assert crawler_with_patterns_fixture._is_valid_url(v2_url)
| |
|
| |
|
def test_is_valid_url_no_patterns_defined(crawler_fixture):
    """Without include/exclude patterns, any same-domain path is valid."""
    any_path = f"{BASE_URL}/any/path"
    assert crawler_fixture._is_valid_url(any_path)
| |
|
| |
|
| | |
| |
|
| |
|
@pytest.mark.parametrize(
    "html_content, base_url, expected_links",
    [
        # Relative and absolute same-domain links both resolve against base_url.
        (
            """<a href="/page1">1</a> <a href="http://example.com/page2">2</a>""",
            BASE_URL,
            [f"{BASE_URL}/page1", f"{BASE_URL}/page2"],
        ),
        # Fragment-only and javascript: hrefs are dropped.
        (
            """<a href="#section">S</a> <a href="javascript:void(0)">JS</a> <a href="/page3">3</a>""",
            BASE_URL,
            [f"{BASE_URL}/page3"],
        ),
        # Off-domain anchors are filtered out.
        (
            """<a href="http://anotherdomain.com">Ext</a> <a href="/page4">4</a>""",
            BASE_URL,
            [f"{BASE_URL}/page4"],
        ),
        # Anchors without an href attribute are ignored.
        ("""<a>No Href</a> <a href="/page5">5</a>""", BASE_URL, [f"{BASE_URL}/page5"]),
        # Empty href resolves to the base URL itself and is not collected.
        (
            """<a href="">Empty Href</a> <a href="/page6">6</a>""",
            BASE_URL,
            [f"{BASE_URL}/page6"],
        ),
        # Relative paths resolve against a base URL that has a trailing path.
        (
            """<a href="sub/page7">7</a>""",
            f"{BASE_URL}/path/",
            [f"{BASE_URL}/path/sub/page7"],
        ),
    ],
)
def test_extract_links(crawler_fixture, html_content, base_url, expected_links):
    """_extract_links resolves hrefs against base_url and keeps only valid ones."""
    parsed = BeautifulSoup(html_content, "html.parser")
    extracted = crawler_fixture._extract_links(parsed, base_url)
    # Order is not part of the contract; compare as sorted lists.
    assert sorted(extracted) == sorted(expected_links)
| |
|
| |
|
def test_extract_links_with_filtering(crawler_with_patterns_fixture):
    """Include/exclude patterns are applied during link extraction."""
    html = """
    <a href="http://example.com/docs/pageA">Allowed Doc</a>
    <a href="http://example.com/docs/v1/pageB">Excluded Doc v1</a>
    <a href="http://example.com/blog/pageC">Non-Doc Page</a>
    <a href="http://example.com/docs/v2/pageD">Allowed Doc v2</a>
    """
    parsed = BeautifulSoup(html, "html.parser")

    extracted = crawler_with_patterns_fixture._extract_links(parsed, BASE_URL)
    expected = [f"{BASE_URL}/docs/pageA", f"{BASE_URL}/docs/v2/pageD"]
    assert sorted(extracted) == sorted(expected)
| |
|
| |
|
| | |
@pytest.mark.parametrize(
    "html_content, expected_text",
    [
        # script/style content is stripped; remaining text is space-joined.
        (
            "<html><head><title>T</title><script>alert('x');</script><style>.c{}</style></head><body><p>Hello</p><div>World</div></body></html>",
            "T Hello World",
        ),
        # Bare text survives unchanged.
        ("<body>Just text</body>", "Just text"),
        # Structural elements (nav/main/footer) all contribute their text.
        (
            "<body><nav>Menu</nav><main><p>Main content</p></main><footer>Foot</footer></body>",
            "Menu Main content Foot",
        ),
    ],
)
def test_extract_text(crawler_fixture, html_content, expected_text):
    """_extract_text returns the visible text of a parsed document."""
    parsed = BeautifulSoup(html_content, "html.parser")
    assert crawler_fixture._extract_text(parsed) == expected_text
| |
|
| |
|
| | |
| |
|
| |
|
def test_crawl_single_page_no_links(crawler_fixture):
    """Crawling a page without anchors yields exactly that one page."""
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text="<html><head><title>Test Title</title></head><body>No links here.</body></html>",
        )

        results = crawler_fixture.crawl()

        assert len(results) == 1
        only_page = results[0]
        assert only_page.url == BASE_URL
        assert only_page.title == "Test Title"
        assert "No links here" in only_page.text_content
        # No meta tags in the fixture HTML, so both fields stay empty.
        assert only_page.meta_description is None
        assert only_page.meta_keywords == []
| |
|
| |
|
def test_crawl_with_links_and_depth(crawler_fixture):
    """A depth-1 crawl follows same-domain links but not external ones,
    and does not descend past max_depth."""
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title><meta name="description" content="Main page desc"><meta name="keywords" content="main, test"></head>
            <body><a href="{SUB_PAGE_URL}">Subpage</a> <a href="{EXTERNAL_URL}">External</a></body></html>""",
        )
        m.get(
            SUB_PAGE_URL,
            text="""<html><head><title>Sub</title></head><body>Subpage content. <a href="http://example.com/another_sub">Deeper</a></body></html>""",
        )
        # Mocked so a mistaken request would succeed rather than error out;
        # the crawler must simply never ask for it.
        m.get(EXTERNAL_URL, text="External content")

        results = crawler_fixture.crawl()

        # Only the start page and its same-domain child are crawled.
        assert len(results) == 2

        main_page = next(p for p in results if p.url == BASE_URL)
        sub_page = next(p for p in results if p.url == SUB_PAGE_URL)

        assert main_page.title == "Main"
        assert main_page.meta_description == "Main page desc"
        assert sorted(main_page.meta_keywords) == sorted(["main", "test"])
        assert "Subpage" in main_page.text_content

        assert sub_page.title == "Sub"
        assert "Subpage content" in sub_page.text_content
        assert sub_page.crawl_depth == 1
        assert sub_page.parent_url == BASE_URL

        # The "Deeper" link at depth 2 exceeds max_depth=1 and is not visited.
        assert len(crawler_fixture.visited_urls) == 2
| | |
| |
|
| |
|
def test_crawl_respects_max_depth_zero(crawler_fixture):
    """With max_depth=0 only the start URL is fetched, links untouched."""
    crawler_fixture.max_depth = 0
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Depth Zero</title></head>
            <body><a href="{SUB_PAGE_URL}">Link</a></body></html>""",
        )

        results = crawler_fixture.crawl()
        assert len(results) == 1
        assert results[0].url == BASE_URL
        assert results[0].title == "Depth Zero"
        assert len(crawler_fixture.visited_urls) == 1
| |
|
| |
|
def test_crawl_handles_http_error(crawler_fixture):
    """An HTTP error on a linked page is tolerated: the page yields no result
    but is still marked visited so it is not retried."""
    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title></head><body><a href="{SUB_PAGE_URL}">Subpage</a></body></html>""",
        )
        m.get(SUB_PAGE_URL, status_code=404, text="Not Found")

        results = crawler_fixture.crawl()

        assert len(results) == 1
        assert results[0].url == BASE_URL
        # The failing URL was attempted, hence recorded as visited.
        assert SUB_PAGE_URL in crawler_fixture.visited_urls
| |
|
| |
|
def test_crawl_include_exclude_patterns(crawler_with_patterns_fixture):
    """End-to-end crawl honours include patterns (docs only) and exclude
    patterns (no docs/v1) when deciding which links to follow."""
    page_docs_allowed = f"{BASE_URL}/docs/allowed"
    page_docs_v1_excluded = f"{BASE_URL}/docs/v1/excluded"
    page_docs_v2_allowed = f"{BASE_URL}/docs/v2/allowed_link"
    page_blog_excluded = f"{BASE_URL}/blog/initial_link"

    # Re-point the fixture at a start URL that itself passes the include filter.
    crawler_with_patterns_fixture.start_url = page_docs_allowed

    with requests_mock.Mocker() as m:
        m.get(
            page_docs_allowed,
            text=f"""<html><head><title>Docs Allowed</title></head>
            <body>
                <a href="{page_docs_v1_excluded}">To Excluded v1</a>
                <a href="{page_docs_v2_allowed}">To Allowed v2</a>
                <a href="{page_blog_excluded}">To Blog</a>
            </body></html>""",
        )
        # Mocks for the filtered URLs exist so an accidental fetch would not
        # raise — the assertions below prove they are simply never crawled.
        m.get(page_docs_v1_excluded, text="V1 Excluded Content")
        m.get(
            page_docs_v2_allowed,
            text="<html><head><title>Docs V2 Allowed</title></head><body>V2 Content</body></html>",
        )
        m.get(page_blog_excluded, text="Blog Content")

        results = crawler_with_patterns_fixture.crawl()

        # Start page + the single link that survives both filters.
        assert len(results) == 2

        crawled_urls = [page.url for page in results]
        assert page_docs_allowed in crawled_urls
        assert page_docs_v2_allowed in crawled_urls
        assert page_docs_v1_excluded not in crawled_urls
        assert page_blog_excluded not in crawled_urls

        v2_page = next(page for page in results if page.url == page_docs_v2_allowed)
        assert v2_page.title == "Docs V2 Allowed"
| |
|
| |
|
def test_crawl_progress_callback(crawler_fixture):
    """crawl() reports progress through the supplied callback."""
    events = []

    def record_progress(processed_count, total_urls, current_url):
        events.append((processed_count, total_urls, current_url))

    with requests_mock.Mocker() as m:
        m.get(
            BASE_URL,
            text=f"""<html><head><title>Main</title></head>
            <body>
                <a href="{SUB_PAGE_URL}">Subpage</a>
                <a href="{BASE_URL}/another">Another</a>
            </body></html>""",
        )
        m.get(SUB_PAGE_URL, text="<html><body>Sub</body></html>")
        m.get(f"{BASE_URL}/another", text="<html><body>Another</body></html>")

        crawler_fixture.crawl(progress_callback=record_progress)

    # NOTE(review): 7 reflects the current notification cadence of
    # WebCrawler.crawl for 3 fetched pages (more than one event per URL) —
    # confirm against the crawler implementation if this starts failing.
    assert len(events) == 7

    # The first event fires before any page is processed, for the start URL.
    assert events[0][0] == 0
    assert events[0][2] == BASE_URL
| | |
| |
|