Spaces:
Sleeping
Sleeping
| """ | |
| Unit Tests for Parser Module | |
| Tests for HTML parsing and article content extraction. | |
| """ | |
| from __future__ import annotations | |
| import pytest | |
| from bs4 import BeautifulSoup | |
| from src.parser import extract_article_content, extract_search_results | |
| # ============================================================================= | |
| # FIXTURES | |
| # ============================================================================= | |
@pytest.fixture
def sample_article_html() -> str:
    """Sample Medium article HTML for testing.

    Registered as a pytest fixture so tests can request it by parameter
    name; without the decorator pytest raises "fixture not found".
    """
    return """
    <!DOCTYPE html>
    <html>
    <head><title>Test Article</title></head>
    <body>
        <article>
            <h1 data-testid="storyTitle">Understanding Python Async</h1>
            <div class="pw-subtitle-paragraph">A comprehensive guide to asyncio</div>
            <div class="pw-author-name">Test Author</div>
            <section>
                <p>This is the first paragraph of the article.</p>
                <p>Here is another paragraph with important content.</p>
                <pre><code>import asyncio</code></pre>
                <h2>Section Title</h2>
                <p>More content here.</p>
            </section>
        </article>
        <button data-testid="clapButton">1.5K</button>
        <button data-testid="responseCount">42</button>
    </body>
    </html>
    """
@pytest.fixture
def sample_search_html() -> str:
    """Sample Medium search results HTML.

    Registered as a pytest fixture so tests can request it by parameter
    name; without the decorator pytest raises "fixture not found".
    """
    return """
    <html>
    <body>
        <div class="streamItem">
            <a href="/@user/article-abc123">
                <h3>Article Title 1</h3>
            </a>
            <a class="ds-link" href="/@testuser">Test Author</a>
            <span class="readingTime">5 min read</span>
        </div>
        <div class="streamItem">
            <a href="/@user2/article2-def456">
                <h3>Article Title 2</h3>
            </a>
            <a class="ds-link" href="/@author2">Second Author</a>
            <span class="readingTime">10 min read</span>
        </div>
    </body>
    </html>
    """
@pytest.fixture
def cloudflare_blocked_html() -> str:
    """HTML returned when Cloudflare blocks the request.

    Registered as a pytest fixture so tests can request it by parameter
    name; without the decorator pytest raises "fixture not found".
    """
    return """
    <!DOCTYPE html>
    <html>
    <head><title>Just a moment...</title></head>
    <body>
        <div id="cf-wrapper">
            <div id="challenge-running">
                <div class="cf-error-title">
                    Checking your browser before accessing medium.com
                </div>
            </div>
        </div>
    </body>
    </html>
    """
@pytest.fixture
def paywall_html() -> str:
    """HTML for a paywalled (metered) Medium article.

    Registered as a pytest fixture for consistency with the other HTML
    fixtures in this module; not yet consumed by a visible test.
    """
    return """
    <!DOCTYPE html>
    <html>
    <body>
        <article>
            <h1 data-testid="storyTitle">Premium Article</h1>
            <div class="meteredContent">
                <p>You've reached your limit...</p>
            </div>
            <div class="pw-paywall-modal">
                <p>Become a member to read this story.</p>
            </div>
        </article>
    </body>
    </html>
    """
| # ============================================================================= | |
| # ARTICLE CONTENT EXTRACTION TESTS | |
| # ============================================================================= | |
class TestExtractArticleContent:
    """Tests for article content extraction."""

    def test_extract_basic_article(self, sample_article_html: str) -> None:
        """Title must be extracted from a well-formed article."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)
        assert result is not None
        assert "title" in result
        assert "Understanding Python Async" in result.get("title", "")

    def test_extract_with_url_fallback(self, cloudflare_blocked_html: str) -> None:
        """Fallback to URL parsing when the page content is blocked."""
        soup = BeautifulSoup(cloudflare_blocked_html, "html.parser")
        url = "https://medium.com/@user/article-title-abc123def4"
        result = extract_article_content(soup, url)
        # Should still return something (may use URL-based extraction).
        assert result is not None

    def test_extract_claps_count(self, sample_article_html: str) -> None:
        """Claps count, when extracted, must be non-negative."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)
        # Guard against a None result before the membership test:
        # `"claps" in None` would raise TypeError rather than skip.
        if result and "claps" in result:
            assert result["claps"] >= 0

    def test_empty_html(self) -> None:
        """Empty HTML must be handled gracefully (no exception)."""
        soup = BeautifulSoup("", "html.parser")
        result = extract_article_content(soup)
        # A dict or None are both acceptable; anything else (or a raise)
        # is a failure. The previous `x is not None or x is None` check
        # was a tautology that could never fail.
        assert result is None or isinstance(result, dict)

    def test_minimal_html(self) -> None:
        """Minimal HTML with an empty body must be handled gracefully."""
        soup = BeautifulSoup("<html><body></body></html>", "html.parser")
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)
class TestExtractSearchResults:
    """Tests for search results extraction."""

    def test_extract_search_results_basic(self, sample_search_html: str) -> None:
        """Extraction from a populated results page yields a list."""
        parsed = BeautifulSoup(sample_search_html, "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert isinstance(extracted, list)

    def test_empty_search_results(self) -> None:
        """An empty results page yields an empty list."""
        parsed = BeautifulSoup("<html><body></body></html>", "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert isinstance(extracted, list)
        assert not extracted

    def test_search_results_structure(self, sample_search_html: str) -> None:
        """Every extracted search result is a dict."""
        parsed = BeautifulSoup(sample_search_html, "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert all(isinstance(item, dict) for item in extracted)
class TestRobustness:
    """Tests for parser robustness against hostile or unusual input."""

    def test_malformed_html(self) -> None:
        """Malformed HTML must not crash the parser."""
        malformed = "<html><body><p>Unclosed paragraph<div>Mixed tags</p></html>"
        soup = BeautifulSoup(malformed, "html.parser")
        result = extract_article_content(soup)
        # A dict or None are both acceptable; anything else (or a raise)
        # is a failure. The previous `x is not None or x is None` check
        # was a tautology that could never fail.
        assert result is None or isinstance(result, dict)

    def test_deeply_nested_html(self) -> None:
        """Deeply nested structures must not crash the parser."""
        nested = "<div>" * 50 + "<p>Content</p>" + "</div>" * 50
        soup = BeautifulSoup(nested, "html.parser")
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)

    def test_large_html_document(self) -> None:
        """Large documents must be handled without crashing."""
        large = "<html><body>" + "<p>Paragraph content.</p>" * 1000 + "</body></html>"
        soup = BeautifulSoup(large, "html.parser")
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)