"""
Unit Tests for Parser Module

Tests for HTML parsing and article content extraction.
"""

from __future__ import annotations

from collections.abc import Mapping

import pytest
from bs4 import BeautifulSoup

from src.parser import extract_article_content, extract_search_results


# =============================================================================
# FIXTURES
# =============================================================================


@pytest.fixture
def sample_article_html() -> str:
    """Sample Medium article HTML for testing."""
    # NOTE(review): this literal holds plain text only (no tags). Presumably
    # the markup was stripped at some point -- confirm against the parser's
    # expectations before relying on title/author extraction from it.
    return (
        " Test Article\n\n"
        "Understanding Python Async\n\n"
        "A comprehensive guide to asyncio\n\n"
        "Test Author\n\n"
        "This is the first paragraph of the article.\n\n"
        "Here is another paragraph with important content.\n\n"
        "import asyncio\n\n"
        "Section Title\n\n"
        "More content here.\n\n"
    )


@pytest.fixture
def sample_search_html() -> str:
    """Sample Medium search results HTML."""
    # NOTE(review): text only, no tags -- see sample_article_html.
    return (
        "\n\n"
        "Article Title 1\n\n"
        "Test Author 5 min read\n\n"
        "Article Title 2\n\n"
        "Second Author 10 min read\n\n"
    )


@pytest.fixture
def cloudflare_blocked_html() -> str:
    """HTML returned when Cloudflare blocks the request."""
    return (
        " Just a moment...\n\n"
        "Checking your browser before accessing medium.com\n\n"
    )


@pytest.fixture
def paywall_html() -> str:
    """HTML for paywalled article."""
    # NOTE(review): no test visible in this module requests this fixture.
    return (
        "\n\n"
        "Premium Article\n\n"
        "You've reached your limit...\n\n"
    )


# =============================================================================
# HELPERS
# =============================================================================


def _assert_none_or_mapping(result: object) -> None:
    """Assert that an extraction result is either ``None`` or a mapping.

    The robustness tests only need to prove that the parser survives odd
    input. Reaching this call already shows no exception was raised; this
    check additionally pins the shape of whatever came back, so the
    assertion can actually fail.

    NOTE(review): "mapping" is inferred from how the other tests in this
    module use the result (``in``, ``.get()``, ``[]``) -- confirm against
    the return type of ``extract_article_content``.
    """
    assert result is None or isinstance(result, Mapping)


# =============================================================================
# ARTICLE CONTENT EXTRACTION TESTS
# =============================================================================


class TestExtractArticleContent:
    """Tests for article content extraction."""

    def test_extract_basic_article(self, sample_article_html: str) -> None:
        """Test extraction from a basic article."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)

        assert result is not None
        assert "title" in result
        assert "Understanding Python Async" in result.get("title", "")

    def test_extract_with_url_fallback(self, cloudflare_blocked_html: str) -> None:
        """Test fallback to URL parsing when content is blocked."""
        soup = BeautifulSoup(cloudflare_blocked_html, "html.parser")
        url = "https://medium.com/@user/article-title-abc123def4"
        result = extract_article_content(soup, url)

        # Should still return something (may use URL-based extraction)
        assert result is not None

    def test_extract_claps_count(self, sample_article_html: str) -> None:
        """Test claps count extraction."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)

        # Fail with a clear assertion instead of a TypeError from
        # `"claps" in None` if extraction returned nothing.
        assert result is not None

        # Check if claps were extracted (format varies)
        if "claps" in result:
            assert result["claps"] >= 0

    def test_empty_html(self) -> None:
        """Test handling of empty HTML."""
        soup = BeautifulSoup("", "html.parser")
        result = extract_article_content(soup)

        # Should handle gracefully: either None or a mapping is acceptable
        _assert_none_or_mapping(result)

    def test_minimal_html(self) -> None:
        """Test handling of minimal HTML."""
        # NOTE(review): the input is the same empty string as in
        # test_empty_html; presumably minimal markup was intended -- confirm.
        soup = BeautifulSoup("", "html.parser")
        result = extract_article_content(soup)

        _assert_none_or_mapping(result)


class TestExtractSearchResults:
    """Tests for search results extraction."""

    def test_extract_search_results_basic(self, sample_search_html: str) -> None:
        """Test basic search result extraction."""
        soup = BeautifulSoup(sample_search_html, "html.parser")
        base_url = "https://medium.com"
        results = extract_search_results(soup, base_url)

        assert isinstance(results, list)

    def test_empty_search_results(self) -> None:
        """Test empty search results page."""
        soup = BeautifulSoup("", "html.parser")
        results = extract_search_results(soup, "https://medium.com")

        assert isinstance(results, list)
        assert len(results) == 0

    def test_search_results_structure(self, sample_search_html: str) -> None:
        """Test structure of extracted search results."""
        soup = BeautifulSoup(sample_search_html, "html.parser")
        results = extract_search_results(soup, "https://medium.com")

        for result in results:
            # Each result should be a dict
            assert isinstance(result, dict)


class TestRobustness:
    """Tests for parser robustness."""

    def test_malformed_html(self) -> None:
        """Test handling of malformed HTML."""
        malformed = "\n\nUnclosed paragraph\n\nMixed tags\n\n"
        soup = BeautifulSoup(malformed, "html.parser")

        # Should not crash
        result = extract_article_content(soup)
        _assert_none_or_mapping(result)

    def test_deeply_nested_html(self) -> None:
        """Test handling of deeply nested structures."""
        nested = "\n\n" * 50 + "\n\nContent\n\n" + "\n\n" * 50
        soup = BeautifulSoup(nested, "html.parser")

        # Should handle deep nesting
        result = extract_article_content(soup)
        _assert_none_or_mapping(result)

    def test_large_html_document(self) -> None:
        """Test handling of large HTML documents."""
        large = "" + "\n\nParagraph content.\n\n" * 1000 + ""
        soup = BeautifulSoup(large, "html.parser")

        # Should handle large documents without crashing
        result = extract_article_content(soup)
        _assert_none_or_mapping(result)