"""
Unit Tests for Parser Module

Tests for HTML parsing and article content extraction.
"""
from __future__ import annotations
import pytest
from bs4 import BeautifulSoup
from src.parser import extract_article_content, extract_search_results
# =============================================================================
# FIXTURES
# =============================================================================
@pytest.fixture
def sample_article_html() -> str:
    """Fixture: HTML mimicking a rendered Medium article page.

    Contains the title, subtitle, author, body paragraphs, a code block,
    and the clap/response buttons the parser is expected to pick up.
    """
    markup = """
<!DOCTYPE html>
<html>
<head><title>Test Article</title></head>
<body>
<article>
<h1 data-testid="storyTitle">Understanding Python Async</h1>
<div class="pw-subtitle-paragraph">A comprehensive guide to asyncio</div>
<div class="pw-author-name">Test Author</div>
<section>
<p>This is the first paragraph of the article.</p>
<p>Here is another paragraph with important content.</p>
<pre><code>import asyncio</code></pre>
<h2>Section Title</h2>
<p>More content here.</p>
</section>
</article>
<button data-testid="clapButton">1.5K</button>
<button data-testid="responseCount">42</button>
</body>
</html>
"""
    return markup
@pytest.fixture
def sample_search_html() -> str:
    """Fixture: HTML mimicking a Medium search-results page.

    Two `streamItem` entries, each with an article link, author link,
    and reading-time span.
    """
    markup = """
<html>
<body>
<div class="streamItem">
<a href="/@user/article-abc123">
<h3>Article Title 1</h3>
</a>
<a class="ds-link" href="/@testuser">Test Author</a>
<span class="readingTime">5 min read</span>
</div>
<div class="streamItem">
<a href="/@user2/article2-def456">
<h3>Article Title 2</h3>
</a>
<a class="ds-link" href="/@author2">Second Author</a>
<span class="readingTime">10 min read</span>
</div>
</body>
</html>
"""
    return markup
@pytest.fixture
def cloudflare_blocked_html() -> str:
    """Fixture: the interstitial HTML Cloudflare serves when it blocks a request."""
    markup = """
<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<div id="cf-wrapper">
<div id="challenge-running">
<div class="cf-error-title">
Checking your browser before accessing medium.com
</div>
</div>
</div>
</body>
</html>
"""
    return markup
@pytest.fixture
def paywall_html() -> str:
    """Fixture: HTML for a paywalled (metered) Medium article."""
    markup = """
<!DOCTYPE html>
<html>
<body>
<article>
<h1 data-testid="storyTitle">Premium Article</h1>
<div class="meteredContent">
<p>You've reached your limit...</p>
</div>
<div class="pw-paywall-modal">
<p>Become a member to read this story.</p>
</div>
</article>
</body>
</html>
"""
    return markup
# =============================================================================
# ARTICLE CONTENT EXTRACTION TESTS
# =============================================================================
class TestExtractArticleContent:
    """Tests for article content extraction."""

    def test_extract_basic_article(self, sample_article_html: str) -> None:
        """A well-formed article yields a result containing its title."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)
        assert result is not None
        assert "title" in result
        assert "Understanding Python Async" in result.get("title", "")

    def test_extract_with_url_fallback(self, cloudflare_blocked_html: str) -> None:
        """When content is blocked, extraction may fall back to URL parsing."""
        soup = BeautifulSoup(cloudflare_blocked_html, "html.parser")
        url = "https://medium.com/@user/article-title-abc123def4"
        result = extract_article_content(soup, url)
        # Should still return something (may use URL-based extraction).
        assert result is not None

    def test_extract_claps_count(self, sample_article_html: str) -> None:
        """If a claps count is extracted, it is non-negative."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)
        # BUGFIX: guard against a None result first — the original did
        # `"claps" in result`, which raises TypeError when result is None,
        # masking the real failure mode behind an unrelated error.
        if result is not None and "claps" in result:
            assert result["claps"] >= 0

    def test_empty_html(self) -> None:
        """Empty HTML must be handled gracefully (no exception raised)."""
        soup = BeautifulSoup("", "html.parser")
        # The original `assert result is not None or result is None` was a
        # tautology that asserted nothing. The real contract being tested is
        # "does not raise"; completing this call IS the success criterion.
        extract_article_content(soup)

    def test_minimal_html(self) -> None:
        """Minimal HTML must be handled gracefully (no exception raised)."""
        soup = BeautifulSoup("<html><body></body></html>", "html.parser")
        # Success criterion: no exception (see test_empty_html).
        extract_article_content(soup)
class TestExtractSearchResults:
    """Tests for search results extraction."""

    def test_extract_search_results_basic(self, sample_search_html: str) -> None:
        """A populated results page yields a list."""
        parsed = BeautifulSoup(sample_search_html, "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert isinstance(extracted, list)

    def test_empty_search_results(self) -> None:
        """A page with no result items yields an empty list."""
        parsed = BeautifulSoup("<html><body></body></html>", "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert isinstance(extracted, list)
        assert not extracted

    def test_search_results_structure(self, sample_search_html: str) -> None:
        """Every extracted search result is a dict."""
        parsed = BeautifulSoup(sample_search_html, "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert all(isinstance(entry, dict) for entry in extracted)
class TestRobustness:
    """Tests for parser robustness: hostile inputs must not crash the parser.

    Each test's success criterion is that extraction completes without
    raising. The original `assert result is not None or result is None`
    lines were tautologies (always True) and have been removed; they
    asserted nothing and obscured the tests' real intent.
    """

    def test_malformed_html(self) -> None:
        """Malformed/mismatched tags must not raise."""
        malformed = "<html><body><p>Unclosed paragraph<div>Mixed tags</p></html>"
        soup = BeautifulSoup(malformed, "html.parser")
        extract_article_content(soup)

    def test_deeply_nested_html(self) -> None:
        """Deep nesting (50 levels) must not raise (e.g. via recursion limits)."""
        nested = "<div>" * 50 + "<p>Content</p>" + "</div>" * 50
        soup = BeautifulSoup(nested, "html.parser")
        extract_article_content(soup)

    def test_large_html_document(self) -> None:
        """A large document (1000 paragraphs) must not raise."""
        large = "<html><body>" + "<p>Paragraph content.</p>" * 1000 + "</body></html>"
        soup = BeautifulSoup(large, "html.parser")
        extract_article_content(soup)