# Medium-MCP — tests/unit/test_parser.py
# Author: Nikhil Pravin Pise
# Commit: feat: implement comprehensive improvement plan (Phases 1-5) — e98cc10
"""
Unit Tests for Parser Module
Tests for HTML parsing and article content extraction.
"""
from __future__ import annotations
import pytest
from bs4 import BeautifulSoup
from src.parser import extract_article_content, extract_search_results
# =============================================================================
# FIXTURES
# =============================================================================
@pytest.fixture
def sample_article_html() -> str:
    """Return a representative, well-formed Medium article page as HTML.

    Contains a title, subtitle, author, body paragraphs, a code block,
    and clap/response counters so extraction tests have every field.
    """
    page = """
<!DOCTYPE html>
<html>
<head><title>Test Article</title></head>
<body>
<article>
<h1 data-testid="storyTitle">Understanding Python Async</h1>
<div class="pw-subtitle-paragraph">A comprehensive guide to asyncio</div>
<div class="pw-author-name">Test Author</div>
<section>
<p>This is the first paragraph of the article.</p>
<p>Here is another paragraph with important content.</p>
<pre><code>import asyncio</code></pre>
<h2>Section Title</h2>
<p>More content here.</p>
</section>
</article>
<button data-testid="clapButton">1.5K</button>
<button data-testid="responseCount">42</button>
</body>
</html>
"""
    return page
@pytest.fixture
def sample_search_html() -> str:
    """Return a Medium search-results page with two stream items.

    Each item carries an article link, a title, an author link, and a
    reading-time badge, mirroring Medium's search markup.
    """
    page = """
<html>
<body>
<div class="streamItem">
<a href="/@user/article-abc123">
<h3>Article Title 1</h3>
</a>
<a class="ds-link" href="/@testuser">Test Author</a>
<span class="readingTime">5 min read</span>
</div>
<div class="streamItem">
<a href="/@user2/article2-def456">
<h3>Article Title 2</h3>
</a>
<a class="ds-link" href="/@author2">Second Author</a>
<span class="readingTime">10 min read</span>
</div>
</body>
</html>
"""
    return page
@pytest.fixture
def cloudflare_blocked_html() -> str:
    """Return the interstitial page Cloudflare serves while challenging a client.

    Used to exercise the parser's fallback path when no article content
    is present in the response body.
    """
    page = """
<!DOCTYPE html>
<html>
<head><title>Just a moment...</title></head>
<body>
<div id="cf-wrapper">
<div id="challenge-running">
<div class="cf-error-title">
Checking your browser before accessing medium.com
</div>
</div>
</div>
</body>
</html>
"""
    return page
@pytest.fixture
def paywall_html() -> str:
    """Return an article page truncated by Medium's metered paywall.

    Includes the `meteredContent` container and the membership modal so
    paywall-detection logic can be tested.
    """
    page = """
<!DOCTYPE html>
<html>
<body>
<article>
<h1 data-testid="storyTitle">Premium Article</h1>
<div class="meteredContent">
<p>You've reached your limit...</p>
</div>
<div class="pw-paywall-modal">
<p>Become a member to read this story.</p>
</div>
</article>
</body>
</html>
"""
    return page
# =============================================================================
# ARTICLE CONTENT EXTRACTION TESTS
# =============================================================================
class TestExtractArticleContent:
    """Tests for article content extraction.

    NOTE(review): other tests in this class treat the result as a dict
    (`"title" in result`, `result.get(...)`), so a non-None result is
    asserted to be a dict below — confirm against src.parser if the
    return type is a custom mapping.
    """

    def test_extract_basic_article(self, sample_article_html: str) -> None:
        """A well-formed article yields a mapping containing its title."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)
        assert result is not None
        assert "title" in result
        assert "Understanding Python Async" in result.get("title", "")

    def test_extract_with_url_fallback(self, cloudflare_blocked_html: str) -> None:
        """When the page is a Cloudflare challenge, URL-based extraction applies."""
        soup = BeautifulSoup(cloudflare_blocked_html, "html.parser")
        url = "https://medium.com/@user/article-title-abc123def4"
        result = extract_article_content(soup, url)
        # Should still return something (may use URL-based extraction).
        assert result is not None

    def test_extract_claps_count(self, sample_article_html: str) -> None:
        """Claps count, when extracted, is a non-negative value."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)
        # BUG FIX: guard against None before membership test — the old code
        # would raise TypeError (not a clean failure) if the parser returned
        # None for this fixture.
        assert result is not None
        if "claps" in result:
            assert result["claps"] >= 0

    def test_empty_html(self) -> None:
        """Empty input must not raise; a non-None result is a mapping."""
        soup = BeautifulSoup("", "html.parser")
        result = extract_article_content(soup)
        # BUG FIX: the original `assert result is not None or result is None`
        # was a tautology (always true) and verified nothing. Reaching this
        # line already proves the call did not raise; additionally pin the
        # result's type when one is returned.
        if result is not None:
            assert isinstance(result, dict)

    def test_minimal_html(self) -> None:
        """Minimal but valid HTML must not raise."""
        soup = BeautifulSoup("<html><body></body></html>", "html.parser")
        result = extract_article_content(soup)
        # BUG FIX: replaced tautological assertion (see test_empty_html).
        if result is not None:
            assert isinstance(result, dict)
class TestExtractSearchResults:
    """Tests for search results extraction."""

    def test_extract_search_results_basic(self, sample_search_html: str) -> None:
        """A populated search page parses into a list."""
        base_url = "https://medium.com"
        page = BeautifulSoup(sample_search_html, "html.parser")
        extracted = extract_search_results(page, base_url)
        assert isinstance(extracted, list)

    def test_empty_search_results(self) -> None:
        """A page with no stream items yields an empty list."""
        empty_page = BeautifulSoup("<html><body></body></html>", "html.parser")
        extracted = extract_search_results(empty_page, "https://medium.com")
        assert isinstance(extracted, list)
        assert len(extracted) == 0

    def test_search_results_structure(self, sample_search_html: str) -> None:
        """Every extracted entry is a dict."""
        page = BeautifulSoup(sample_search_html, "html.parser")
        extracted = extract_search_results(page, "https://medium.com")
        assert all(isinstance(entry, dict) for entry in extracted)
class TestRobustness:
    """Tests for parser robustness against pathological input.

    Each test primarily verifies the parser does not raise; reaching the
    assertion already proves that. The result-type check additionally pins
    the contract observed elsewhere in this file (dict-like results).
    """

    def test_malformed_html(self) -> None:
        """Malformed / improperly nested tags must not crash the parser."""
        malformed = "<html><body><p>Unclosed paragraph<div>Mixed tags</p></html>"
        soup = BeautifulSoup(malformed, "html.parser")
        result = extract_article_content(soup)
        # BUG FIX: the original `assert result is not None or result is None`
        # was a tautology and verified nothing; assert a falsifiable property.
        if result is not None:
            assert isinstance(result, dict)

    def test_deeply_nested_html(self) -> None:
        """Deeply nested structures must not trigger recursion failures."""
        nested = "<div>" * 50 + "<p>Content</p>" + "</div>" * 50
        soup = BeautifulSoup(nested, "html.parser")
        result = extract_article_content(soup)
        # BUG FIX: replaced tautological assertion (see test_malformed_html).
        if result is not None:
            assert isinstance(result, dict)

    def test_large_html_document(self) -> None:
        """Large documents (1000 paragraphs) must parse without crashing."""
        large = "<html><body>" + "<p>Paragraph content.</p>" * 1000 + "</body></html>"
        soup = BeautifulSoup(large, "html.parser")
        result = extract_article_content(soup)
        # BUG FIX: replaced tautological assertion (see test_malformed_html).
        if result is not None:
            assert isinstance(result, dict)