Spaces:
Sleeping
Sleeping
| """ | |
| Unit Tests for Parser Module | |
| Tests for HTML parsing and article content extraction. | |
| """ | |
| from __future__ import annotations | |
| import pytest | |
| from bs4 import BeautifulSoup | |
| from src.parser import extract_article_content, extract_search_results | |
| # ============================================================================= | |
| # FIXTURES | |
| # ============================================================================= | |
@pytest.fixture
def sample_article_html() -> str:
    """Sample Medium article HTML for testing.

    Registered as a pytest fixture so tests can request it by parameter
    name; without the decorator pytest raises "fixture not found".
    """
    return """
    <!DOCTYPE html>
    <html>
    <head><title>Test Article</title></head>
    <body>
        <article>
            <h1 data-testid="storyTitle">Understanding Python Async</h1>
            <div class="pw-subtitle-paragraph">A comprehensive guide to asyncio</div>
            <div class="pw-author-name">Test Author</div>
            <section>
                <p>This is the first paragraph of the article.</p>
                <p>Here is another paragraph with important content.</p>
                <pre><code>import asyncio</code></pre>
                <h2>Section Title</h2>
                <p>More content here.</p>
            </section>
        </article>
        <button data-testid="clapButton">1.5K</button>
        <button data-testid="responseCount">42</button>
    </body>
    </html>
    """
@pytest.fixture
def sample_search_html() -> str:
    """Sample Medium search results HTML.

    Registered as a pytest fixture so tests can request it by parameter
    name; without the decorator pytest raises "fixture not found".
    """
    return """
    <html>
    <body>
        <div class="streamItem">
            <a href="/@user/article-abc123">
                <h3>Article Title 1</h3>
            </a>
            <a class="ds-link" href="/@testuser">Test Author</a>
            <span class="readingTime">5 min read</span>
        </div>
        <div class="streamItem">
            <a href="/@user2/article2-def456">
                <h3>Article Title 2</h3>
            </a>
            <a class="ds-link" href="/@author2">Second Author</a>
            <span class="readingTime">10 min read</span>
        </div>
    </body>
    </html>
    """
@pytest.fixture
def cloudflare_blocked_html() -> str:
    """HTML returned when Cloudflare blocks the request.

    Registered as a pytest fixture so tests can request it by parameter
    name; without the decorator pytest raises "fixture not found".
    """
    return """
    <!DOCTYPE html>
    <html>
    <head><title>Just a moment...</title></head>
    <body>
        <div id="cf-wrapper">
            <div id="challenge-running">
                <div class="cf-error-title">
                    Checking your browser before accessing medium.com
                </div>
            </div>
        </div>
    </body>
    </html>
    """
@pytest.fixture
def paywall_html() -> str:
    """HTML for a paywalled (metered) Medium article.

    Registered as a pytest fixture for consistency with the other HTML
    fixtures in this module; not yet consumed by a visible test.
    """
    return """
    <!DOCTYPE html>
    <html>
    <body>
        <article>
            <h1 data-testid="storyTitle">Premium Article</h1>
            <div class="meteredContent">
                <p>You've reached your limit...</p>
            </div>
            <div class="pw-paywall-modal">
                <p>Become a member to read this story.</p>
            </div>
        </article>
    </body>
    </html>
    """
| # ============================================================================= | |
| # ARTICLE CONTENT EXTRACTION TESTS | |
| # ============================================================================= | |
class TestExtractArticleContent:
    """Tests for article content extraction."""

    def test_extract_basic_article(self, sample_article_html: str) -> None:
        """Title must be extracted from a well-formed article."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)
        assert result is not None
        assert "title" in result
        assert "Understanding Python Async" in result.get("title", "")

    def test_extract_with_url_fallback(self, cloudflare_blocked_html: str) -> None:
        """Fallback to URL parsing when the page content is blocked."""
        soup = BeautifulSoup(cloudflare_blocked_html, "html.parser")
        url = "https://medium.com/@user/article-title-abc123def4"
        result = extract_article_content(soup, url)
        # Should still return something (may use URL-based extraction).
        assert result is not None

    def test_extract_claps_count(self, sample_article_html: str) -> None:
        """Claps count, when extracted, must be non-negative."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)
        # Guard against a None result before the membership test:
        # `"claps" in None` would raise TypeError rather than skip.
        if result and "claps" in result:
            assert result["claps"] >= 0

    def test_empty_html(self) -> None:
        """Empty HTML must be handled gracefully (no exception)."""
        soup = BeautifulSoup("", "html.parser")
        result = extract_article_content(soup)
        # A dict or None are both acceptable; anything else (or a raise)
        # is a failure. The previous `x is not None or x is None` check
        # was a tautology that could never fail.
        assert result is None or isinstance(result, dict)

    def test_minimal_html(self) -> None:
        """Minimal HTML with an empty body must be handled gracefully."""
        soup = BeautifulSoup("<html><body></body></html>", "html.parser")
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)
class TestExtractSearchResults:
    """Tests for search results extraction."""

    def test_extract_search_results_basic(self, sample_search_html: str) -> None:
        """Extraction from a populated results page yields a list."""
        parsed = BeautifulSoup(sample_search_html, "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert isinstance(extracted, list)

    def test_empty_search_results(self) -> None:
        """An empty results page yields an empty list."""
        parsed = BeautifulSoup("<html><body></body></html>", "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert isinstance(extracted, list)
        assert not extracted

    def test_search_results_structure(self, sample_search_html: str) -> None:
        """Every extracted search result is a dict."""
        parsed = BeautifulSoup(sample_search_html, "html.parser")
        extracted = extract_search_results(parsed, "https://medium.com")
        assert all(isinstance(item, dict) for item in extracted)
class TestRobustness:
    """Tests for parser robustness against hostile or unusual input."""

    def test_malformed_html(self) -> None:
        """Malformed HTML must not crash the parser."""
        malformed = "<html><body><p>Unclosed paragraph<div>Mixed tags</p></html>"
        soup = BeautifulSoup(malformed, "html.parser")
        result = extract_article_content(soup)
        # A dict or None are both acceptable; anything else (or a raise)
        # is a failure. The previous `x is not None or x is None` check
        # was a tautology that could never fail.
        assert result is None or isinstance(result, dict)

    def test_deeply_nested_html(self) -> None:
        """Deeply nested structures must not crash the parser."""
        nested = "<div>" * 50 + "<p>Content</p>" + "</div>" * 50
        soup = BeautifulSoup(nested, "html.parser")
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)

    def test_large_html_document(self) -> None:
        """Large documents must be handled without crashing."""
        large = "<html><body>" + "<p>Paragraph content.</p>" * 1000 + "</body></html>"
        soup = BeautifulSoup(large, "html.parser")
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)