# Spaces: T0X1N / Medium-MCP (Running)
# File size: 7,452 Bytes
# e98cc10

"""
Unit Tests for Parser Module

Tests for HTML parsing and article content extraction.
"""

from __future__ import annotations

import pytest
from bs4 import BeautifulSoup

from src.parser import extract_article_content, extract_search_results


# =============================================================================
# FIXTURES
# =============================================================================


@pytest.fixture
def sample_article_html() -> str:
    """Provide an article page as an HTML string.

    The markup holds a story title, subtitle and author name, a body
    section (paragraphs, a code block and a sub-heading) and, outside the
    article element, clap and response-count buttons.
    """
    article_markup = """
    <!DOCTYPE html>
    <html>
    <head><title>Test Article</title></head>
    <body>
        <article>
            <h1 data-testid="storyTitle">Understanding Python Async</h1>
            <div class="pw-subtitle-paragraph">A comprehensive guide to asyncio</div>
            <div class="pw-author-name">Test Author</div>
            <section>
                <p>This is the first paragraph of the article.</p>
                <p>Here is another paragraph with important content.</p>
                <pre><code>import asyncio</code></pre>
                <h2>Section Title</h2>
                <p>More content here.</p>
            </section>
        </article>
        <button data-testid="clapButton">1.5K</button>
        <button data-testid="responseCount">42</button>
    </body>
    </html>
    """
    return article_markup


@pytest.fixture
def sample_search_html() -> str:
    """Provide a search results page as an HTML string.

    The page contains two ``streamItem`` entries, each with an article
    link wrapping a heading, an author link and a reading-time label.
    """
    search_markup = """
    <html>
    <body>
        <div class="streamItem">
            <a href="/@user/article-abc123">
                <h3>Article Title 1</h3>
            </a>
            <a class="ds-link" href="/@testuser">Test Author</a>
            <span class="readingTime">5 min read</span>
        </div>
        <div class="streamItem">
            <a href="/@user2/article2-def456">
                <h3>Article Title 2</h3>
            </a>
            <a class="ds-link" href="/@author2">Second Author</a>
            <span class="readingTime">10 min read</span>
        </div>
    </body>
    </html>
    """
    return search_markup


@pytest.fixture
def cloudflare_blocked_html() -> str:
    """Provide a Cloudflare "Just a moment..." challenge page as HTML.

    The page carries only the browser-check wrapper; it has no article
    markup at all.
    """
    challenge_markup = """
    <!DOCTYPE html>
    <html>
    <head><title>Just a moment...</title></head>
    <body>
        <div id="cf-wrapper">
            <div id="challenge-running">
                <div class="cf-error-title">
                    Checking your browser before accessing medium.com
                </div>
            </div>
        </div>
    </body>
    </html>
    """
    return challenge_markup


@pytest.fixture
def paywall_html() -> str:
    """Provide a paywalled article page as an HTML string.

    The article has a story title, a metered-content notice and a paywall
    modal, but no regular body paragraphs.
    """
    paywall_markup = """
    <!DOCTYPE html>
    <html>
    <body>
        <article>
            <h1 data-testid="storyTitle">Premium Article</h1>
            <div class="meteredContent">
                <p>You've reached your limit...</p>
            </div>
            <div class="pw-paywall-modal">
                <p>Become a member to read this story.</p>
            </div>
        </article>
    </body>
    </html>
    """
    return paywall_markup


# =============================================================================
# ARTICLE CONTENT EXTRACTION TESTS
# =============================================================================


class TestExtractArticleContent:
    """Tests for article content extraction.

    Every test parses an HTML string with ``html.parser`` and hands the
    resulting soup to ``extract_article_content``.
    """

    def test_extract_basic_article(self, sample_article_html: str) -> None:
        """Test that the title is extracted from a basic article."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)

        assert result is not None
        assert "title" in result
        assert "Understanding Python Async" in result["title"]

    def test_extract_with_url_fallback(self, cloudflare_blocked_html: str) -> None:
        """Test that a blocked page plus a URL still yields a result."""
        soup = BeautifulSoup(cloudflare_blocked_html, "html.parser")
        url = "https://medium.com/@user/article-title-abc123def4"
        result = extract_article_content(soup, url)

        # Should still return something (may use URL-based extraction)
        assert result is not None

    def test_extract_claps_count(self, sample_article_html: str) -> None:
        """Test that an extracted claps count, if present, is non-negative."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)

        # Fail with a clear assertion instead of a TypeError from
        # `"claps" in None` if extraction returns nothing.
        assert result is not None

        # The "claps" key is optional (format varies), so only check it
        # when the parser chose to emit it.
        if "claps" in result:
            assert result["claps"] >= 0

    def test_empty_html(self) -> None:
        """Test that empty HTML yields either no result or a dict."""
        soup = BeautifulSoup("", "html.parser")
        result = extract_article_content(soup)

        # Either outcome is acceptable, but nothing else is.
        # NOTE(review): dict is inferred from how the other tests use the
        # result (`in` / indexing) - confirm against src.parser.
        assert result is None or isinstance(result, dict)

    def test_minimal_html(self) -> None:
        """Test that an empty document body yields either no result or a dict."""
        soup = BeautifulSoup("<html><body></body></html>", "html.parser")
        result = extract_article_content(soup)

        # NOTE(review): dict is inferred from sibling tests - confirm
        # against src.parser.
        assert result is None or isinstance(result, dict)


class TestExtractSearchResults:
    """Checks on what ``extract_search_results`` returns for search pages."""

    def test_extract_search_results_basic(self, sample_search_html: str) -> None:
        """A populated search page produces a list."""
        parsed = BeautifulSoup(sample_search_html, "html.parser")
        found = extract_search_results(parsed, "https://medium.com")

        assert isinstance(found, list)

    def test_empty_search_results(self) -> None:
        """A page with an empty body produces an empty list."""
        blank_page = "<html><body></body></html>"
        parsed = BeautifulSoup(blank_page, "html.parser")
        found = extract_search_results(parsed, "https://medium.com")

        assert isinstance(found, list)
        assert not found

    def test_search_results_structure(self, sample_search_html: str) -> None:
        """Every extracted search result is a dict."""
        parsed = BeautifulSoup(sample_search_html, "html.parser")
        found = extract_search_results(parsed, "https://medium.com")

        # Passes trivially when nothing was extracted, as the loop form would.
        assert all(isinstance(entry, dict) for entry in found)


class TestRobustness:
    """Tests for parser robustness.

    Each test feeds unusual input to ``extract_article_content`` and
    requires that the call returns without raising and that the return
    value is either ``None`` or a dict.

    NOTE(review): the dict expectation is inferred from how the other
    tests in this file use the result - confirm against src.parser.
    """

    def test_malformed_html(self) -> None:
        """Test handling of malformed HTML."""
        malformed = "<html><body><p>Unclosed paragraph<div>Mixed tags</p></html>"
        soup = BeautifulSoup(malformed, "html.parser")

        # Should not crash
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)

    def test_deeply_nested_html(self) -> None:
        """Test handling of deeply nested structures."""
        nested = "<div>" * 50 + "<p>Content</p>" + "</div>" * 50
        soup = BeautifulSoup(nested, "html.parser")

        # Should handle deep nesting
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)

    def test_large_html_document(self) -> None:
        """Test handling of large HTML documents."""
        large = "<html><body>" + "<p>Paragraph content.</p>" * 1000 + "</body></html>"
        soup = BeautifulSoup(large, "html.parser")

        # Should handle large documents without crashing
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)