# File size: 7,452 Bytes
# e98cc10
"""
Unit Tests for Parser Module

Tests for HTML parsing and article content extraction.
"""

from __future__ import annotations

import pytest
from bs4 import BeautifulSoup

from src.parser import extract_article_content, extract_search_results


# =============================================================================
# FIXTURES
# =============================================================================


@pytest.fixture
def sample_article_html() -> str:
    """Provide a Medium-style article page as an HTML string.

    The markup holds a ``storyTitle`` heading, a subtitle, an author name,
    a section with paragraphs, a code block and a sub-heading, plus clap
    and response-count buttons.
    """
    article_page = """
    <!DOCTYPE html>
    <html>
    <head><title>Test Article</title></head>
    <body>
        <article>
            <h1 data-testid="storyTitle">Understanding Python Async</h1>
            <div class="pw-subtitle-paragraph">A comprehensive guide to asyncio</div>
            <div class="pw-author-name">Test Author</div>
            <section>
                <p>This is the first paragraph of the article.</p>
                <p>Here is another paragraph with important content.</p>
                <pre><code>import asyncio</code></pre>
                <h2>Section Title</h2>
                <p>More content here.</p>
            </section>
        </article>
        <button data-testid="clapButton">1.5K</button>
        <button data-testid="responseCount">42</button>
    </body>
    </html>
    """
    return article_page


@pytest.fixture
def sample_search_html() -> str:
    """Provide a Medium-style search results page as an HTML string.

    The markup contains two ``streamItem`` entries, each with an article
    link and heading, an author link, and a reading-time span.
    """
    search_page = """
    <html>
    <body>
        <div class="streamItem">
            <a href="/@user/article-abc123">
                <h3>Article Title 1</h3>
            </a>
            <a class="ds-link" href="/@testuser">Test Author</a>
            <span class="readingTime">5 min read</span>
        </div>
        <div class="streamItem">
            <a href="/@user2/article2-def456">
                <h3>Article Title 2</h3>
            </a>
            <a class="ds-link" href="/@author2">Second Author</a>
            <span class="readingTime">10 min read</span>
        </div>
    </body>
    </html>
    """
    return search_page


@pytest.fixture
def cloudflare_blocked_html() -> str:
    """Provide the HTML of a Cloudflare challenge page.

    The page carries a "Just a moment..." title and a browser-check
    message instead of any article markup.
    """
    challenge_page = """
    <!DOCTYPE html>
    <html>
    <head><title>Just a moment...</title></head>
    <body>
        <div id="cf-wrapper">
            <div id="challenge-running">
                <div class="cf-error-title">
                    Checking your browser before accessing medium.com
                </div>
            </div>
        </div>
    </body>
    </html>
    """
    return challenge_page


@pytest.fixture
def paywall_html() -> str:
    """Provide the HTML of a paywalled article page.

    The article has a ``storyTitle`` heading followed by a metered-content
    notice and a membership prompt rather than body text.
    """
    paywalled_page = """
    <!DOCTYPE html>
    <html>
    <body>
        <article>
            <h1 data-testid="storyTitle">Premium Article</h1>
            <div class="meteredContent">
                <p>You've reached your limit...</p>
            </div>
            <div class="pw-paywall-modal">
                <p>Become a member to read this story.</p>
            </div>
        </article>
    </body>
    </html>
    """
    return paywalled_page


# =============================================================================
# ARTICLE CONTENT EXTRACTION TESTS
# =============================================================================


class TestExtractArticleContent:
    """Tests for article content extraction.

    Each test parses an HTML string with BeautifulSoup's ``html.parser`` and
    passes the resulting soup to ``extract_article_content``.
    """

    def test_extract_basic_article(self, sample_article_html: str) -> None:
        """The title of a well-formed article appears in the result."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)

        assert result is not None
        assert "title" in result
        assert "Understanding Python Async" in result.get("title", "")

    def test_extract_with_url_fallback(self, cloudflare_blocked_html: str) -> None:
        """A blocked page plus a URL still yields a non-None result."""
        soup = BeautifulSoup(cloudflare_blocked_html, "html.parser")
        url = "https://medium.com/@user/article-title-abc123def4"
        result = extract_article_content(soup, url)

        # Should still return something (may use URL-based extraction)
        assert result is not None

    def test_extract_claps_count(self, sample_article_html: str) -> None:
        """When a claps value is extracted, it is non-negative."""
        soup = BeautifulSoup(sample_article_html, "html.parser")
        result = extract_article_content(soup)

        # Fail with a clear assertion instead of a TypeError from
        # ``"claps" in None`` if extraction returns nothing.
        assert result is not None
        # Claps are optional in the result (format varies), so only check
        # the value when the key is present.
        if "claps" in result:
            assert result["claps"] >= 0

    def test_empty_html(self) -> None:
        """An empty document is handled without raising."""
        soup = BeautifulSoup("", "html.parser")
        result = extract_article_content(soup)

        # Either "nothing extracted" (None) or a result dict is acceptable;
        # the other tests in this class use a non-None result as a dict.
        assert result is None or isinstance(result, dict)

    def test_minimal_html(self) -> None:
        """A document with an empty body is handled without raising."""
        soup = BeautifulSoup("<html><body></body></html>", "html.parser")
        result = extract_article_content(soup)

        # Same contract as for empty input: None or a result dict.
        assert result is None or isinstance(result, dict)


class TestExtractSearchResults:
    """Checks on the values returned by ``extract_search_results``."""

    def test_extract_search_results_basic(self, sample_search_html: str) -> None:
        """A populated search page yields a list."""
        parsed_page = BeautifulSoup(sample_search_html, "html.parser")
        found = extract_search_results(parsed_page, "https://medium.com")

        assert isinstance(found, list)

    def test_empty_search_results(self) -> None:
        """A page without any result markup yields an empty list."""
        blank_page = BeautifulSoup("<html><body></body></html>", "html.parser")
        found = extract_search_results(blank_page, "https://medium.com")

        assert isinstance(found, list)
        assert not found

    def test_search_results_structure(self, sample_search_html: str) -> None:
        """Every extracted search result is a dict."""
        parsed_page = BeautifulSoup(sample_search_html, "html.parser")
        found = extract_search_results(parsed_page, "https://medium.com")

        assert all(isinstance(entry, dict) for entry in found)


class TestRobustness:
    """Tests for parser robustness.

    Each test feeds unusual input to ``extract_article_content`` and checks
    that the call completes and returns either None or a dict. The article
    extraction tests in this file use a non-None result as a dict.
    """

    def test_malformed_html(self) -> None:
        """Mismatched and unclosed tags are handled without raising."""
        malformed = "<html><body><p>Unclosed paragraph<div>Mixed tags</p></html>"
        soup = BeautifulSoup(malformed, "html.parser")

        # Should not crash
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)

    def test_deeply_nested_html(self) -> None:
        """Fifty levels of nested ``<div>`` are handled without raising."""
        nested = "<div>" * 50 + "<p>Content</p>" + "</div>" * 50
        soup = BeautifulSoup(nested, "html.parser")

        # Should handle deep nesting
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)

    def test_large_html_document(self) -> None:
        """A document with 1000 paragraphs is handled without raising."""
        large = "<html><body>" + "<p>Paragraph content.</p>" * 1000 + "</body></html>"
        soup = BeautifulSoup(large, "html.parser")

        # Should handle large documents without crashing
        result = extract_article_content(soup)
        assert result is None or isinstance(result, dict)