| """Tests for readability extraction fallback behavior.""" | |
| import subprocess | |
| import pytest | |
| from src.utils.readability import ReadabilityExtractor | |
| def test_extract_article_falls_back_when_readability_js_fails(monkeypatch): | |
| """When Node-based readability fails, extraction should fall back to Python mode.""" | |
| calls: list[bool] = [] | |
| monkeypatch.setattr("src.utils.readability._is_readability_js_available", lambda: True) | |
| def _fake_simple_json_from_html_string(html: str, use_readability: bool = False): | |
| calls.append(use_readability) | |
| if use_readability: | |
| raise subprocess.CalledProcessError( | |
| returncode=1, | |
| cmd=["node", "ExtractArticle.js"], | |
| stderr="boom", | |
| ) | |
| return {"title": "Fallback Title", "content": "<p>Fallback Content</p>"} | |
| monkeypatch.setattr( | |
| "src.utils.readability.simple_json_from_html_string", | |
| _fake_simple_json_from_html_string, | |
| ) | |
| article = ReadabilityExtractor().extract_article("<html><body>test</body></html>") | |
| assert calls == [True, False] | |
| assert article.title == "Fallback Title" | |
| assert article.html_content == "<p>Fallback Content</p>" | |
| def test_extract_article_re_raises_unexpected_exception(monkeypatch): | |
| """Unexpected errors should be surfaced instead of silently falling back.""" | |
| calls: list[bool] = [] | |
| monkeypatch.setattr("src.utils.readability._is_readability_js_available", lambda: True) | |
| def _fake_simple_json_from_html_string(html: str, use_readability: bool = False): | |
| calls.append(use_readability) | |
| if use_readability: | |
| raise RuntimeError("unexpected parser failure") | |
| return {"title": "Should Not Reach Fallback", "content": "<p>Fallback</p>"} | |
| monkeypatch.setattr( | |
| "src.utils.readability.simple_json_from_html_string", | |
| _fake_simple_json_from_html_string, | |
| ) | |
| with pytest.raises(RuntimeError, match="unexpected parser failure"): | |
| ReadabilityExtractor().extract_article("<html><body>test</body></html>") | |
| assert calls == [True] | |
| def test_extract_article_uses_python_path_when_js_unavailable(monkeypatch): | |
| """If JS readability is unavailable, extractor should directly use Python mode.""" | |
| calls: list[bool] = [] | |
| monkeypatch.setattr("src.utils.readability._is_readability_js_available", lambda: False) | |
| def _fake_simple_json_from_html_string(html: str, use_readability: bool = False): | |
| calls.append(use_readability) | |
| return {"title": "Python Mode", "content": "<p>OK</p>"} | |
| monkeypatch.setattr( | |
| "src.utils.readability.simple_json_from_html_string", | |
| _fake_simple_json_from_html_string, | |
| ) | |
| article = ReadabilityExtractor().extract_article("<html><body>test</body></html>") | |
| assert calls == [False] | |
| assert article.title == "Python Mode" | |