# Medium-MCP/tests/unit/test_extractor.py
# Nikhil Pravin Pise
# feat: implement comprehensive improvement plan (Phases 1-5)
# e98cc10
"""
Unit Tests for Extractor Module
Tests for Apollo state, JSON-LD, and GraphQL response extraction.
"""
from __future__ import annotations
import json
import pytest
from src.extractor import (
extract_from_apollo_state,
extract_from_graphql_response,
extract_from_json_ld,
)
# =============================================================================
# FIXTURES
# =============================================================================
@pytest.fixture
def apollo_state_html() -> str:
    """Return an HTML page whose inline script assigns ``window.__APOLLO_STATE__``.

    The embedded state contains one post (``Post:abc123``) referenced from
    ``ROOT_QUERY`` and one user (``User:user123``) referenced as the post's
    creator.
    """
    author_entity = {
        "id": "user123",
        "name": "Test Author",
        "username": "testauthor",
        "bio": "A test author bio",
    }
    body_paragraphs = [
        {"type": "H3", "text": "Section Title"},
        {"type": "P", "text": "This is paragraph content."},
    ]
    post_entity = {
        "id": "abc123",
        "title": "Test Article Title",
        "content": {"bodyModel": {"paragraphs": body_paragraphs}},
        "creator": {"__ref": "User:user123"},
        "tags": [{"name": "python"}, {"name": "testing"}],
        "clapCount": 1500,
    }
    # Insertion order is kept stable because it determines the serialized JSON.
    cache = {
        "ROOT_QUERY": {'post(id:"abc123")': {"__ref": "Post:abc123"}},
        "Post:abc123": post_entity,
        "User:user123": author_entity,
    }
    serialized = json.dumps(cache)
    return f"""
<!DOCTYPE html>
<html>
<body>
<script>window.__APOLLO_STATE__ = {serialized}</script>
</body>
</html>
"""
@pytest.fixture
def json_ld_html() -> str:
    """Return an HTML page carrying one ``application/ld+json`` Article script."""
    author_node = {
        "@type": "Person",
        "name": "JSON-LD Author",
        "url": "https://medium.com/@jsonldauthor",
    }
    publisher_node = {"@type": "Organization", "name": "Medium"}
    # Insertion order is kept stable because it determines the serialized JSON.
    article_node = {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "JSON-LD Test Article",
        "description": "A test article with JSON-LD data",
        "author": author_node,
        "datePublished": "2024-01-15T10:00:00Z",
        "publisher": publisher_node,
    }
    serialized = json.dumps(article_node)
    return f"""
<!DOCTYPE html>
<html>
<head>
<script type="application/ld+json">{serialized}</script>
</head>
<body></body>
</html>
"""
@pytest.fixture
def graphql_response() -> dict:
    """Return a GraphQL-style payload with a single post under ``data.post``."""
    creator = {
        "id": "creator123",
        "name": "GraphQL Author",
        "username": "graphqlauthor",
    }
    paragraphs = [{"type": "P", "text": "GraphQL content paragraph."}]
    post = {
        "id": "graphql123",
        "title": "GraphQL Test Article",
        "content": {"bodyModel": {"paragraphs": paragraphs}},
        "creator": creator,
        "tags": [{"name": tag} for tag in ("graphql", "api")],
        "clapCount": 500,
        "readingTime": 5,
    }
    return {"data": {"post": post}}
# =============================================================================
# APOLLO STATE TESTS
# =============================================================================
class TestExtractFromApolloState:
    """Checks for ``extract_from_apollo_state`` on valid and broken pages."""

    def test_extract_basic_apollo_state(self, apollo_state_html: str) -> None:
        """A well-formed Apollo state yields the post title."""
        extracted = extract_from_apollo_state(apollo_state_html)

        assert extracted is not None
        assert "title" in extracted
        assert extracted["title"] == "Test Article Title"

    def test_extract_author_from_apollo(self, apollo_state_html: str) -> None:
        """When an author is reported, it is a string or carries a name."""
        extracted = extract_from_apollo_state(apollo_state_html)
        assert extracted is not None

        # The author key is optional; only validate it when it is present.
        if "author" not in extracted:
            return
        author = extracted["author"]
        assert "name" in author or isinstance(author, str)

    def test_apollo_state_missing(self) -> None:
        """A page without any Apollo state yields ``None``."""
        page = "<html><body>No Apollo state here</body></html>"
        assert extract_from_apollo_state(page) is None

    def test_apollo_state_invalid_json(self) -> None:
        """An Apollo state assignment holding malformed JSON yields ``None``."""
        page = "<script>window.__APOLLO_STATE__ = {invalid json}</script>"
        assert extract_from_apollo_state(page) is None

    def test_apollo_state_empty_object(self) -> None:
        """An Apollo state that is an empty object yields ``None``."""
        page = "<script>window.__APOLLO_STATE__ = {}</script>"
        assert extract_from_apollo_state(page) is None
# =============================================================================
# JSON-LD TESTS
# =============================================================================
class TestExtractFromJsonLD:
    """Tests for JSON-LD extraction via ``extract_from_json_ld``."""

    def test_extract_basic_json_ld(self, json_ld_html: str) -> None:
        """An Article JSON-LD script yields a result exposing a title/headline."""
        result = extract_from_json_ld(json_ld_html)
        assert result is not None
        assert "title" in result or "headline" in result

    def test_extract_author_from_json_ld(self, json_ld_html: str) -> None:
        """When an author is reported, it must not be ``None``."""
        result = extract_from_json_ld(json_ld_html)
        assert result is not None
        if "author" in result:
            # Author can be string or dict
            assert result["author"] is not None

    def test_json_ld_missing(self) -> None:
        """A page without a JSON-LD script yields ``None``."""
        html = "<html><body>No JSON-LD here</body></html>"
        result = extract_from_json_ld(html)
        assert result is None

    def test_json_ld_invalid_json(self) -> None:
        """A JSON-LD script holding malformed JSON yields ``None``."""
        html = '<script type="application/ld+json">{invalid}</script>'
        result = extract_from_json_ld(html)
        assert result is None

    def test_json_ld_wrong_type(self) -> None:
        """A non-Article JSON-LD node is handled without raising."""
        json_ld = {
            "@context": "https://schema.org",
            "@type": "Organization",  # Not Article
            "name": "Test Org",
        }
        html = f'<script type="application/ld+json">{json.dumps(json_ld)}</script>'
        result = extract_from_json_ld(html)
        # Either outcome is accepted, but the previous always-true assertion
        # could never fail; at least require the None-or-dict contract that
        # the other tests in this class rely on when indexing the result.
        assert result is None or isinstance(result, dict)
# =============================================================================
# GRAPHQL RESPONSE TESTS
# =============================================================================
class TestExtractFromGraphQLResponse:
    """Checks for ``extract_from_graphql_response`` on full and degenerate payloads."""

    def test_extract_basic_graphql(self, graphql_response: dict) -> None:
        """A complete payload yields the post title."""
        extracted = extract_from_graphql_response(graphql_response)

        assert extracted is not None
        assert "title" in extracted
        assert extracted["title"] == "GraphQL Test Article"

    def test_extract_author_from_graphql(self, graphql_response: dict) -> None:
        """When an author is reported, it must not be ``None``."""
        extracted = extract_from_graphql_response(graphql_response)
        assert extracted is not None

        # The author key is optional; only validate it when it is present.
        if "author" not in extracted:
            return
        assert extracted["author"] is not None

    def test_extract_tags_from_graphql(self, graphql_response: dict) -> None:
        """When tags are reported, they come back as a list."""
        extracted = extract_from_graphql_response(graphql_response)
        assert extracted is not None

        # The tags key is optional; only validate it when it is present.
        if "tags" not in extracted:
            return
        assert isinstance(extracted["tags"], list)

    def test_graphql_empty_response(self) -> None:
        """An empty payload yields ``None``."""
        assert extract_from_graphql_response({}) is None

    def test_graphql_missing_data(self) -> None:
        """A payload carrying only ``errors`` yields ``None``."""
        payload = {"errors": ["Some error"]}
        assert extract_from_graphql_response(payload) is None

    def test_graphql_missing_post(self) -> None:
        """A payload whose ``data`` has no post yields ``None``."""
        payload = {"data": {}}
        assert extract_from_graphql_response(payload) is None

    def test_graphql_none_input(self) -> None:
        """``None`` input either yields ``None`` or raises a type-related error."""
        try:
            outcome = extract_from_graphql_response(None)  # type: ignore
        except (TypeError, AttributeError):
            # Rejecting None with one of these errors is an accepted outcome.
            return
        assert outcome is None
# =============================================================================
# EDGE CASES
# =============================================================================
class TestExtractorEdgeCases:
    """Edge-case tests for ``extract_from_apollo_state``.

    The exact result for these inputs is not pinned down here, so each test
    accepts either ``None`` or a dict. Unlike an always-true assertion, that
    check still fails if the extractor returns some other type, and any
    exception raised by the extractor fails the test as before.
    """

    def test_unicode_content_apollo(self) -> None:
        """Non-ASCII titles are handled in both escaped and raw form."""
        title = "日本語タイトル"
        apollo_data = {"Post:123": {"title": title, "id": "123"}}
        # json.dumps escapes non-ASCII characters by default, which would leave
        # the page pure ASCII; also embed the raw characters so real Unicode
        # input is exercised.
        for ensure_ascii in (True, False):
            payload = json.dumps(apollo_data, ensure_ascii=ensure_ascii)
            html = f"<script>window.__APOLLO_STATE__ = {payload}</script>"
            result = extract_from_apollo_state(html)
            assert result is None or isinstance(result, dict)
            # If a title is extracted, it must round-trip unchanged.
            if result is not None and "title" in result:
                assert result["title"] == title

    def test_very_large_apollo_state(self) -> None:
        """A state holding many post entities is processed without raising."""
        large_data = {
            f"Post:{i}": {"title": f"Post {i}", "id": str(i)} for i in range(100)
        }
        html = f"<script>window.__APOLLO_STATE__ = {json.dumps(large_data)}</script>"
        # Should not crash or timeout
        result = extract_from_apollo_state(html)
        assert result is None or isinstance(result, dict)

    def test_special_characters_in_content(self) -> None:
        """A title containing markup, including ``</script>``, does not raise."""
        apollo_data = {
            "Post:123": {
                "title": "Title with <script>alert('xss')</script>",
                "id": "123",
            }
        }
        html = f"<script>window.__APOLLO_STATE__ = {json.dumps(apollo_data)}</script>"
        # The embedded closing tag may cut the JSON short depending on how the
        # extractor locates the script body, so both outcomes are accepted.
        result = extract_from_apollo_state(html)
        assert result is None or isinstance(result, dict)