Spaces:

T0X1N
/

Medium-MCP

Sleeping

Medium-MCP / tests /unit /test_extractor.py

Nikhil Pravin Pise

feat: implement comprehensive improvement plan (Phases 1-5)

e98cc10 4 months ago

10.8 kB

	"""
	Unit Tests for Extractor Module

	Tests for Apollo state, JSON-LD, and GraphQL response extraction.
	"""

	from __future__ import annotations

	import json

	import pytest

	from src.extractor import (
	extract_from_apollo_state,
	extract_from_graphql_response,
	extract_from_json_ld,
	)


	# =============================================================================
	# FIXTURES
	# =============================================================================


	@pytest.fixture
	def apollo_state_html() -> str:
	    """Return an HTML page whose inline script assigns a sample store to ``window.__APOLLO_STATE__``.

	    The store holds one post (``Post:abc123``), the user it references as its
	    creator (``User:user123``), and a ``ROOT_QUERY`` entry pointing at the post.
	    """
	    body_paragraphs = [
	        {"type": kind, "text": text}
	        for kind, text in (
	            ("H3", "Section Title"),
	            ("P", "This is paragraph content."),
	        )
	    ]
	    post_entry = {
	        "id": "abc123",
	        "title": "Test Article Title",
	        "content": {"bodyModel": {"paragraphs": body_paragraphs}},
	        "creator": {"__ref": "User:user123"},
	        "tags": [{"name": tag} for tag in ("python", "testing")],
	        "clapCount": 1500,
	    }
	    user_entry = {
	        "id": "user123",
	        "name": "Test Author",
	        "username": "testauthor",
	        "bio": "A test author bio",
	    }
	    # Key order is kept stable so the serialized JSON is reproducible.
	    store = {
	        "ROOT_QUERY": {'post(id:"abc123")': {"__ref": "Post:abc123"}},
	        "Post:abc123": post_entry,
	        "User:user123": user_entry,
	    }
	    serialized = json.dumps(store)
	    return f"""
	<!DOCTYPE html>
	<html>
	<body>
	<script>window.__APOLLO_STATE__ = {serialized}</script>
	</body>
	</html>
	"""


	@pytest.fixture
	def json_ld_html() -> str:
	    """Return an HTML page whose ``<head>`` embeds a schema.org ``Article`` as JSON-LD."""
	    writer = {
	        "@type": "Person",
	        "name": "JSON-LD Author",
	        "url": "https://medium.com/@jsonldauthor",
	    }
	    publishing_org = {"@type": "Organization", "name": "Medium"}
	    # Key order is kept stable so the serialized JSON is reproducible.
	    article = {
	        "@context": "https://schema.org",
	        "@type": "Article",
	        "headline": "JSON-LD Test Article",
	        "description": "A test article with JSON-LD data",
	        "author": writer,
	        "datePublished": "2024-01-15T10:00:00Z",
	        "publisher": publishing_org,
	    }
	    serialized = json.dumps(article)
	    return f"""
	<!DOCTYPE html>
	<html>
	<head>
	<script type="application/ld+json">{serialized}</script>
	</head>
	<body></body>
	</html>
	"""


	@pytest.fixture
	def graphql_response() -> dict:
	    """Return a sample GraphQL payload shaped as ``{"data": {"post": {...}}}``."""
	    body = {
	        "bodyModel": {
	            "paragraphs": [{"type": "P", "text": "GraphQL content paragraph."}],
	        },
	    }
	    creator = {
	        "id": "creator123",
	        "name": "GraphQL Author",
	        "username": "graphqlauthor",
	    }
	    post = {
	        "id": "graphql123",
	        "title": "GraphQL Test Article",
	        "content": body,
	        "creator": creator,
	        "tags": [{"name": label} for label in ("graphql", "api")],
	        "clapCount": 500,
	        "readingTime": 5,
	    }
	    return {"data": {"post": post}}


	# =============================================================================
	# APOLLO STATE TESTS
	# =============================================================================


	class TestExtractFromApolloState:
	    """Exercise ``extract_from_apollo_state`` on well-formed and malformed pages."""

	    def test_extract_basic_apollo_state(self, apollo_state_html: str) -> None:
	        """The fixture page yields a result carrying the post title."""
	        extracted = extract_from_apollo_state(apollo_state_html)

	        assert extracted is not None
	        assert "title" in extracted
	        assert extracted["title"] == "Test Article Title"

	    def test_extract_author_from_apollo(self, apollo_state_html: str) -> None:
	        """When an author is reported, it is a string or a container with ``name``."""
	        extracted = extract_from_apollo_state(apollo_state_html)

	        assert extracted is not None
	        if "author" not in extracted:
	            # The author key is optional; nothing further to verify.
	            return
	        author = extracted["author"]
	        assert "name" in author or isinstance(author, str)

	    def test_apollo_state_missing(self) -> None:
	        """A page without any Apollo state script produces ``None``."""
	        page = "<html><body>No Apollo state here</body></html>"

	        assert extract_from_apollo_state(page) is None

	    def test_apollo_state_invalid_json(self) -> None:
	        """An Apollo state assignment that is not valid JSON produces ``None``."""
	        page = "<script>window.__APOLLO_STATE__ = {invalid json}</script>"

	        assert extract_from_apollo_state(page) is None

	    def test_apollo_state_empty_object(self) -> None:
	        """An empty ``{}`` Apollo state produces ``None``."""
	        page = "<script>window.__APOLLO_STATE__ = {}</script>"

	        assert extract_from_apollo_state(page) is None


	# =============================================================================
	# JSON-LD TESTS
	# =============================================================================


	class TestExtractFromJsonLD:
	    """Tests for JSON-LD extraction via ``extract_from_json_ld``."""

	    def test_extract_basic_json_ld(self, json_ld_html: str) -> None:
	        """The fixture page yields a result exposing ``title`` or ``headline``."""
	        result = extract_from_json_ld(json_ld_html)

	        assert result is not None
	        assert "title" in result or "headline" in result

	    def test_extract_author_from_json_ld(self, json_ld_html: str) -> None:
	        """When an author is reported, it must not be ``None``."""
	        result = extract_from_json_ld(json_ld_html)

	        assert result is not None
	        if "author" in result:
	            # Author can be string or dict
	            assert result["author"] is not None

	    def test_json_ld_missing(self) -> None:
	        """A page without a JSON-LD script produces ``None``."""
	        html = "<html><body>No JSON-LD here</body></html>"
	        result = extract_from_json_ld(html)

	        assert result is None

	    def test_json_ld_invalid_json(self) -> None:
	        """A JSON-LD script whose body is not valid JSON produces ``None``."""
	        html = '<script type="application/ld+json">{invalid}</script>'
	        result = extract_from_json_ld(html)

	        assert result is None

	    def test_json_ld_wrong_type(self) -> None:
	        """A non-Article JSON-LD document must not crash the extractor.

	        Whether such a document is rejected or extracted is left open here,
	        so both ``None`` and a dict result are accepted.
	        """
	        json_ld = {
	            "@context": "https://schema.org",
	            "@type": "Organization",  # Not Article
	            "name": "Test Org",
	        }
	        html = f'<script type="application/ld+json">{json.dumps(json_ld)}</script>'
	        result = extract_from_json_ld(html)

	        # The previous check (`result is None or result is not None`) could
	        # never fail. Pin the return contract instead: nothing, or a dict like
	        # the one the sibling tests index into.
	        assert result is None or isinstance(result, dict)


	# =============================================================================
	# GRAPHQL RESPONSE TESTS
	# =============================================================================


	class TestExtractFromGraphQLResponse:
	    """Exercise ``extract_from_graphql_response`` on complete and degenerate payloads."""

	    def test_extract_basic_graphql(self, graphql_response: dict) -> None:
	        """The fixture payload yields a result carrying the post title."""
	        extracted = extract_from_graphql_response(graphql_response)

	        assert extracted is not None
	        assert "title" in extracted
	        assert extracted["title"] == "GraphQL Test Article"

	    def test_extract_author_from_graphql(self, graphql_response: dict) -> None:
	        """When an author is reported, it must not be ``None``."""
	        extracted = extract_from_graphql_response(graphql_response)

	        assert extracted is not None
	        if "author" not in extracted:
	            # The author key is optional; nothing further to verify.
	            return
	        assert extracted["author"] is not None

	    def test_extract_tags_from_graphql(self, graphql_response: dict) -> None:
	        """When tags are reported, they come back as a list."""
	        extracted = extract_from_graphql_response(graphql_response)

	        assert extracted is not None
	        if "tags" not in extracted:
	            # The tags key is optional; nothing further to verify.
	            return
	        assert isinstance(extracted["tags"], list)

	    def test_graphql_empty_response(self) -> None:
	        """An empty payload produces ``None``."""
	        payload: dict = {}

	        assert extract_from_graphql_response(payload) is None

	    def test_graphql_missing_data(self) -> None:
	        """A payload with only ``errors`` and no ``data`` produces ``None``."""
	        payload = {"errors": ["Some error"]}

	        assert extract_from_graphql_response(payload) is None

	    def test_graphql_missing_post(self) -> None:
	        """A payload whose ``data`` holds no post produces ``None``."""
	        payload = {"data": {}}

	        assert extract_from_graphql_response(payload) is None

	    def test_graphql_none_input(self) -> None:
	        """``None`` input either returns ``None`` or raises TypeError/AttributeError."""
	        try:
	            outcome = extract_from_graphql_response(None)  # type: ignore
	        except (TypeError, AttributeError):
	            # Rejecting None with one of these exceptions is an accepted outcome.
	            return
	        assert outcome is None


	# =============================================================================
	# EDGE CASES
	# =============================================================================


	class TestExtractorEdgeCases:
	    """Tests for edge cases in Apollo state extraction.

	    These inputs have no ``ROOT_QUERY`` entry, and the tests do not decide
	    whether the extractor should accept them. Each test therefore only
	    requires that the call completes and returns either ``None`` or a dict.
	    The previous assertion (`result is None or result is not None`) was a
	    tautology that could never fail.
	    """

	    def test_unicode_content_apollo(self) -> None:
	        """Non-ASCII titles in the Apollo state must not break extraction."""
	        apollo_data = {"Post:123": {"title": "日本語タイトル", "id": "123"}}
	        html = f"<script>window.__APOLLO_STATE__ = {json.dumps(apollo_data)}</script>"
	        result = extract_from_apollo_state(html)

	        # Should handle Unicode gracefully
	        assert result is None or isinstance(result, dict)

	    def test_very_large_apollo_state(self) -> None:
	        """An Apollo state with 100 post entries must not break extraction."""
	        large_data = {
	            f"Post:{i}": {"title": f"Post {i}", "id": str(i)} for i in range(100)
	        }
	        html = f"<script>window.__APOLLO_STATE__ = {json.dumps(large_data)}</script>"

	        # Should not crash or timeout
	        result = extract_from_apollo_state(html)
	        assert result is None or isinstance(result, dict)

	    def test_special_characters_in_content(self) -> None:
	        """A title containing literal ``<script>`` markup must not break extraction."""
	        apollo_data = {
	            "Post:123": {
	                "title": "Title with <script>alert('xss')</script>",
	                "id": "123",
	            }
	        }
	        html = f"<script>window.__APOLLO_STATE__ = {json.dumps(apollo_data)}</script>"

	        # Should handle without breaking
	        result = extract_from_apollo_state(html)
	        assert result is None or isinstance(result, dict)