Spaces:
Sleeping
Sleeping
| """ | |
| Unit Tests for Extractor Module | |
| Tests for Apollo state, JSON-LD, and GraphQL response extraction. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| import pytest | |
| from src.extractor import ( | |
| extract_from_apollo_state, | |
| extract_from_graphql_response, | |
| extract_from_json_ld, | |
| ) | |
| # ============================================================================= | |
| # FIXTURES | |
| # ============================================================================= | |
def apollo_state_html() -> str:
    """Build an HTML page embedding a sample ``__APOLLO_STATE__`` payload.

    The payload holds one post (``Post:abc123``) reachable from
    ``ROOT_QUERY``, and the user record (``User:user123``) that the post's
    ``creator`` reference points at.

    Returns:
        An HTML document whose single ``<script>`` tag assigns the
        JSON-serialized state to ``window.__APOLLO_STATE__``.
    """
    # NOTE(review): the test classes request this function by name as a pytest
    # fixture, but no ``@pytest.fixture`` decorator is visible on it here --
    # confirm it is registered as one.
    author_record = {
        "id": "user123",
        "name": "Test Author",
        "username": "testauthor",
        "bio": "A test author bio",
    }
    post_record = {
        "id": "abc123",
        "title": "Test Article Title",
        "content": {
            "bodyModel": {
                "paragraphs": [
                    {"type": "H3", "text": "Section Title"},
                    {"type": "P", "text": "This is paragraph content."},
                ]
            }
        },
        # The author is stored by reference, not inline.
        "creator": {"__ref": "User:user123"},
        "tags": [{"name": "python"}, {"name": "testing"}],
        "clapCount": 1500,
    }
    state = {
        "ROOT_QUERY": {"post(id:\"abc123\")": {"__ref": "Post:abc123"}},
        "Post:abc123": post_record,
        "User:user123": author_record,
    }
    serialized = json.dumps(state)
    return f"""
    <!DOCTYPE html>
    <html>
    <body>
    <script>window.__APOLLO_STATE__ = {serialized}</script>
    </body>
    </html>
    """
def json_ld_html() -> str:
    """Build an HTML page carrying a schema.org ``Article`` as JSON-LD.

    Returns:
        An HTML document whose ``<head>`` contains one
        ``<script type="application/ld+json">`` tag with the serialized
        article metadata (headline, description, author, publish date,
        publisher).
    """
    # NOTE(review): the test classes request this function by name as a pytest
    # fixture, but no ``@pytest.fixture`` decorator is visible on it here --
    # confirm it is registered as one.
    author = {
        "@type": "Person",
        "name": "JSON-LD Author",
        "url": "https://medium.com/@jsonldauthor",
    }
    publisher = {
        "@type": "Organization",
        "name": "Medium",
    }
    article = {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": "JSON-LD Test Article",
        "description": "A test article with JSON-LD data",
        "author": author,
        "datePublished": "2024-01-15T10:00:00Z",
        "publisher": publisher,
    }
    serialized = json.dumps(article)
    return f"""
    <!DOCTYPE html>
    <html>
    <head>
    <script type="application/ld+json">{serialized}</script>
    </head>
    <body></body>
    </html>
    """
def graphql_response() -> dict:
    """Return a sample GraphQL API response wrapping a single post.

    Returns:
        A dict shaped ``{"data": {"post": {...}}}`` where the post carries an
        id, title, one body paragraph, an inline creator record, two tags,
        a clap count and a reading time.
    """
    # NOTE(review): the test classes request this function by name as a pytest
    # fixture, but no ``@pytest.fixture`` decorator is visible on it here --
    # confirm it is registered as one.
    body_model = {
        "paragraphs": [
            {"type": "P", "text": "GraphQL content paragraph."},
        ]
    }
    creator = {
        "id": "creator123",
        "name": "GraphQL Author",
        "username": "graphqlauthor",
    }
    post = {
        "id": "graphql123",
        "title": "GraphQL Test Article",
        "content": {"bodyModel": body_model},
        "creator": creator,
        "tags": [{"name": "graphql"}, {"name": "api"}],
        "clapCount": 500,
        "readingTime": 5,
    }
    return {"data": {"post": post}}
| # ============================================================================= | |
| # APOLLO STATE TESTS | |
| # ============================================================================= | |
class TestExtractFromApolloState:
    """Checks for ``extract_from_apollo_state`` on valid and broken pages."""

    def test_extract_basic_apollo_state(self, apollo_state_html: str) -> None:
        """A populated Apollo state yields a result exposing the post title."""
        extracted = extract_from_apollo_state(apollo_state_html)

        assert extracted is not None
        assert "title" in extracted
        assert extracted["title"] == "Test Article Title"

    def test_extract_author_from_apollo(self, apollo_state_html: str) -> None:
        """If an author is extracted, it is a string or carries a ``name``."""
        extracted = extract_from_apollo_state(apollo_state_html)

        assert extracted is not None
        if "author" not in extracted:
            # Author extraction is optional; nothing further to verify.
            return
        author = extracted["author"]
        assert "name" in author or isinstance(author, str)

    def test_apollo_state_missing(self) -> None:
        """A page without any Apollo state produces ``None``."""
        page = "<html><body>No Apollo state here</body></html>"

        assert extract_from_apollo_state(page) is None

    def test_apollo_state_invalid_json(self) -> None:
        """An Apollo state that is not valid JSON produces ``None``."""
        page = "<script>window.__APOLLO_STATE__ = {invalid json}</script>"

        assert extract_from_apollo_state(page) is None

    def test_apollo_state_empty_object(self) -> None:
        """An empty Apollo state object produces ``None``."""
        page = "<script>window.__APOLLO_STATE__ = {}</script>"

        assert extract_from_apollo_state(page) is None
| # ============================================================================= | |
| # JSON-LD TESTS | |
| # ============================================================================= | |
class TestExtractFromJsonLD:
    """Tests for JSON-LD extraction via ``extract_from_json_ld``."""

    def test_extract_basic_json_ld(self, json_ld_html: str) -> None:
        """A valid Article block yields a result with a title or headline."""
        result = extract_from_json_ld(json_ld_html)

        assert result is not None
        assert "title" in result or "headline" in result

    def test_extract_author_from_json_ld(self, json_ld_html: str) -> None:
        """If an author is extracted from JSON-LD, it must not be ``None``."""
        result = extract_from_json_ld(json_ld_html)

        assert result is not None
        if "author" in result:
            # Author can be string or dict
            assert result["author"] is not None

    def test_json_ld_missing(self) -> None:
        """A page without a JSON-LD script produces ``None``."""
        html = "<html><body>No JSON-LD here</body></html>"

        result = extract_from_json_ld(html)

        assert result is None

    def test_json_ld_invalid_json(self) -> None:
        """A JSON-LD script holding invalid JSON produces ``None``."""
        html = '<script type="application/ld+json">{invalid}</script>'

        result = extract_from_json_ld(html)

        assert result is None

    def test_json_ld_wrong_type(self) -> None:
        """A non-Article JSON-LD block is handled without raising.

        Either skipping the block (``None``) or extracting from it is
        accepted, but whatever comes back must have a usable type.
        """
        json_ld = {
            "@context": "https://schema.org",
            "@type": "Organization",  # Not Article
            "name": "Test Org",
        }
        html = f'<script type="application/ld+json">{json.dumps(json_ld)}</script>'

        result = extract_from_json_ld(html)

        # The previous check (`result is None or result is not None`) was a
        # tautology and could never fail. Verify the result type instead.
        # NOTE(review): assumes the extractor returns a plain dict on success,
        # as the key lookups in the other tests suggest -- confirm against
        # src.extractor.
        assert result is None or isinstance(result, dict)
| # ============================================================================= | |
| # GRAPHQL RESPONSE TESTS | |
| # ============================================================================= | |
class TestExtractFromGraphQLResponse:
    """Checks for ``extract_from_graphql_response`` on full and partial payloads."""

    def test_extract_basic_graphql(self, graphql_response: dict) -> None:
        """A complete response yields a result exposing the post title."""
        extracted = extract_from_graphql_response(graphql_response)

        assert extracted is not None
        assert "title" in extracted
        assert extracted["title"] == "GraphQL Test Article"

    def test_extract_author_from_graphql(self, graphql_response: dict) -> None:
        """If an author is extracted, it must not be ``None``."""
        extracted = extract_from_graphql_response(graphql_response)

        assert extracted is not None
        if "author" not in extracted:
            # Author extraction is optional; nothing further to verify.
            return
        assert extracted["author"] is not None

    def test_extract_tags_from_graphql(self, graphql_response: dict) -> None:
        """If tags are extracted, they come back as a list."""
        extracted = extract_from_graphql_response(graphql_response)

        assert extracted is not None
        if "tags" not in extracted:
            # Tag extraction is optional; nothing further to verify.
            return
        assert isinstance(extracted["tags"], list)

    def test_graphql_empty_response(self) -> None:
        """An empty response dict produces ``None``."""
        assert extract_from_graphql_response({}) is None

    def test_graphql_missing_data(self) -> None:
        """A response with errors but no ``data`` field produces ``None``."""
        errors_only = {"errors": ["Some error"]}

        assert extract_from_graphql_response(errors_only) is None

    def test_graphql_missing_post(self) -> None:
        """A response whose ``data`` holds no post produces ``None``."""
        no_post = {"data": {}}

        assert extract_from_graphql_response(no_post) is None

    def test_graphql_none_input(self) -> None:
        """``None`` input either returns ``None`` or raises a type-style error."""
        # Keep the try body to the single call so only the extractor's own
        # TypeError/AttributeError is tolerated; an assertion failure still
        # propagates.
        try:
            extracted = extract_from_graphql_response(None)  # type: ignore
        except (TypeError, AttributeError):
            pass  # Expected for None input
        else:
            assert extracted is None
| # ============================================================================= | |
| # EDGE CASES | |
| # ============================================================================= | |
class TestExtractorEdgeCases:
    """Tests for edge cases in Apollo state extraction.

    Each test feeds an unusual Apollo state to ``extract_from_apollo_state``
    and accepts either ``None`` or an extracted result. The former checks
    (`result is None or result is not None`) were tautologies that could
    never fail, so each test now verifies the type of a non-None result.

    NOTE(review): the ``isinstance(..., dict)`` checks assume the extractor
    returns a plain dict on success, as the key lookups elsewhere in this
    module suggest -- confirm against src.extractor.
    """

    def test_unicode_content_apollo(self) -> None:
        """Unicode titles survive extraction when a title is returned."""
        title = "日本語タイトル"
        apollo_data = {"Post:123": {"title": title, "id": "123"}}
        html = f"<script>window.__APOLLO_STATE__ = {json.dumps(apollo_data)}</script>"

        result = extract_from_apollo_state(html)

        # Should handle Unicode gracefully: no exception, and no mangling of
        # the text if the extractor does surface a title.
        assert result is None or isinstance(result, dict)
        if result is not None and "title" in result:
            assert result["title"] == title

    def test_very_large_apollo_state(self) -> None:
        """A state with many post entries is processed without raising."""
        large_data = {
            f"Post:{i}": {"title": f"Post {i}", "id": str(i)} for i in range(100)
        }
        html = f"<script>window.__APOLLO_STATE__ = {json.dumps(large_data)}</script>"

        # Should not crash or timeout
        result = extract_from_apollo_state(html)

        assert result is None or isinstance(result, dict)

    def test_special_characters_in_content(self) -> None:
        """Markup embedded in a title does not break extraction."""
        apollo_data = {
            "Post:123": {
                "title": "Title with <script>alert('xss')</script>",
                "id": "123",
            }
        }
        html = f"<script>window.__APOLLO_STATE__ = {json.dumps(apollo_data)}</script>"

        # Should handle without breaking. The title is not compared because
        # the embedded closing script tag may legitimately cut the payload
        # short.
        result = extract_from_apollo_state(html)

        assert result is None or isinstance(result, dict)