Spaces:
Sleeping
Sleeping
feat: Add AI metadata extraction, latency prediction, context-aware routing, and tool output schemas
d1e5882
| """ | |
| Comprehensive tests for AI-Generated Knowledge Base Metadata Extraction | |
| Tests all metadata extraction features: | |
| - Title extraction (from filename, content, URL) | |
| - Summary generation (LLM and fallback) | |
| - Tags extraction (LLM and fallback) | |
| - Topics extraction (LLM and fallback) | |
| - Date detection | |
| - Quality score calculation | |
| - Database storage | |
| - Integration with ingestion pipeline | |
| """ | |
| import pytest | |
| import asyncio | |
| from unittest.mock import Mock, patch, AsyncMock | |
| from backend.api.services.metadata_extractor import MetadataExtractor | |
| from backend.mcp_server.common.database import insert_document_chunks, get_connection | |
| import json | |
class TestMetadataExtractor:
    """Unit tests for the MetadataExtractor service helpers.

    Covers title extraction (filename / content / URL / fallback), date
    detection, basic summary/tag/topic extraction, quality scoring, the
    non-LLM fallback path, and the mocked LLM extraction path.
    """

    @pytest.fixture
    def extractor(self):
        """Create a MetadataExtractor instance."""
        # NOTE: without @pytest.fixture, pytest would fail with
        # "fixture 'extractor' not found" in every test below.
        return MetadataExtractor()

    @pytest.fixture
    def sample_content(self):
        """Sample document content for testing."""
        return """
# API Documentation Guide
This comprehensive guide covers REST API endpoints, authentication, and best practices.
Published on 2024-01-15, this document provides detailed information about our API.
## Authentication
All API requests require authentication using API keys or OAuth tokens.
## Endpoints
- GET /api/v1/users - List all users
- POST /api/v1/users - Create a new user
- GET /api/v1/users/{id} - Get user by ID
## Examples
Here are some example requests and responses.
## Troubleshooting
Common issues and their solutions.
"""

    def test_extract_title_from_filename(self, extractor):
        """Test title extraction from filename."""
        content = "Some content here"
        filename = "API_Documentation_Guide.pdf"
        title = extractor._extract_title(content, filename=filename, url=None)
        # Underscores become spaces and words are title-cased.
        assert title == "Api Documentation Guide"
        assert "API" in title or "Api" in title

    def test_extract_title_from_content(self, extractor, sample_content):
        """Test title extraction from content (first line or markdown)."""
        title = extractor._extract_title(sample_content, filename=None, url=None)
        # Should extract from markdown header or first meaningful line.
        assert len(title) > 0
        assert len(title) < 200

    def test_extract_title_from_url(self, extractor):
        """Test title extraction from URL."""
        content = "Some content"
        url = "https://example.com/api/documentation-guide"
        title = extractor._extract_title(content, filename=None, url=url)
        # URL extraction should return something (may be from URL path or fallback).
        assert len(title) > 0
        assert isinstance(title, str)

    def test_extract_title_fallback(self, extractor):
        """Test title fallback to first 50 chars."""
        content = "This is a very long document that doesn't have a clear title structure and continues with more text"
        title = extractor._extract_title(content, filename=None, url=None)
        assert len(title) > 0
        # Fallback should return first line or first 50 chars (may not have ...).
        assert isinstance(title, str)
        # Title should be reasonable length (not the entire content if content is long).
        # If content is short, title might equal content, which is fine.
        if len(content) > 50:
            assert len(title) <= len(content)

    def test_detect_date_formats(self, extractor):
        """Test date detection in various formats."""
        # YYYY-MM-DD format
        content1 = "Published on 2024-01-15"
        date1 = extractor._detect_date(content1)
        assert date1 == "2024-01-15"
        # MM/DD/YYYY format
        content2 = "Created on 01/15/2024"
        date2 = extractor._detect_date(content2)
        assert date2 is not None
        # Month name format
        content3 = "Last updated January 15, 2024"
        date3 = extractor._detect_date(content3)
        assert date3 is not None

    def test_detect_date_none(self, extractor):
        """Test date detection when no date is present."""
        content = "This document has no date information"
        date = extractor._detect_date(content)
        assert date is None

    def test_generate_basic_summary(self, extractor, sample_content):
        """Test basic summary generation."""
        summary = extractor._generate_basic_summary(sample_content)
        assert len(summary) > 0
        assert len(summary) < len(sample_content)
        assert summary.endswith('.')

    def test_extract_basic_tags(self, extractor, sample_content):
        """Test basic tag extraction without LLM."""
        tags = extractor._extract_basic_tags(sample_content)
        assert isinstance(tags, list)
        assert len(tags) > 0
        assert len(tags) <= 8
        # Should find "api" in tags.
        assert any("api" in tag.lower() for tag in tags)

    def test_extract_basic_topics(self, extractor, sample_content):
        """Test basic topic extraction without LLM."""
        topics = extractor._extract_basic_topics(sample_content)
        assert isinstance(topics, list)
        assert len(topics) > 0
        assert len(topics) <= 5
        # Should find topics from headers.
        assert any("API" in topic or "api" in topic.lower() for topic in topics)

    def test_calculate_quality_score(self, extractor):
        """Test quality score calculation."""
        # Good quality content: long, structured, with a summary.
        good_content = "This is a well-structured document. " * 50
        good_content += "It has multiple paragraphs. " * 10
        score1 = extractor._calculate_quality_score(good_content, 500, "Good summary")
        assert 0.0 <= score1 <= 1.0
        assert score1 > 0.5  # Should be decent quality
        # Poor quality content: repetitive, short, no summary.
        poor_content = "x" * 100
        score2 = extractor._calculate_quality_score(poor_content, 10, "")
        assert 0.0 <= score2 <= 1.0
        assert score2 < score1  # Should be lower quality

    def test_extract_fallback(self, extractor, sample_content):
        """Test fallback metadata extraction."""
        result = extractor._extract_fallback(sample_content, "Test Title")
        assert "summary" in result
        assert "tags" in result
        assert "topics" in result
        assert isinstance(result["tags"], list)
        assert isinstance(result["topics"], list)
        assert len(result["summary"]) > 0

    @pytest.mark.asyncio
    async def test_extract_with_llm_success(self, extractor, sample_content):
        """Test LLM-based metadata extraction (mocked)."""
        # Mock LLM response as the JSON payload the extractor expects.
        mock_response = json.dumps({
            "summary": "This document provides comprehensive API documentation.",
            "tags": ["api", "documentation", "rest", "endpoints"],
            "topics": ["API", "REST", "Endpoints"],
            "domain": "Software Development"
        })
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = mock_response
            result = await extractor._extract_with_llm(sample_content, "API Documentation")
            assert "summary" in result
            assert "tags" in result
            assert "topics" in result
            assert len(result["tags"]) > 0
            assert len(result["topics"]) > 0
            assert "api" in [tag.lower() for tag in result["tags"]]

    @pytest.mark.asyncio
    async def test_extract_with_llm_timeout(self, extractor, sample_content):
        """Test LLM extraction timeout handling."""
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.side_effect = asyncio.TimeoutError()
            with pytest.raises(Exception) as exc_info:
                await extractor._extract_with_llm(sample_content, "Test")
            # Either the raw TimeoutError propagates or it is wrapped with a
            # "timeout" message — both are acceptable.
            assert "timeout" in str(exc_info.value).lower() or isinstance(exc_info.value, asyncio.TimeoutError)

    @pytest.mark.asyncio
    async def test_extract_metadata_full(self, extractor, sample_content):
        """Test full metadata extraction (with LLM fallback)."""
        # Mock LLM to fail (will use fallback).
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.side_effect = Exception("LLM unavailable")
            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md",
                url=None,
                source_type="markdown"
            )
            # Verify all required fields are present.
            for field in ("title", "summary", "tags", "topics", "detected_date",
                          "quality_score", "word_count", "char_count",
                          "source_type", "extraction_method"):
                assert field in metadata
            # Verify data types and ranges.
            assert isinstance(metadata["title"], str)
            assert isinstance(metadata["summary"], str)
            assert isinstance(metadata["tags"], list)
            assert isinstance(metadata["topics"], list)
            assert isinstance(metadata["quality_score"], float)
            assert 0.0 <= metadata["quality_score"] <= 1.0
            assert metadata["word_count"] > 0
            assert metadata["extraction_method"] in ["llm", "fallback"]

    @pytest.mark.asyncio
    async def test_extract_metadata_with_llm(self, extractor, sample_content):
        """Test metadata extraction with successful LLM call."""
        mock_response = json.dumps({
            "summary": "Comprehensive API documentation guide.",
            "tags": ["api", "documentation", "rest"],
            "topics": ["API", "REST", "Documentation"],
            "domain": "API"
        })
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = mock_response
            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md"
            )
            assert metadata["extraction_method"] == "llm"
            assert len(metadata["summary"]) > 0
            assert len(metadata["tags"]) > 0
            assert len(metadata["topics"]) > 0
class TestDatabaseMetadataStorage:
    """Test database storage of metadata.

    Requires a live database; tests skip themselves when no connection is
    available.
    """

    @pytest.fixture
    def sample_metadata(self):
        """Sample metadata for testing."""
        # NOTE: without @pytest.fixture, pytest would fail with
        # "fixture 'sample_metadata' not found".
        return {
            "title": "Test Document",
            "summary": "This is a test document for metadata extraction.",
            "tags": ["test", "documentation"],
            "topics": ["Testing", "Metadata"],
            "detected_date": "2024-01-15",
            "quality_score": 0.85,
            "word_count": 100,
            "char_count": 500,
            "source_type": "txt",
            "extraction_method": "llm"
        }

    def test_insert_with_metadata(self, sample_metadata):
        """Test inserting a document chunk with metadata and reading it back."""
        # This test requires a real database connection; skip if unavailable.
        try:
            conn = get_connection()
            conn.close()
        except Exception:
            pytest.skip("Database not available for testing")
        tenant_id = "test_tenant_metadata"
        text = "This is a test chunk with metadata."
        # Generate a simple embedding (384 dimensions).
        embedding = [0.1] * 384
        # Insert with metadata.
        insert_document_chunks(
            tenant_id=tenant_id,
            text=text,
            embedding=embedding,
            metadata=sample_metadata,
            doc_id="test_doc_123"
        )
        # Verify insertion by querying it back.
        conn = get_connection()
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT metadata, doc_id
                FROM documents
                WHERE tenant_id = %s
                AND chunk_text = %s
                LIMIT 1;
            """, (tenant_id, text))
            result = cur.fetchone()
            assert result is not None
            stored_metadata = result[0]
            stored_doc_id = result[1]
            # Verify metadata was stored correctly.
            assert stored_metadata is not None
            assert stored_metadata["title"] == sample_metadata["title"]
            assert stored_metadata["summary"] == sample_metadata["summary"]
            assert stored_metadata["quality_score"] == sample_metadata["quality_score"]
            # Verify doc_id was stored.
            assert stored_doc_id == "test_doc_123"
        finally:
            # Clean up test rows and close resources even if an assertion
            # above fails, so later tests see a clean table and no leaked
            # connection.
            cur.execute("DELETE FROM documents WHERE tenant_id = %s", (tenant_id,))
            conn.commit()
            cur.close()
            conn.close()
class TestIngestionIntegration:
    """Test metadata extraction integration with the ingestion pipeline."""

    @pytest.mark.asyncio
    async def test_metadata_extraction_in_ingestion(self):
        """Test that metadata is extracted during document ingestion."""
        from backend.api.services.document_ingestion import prepare_ingestion_payload, process_ingestion
        from backend.api.mcp_clients.rag_client import RAGClient
        from unittest.mock import AsyncMock, patch, MagicMock
        # Mock RAG client so no real ingestion backend is needed.
        mock_rag_client = Mock(spec=RAGClient)
        mock_rag_client.ingest_with_metadata = AsyncMock(return_value={
            "chunks_stored": 3,
            "status": "ok"
        })
        # Prepare payload.
        payload = await prepare_ingestion_payload(
            tenant_id="test_tenant",
            content="This is a test document about API documentation. Published on 2024-01-15.",
            source_type="txt",
            filename="api_docs.txt"
        )
        # Process with metadata extraction — patch the import path used in the function.
        with patch('backend.api.services.metadata_extractor.MetadataExtractor') as mock_extractor_class:
            mock_extractor = MagicMock()
            mock_extractor.extract_metadata = AsyncMock(return_value={
                "title": "API Documentation",
                "summary": "Test document about APIs",
                "tags": ["api", "documentation"],
                "topics": ["API"],
                "detected_date": "2024-01-15",
                "quality_score": 0.8,
                "word_count": 10,
                "char_count": 50,
                "source_type": "txt",
                "extraction_method": "llm"
            })
            mock_extractor_class.return_value = mock_extractor
            result = await process_ingestion(payload, mock_rag_client, extract_metadata=True)
            # Verify metadata was extracted and surfaced in the result.
            assert "extracted_metadata" in result
            assert result["extracted_metadata"]["title"] == "API Documentation"
            assert result["extracted_metadata"]["quality_score"] == 0.8
            # Verify RAG client was called with metadata.
            mock_rag_client.ingest_with_metadata.assert_called_once()
            call_args = mock_rag_client.ingest_with_metadata.call_args
            # Check that metadata was passed (either as kwarg or in the merged metadata).
            assert call_args is not None
class TestMetadataEdgeCases:
    """Test edge cases and error handling for metadata extraction."""

    @pytest.mark.asyncio
    async def test_empty_content(self):
        """Test metadata extraction with empty content."""
        extractor = MetadataExtractor()
        metadata = await extractor.extract_metadata(
            content="",
            filename="empty.txt"
        )
        # Should still return the full metadata structure.
        assert "title" in metadata
        assert "summary" in metadata
        assert metadata["word_count"] == 0

    @pytest.mark.asyncio
    async def test_very_long_content(self):
        """Test metadata extraction with very long content."""
        extractor = MetadataExtractor()
        long_content = "Word " * 10000  # 10,000 words
        metadata = await extractor.extract_metadata(
            content=long_content,
            filename="long_doc.txt"
        )
        assert metadata["word_count"] == 10000
        assert len(metadata["summary"]) > 0
        assert metadata["quality_score"] >= 0.0

    @pytest.mark.asyncio
    async def test_special_characters(self):
        """Test metadata extraction with special characters."""
        extractor = MetadataExtractor()
        special_content = "Document with émojis 🚀 and spéciál chàracters!"
        metadata = await extractor.extract_metadata(
            content=special_content,
            filename="special.txt"
        )
        assert "title" in metadata
        assert len(metadata["title"]) > 0

    def test_quality_score_edge_cases(self):
        """Test quality score with edge cases."""
        extractor = MetadataExtractor()
        # Very short content.
        short_content = "Hi"
        score1 = extractor._calculate_quality_score(short_content, 1, "")
        assert 0.0 <= score1 <= 1.0
        # Very long content.
        long_content = "Word " * 20000
        score2 = extractor._calculate_quality_score(long_content, 20000, "Summary")
        assert 0.0 <= score2 <= 1.0
        # No summary.
        no_summary = "Content " * 100
        score3 = extractor._calculate_quality_score(no_summary, 100, "")
        assert 0.0 <= score3 <= 1.0
if __name__ == "__main__":
    # Allow running this test module directly (outside the pytest CLI)
    # with verbose output and short tracebacks.
    pytest.main([__file__, "-v", "--tb=short"])