"""
Comprehensive tests for AI-Generated Knowledge Base Metadata Extraction

Tests all metadata extraction features:
- Title extraction (from filename, content, URL)
- Summary generation (LLM and fallback)
- Tags extraction (LLM and fallback)
- Topics extraction (LLM and fallback)
- Date detection
- Quality score calculation
- Database storage
- Integration with ingestion pipeline
"""

import asyncio
import json
from unittest.mock import AsyncMock, MagicMock, Mock, patch

import pytest

from backend.api.services.metadata_extractor import MetadataExtractor
from backend.mcp_server.common.database import get_connection, insert_document_chunks

# Markdown document shared by the extractor tests. It is kept at module level
# so the lines stay flush-left and the "#" headers start at column 0.
_SAMPLE_CONTENT = """
# API Documentation Guide

This comprehensive guide covers REST API endpoints, authentication, and best practices.
Published on 2024-01-15, this document provides detailed information about our API.

## Authentication
All API requests require authentication using API keys or OAuth tokens.

## Endpoints
- GET /api/v1/users - List all users
- POST /api/v1/users - Create a new user
- GET /api/v1/users/{id} - Get user by ID

## Examples
Here are some example requests and responses.

## Troubleshooting
Common issues and their solutions.
"""


class TestMetadataExtractor:
    """Test the MetadataExtractor service"""

    @pytest.fixture
    def extractor(self):
        """Create a MetadataExtractor instance"""
        return MetadataExtractor()

    @pytest.fixture
    def sample_content(self):
        """Sample document content for testing"""
        return _SAMPLE_CONTENT

    def test_extract_title_from_filename(self, extractor):
        """Test title extraction from filename"""
        content = "Some content here"
        filename = "API_Documentation_Guide.pdf"

        title = extractor._extract_title(content, filename=filename, url=None)

        assert title == "Api Documentation Guide"

    def test_extract_title_from_content(self, extractor, sample_content):
        """Test title extraction from content (first line or markdown)"""
        title = extractor._extract_title(sample_content, filename=None, url=None)

        # Should extract from markdown header or first meaningful line
        assert len(title) > 0
        assert len(title) < 200

    def test_extract_title_from_url(self, extractor):
        """Test title extraction from URL"""
        content = "Some content"
        url = "https://example.com/api/documentation-guide"

        title = extractor._extract_title(content, filename=None, url=url)

        # URL extraction should return something (may be from URL path or fallback)
        assert len(title) > 0
        assert isinstance(title, str)

    def test_extract_title_fallback(self, extractor):
        """Test title fallback to first 50 chars"""
        content = "This is a very long document that doesn't have a clear title structure and continues with more text"

        title = extractor._extract_title(content, filename=None, url=None)

        assert len(title) > 0
        # Fallback should return first line or first 50 chars (may not have ...)
        assert isinstance(title, str)
        # Title should be reasonable length (not the entire content if content is long)
        # If content is short, title might equal content, which is fine
        if len(content) > 50:
            assert len(title) <= len(content)

    def test_detect_date_formats(self, extractor):
        """Test date detection in various formats"""
        # YYYY-MM-DD format
        content1 = "Published on 2024-01-15"
        date1 = extractor._detect_date(content1)
        assert date1 == "2024-01-15"

        # MM/DD/YYYY format
        content2 = "Created on 01/15/2024"
        date2 = extractor._detect_date(content2)
        assert date2 is not None

        # Month name format
        content3 = "Last updated January 15, 2024"
        date3 = extractor._detect_date(content3)
        assert date3 is not None

    def test_detect_date_none(self, extractor):
        """Test date detection when no date is present"""
        content = "This document has no date information"

        date = extractor._detect_date(content)

        assert date is None

    def test_generate_basic_summary(self, extractor, sample_content):
        """Test basic summary generation"""
        summary = extractor._generate_basic_summary(sample_content)

        assert len(summary) > 0
        assert len(summary) < len(sample_content)
        assert summary.endswith('.')

    def test_extract_basic_tags(self, extractor, sample_content):
        """Test basic tag extraction without LLM"""
        tags = extractor._extract_basic_tags(sample_content)

        assert isinstance(tags, list)
        assert len(tags) > 0
        assert len(tags) <= 8
        # Should find "api" in tags
        assert any("api" in tag.lower() for tag in tags)

    def test_extract_basic_topics(self, extractor, sample_content):
        """Test basic topic extraction without LLM"""
        topics = extractor._extract_basic_topics(sample_content)

        assert isinstance(topics, list)
        assert len(topics) > 0
        assert len(topics) <= 5
        # Should find topics from headers
        assert any("API" in topic or "api" in topic.lower() for topic in topics)

    def test_calculate_quality_score(self, extractor):
        """Test quality score calculation"""
        # Good quality content
        good_content = "This is a well-structured document. " * 50
        good_content += "It has multiple paragraphs. " * 10
        score1 = extractor._calculate_quality_score(good_content, 500, "Good summary")
        assert 0.0 <= score1 <= 1.0
        assert score1 > 0.5  # Should be decent quality

        # Poor quality content
        poor_content = "x" * 100
        score2 = extractor._calculate_quality_score(poor_content, 10, "")
        assert 0.0 <= score2 <= 1.0
        assert score2 < score1  # Should be lower quality

    def test_extract_fallback(self, extractor, sample_content):
        """Test fallback metadata extraction"""
        result = extractor._extract_fallback(sample_content, "Test Title")

        assert "summary" in result
        assert "tags" in result
        assert "topics" in result
        assert isinstance(result["tags"], list)
        assert isinstance(result["topics"], list)
        assert len(result["summary"]) > 0

    @pytest.mark.asyncio
    async def test_extract_with_llm_success(self, extractor, sample_content):
        """Test LLM-based metadata extraction (mocked)"""
        # Mock LLM response
        mock_response = json.dumps({
            "summary": "This document provides comprehensive API documentation.",
            "tags": ["api", "documentation", "rest", "endpoints"],
            "topics": ["API", "REST", "Endpoints"],
            "domain": "Software Development"
        })

        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = mock_response

            result = await extractor._extract_with_llm(sample_content, "API Documentation")

        assert "summary" in result
        assert "tags" in result
        assert "topics" in result
        assert len(result["tags"]) > 0
        assert len(result["topics"]) > 0
        assert "api" in [tag.lower() for tag in result["tags"]]

    @pytest.mark.asyncio
    async def test_extract_with_llm_timeout(self, extractor, sample_content):
        """Test LLM extraction timeout handling"""
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.side_effect = asyncio.TimeoutError()

            with pytest.raises(Exception) as exc_info:
                await extractor._extract_with_llm(sample_content, "Test")

        # This check must live outside the ``pytest.raises`` block: statements
        # placed after the raising call inside that block never execute.
        assert (
            "timeout" in str(exc_info.value).lower()
            or isinstance(exc_info.value, asyncio.TimeoutError)
        )

    @pytest.mark.asyncio
    async def test_extract_metadata_full(self, extractor, sample_content):
        """Test full metadata extraction (with LLM fallback)"""
        # Mock LLM to fail (will use fallback)
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.side_effect = Exception("LLM unavailable")

            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md",
                url=None,
                source_type="markdown"
            )

        # Verify all required fields
        assert "title" in metadata
        assert "summary" in metadata
        assert "tags" in metadata
        assert "topics" in metadata
        assert "detected_date" in metadata
        assert "quality_score" in metadata
        assert "word_count" in metadata
        assert "char_count" in metadata
        assert "source_type" in metadata
        assert "extraction_method" in metadata

        # Verify data types and ranges
        assert isinstance(metadata["title"], str)
        assert isinstance(metadata["summary"], str)
        assert isinstance(metadata["tags"], list)
        assert isinstance(metadata["topics"], list)
        assert isinstance(metadata["quality_score"], float)
        assert 0.0 <= metadata["quality_score"] <= 1.0
        assert metadata["word_count"] > 0
        assert metadata["extraction_method"] in ["llm", "fallback"]

    @pytest.mark.asyncio
    async def test_extract_metadata_with_llm(self, extractor, sample_content):
        """Test metadata extraction with successful LLM call"""
        mock_response = json.dumps({
            "summary": "Comprehensive API documentation guide.",
            "tags": ["api", "documentation", "rest"],
            "topics": ["API", "REST", "Documentation"],
            "domain": "API"
        })

        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = mock_response

            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md"
            )

        assert metadata["extraction_method"] == "llm"
        assert len(metadata["summary"]) > 0
        assert len(metadata["tags"]) > 0
        assert len(metadata["topics"]) > 0


class TestDatabaseMetadataStorage:
    """Test database storage of metadata"""

    @pytest.fixture
    def sample_metadata(self):
        """Sample metadata for testing"""
        return {
            "title": "Test Document",
            "summary": "This is a test document for metadata extraction.",
            "tags": ["test", "documentation"],
            "topics": ["Testing", "Metadata"],
            "detected_date": "2024-01-15",
            "quality_score": 0.85,
            "word_count": 100,
            "char_count": 500,
            "source_type": "txt",
            "extraction_method": "llm"
        }

    def test_insert_with_metadata(self, sample_metadata):
        """Test inserting document chunk with metadata

        Skipped when no database connection can be opened. The rows written
        for the test tenant are deleted in a ``finally`` block so a failing
        query or assertion cannot leave test data or an open connection behind.
        """
        # This test requires a real database connection
        # Skip if database is not available
        try:
            conn = get_connection()
            conn.close()
        except Exception:
            pytest.skip("Database not available for testing")

        tenant_id = "test_tenant_metadata"
        text = "This is a test chunk with metadata."
        # Generate a simple embedding (384 dimensions)
        embedding = [0.1] * 384

        # Insert with metadata
        insert_document_chunks(
            tenant_id=tenant_id,
            text=text,
            embedding=embedding,
            metadata=sample_metadata,
            doc_id="test_doc_123"
        )

        # Verify insertion by querying
        conn = get_connection()
        try:
            cur = conn.cursor()
            try:
                cur.execute("""
                    SELECT metadata, doc_id
                    FROM documents
                    WHERE tenant_id = %s AND chunk_text = %s
                    LIMIT 1;
                """, (tenant_id, text))
                result = cur.fetchone()
            finally:
                # Cleanup always runs. The rollback clears a transaction that
                # a failed SELECT may have aborted, so the DELETE can proceed.
                conn.rollback()
                cur.execute("DELETE FROM documents WHERE tenant_id = %s", (tenant_id,))
                conn.commit()
                cur.close()
        finally:
            conn.close()

        assert result is not None
        stored_metadata = result[0]
        stored_doc_id = result[1]

        # Verify metadata was stored correctly
        assert stored_metadata is not None
        assert stored_metadata["title"] == sample_metadata["title"]
        assert stored_metadata["summary"] == sample_metadata["summary"]
        assert stored_metadata["quality_score"] == sample_metadata["quality_score"]

        # Verify doc_id was stored
        assert stored_doc_id == "test_doc_123"


class TestIngestionIntegration:
    """Test metadata extraction integration with ingestion pipeline"""

    @pytest.mark.asyncio
    async def test_metadata_extraction_in_ingestion(self):
        """Test that metadata is extracted during document ingestion"""
        # Project imports stay local, as in the original test, so the rest of
        # this module does not depend on the ingestion modules at import time.
        from backend.api.services.document_ingestion import prepare_ingestion_payload, process_ingestion
        from backend.api.mcp_clients.rag_client import RAGClient

        # Mock RAG client
        mock_rag_client = Mock(spec=RAGClient)
        mock_rag_client.ingest_with_metadata = AsyncMock(return_value={
            "chunks_stored": 3,
            "status": "ok"
        })

        # Prepare payload
        payload = await prepare_ingestion_payload(
            tenant_id="test_tenant",
            content="This is a test document about API documentation. Published on 2024-01-15.",
            source_type="txt",
            filename="api_docs.txt"
        )

        # Process with metadata extraction - patch the import path used in the function
        with patch('backend.api.services.metadata_extractor.MetadataExtractor') as mock_extractor_class:
            mock_extractor = MagicMock()
            mock_extractor.extract_metadata = AsyncMock(return_value={
                "title": "API Documentation",
                "summary": "Test document about APIs",
                "tags": ["api", "documentation"],
                "topics": ["API"],
                "detected_date": "2024-01-15",
                "quality_score": 0.8,
                "word_count": 10,
                "char_count": 50,
                "source_type": "txt",
                "extraction_method": "llm"
            })
            mock_extractor_class.return_value = mock_extractor

            result = await process_ingestion(payload, mock_rag_client, extract_metadata=True)

        # Verify metadata was extracted
        assert "extracted_metadata" in result
        assert result["extracted_metadata"]["title"] == "API Documentation"
        assert result["extracted_metadata"]["quality_score"] == 0.8

        # Verify RAG client was called with metadata
        mock_rag_client.ingest_with_metadata.assert_called_once()
        call_args = mock_rag_client.ingest_with_metadata.call_args
        # Check that metadata was passed (either as kwarg or in the merged metadata)
        assert call_args is not None


class TestMetadataEdgeCases:
    """Test edge cases and error handling"""

    @pytest.mark.asyncio
    async def test_empty_content(self):
        """Test metadata extraction with empty content"""
        extractor = MetadataExtractor()

        metadata = await extractor.extract_metadata(
            content="",
            filename="empty.txt"
        )

        # Should still return metadata structure
        assert "title" in metadata
        assert "summary" in metadata
        assert metadata["word_count"] == 0

    @pytest.mark.asyncio
    async def test_very_long_content(self):
        """Test metadata extraction with very long content"""
        extractor = MetadataExtractor()
        long_content = "Word " * 10000  # 10,000 words

        metadata = await extractor.extract_metadata(
            content=long_content,
            filename="long_doc.txt"
        )

        assert metadata["word_count"] == 10000
        assert len(metadata["summary"]) > 0
        assert metadata["quality_score"] >= 0.0

    @pytest.mark.asyncio
    async def test_special_characters(self):
        """Test metadata extraction with special characters"""
        extractor = MetadataExtractor()
        # Accented letters plus an emoji outside the Basic Multilingual Plane.
        special_content = "Document with émojis 🚀 and spéciál chàracters!"

        metadata = await extractor.extract_metadata(
            content=special_content,
            filename="special.txt"
        )

        assert "title" in metadata
        assert len(metadata["title"]) > 0

    def test_quality_score_edge_cases(self):
        """Test quality score with edge cases"""
        extractor = MetadataExtractor()

        # Very short content
        short_content = "Hi"
        score1 = extractor._calculate_quality_score(short_content, 1, "")
        assert 0.0 <= score1 <= 1.0

        # Very long content
        long_content = "Word " * 20000
        score2 = extractor._calculate_quality_score(long_content, 20000, "Summary")
        assert 0.0 <= score2 <= 1.0

        # No summary
        no_summary = "Content " * 100
        score3 = extractor._calculate_quality_score(no_summary, 100, "")
        assert 0.0 <= score3 <= 1.0


if __name__ == "__main__":
    # Propagate pytest's exit code so a failing run is visible to the caller.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))