File size: 18,087 Bytes
d1e5882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
"""
Comprehensive tests for AI-Generated Knowledge Base Metadata Extraction

Tests all metadata extraction features:
- Title extraction (from filename, content, URL)
- Summary generation (LLM and fallback)
- Tags extraction (LLM and fallback)
- Topics extraction (LLM and fallback)
- Date detection
- Quality score calculation
- Database storage
- Integration with ingestion pipeline
"""

import asyncio
import json
from unittest.mock import AsyncMock, MagicMock, Mock, patch

import pytest

from backend.api.services.metadata_extractor import MetadataExtractor
from backend.mcp_server.common.database import insert_document_chunks, get_connection


class TestMetadataExtractor:
    """Unit tests for the MetadataExtractor service helpers."""

    @pytest.fixture
    def extractor(self):
        """Provide a fresh MetadataExtractor for each test."""
        return MetadataExtractor()

    @pytest.fixture
    def sample_content(self):
        """Representative markdown document used across the tests."""
        return """
        # API Documentation Guide
        
        This comprehensive guide covers REST API endpoints, authentication, and best practices.
        Published on 2024-01-15, this document provides detailed information about our API.
        
        ## Authentication
        All API requests require authentication using API keys or OAuth tokens.
        
        ## Endpoints
        - GET /api/v1/users - List all users
        - POST /api/v1/users - Create a new user
        - GET /api/v1/users/{id} - Get user by ID
        
        ## Examples
        Here are some example requests and responses.
        
        ## Troubleshooting
        Common issues and their solutions.
        """

    def test_extract_title_from_filename(self, extractor):
        """A filename is cleaned up into a human-readable title."""
        doc_text = "Some content here"
        result = extractor._extract_title(
            doc_text, filename="API_Documentation_Guide.pdf", url=None
        )
        assert result == "Api Documentation Guide"
        assert "API" in result or "Api" in result

    def test_extract_title_from_content(self, extractor, sample_content):
        """Content-based extraction yields a bounded, non-empty title."""
        result = extractor._extract_title(sample_content, filename=None, url=None)
        # Expect the markdown header or first meaningful line, within limits.
        assert 0 < len(result) < 200

    def test_extract_title_from_url(self, extractor):
        """URL-based extraction returns some non-empty string."""
        result = extractor._extract_title(
            "Some content",
            filename=None,
            url="https://example.com/api/documentation-guide",
        )
        # May come from the URL path or a fallback, but must be a real string.
        assert isinstance(result, str)
        assert len(result) > 0

    def test_extract_title_fallback(self, extractor):
        """Without structure, the extractor falls back to a truncated title."""
        text = "This is a very long document that doesn't have a clear title structure and continues with more text"
        result = extractor._extract_title(text, filename=None, url=None)
        assert isinstance(result, str)
        assert len(result) > 0
        # For long inputs the fallback must not exceed the source length.
        # (For short content, title == content is acceptable.)
        if len(text) > 50:
            assert len(result) <= len(text)

    def test_detect_date_formats(self, extractor):
        """Dates are recognized in ISO, US-slash, and month-name styles."""
        # ISO YYYY-MM-DD is returned verbatim.
        assert extractor._detect_date("Published on 2024-01-15") == "2024-01-15"
        # MM/DD/YYYY is recognized.
        assert extractor._detect_date("Created on 01/15/2024") is not None
        # Spelled-out month names are recognized.
        assert extractor._detect_date("Last updated January 15, 2024") is not None

    def test_detect_date_none(self, extractor):
        """Absence of any date yields None."""
        assert extractor._detect_date("This document has no date information") is None

    def test_generate_basic_summary(self, extractor, sample_content):
        """The fallback summary is non-empty, shorter than the source, and sentence-terminated."""
        summary = extractor._generate_basic_summary(sample_content)
        assert len(summary) > 0
        assert len(summary) < len(sample_content)
        assert summary.endswith('.')

    def test_extract_basic_tags(self, extractor, sample_content):
        """Non-LLM tag extraction returns a bounded list that includes 'api'."""
        tags = extractor._extract_basic_tags(sample_content)
        assert isinstance(tags, list)
        assert 0 < len(tags) <= 8
        lowered = [t.lower() for t in tags]
        assert any("api" in t for t in lowered)

    def test_extract_basic_topics(self, extractor, sample_content):
        """Non-LLM topic extraction pulls topics from the section headers."""
        topics = extractor._extract_basic_topics(sample_content)
        assert isinstance(topics, list)
        assert 0 < len(topics) <= 5
        assert any("API" in topic or "api" in topic.lower() for topic in topics)

    def test_calculate_quality_score(self, extractor):
        """Richer content scores higher than degenerate content."""
        rich = "This is a well-structured document. " * 50
        rich += "It has multiple paragraphs. " * 10
        rich_score = extractor._calculate_quality_score(rich, 500, "Good summary")
        assert 0.0 <= rich_score <= 1.0
        assert rich_score > 0.5  # decent quality expected

        degenerate_score = extractor._calculate_quality_score("x" * 100, 10, "")
        assert 0.0 <= degenerate_score <= 1.0
        assert degenerate_score < rich_score  # must rank below the rich content

    def test_extract_fallback(self, extractor, sample_content):
        """Fallback extraction returns summary/tags/topics with sane types."""
        result = extractor._extract_fallback(sample_content, "Test Title")
        for key in ("summary", "tags", "topics"):
            assert key in result
        assert isinstance(result["tags"], list)
        assert isinstance(result["topics"], list)
        assert len(result["summary"]) > 0

    @pytest.mark.asyncio
    async def test_extract_with_llm_success(self, extractor, sample_content):
        """A well-formed LLM response is parsed into summary/tags/topics."""
        canned = json.dumps({
            "summary": "This document provides comprehensive API documentation.",
            "tags": ["api", "documentation", "rest", "endpoints"],
            "topics": ["API", "REST", "Endpoints"],
            "domain": "Software Development"
        })

        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = canned
            result = await extractor._extract_with_llm(sample_content, "API Documentation")

        for key in ("summary", "tags", "topics"):
            assert key in result
        assert len(result["tags"]) > 0
        assert len(result["topics"]) > 0
        assert "api" in [tag.lower() for tag in result["tags"]]

    @pytest.mark.asyncio
    async def test_extract_with_llm_timeout(self, extractor, sample_content):
        """An LLM timeout surfaces as an exception rather than being swallowed."""
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.side_effect = asyncio.TimeoutError()

            with pytest.raises(Exception) as exc_info:
                await extractor._extract_with_llm(sample_content, "Test")
            raised = exc_info.value
            assert "timeout" in str(raised).lower() or isinstance(raised, asyncio.TimeoutError)

    @pytest.mark.asyncio
    async def test_extract_metadata_full(self, extractor, sample_content):
        """When the LLM fails, the full pipeline still returns complete metadata."""
        # Force the LLM path to fail so the fallback extractor is exercised.
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.side_effect = Exception("LLM unavailable")

            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md",
                url=None,
                source_type="markdown"
            )

            # Every field of the metadata contract must be present.
            required = (
                "title", "summary", "tags", "topics", "detected_date",
                "quality_score", "word_count", "char_count",
                "source_type", "extraction_method",
            )
            for field in required:
                assert field in metadata

            # Types and value ranges.
            assert isinstance(metadata["title"], str)
            assert isinstance(metadata["summary"], str)
            assert isinstance(metadata["tags"], list)
            assert isinstance(metadata["topics"], list)
            assert isinstance(metadata["quality_score"], float)
            assert 0.0 <= metadata["quality_score"] <= 1.0
            assert metadata["word_count"] > 0
            assert metadata["extraction_method"] in ["llm", "fallback"]

    @pytest.mark.asyncio
    async def test_extract_metadata_with_llm(self, extractor, sample_content):
        """A working LLM marks the extraction method as 'llm'."""
        canned = json.dumps({
            "summary": "Comprehensive API documentation guide.",
            "tags": ["api", "documentation", "rest"],
            "topics": ["API", "REST", "Documentation"],
            "domain": "API"
        })

        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = canned

            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md"
            )

            assert metadata["extraction_method"] == "llm"
            assert len(metadata["summary"]) > 0
            assert len(metadata["tags"]) > 0
            assert len(metadata["topics"]) > 0


class TestDatabaseMetadataStorage:
    """Test database storage of metadata."""

    @pytest.fixture
    def sample_metadata(self):
        """Sample metadata for testing."""
        return {
            "title": "Test Document",
            "summary": "This is a test document for metadata extraction.",
            "tags": ["test", "documentation"],
            "topics": ["Testing", "Metadata"],
            "detected_date": "2024-01-15",
            "quality_score": 0.85,
            "word_count": 100,
            "char_count": 500,
            "source_type": "txt",
            "extraction_method": "llm"
        }

    def test_insert_with_metadata(self, sample_metadata):
        """Test inserting a document chunk with metadata.

        Requires a live database; skipped when no connection can be made.
        Row cleanup and connection close run in a ``finally`` block so a
        failing assertion cannot leave test rows behind or leak the
        connection (the original cleanup was unreachable on failure).
        """
        # Probe for database availability; skip (don't fail) when absent.
        try:
            conn = get_connection()
            conn.close()
        except Exception:
            pytest.skip("Database not available for testing")

        tenant_id = "test_tenant_metadata"
        text = "This is a test chunk with metadata."

        # Simple fixed embedding (384 dimensions) — content is irrelevant here.
        embedding = [0.1] * 384

        # Insert with metadata
        insert_document_chunks(
            tenant_id=tenant_id,
            text=text,
            embedding=embedding,
            metadata=sample_metadata,
            doc_id="test_doc_123"
        )

        # Verify insertion by querying the row back.
        conn = get_connection()
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT metadata, doc_id 
                FROM documents 
                WHERE tenant_id = %s 
                AND chunk_text = %s
                LIMIT 1;
            """, (tenant_id, text))

            result = cur.fetchone()
            assert result is not None

            stored_metadata, stored_doc_id = result

            # Verify metadata was stored correctly.
            assert stored_metadata is not None
            assert stored_metadata["title"] == sample_metadata["title"]
            assert stored_metadata["summary"] == sample_metadata["summary"]
            assert stored_metadata["quality_score"] == sample_metadata["quality_score"]

            # Verify doc_id was stored.
            assert stored_doc_id == "test_doc_123"
        finally:
            # Always remove test rows and release resources, even when an
            # assertion above failed.
            cur.execute("DELETE FROM documents WHERE tenant_id = %s", (tenant_id,))
            conn.commit()
            cur.close()
            conn.close()


class TestIngestionIntegration:
    """Test metadata extraction integration with the ingestion pipeline."""

    @pytest.mark.asyncio
    async def test_metadata_extraction_in_ingestion(self):
        """Test that metadata is extracted during document ingestion.

        Both the RAG client and the MetadataExtractor are mocked, so this
        exercises only the ingestion glue code, not the LLM or the vector
        store. (The redundant local ``unittest.mock`` import was removed;
        Mock/AsyncMock/patch/MagicMock come from the module-level import.)
        """
        # Imported lazily so this module can still be collected if the
        # ingestion service's own dependencies are unavailable.
        from backend.api.services.document_ingestion import prepare_ingestion_payload, process_ingestion
        from backend.api.mcp_clients.rag_client import RAGClient

        # Mock RAG client that reports a successful three-chunk ingest.
        mock_rag_client = Mock(spec=RAGClient)
        mock_rag_client.ingest_with_metadata = AsyncMock(return_value={
            "chunks_stored": 3,
            "status": "ok"
        })

        # Prepare payload
        payload = await prepare_ingestion_payload(
            tenant_id="test_tenant",
            content="This is a test document about API documentation. Published on 2024-01-15.",
            source_type="txt",
            filename="api_docs.txt"
        )

        # Process with metadata extraction — patch the import path used in the function.
        with patch('backend.api.services.metadata_extractor.MetadataExtractor') as mock_extractor_class:
            mock_extractor = MagicMock()
            mock_extractor.extract_metadata = AsyncMock(return_value={
                "title": "API Documentation",
                "summary": "Test document about APIs",
                "tags": ["api", "documentation"],
                "topics": ["API"],
                "detected_date": "2024-01-15",
                "quality_score": 0.8,
                "word_count": 10,
                "char_count": 50,
                "source_type": "txt",
                "extraction_method": "llm"
            })
            mock_extractor_class.return_value = mock_extractor

            result = await process_ingestion(payload, mock_rag_client, extract_metadata=True)

            # Verify the extracted metadata was attached to the result.
            assert "extracted_metadata" in result
            assert result["extracted_metadata"]["title"] == "API Documentation"
            assert result["extracted_metadata"]["quality_score"] == 0.8

            # Verify the RAG client was invoked exactly once with metadata
            # (either as a kwarg or merged into the payload).
            mock_rag_client.ingest_with_metadata.assert_called_once()
            call_args = mock_rag_client.ingest_with_metadata.call_args
            assert call_args is not None


class TestMetadataEdgeCases:
    """Edge-case and error-handling tests for metadata extraction."""

    @pytest.mark.asyncio
    async def test_empty_content(self):
        """An empty document still yields the full metadata structure."""
        meta = await MetadataExtractor().extract_metadata(
            content="",
            filename="empty.txt"
        )
        # Structure must be intact even with nothing to extract.
        assert "title" in meta
        assert "summary" in meta
        assert meta["word_count"] == 0

    @pytest.mark.asyncio
    async def test_very_long_content(self):
        """A 10,000-word document is processed and counted correctly."""
        body = "Word " * 10000  # 10,000 words
        meta = await MetadataExtractor().extract_metadata(
            content=body,
            filename="long_doc.txt"
        )
        assert meta["word_count"] == 10000
        assert len(meta["summary"]) > 0
        assert meta["quality_score"] >= 0.0

    @pytest.mark.asyncio
    async def test_special_characters(self):
        """Unicode accents and emoji do not break title extraction."""
        meta = await MetadataExtractor().extract_metadata(
            content="Document with émojis 🚀 and spéciál chàracters!",
            filename="special.txt"
        )
        assert "title" in meta
        assert len(meta["title"]) > 0

    def test_quality_score_edge_cases(self):
        """Quality score stays within [0, 1] for degenerate inputs."""
        scorer = MetadataExtractor()

        cases = (
            ("Hi", 1, ""),                        # very short content
            ("Word " * 20000, 20000, "Summary"),  # very long content
            ("Content " * 100, 100, ""),          # no summary at all
        )
        for body, word_count, summary in cases:
            score = scorer._calculate_quality_score(body, word_count, summary)
            assert 0.0 <= score <= 1.0


if __name__ == "__main__":
    # Allow running this module directly: verbose output, short tracebacks.
    pytest.main([__file__, "-v", "--tb=short"])