# IntegraChat — backend/tests/test_metadata_extraction.py
# Commit d1e5882: feat: Add AI metadata extraction, latency prediction,
# context-aware routing, and tool output schemas
"""
Comprehensive tests for AI-Generated Knowledge Base Metadata Extraction
Tests all metadata extraction features:
- Title extraction (from filename, content, URL)
- Summary generation (LLM and fallback)
- Tags extraction (LLM and fallback)
- Topics extraction (LLM and fallback)
- Date detection
- Quality score calculation
- Database storage
- Integration with ingestion pipeline
"""
import pytest
import asyncio
from unittest.mock import Mock, patch, AsyncMock
from backend.api.services.metadata_extractor import MetadataExtractor
from backend.mcp_server.common.database import insert_document_chunks, get_connection
import json
class TestMetadataExtractor:
    """Unit tests for the MetadataExtractor service."""

    @pytest.fixture
    def extractor(self):
        """Provide a fresh MetadataExtractor for each test."""
        return MetadataExtractor()

    @pytest.fixture
    def sample_content(self):
        """Representative markdown document exercised by most tests."""
        return """
# API Documentation Guide
This comprehensive guide covers REST API endpoints, authentication, and best practices.
Published on 2024-01-15, this document provides detailed information about our API.
## Authentication
All API requests require authentication using API keys or OAuth tokens.
## Endpoints
- GET /api/v1/users - List all users
- POST /api/v1/users - Create a new user
- GET /api/v1/users/{id} - Get user by ID
## Examples
Here are some example requests and responses.
## Troubleshooting
Common issues and their solutions.
"""

    def test_extract_title_from_filename(self, extractor):
        """Filename-derived titles are humanized from the file name."""
        title = extractor._extract_title(
            "Some content here", filename="API_Documentation_Guide.pdf", url=None
        )
        assert title == "Api Documentation Guide"
        assert "API" in title or "Api" in title

    def test_extract_title_from_content(self, extractor, sample_content):
        """Titles can be pulled from a markdown header or first line."""
        title = extractor._extract_title(sample_content, filename=None, url=None)
        # Must be non-empty but bounded in length.
        assert 0 < len(title) < 200

    def test_extract_title_from_url(self, extractor):
        """URL paths are an acceptable title source."""
        page_url = "https://example.com/api/documentation-guide"
        title = extractor._extract_title("Some content", filename=None, url=page_url)
        # Either the URL path or a fallback should yield a non-empty string.
        assert isinstance(title, str)
        assert len(title) > 0

    def test_extract_title_fallback(self, extractor):
        """Without filename/URL/structure, fall back to leading content."""
        content = (
            "This is a very long document that doesn't have a clear title "
            "structure and continues with more text"
        )
        title = extractor._extract_title(content, filename=None, url=None)
        assert isinstance(title, str)
        assert len(title) > 0
        # For long content the fallback must not exceed the source text;
        # for short content, title == content is acceptable.
        if len(content) > 50:
            assert len(title) <= len(content)

    def test_detect_date_formats(self, extractor):
        """Dates are recognized in ISO, US-slash, and month-name forms."""
        assert extractor._detect_date("Published on 2024-01-15") == "2024-01-15"
        assert extractor._detect_date("Created on 01/15/2024") is not None
        assert extractor._detect_date("Last updated January 15, 2024") is not None

    def test_detect_date_none(self, extractor):
        """Text with no date yields None."""
        assert extractor._detect_date("This document has no date information") is None

    def test_generate_basic_summary(self, extractor, sample_content):
        """The heuristic summary is shorter than the source and ends a sentence."""
        summary = extractor._generate_basic_summary(sample_content)
        assert 0 < len(summary) < len(sample_content)
        assert summary.endswith('.')

    def test_extract_basic_tags(self, extractor, sample_content):
        """Heuristic tags form a bounded, non-empty list containing 'api'."""
        tags = extractor._extract_basic_tags(sample_content)
        assert isinstance(tags, list)
        assert 0 < len(tags) <= 8
        lowered = [tag.lower() for tag in tags]
        assert any("api" in tag for tag in lowered)

    def test_extract_basic_topics(self, extractor, sample_content):
        """Heuristic topics come from headers and stay within bounds."""
        topics = extractor._extract_basic_topics(sample_content)
        assert isinstance(topics, list)
        assert 0 < len(topics) <= 5
        assert any("API" in topic or "api" in topic.lower() for topic in topics)

    def test_calculate_quality_score(self, extractor):
        """Structured content scores higher than degenerate content."""
        rich_text = "This is a well-structured document. " * 50
        rich_text += "It has multiple paragraphs. " * 10
        rich_score = extractor._calculate_quality_score(rich_text, 500, "Good summary")
        assert 0.0 <= rich_score <= 1.0
        assert rich_score > 0.5  # well-formed prose should rate as decent

        junk_score = extractor._calculate_quality_score("x" * 100, 10, "")
        assert 0.0 <= junk_score <= 1.0
        assert junk_score < rich_score  # degenerate text must rank lower

    def test_extract_fallback(self, extractor, sample_content):
        """Fallback extraction returns the summary/tags/topics structure."""
        result = extractor._extract_fallback(sample_content, "Test Title")
        for key in ("summary", "tags", "topics"):
            assert key in result
        assert isinstance(result["tags"], list)
        assert isinstance(result["topics"], list)
        assert len(result["summary"]) > 0

    @pytest.mark.asyncio
    async def test_extract_with_llm_success(self, extractor, sample_content):
        """A well-formed LLM JSON response is parsed into metadata fields."""
        canned_response = json.dumps({
            "summary": "This document provides comprehensive API documentation.",
            "tags": ["api", "documentation", "rest", "endpoints"],
            "topics": ["API", "REST", "Endpoints"],
            "domain": "Software Development",
        })
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as fake_llm:
            fake_llm.return_value = canned_response
            result = await extractor._extract_with_llm(sample_content, "API Documentation")

        assert "summary" in result
        assert "tags" in result
        assert "topics" in result
        assert len(result["tags"]) > 0
        assert len(result["topics"]) > 0
        assert "api" in [tag.lower() for tag in result["tags"]]

    @pytest.mark.asyncio
    async def test_extract_with_llm_timeout(self, extractor, sample_content):
        """An LLM timeout surfaces as an exception (raw or wrapped)."""
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as fake_llm:
            fake_llm.side_effect = asyncio.TimeoutError()
            with pytest.raises(Exception) as exc_info:
                await extractor._extract_with_llm(sample_content, "Test")
            raised = exc_info.value
            assert "timeout" in str(raised).lower() or isinstance(raised, asyncio.TimeoutError)

    @pytest.mark.asyncio
    async def test_extract_metadata_full(self, extractor, sample_content):
        """When the LLM fails, the fallback path still yields full metadata."""
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as fake_llm:
            fake_llm.side_effect = Exception("LLM unavailable")
            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md",
                url=None,
                source_type="markdown",
            )

        # Every required field must be present.
        required_fields = (
            "title", "summary", "tags", "topics", "detected_date",
            "quality_score", "word_count", "char_count", "source_type",
            "extraction_method",
        )
        for field in required_fields:
            assert field in metadata

        # Types and value ranges.
        assert isinstance(metadata["title"], str)
        assert isinstance(metadata["summary"], str)
        assert isinstance(metadata["tags"], list)
        assert isinstance(metadata["topics"], list)
        assert isinstance(metadata["quality_score"], float)
        assert 0.0 <= metadata["quality_score"] <= 1.0
        assert metadata["word_count"] > 0
        assert metadata["extraction_method"] in ["llm", "fallback"]

    @pytest.mark.asyncio
    async def test_extract_metadata_with_llm(self, extractor, sample_content):
        """A successful LLM call is reflected in extraction_method."""
        canned_response = json.dumps({
            "summary": "Comprehensive API documentation guide.",
            "tags": ["api", "documentation", "rest"],
            "topics": ["API", "REST", "Documentation"],
            "domain": "API",
        })
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as fake_llm:
            fake_llm.return_value = canned_response
            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md",
            )

        assert metadata["extraction_method"] == "llm"
        assert len(metadata["summary"]) > 0
        assert len(metadata["tags"]) > 0
        assert len(metadata["topics"]) > 0
class TestDatabaseMetadataStorage:
    """Test database storage of metadata."""

    @pytest.fixture
    def sample_metadata(self):
        """Sample metadata for testing."""
        return {
            "title": "Test Document",
            "summary": "This is a test document for metadata extraction.",
            "tags": ["test", "documentation"],
            "topics": ["Testing", "Metadata"],
            "detected_date": "2024-01-15",
            "quality_score": 0.85,
            "word_count": 100,
            "char_count": 500,
            "source_type": "txt",
            "extraction_method": "llm",
        }

    def test_insert_with_metadata(self, sample_metadata):
        """Insert a document chunk with metadata and verify it round-trips.

        Requires a live database; the test is skipped when no connection
        can be established. Cleanup and connection close are performed in
        a ``finally`` block so a failed assertion no longer leaks the
        connection or leaves test rows behind (the original only cleaned
        up on the success path).
        """
        # Probe for database availability; skip rather than error out.
        try:
            conn = get_connection()
            conn.close()
        except Exception:
            pytest.skip("Database not available for testing")

        tenant_id = "test_tenant_metadata"
        text = "This is a test chunk with metadata."
        # Simple dummy embedding (384 dimensions, matching the schema).
        embedding = [0.1] * 384

        # Insert with metadata.
        insert_document_chunks(
            tenant_id=tenant_id,
            text=text,
            embedding=embedding,
            metadata=sample_metadata,
            doc_id="test_doc_123",
        )

        # Verify insertion by querying it back.
        conn = get_connection()
        cur = conn.cursor()
        try:
            cur.execute("""
SELECT metadata, doc_id
FROM documents
WHERE tenant_id = %s
AND chunk_text = %s
LIMIT 1;
""", (tenant_id, text))
            row = cur.fetchone()
            assert row is not None
            stored_metadata, stored_doc_id = row

            # Verify metadata was stored correctly.
            assert stored_metadata is not None
            assert stored_metadata["title"] == sample_metadata["title"]
            assert stored_metadata["summary"] == sample_metadata["summary"]
            assert stored_metadata["quality_score"] == sample_metadata["quality_score"]

            # Verify doc_id was stored.
            assert stored_doc_id == "test_doc_123"
        finally:
            # Always remove the test rows and release resources, even when
            # one of the assertions above fails.
            cur.execute("DELETE FROM documents WHERE tenant_id = %s", (tenant_id,))
            conn.commit()
            cur.close()
            conn.close()
class TestIngestionIntegration:
    """Test metadata extraction integration with ingestion pipeline."""

    @pytest.mark.asyncio
    async def test_metadata_extraction_in_ingestion(self):
        """Metadata should be extracted and forwarded during ingestion."""
        from backend.api.services.document_ingestion import prepare_ingestion_payload, process_ingestion
        from backend.api.mcp_clients.rag_client import RAGClient
        from unittest.mock import AsyncMock, patch, MagicMock

        # Stubbed RAG client that records the ingest call.
        rag_stub = Mock(spec=RAGClient)
        rag_stub.ingest_with_metadata = AsyncMock(return_value={
            "chunks_stored": 3,
            "status": "ok",
        })

        # Build the ingestion payload.
        payload = await prepare_ingestion_payload(
            tenant_id="test_tenant",
            content="This is a test document about API documentation. Published on 2024-01-15.",
            source_type="txt",
            filename="api_docs.txt",
        )

        # Canned metadata the fake extractor will return.
        fake_metadata = {
            "title": "API Documentation",
            "summary": "Test document about APIs",
            "tags": ["api", "documentation"],
            "topics": ["API"],
            "detected_date": "2024-01-15",
            "quality_score": 0.8,
            "word_count": 10,
            "char_count": 50,
            "source_type": "txt",
            "extraction_method": "llm",
        }

        # Patch the extractor at the import path the pipeline resolves.
        with patch('backend.api.services.metadata_extractor.MetadataExtractor') as extractor_cls:
            extractor_stub = MagicMock()
            extractor_stub.extract_metadata = AsyncMock(return_value=fake_metadata)
            extractor_cls.return_value = extractor_stub
            result = await process_ingestion(payload, rag_stub, extract_metadata=True)

        # The extracted metadata must surface in the result.
        assert "extracted_metadata" in result
        assert result["extracted_metadata"]["title"] == "API Documentation"
        assert result["extracted_metadata"]["quality_score"] == 0.8

        # The RAG client must have been invoked exactly once with metadata.
        rag_stub.ingest_with_metadata.assert_called_once()
        assert rag_stub.ingest_with_metadata.call_args is not None
class TestMetadataEdgeCases:
    """Test edge cases and error handling."""

    @pytest.mark.asyncio
    async def test_empty_content(self):
        """Empty input still yields the full metadata structure."""
        metadata = await MetadataExtractor().extract_metadata(
            content="",
            filename="empty.txt",
        )
        assert "title" in metadata
        assert "summary" in metadata
        assert metadata["word_count"] == 0

    @pytest.mark.asyncio
    async def test_very_long_content(self):
        """A 10,000-word document is processed without errors."""
        bulky = "Word " * 10000  # 10,000 words
        metadata = await MetadataExtractor().extract_metadata(
            content=bulky,
            filename="long_doc.txt",
        )
        assert metadata["word_count"] == 10000
        assert len(metadata["summary"]) > 0
        assert metadata["quality_score"] >= 0.0

    @pytest.mark.asyncio
    async def test_special_characters(self):
        """Unicode and emoji content does not break title extraction."""
        metadata = await MetadataExtractor().extract_metadata(
            content="Document with émojis 🚀 and spéciál chàracters!",
            filename="special.txt",
        )
        assert "title" in metadata
        assert len(metadata["title"]) > 0

    def test_quality_score_edge_cases(self):
        """Quality scores stay within [0, 1] for degenerate inputs."""
        extractor = MetadataExtractor()
        # (content, word_count, summary) triples covering the extremes.
        cases = [
            ("Hi", 1, ""),                         # very short content
            ("Word " * 20000, 20000, "Summary"),   # very long content
            ("Content " * 100, 100, ""),           # no summary
        ]
        for text, word_count, summary in cases:
            score = extractor._calculate_quality_score(text, word_count, summary)
            assert 0.0 <= score <= 1.0
# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    pytest.main([__file__, "-v", "--tb=short"])