# IntegraChat — backend/tests/test_metadata_extraction.py
# Commit d1e5882: feat: Add AI metadata extraction, latency prediction,
# context-aware routing, and tool output schemas
"""
Comprehensive tests for AI-Generated Knowledge Base Metadata Extraction
Tests all metadata extraction features:
- Title extraction (from filename, content, URL)
- Summary generation (LLM and fallback)
- Tags extraction (LLM and fallback)
- Topics extraction (LLM and fallback)
- Date detection
- Quality score calculation
- Database storage
- Integration with ingestion pipeline
"""
import pytest
import asyncio
from unittest.mock import Mock, patch, AsyncMock
from backend.api.services.metadata_extractor import MetadataExtractor
from backend.mcp_server.common.database import insert_document_chunks, get_connection
import json
class TestMetadataExtractor:
    """Unit tests for the MetadataExtractor service."""

    @pytest.fixture
    def extractor(self):
        """Provide a fresh MetadataExtractor for each test."""
        return MetadataExtractor()

    @pytest.fixture
    def sample_content(self):
        """Representative markdown document exercised by most tests."""
        return """
# API Documentation Guide
This comprehensive guide covers REST API endpoints, authentication, and best practices.
Published on 2024-01-15, this document provides detailed information about our API.
## Authentication
All API requests require authentication using API keys or OAuth tokens.
## Endpoints
- GET /api/v1/users - List all users
- POST /api/v1/users - Create a new user
- GET /api/v1/users/{id} - Get user by ID
## Examples
Here are some example requests and responses.
## Troubleshooting
Common issues and their solutions.
"""

    def test_extract_title_from_filename(self, extractor):
        """Filename-derived titles are humanized from the file name."""
        title = extractor._extract_title(
            "Some content here", filename="API_Documentation_Guide.pdf", url=None
        )
        assert title == "Api Documentation Guide"
        assert "API" in title or "Api" in title

    def test_extract_title_from_content(self, extractor, sample_content):
        """Titles can be pulled from a markdown header or first line."""
        title = extractor._extract_title(sample_content, filename=None, url=None)
        # Must be non-empty but bounded in length.
        assert 0 < len(title) < 200

    def test_extract_title_from_url(self, extractor):
        """URL paths are an acceptable title source."""
        page_url = "https://example.com/api/documentation-guide"
        title = extractor._extract_title("Some content", filename=None, url=page_url)
        # Either the URL path or a fallback should yield a non-empty string.
        assert isinstance(title, str)
        assert len(title) > 0

    def test_extract_title_fallback(self, extractor):
        """Without filename/URL/structure, fall back to leading content."""
        content = (
            "This is a very long document that doesn't have a clear title "
            "structure and continues with more text"
        )
        title = extractor._extract_title(content, filename=None, url=None)
        assert isinstance(title, str)
        assert len(title) > 0
        # For long content the fallback must not exceed the source text;
        # for short content, title == content is acceptable.
        if len(content) > 50:
            assert len(title) <= len(content)

    def test_detect_date_formats(self, extractor):
        """Dates are recognized in ISO, US-slash, and month-name forms."""
        assert extractor._detect_date("Published on 2024-01-15") == "2024-01-15"
        assert extractor._detect_date("Created on 01/15/2024") is not None
        assert extractor._detect_date("Last updated January 15, 2024") is not None

    def test_detect_date_none(self, extractor):
        """Text with no date yields None."""
        assert extractor._detect_date("This document has no date information") is None

    def test_generate_basic_summary(self, extractor, sample_content):
        """The heuristic summary is shorter than the source and ends a sentence."""
        summary = extractor._generate_basic_summary(sample_content)
        assert 0 < len(summary) < len(sample_content)
        assert summary.endswith('.')

    def test_extract_basic_tags(self, extractor, sample_content):
        """Heuristic tags form a bounded, non-empty list containing 'api'."""
        tags = extractor._extract_basic_tags(sample_content)
        assert isinstance(tags, list)
        assert 0 < len(tags) <= 8
        lowered = [tag.lower() for tag in tags]
        assert any("api" in tag for tag in lowered)

    def test_extract_basic_topics(self, extractor, sample_content):
        """Heuristic topics come from headers and stay within bounds."""
        topics = extractor._extract_basic_topics(sample_content)
        assert isinstance(topics, list)
        assert 0 < len(topics) <= 5
        assert any("API" in topic or "api" in topic.lower() for topic in topics)

    def test_calculate_quality_score(self, extractor):
        """Structured content scores higher than degenerate content."""
        rich_text = "This is a well-structured document. " * 50
        rich_text += "It has multiple paragraphs. " * 10
        rich_score = extractor._calculate_quality_score(rich_text, 500, "Good summary")
        assert 0.0 <= rich_score <= 1.0
        assert rich_score > 0.5  # well-formed prose should rate as decent

        junk_score = extractor._calculate_quality_score("x" * 100, 10, "")
        assert 0.0 <= junk_score <= 1.0
        assert junk_score < rich_score  # degenerate text must rank lower

    def test_extract_fallback(self, extractor, sample_content):
        """Fallback extraction returns the summary/tags/topics structure."""
        result = extractor._extract_fallback(sample_content, "Test Title")
        for key in ("summary", "tags", "topics"):
            assert key in result
        assert isinstance(result["tags"], list)
        assert isinstance(result["topics"], list)
        assert len(result["summary"]) > 0

    @pytest.mark.asyncio
    async def test_extract_with_llm_success(self, extractor, sample_content):
        """A well-formed LLM JSON response is parsed into metadata fields."""
        canned_response = json.dumps({
            "summary": "This document provides comprehensive API documentation.",
            "tags": ["api", "documentation", "rest", "endpoints"],
            "topics": ["API", "REST", "Endpoints"],
            "domain": "Software Development",
        })
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as fake_llm:
            fake_llm.return_value = canned_response
            result = await extractor._extract_with_llm(sample_content, "API Documentation")

        assert "summary" in result
        assert "tags" in result
        assert "topics" in result
        assert len(result["tags"]) > 0
        assert len(result["topics"]) > 0
        assert "api" in [tag.lower() for tag in result["tags"]]

    @pytest.mark.asyncio
    async def test_extract_with_llm_timeout(self, extractor, sample_content):
        """An LLM timeout surfaces as an exception (raw or wrapped)."""
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as fake_llm:
            fake_llm.side_effect = asyncio.TimeoutError()
            with pytest.raises(Exception) as exc_info:
                await extractor._extract_with_llm(sample_content, "Test")
            raised = exc_info.value
            assert "timeout" in str(raised).lower() or isinstance(raised, asyncio.TimeoutError)

    @pytest.mark.asyncio
    async def test_extract_metadata_full(self, extractor, sample_content):
        """When the LLM fails, the fallback path still yields full metadata."""
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as fake_llm:
            fake_llm.side_effect = Exception("LLM unavailable")
            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md",
                url=None,
                source_type="markdown",
            )

        # Every required field must be present.
        required_fields = (
            "title", "summary", "tags", "topics", "detected_date",
            "quality_score", "word_count", "char_count", "source_type",
            "extraction_method",
        )
        for field in required_fields:
            assert field in metadata

        # Types and value ranges.
        assert isinstance(metadata["title"], str)
        assert isinstance(metadata["summary"], str)
        assert isinstance(metadata["tags"], list)
        assert isinstance(metadata["topics"], list)
        assert isinstance(metadata["quality_score"], float)
        assert 0.0 <= metadata["quality_score"] <= 1.0
        assert metadata["word_count"] > 0
        assert metadata["extraction_method"] in ["llm", "fallback"]

    @pytest.mark.asyncio
    async def test_extract_metadata_with_llm(self, extractor, sample_content):
        """A successful LLM call is reflected in extraction_method."""
        canned_response = json.dumps({
            "summary": "Comprehensive API documentation guide.",
            "tags": ["api", "documentation", "rest"],
            "topics": ["API", "REST", "Documentation"],
            "domain": "API",
        })
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as fake_llm:
            fake_llm.return_value = canned_response
            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md",
            )

        assert metadata["extraction_method"] == "llm"
        assert len(metadata["summary"]) > 0
        assert len(metadata["tags"]) > 0
        assert len(metadata["topics"]) > 0
class TestDatabaseMetadataStorage:
    """Test database storage of metadata."""

    @pytest.fixture
    def sample_metadata(self):
        """Sample metadata for testing."""
        return {
            "title": "Test Document",
            "summary": "This is a test document for metadata extraction.",
            "tags": ["test", "documentation"],
            "topics": ["Testing", "Metadata"],
            "detected_date": "2024-01-15",
            "quality_score": 0.85,
            "word_count": 100,
            "char_count": 500,
            "source_type": "txt",
            "extraction_method": "llm",
        }

    def test_insert_with_metadata(self, sample_metadata):
        """Insert a document chunk with metadata and verify it round-trips.

        Requires a live database; the test is skipped when no connection
        can be established. Cleanup and connection close are performed in
        a ``finally`` block so a failed assertion no longer leaks the
        connection or leaves test rows behind (the original only cleaned
        up on the success path).
        """
        # Probe for database availability; skip rather than error out.
        try:
            conn = get_connection()
            conn.close()
        except Exception:
            pytest.skip("Database not available for testing")

        tenant_id = "test_tenant_metadata"
        text = "This is a test chunk with metadata."
        # Simple dummy embedding (384 dimensions, matching the schema).
        embedding = [0.1] * 384

        # Insert with metadata.
        insert_document_chunks(
            tenant_id=tenant_id,
            text=text,
            embedding=embedding,
            metadata=sample_metadata,
            doc_id="test_doc_123",
        )

        # Verify insertion by querying it back.
        conn = get_connection()
        cur = conn.cursor()
        try:
            cur.execute("""
SELECT metadata, doc_id
FROM documents
WHERE tenant_id = %s
AND chunk_text = %s
LIMIT 1;
""", (tenant_id, text))
            row = cur.fetchone()
            assert row is not None
            stored_metadata, stored_doc_id = row

            # Verify metadata was stored correctly.
            assert stored_metadata is not None
            assert stored_metadata["title"] == sample_metadata["title"]
            assert stored_metadata["summary"] == sample_metadata["summary"]
            assert stored_metadata["quality_score"] == sample_metadata["quality_score"]

            # Verify doc_id was stored.
            assert stored_doc_id == "test_doc_123"
        finally:
            # Always remove the test rows and release resources, even when
            # one of the assertions above fails.
            cur.execute("DELETE FROM documents WHERE tenant_id = %s", (tenant_id,))
            conn.commit()
            cur.close()
            conn.close()
class TestIngestionIntegration:
    """Test metadata extraction integration with ingestion pipeline."""

    @pytest.mark.asyncio
    async def test_metadata_extraction_in_ingestion(self):
        """Metadata should be extracted and forwarded during ingestion."""
        from backend.api.services.document_ingestion import prepare_ingestion_payload, process_ingestion
        from backend.api.mcp_clients.rag_client import RAGClient
        from unittest.mock import AsyncMock, patch, MagicMock

        # Stubbed RAG client that records the ingest call.
        rag_stub = Mock(spec=RAGClient)
        rag_stub.ingest_with_metadata = AsyncMock(return_value={
            "chunks_stored": 3,
            "status": "ok",
        })

        # Build the ingestion payload.
        payload = await prepare_ingestion_payload(
            tenant_id="test_tenant",
            content="This is a test document about API documentation. Published on 2024-01-15.",
            source_type="txt",
            filename="api_docs.txt",
        )

        # Canned metadata the fake extractor will return.
        fake_metadata = {
            "title": "API Documentation",
            "summary": "Test document about APIs",
            "tags": ["api", "documentation"],
            "topics": ["API"],
            "detected_date": "2024-01-15",
            "quality_score": 0.8,
            "word_count": 10,
            "char_count": 50,
            "source_type": "txt",
            "extraction_method": "llm",
        }

        # Patch the extractor at the import path the pipeline resolves.
        with patch('backend.api.services.metadata_extractor.MetadataExtractor') as extractor_cls:
            extractor_stub = MagicMock()
            extractor_stub.extract_metadata = AsyncMock(return_value=fake_metadata)
            extractor_cls.return_value = extractor_stub
            result = await process_ingestion(payload, rag_stub, extract_metadata=True)

        # The extracted metadata must surface in the result.
        assert "extracted_metadata" in result
        assert result["extracted_metadata"]["title"] == "API Documentation"
        assert result["extracted_metadata"]["quality_score"] == 0.8

        # The RAG client must have been invoked exactly once with metadata.
        rag_stub.ingest_with_metadata.assert_called_once()
        assert rag_stub.ingest_with_metadata.call_args is not None
class TestMetadataEdgeCases:
    """Test edge cases and error handling."""

    @pytest.mark.asyncio
    async def test_empty_content(self):
        """Empty input still yields the full metadata structure."""
        metadata = await MetadataExtractor().extract_metadata(
            content="",
            filename="empty.txt",
        )
        assert "title" in metadata
        assert "summary" in metadata
        assert metadata["word_count"] == 0

    @pytest.mark.asyncio
    async def test_very_long_content(self):
        """A 10,000-word document is processed without errors."""
        bulky = "Word " * 10000  # 10,000 words
        metadata = await MetadataExtractor().extract_metadata(
            content=bulky,
            filename="long_doc.txt",
        )
        assert metadata["word_count"] == 10000
        assert len(metadata["summary"]) > 0
        assert metadata["quality_score"] >= 0.0

    @pytest.mark.asyncio
    async def test_special_characters(self):
        """Unicode and emoji content does not break title extraction."""
        metadata = await MetadataExtractor().extract_metadata(
            content="Document with émojis 🚀 and spéciál chàracters!",
            filename="special.txt",
        )
        assert "title" in metadata
        assert len(metadata["title"]) > 0

    def test_quality_score_edge_cases(self):
        """Quality scores stay within [0, 1] for degenerate inputs."""
        extractor = MetadataExtractor()
        # (content, word_count, summary) triples covering the extremes.
        cases = [
            ("Hi", 1, ""),                         # very short content
            ("Word " * 20000, 20000, "Summary"),   # very long content
            ("Content " * 100, 100, ""),           # no summary
        ]
        for text, word_count, summary in cases:
            score = extractor._calculate_quality_score(text, word_count, summary)
            assert 0.0 <= score <= 1.0
# Allow running this test module directly (outside a pytest invocation).
if __name__ == "__main__":
    pytest.main([__file__, "-v", "--tb=short"])