File size: 18,087 Bytes
d1e5882
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
"""
Comprehensive tests for AI-Generated Knowledge Base Metadata Extraction

Tests all metadata extraction features:
- Title extraction (from filename, content, URL)
- Summary generation (LLM and fallback)
- Tags extraction (LLM and fallback)
- Topics extraction (LLM and fallback)
- Date detection
- Quality score calculation
- Database storage
- Integration with ingestion pipeline
"""

import asyncio
import json
from unittest.mock import AsyncMock, MagicMock, Mock, patch

import pytest

from backend.api.services.metadata_extractor import MetadataExtractor
from backend.mcp_server.common.database import insert_document_chunks, get_connection


class TestMetadataExtractor:
    """Unit tests for the MetadataExtractor service helpers."""

    @pytest.fixture
    def extractor(self):
        """Provide a fresh MetadataExtractor for each test."""
        return MetadataExtractor()

    @pytest.fixture
    def sample_content(self):
        """Representative markdown document used across the tests."""
        return """
        # API Documentation Guide
        
        This comprehensive guide covers REST API endpoints, authentication, and best practices.
        Published on 2024-01-15, this document provides detailed information about our API.
        
        ## Authentication
        All API requests require authentication using API keys or OAuth tokens.
        
        ## Endpoints
        - GET /api/v1/users - List all users
        - POST /api/v1/users - Create a new user
        - GET /api/v1/users/{id} - Get user by ID
        
        ## Examples
        Here are some example requests and responses.
        
        ## Troubleshooting
        Common issues and their solutions.
        """

    def test_extract_title_from_filename(self, extractor):
        """A filename is cleaned up into a human-readable title."""
        doc_text = "Some content here"
        result = extractor._extract_title(
            doc_text, filename="API_Documentation_Guide.pdf", url=None
        )
        assert result == "Api Documentation Guide"
        assert "API" in result or "Api" in result

    def test_extract_title_from_content(self, extractor, sample_content):
        """Content-based extraction yields a bounded, non-empty title."""
        result = extractor._extract_title(sample_content, filename=None, url=None)
        # Expect the markdown header or first meaningful line, within limits.
        assert 0 < len(result) < 200

    def test_extract_title_from_url(self, extractor):
        """URL-based extraction returns some non-empty string."""
        result = extractor._extract_title(
            "Some content",
            filename=None,
            url="https://example.com/api/documentation-guide",
        )
        # May come from the URL path or a fallback, but must be a real string.
        assert isinstance(result, str)
        assert len(result) > 0

    def test_extract_title_fallback(self, extractor):
        """Without structure, the extractor falls back to a truncated title."""
        text = "This is a very long document that doesn't have a clear title structure and continues with more text"
        result = extractor._extract_title(text, filename=None, url=None)
        assert isinstance(result, str)
        assert len(result) > 0
        # For long inputs the fallback must not exceed the source length.
        # (For short content, title == content is acceptable.)
        if len(text) > 50:
            assert len(result) <= len(text)

    def test_detect_date_formats(self, extractor):
        """Dates are recognized in ISO, US-slash, and month-name styles."""
        # ISO YYYY-MM-DD is returned verbatim.
        assert extractor._detect_date("Published on 2024-01-15") == "2024-01-15"
        # MM/DD/YYYY is recognized.
        assert extractor._detect_date("Created on 01/15/2024") is not None
        # Spelled-out month names are recognized.
        assert extractor._detect_date("Last updated January 15, 2024") is not None

    def test_detect_date_none(self, extractor):
        """Absence of any date yields None."""
        assert extractor._detect_date("This document has no date information") is None

    def test_generate_basic_summary(self, extractor, sample_content):
        """The fallback summary is non-empty, shorter than the source, and sentence-terminated."""
        summary = extractor._generate_basic_summary(sample_content)
        assert len(summary) > 0
        assert len(summary) < len(sample_content)
        assert summary.endswith('.')

    def test_extract_basic_tags(self, extractor, sample_content):
        """Non-LLM tag extraction returns a bounded list that includes 'api'."""
        tags = extractor._extract_basic_tags(sample_content)
        assert isinstance(tags, list)
        assert 0 < len(tags) <= 8
        lowered = [t.lower() for t in tags]
        assert any("api" in t for t in lowered)

    def test_extract_basic_topics(self, extractor, sample_content):
        """Non-LLM topic extraction pulls topics from the section headers."""
        topics = extractor._extract_basic_topics(sample_content)
        assert isinstance(topics, list)
        assert 0 < len(topics) <= 5
        assert any("API" in topic or "api" in topic.lower() for topic in topics)

    def test_calculate_quality_score(self, extractor):
        """Richer content scores higher than degenerate content."""
        rich = "This is a well-structured document. " * 50
        rich += "It has multiple paragraphs. " * 10
        rich_score = extractor._calculate_quality_score(rich, 500, "Good summary")
        assert 0.0 <= rich_score <= 1.0
        assert rich_score > 0.5  # decent quality expected

        degenerate_score = extractor._calculate_quality_score("x" * 100, 10, "")
        assert 0.0 <= degenerate_score <= 1.0
        assert degenerate_score < rich_score  # must rank below the rich content

    def test_extract_fallback(self, extractor, sample_content):
        """Fallback extraction returns summary/tags/topics with sane types."""
        result = extractor._extract_fallback(sample_content, "Test Title")
        for key in ("summary", "tags", "topics"):
            assert key in result
        assert isinstance(result["tags"], list)
        assert isinstance(result["topics"], list)
        assert len(result["summary"]) > 0

    @pytest.mark.asyncio
    async def test_extract_with_llm_success(self, extractor, sample_content):
        """A well-formed LLM response is parsed into summary/tags/topics."""
        canned = json.dumps({
            "summary": "This document provides comprehensive API documentation.",
            "tags": ["api", "documentation", "rest", "endpoints"],
            "topics": ["API", "REST", "Endpoints"],
            "domain": "Software Development"
        })

        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = canned
            result = await extractor._extract_with_llm(sample_content, "API Documentation")

        for key in ("summary", "tags", "topics"):
            assert key in result
        assert len(result["tags"]) > 0
        assert len(result["topics"]) > 0
        assert "api" in [tag.lower() for tag in result["tags"]]

    @pytest.mark.asyncio
    async def test_extract_with_llm_timeout(self, extractor, sample_content):
        """An LLM timeout surfaces as an exception rather than being swallowed."""
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.side_effect = asyncio.TimeoutError()

            with pytest.raises(Exception) as exc_info:
                await extractor._extract_with_llm(sample_content, "Test")
            raised = exc_info.value
            assert "timeout" in str(raised).lower() or isinstance(raised, asyncio.TimeoutError)

    @pytest.mark.asyncio
    async def test_extract_metadata_full(self, extractor, sample_content):
        """When the LLM fails, the full pipeline still returns complete metadata."""
        # Force the LLM path to fail so the fallback extractor is exercised.
        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.side_effect = Exception("LLM unavailable")

            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md",
                url=None,
                source_type="markdown"
            )

            # Every field of the metadata contract must be present.
            required = (
                "title", "summary", "tags", "topics", "detected_date",
                "quality_score", "word_count", "char_count",
                "source_type", "extraction_method",
            )
            for field in required:
                assert field in metadata

            # Types and value ranges.
            assert isinstance(metadata["title"], str)
            assert isinstance(metadata["summary"], str)
            assert isinstance(metadata["tags"], list)
            assert isinstance(metadata["topics"], list)
            assert isinstance(metadata["quality_score"], float)
            assert 0.0 <= metadata["quality_score"] <= 1.0
            assert metadata["word_count"] > 0
            assert metadata["extraction_method"] in ["llm", "fallback"]

    @pytest.mark.asyncio
    async def test_extract_metadata_with_llm(self, extractor, sample_content):
        """A working LLM marks the extraction method as 'llm'."""
        canned = json.dumps({
            "summary": "Comprehensive API documentation guide.",
            "tags": ["api", "documentation", "rest"],
            "topics": ["API", "REST", "Documentation"],
            "domain": "API"
        })

        with patch.object(extractor.llm, 'simple_call', new_callable=AsyncMock) as mock_llm:
            mock_llm.return_value = canned

            metadata = await extractor.extract_metadata(
                content=sample_content,
                filename="api_docs.md"
            )

            assert metadata["extraction_method"] == "llm"
            assert len(metadata["summary"]) > 0
            assert len(metadata["tags"]) > 0
            assert len(metadata["topics"]) > 0


class TestDatabaseMetadataStorage:
    """Test database storage of metadata."""

    @pytest.fixture
    def sample_metadata(self):
        """Sample metadata for testing."""
        return {
            "title": "Test Document",
            "summary": "This is a test document for metadata extraction.",
            "tags": ["test", "documentation"],
            "topics": ["Testing", "Metadata"],
            "detected_date": "2024-01-15",
            "quality_score": 0.85,
            "word_count": 100,
            "char_count": 500,
            "source_type": "txt",
            "extraction_method": "llm"
        }

    def test_insert_with_metadata(self, sample_metadata):
        """Test inserting a document chunk with metadata.

        Requires a live database; skipped when no connection can be made.
        Row cleanup and connection close run in a ``finally`` block so a
        failing assertion cannot leave test rows behind or leak the
        connection (the original cleanup was unreachable on failure).
        """
        # Probe for database availability; skip (don't fail) when absent.
        try:
            conn = get_connection()
            conn.close()
        except Exception:
            pytest.skip("Database not available for testing")

        tenant_id = "test_tenant_metadata"
        text = "This is a test chunk with metadata."

        # Simple fixed embedding (384 dimensions) — content is irrelevant here.
        embedding = [0.1] * 384

        # Insert with metadata
        insert_document_chunks(
            tenant_id=tenant_id,
            text=text,
            embedding=embedding,
            metadata=sample_metadata,
            doc_id="test_doc_123"
        )

        # Verify insertion by querying the row back.
        conn = get_connection()
        cur = conn.cursor()
        try:
            cur.execute("""
                SELECT metadata, doc_id 
                FROM documents 
                WHERE tenant_id = %s 
                AND chunk_text = %s
                LIMIT 1;
            """, (tenant_id, text))

            result = cur.fetchone()
            assert result is not None

            stored_metadata, stored_doc_id = result

            # Verify metadata was stored correctly.
            assert stored_metadata is not None
            assert stored_metadata["title"] == sample_metadata["title"]
            assert stored_metadata["summary"] == sample_metadata["summary"]
            assert stored_metadata["quality_score"] == sample_metadata["quality_score"]

            # Verify doc_id was stored.
            assert stored_doc_id == "test_doc_123"
        finally:
            # Always remove test rows and release resources, even when an
            # assertion above failed.
            cur.execute("DELETE FROM documents WHERE tenant_id = %s", (tenant_id,))
            conn.commit()
            cur.close()
            conn.close()


class TestIngestionIntegration:
    """Test metadata extraction integration with the ingestion pipeline."""

    @pytest.mark.asyncio
    async def test_metadata_extraction_in_ingestion(self):
        """Test that metadata is extracted during document ingestion.

        Both the RAG client and the MetadataExtractor are mocked, so this
        exercises only the ingestion glue code, not the LLM or the vector
        store. (The redundant local ``unittest.mock`` import was removed;
        Mock/AsyncMock/patch/MagicMock come from the module-level import.)
        """
        # Imported lazily so this module can still be collected if the
        # ingestion service's own dependencies are unavailable.
        from backend.api.services.document_ingestion import prepare_ingestion_payload, process_ingestion
        from backend.api.mcp_clients.rag_client import RAGClient

        # Mock RAG client that reports a successful three-chunk ingest.
        mock_rag_client = Mock(spec=RAGClient)
        mock_rag_client.ingest_with_metadata = AsyncMock(return_value={
            "chunks_stored": 3,
            "status": "ok"
        })

        # Prepare payload
        payload = await prepare_ingestion_payload(
            tenant_id="test_tenant",
            content="This is a test document about API documentation. Published on 2024-01-15.",
            source_type="txt",
            filename="api_docs.txt"
        )

        # Process with metadata extraction — patch the import path used in the function.
        with patch('backend.api.services.metadata_extractor.MetadataExtractor') as mock_extractor_class:
            mock_extractor = MagicMock()
            mock_extractor.extract_metadata = AsyncMock(return_value={
                "title": "API Documentation",
                "summary": "Test document about APIs",
                "tags": ["api", "documentation"],
                "topics": ["API"],
                "detected_date": "2024-01-15",
                "quality_score": 0.8,
                "word_count": 10,
                "char_count": 50,
                "source_type": "txt",
                "extraction_method": "llm"
            })
            mock_extractor_class.return_value = mock_extractor

            result = await process_ingestion(payload, mock_rag_client, extract_metadata=True)

            # Verify the extracted metadata was attached to the result.
            assert "extracted_metadata" in result
            assert result["extracted_metadata"]["title"] == "API Documentation"
            assert result["extracted_metadata"]["quality_score"] == 0.8

            # Verify the RAG client was invoked exactly once with metadata
            # (either as a kwarg or merged into the payload).
            mock_rag_client.ingest_with_metadata.assert_called_once()
            call_args = mock_rag_client.ingest_with_metadata.call_args
            assert call_args is not None


class TestMetadataEdgeCases:
    """Edge-case and error-handling tests for metadata extraction."""

    @pytest.mark.asyncio
    async def test_empty_content(self):
        """An empty document still yields the full metadata structure."""
        meta = await MetadataExtractor().extract_metadata(
            content="",
            filename="empty.txt"
        )
        # Structure must be intact even with nothing to extract.
        assert "title" in meta
        assert "summary" in meta
        assert meta["word_count"] == 0

    @pytest.mark.asyncio
    async def test_very_long_content(self):
        """A 10,000-word document is processed and counted correctly."""
        body = "Word " * 10000  # 10,000 words
        meta = await MetadataExtractor().extract_metadata(
            content=body,
            filename="long_doc.txt"
        )
        assert meta["word_count"] == 10000
        assert len(meta["summary"]) > 0
        assert meta["quality_score"] >= 0.0

    @pytest.mark.asyncio
    async def test_special_characters(self):
        """Unicode accents and emoji do not break title extraction."""
        meta = await MetadataExtractor().extract_metadata(
            content="Document with émojis 🚀 and spéciál chàracters!",
            filename="special.txt"
        )
        assert "title" in meta
        assert len(meta["title"]) > 0

    def test_quality_score_edge_cases(self):
        """Quality score stays within [0, 1] for degenerate inputs."""
        scorer = MetadataExtractor()

        cases = (
            ("Hi", 1, ""),                        # very short content
            ("Word " * 20000, 20000, "Summary"),  # very long content
            ("Content " * 100, 100, ""),          # no summary at all
        )
        for body, word_count, summary in cases:
            score = scorer._calculate_quality_score(body, word_count, summary)
            assert 0.0 <= score <= 1.0


if __name__ == "__main__":
    # Allow running this module directly: verbose output, short tracebacks.
    pytest.main([__file__, "-v", "--tb=short"])