| | """ |
| | Unit Tests for RAG Integration with Document Intelligence |
| | |
| | Tests the bridge between document_intelligence and RAG subsystems: |
| | - DocIntIndexer: Indexing ParseResult into vector store |
| | - DocIntRetriever: Semantic retrieval with evidence |
| | - RAG Tools: IndexDocumentTool, RetrieveChunksTool, RAGAnswerTool |
| | """ |
| |
|
# Standard library imports first, third-party after (PEP 8 grouping).
from typing import List
from unittest.mock import MagicMock, Mock, patch

import pytest
| |
|
| |
|
class TestDocIntBridge:
    """Smoke tests for the document-intelligence RAG bridge module."""

    def test_bridge_imports(self):
        """The bridge module exposes its public classes and factories."""
        from src.rag.docint_bridge import (
            DocIntIndexer,
            DocIntRetriever,
            get_docint_indexer,
            get_docint_retriever,
        )

        assert DocIntIndexer is not None
        assert DocIntRetriever is not None

    def test_indexer_creation(self):
        """A DocIntIndexer keeps the IndexerConfig it is constructed with."""
        from src.rag.docint_bridge import DocIntIndexer
        from src.rag.indexer import IndexerConfig

        store = Mock()
        embedder = Mock()
        embedder.embed_batch = Mock(return_value=[[0.1] * 768])

        indexer = DocIntIndexer(
            config=IndexerConfig(
                batch_size=16,
                include_bbox=True,
                min_chunk_length=5,
            ),
            vector_store=store,
            embedding_adapter=embedder,
        )

        assert indexer.config.batch_size == 16
        assert indexer.config.include_bbox is True

    def test_retriever_creation(self):
        """A DocIntRetriever keeps the similarity threshold it is given."""
        from src.rag.docint_bridge import DocIntRetriever

        retriever = DocIntRetriever(
            vector_store=Mock(),
            embedding_adapter=Mock(),
            similarity_threshold=0.6,
        )

        assert retriever.similarity_threshold == 0.6
|
| |
|
class TestDocIntIndexer:
    """Behavioral tests for DocIntIndexer indexing and deletion."""

    @pytest.fixture
    def mock_parse_result(self):
        """Build a ParseResult with two paragraph chunks and one table chunk."""
        from src.document_intelligence.chunks import (
            ParseResult,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        # Small factory so each chunk declaration stays on one screen line.
        def chunk(cid, ctype, text, page, box, conf, seq):
            return DocumentChunk(
                chunk_id=cid,
                doc_id="test_doc",
                chunk_type=ctype,
                text=text,
                page=page,
                bbox=box,
                confidence=conf,
                sequence_index=seq,
            )

        chunks = [
            chunk(
                "chunk_001",
                ChunkType.PARAGRAPH,
                "This is a test paragraph with enough content to index.",
                1,
                BoundingBox(x_min=0.1, y_min=0.1, x_max=0.9, y_max=0.2),
                0.9,
                0,
            ),
            chunk(
                "chunk_002",
                ChunkType.PARAGRAPH,
                "Second paragraph with different content for testing.",
                1,
                BoundingBox(x_min=0.1, y_min=0.3, x_max=0.9, y_max=0.4),
                0.85,
                1,
            ),
            chunk(
                "chunk_003",
                ChunkType.TABLE,
                "| Header | Value |\n| --- | --- |\n| A | 100 |",
                2,
                BoundingBox(x_min=0.1, y_min=0.1, x_max=0.9, y_max=0.5),
                0.95,
                2,
            ),
        ]

        return ParseResult(
            doc_id="test_doc",
            filename="test.pdf",
            chunks=chunks,
            num_pages=2,
            processing_time_ms=100,
            markdown_full="# Test Document\n\nContent here.",
        )

    def test_index_parse_result(self, mock_parse_result):
        """All three chunks of the fixture are embedded and stored."""
        from src.rag.docint_bridge import DocIntIndexer

        store = Mock()
        store.add_chunks = Mock()

        embedder = Mock()
        embedder.embed_batch = Mock(
            return_value=[[0.1] * 768, [0.2] * 768, [0.3] * 768]
        )

        indexer = DocIntIndexer(
            vector_store=store,
            embedding_adapter=embedder,
        )

        result = indexer.index_parse_result(mock_parse_result)

        assert result.success is True
        assert result.document_id == "test_doc"
        assert result.num_chunks_indexed == 3
        assert result.num_chunks_skipped == 0

        # Exactly one batched write should hit the vector store.
        store.add_chunks.assert_called_once()

    def test_index_skips_short_chunks(self, mock_parse_result):
        """Chunks shorter than min_chunk_length are counted as skipped."""
        from src.rag.docint_bridge import DocIntIndexer
        from src.rag.indexer import IndexerConfig
        from src.document_intelligence.chunks import (
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        # Append a chunk whose text is below the 10-char threshold.
        mock_parse_result.chunks.append(
            DocumentChunk(
                chunk_id="chunk_short",
                doc_id="test_doc",
                chunk_type=ChunkType.PARAGRAPH,
                text="Short",
                page=1,
                bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
                confidence=0.9,
                sequence_index=3,
            )
        )

        store = Mock()
        store.add_chunks = Mock()

        embedder = Mock()
        embedder.embed_batch = Mock(
            return_value=[[0.1] * 768, [0.2] * 768, [0.3] * 768]
        )

        indexer = DocIntIndexer(
            config=IndexerConfig(min_chunk_length=10),
            vector_store=store,
            embedding_adapter=embedder,
        )

        result = indexer.index_parse_result(mock_parse_result)

        assert result.success is True
        assert result.num_chunks_indexed == 3
        assert result.num_chunks_skipped == 1

    def test_delete_document(self):
        """Deletion is delegated to the store and its count is returned."""
        from src.rag.docint_bridge import DocIntIndexer

        store = Mock()
        store.delete_document = Mock(return_value=5)

        indexer = DocIntIndexer(vector_store=store)

        assert indexer.delete_document("test_doc") == 5
        store.delete_document.assert_called_once_with("test_doc")
|
| |
|
class TestDocIntRetriever:
    """Behavioral tests for DocIntRetriever search, evidence, and context."""

    def test_retrieve_chunks(self):
        """Results above the similarity threshold come back as dicts."""
        from src.rag.docint_bridge import DocIntRetriever
        from src.rag.store import VectorSearchResult

        hits = [
            VectorSearchResult(
                chunk_id="chunk_001",
                document_id="test_doc",
                text="Relevant content about the query.",
                similarity=0.85,
                page=1,
                chunk_type="paragraph",
                bbox={"x_min": 0.1, "y_min": 0.1, "x_max": 0.9, "y_max": 0.2},
                metadata={"source_path": "test.pdf", "confidence": 0.9},
            ),
            VectorSearchResult(
                chunk_id="chunk_002",
                document_id="test_doc",
                text="Another relevant chunk.",
                similarity=0.75,
                page=2,
                chunk_type="paragraph",
                bbox={"x_min": 0.1, "y_min": 0.3, "x_max": 0.9, "y_max": 0.4},
                metadata={"source_path": "test.pdf", "confidence": 0.85},
            ),
        ]

        store = Mock()
        store.search = Mock(return_value=hits)
        embedder = Mock()
        embedder.embed_text = Mock(return_value=[0.1] * 768)

        retriever = DocIntRetriever(
            vector_store=store,
            embedding_adapter=embedder,
            similarity_threshold=0.5,
        )

        chunks = retriever.retrieve("test query", top_k=5)

        assert len(chunks) == 2
        assert chunks[0]["chunk_id"] == "chunk_001"
        assert chunks[0]["similarity"] == 0.85

    def test_retrieve_with_evidence(self):
        """retrieve_with_evidence yields one evidence ref per chunk."""
        from src.rag.docint_bridge import DocIntRetriever
        from src.rag.store import VectorSearchResult

        hits = [
            VectorSearchResult(
                chunk_id="chunk_001",
                document_id="test_doc",
                text="Content with evidence.",
                similarity=0.9,
                page=1,
                chunk_type="paragraph",
                bbox={"x_min": 0.1, "y_min": 0.1, "x_max": 0.9, "y_max": 0.2},
                metadata={},
            ),
        ]

        store = Mock()
        store.search = Mock(return_value=hits)
        embedder = Mock()
        embedder.embed_text = Mock(return_value=[0.1] * 768)

        retriever = DocIntRetriever(
            vector_store=store,
            embedding_adapter=embedder,
        )

        chunks, evidence_refs = retriever.retrieve_with_evidence("query")

        assert len(chunks) == 1
        assert len(evidence_refs) == 1
        assert evidence_refs[0].chunk_id == "chunk_001"
        assert evidence_refs[0].page == 1

    def test_retrieve_with_filters(self):
        """Document/type/page filters are forwarded to the vector store."""
        from src.rag.docint_bridge import DocIntRetriever

        store = Mock()
        store.search = Mock(return_value=[])
        embedder = Mock()
        embedder.embed_text = Mock(return_value=[0.1] * 768)

        retriever = DocIntRetriever(
            vector_store=store,
            embedding_adapter=embedder,
        )

        retriever.retrieve(
            "query",
            document_id="specific_doc",
            chunk_types=["paragraph", "table"],
            page_range=(1, 5),
        )

        # Inspect the keyword arguments the store's search() received.
        filters = store.search.call_args.kwargs.get("filters")

        assert filters["document_id"] == "specific_doc"
        assert filters["chunk_type"] == ["paragraph", "table"]
        assert filters["page"] == {"min": 1, "max": 5}

    def test_build_context(self):
        """build_context cites each chunk with an index and a page number."""
        from src.rag.docint_bridge import DocIntRetriever

        retriever = DocIntRetriever()

        context = retriever.build_context(
            [
                {
                    "chunk_id": "c1",
                    "text": "First chunk content.",
                    "page": 1,
                    "chunk_type": "paragraph",
                    "similarity": 0.9,
                },
                {
                    "chunk_id": "c2",
                    "text": "Second chunk content.",
                    "page": 2,
                    "chunk_type": "table",
                    "similarity": 0.8,
                },
            ]
        )

        for expected in (
            "[1]",
            "[2]",
            "Page 1",
            "Page 2",
            "First chunk content",
            "Second chunk content",
        ):
            assert expected in context
|
| |
|
class TestRAGTools:
    """Tests for the RAG tools exposed by document_intelligence."""

    def test_tool_imports(self):
        """The tools package exposes every RAG tool and helper."""
        from src.document_intelligence.tools import (
            IndexDocumentTool,
            RetrieveChunksTool,
            RAGAnswerTool,
            DeleteDocumentTool,
            GetIndexStatsTool,
            get_rag_tool,
            list_rag_tools,
        )

        assert IndexDocumentTool is not None
        assert RetrieveChunksTool is not None
        assert RAGAnswerTool is not None

    def test_list_rag_tools(self):
        """list_rag_tools includes at least the three core tools."""
        from src.document_intelligence.tools import list_rag_tools

        tools = list_rag_tools()
        assert len(tools) >= 3

        names = [entry["name"] for entry in tools]
        assert "index_document" in names
        assert "retrieve_chunks" in names
        assert "rag_answer" in names

    def test_get_rag_tool(self):
        """get_rag_tool resolves tools by their registered names."""
        from src.document_intelligence.tools import get_rag_tool

        assert get_rag_tool("index_document").name == "index_document"
        assert get_rag_tool("retrieve_chunks").name == "retrieve_chunks"

    @patch("src.document_intelligence.tools.rag_tools.RAG_AVAILABLE", False)
    def test_tool_graceful_degradation(self):
        """With RAG unavailable, tools fail soft with an explanatory error."""
        from src.document_intelligence.tools.rag_tools import IndexDocumentTool

        result = IndexDocumentTool().execute(path="test.pdf")

        assert result.success is False
        assert "not available" in result.error.lower()
|
| |
|
class TestAnswerQuestionRAGMode:
    """Tests for AnswerQuestionTool's keyword (non-RAG) answering path."""

    def test_answer_with_keywords(self):
        """With use_rag=False the tool answers from keyword matching."""
        from src.document_intelligence.tools import get_tool
        from src.document_intelligence.chunks import (
            ParseResult,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        # Single-chunk document containing the answer ($500.00).
        doc = ParseResult(
            doc_id="test_doc",
            filename="test.pdf",
            chunks=[
                DocumentChunk(
                    chunk_id="chunk_001",
                    doc_id="test_doc",
                    chunk_type=ChunkType.PARAGRAPH,
                    text="The total amount due is $500.00 as shown on page one.",
                    page=1,
                    bbox=BoundingBox(x_min=0.1, y_min=0.1, x_max=0.9, y_max=0.2),
                    confidence=0.9,
                    sequence_index=0,
                ),
            ],
            num_pages=1,
            processing_time_ms=100,
            markdown_full="# Test",
        )

        result = get_tool("answer_question").execute(
            parse_result=doc,
            question="What is the total amount?",
            use_rag=False,
        )

        assert result.success is True
        assert "500" in result.data.get("answer", "")
|
| |
|
class TestAbstentionPolicy:
    """Tests that the answering tool abstains rather than hallucinate."""

    def test_abstains_on_no_results(self):
        """An off-topic question yields an abstention with zero confidence."""
        from src.document_intelligence.tools import get_tool
        from src.document_intelligence.chunks import (
            ParseResult,
            DocumentChunk,
            ChunkType,
            BoundingBox,
        )

        # Document content deliberately unrelated to the question below.
        doc = ParseResult(
            doc_id="test_doc",
            filename="test.pdf",
            chunks=[
                DocumentChunk(
                    chunk_id="chunk_001",
                    doc_id="test_doc",
                    chunk_type=ChunkType.PARAGRAPH,
                    text="This document discusses weather patterns in Antarctica.",
                    page=1,
                    bbox=BoundingBox(x_min=0, y_min=0, x_max=1, y_max=1),
                    confidence=0.9,
                    sequence_index=0,
                ),
            ],
            num_pages=1,
            processing_time_ms=100,
            markdown_full="# Test",
        )

        result = get_tool("answer_question").execute(
            parse_result=doc,
            question="What is the invoice number?",
            use_rag=False,
        )

        assert result.success is True
        assert result.data.get("abstained") is True
        assert result.data.get("confidence", 1.0) == 0.0
|
| |
|
class TestEvidenceGeneration:
    """Tests that retrieval produces usable evidence references."""

    def test_evidence_from_retrieval(self):
        """Evidence refs carry chunk id, page, bbox, and a text snippet."""
        from src.rag.docint_bridge import DocIntRetriever
        from src.rag.store import VectorSearchResult

        hit = VectorSearchResult(
            chunk_id="chunk_001",
            document_id="doc_001",
            text="Evidence text here.",
            similarity=0.9,
            page=1,
            chunk_type="paragraph",
            bbox={"x_min": 0.1, "y_min": 0.2, "x_max": 0.9, "y_max": 0.3},
            metadata={"confidence": 0.95},
        )

        store = Mock()
        store.search = Mock(return_value=[hit])
        embedder = Mock()
        embedder.embed_text = Mock(return_value=[0.1] * 768)

        retriever = DocIntRetriever(
            vector_store=store,
            embedding_adapter=embedder,
        )

        chunks, evidence = retriever.retrieve_with_evidence("query")

        assert len(evidence) == 1
        ref = evidence[0]
        assert ref.chunk_id == "chunk_001"
        assert ref.page == 1
        assert ref.bbox.x_min == 0.1
        assert ref.bbox.y_max == 0.3
        assert "Evidence text" in ref.snippet
|
| |
|
| | if __name__ == "__main__": |
| | pytest.main([__file__, "-v"]) |
| |
|