"""
Integration Tests for RAG Pipeline

Tests the full RAG workflow:
- Vector store operations
- Embedding generation
- Document retrieval
- Answer generation
"""

import pytest
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
import json


class TestVectorStore:
    """Test vector store functionality."""

    def test_vector_store_config(self):
        """Test VectorStoreConfig creation."""
        from src.rag.store import VectorStoreConfig

        config = VectorStoreConfig(
            collection_name="test_collection",
            default_top_k=10,
            similarity_threshold=0.8,
        )

        assert config.collection_name == "test_collection"
        assert config.default_top_k == 10

    def test_vector_search_result(self):
        """Test VectorSearchResult model."""
        from src.rag.store import VectorSearchResult

        result = VectorSearchResult(
            chunk_id="chunk_1",
            document_id="doc_1",
            text="Sample text",
            metadata={"page": 0},
            similarity=0.85,
            page=0,
            chunk_type="text",
        )

        assert result.similarity == 0.85
        assert result.chunk_id == "chunk_1"

    def test_chromadb_store_creation(self, tmp_path):
        """Test ChromaDB store creation.

        Skipped (this test only) when the optional ``chromadb`` package is
        not installed.
        """
        # importorskip must run inside the test body: when it is evaluated in
        # a decorator argument it executes at import time, and a missing
        # chromadb then skips the entire module instead of just this test.
        pytest.importorskip("chromadb", reason="ChromaDB not installed")

        from src.rag.store import ChromaVectorStore, VectorStoreConfig

        config = VectorStoreConfig(
            persist_directory=str(tmp_path / "vectorstore"),
            collection_name="test_collection",
        )

        store = ChromaVectorStore(config)

        # A freshly created collection is expected to hold no entries.
        assert store.count() == 0


class TestEmbeddings:
    """Test embedding functionality."""

    def test_embedding_config(self):
        """Test EmbeddingConfig creation."""
        from src.rag.embeddings import EmbeddingConfig

        config = EmbeddingConfig(
            adapter_type="ollama",
            ollama_model="nomic-embed-text",
            batch_size=16,
        )

        assert config.adapter_type == "ollama"
        assert config.batch_size == 16

    def test_embedding_cache_creation(self, tmp_path):
        """Test EmbeddingCache creation."""
        from src.rag.embeddings import EmbeddingCache

        cache = EmbeddingCache(str(tmp_path), "test_model")

        assert cache.cache_dir.exists()

    def test_embedding_cache_operations(self, tmp_path):
        """Test EmbeddingCache get/put operations."""
        from src.rag.embeddings import EmbeddingCache

        cache = EmbeddingCache(str(tmp_path), "test_model")

        # Test put and get
        test_text = "Hello world"
        test_embedding = [0.1, 0.2, 0.3, 0.4]

        cache.put(test_text, test_embedding)
        retrieved = cache.get(test_text)

        assert retrieved == test_embedding

    def test_ollama_embedding_dimensions(self):
        """Test OllamaEmbedding model dimensions mapping."""
        from src.rag.embeddings import OllamaEmbedding

        assert OllamaEmbedding.MODEL_DIMENSIONS["nomic-embed-text"] == 768
        assert OllamaEmbedding.MODEL_DIMENSIONS["mxbai-embed-large"] == 1024


class TestRetriever:
    """Test retriever functionality."""

    def test_retriever_config(self):
        """Test RetrieverConfig creation."""
        from src.rag.retriever import RetrieverConfig

        config = RetrieverConfig(
            default_top_k=10,
            similarity_threshold=0.75,
            include_evidence=True,
        )

        assert config.default_top_k == 10
        assert config.include_evidence is True

    def test_retrieved_chunk(self):
        """Test RetrievedChunk model."""
        from src.rag.retriever import RetrievedChunk

        chunk = RetrievedChunk(
            chunk_id="chunk_1",
            document_id="doc_1",
            text="Sample retrieved text",
            similarity=0.9,
            page=0,
            chunk_type="text",
        )

        assert chunk.similarity == 0.9


class TestGenerator:
    """Test generator functionality."""

    def test_generator_config(self):
        """Test GeneratorConfig creation."""
        from src.rag.generator import GeneratorConfig

        config = GeneratorConfig(
            llm_provider="ollama",
            ollama_model="llama3.2:3b",
            temperature=0.1,
            require_citations=True,
        )

        assert config.llm_provider == "ollama"
        assert config.require_citations is True

    def test_citation_model(self):
        """Test Citation model."""
        from src.rag.generator import Citation

        citation = Citation(
            index=1,
            chunk_id="chunk_1",
            page=0,
            text_snippet="Sample snippet",
            confidence=0.85,
        )

        assert citation.index == 1
        assert citation.confidence == 0.85

    def test_generated_answer_model(self):
        """Test GeneratedAnswer model."""
        from src.rag.generator import GeneratedAnswer, Citation

        answer = GeneratedAnswer(
            answer="This is the generated answer.",
            citations=[
                Citation(
                    index=1,
                    chunk_id="chunk_1",
                    page=0,
                    text_snippet="Evidence text",
                    confidence=0.9,
                )
            ],
            confidence=0.85,
            abstained=False,
            num_chunks_used=3,
            query="What is the answer?",
        )

        assert answer.answer == "This is the generated answer."
        assert len(answer.citations) == 1
        assert answer.abstained is False

    def test_abstention(self):
        """Test abstention behavior."""
        from src.rag.generator import GeneratedAnswer

        answer = GeneratedAnswer(
            answer="I cannot provide a confident answer.",
            citations=[],
            confidence=0.3,
            abstained=True,
            abstain_reason="Low confidence",
            num_chunks_used=2,
            query="Complex question",
        )

        assert answer.abstained is True
        assert answer.abstain_reason == "Low confidence"


class TestIndexer:
    """Test indexer functionality."""

    def test_indexer_config(self):
        """Test IndexerConfig creation."""
        from src.rag.indexer import IndexerConfig

        config = IndexerConfig(
            batch_size=64,
            include_bbox=True,
            skip_empty_chunks=True,
        )

        assert config.batch_size == 64

    def test_indexing_result(self):
        """Test IndexingResult model."""
        from src.rag.indexer import IndexingResult

        result = IndexingResult(
            document_id="doc_1",
            source_path="/path/to/doc.pdf",
            num_chunks_indexed=10,
            num_chunks_skipped=2,
            success=True,
        )

        assert result.success is True
        assert result.num_chunks_indexed == 10


class TestRAGIntegration:
    """Integration tests for full RAG pipeline."""

    @pytest.fixture
    def mock_chunks(self):
        """Create mock document chunks."""
        from src.rag.retriever import RetrievedChunk

        # Three chunks on pages 0..2 with similarity decreasing from 0.9.
        return [
            RetrievedChunk(
                chunk_id=f"chunk_{i}",
                document_id="doc_1",
                text=f"This is sample text from chunk {i}.",
                similarity=0.9 - (i * 0.1),
                page=i,
                chunk_type="text",
            )
            for i in range(3)
        ]

    def test_context_building(self, mock_chunks):
        """Test building context from chunks."""
        from src.rag.retriever import DocumentRetriever

        retriever = DocumentRetriever()

        context = retriever.build_context(mock_chunks, include_metadata=True)

        assert "chunk 0" in context.lower()
        assert "Page 1" in context  # Page numbers are 1-indexed in display

    def test_citation_extraction(self):
        """Test citation extraction from text."""
        from src.rag.generator import GroundedGenerator
        from src.rag.retriever import RetrievedChunk

        generator = GroundedGenerator()

        chunks = [
            RetrievedChunk(
                chunk_id="chunk_1",
                document_id="doc_1",
                text="First chunk content",
                similarity=0.9,
                page=0,
            ),
            RetrievedChunk(
                chunk_id="chunk_2",
                document_id="doc_1",
                text="Second chunk content",
                similarity=0.85,
                page=1,
            ),
        ]

        answer_text = "The answer is based on [1] and [2]."
        citations = generator._extract_citations(answer_text, chunks)

        assert len(citations) == 2
        assert citations[0].index == 1
        assert citations[1].index == 2