| | """ |
| | Integration Tests for RAG Pipeline |
| | |
| | Tests the full RAG workflow: |
| | - Vector store operations |
| | - Embedding generation |
| | - Document retrieval |
| | - Answer generation |
| | """ |
| |
|
| | import pytest |
| | from pathlib import Path |
| | from unittest.mock import Mock, patch, MagicMock |
| | import json |
| |
|
| |
|
class TestVectorStore:
    """Test vector store functionality."""

    def test_vector_store_config(self):
        """Test VectorStoreConfig creation."""
        from src.rag.store import VectorStoreConfig

        config = VectorStoreConfig(
            collection_name="test_collection",
            default_top_k=10,
            similarity_threshold=0.8,
        )

        assert config.collection_name == "test_collection"
        assert config.default_top_k == 10
        assert config.similarity_threshold == 0.8

    def test_vector_search_result(self):
        """Test VectorSearchResult model."""
        from src.rag.store import VectorSearchResult

        result = VectorSearchResult(
            chunk_id="chunk_1",
            document_id="doc_1",
            text="Sample text",
            metadata={"page": 0},
            similarity=0.85,
            page=0,
            chunk_type="text",
        )

        assert result.similarity == 0.85
        assert result.chunk_id == "chunk_1"

    def test_chromadb_store_creation(self, tmp_path):
        """Test ChromaDB store creation (skipped when chromadb is missing).

        Fix: the previous ``@pytest.mark.skipif`` decorator called
        ``pytest.importorskip`` at collection time, which skipped the WHOLE
        module when chromadb was absent; and when chromadb was present,
        ``not <module object>`` is always False, so the skipif condition
        never fired.  Calling ``importorskip`` inside the test body skips
        exactly this one test, which is the intended behavior.
        """
        pytest.importorskip("chromadb", reason="ChromaDB not installed")
        from src.rag.store import ChromaVectorStore, VectorStoreConfig

        config = VectorStoreConfig(
            persist_directory=str(tmp_path / "vectorstore"),
            collection_name="test_collection",
        )

        # A freshly created store must start empty.
        store = ChromaVectorStore(config)
        assert store.count() == 0
|
| |
|
class TestEmbeddings:
    """Tests for the embedding layer: config, cache, and model dimensions."""

    def test_embedding_config(self):
        """EmbeddingConfig stores the adapter type and batch size."""
        from src.rag.embeddings import EmbeddingConfig

        cfg = EmbeddingConfig(
            adapter_type="ollama",
            ollama_model="nomic-embed-text",
            batch_size=16,
        )

        assert cfg.adapter_type == "ollama"
        assert cfg.batch_size == 16

    def test_embedding_cache_creation(self, tmp_path):
        """Instantiating EmbeddingCache creates its directory on disk."""
        from src.rag.embeddings import EmbeddingCache

        assert EmbeddingCache(str(tmp_path), "test_model").cache_dir.exists()

    def test_embedding_cache_operations(self, tmp_path):
        """A stored embedding round-trips through put/get unchanged."""
        from src.rag.embeddings import EmbeddingCache

        cache = EmbeddingCache(str(tmp_path), "test_model")

        text = "Hello world"
        vector = [0.1, 0.2, 0.3, 0.4]
        cache.put(text, vector)

        assert cache.get(text) == vector

    def test_ollama_embedding_dimensions(self):
        """Known Ollama models map to their expected vector sizes."""
        from src.rag.embeddings import OllamaEmbedding

        dims = OllamaEmbedding.MODEL_DIMENSIONS
        assert dims["nomic-embed-text"] == 768
        assert dims["mxbai-embed-large"] == 1024
|
| |
|
class TestRetriever:
    """Tests for retriever configuration and retrieved-chunk models."""

    def test_retriever_config(self):
        """RetrieverConfig keeps its top-k and evidence settings."""
        from src.rag.retriever import RetrieverConfig

        cfg = RetrieverConfig(
            default_top_k=10,
            similarity_threshold=0.75,
            include_evidence=True,
        )

        assert cfg.default_top_k == 10
        assert cfg.include_evidence is True

    def test_retrieved_chunk(self):
        """RetrievedChunk carries the similarity score it was built with."""
        from src.rag.retriever import RetrievedChunk

        fields = {
            "chunk_id": "chunk_1",
            "document_id": "doc_1",
            "text": "Sample retrieved text",
            "similarity": 0.9,
            "page": 0,
            "chunk_type": "text",
        }
        assert RetrievedChunk(**fields).similarity == 0.9
|
| |
|
class TestGenerator:
    """Tests for answer generation: config, citations, and abstention."""

    def test_generator_config(self):
        """GeneratorConfig records the provider and citation policy."""
        from src.rag.generator import GeneratorConfig

        cfg = GeneratorConfig(
            llm_provider="ollama",
            ollama_model="llama3.2:3b",
            temperature=0.1,
            require_citations=True,
        )

        assert cfg.llm_provider == "ollama"
        assert cfg.require_citations is True

    def test_citation_model(self):
        """A Citation keeps its index and confidence."""
        from src.rag.generator import Citation

        cite = Citation(
            index=1,
            chunk_id="chunk_1",
            page=0,
            text_snippet="Sample snippet",
            confidence=0.85,
        )

        assert cite.index == 1
        assert cite.confidence == 0.85

    def test_generated_answer_model(self):
        """GeneratedAnswer wires together the answer text and its citations."""
        from src.rag.generator import Citation, GeneratedAnswer

        evidence = Citation(
            index=1,
            chunk_id="chunk_1",
            page=0,
            text_snippet="Evidence text",
            confidence=0.9,
        )
        answer = GeneratedAnswer(
            answer="This is the generated answer.",
            citations=[evidence],
            confidence=0.85,
            abstained=False,
            num_chunks_used=3,
            query="What is the answer?",
        )

        assert answer.answer == "This is the generated answer."
        assert len(answer.citations) == 1
        assert answer.abstained is False

    def test_abstention(self):
        """An abstaining answer exposes both the flag and its reason."""
        from src.rag.generator import GeneratedAnswer

        abstained = GeneratedAnswer(
            answer="I cannot provide a confident answer.",
            citations=[],
            confidence=0.3,
            abstained=True,
            abstain_reason="Low confidence",
            num_chunks_used=2,
            query="Complex question",
        )

        assert abstained.abstained is True
        assert abstained.abstain_reason == "Low confidence"
|
| |
|
class TestIndexer:
    """Tests for indexer configuration and result reporting."""

    def test_indexer_config(self):
        """IndexerConfig keeps the requested batch size."""
        from src.rag.indexer import IndexerConfig

        cfg = IndexerConfig(
            batch_size=64,
            include_bbox=True,
            skip_empty_chunks=True,
        )

        assert cfg.batch_size == 64

    def test_indexing_result(self):
        """IndexingResult reports success plus indexed-chunk counts."""
        from src.rag.indexer import IndexingResult

        outcome = IndexingResult(
            document_id="doc_1",
            source_path="/path/to/doc.pdf",
            num_chunks_indexed=10,
            num_chunks_skipped=2,
            success=True,
        )

        assert outcome.success is True
        assert outcome.num_chunks_indexed == 10
|
| |
|
class TestRAGIntegration:
    """Integration tests for full RAG pipeline."""

    @pytest.fixture
    def mock_chunks(self):
        """Build three same-document chunks with descending similarity."""
        from src.rag.retriever import RetrievedChunk

        return [
            RetrievedChunk(
                chunk_id=f"chunk_{idx}",
                document_id="doc_1",
                text=f"This is sample text from chunk {idx}.",
                similarity=0.9 - (idx * 0.1),
                page=idx,
                chunk_type="text",
            )
            for idx in range(3)
        ]

    def test_context_building(self, mock_chunks):
        """The built context mentions chunk text and page metadata."""
        from src.rag.retriever import DocumentRetriever

        context = DocumentRetriever().build_context(
            mock_chunks, include_metadata=True
        )

        assert "chunk 0" in context.lower()
        assert "Page 1" in context

    def test_citation_extraction(self):
        """Bracketed markers like [1] map back to retrieved chunks."""
        from src.rag.generator import GroundedGenerator
        from src.rag.retriever import RetrievedChunk

        generator = GroundedGenerator()

        chunks = [
            RetrievedChunk(
                chunk_id=cid,
                document_id="doc_1",
                text=body,
                similarity=score,
                page=pg,
            )
            for cid, body, score, pg in [
                ("chunk_1", "First chunk content", 0.9, 0),
                ("chunk_2", "Second chunk content", 0.85, 1),
            ]
        ]

        answer_text = "The answer is based on [1] and [2]."
        citations = generator._extract_citations(answer_text, chunks)

        assert len(citations) == 2
        assert citations[0].index == 1
        assert citations[1].index == 2
|