"""Tests for evaluation metrics.""" import pytest from core.eval import RAGEvaluator, BenchmarkDataset from core.eval_utils import generate_evaluation_report import pandas as pd from pathlib import Path import tempfile def test_hit_at_k(): """Test Hit@k metric.""" evaluator = RAGEvaluator() retrieved = ["doc1", "doc2", "doc3", "doc4", "doc5"] relevant = ["doc2", "doc6"] assert evaluator.hit_at_k(retrieved, relevant, k=5) == 1.0 assert evaluator.hit_at_k(retrieved, relevant, k=1) == 0.0 def test_precision_recall(): """Test Precision and Recall metrics.""" evaluator = RAGEvaluator() retrieved = ["doc1", "doc2", "doc3"] relevant = ["doc2", "doc3", "doc4"] precision = evaluator.precision_at_k(retrieved, relevant, k=3) recall = evaluator.recall_at_k(retrieved, relevant, k=3) assert precision == pytest.approx(2/3, 0.01) assert recall == pytest.approx(2/3, 0.01) def test_mrr(): """Test Mean Reciprocal Rank.""" evaluator = RAGEvaluator() retrieved = ["doc1", "doc2", "doc3"] relevant = ["doc2"] mrr = evaluator.mrr(retrieved, relevant) assert mrr == pytest.approx(0.5, 0.01) # 1/2 def test_semantic_similarity(): """Test semantic similarity metric.""" evaluator = RAGEvaluator() answer = "The patient needs proper identification." reference = "Patient identification is required." similarity = evaluator.semantic_similarity(answer, reference) assert 0 <= similarity <= 1 assert similarity > 0.5 # Should be reasonably similar def test_benchmark_dataset(): """Test benchmark dataset functionality.""" dataset = BenchmarkDataset() hospital_queries = dataset.get_sample_hospital_queries() bank_queries = dataset.get_sample_bank_queries() fluid_queries = dataset.get_sample_fluid_simulation_queries() assert len(hospital_queries) > 0 assert len(bank_queries) > 0 assert len(fluid_queries) > 0 assert "query" in hospital_queries[0] assert "domain" in hospital_queries[0] def test_evaluation_report_generation(): """Test evaluation report generation.""" # Create sample evaluation data data = { 'query': ['Query 1', 'Query 2', 'Query 3'], 'base_retrieval_time': [0.05, 0.06, 0.04], 'base_total_time': [1.5, 2.0, 1.8], 'hier_retrieval_time': [0.03, 0.04, 0.03], 'hier_total_time': [1.0, 1.2, 1.1], 'speedup': [1.5, 1.67, 1.64] } with tempfile.TemporaryDirectory() as temp_dir: # Save sample CSV csv_path = Path(temp_dir) / "test_eval.csv" df = pd.DataFrame(data) df.to_csv(csv_path, index=False) # Generate report stats = generate_evaluation_report(str(csv_path)) # Verify statistics assert stats['total_queries'] == 3 assert stats['avg_speedup'] > 1.0 assert stats['hier_wins'] == 3 # Verify files created assert Path(str(csv_path).replace('.csv', '_report_charts.png')).exists() assert Path(str(csv_path).replace('.csv', '_report_summary.md')).exists() def test_evaluate_rag_pipeline(): """Test complete RAG pipeline evaluation.""" evaluator = RAGEvaluator() # Mock RAG result rag_result = { "answer": "Patient admission requires ID verification.", "contexts": [ {"id": "doc1", "document": "Text about admission", "metadata": {}}, {"id": "doc2", "document": "Text about verification", "metadata": {}} ], "retrieval_time": 0.05, "generation_time": 1.2, "total_time": 1.25, "pipeline": "Hier-RAG" } relevant_ids = ["doc1", "doc3"] reference_answer = "Admission requires identification." 
    metrics = evaluator.evaluate_rag_pipeline(
        rag_result, relevant_ids, reference_answer, k_values=[1, 3, 5]
    )

    assert "hit@1" in metrics
    assert "hit@3" in metrics
    assert "mrr" in metrics
    assert "semantic_similarity" in metrics
    assert metrics["retrieval_time"] == 0.05


def test_empty_results():
    """Test evaluation with empty results."""
    evaluator = RAGEvaluator()

    retrieved = []
    relevant = ["doc1", "doc2"]

    assert evaluator.hit_at_k(retrieved, relevant, k=5) == 0.0
    assert evaluator.precision_at_k(retrieved, relevant, k=5) == 0.0
    assert evaluator.mrr(retrieved, relevant) == 0.0