| """Tests for evaluation metrics.""" | |
| import pytest | |
| from core.eval import RAGEvaluator, BenchmarkDataset | |
| from core.eval_utils import generate_evaluation_report | |
| import pandas as pd | |
| from pathlib import Path | |
| import tempfile | |
def test_hit_at_k():
    """Test Hit@k metric."""
    evaluator = RAGEvaluator()
    retrieved = ["doc1", "doc2", "doc3", "doc4", "doc5"]
    relevant = ["doc2", "doc6"]
    # "doc2" appears within the top 5 retrieved docs, but not within the top 1.
    assert evaluator.hit_at_k(retrieved, relevant, k=5) == 1.0
    assert evaluator.hit_at_k(retrieved, relevant, k=1) == 0.0


def test_precision_recall():
    """Test Precision and Recall metrics."""
    evaluator = RAGEvaluator()
    retrieved = ["doc1", "doc2", "doc3"]
    relevant = ["doc2", "doc3", "doc4"]
    precision = evaluator.precision_at_k(retrieved, relevant, k=3)
    recall = evaluator.recall_at_k(retrieved, relevant, k=3)
    # 2 of the 3 retrieved docs are relevant, and 2 of the 3 relevant docs are retrieved.
    assert precision == pytest.approx(2/3, 0.01)
    assert recall == pytest.approx(2/3, 0.01)


def test_mrr():
    """Test Mean Reciprocal Rank."""
    evaluator = RAGEvaluator()
    retrieved = ["doc1", "doc2", "doc3"]
    relevant = ["doc2"]
    mrr = evaluator.mrr(retrieved, relevant)
    assert mrr == pytest.approx(0.5, 0.01)  # first relevant doc is at rank 2, so MRR = 1/2


def test_semantic_similarity():
    """Test semantic similarity metric."""
    evaluator = RAGEvaluator()
    answer = "The patient needs proper identification."
    reference = "Patient identification is required."
    similarity = evaluator.semantic_similarity(answer, reference)
    assert 0 <= similarity <= 1
    assert similarity > 0.5  # should be reasonably similar


def test_benchmark_dataset():
    """Test benchmark dataset functionality."""
    dataset = BenchmarkDataset()
    hospital_queries = dataset.get_sample_hospital_queries()
    bank_queries = dataset.get_sample_bank_queries()
    fluid_queries = dataset.get_sample_fluid_simulation_queries()
    assert len(hospital_queries) > 0
    assert len(bank_queries) > 0
    assert len(fluid_queries) > 0
    assert "query" in hospital_queries[0]
    assert "domain" in hospital_queries[0]


def test_evaluation_report_generation():
    """Test evaluation report generation."""
    # Create sample evaluation data
    data = {
        'query': ['Query 1', 'Query 2', 'Query 3'],
        'base_retrieval_time': [0.05, 0.06, 0.04],
        'base_total_time': [1.5, 2.0, 1.8],
        'hier_retrieval_time': [0.03, 0.04, 0.03],
        'hier_total_time': [1.0, 1.2, 1.1],
        'speedup': [1.5, 1.67, 1.64]
    }

    with tempfile.TemporaryDirectory() as temp_dir:
        # Save sample CSV
        csv_path = Path(temp_dir) / "test_eval.csv"
        df = pd.DataFrame(data)
        df.to_csv(csv_path, index=False)

        # Generate report
        stats = generate_evaluation_report(str(csv_path))

        # Verify statistics
        assert stats['total_queries'] == 3
        assert stats['avg_speedup'] > 1.0
        assert stats['hier_wins'] == 3

        # Verify files created
        assert Path(str(csv_path).replace('.csv', '_report_charts.png')).exists()
        assert Path(str(csv_path).replace('.csv', '_report_summary.md')).exists()


def test_evaluate_rag_pipeline():
    """Test complete RAG pipeline evaluation."""
    evaluator = RAGEvaluator()

    # Mock RAG result
    rag_result = {
        "answer": "Patient admission requires ID verification.",
        "contexts": [
            {"id": "doc1", "document": "Text about admission", "metadata": {}},
            {"id": "doc2", "document": "Text about verification", "metadata": {}}
        ],
        "retrieval_time": 0.05,
        "generation_time": 1.2,
        "total_time": 1.25,
        "pipeline": "Hier-RAG"
    }
    relevant_ids = ["doc1", "doc3"]
    reference_answer = "Admission requires identification."

    metrics = evaluator.evaluate_rag_pipeline(
        rag_result,
        relevant_ids,
        reference_answer,
        k_values=[1, 3, 5]
    )

    assert "hit@1" in metrics
    assert "hit@3" in metrics
    assert "mrr" in metrics
    assert "semantic_similarity" in metrics
    assert metrics["retrieval_time"] == 0.05


def test_empty_results():
    """Test evaluation with empty results."""
    evaluator = RAGEvaluator()
    retrieved = []
    relevant = ["doc1", "doc2"]
    assert evaluator.hit_at_k(retrieved, relevant, k=5) == 0.0
    assert evaluator.precision_at_k(retrieved, relevant, k=5) == 0.0
    assert evaluator.mrr(retrieved, relevant) == 0.0
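

# Optional convenience entry point (an addition, not part of the original suite):
# a minimal sketch that assumes this module sits where pytest can import the
# `core` package, so the tests can also be run directly with `python` instead of
# invoking `pytest` on the directory.
if __name__ == "__main__":
    pytest.main([__file__, "-v"])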