"""Tests for evaluation metrics."""
import pytest
from core.eval import RAGEvaluator, BenchmarkDataset
from core.eval_utils import generate_evaluation_report
import pandas as pd
from pathlib import Path
import tempfile


def test_hit_at_k():
    """Test Hit@k metric."""
    evaluator = RAGEvaluator()
    retrieved = ["doc1", "doc2", "doc3", "doc4", "doc5"]
    relevant = ["doc2", "doc6"]
    # "doc2" is the only relevant document retrieved, at rank 2:
    # it falls inside the top 5 but outside the top 1.
    assert evaluator.hit_at_k(retrieved, relevant, k=5) == 1.0
    assert evaluator.hit_at_k(retrieved, relevant, k=1) == 0.0
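

# Supplementary, illustrative sketch (not part of the original suite): the same
# data exercised across several cutoffs, assuming hit_at_k uses the conventional
# inclusive top-k cutoff (the first relevant document, "doc2", sits at rank 2).
@pytest.mark.parametrize("k, expected", [(1, 0.0), (2, 1.0), (5, 1.0)])
def test_hit_at_k_cutoffs(k, expected):
    evaluator = RAGEvaluator()
    retrieved = ["doc1", "doc2", "doc3", "doc4", "doc5"]
    relevant = ["doc2", "doc6"]
    assert evaluator.hit_at_k(retrieved, relevant, k=k) == expected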


def test_precision_recall():
    """Test Precision and Recall metrics."""
    evaluator = RAGEvaluator()
    retrieved = ["doc1", "doc2", "doc3"]
    relevant = ["doc2", "doc3", "doc4"]
    precision = evaluator.precision_at_k(retrieved, relevant, k=3)
    recall = evaluator.recall_at_k(retrieved, relevant, k=3)
    # 2 of the 3 retrieved documents are relevant, and 2 of the 3 relevant
    # documents are retrieved, so both metrics equal 2/3.
    assert precision == pytest.approx(2/3, 0.01)
    assert recall == pytest.approx(2/3, 0.01)


def test_mrr():
    """Test Mean Reciprocal Rank."""
    evaluator = RAGEvaluator()
    retrieved = ["doc1", "doc2", "doc3"]
    relevant = ["doc2"]
    mrr = evaluator.mrr(retrieved, relevant)
    # The first (and only) relevant document is ranked 2nd, so the reciprocal rank is 1/2.
    assert mrr == pytest.approx(0.5, 0.01)


def test_semantic_similarity():
    """Test semantic similarity metric."""
    evaluator = RAGEvaluator()
    answer = "The patient needs proper identification."
    reference = "Patient identification is required."
    similarity = evaluator.semantic_similarity(answer, reference)
    assert 0 <= similarity <= 1
    assert similarity > 0.5  # Should be reasonably similar


def test_benchmark_dataset():
    """Test benchmark dataset functionality."""
    dataset = BenchmarkDataset()
    hospital_queries = dataset.get_sample_hospital_queries()
    bank_queries = dataset.get_sample_bank_queries()
    fluid_queries = dataset.get_sample_fluid_simulation_queries()
    assert len(hospital_queries) > 0
    assert len(bank_queries) > 0
    assert len(fluid_queries) > 0
    assert "query" in hospital_queries[0]
    assert "domain" in hospital_queries[0]


def test_evaluation_report_generation():
    """Test evaluation report generation."""
    # Create sample evaluation data
    data = {
        'query': ['Query 1', 'Query 2', 'Query 3'],
        'base_retrieval_time': [0.05, 0.06, 0.04],
        'base_total_time': [1.5, 2.0, 1.8],
        'hier_retrieval_time': [0.03, 0.04, 0.03],
        'hier_total_time': [1.0, 1.2, 1.1],
        'speedup': [1.5, 1.67, 1.64]
    }
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save sample CSV
        csv_path = Path(temp_dir) / "test_eval.csv"
        df = pd.DataFrame(data)
        df.to_csv(csv_path, index=False)
        # Generate report
        stats = generate_evaluation_report(str(csv_path))
        # Verify statistics
        assert stats['total_queries'] == 3
        assert stats['avg_speedup'] > 1.0
        assert stats['hier_wins'] == 3
        # Verify files created
        assert Path(str(csv_path).replace('.csv', '_report_charts.png')).exists()
        assert Path(str(csv_path).replace('.csv', '_report_summary.md')).exists()


def test_evaluate_rag_pipeline():
    """Test complete RAG pipeline evaluation."""
    evaluator = RAGEvaluator()
    # Mock RAG result
    rag_result = {
        "answer": "Patient admission requires ID verification.",
        "contexts": [
            {"id": "doc1", "document": "Text about admission", "metadata": {}},
            {"id": "doc2", "document": "Text about verification", "metadata": {}}
        ],
        "retrieval_time": 0.05,
        "generation_time": 1.2,
        "total_time": 1.25,
        "pipeline": "Hier-RAG"
    }
    relevant_ids = ["doc1", "doc3"]
    reference_answer = "Admission requires identification."
    metrics = evaluator.evaluate_rag_pipeline(
        rag_result,
        relevant_ids,
        reference_answer,
        k_values=[1, 3, 5]
    )
    assert "hit@1" in metrics
    assert "hit@3" in metrics
    assert "mrr" in metrics
    assert "semantic_similarity" in metrics
    assert metrics["retrieval_time"] == 0.05


def test_empty_results():
    """Test evaluation with empty results."""
    evaluator = RAGEvaluator()
    retrieved = []
    relevant = ["doc1", "doc2"]
    # With nothing retrieved, every retrieval metric should degrade gracefully to 0.
    assert evaluator.hit_at_k(retrieved, relevant, k=5) == 0.0
    assert evaluator.precision_at_k(retrieved, relevant, k=5) == 0.0
    assert evaluator.mrr(retrieved, relevant) == 0.0
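

# Supplementary sketch (not part of the original suite): precision/recall when every
# retrieved document is relevant but one relevant document is missed. The expected
# values assume the conventional definitions (precision@k divides by k, recall@k
# divides by the total number of relevant documents).
def test_precision_recall_partial_coverage():
    evaluator = RAGEvaluator()
    retrieved = ["doc1", "doc2", "doc3"]
    relevant = ["doc1", "doc2", "doc3", "doc4"]
    # All 3 retrieved documents are relevant; 3 of the 4 relevant documents are retrieved.
    assert evaluator.precision_at_k(retrieved, relevant, k=3) == pytest.approx(1.0, 0.01)
    assert evaluator.recall_at_k(retrieved, relevant, k=3) == pytest.approx(0.75, 0.01)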