| """ | |
| RAG Evaluation Framework | |
| Comprehensive evaluation system for RAG applications including: | |
| - Retrieval quality metrics (Precision@K, Recall@K, MRR, NDCG) | |
| - Generation quality metrics (BLEU, ROUGE, BERTScore, Faithfulness) | |
| - System performance metrics (Latency, Throughput, Error rates) | |
| - User experience metrics (Satisfaction, Task completion) | |
| """ | |
| """Lazy evaluation exports to avoid heavy imports at test collection time. | |
| Some evaluation modules (BLEU/ROUGE/BERTScore) import large packages like | |
| `torch` which slow down pytest collection. Expose lightweight accessors that | |
| perform imports only when evaluation functions are actually used. | |
| """ | |
import importlib

from .core import BenchmarkResults, EvaluationMetrics, EvaluationResult

def _lazy_import(name: str):
    """Import an evaluation submodule on first use instead of at package import."""
    return importlib.import_module(f"src.evaluation.metrics.{name}")

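# Illustration: _lazy_import("generation_metrics") is equivalent to
# `import src.evaluation.metrics.generation_metrics as m`, except that the
# import cost (e.g. `torch`) is paid on the first accessor call rather than
# at package import / pytest collection time.
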
def get_generation_metrics():
    """Lazily import and return the generation-quality metric functions."""
    m = _lazy_import("generation_metrics")
    return (
        m.calculate_bert_score,
        m.calculate_bleu_score,
        m.calculate_faithfulness_score,
        m.calculate_rouge_scores,
    )

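# Usage sketch (hedged: the call signature below is an assumption, not the
# verified API of generation_metrics):
#     _, calculate_bleu_score, _, _ = get_generation_metrics()
#     bleu = calculate_bleu_score(prediction, reference)  # hypothetical signature
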
def get_retrieval_metrics():
    """Lazily import and return the retrieval-quality metric functions."""
    m = _lazy_import("retrieval_metrics")
    return m.mean_reciprocal_rank, m.ndcg_at_k, m.precision_at_k, m.recall_at_k

def get_system_metrics():
    """Lazily import and return the system-performance tracker classes."""
    m = _lazy_import("system_metrics")
    return m.ErrorTracker, m.LatencyTracker, m.ThroughputTracker

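# Usage sketch (hedged: the tracker constructors and methods live in
# system_metrics, not here, so the lines below are hypothetical):
#     ErrorTracker, LatencyTracker, ThroughputTracker = get_system_metrics()
#     latency = LatencyTracker()   # hypothetical: real constructor args unknown
#     latency.record(0.120)        # hypothetical method
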
def get_user_metrics():
    """Lazily import and return the user-experience tracker classes."""
    m = _lazy_import("user_metrics")
    return m.CitationAccuracyTracker, m.TaskCompletionTracker, m.UserSatisfactionTracker

def get_runner():
    """Lazily import and return the EvaluationRunner class."""
    m = _lazy_import("runner")
    return m.EvaluationRunner

__all__ = [
    "EvaluationMetrics",
    "EvaluationResult",
    "BenchmarkResults",
    "get_generation_metrics",
    "get_retrieval_metrics",
    "get_system_metrics",
    "get_user_metrics",
    "get_runner",
]
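
# End-to-end sketch (assuming this file is src/evaluation/__init__.py; adjust
# the import path otherwise). Importing the package stays cheap; the heavy
# imports happen at the accessor call:
#
#     from src.evaluation import get_retrieval_metrics
#     mrr, ndcg_at_k, precision_at_k, recall_at_k = get_retrieval_metrics()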