| """Unified evaluation pipeline supporting both heuristic and GPT labeling approaches. | |
| This module provides a facade for evaluating RAG systems using either: | |
| 1. Rule-based TRACE metrics (fast, no LLM calls) | |
| 2. GPT labeling prompts (accurate, requires LLM calls) | |
| 3. Hybrid approach (combines both) | |
| """ | |

from typing import List, Dict, Optional, Literal
from dataclasses import asdict

from trace_evaluator import TRACEEvaluator, TRACEScores
from advanced_rag_evaluator import (
    AdvancedRAGEvaluator,
    AdvancedTRACEScores,
    RMSECalculator,
    AUCROCCalculator,
)
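# Note: TRACEScores, AdvancedTRACEScores, RMSECalculator and AUCROCCalculator
# are not referenced below; they appear to be imported as re-exports for
# downstream modules, so they are kept here.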


class UnifiedEvaluationPipeline:
    """Unified pipeline for RAG evaluation."""

    def __init__(self, llm_client=None, chunking_strategy: Optional[str] = None,
                 embedding_model: Optional[str] = None, chunk_size: Optional[int] = None,
                 chunk_overlap: Optional[int] = None):
        """Initialize the evaluation pipeline.

        Args:
            llm_client: LLM client for GPT labeling (not needed for the
                rule-based "trace" method)
            chunking_strategy: Chunking strategy used by the RAG system
            embedding_model: Embedding model used by the RAG system
            chunk_size: Chunk size used by the RAG system
            chunk_overlap: Chunk overlap used by the RAG system
        """
        self.llm_client = llm_client
        self.chunking_strategy = chunking_strategy
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Initialize both evaluators up front so any method can be chosen per call
        self.trace_evaluator = TRACEEvaluator(
            llm_client=llm_client,
            chunking_strategy=chunking_strategy,
            embedding_model=embedding_model,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        self.advanced_evaluator = AdvancedRAGEvaluator(
            llm_client=llm_client,
            chunking_strategy=chunking_strategy,
            embedding_model=embedding_model,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    @staticmethod
    def _unpack_advanced_result(result):
        """Normalize the advanced evaluator's return value.

        AdvancedRAGEvaluator.evaluate() may return either a bare scores
        object or a (scores, llm_request_info) tuple; always return a
        (scores, dict) pair.
        """
        if isinstance(result, tuple):
            return result
        return result, {}

    def evaluate(
        self,
        question: str,
        response: str,
        retrieved_documents: List[str],
        ground_truth: Optional[str] = None,
        method: Literal["trace", "gpt_labeling", "hybrid"] = "trace",
    ) -> Dict:
        """Evaluate a single RAG query result.

        Args:
            question: User question
            response: LLM response to evaluate
            retrieved_documents: Retrieved context passages
            ground_truth: Optional ground-truth answer
            method: Evaluation method ("trace", "gpt_labeling", or "hybrid")

        Returns:
            Dictionary with evaluation scores and details
        """
        if method == "trace":
            # Rule-based TRACE metrics
            scores = self.trace_evaluator.evaluate(
                question, response, retrieved_documents, ground_truth
            )
            return {
                "method": "trace_heuristics",
                "scores": asdict(scores),
                "description": "Rule-based TRACE metrics",
            }
        elif method == "gpt_labeling":
            # GPT labeling-based evaluation
            result = self.advanced_evaluator.evaluate(
                question, response, retrieved_documents, ground_truth
            )
            scores, llm_info = self._unpack_advanced_result(result)
            return {
                "method": "gpt_labeling_prompts",
                "scores": asdict(scores),
                "llm_request_info": llm_info,
                "description": "GPT-based sentence-level labeling",
            }
        elif method == "hybrid":
            # Combine both approaches
            trace_scores = self.trace_evaluator.evaluate(
                question, response, retrieved_documents, ground_truth
            )
            result = self.advanced_evaluator.evaluate(
                question, response, retrieved_documents, ground_truth
            )
            gpt_scores, llm_info = self._unpack_advanced_result(result)
            return {
                "method": "hybrid",
                "trace_scores": asdict(trace_scores),
                "gpt_scores": asdict(gpt_scores),
                "llm_request_info": llm_info,
                "description": "Combines rule-based and GPT-based evaluation",
            }
        else:
            raise ValueError(f"Unknown evaluation method: {method}")

    def evaluate_batch(
        self,
        test_cases: List[Dict],
        method: Literal["trace", "gpt_labeling", "hybrid"] = "trace",
    ) -> Dict:
        """Evaluate multiple test cases with the selected method.

        Args:
            test_cases: Test-case dictionaries in the format expected by the
                underlying evaluators' evaluate_batch methods
            method: Evaluation method ("trace", "gpt_labeling", or "hybrid")

        Returns:
            Aggregated evaluation results
        """
        if method == "trace":
            return self.trace_evaluator.evaluate_batch(test_cases)
        elif method == "gpt_labeling":
            return self.advanced_evaluator.evaluate_batch(test_cases)
        elif method == "hybrid":
            trace_results = self.trace_evaluator.evaluate_batch(test_cases)
            gpt_results = self.advanced_evaluator.evaluate_batch(test_cases)
            return {
                "method": "hybrid",
                "trace_results": trace_results,
                "gpt_results": gpt_results,
                "description": "Combined evaluation results",
            }
        else:
            raise ValueError(f"Unknown evaluation method: {method}")


def get_evaluation_methods() -> List[Dict]:
    """Get available evaluation methods and their descriptions.

    Returns:
        List of available methods with descriptions
    """
    return [
        {
            "id": "trace",
            "name": "TRACE Heuristics",
            "description": "Fast rule-based evaluation (no LLM calls)",
            "metrics": ["utilization", "relevance", "adherence", "completeness"],
            "speed": "Fast",
            "accuracy": "Good",
            "llm_required": False,
        },
        {
            "id": "gpt_labeling",
            "name": "GPT Labeling Prompts",
            "description": "Accurate sentence-level LLM-based labeling (RAGBench approach)",
            "metrics": ["context_relevance", "context_utilization", "completeness", "adherence"],
            "speed": "Slow",
            "accuracy": "Excellent",
            "llm_required": True,
        },
        {
            "id": "hybrid",
            "name": "Hybrid (TRACE + GPT)",
            "description": "Combines both approaches for comprehensive analysis",
            "metrics": ["All TRACE metrics + All GPT metrics"],
            "speed": "Very Slow",
            "accuracy": "Excellent",
            "llm_required": True,
        },
    ]
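

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): exercises the facade with the rule-based
# "trace" method, which needs no LLM client. The question, response,
# documents, and configuration values below are hypothetical placeholders,
# not part of the pipeline API; "gpt_labeling" and "hybrid" would additionally
# require passing a real llm_client to the constructor.
if __name__ == "__main__":
    # List the methods the pipeline advertises
    for info in get_evaluation_methods():
        print(f"{info['id']}: {info['description']} (speed: {info['speed']})")

    # Evaluate one hypothetical query with rule-based TRACE metrics
    pipeline = UnifiedEvaluationPipeline(
        chunking_strategy="fixed",           # assumed strategy name
        embedding_model="all-MiniLM-L6-v2",  # assumed model name
        chunk_size=512,
        chunk_overlap=64,
    )
    result = pipeline.evaluate(
        question="What is the capital of France?",
        response="The capital of France is Paris.",
        retrieved_documents=["Paris is the capital and largest city of France."],
        ground_truth="Paris",
        method="trace",
    )
    print(result["method"], result["scores"])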