# CapstoneRAG10 / evaluation_pipeline.py
"""Unified evaluation pipeline supporting both heuristic and GPT labeling approaches.
This module provides a facade for evaluating RAG systems using either:
1. Rule-based TRACE metrics (fast, no LLM calls)
2. GPT labeling prompts (accurate, requires LLM calls)
3. Hybrid approach (combines both)
"""
from typing import List, Dict, Optional, Literal
from dataclasses import asdict
from trace_evaluator import TRACEEvaluator, TRACEScores
from advanced_rag_evaluator import AdvancedRAGEvaluator, AdvancedTRACEScores, RMSECalculator, AUCROCCalculator
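
# Typical flow (illustrative sketch only; "my_llm_client", "q", "a", and "docs" are
# placeholders, and the method strings match the Literal options accepted by
# UnifiedEvaluationPipeline.evaluate below):
#
#     pipeline = UnifiedEvaluationPipeline(llm_client=my_llm_client)
#     fast = pipeline.evaluate(q, a, docs, method="trace")         # no LLM calls
#     deep = pipeline.evaluate(q, a, docs, method="gpt_labeling")  # requires llm_client
#     both = pipeline.evaluate(q, a, docs, method="hybrid")
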

class UnifiedEvaluationPipeline:
    """Unified pipeline for RAG evaluation."""

    def __init__(self, llm_client=None, chunking_strategy: Optional[str] = None,
                 embedding_model: Optional[str] = None, chunk_size: Optional[int] = None,
                 chunk_overlap: Optional[int] = None):
        """Initialize the evaluation pipeline.

        Args:
            llm_client: LLM client for GPT labeling
            chunking_strategy: Chunking strategy used
            embedding_model: Embedding model used
            chunk_size: Chunk size used
            chunk_overlap: Chunk overlap used
        """
        self.llm_client = llm_client
        self.chunking_strategy = chunking_strategy
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Initialize both evaluators
        self.trace_evaluator = TRACEEvaluator(
            llm_client=llm_client,
            chunking_strategy=chunking_strategy,
            embedding_model=embedding_model,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        self.advanced_evaluator = AdvancedRAGEvaluator(
            llm_client=llm_client,
            chunking_strategy=chunking_strategy,
            embedding_model=embedding_model,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
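    # Construction sketch (illustrative only; the strategy and model names below are
    # placeholders, not values required by this module):
    #
    #     pipeline = UnifiedEvaluationPipeline(
    #         llm_client=None,               # heuristic-only evaluation works without an LLM
    #         chunking_strategy="recursive",
    #         embedding_model="text-embedding-3-small",
    #         chunk_size=512,
    #         chunk_overlap=64,
    #     )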
    def evaluate(
        self,
        question: str,
        response: str,
        retrieved_documents: List[str],
        ground_truth: Optional[str] = None,
        method: Literal["trace", "gpt_labeling", "hybrid"] = "trace"
    ) -> Dict:
        """Evaluate a single RAG query result.

        Args:
            question: User question
            response: LLM response
            retrieved_documents: Retrieved context
            ground_truth: Optional ground truth
            method: Evaluation method ("trace", "gpt_labeling", or "hybrid")

        Returns:
            Dictionary with evaluation scores and details
        """
        if method == "trace":
            # Rule-based TRACE metrics
            scores = self.trace_evaluator.evaluate(
                question, response, retrieved_documents, ground_truth
            )
            return {
                "method": "trace_heuristics",
                "scores": asdict(scores),
                "description": "Rule-based TRACE metrics"
            }
        elif method == "gpt_labeling":
            # GPT labeling-based evaluation
            result = self.advanced_evaluator.evaluate(
                question, response, retrieved_documents, ground_truth
            )
            # Handle both tuple (scores, llm_info) and single-score returns
            if isinstance(result, tuple):
                scores, llm_info = result
            else:
                scores = result
                llm_info = {}
            return {
                "method": "gpt_labeling_prompts",
                "scores": asdict(scores),
                "llm_request_info": llm_info,
                "description": "GPT-based sentence-level labeling"
            }
        elif method == "hybrid":
            # Combine both approaches
            trace_scores = self.trace_evaluator.evaluate(
                question, response, retrieved_documents, ground_truth
            )
            result = self.advanced_evaluator.evaluate(
                question, response, retrieved_documents, ground_truth
            )
            # Handle both tuple (scores, llm_info) and single-score returns
            if isinstance(result, tuple):
                gpt_scores, llm_info = result
            else:
                gpt_scores = result
                llm_info = {}
            return {
                "method": "hybrid",
                "trace_scores": asdict(trace_scores),
                "gpt_scores": asdict(gpt_scores),
                "llm_request_info": llm_info,
                "description": "Combines rule-based and GPT-based evaluation"
            }
        else:
            raise ValueError(f"Unknown evaluation method: {method}")
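    # Result-shape sketch (hypothetical caller code; the keys match the dictionaries
    # returned above):
    #
    #     result = pipeline.evaluate(question, response, docs, method="hybrid")
    #     if result["method"] == "hybrid":
    #         trace, gpt = result["trace_scores"], result["gpt_scores"]
    #     else:
    #         scores = result["scores"]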
    def evaluate_batch(
        self,
        test_cases: List[Dict],
        method: Literal["trace", "gpt_labeling", "hybrid"] = "trace"
    ) -> Dict:
        """Evaluate multiple test cases.

        Args:
            test_cases: List of test cases
            method: Evaluation method

        Returns:
            Aggregated evaluation results
        """
        if method == "trace":
            return self.trace_evaluator.evaluate_batch(test_cases)
        elif method == "gpt_labeling":
            return self.advanced_evaluator.evaluate_batch(test_cases)
        elif method == "hybrid":
            trace_results = self.trace_evaluator.evaluate_batch(test_cases)
            gpt_results = self.advanced_evaluator.evaluate_batch(test_cases)
            return {
                "method": "hybrid",
                "trace_results": trace_results,
                "gpt_results": gpt_results,
                "description": "Combined evaluation results"
            }
        else:
            raise ValueError(f"Unknown evaluation method: {method}")
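    # Batch-input sketch: the exact keys each test case needs are defined by the
    # underlying evaluators' evaluate_batch() implementations, so the field names
    # below are assumptions that mirror evaluate()'s parameters:
    #
    #     test_cases = [
    #         {
    #             "question": "...",
    #             "response": "...",
    #             "retrieved_documents": ["..."],
    #             "ground_truth": None,
    #         },
    #     ]
    #     summary = pipeline.evaluate_batch(test_cases, method="trace")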
    @staticmethod
    def get_evaluation_methods() -> List[Dict]:
        """Get available evaluation methods and their descriptions.

        Returns:
            List of available methods with descriptions
        """
        return [
            {
                "id": "trace",
                "name": "TRACE Heuristics",
                "description": "Fast rule-based evaluation (no LLM calls)",
                "metrics": ["utilization", "relevance", "adherence", "completeness"],
                "speed": "Fast",
                "accuracy": "Good",
                "llm_required": False
            },
            {
                "id": "gpt_labeling",
                "name": "GPT Labeling Prompts",
                "description": "Accurate sentence-level LLM-based labeling (RAGBench approach)",
                "metrics": ["context_relevance", "context_utilization", "completeness", "adherence"],
                "speed": "Slow",
                "accuracy": "Excellent",
                "llm_required": True
            },
            {
                "id": "hybrid",
                "name": "Hybrid (TRACE + GPT)",
                "description": "Combines both approaches for comprehensive analysis",
                "metrics": ["All TRACE metrics + All GPT metrics"],
                "speed": "Very Slow",
                "accuracy": "Excellent",
                "llm_required": True
            }
        ]
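

# Minimal self-check sketch (illustrative addition, not part of the original module):
# lists the available methods without constructing a pipeline or making any LLM calls.
if __name__ == "__main__":
    for method_info in UnifiedEvaluationPipeline.get_evaluation_methods():
        print(f"{method_info['id']}: {method_info['description']} "
              f"(LLM required: {method_info['llm_required']})")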