# aankitdas's picture
# feat: add RAGAS eval
# af2f8e1
"""
RAGAS Evaluator - Core evaluation logic using RAGAS framework
"""
import os
import logging
from typing import List, Dict, Any, Optional
from dataclasses import dataclass, field
from datetime import datetime
# RAGAS imports
from ragas.metrics import (
Faithfulness,
ResponseRelevancy,
LLMContextPrecisionWithoutReference,
)
from ragas.llms import LangchainLLMWrapper
from ragas.dataset_schema import SingleTurnSample
# LangChain for LLM wrapper (RAGAS requirement)
from langchain_groq import ChatGroq
# Configure the root logger at INFO level. This runs as a side effect of
# importing this module.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
@dataclass
class RagasEvaluationResult:
    """Result from RAGAS evaluation.

    ``ragas_score`` is a derived value: it is recomputed in ``__post_init__``
    from ``faithfulness`` and ``context_precision``, so any value passed to
    the constructor for it is overwritten. ``answer_relevancy`` is stored on
    the result but does not take part in the composite.
    """

    eval_id: str
    query: str
    # RAGAS metrics (0-1 scale)
    faithfulness: float
    answer_relevancy: float
    context_precision: float
    # Composite score (derived in __post_init__)
    ragas_score: float = 0.0
    # Metadata
    latency_ms: float = 0.0
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def __post_init__(self):
        """Calculate the composite RAGAS score.

        The composite is the arithmetic mean of ``faithfulness`` and
        ``context_precision``. A score of 0.0 is a legitimate value on the
        0-1 scale and is included in the mean; only NaN values (a metric
        that produced no usable number) are left out. If neither score is
        usable the composite is 0.0.
        """
        import math

        component_scores = (self.faithfulness, self.context_precision)
        # Filter on NaN rather than "> 0": dropping real zeros would inflate
        # the composite for answers that genuinely scored 0 on a metric.
        usable_scores = [s for s in component_scores if not math.isnan(s)]
        if usable_scores:
            self.ragas_score = sum(usable_scores) / len(usable_scores)
        else:
            self.ragas_score = 0.0
class RagasEvaluator:
    """
    Evaluates RAG responses using RAGAS metrics.

    Active metrics:
    - Faithfulness: Is the answer grounded in the context?
    - Context Precision: Are the retrieved chunks useful?

    Answer Relevancy (does the answer address the question?) is currently
    disabled: no scorer is created for it and every result reports
    ``answer_relevancy`` as 0.0.

    Every result returned by ``evaluate_single`` is also appended to
    ``self.results``.
    """

    def __init__(
        self,
        groq_api_key: Optional[str] = None,
        *,
        model_name: str = "llama-3.3-70b-versatile",
    ):
        """
        Initialize RAGAS evaluator.

        Args:
            groq_api_key: Groq API key. When omitted or empty, the
                GROQ_API_KEY environment variable is used instead.
            model_name: Groq model used as the judge LLM for all metrics.

        Raises:
            ValueError: If no API key is supplied and GROQ_API_KEY is unset
                or empty.
        """
        api_key = groq_api_key or os.getenv("GROQ_API_KEY")
        if not api_key:
            raise ValueError("GROQ_API_KEY required")

        # temperature=0 is passed so the judge model is as repeatable as the
        # backend allows.
        llm = ChatGroq(
            api_key=api_key,
            model_name=model_name,
            temperature=0,
        )
        self.evaluator_llm = LangchainLLMWrapper(llm)

        self.faithfulness = Faithfulness(llm=self.evaluator_llm)
        # NOTE(review): ResponseRelevancy is intentionally not instantiated
        # here; presumably it needs additional setup (e.g. an embeddings
        # model) - confirm before re-enabling.
        self.context_precision = LLMContextPrecisionWithoutReference(llm=self.evaluator_llm)

        # Storage for results
        self.results: List[RagasEvaluationResult] = []
        logger.info("✓ RAGAS Evaluator initialized (Faithfulness + Context Precision)")

    async def evaluate_single(
        self,
        query: str,
        answer: str,
        contexts: List[str],
        ground_truth: Optional[str] = None
    ) -> RagasEvaluationResult:
        """
        Evaluate a single RAG response.

        Args:
            query: The user question that was asked.
            answer: The generated response to score.
            contexts: The retrieved context passages handed to the sample.
            ground_truth: Optional reference answer; an empty string is
                passed to the sample when it is missing.

        Returns:
            A RagasEvaluationResult holding the faithfulness and context
            precision scores, a placeholder 0.0 for answer relevancy, and
            the elapsed scoring time in milliseconds. The same object is
            appended to ``self.results``.
        """
        import time
        import hashlib

        # perf_counter is monotonic, so the measured latency cannot be
        # distorted by system clock adjustments.
        started = time.perf_counter()

        # 1. Create SingleTurnSample
        sample = SingleTurnSample(
            user_input=query,
            response=answer,
            retrieved_contexts=contexts,
            reference=ground_truth or "",
        )

        # 2. Score with each active metric. The awaits run one after the
        #    other, so the two scorers are never in flight concurrently.
        faithfulness_score = await self.faithfulness.single_turn_ascore(sample)
        context_precision_score = await self.context_precision.single_turn_ascore(sample)

        # 3. Calculate latency
        latency_ms = (time.perf_counter() - started) * 1000

        # 4. Generate eval_id: first 8 hex chars of an MD5 over the query
        #    plus the current timestamp (an identifier, not a security hash).
        eval_id = hashlib.md5(f"{query}{datetime.now().isoformat()}".encode()).hexdigest()[:8]

        # 5. Create and store result
        result = RagasEvaluationResult(
            eval_id=eval_id,
            query=query,
            faithfulness=float(faithfulness_score),
            answer_relevancy=0.0,  # placeholder while Answer Relevancy is disabled
            context_precision=float(context_precision_score),
            latency_ms=latency_ms,
        )
        self.results.append(result)

        logger.info("Evaluation complete: RAGAS score = %.3f", result.ragas_score)
        return result