"""
RAGAS Evaluator - Core evaluation logic using RAGAS framework
"""
import hashlib
import logging
import os
import time
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional

# LangChain for LLM wrapper (RAGAS requirement)
from langchain_groq import ChatGroq

# RAGAS imports
from ragas.dataset_schema import SingleTurnSample
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import (
    Faithfulness,
    LLMContextPrecisionWithoutReference,
    ResponseRelevancy,
)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
@dataclass
class RagasEvaluationResult:
    """Result from RAGAS evaluation of a single query/answer pair."""

    eval_id: str
    query: str

    # RAGAS metrics (0-1 scale)
    faithfulness: float
    answer_relevancy: float
    context_precision: float

    # Composite score, derived in __post_init__
    ragas_score: float = 0.0

    # Metadata
    latency_ms: float = 0.0
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())

    def __post_init__(self) -> None:
        """Derive the composite RAGAS score.

        Averages faithfulness and context precision; answer_relevancy is
        deliberately excluded from the composite (the matching metric is
        disabled in the evaluator).
        """
        # NOTE(review): a score of exactly 0.0 is treated as "missing" and
        # dropped from the average — confirm that is the intended semantics.
        components = (self.faithfulness, self.context_precision)
        counted = [score for score in components if score > 0]
        if counted:
            self.ragas_score = sum(counted) / len(counted)
        else:
            self.ragas_score = 0.0
class RagasEvaluator:
    """
    Evaluates RAG responses using RAGAS metrics.

    Metrics:
        - Faithfulness: Is the answer grounded in the context?
        - Answer Relevancy: Does the answer address the question? (currently disabled)
        - Context Precision: Are the retrieved chunks useful?
    """

    def __init__(self, groq_api_key: Optional[str] = None):
        """
        Initialize RAGAS evaluator.

        Args:
            groq_api_key: Your Groq API key (or uses GROQ_API_KEY env var)

        Raises:
            ValueError: If no API key is supplied and GROQ_API_KEY is unset.
        """
        api_key = groq_api_key or os.getenv("GROQ_API_KEY")
        if not api_key:
            raise ValueError("GROQ_API_KEY required")

        # RAGAS expects a LangChain-compatible wrapper around the judge LLM.
        llm = ChatGroq(
            api_key=api_key,
            model_name="llama-3.3-70b-versatile",
            temperature=0,  # deterministic judgments
        )
        self.evaluator_llm = LangchainLLMWrapper(llm)

        self.faithfulness = Faithfulness(llm=self.evaluator_llm)
        # Answer relevancy is disabled — presumably to limit judge-LLM calls
        # (TODO confirm). Re-enable here and in evaluate_single() if needed.
        # self.answer_relevancy = ResponseRelevancy(llm=self.evaluator_llm)
        self.context_precision = LLMContextPrecisionWithoutReference(llm=self.evaluator_llm)

        # Storage for results
        self.results: List[RagasEvaluationResult] = []
        logger.info("✓ RAGAS Evaluator initialized (Faithfulness + Context Precision)")

    async def evaluate_single(
        self,
        query: str,
        answer: str,
        contexts: List[str],
        ground_truth: Optional[str] = None,
    ) -> RagasEvaluationResult:
        """
        Evaluate a single RAG response.

        Args:
            query: The user's question.
            answer: The generated answer to evaluate.
            contexts: Retrieved context chunks used to produce the answer.
            ground_truth: Optional reference answer (empty string if omitted).

        Returns:
            RagasEvaluationResult with per-metric scores, composite score,
            and latency. The result is also appended to self.results.
        """
        start_time = time.time()

        # 1. Package the interaction in RAGAS's single-turn sample schema.
        sample = SingleTurnSample(
            user_input=query,
            response=answer,
            retrieved_contexts=contexts,
            reference=ground_truth or "",
        )

        # 2. Score with each enabled metric (async).
        faithfulness_score = await self.faithfulness.single_turn_ascore(sample)
        # answer_relevancy_score = await self.answer_relevancy.single_turn_ascore(sample)
        context_precision_score = await self.context_precision.single_turn_ascore(sample)

        # 3. Wall-clock evaluation latency.
        latency_ms = (time.time() - start_time) * 1000

        # 4. Short fingerprint id (md5 used as a hash, not for security).
        eval_id = hashlib.md5(f"{query}{datetime.now().isoformat()}".encode()).hexdigest()[:8]

        # 5. Create and store result. answer_relevancy is fixed at 0.0 while
        # the metric is disabled; RagasEvaluationResult excludes it from the
        # composite score.
        result = RagasEvaluationResult(
            eval_id=eval_id,
            query=query,
            faithfulness=float(faithfulness_score),
            answer_relevancy=0.0,
            context_precision=float(context_precision_score),
            latency_ms=latency_ms,
        )
        self.results.append(result)
        logger.info(f"Evaluation complete: RAGAS score = {result.ragas_score:.3f}")
        return result