"""
RAGAS Evaluator - Core evaluation logic using RAGAS framework
"""
import os
import time
import hashlib
import logging
from typing import List, Optional
from dataclasses import dataclass, field
from datetime import datetime

# RAGAS imports
from ragas.metrics import (
    Faithfulness,
    ResponseRelevancy,
    LLMContextPrecisionWithoutReference,
)
from ragas.llms import LangchainLLMWrapper
from ragas.dataset_schema import SingleTurnSample

# LangChain for LLM wrapper (RAGAS requirement)
from langchain_groq import ChatGroq

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@dataclass
class RagasEvaluationResult:
    """Result from RAGAS evaluation."""
    eval_id: str
    query: str
    
    # RAGAS metrics (0-1 scale)
    faithfulness: float
    answer_relevancy: float
    context_precision: float
    
    # Composite score
    ragas_score: float = 0.0
    
    # Metadata
    latency_ms: float = 0.0
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())
    
    def __post_init__(self):
        """Calculate composite RAGAS score."""
        scores = [self.faithfulness, self.context_precision]
        valid_scores = [s for s in scores if s > 0]
        self.ragas_score = sum(valid_scores) / len(valid_scores) if valid_scores else 0.0


class RagasEvaluator:
    """
    Evaluates RAG responses using RAGAS metrics.
    
    Metrics:
    - Faithfulness: Is the answer grounded in the retrieved context?
    - Context Precision: Are the retrieved chunks relevant to the question?
    - Answer Relevancy (currently disabled): Does the answer address the question?
    """
    
    def __init__(self, groq_api_key: Optional[str] = None):
        """
        Initialize RAGAS evaluator.
        
        Args:
            groq_api_key: Groq API key; falls back to the GROQ_API_KEY env var.
        """
        # Resolve the API key from the argument or the environment
        api_key = groq_api_key or os.getenv("GROQ_API_KEY")
        if not api_key:
            raise ValueError("Groq API key required: pass groq_api_key or set GROQ_API_KEY")
        llm = ChatGroq(
            api_key=api_key,
            model_name="llama-3.3-70b-versatile",
            temperature=0
        )

        # RAGAS metrics require a LangChain-compatible LLM wrapper
        self.evaluator_llm = LangchainLLMWrapper(llm)

        self.faithfulness = Faithfulness(llm=self.evaluator_llm)
        # Answer relevancy is disabled: ResponseRelevancy also needs an
        # embeddings model, which this evaluator does not configure.
        # self.answer_relevancy = ResponseRelevancy(llm=self.evaluator_llm)
        self.context_precision = LLMContextPrecisionWithoutReference(llm=self.evaluator_llm)
        
        # Storage for results
        self.results: List[RagasEvaluationResult] = []
        
        logger.info("✓ RAGAS Evaluator initialized (Faithfulness + Context Precision)")
    
    async def evaluate_single(
        self,
        query: str,
        answer: str,
        contexts: List[str],
        ground_truth: Optional[str] = None
    ) -> RagasEvaluationResult:
        """
        Evaluate a single RAG response.
        """
        import time
        import hashlib
    
        start_time = time.time()  
        
        # 1. Create SingleTurnSample
        sample = SingleTurnSample(
            user_input=query,
            response=answer,
            retrieved_contexts=contexts,
            reference=ground_truth or ""
        )

        # 2. Score with each active metric (async)
        faithfulness_score = await self.faithfulness.single_turn_ascore(sample)
        context_precision_score = await self.context_precision.single_turn_ascore(sample)
        # answer_relevancy_score = await self.answer_relevancy.single_turn_ascore(sample)
        
        # 3. Calculate latency
        latency_ms = (time.time() - start_time) * 1000

        # 4. Generate eval_id
        eval_id = hashlib.md5(f"{query}{datetime.now().isoformat()}".encode()).hexdigest()[:8]
        
        # 5. Create and store result
        result = RagasEvaluationResult(
            eval_id=eval_id,
            query=query,
            faithfulness=float(faithfulness_score),
            answer_relevancy=0.0,  # metric disabled; see __init__
            context_precision=float(context_precision_score),
            latency_ms=latency_ms,
        )
        
        self.results.append(result)
        
        logger.info(f"Evaluation complete: RAGAS score = {result.ragas_score:.3f}")
        
        return result
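

# --- Usage sketch ---
# A minimal, hypothetical example of driving the evaluator end to end.
# The query/answer/contexts below are made-up placeholders, and GROQ_API_KEY
# must be set in the environment for the underlying LLM calls to succeed.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        evaluator = RagasEvaluator()
        result = await evaluator.evaluate_single(
            query="What is the capital of France?",
            answer="Paris is the capital of France.",
            contexts=["Paris is the capital and most populous city of France."],
        )
        print(
            f"[{result.eval_id}] faithfulness={result.faithfulness:.3f} "
            f"context_precision={result.context_precision:.3f} "
            f"ragas_score={result.ragas_score:.3f}"
        )

    asyncio.run(_demo())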