File size: 2,969 Bytes
c003cc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
from typing import Optional
from dotenv import load_dotenv

load_dotenv()


def evaluate_answer(
    question: str,
    answer: str,
    contexts: list[str],
) -> Optional[dict]:
    """
    Evaluate a single RAG answer using RAGAS metrics.

    Runs three metrics:
      - Faithfulness: Does the answer only say things supported by the chunks?
      - Answer Relevancy: Does the answer actually address the question?
      - Context Precision: Were the retrieved chunks relevant to the question?

    The judge LLM and the embeddings are both reached through the OpenRouter
    endpoint, configured from the OPENROUTER_MODEL and OPENROUTER_API_KEY
    environment variables.

    Args:
        question: The user's original question
        answer:   The answer generated by the RAG pipeline
        contexts: List of text chunks that were retrieved from ChromaDB

    Returns:
        A dict with the keys "faithfulness", "answer_relevancy" and
        "context_precision", each a float rounded to two decimals. A metric
        that is missing from the result, or that RAGAS could not compute
        (reported as NaN), is returned as 0.0. Returns None when the
        evaluation itself raises, including when one of the optional
        packages imported below is not installed; the error is printed.
    """
    # Imported locally, like the third-party packages below.
    import math

    metric_names = ("faithfulness", "answer_relevancy", "context_precision")

    try:
        # Heavy optional dependencies are imported lazily so that a missing
        # package ends up in the except branch instead of breaking module import.
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import (
            faithfulness,
            answer_relevancy,
            context_precision,
        )
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        ragas_llm = ChatOpenAI(
            model=os.getenv("OPENROUTER_MODEL", "anthropic/claude-3-haiku"),
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
            temperature=0,  # deterministic judging
        )

        ragas_embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
        )

        # RAGAS expects data in Dataset format: one row per question.
        data = {
            "question": [question],
            "answer": [answer],
            # contexts must be a list of lists (one list of chunks per question)
            "contexts": [contexts],
            # ground_truth is optional, we skip it since we have no labeled data
            # NOTE(review): context_precision is normally scored against the
            # ground truth - confirm an empty reference gives meaningful values.
            "ground_truth": [""],
        }

        dataset = Dataset.from_dict(data)

        # Run evaluation
        result = evaluate(
            dataset=dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
            ],
            llm=ragas_llm,
            embeddings=ragas_embeddings,
            raise_exceptions=False,
        )

        # Extract scores: the dataset has exactly one row, so take row 0.
        scores = result.to_pandas().iloc[0].to_dict()

        cleaned = {}
        for name in metric_names:
            value = float(scores.get(name, 0))
            # With raise_exceptions=False a failed metric comes back as NaN.
            # NaN cannot be turned into a bar/int and is not valid JSON, so it
            # is reported as 0.0, the same default used for a missing metric.
            cleaned[name] = 0.0 if math.isnan(value) else round(value, 2)
        return cleaned

    except Exception as e:
        # Deliberate best-effort: evaluation must never break the caller.
        print("RAGAS evaluation error: " + str(e))
        return None



def get_score_emoji(score: float) -> str:
    """
    Map a metric score to a traffic-light emoji.

    Args:
        score: Metric value to classify.

    Returns:
        Green circle for scores of at least 0.75, yellow circle for scores of
        at least 0.5, red circle for everything else.
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    bands = ((0.75, "🟢"), (0.5, "🟡"))
    for lower_bound, emoji in bands:
        if score >= lower_bound:
            return emoji
    return "🔴"


def format_score_bar(score: float, width: int = 10) -> str:
    """
    Render a score as a fixed-width text progress bar.

    Args:
        score: Metric value, expected between 0 and 1. Values below 0 and NaN
               draw an empty bar; values above 1 draw a full bar.
        width: Total number of characters in the bar.

    Returns:
        A string of filled blocks followed by empty blocks. For a
        non-negative width it is always exactly `width` characters long.
    """
    # `not score > 0` is true for negatives, zero AND NaN, so a metric that
    # failed to compute yields an empty bar instead of crashing in int().
    if not score > 0:
        fraction = 0.0
    else:
        # Cap at 1 so the bar can never grow past `width`.
        fraction = min(score, 1.0)

    filled = int(fraction * width)  # truncates: a partial cell stays empty
    return "█" * filled + "░" * (width - filled)