File size: 2,969 Bytes
c003cc2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | import os
from typing import Optional
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) at import time, so the
# os.getenv() lookups for the OpenRouter model and API key can find them.
load_dotenv()
def _normalize_score(value: object) -> float:
    """Return a raw metric value as a float rounded to two decimals.

    ``None`` and NaN are mapped to ``0.0``. RAGAS reports a metric that failed
    to compute as NaN when ``raise_exceptions=False``; passing NaN on would
    break downstream comparisons and ``int()`` conversions.
    """
    import math

    if value is None:
        return 0.0
    number = float(value)
    if math.isnan(number):
        return 0.0
    return round(number, 2)


def evaluate_answer(
    question: str,
    answer: str,
    contexts: list[str],
) -> Optional[dict]:
    """
    Evaluate a RAG answer using RAGAS metrics.

    Runs three metrics:
    - Faithfulness: Does the answer only say things supported by the chunks?
    - Answer Relevancy: Does the answer actually address the question?
    - Context Precision: Were the retrieved chunks relevant to the question?

    Args:
        question: The user's original question
        answer: The answer generated by the RAG pipeline
        contexts: List of text chunks that were retrieved from ChromaDB

    Returns:
        A dict with the keys ``"faithfulness"``, ``"answer_relevancy"`` and
        ``"context_precision"``, each a float rounded to two decimals. A metric
        that is missing from the result or could not be computed (NaN) is
        reported as ``0.0``. Returns ``None`` if the evaluation as a whole
        fails (missing dependency, bad credentials, network error, ...); the
        error is printed to stdout in that case.
    """
    try:
        # Imported lazily so that a missing optional dependency is reported
        # through the ``None`` return value instead of breaking module import.
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import (
            faithfulness,
            answer_relevancy,
            context_precision,
        )
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        # Both clients go through the same OpenRouter endpoint and key.
        api_key = os.getenv("OPENROUTER_API_KEY")
        base_url = "https://openrouter.ai/api/v1"

        ragas_llm = ChatOpenAI(
            model=os.getenv("OPENROUTER_MODEL", "anthropic/claude-3-haiku"),
            api_key=api_key,
            base_url=base_url,
            temperature=0,
        )
        ragas_embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            api_key=api_key,
            base_url=base_url,
        )

        # RAGAS expects data in Dataset format: one row per question.
        dataset = Dataset.from_dict(
            {
                "question": [question],
                "answer": [answer],
                # contexts must be a list of lists (one list of chunks per question)
                "contexts": [contexts],
                # No labeled data is available, so an empty ground truth is passed.
                # NOTE(review): context_precision may rely on ground_truth in some
                # RAGAS versions -- confirm the score is meaningful without it.
                "ground_truth": [""],
            }
        )

        # raise_exceptions=False: a failing metric does not abort the run; its
        # score is normalized to 0.0 when the result is read back.
        result = evaluate(
            dataset=dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
            ],
            llm=ragas_llm,
            embeddings=ragas_embeddings,
            raise_exceptions=False,
        )

        # Single-row dataset, so the first row holds all scores.
        scores = result.to_pandas().iloc[0].to_dict()
        metric_names = ("faithfulness", "answer_relevancy", "context_precision")
        return {name: _normalize_score(scores.get(name)) for name in metric_names}
    except Exception as e:
        # Best-effort: evaluation is auxiliary, so report and signal failure
        # with None rather than propagating to the caller.
        print(f"RAGAS evaluation error: {e}")
        return None
def get_score_emoji(score: float) -> str:
    """Map a score to a traffic-light emoji.

    Green for scores of at least 0.75, yellow for scores of at least 0.5,
    and red for everything else.
    """
    # Thresholds are checked from highest to lowest; the first match wins.
    bands = ((0.75, "🟢"), (0.5, "🟡"))
    for lower_bound, emoji in bands:
        if score >= lower_bound:
            return emoji
    return "🔴"
def format_score_bar(score: float, width: int = 10) -> str:
    """Render a score as a text progress bar exactly ``width`` characters long.

    Args:
        score: Value expected in the range 0..1. Values outside that range are
            clamped, and NaN is treated as 0, so the bar never overflows or
            raises.
        width: Total number of characters in the bar. A non-positive width
            yields an empty string.

    Returns:
        A string of filled blocks followed by empty blocks, e.g.
        ``"█████░░░░░"`` for a score of 0.5 and the default width.
    """
    import math

    if width <= 0:
        return ""
    # NaN cannot be clamped by min/max and makes int() raise, so treat it as 0.
    if math.isnan(score):
        score = 0.0
    clamped = min(max(score, 0.0), 1.0)
    # Truncate (not round) so a bar is only full when the score reaches 1.
    filled = int(clamped * width)
    return "█" * filled + "░" * (width - filled)
|