# chatpaper/src/evaluation/ragas_eval.py
import math
import os
from typing import Optional

from dotenv import load_dotenv

load_dotenv()


def evaluate_answer(
    question: str,
    answer: str,
    contexts: list[str],
) -> Optional[dict]:
"""
Evaluate a RAG answer using RAGAS metrics.
Runs three metrics:
- Faithfulness: Does the answer only say things supported by the chunks?
- Answer Relevancy: Does the answer actually address the question?
- Context Precision: Were the retrieved chunks relevant to the question?
Args:
question: The user's original question
answer: The answer generated by the RAG pipeline
contexts: List of text chunks that were retrieved from ChromaDB
"""
    try:
        # Heavy dependencies are imported lazily, so they only load when an
        # evaluation is actually requested; any import or evaluation failure
        # is caught below and surfaces as a None return.
        from datasets import Dataset
        from ragas import evaluate
        from ragas.metrics import (
            faithfulness,
            answer_relevancy,
            context_precision,
        )
        from langchain_openai import ChatOpenAI, OpenAIEmbeddings

        # LLM that RAGAS uses as the judge model, routed through OpenRouter.
        ragas_llm = ChatOpenAI(
            model=os.getenv("OPENROUTER_MODEL", "anthropic/claude-3-haiku"),
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
            temperature=0,
        )
        # Embeddings used by answer_relevancy. Note this points the OpenAI
        # embeddings client at OpenRouter; if the router does not expose an
        # /embeddings endpoint, point api_key/base_url at OpenAI instead.
        ragas_embeddings = OpenAIEmbeddings(
            model="text-embedding-3-small",
            api_key=os.getenv("OPENROUTER_API_KEY"),
            base_url="https://openrouter.ai/api/v1",
        )
        # RAGAS expects data in Dataset format.
        data = {
            "question": [question],
            "answer": [answer],
            # contexts must be a list of lists (one list of chunks per question).
            "contexts": [contexts],
            # We have no labeled data, so ground_truth is left as an empty
            # string; the column itself is still expected by context_precision.
            "ground_truth": [""],
        }
        dataset = Dataset.from_dict(data)
        # Run evaluation.
        result = evaluate(
            dataset=dataset,
            metrics=[
                faithfulness,
                answer_relevancy,
                context_precision,
            ],
            llm=ragas_llm,
            embeddings=ragas_embeddings,
            raise_exceptions=False,
        )
        # Extract scores. With raise_exceptions=False a failed metric comes
        # back as NaN, so map NaN to 0.0 before rounding.
        scores = result.to_pandas().iloc[0].to_dict()

        def clean(value) -> float:
            value = float(value)
            return 0.0 if math.isnan(value) else value

        return {
            "faithfulness": round(clean(scores.get("faithfulness", 0.0)), 2),
            "answer_relevancy": round(clean(scores.get("answer_relevancy", 0.0)), 2),
            "context_precision": round(clean(scores.get("context_precision", 0.0)), 2),
        }
    except Exception as e:
        print(f"RAGAS evaluation error: {e}")
        return None
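

# A hypothetical convenience wrapper (not in the original module), shown to
# illustrate the call pattern: each row carries one question, one answer, and
# one list of retrieved chunks, matching the list-of-lists Dataset shape used
# above. A real batch run could instead build a single Dataset with all rows,
# since RAGAS scores whole datasets in one pass.
def evaluate_answers(rows: list[dict]) -> list[Optional[dict]]:
    """Evaluate several {"question", "answer", "contexts"} rows one by one."""
    return [
        evaluate_answer(row["question"], row["answer"], row["contexts"])
        for row in rows
    ]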


def get_score_emoji(score: float) -> str:
    """Map a 0-1 score to a traffic-light emoji."""
    if score >= 0.75:
        return "🟒"
    elif score >= 0.5:
        return "🟑"
    else:
        return "πŸ”΄"


def format_score_bar(score: float, width: int = 10) -> str:
    """Render a 0-1 score as a fixed-width block-character bar."""
    # Clamp so out-of-range scores cannot over- or under-fill the bar.
    filled = int(max(0.0, min(1.0, score)) * width)
    empty = width - filled
    return "β–ˆ" * filled + "β–‘" * empty