# Rag-ag: src/eval.py
from datasets import Dataset
from ragas import evaluate
from ragas.llms import llm_factory
from ragas.metrics.collections import Faithfulness, ResponseRelevancy, ContextPrecision
from src.agent import app
# Judge LLM used to score all metrics; built with the Ragas llm_factory.
# Whether a Gemini model name is accepted here depends on the Ragas version.
judge_llm = llm_factory("gemini-2.0-flash")
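# If llm_factory in your Ragas version does not accept a Gemini model name, an
# equivalent (assuming langchain-google-genai is installed) is to wrap a
# LangChain chat model instead:
#   from ragas.llms import LangchainLLMWrapper
#   from langchain_google_genai import ChatGoogleGenerativeAI
#   judge_llm = LangchainLLMWrapper(ChatGoogleGenerativeAI(model="gemini-2.0-flash"))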
def evaluate_agent(questions: list, references: list):
"""MNC-grade verification of RAG pipeline quality."""
results = []
for q, r in zip(questions, references):
output = app.invoke({"messages": [("user", q)]})
results.append({
"user_input": q,
"response": output["messages"][-1].content,
"retrieved_contexts": [m.content for m in output["messages"] if hasattr(m, "tool_call_id")],
"reference": r
})
dataset = Dataset.from_list(results)
metrics = [Faithfulness(), ResponseRelevancy(), ContextPrecision()]
# Evaluate with Gemini judge
return evaluate(dataset=dataset, metrics=metrics, llm=judge_llm)
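# Minimal usage sketch; the question and reference below are illustrative
# placeholders, not project data.
if __name__ == "__main__":
    sample_questions = ["What retriever does the agent use?"]
    sample_references = ["The agent retrieves context from the indexed documents."]
    scores = evaluate_agent(sample_questions, sample_references)
    print(scores)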