from datasets import Dataset
from ragas import evaluate
from ragas.llms import llm_factory
from ragas.metrics.collections import Faithfulness, ResponseRelevancy, ContextPrecision

from src.agent import app

# Judge model used by Ragas to score the RAG pipeline's outputs.
# Ragas 2025: experiment-based factory.
JUDGE_MODEL = "gemini-2.0-flash"
judge_llm = llm_factory(JUDGE_MODEL)


def evaluate_agent(questions: list, references: list):
    """Run the RAG agent over *questions* and score it with Ragas.

    For each question, the agent (``app``) is invoked, its final message is
    taken as the response, and any tool messages in the trace are collected
    as the retrieved contexts. The resulting records are evaluated with
    Faithfulness, ResponseRelevancy, and ContextPrecision using the Gemini
    judge LLM.

    Args:
        questions: User questions to send to the agent.
        references: Ground-truth reference answers, one per question.

    Returns:
        The Ragas evaluation result object produced by ``evaluate``.

    Raises:
        ValueError: If ``questions`` and ``references`` differ in length.
    """
    # zip() would silently drop trailing items on a length mismatch,
    # yielding a misleadingly partial evaluation — fail loudly instead.
    if len(questions) != len(references):
        raise ValueError(
            f"questions ({len(questions)}) and references "
            f"({len(references)}) must have the same length"
        )

    records = []
    for question, reference in zip(questions, references):
        output = app.invoke({"messages": [("user", question)]})
        records.append({
            "user_input": question,
            # The agent's final message is treated as its answer.
            "response": output["messages"][-1].content,
            # Tool messages carry a tool_call_id attribute; their contents
            # are the retrieved contexts. NOTE(review): assumes only tool
            # messages expose tool_call_id — confirm against the agent's
            # message types.
            "retrieved_contexts": [
                m.content
                for m in output["messages"]
                if hasattr(m, "tool_call_id")
            ],
            "reference": reference,
        })

    dataset = Dataset.from_list(records)
    metrics = [Faithfulness(), ResponseRelevancy(), ContextPrecision()]
    # Evaluate with the Gemini judge.
    return evaluate(dataset=dataset, metrics=metrics, llm=judge_llm)