Spaces:
Sleeping
Sleeping
| from datasets import Dataset | |
| from ragas import evaluate | |
| from ragas.llms import llm_factory | |
| from ragas.metrics.collections import Faithfulness, ResponseRelevancy, ContextPrecision | |
| from src.agent import app | |
# Ragas 2025: Experiment-based factory
# Module-level judge LLM, built once at import time so every evaluate_agent()
# call reuses the same client. NOTE(review): llm_factory is called with only a
# model name — confirm the default provider resolves to Gemini in this setup.
judge_llm = llm_factory("gemini-2.0-flash")
def evaluate_agent(questions: list, references: list):
    """Score the RAG agent's answers against reference answers with Ragas.

    For each (question, reference) pair the agent is invoked once; the final
    message content is taken as the response, and the content of any message
    carrying a ``tool_call_id`` attribute (tool/retriever messages) is
    collected as the retrieved contexts. The assembled records are then
    scored by an LLM judge on Faithfulness, ResponseRelevancy, and
    ContextPrecision.

    Args:
        questions: User questions to send to the agent.
        references: Ground-truth answers, one per question.

    Returns:
        The result object produced by ``ragas.evaluate`` for the dataset.

    Raises:
        ValueError: If ``questions`` and ``references`` differ in length.
            (The previous ``zip`` silently truncated the longer list, which
            would quietly drop data and corrupt the evaluation.)
    """
    if len(questions) != len(references):
        raise ValueError(
            f"questions ({len(questions)}) and references ({len(references)}) "
            "must have the same length"
        )
    records = []
    for question, reference in zip(questions, references):
        output = app.invoke({"messages": [("user", question)]})
        records.append({
            "user_input": question,
            "response": output["messages"][-1].content,
            # Messages with a tool_call_id are tool results — i.e. what the
            # retriever returned for this question.
            "retrieved_contexts": [
                m.content for m in output["messages"] if hasattr(m, "tool_call_id")
            ],
            "reference": reference,
        })
    dataset = Dataset.from_list(records)
    metrics = [Faithfulness(), ResponseRelevancy(), ContextPrecision()]
    # Evaluate with the module-level Gemini judge.
    return evaluate(dataset=dataset, metrics=metrics, llm=judge_llm)