import os import json import cohere import pandas as pd from dotenv import load_dotenv load_dotenv(override=True) from rag import RAG from prompts import eval_preamble, eval_message def evaluate(co, question, agent_answer, ground_truth): response = co.chat( model='command-r', message=eval_message.format(question=question, agent_answer=agent_answer, ground_truth=ground_truth), temperature=0.0, chat_history=[{"role": "system", "message": eval_preamble}], prompt_truncation='AUTO', connectors=[] ) json_response = json.loads(response.text[8:-4]) correctness = json_response['correctness'] correctness_reason = json_response['correctness_reason'] return correctness, correctness_reason if __name__ == '__main__': co = cohere.Client(os.getenv('COHERE_API_KEY')) rag = RAG() def df_evaluate(row): correctness, correctness_reason = evaluate( co, row["question"], row["agent_answer"], row["answer"] ) row["correctness"] = correctness row["correctness_reason"] = correctness_reason return row eval_df = pd.read_csv("eval/eval_set.csv") eval_df["agent_answer"] = eval_df["question"].apply(lambda x: list(rag(x))[0]) eval_df = eval_df.apply(df_evaluate, axis=1) eval_df.to_csv("eval/eval_set_results.csv", index=False) accuracy = eval_df["correctness"].mean() print(f"Accuracy: {accuracy}")