# AraRAG / evaluate.py
# Author: Yaser Abdelaziz
# Last commit: "Change tokenizer" (a936ca4)
import os
import json
import cohere
import pandas as pd
from dotenv import load_dotenv
load_dotenv(override=True)
from rag import RAG
from prompts import eval_preamble, eval_message
def evaluate(co, question, agent_answer, ground_truth):
    """Ask the Cohere chat model to grade one agent answer.

    The prompt is built from ``eval_message`` (filled with the question, the
    agent's answer and the ground truth) and ``eval_preamble`` is supplied as
    a system entry in the chat history. The model reply is expected to hold a
    JSON object with the keys ``correctness`` and ``correctness_reason``.

    Args:
        co: Client object exposing ``chat(...)`` whose result has a ``text``
            attribute (the script passes a ``cohere.Client``).
        question: The question that was asked.
        agent_answer: The answer produced by the agent under evaluation.
        ground_truth: The reference answer to grade against.

    Returns:
        A ``(correctness, correctness_reason)`` tuple taken verbatim from the
        JSON object in the model reply.

    Raises:
        json.JSONDecodeError: If the reply does not contain valid JSON.
        KeyError: If the JSON object lacks one of the expected keys.
    """
    response = co.chat(
        model='command-r',
        message=eval_message.format(
            question=question,
            agent_answer=agent_answer,
            ground_truth=ground_truth,
        ),
        temperature=0.0,
        chat_history=[{"role": "system", "message": eval_preamble}],
        prompt_truncation='AUTO',
        connectors=[],
    )

    # The reply is usually wrapped in a Markdown fence (```json ... ```), but
    # the exact wrapper is not guaranteed. Instead of slicing fixed offsets,
    # keep everything from the first '{' to the last '}' so bare JSON, fences
    # without a language tag and stray surrounding whitespace all parse.
    raw_text = response.text.strip()
    start = raw_text.find("{")
    end = raw_text.rfind("}")
    if 0 <= start < end:
        payload = raw_text[start:end + 1]
    else:
        # No object delimiters found: let json.loads raise its usual error.
        payload = raw_text

    json_response = json.loads(payload)
    correctness = json_response['correctness']
    correctness_reason = json_response['correctness_reason']
    return correctness, correctness_reason
if __name__ == '__main__':
    # Script entry point: answer every question in the evaluation set with
    # the RAG pipeline, grade each answer, save the graded rows, and print
    # the mean of the "correctness" column.
    co = cohere.Client(os.getenv('COHERE_API_KEY'))
    rag = RAG()

    def answer_question(question):
        """Return the first item produced by the RAG pipeline for a question."""
        return list(rag(question))[0]

    def df_evaluate(row):
        """Grade one row and store the verdict and its reason on that row."""
        row["correctness"], row["correctness_reason"] = evaluate(
            co,
            row["question"],
            row["agent_answer"],
            row["answer"],
        )
        return row

    # Load the questions and reference answers.
    eval_df = pd.read_csv("eval/eval_set.csv")

    # Generate the agent's answer for each question, then grade row by row.
    eval_df["agent_answer"] = eval_df["question"].apply(answer_question)
    eval_df = eval_df.apply(df_evaluate, axis=1)

    # Persist the graded rows before reporting the aggregate score.
    eval_df.to_csv("eval/eval_set_results.csv", index=False)

    accuracy = eval_df["correctness"].mean()
    print(f"Accuracy: {accuracy}")