| import argparse
|
| import jsonlines
|
| import json
|
|
|
| from deepeval.models import OllamaModel
|
| from deepeval.metrics import (
|
| ContextualRelevancyMetric,
|
| ContextualRecallMetric,
|
| ContextualPrecisionMetric,
|
| AnswerRelevancyMetric,
|
| FaithfulnessMetric
|
| )
|
|
|
|
|
|
|
|
|
| from deepeval.test_case import LLMTestCase
|
| from deepeval.dataset import EvaluationDataset, Golden
|
|
|
| from deepeval import evaluate
|
| from deepeval.models import OllamaModel
|
| from transformers import AutoModelForCausalLM, AutoTokenizer
|
| from Llemma_Finetuned import Llemma_Finetuned
|
| import ollama
|
|
|
|
|
|
|
|
|
# Input files and model used by this script.
DATASET_PATH = "mse_text_img_QA_ds_test.jsonl"
EXAMPLE_ROWS_PATH = "dataset_row_stl.txt"
MODEL_NAME = "Hudson/llemma:7b"

# 1-based row numbers of the dataset that are never turned into test cases.
# NOTE(review): the reason these rows are excluded is not recorded in this
# file -- confirm with the dataset owner before changing the list.
EXCLUDED_ROWS = frozenset({
    9, 24, 26, 27, 33, 36, 37, 54, 55, 66, 69, 76, 77, 80, 81, 84, 87, 97,
    106, 115, 138,
})

FEW_SHOT_INSTRUCTION = (
    "Provide an answer (A) to the following math question (Q) in a similar "
    "manner to the previous example(s) given.\n\nQ: "
)


def parse_args(argv=None):
    """Parse the command line.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Namespace with ``num`` (int), ``shot`` (int) and ``dataset`` (str).
    """
    parser = argparse.ArgumentParser()
    # All three options are required: without them the script previously
    # failed with an opaque ``int(None)`` TypeError, or saved to a file
    # literally named "None".
    parser.add_argument("-n", "--num", type=int, required=True,
                        help="Number of test cases to use")
    parser.add_argument("-s", "--shot", type=int, required=True,
                        help="n-shot inference examples")
    parser.add_argument("-d", "--dataset", required=True,
                        help="Path to test case dataset")
    args = parser.parse_args(argv)
    if args.num < 0 or args.shot < 0:
        parser.error("--num and --shot must be non-negative")
    return args


def read_example_rows(path, limit):
    """Return the first ``limit`` row numbers listed in ``path``.

    The file holds one integer per line. Blank lines are ignored so that a
    trailing newline does not crash the integer conversion.
    """
    with open(path, "r") as handle:
        lines = [line for line in handle if line.strip()]
    return [int(line) for line in lines[:limit]]


def select_answers(answers):
    """Pick the reference answer and the answer it displaced.

    Answers are scanned in order. An answer becomes the current pick when it
    is the first one flagged ``accepted``, or when its score is strictly
    greater than the current pick's score.

    NOTE(review): a higher-scored non-accepted answer that appears after the
    accepted one replaces it. This mirrors the original selection rule --
    confirm that it is intended.

    Args:
        answers: Sequence of dicts with ``accepted``, ``score`` and ``body``.

    Returns:
        ``(best_body, previous_body)``. ``previous_body`` is the pick that
        was held immediately before the final one. Either is ``""`` when no
        such answer exists.
    """
    accepted_seen = False
    best_score = float("-inf")
    best_body = ""
    previous_body = ""
    for answer in answers:
        score = int(answer["score"])
        first_accepted = bool(answer["accepted"]) and not accepted_seen
        if first_accepted:
            accepted_seen = True
        if first_accepted or score > best_score:
            previous_body = best_body
            best_score = score
            best_body = answer["body"]
    return best_body, previous_body


def build_few_shot_prefix(rows, example_rows):
    """Build the few-shot prompt prefix from the selected dataset rows.

    Args:
        rows: Iterable of dataset records (dicts with ``body`` and
            ``answers``). Rows are numbered from 1 in iteration order.
        example_rows: Row numbers to use as worked examples.

    Returns:
        The header, one ``Q:``/``A:`` pair per example found, and the closing
        instruction ending in ``"Q: "``. Returns ``""`` when no example was
        found, so the caller falls back to a zero-shot prompt.
    """
    wanted = set(example_rows)
    pairs = []
    for row_number, row in enumerate(rows, start=1):
        # Stop reading as soon as every requested example has been collected.
        if len(pairs) >= len(wanted):
            break
        if row_number not in wanted:
            continue
        print("Num shot row " + str(row_number))
        answer_text, _ = select_answers(row["answers"])
        pairs.append("Q: " + row["body"] + "\n\n" + "A: " + answer_text + "\n\n")

    if len(pairs) < len(wanted):
        print("Warning: found only", str(len(pairs)), "of",
              str(len(wanted)), "requested example rows.")
    if not pairs:
        return ""

    # The header reports the number of examples actually included, and the
    # instruction is always appended, so the prompt stays well-formed even
    # when fewer rows than requested were available.
    header = ("Here are " + str(len(pairs))
              + " examples of math questions (Q) with given answers (A).\n")
    return header + "".join(pairs) + FEW_SHOT_INSTRUCTION


def main(argv=None):
    """Generate model answers for the dataset and save them as a deepeval dataset.

    Reads the example row numbers, builds the optional few-shot prefix,
    queries the model once per eligible dataset row until ``--num`` test
    cases exist, and writes the result under ``./deepeval-test-dataset``.
    """
    args = parse_args(argv)
    test_case_num = args.num
    num_shot = args.shot
    dataset_name = args.dataset

    example_rows = read_example_rows(EXAMPLE_ROWS_PATH, num_shot)
    print("Read in sorted rows.")

    # NOTE(review): this pass uses the reader's default iteration, while the
    # pass below uses skip_invalid=True. Row numbers of the two passes only
    # agree if the file has no invalid lines -- confirm.
    with jsonlines.open(DATASET_PATH, mode='r') as fp:
        examples = build_few_shot_prefix(fp, example_rows)
    print("Generated examples for", str(num_shot), "shot.")

    example_row_set = set(example_rows)
    mse_dataset = []
    with jsonlines.open(DATASET_PATH, mode='r') as reader:
        count = 0
        valid_rows = reader.iter(type=dict, skip_invalid=True)
        for curr_row, row in enumerate(valid_rows, start=1):
            if curr_row in EXCLUDED_ROWS:
                print("Skipped row " + str(curr_row))
                continue
            if curr_row in example_row_set:
                # Rows used as few-shot examples must not also be test cases.
                print("Skipped row " + str(curr_row) + " because it is a shorter example")
                continue
            if count >= test_case_num:
                break

            output_text, next_best_answer = select_answers(row["answers"])
            # With no displaced answer available, the question title is used
            # as the retrieval context instead.
            if next_best_answer is None or next_best_answer == "":
                next_best_answer = row["title"]

            # `examples` is "" for zero-shot, so this is just the question.
            # NOTE(review): the prompt is sent to the model in its
            # JSON-encoded form (quoted and escaped), as before -- confirm
            # that this is intended.
            i_text = json.dumps(examples + row["body"])
            e_output = json.dumps(output_text)
            r_context = json.dumps(next_best_answer)
            gen_answer = ollama.generate(model=MODEL_NAME, prompt=i_text)
            a_output = json.dumps(gen_answer.response)

            mse_dataset.append(LLMTestCase(input=i_text, actual_output=a_output, expected_output=e_output, retrieval_context=[r_context]))
            count = count + 1

            print("At", str(count), "out of", str(test_case_num), " current row =", str(curr_row))

    dataset = EvaluationDataset(test_cases=mse_dataset)
    dataset.save_as(file_type="json", directory="./deepeval-test-dataset", file_name=dataset_name, include_test_cases=True)


if __name__ == "__main__":
    main()
|
|
|
|
|