import math

import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from tqdm import tqdm

# Append-mode log for the per-question and per-task metric summaries.
log_txt = open("log.txt", "a")


def calculate(human_scores, model_scores):
    """
    Compute the agreement between the model's evaluation results and the human
    annotations for the 7 different answers to the same question: pairwise
    ranking accuracy, |Kendall's tau|, and |Spearman's rho|.
    """
    total = 0  # answer pairs whose human scores differ (i.e. rankable pairs)
    score = 0  # credit the model earns on those pairs
    for i in range(len(human_scores)):
        for j in range(i + 1, len(human_scores)):
            A_human = human_scores[i]
            B_human = human_scores[j]
            if A_human != B_human:
                total += 1
                A_model = model_scores[i]
                B_model = model_scores[j]
                if A_model == B_model:
                    # The model ties a pair that humans rank strictly: half credit.
                    score += 0.5
                elif (A_human > B_human) == (A_model > B_model):
                    # The model orders the pair the same way the humans do.
                    score += 1
    # With no rankable pairs (all human scores equal), count accuracy as 1.
    acc = score / total if total != 0 else 1
    x = np.array(human_scores)
    y = np.array(model_scores)
    kendall, kendall_p_value = kendalltau(x, y)
    if math.isnan(kendall):
        kendall = 0  # constant input yields NaN; treat as zero correlation
    spearman, spearman_p_value = spearmanr(x, y)
    if math.isnan(spearman):
        spearman = 0

    return acc, abs(kendall), abs(spearman)
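
# A minimal sanity check for calculate() (illustrative only; the score lists
# below are made up and are not part of the evaluation pipeline):
#     calculate([3, 1, 2], [5, 2, 4])  # -> (1.0, 1.0, 1.0): every pair ranked as the humans did
#     calculate([3, 1, 2], [4, 4, 4])  # -> (0.5, 0, 0): all pairs tied; NaN correlations zeroed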


def evaluate(path):
    """
    Obtain the metric scores for the model results stored at the specified path.
    """
    # Each line of the results file is expected to hold four whitespace-separated
    # fields (taskId, questionId, answerId, score); sorting lets the lookup below
    # compute a line's position arithmetically.
    with open(path, 'r') as file:
        lines = file.readlines()
    data = [line.strip().split() for line in lines]
    sorted_data = sorted(data, key=lambda x: (int(x[0]), int(x[1]), int(x[2])))

    annotations = ["output_dialog.csv", "output_story.csv", "output_Xsum.csv", "output_NFCATS.csv"]
    for annotation in annotations:
        df = pd.read_csv(f"human_annotation/{annotation}")
        row_labels = df.index
        test_total_num = 20  # questions 20-39 form the test split of each task
        average_acc = 0
        average_kendall = 0
        average_spearman = 0
        now_questionId = -1
        answerId = -1
        human_scores = []
        model_scores = []
        for row in tqdm(row_labels):
            taskId = df.loc[row, "taskId"]
            questionId = df.loc[row, "questionId"]
            if int(questionId) < 20 or int(questionId) > 39:
                continue
            human_score = df.loc[row, "score"]
            answerId = (answerId + 1) % 7  # 7 candidate answers per question
            # 20 questions x 7 answers = 140 lines per task; cast the score to
            # float so that calculate() compares numbers, not strings.
            model_score = float(sorted_data[taskId * 140 + (questionId - 20) * 7 + answerId][3])
            if questionId == now_questionId:
                human_scores.append(human_score)
                model_scores.append(model_score)
            else:
                # A new question begins: score the one just finished first.
                if now_questionId != -1:
                    acc, kendall, spearman = calculate(human_scores, model_scores)
                    log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
                    average_acc += acc
                    average_kendall += kendall
                    average_spearman += spearman
                human_scores = [human_score]
                model_scores = [model_score]
                now_questionId = questionId

        # Flush the last question of this annotation file, then log the averages.
        acc, kendall, spearman = calculate(human_scores, model_scores)
        log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
        average_acc += acc
        average_kendall += kendall
        average_spearman += spearman
        log_txt.write(f"On task{taskId}, average acc is {average_acc/test_total_num}, average kendall is {average_kendall/test_total_num}, average spearman is {average_spearman/test_total_num}\n")


if __name__ == "__main__":
    evaluate("output/baseline1_chatglm3_6B.txt")
    log_txt.close()