import math

import numpy as np
import pandas as pd
from scipy.stats import kendalltau, spearmanr
from tqdm import tqdm

# Append-mode log for the per-question and per-task metric summaries.
log_txt = open("log.txt", "a")


def calculate(human_scores, model_scores):
    """
    Compute the agreement between the model's evaluation results and the human
    annotations for the 7 different answers to the same question: pairwise
    ranking accuracy, |Kendall's tau|, and |Spearman's rho|.
    """
    total = 0  # answer pairs whose human scores differ (i.e. rankable pairs)
    score = 0  # credit the model earns on those pairs
    for i in range(len(human_scores)):
        for j in range(i + 1, len(human_scores)):
            A_human = human_scores[i]
            B_human = human_scores[j]
            if A_human != B_human:
                total += 1
                A_model = model_scores[i]
                B_model = model_scores[j]
                if A_model == B_model:
                    # The model ties a pair that humans rank strictly: half credit.
                    score += 0.5
                elif (A_human > B_human) == (A_model > B_model):
                    # The model orders the pair the same way the humans do.
                    score += 1
    # With no rankable pairs (all human scores equal), count accuracy as 1.
    acc = score / total if total != 0 else 1
    x = np.array(human_scores)
    y = np.array(model_scores)
    kendall, kendall_p_value = kendalltau(x, y)
    if math.isnan(kendall):
        kendall = 0  # constant input yields NaN; treat as zero correlation
    spearman, spearman_p_value = spearmanr(x, y)
    if math.isnan(spearman):
        spearman = 0

    return acc, abs(kendall), abs(spearman)
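
# A minimal sanity check for calculate() (illustrative only; the score lists
# below are made up and are not part of the evaluation pipeline):
#     calculate([3, 1, 2], [5, 2, 4])  # -> (1.0, 1.0, 1.0): every pair ranked as the humans did
#     calculate([3, 1, 2], [4, 4, 4])  # -> (0.5, 0, 0): all pairs tied; NaN correlations zeroed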


def evaluate(path):
    """
    Obtain the metric scores for the model results stored at the specified path.
    """
    # Each line of the results file is expected to hold four whitespace-separated
    # fields (taskId, questionId, answerId, score); sorting lets the lookup below
    # compute a line's position arithmetically.
    with open(path, 'r') as file:
        lines = file.readlines()
    data = [line.strip().split() for line in lines]
    sorted_data = sorted(data, key=lambda x: (int(x[0]), int(x[1]), int(x[2])))

    annotations = ["output_dialog.csv", "output_story.csv", "output_Xsum.csv", "output_NFCATS.csv"]
    for annotation in annotations:
        df = pd.read_csv(f"human_annotation/{annotation}")
        row_labels = df.index
        test_total_num = 20  # questions 20-39 form the test split of each task
        average_acc = 0
        average_kendall = 0
        average_spearman = 0
        now_questionId = -1
        answerId = -1
        human_scores = []
        model_scores = []
        for row in tqdm(row_labels):
            taskId = df.loc[row, "taskId"]
            questionId = df.loc[row, "questionId"]
            if int(questionId) < 20 or int(questionId) > 39:
                continue
            human_score = df.loc[row, "score"]
            answerId = (answerId + 1) % 7  # 7 candidate answers per question
            # 20 questions x 7 answers = 140 lines per task; cast the score to
            # float so that calculate() compares numbers, not strings.
            model_score = float(sorted_data[taskId * 140 + (questionId - 20) * 7 + answerId][3])
            if questionId == now_questionId:
                human_scores.append(human_score)
                model_scores.append(model_score)
            else:
                # A new question begins: score the one just finished first.
                if now_questionId != -1:
                    acc, kendall, spearman = calculate(human_scores, model_scores)
                    log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
                    average_acc += acc
                    average_kendall += kendall
                    average_spearman += spearman
                human_scores = [human_score]
                model_scores = [model_score]
                now_questionId = questionId

        # Flush the last question of this annotation file, then log the averages.
        acc, kendall, spearman = calculate(human_scores, model_scores)
        log_txt.write(f"{now_questionId}: acc is {acc}, kendall is {kendall}, spearman is {spearman}\n")
        average_acc += acc
        average_kendall += kendall
        average_spearman += spearman
        log_txt.write(f"On task{taskId}, average acc is {average_acc/test_total_num}, average kendall is {average_kendall/test_total_num}, average spearman is {average_spearman/test_total_num}\n")


if __name__ == "__main__":
    evaluate("output/baseline1_chatglm3_6B.txt")
    log_txt.close()