import json

import pandas as pd
from sklearn.metrics import f1_score, confusion_matrix
from tabulate import tabulate

# Fine-tuned model whose prediction files are evaluated below.
model_name = 'llama3-8b-nei-sft'

file_path = './testset/FEVEROUS.json'

# Collected macro-F1 scores and dataset names for the summary tables at the end.
save = []
dataset_name = []
def process_line(line):
    """Map one raw model output line to a binary prediction (1 if it mentions support)."""
    line = line.strip()
    if 'support' in line or 'Support' in line:
        return 1
    else:
        return 0


def process_file(file_path, labels):
    """Read one prediction per line, checking against the expected label count."""
    results = []
    with open(file_path, 'r') as file:
        for i, line in enumerate(file):
            if i >= len(labels):
                raise IndexError(f"Not enough labels for line {i + 1}")
            results.append(process_line(line))
    return results

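# --- FEVEROUS: binary supports / not-supports evaluation ---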
prediction_path = f'results_llama/FEVEROUS-{model_name}.txt'

label = []
with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = json.load(file)
    for item in raw_data:
        label.append(int(item['label'] == 'supports'))

prediction = process_file(prediction_path, label)

accuracy = sum(int(label[i] == prediction[i]) for i in range(len(label))) / len(label)
print(f'Accuracy: {accuracy:.4f}')

macro_f1 = f1_score(label, prediction, average='macro')
print(f'Macro F1 Score: {macro_f1:.4f}')

tn, fp, fn, tp = confusion_matrix(label, prediction).ravel()

type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0
print(f'Type 1 Error (False Positive Rate): {type1_error:.2f}')

type2_error = fn / (fn + tp) if (fn + tp) > 0 else 0
print(f'Type 2 Error (False Negative Rate): {type2_error:.2f}')

print(f'{accuracy * 100:.2f} {macro_f1 * 100:.2f} {type1_error * 100:.2f} {type2_error * 100:.2f}')
save.append(f'{macro_f1 * 100:.2f}')
dataset_name.append('FEVEROUS')

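# --- HOVER: binary evaluation, broken down by number of hops ---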
for hop in [2, 3, 4]:
    file_path = './testset/HOVER.json'
    prediction_path = f'results_llama/HOVER-{model_name}.txt'

    label0 = []
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_data = json.load(file)
        for item in raw_data:
            label0.append(int(item['label'] == 'supports'))

    prediction0 = process_file(prediction_path, label0)

    # Keep only the examples with the current hop count.
    label = []
    prediction = []
    for i in range(len(prediction0)):
        if raw_data[i]['num_hops'] == hop:
            label.append(label0[i])
            prediction.append(prediction0[i])

    accuracy = sum(int(label[i] == prediction[i]) for i in range(len(label))) / len(label)
    print(f'Accuracy: {accuracy:.4f}')

    macro_f1 = f1_score(label, prediction, average='macro')
    print(f'Macro F1 Score: {macro_f1:.4f}')

    tn, fp, fn, tp = confusion_matrix(label, prediction).ravel()

    type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0
    print(f'Type 1 Error (False Positive Rate): {type1_error:.2f}')

    type2_error = fn / (fn + tp) if (fn + tp) > 0 else 0
    print(f'Type 2 Error (False Negative Rate): {type2_error:.2f}')

    save.append(f'{macro_f1 * 100:.2f}')
    dataset_name.append(f'HOVER-{hop}hop')

    print(f'{accuracy * 100:.2f} {macro_f1 * 100:.2f} {type1_error * 100:.2f} {type2_error * 100:.2f}')

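# --- Open_FEVEROUS: same gold labels as FEVEROUS, predictions from the Open_FEVEROUS run ---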
file_path = './testset/FEVEROUS.json'
prediction_path = f'results_llama/Open_FEVEROUS-{model_name}.txt'

label = []
with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = json.load(file)
    for item in raw_data:
        label.append(int(item['label'] == 'supports'))

prediction = process_file(prediction_path, label)

accuracy = sum(int(label[i] == prediction[i]) for i in range(len(label))) / len(label)
print(f'Accuracy: {accuracy:.4f}')

macro_f1 = f1_score(label, prediction, average='macro')
print(f'Macro F1 Score: {macro_f1:.4f}')

tn, fp, fn, tp = confusion_matrix(label, prediction).ravel()

type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0
print(f'Type 1 Error (False Positive Rate): {type1_error:.2f}')

type2_error = fn / (fn + tp) if (fn + tp) > 0 else 0
print(f'Type 2 Error (False Negative Rate): {type2_error:.2f}')

print(f'{accuracy * 100:.2f} {macro_f1 * 100:.2f} {type1_error * 100:.2f} {type2_error * 100:.2f}')
save.append(f'{macro_f1 * 100:.2f}')
dataset_name.append('Open_FEVEROUS')

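# --- Open_HOVER: per-hop binary evaluation on the Open_HOVER predictions ---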
for hop in [2, 3, 4]:
    file_path = './testset/HOVER.json'
    prediction_path = f'results_llama/Open_HOVER-{model_name}.txt'

    label0 = []
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_data = json.load(file)
        for item in raw_data:
            label0.append(int(item['label'] == 'supports'))

    prediction0 = process_file(prediction_path, label0)

    # Keep only the examples with the current hop count.
    label = []
    prediction = []
    for i in range(len(prediction0)):
        if raw_data[i]['num_hops'] == hop:
            label.append(label0[i])
            prediction.append(prediction0[i])

    accuracy = sum(int(label[i] == prediction[i]) for i in range(len(label))) / len(label)
    print(f'Accuracy: {accuracy:.4f}')

    macro_f1 = f1_score(label, prediction, average='macro')
    print(f'Macro F1 Score: {macro_f1:.4f}')

    tn, fp, fn, tp = confusion_matrix(label, prediction).ravel()

    type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0
    print(f'Type 1 Error (False Positive Rate): {type1_error:.2f}')

    type2_error = fn / (fn + tp) if (fn + tp) > 0 else 0
    print(f'Type 2 Error (False Negative Rate): {type2_error:.2f}')

    save.append(f'{macro_f1 * 100:.2f}')
    dataset_name.append(f'Open_HOVER-{hop}hop')

    print(f'{accuracy * 100:.2f} {macro_f1 * 100:.2f} {type1_error * 100:.2f} {type2_error * 100:.2f}')

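# --- LLM-AggreFact: binary evaluation (gold labels are already 0/1) ---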
file_path = './testset/LLM-AggreFact_test.json'
prediction_path = f'results_llama/LLM-AggreFact_test-{model_name}.txt'

label = []
with open(file_path, 'r', encoding='utf-8') as file:
    raw_data = json.load(file)
    for item in raw_data:
        label.append(int(item['label'] == 1))

prediction = process_file(prediction_path, label)

accuracy = sum(int(label[i] == prediction[i]) for i in range(len(label))) / len(label)
print(f'Accuracy: {accuracy:.4f}')

macro_f1 = f1_score(label, prediction, average='macro')
print(f'Macro F1 Score: {macro_f1:.4f}')

tn, fp, fn, tp = confusion_matrix(label, prediction).ravel()

type1_error = fp / (fp + tn) if (fp + tn) > 0 else 0
print(f'Type 1 Error (False Positive Rate): {type1_error:.2f}')

type2_error = fn / (fn + tp) if (fn + tp) > 0 else 0
print(f'Type 2 Error (False Negative Rate): {type2_error:.2f}')

print(f'{accuracy * 100:.2f} {macro_f1 * 100:.2f} {type1_error * 100:.2f} {type2_error * 100:.2f}')
save.append(f'{macro_f1 * 100:.2f}')
dataset_name.append('LLM-AggreFact')

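# --- Three-class helpers: support (2) / not enough info (1) / refute (0) ---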
def process_line_nei(line):
    """Map one raw model output line to a three-class prediction
    (2 = support, 0 = refute, 1 = not enough info / anything else)."""
    line = line.strip()
    if line == "['support']" or line == "['Support']":
        return 2
    elif line == "['refute']" or line == "['Refute']":
        return 0
    else:
        return 1


def process_file_nei(file_path, labels):
    """Read one three-class prediction per line, checking against the expected label count."""
    results = []
    with open(file_path, 'r') as file:
        for i, line in enumerate(file):
            if i >= len(labels):
                raise IndexError(f"Not enough labels for line {i + 1}")
            results.append(process_line_nei(line))
    return results

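# --- Three-class evaluation on datasets that include an NEI label ---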
for name in ['Healthver_test', 'Open_Healthver_test', 'Scifact_train', 'Scifact_dev', 'VitaminC_dev', 'VitaminC_test']:
    file_path = f'./testset/{name}.json'
    prediction_path = f'results_llama/{name}-{model_name}.txt'

    # Each dataset uses its own label strings; map them all to 2/1/0.
    label = []
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_data = json.load(file)
        if name in ('Healthver_test', 'Open_Healthver_test'):
            for item in raw_data:
                if item['label'] == 'Supports':
                    label.append(2)
                elif item['label'] == 'Neutral':
                    label.append(1)
                else:
                    label.append(0)
        elif name in ('VitaminC_dev', 'VitaminC_test'):
            for item in raw_data:
                if item['label'] == 'SUPPORTS':
                    label.append(2)
                elif item['label'] == 'NOT ENOUGH INFO':
                    label.append(1)
                else:
                    label.append(0)
        else:
            for item in raw_data:
                if item['label'] == 'SUPPORT':
                    label.append(2)
                elif item['label'] == 'UNKNOWN':
                    label.append(1)
                else:
                    label.append(0)

    prediction = process_file_nei(prediction_path, label)

    accuracy = sum(int(label[i] == prediction[i]) for i in range(len(label))) / len(label)
    print(f'Accuracy: {accuracy:.4f}')

    macro_f1 = f1_score(label, prediction, average='macro')
    print(f'Macro F1 Score: {macro_f1:.4f}')

    print(f'{accuracy * 100:.2f} {macro_f1 * 100:.2f}')
    save.append(f'{macro_f1 * 100:.2f}')
    dataset_name.append(name + ' w NEI')

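# --- Binary evaluation of the same datasets (non-support labels, including NEI, count as 0) ---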
for name in ['Healthver_test', 'Open_Healthver_test', 'Scifact_train', 'Scifact_dev', 'VitaminC_dev', 'VitaminC_test']:
    file_path = f'./testset/{name}.json'
    prediction_path = f'results_llama/{name}-{model_name}.txt'

    label = []
    with open(file_path, 'r', encoding='utf-8') as file:
        raw_data = json.load(file)
        if name in ('Healthver_test', 'Open_Healthver_test'):
            for item in raw_data:
                if item['label'] == 'Supports':
                    label.append(1)
                else:
                    label.append(0)
        elif name in ('VitaminC_dev', 'VitaminC_test'):
            for item in raw_data:
                if item['label'] == 'SUPPORTS':
                    label.append(1)
                else:
                    label.append(0)
        else:
            for item in raw_data:
                if item['label'] == 'SUPPORT':
                    label.append(1)
                else:
                    label.append(0)

    prediction = process_file(prediction_path, label)

    accuracy = sum(int(label[i] == prediction[i]) for i in range(len(label))) / len(label)
    print(f'Accuracy: {accuracy:.4f}')

    macro_f1 = f1_score(label, prediction, average='macro')
    print(f'Macro F1 Score: {macro_f1:.4f}')

    print(f'{accuracy * 100:.2f} {macro_f1 * 100:.2f}')
    save.append(f'{macro_f1 * 100:.2f}')
    dataset_name.append(name + ' w/o NEI')

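# --- Summary tables ---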
# Indices into save/dataset_name, grouped into three display tables:
# non-Open datasets, Open_ datasets, and the remaining sets with their
# w/o-NEI and w-NEI scores interleaved.
show_rank = []
show_rank.append([0, 1, 2, 3, 15, 9])
show_rank.append([4, 5, 6, 7, 16, 10])
show_rank.append([8, 17, 11, 18, 12, 19, 13, 20, 14])

dataset_name_show = []
performance_show = []
for rank in show_rank:
    dataset_temp = []
    save_temp = []
    for number in rank:
        dataset_temp.append(dataset_name[number])
        save_temp.append(save[number])
    dataset_name_show.append(dataset_temp)
    performance_show.append(save_temp)

df1 = pd.DataFrame({
    "Dataset Name": dataset_name_show[0],
    "Performance": performance_show[0]
})
df2 = pd.DataFrame({
    "Dataset Name": dataset_name_show[1],
    "Performance": performance_show[1]
})
df3 = pd.DataFrame({
    "Dataset Name": dataset_name_show[2],
    "Performance": performance_show[2]
})

# Plain space-separated rows first, then pretty-printed tables.
for table_idx in range(3):
    for i in range(len(performance_show[table_idx])):
        print(f'{performance_show[table_idx][i]}', end=' ')
    print()

table = tabulate(df1.T, headers='keys', tablefmt='fancy_grid')
print(table)
table = tabulate(df2.T, headers='keys', tablefmt='fancy_grid')
print(table)
table = tabulate(df3.T, headers='keys', tablefmt='fancy_grid')
print(table)