|
|
from glob import glob |
|
|
import json |
|
|
|
|
|
|
|
|
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix |
|
|
from sklearn.metrics import precision_recall_fscore_support |
|
|
|
|
|
def evaluate_predictions(ground_truth, model_scores, mode="multiclass"): |
|
|
""" |
|
|
Computes various evaluation metrics (accuracy, precision, recall, F1-score) |
|
|
for the given ground_truth and model_scores lists. |
|
|
|
|
|
:param ground_truth: list of true labels |
|
|
:param model_scores: list of predicted labels |
|
|
:return: dict containing accuracy, classification report, confusion matrix, |
|
|
precision, recall, and f1-score |
|
|
""" |
|
|
|
|
|
accuracy = accuracy_score(ground_truth, model_scores) |
|
|
|
|
|
|
|
|
precision, recall, f1_score, _ = precision_recall_fscore_support( |
|
|
ground_truth, |
|
|
model_scores, |
|
|
average='micro' |
|
|
) |
|
|
|
|
|
|
|
|
class_report = classification_report(ground_truth, model_scores) |
|
|
|
|
|
|
|
|
cm = confusion_matrix(ground_truth, model_scores) |
|
|
|
|
|
|
|
|
return { |
|
|
'accuracy': accuracy, |
|
|
'precision (macro avg)': precision, |
|
|
'recall (macro avg)': recall, |
|
|
'f1_score (macro avg)': f1_score, |
|
|
'classification_report': class_report, |
|
|
'confusion_matrix': cm |
|
|
} |
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
all_files = glob("benchmark_logs/Qwen2.5-3B-Instruct/*.json") |
|
|
mode = "mu" |
|
|
|
|
|
failed_ = 0 |
|
|
ground_truts = [] |
|
|
inference_scoes = [] |
|
|
|
|
|
|
|
|
for all_samples in all_files: |
|
|
with open(all_samples) as f: |
|
|
da_ = json.load(f) |
|
|
|
|
|
try: |
|
|
|
|
|
ground_truth = eval(da_['ground_truth'])['is_met'] |
|
|
|
|
|
if mode=="bi": |
|
|
|
|
|
if ground_truth == "undetermined": |
|
|
ground_truts.append("no") |
|
|
else: |
|
|
ground_truts.append(ground_truth) |
|
|
|
|
|
else: |
|
|
ground_truts.append(ground_truth) |
|
|
|
|
|
|
|
|
api_res = eval(da_['api_response'])['assessments'][0]['is_met'].lower() |
|
|
|
|
|
if mode=="bi": |
|
|
if api_res == "undetermined": |
|
|
inference_scoes.append("no") |
|
|
else: |
|
|
inference_scoes.append(api_res) |
|
|
else: |
|
|
inference_scoes.append(api_res) |
|
|
|
|
|
except Exception as e: |
|
|
failed_ += 1 |
|
|
|
|
|
pass |
|
|
|
|
|
|
|
|
results = evaluate_predictions(ground_truts, inference_scoes, mode="binary") |
|
|
|
|
|
|
|
|
print(f"Number of failed files: {failed_}") |
|
|
print("Accuracy:", results['accuracy']) |
|
|
print("Precision (macro avg):", results['precision (macro avg)']) |
|
|
print("Recall (macro avg):", results['recall (macro avg)']) |
|
|
print("F1-score (macro avg):", results['f1_score (macro avg)']) |
|
|
print("Classification Report:\n", results['classification_report']) |
|
|
print("Confusion Matrix:\n", results['confusion_matrix']) |
|
|
|