# NOTE: removed non-Python extraction artifacts (file size, commit hash,
# and a dumped line-number sequence) that would break the interpreter.
from glob import glob
import json
# We'll use scikit-learn for evaluation metrics.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
def evaluate_predictions(ground_truth, model_scores, mode="multiclass"):
    """
    Compute accuracy, macro-averaged precision/recall/F1, a per-class
    classification report, and a confusion matrix for label predictions.

    :param ground_truth: list of true labels
    :param model_scores: list of predicted labels, same length as ground_truth
    :param mode: accepted for caller compatibility; currently unused by the
                 metric computation (averaging is always macro)
    :return: dict with keys 'accuracy', 'precision (macro avg)',
             'recall (macro avg)', 'f1_score (macro avg)',
             'classification_report', 'confusion_matrix'
    """
    # Overall fraction of exact label matches.
    accuracy = accuracy_score(ground_truth, model_scores)
    # BUG FIX: the docstring, the returned dict keys, and the caller's print
    # labels all say "macro avg", but the original passed average='micro'
    # (which for single-label classification just equals accuracy). Use
    # 'macro' so the reported numbers match their labels.
    precision, recall, f1, _ = precision_recall_fscore_support(
        ground_truth,
        model_scores,
        average='macro',
    )
    # Human-readable per-class breakdown.
    class_report = classification_report(ground_truth, model_scores)
    # Rows = true labels, columns = predicted labels (sklearn convention).
    cm = confusion_matrix(ground_truth, model_scores)
    return {
        'accuracy': accuracy,
        'precision (macro avg)': precision,
        'recall (macro avg)': recall,
        'f1_score (macro avg)': f1,
        'classification_report': class_report,
        'confusion_matrix': cm
    }
if __name__ == "__main__":
    # Collect the per-sample JSON logs produced by the benchmark run.
    all_files = glob("benchmark_logs/Qwen2.5-3B-Instruct/*.json")
    # "bi" collapses the "undetermined" label into "no" (binary yes/no);
    # any other value keeps all labels as-is (multiclass).
    mode = "mu"
    failed_ = 0
    ground_truts = []
    inference_scoes = []

    for path in all_files:
        with open(path) as f:
            da_ = json.load(f)
        try:
            # SECURITY NOTE(review): eval() executes arbitrary code from the
            # log files. If the payloads are JSON or Python-literal dicts,
            # json.loads / ast.literal_eval would be safe drop-in
            # replacements — TODO confirm the payload format.
            ground_truth = eval(da_['ground_truth'])['is_met']
            api_res = eval(da_['api_response'])['assessments'][0]['is_met'].lower()
        except Exception:
            # Count unparsable samples so the final report shows how many
            # files were dropped, then skip them.
            failed_ += 1
            continue
        # BUG FIX: append only after BOTH payloads parsed successfully. The
        # original appended the ground-truth label before parsing the
        # prediction, so a failure on the second eval left the two lists
        # misaligned and silently skewed every downstream metric.
        if mode == "bi":
            ground_truts.append("no" if ground_truth == "undetermined" else ground_truth)
            inference_scoes.append("no" if api_res == "undetermined" else api_res)
        else:
            ground_truts.append(ground_truth)
            inference_scoes.append(api_res)

    # NOTE(review): mode="binary" here does not match the "mu" selected
    # above; evaluate_predictions currently ignores its mode argument, so
    # this has no effect — confirm intended behavior.
    results = evaluate_predictions(ground_truts, inference_scoes, mode="binary")
    print(f"Number of failed files: {failed_}")
    print("Accuracy:", results['accuracy'])
    print("Precision (macro avg):", results['precision (macro avg)'])
    print("Recall (macro avg):", results['recall (macro avg)'])
    print("F1-score (macro avg):", results['f1_score (macro avg)'])
    print("Classification Report:\n", results['classification_report'])
    print("Confusion Matrix:\n", results['confusion_matrix'])