import ast
import json
from glob import glob

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
|
def evaluate_predictions(ground_truth, model_scores, mode="multiclass"):
    """
    Compute evaluation metrics for a set of label predictions.

    :param ground_truth: list of true labels
    :param model_scores: list of predicted labels
    :param mode: evaluation-mode tag; currently informational only — it does
        not change which metrics are computed (kept for interface
        compatibility with existing callers)
    :return: dict containing accuracy, macro-averaged precision/recall/F1,
        the full per-class classification report, and the confusion matrix
    """
    accuracy = accuracy_score(ground_truth, model_scores)

    # BUG FIX: the returned keys (and the callers' print labels) advertise
    # "macro avg", but the original call used average='micro'. For
    # single-label classification, micro P/R/F1 all collapse to accuracy,
    # so the old values were both mislabeled and redundant. Macro averaging
    # matches the advertised keys. zero_division=0 keeps a class that is
    # never predicted from emitting a warning.
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        ground_truth,
        model_scores,
        average='macro',
        zero_division=0,
    )

    # Full per-class breakdown (string) and raw confusion counts.
    class_report = classification_report(ground_truth, model_scores)
    cm = confusion_matrix(ground_truth, model_scores)

    return {
        'accuracy': accuracy,
        'precision (macro avg)': precision,
        'recall (macro avg)': recall,
        'f1_score (macro avg)': f1_score,
        'classification_report': class_report,
        'confusion_matrix': cm,
    }
| |
|
if __name__ == "__main__":
    # One JSON log file per benchmark sample.
    all_files = glob("benchmark_logs/Qwen2.5-3B-Instruct/*.json")
    # "bi" collapses the "undetermined" label into "no"; any other value
    # (e.g. "mu") keeps the labels exactly as logged.
    mode = "mu"

    failed = 0
    ground_truths = []
    inference_scores = []

    for path in all_files:
        with open(path) as f:
            sample = json.load(f)

        try:
            # The stored fields are Python-literal strings (dict reprs).
            # SECURITY FIX: the original used eval() on file contents;
            # ast.literal_eval parses literals only and cannot execute
            # arbitrary code.
            truth = ast.literal_eval(sample['ground_truth'])['is_met']
            if mode == "bi" and truth == "undetermined":
                truth = "no"

            prediction = ast.literal_eval(
                sample['api_response']
            )['assessments'][0]['is_met'].lower()
            # NOTE(review): only the prediction is lower-cased, not the
            # ground truth — confirm the logs always store ground truth in
            # lower case, otherwise labels can spuriously mismatch.
            if mode == "bi" and prediction == "undetermined":
                prediction = "no"

            # BUG FIX: append both labels only after BOTH parses succeed.
            # The original appended the ground truth first, so a failure
            # while parsing api_response left the two lists misaligned.
            ground_truths.append(truth)
            inference_scores.append(prediction)
        except Exception:
            # Malformed or incomplete log files are counted and skipped.
            failed += 1

    results = evaluate_predictions(ground_truths, inference_scores, mode="binary")

    print(f"Number of failed files: {failed}")
    print("Accuracy:", results['accuracy'])
    print("Precision (macro avg):", results['precision (macro avg)'])
    print("Recall (macro avg):", results['recall (macro avg)'])
    print("F1-score (macro avg):", results['f1_score (macro avg)'])
    print("Classification Report:\n", results['classification_report'])
    print("Confusion Matrix:\n", results['confusion_matrix'])
|