File size: 3,413 Bytes
9d5b280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import ast
import json
from glob import glob

# We'll use scikit-learn for evaluation metrics.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

def evaluate_predictions(ground_truth, model_scores, mode="multiclass"):
    """
    Computes evaluation metrics (accuracy, macro-averaged precision, recall,
    and F1-score, plus a classification report and confusion matrix) for the
    given true/predicted label lists.

    :param ground_truth: list of true labels
    :param model_scores: list of predicted labels
    :param mode: kept for backward compatibility with existing callers;
                 currently unused (metrics are computed identically regardless)
    :return: dict containing accuracy, classification report, confusion matrix,
             precision, recall, and f1-score
    """
    # Overall fraction of exactly matching labels.
    accuracy = accuracy_score(ground_truth, model_scores)

    # Macro-averaged precision, recall, and F1-score.
    # NOTE: average='macro' matches the "(macro avg)" result keys below and
    # the docstring. The previous average='micro' made all three metrics
    # collapse to the accuracy for single-label classification, so the
    # reported "macro" numbers were mislabeled.
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        ground_truth,
        model_scores,
        average='macro'
    )

    # Human-readable per-class breakdown.
    class_report = classification_report(ground_truth, model_scores)

    # Rows correspond to true labels, columns to predicted labels.
    cm = confusion_matrix(ground_truth, model_scores)

    # Return the metrics in a dictionary
    return {
        'accuracy': accuracy,
        'precision (macro avg)': precision,
        'recall (macro avg)': recall,
        'f1_score (macro avg)': f1_score,
        'classification_report': class_report,
        'confusion_matrix': cm
    }

if __name__ == "__main__":
    # Collect the per-sample result files produced by the benchmark run.
    all_files = glob("benchmark_logs/Qwen2.5-3B-Instruct/*.json")
    # "bi" collapses the "undetermined" label into "no"; any other value
    # keeps the labels as-is (multiclass).
    mode = "mu"

    failed = 0
    ground_truths = []
    inference_scores = []

    # Read each file and extract ground truth + model predictions
    for path in all_files:
        with open(path) as f:
            record = json.load(f)

        try:
            # The stored fields are Python-literal strings (e.g.
            # "{'is_met': 'yes'}").  ast.literal_eval parses exactly the
            # literals that eval() would, without executing arbitrary code
            # read from the log files.
            ground_truth = ast.literal_eval(record['ground_truth'])['is_met']

            if mode == "bi":
                # Binary mode: fold "undetermined" into "no".
                if ground_truth == "undetermined":
                    ground_truths.append("no")
                else:
                    ground_truths.append(ground_truth)
            else:
                ground_truths.append(ground_truth)

            # Extract the model's verdict; normalised to lowercase (the
            # ground-truth label is stored already-normalised — TODO confirm).
            api_res = ast.literal_eval(record['api_response'])['assessments'][0]['is_met'].lower()

            if mode == "bi":
                if api_res == "undetermined":
                    inference_scores.append("no")
                else:
                    inference_scores.append(api_res)
            else:
                inference_scores.append(api_res)

        except Exception:
            # Best-effort: malformed or incomplete log files are skipped,
            # but counted so the failure rate stays visible in the output.
            failed += 1

    # Evaluate predictions
    results = evaluate_predictions(ground_truths, inference_scores, mode="binary")

    # Print results
    print(f"Number of failed files: {failed}")
    print("Accuracy:", results['accuracy'])
    print("Precision (macro avg):", results['precision (macro avg)'])
    print("Recall (macro avg):", results['recall (macro avg)'])
    print("F1-score (macro avg):", results['f1_score (macro avg)'])
    print("Classification Report:\n", results['classification_report'])
    print("Confusion Matrix:\n", results['confusion_matrix'])