File size: 4,183 Bytes
9d5b280
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import ast
import json
from glob import glob

# We'll use scikit-learn for evaluation metrics.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

def evaluate_predictions(ground_truth, model_scores, mode="multiclass"):
    """
    Compute evaluation metrics (accuracy, macro-averaged precision/recall/F1,
    classification report, confusion matrix) for predicted vs. true labels.

    :param ground_truth: list of true labels
    :param model_scores: list of predicted labels (same length as ground_truth)
    :param mode: kept for backward compatibility; currently unused
    :return: dict containing accuracy, classification report, confusion matrix,
             precision, recall, and f1-score
    """
    # Calculate accuracy
    accuracy = accuracy_score(ground_truth, model_scores)

    # Calculate macro-averaged precision, recall, and F1-score.
    # BUG FIX: the docstring, this comment, and the result-dict keys all
    # promise macro averaging, but the original passed average='micro'.
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        ground_truth,
        model_scores,
        average='macro'
    )

    # Per-class breakdown as a human-readable report
    class_report = classification_report(ground_truth, model_scores)

    # Raw confusion matrix (rows = true labels, cols = predicted)
    cm = confusion_matrix(ground_truth, model_scores)

    return {
        'accuracy': accuracy,
        'precision (macro avg)': precision,
        'recall (macro avg)': recall,
        'f1_score (macro avg)': f1_score,
        'classification_report': class_report,
        'confusion_matrix': cm
    }

def extract_json_output(data):
    """
    Extract and parse the JSON object embedded in a model response.

    Looks for a fenced ```json ... ``` block inside data["api_response"] and
    parses its contents.

    :param data: dict with an "api_response" string field
    :return: the parsed object on success, or the sentinel string "ok" on
             any parse failure (callers test for != "ok")
    """
    api_response = data["api_response"]

    # Find the JSON block between ```json and ``` (7 == len("```json")).
    # If the opening fence is missing, find() returns -1 and the slice below
    # yields garbage, which json.loads rejects -> we fall through to "ok".
    start = api_response.find("```json") + 7
    end = api_response.rfind("```")
    json_str = api_response[start:end].strip()

    try:
        # SECURITY FIX: the original used eval() on model-generated text,
        # which executes arbitrary code. json.loads is safe and also parses
        # JSON literals (true/false/null) that eval() cannot.
        return json.loads(json_str)
    except Exception:
        # Sentinel preserved for backward compatibility with callers.
        return "ok"

if __name__ == "__main__":
    # Collect per-sample benchmark result files.
    all_files = glob("benchmark_logs/DeepSeek-R1-Distill-Qwen-1.5B/*.json")
    print(len(all_files))

    # mode == "bi" collapses "undetermined" into "no" (binary evaluation);
    # any other value keeps all labels as-is ("mu" here => multiclass).
    mode = "mu"

    failed_ = 0
    ground_truts = []
    inference_scoes = []

    # Read each file and extract ground truth + model predictions.
    for all_samples in all_files:
        with open(all_samples) as f:
            da_m = json.load(f)
            da_ = extract_json_output(da_m)
            if da_ != "ok":
                try:
                    # Predicted label from the parsed model output.
                    api_res = da_['assessments'][0]['is_met'].lower()

                    if mode == "bi":
                        # Binary mode: fold "undetermined" into "no".
                        if api_res == "undetermined":
                            inference_scoes.append("no")
                        else:
                            inference_scoes.append(api_res)
                    else:
                        inference_scoes.append(api_res)

                    # Ground truth is stored as a Python-literal dict string.
                    # SECURITY FIX: ast.literal_eval parses literals only,
                    # unlike the original eval() which executes arbitrary code.
                    ground_truth = ast.literal_eval(da_m['ground_truth'])['is_met']

                    if mode == "bi":
                        if ground_truth == "undetermined":
                            ground_truts.append("no")
                        else:
                            ground_truts.append(ground_truth)
                    else:
                        ground_truts.append(ground_truth)

                except Exception as e:
                    # Malformed record: report it, count it, and skip the file.
                    print(e)
                    failed_ += 1

    # Evaluate predictions against ground truth.
    print(len(ground_truts), len(inference_scoes))
    results = evaluate_predictions(ground_truts, inference_scoes, mode="binary")

    # Print results
    print(f"Number of failed files: {failed_}")
    print("Accuracy:", results['accuracy'])
    print("Precision (macro avg):", results['precision (macro avg)'])
    print("Recall (macro avg):", results['recall (macro avg)'])
    print("F1-score (macro avg):", results['f1_score (macro avg)'])
    print("Classification Report:\n", results['classification_report'])
    print("Confusion Matrix:\n", results['confusion_matrix'])