from glob import glob
import json
import ast

# We'll use scikit-learn for evaluation metrics.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support


def evaluate_predictions(ground_truth, model_scores):
    """
    Computes evaluation metrics (accuracy, precision, recall, F1-score)
    for the given ground_truth and model_scores lists.

    :param ground_truth: list of true labels
    :param model_scores: list of predicted labels
    :return: dict containing accuracy, macro-averaged precision, recall,
             and F1-score, a classification report, and a confusion matrix
    """
    # Calculate accuracy
    accuracy = accuracy_score(ground_truth, model_scores)

    # Calculate macro-averaged precision, recall, and F1-score
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        ground_truth, model_scores, average='macro'
    )

    # Generate a classification report
    class_report = classification_report(ground_truth, model_scores)

    # Generate a confusion matrix
    cm = confusion_matrix(ground_truth, model_scores)

    # Return the metrics in a dictionary
    return {
        'accuracy': accuracy,
        'precision (macro avg)': precision,
        'recall (macro avg)': recall,
        'f1_score (macro avg)': f1_score,
        'classification_report': class_report,
        'confusion_matrix': cm
    }


def extract_json_output(data):
    """
    Extracts and parses the JSON block fenced by ```json ... ``` inside
    data["api_response"]. Returns the parsed dict, or None if the block
    is missing or cannot be parsed.
    """
    api_response = data["api_response"]

    # Find the JSON block between ```json and ```
    start = api_response.find("```json")
    end = api_response.rfind("```")
    if start == -1 or end <= start:
        return None
    json_str = api_response[start + len("```json"):end].strip()

    try:
        # json.loads is safe (unlike eval) and sufficient for a valid JSON block
        return json.loads(json_str)
    except Exception:
        return None


if __name__ == "__main__":
    # Collect JSON files
    all_files = glob("benchmark_logs/DeepSeek-R1-Distill-Qwen-1.5B/*.json")
    print(len(all_files))

    # "mu" = multiclass (yes / no / undetermined); "bi" = binary ("undetermined" folded into "no")
    mode = "mu"
    failed = 0
    ground_truths = []
    inference_scores = []

    # Read each file and extract ground truth + model predictions
    for file_path in all_files:
        with open(file_path) as f:
            sample = json.load(f)

        parsed = extract_json_output(sample)
        if parsed is not None:
            try:
                # Extract the predicted 'is_met' label from the first assessment
                api_res = parsed['assessments'][0]['is_met'].lower()
                if mode == "bi" and api_res == "undetermined":
                    inference_scores.append("no")
                else:
                    inference_scores.append(api_res)

                # Parse the 'ground_truth' literal string and extract 'is_met'
                # (ast.literal_eval is the safe equivalent of eval for literals)
                ground_truth = ast.literal_eval(sample['ground_truth'])['is_met']
                if mode == "bi" and ground_truth == "undetermined":
                    ground_truths.append("no")
                else:
                    ground_truths.append(ground_truth)
            except Exception as e:
                # If something goes wrong, skip this file
                print(e)
                failed += 1

    # Evaluate predictions
    print(len(ground_truths), len(inference_scores))
    results = evaluate_predictions(ground_truths, inference_scores)

    # Print results
    print(f"Number of failed files: {failed}")
    print("Accuracy:", results['accuracy'])
    print("Precision (macro avg):", results['precision (macro avg)'])
    print("Recall (macro avg):", results['recall (macro avg)'])
    print("F1-score (macro avg):", results['f1_score (macro avg)'])
    print("Classification Report:\n", results['classification_report'])
    print("Confusion Matrix:\n", results['confusion_matrix'])