"""Evaluation metrics for multi-label SQL error classification."""

from __future__ import annotations

from typing import Any, Callable, Dict

import numpy as np
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    hamming_loss,
    precision_score,
    recall_score,
)


def sigmoid(x: np.ndarray) -> np.ndarray:
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-x))``.

    Args:
        x: Array (or scalar) of raw scores.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``x``.
    """
    # For very negative inputs ``exp(-x)`` overflows to ``inf``. The quotient
    # is then exactly 0.0, which is the correct limit, so only the spurious
    # overflow warning is silenced; the computed values are unchanged.
    with np.errstate(over="ignore"):
        return 1.0 / (1.0 + np.exp(-x))


def compute_multilabel_metrics(
    logits: np.ndarray,
    labels: np.ndarray,
    threshold: float = 0.5,
) -> Dict[str, float]:
    """Compute multi-label classification metrics from raw logits.

    Logits are passed through :func:`sigmoid` and a label is predicted
    positive when its probability is ``>= threshold``.

    Args:
        logits: Raw scores, one row per sample and one column per label.
            Any array-like accepted by ``np.asarray`` works.
        labels: Ground-truth indicators with the same shape as ``logits``.
            Values are cast to ``int``.
        threshold: Probability cut-off for a positive prediction.

    Returns:
        A dict of plain Python floats with the keys ``accuracy``,
        ``f1_macro``, ``f1_micro``, ``precision_macro``, ``recall_macro``,
        ``hamming_loss`` and ``subset_accuracy``.

    Raises:
        ValueError: If ``logits`` is not 2-D or its shape differs from
            that of ``labels``.
    """
    logits = np.asarray(logits)
    labels = np.asarray(labels).astype(int)

    # Fail early with a readable message; otherwise the error surfaces deep
    # inside scikit-learn or in the ``all(axis=1)`` reduction below.
    if logits.ndim != 2 or logits.shape != labels.shape:
        raise ValueError(
            "logits and labels must be 2-D arrays of identical shape, "
            f"got {logits.shape} and {labels.shape}"
        )

    probs = sigmoid(logits)
    preds = (probs >= threshold).astype(int)

    return {
        # NOTE: for multilabel indicator input scikit-learn's accuracy_score
        # is exact-match accuracy, so this equals "subset_accuracy" below.
        # Both keys are kept because callers may read either one.
        "accuracy": float(accuracy_score(labels, preds)),
        "f1_macro": float(
            f1_score(labels, preds, average="macro", zero_division=0)
        ),
        "f1_micro": float(
            f1_score(labels, preds, average="micro", zero_division=0)
        ),
        "precision_macro": float(
            precision_score(labels, preds, average="macro", zero_division=0)
        ),
        "recall_macro": float(
            recall_score(labels, preds, average="macro", zero_division=0)
        ),
        "hamming_loss": float(hamming_loss(labels, preds)),
        # Fraction of samples whose entire label row is predicted correctly.
        "subset_accuracy": float((preds == labels).all(axis=1).mean()),
    }


def build_compute_metrics(
    threshold: float = 0.5,
) -> Callable[[Any], Dict[str, float]]:
    """Factory for Hugging Face Trainer compute_metrics callback.

    Args:
        threshold: Probability cut-off forwarded to
            :func:`compute_multilabel_metrics`.

    Returns:
        A callable that takes an object unpackable as ``(logits, labels)``
        and returns the metrics dict.
    """

    def compute_metrics(eval_pred) -> Dict[str, float]:
        logits, labels = eval_pred
        # Predictions can arrive as a tuple when the model returns more than
        # one output. Assumes the logits are the first element, which is the
        # usual Hugging Face convention -- confirm for custom model heads.
        if isinstance(logits, tuple):
            logits = logits[0]
        return compute_multilabel_metrics(logits, labels, threshold=threshold)

    return compute_metrics