| import torch |
| import numpy as np |
| from sklearn.metrics import ( |
| r2_score, |
| mean_squared_error, |
| mean_absolute_error, |
| f1_score, |
| precision_score, |
| recall_score, |
| roc_auc_score, |
| precision_recall_curve, |
| auc, |
| matthews_corrcoef, |
| confusion_matrix, |
| hamming_loss, |
| accuracy_score, |
| make_scorer, |
| ) |
| from scipy.stats import pearsonr, spearmanr |
| from transformers import EvalPrediction |
|
|
|
|
| def softmax(x: np.ndarray) -> np.ndarray: |
| return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True) |
|
|
|
|
| def regression_scorer(): |
| def dual_score(y_true, y_pred): |
| return spearmanr(y_true, y_pred).correlation * r2_score(y_true, y_pred) |
| return dual_score |
|
|
|
|
| def classification_scorer(): |
| def mcc_scorer(y_true, y_pred): |
| return matthews_corrcoef(y_true, y_pred) |
| return mcc_scorer |
|
|
|
|
| def get_classification_scorer(): |
| return make_scorer(classification_scorer(), greater_is_better=True) |
|
|
|
|
| def get_regression_scorer(): |
| return make_scorer(regression_scorer(), greater_is_better=True) |
|
|
|
|
| def calculate_max_metrics(ss: torch.Tensor, labels: torch.Tensor, cutoff: float) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: |
| """ |
| Calculate precision, recall and F1 metrics for binary classification at a specific cutoff threshold. |
| |
| Args: |
| ss: Prediction scores tensor, typically between -1 and 1 |
| labels: Ground truth binary labels tensor (0 or 1) |
| cutoff: Classification threshold value |
| |
| Returns: |
| Tuple containing: |
| - F1 score (torch.Tensor) |
| - Precision score (torch.Tensor) |
| - Recall score (torch.Tensor) |
| |
| Note: |
| - Input tensors are converted to float type |
| - Handles division by zero cases by returning 0 |
| - Uses standard binary classification metrics formulas: |
| - Precision = TP / (TP + FP) |
| - Recall = TP / (TP + FN) |
| - F1 = 2 * (Precision * Recall) / (Precision + Recall) |
| """ |
| ss, labels = ss.float(), labels.float() |
| tp = torch.sum((ss >= cutoff) & (labels == 1.0)) |
| fp = torch.sum((ss >= cutoff) & (labels == 0.0)) |
| fn = torch.sum((ss < cutoff) & (labels == 1.0)) |
| precision_denominator = tp + fp |
| precision = torch.where(precision_denominator != 0, tp / precision_denominator, torch.tensor(0.0)) |
| recall_denominator = tp + fn |
| recall = torch.where(recall_denominator != 0, tp / recall_denominator, torch.tensor(0.0)) |
| f1 = torch.where((precision + recall) != 0, (2 * precision * recall) / (precision + recall), torch.tensor(0.0)) |
| return f1, precision, recall |
|
|
|
|
| def max_metrics(ss: torch.Tensor, labels: torch.Tensor, increment: float = 0.01) -> tuple[float, float, float, float]: |
| """ |
| Find optimal classification metrics by scanning different cutoff thresholds. |
| Optimized version that vectorizes calculations across all cutoffs. |
| |
| Args: |
| ss: Prediction scores tensor, typically between -1 and 1 |
| labels: Ground truth binary labels tensor (0 or 1) |
| increment: Step size for scanning cutoff values, defaults to 0.01 |
| |
| Returns: |
| Tuple containing: |
| - Maximum F1 score (float) |
| - Maximum precision score (float) |
| - Maximum recall score (float) |
| - Optimal cutoff threshold (float) |
| |
| Note: |
| - Input scores are clamped to [-1, 1] range |
| - Handles edge case where all scores are >= 1 |
| - Scans cutoff values from min score to 1 in increments |
| - Handles NaN F1 scores by replacing with -1 before finding max |
| - Returns metrics at the threshold that maximizes F1 score |
| - Optimized to compute metrics for all cutoffs in parallel using vectorization |
| """ |
| |
| ss = torch.nan_to_num(ss, nan=0.0) |
| ss = torch.clamp(ss, -1.0, 1.0) |
| min_val = ss.min().item() |
| max_val = 1 |
| if min_val >= max_val: |
| min_val = 0 |
| |
| |
| ss = ss.float() |
| labels = labels.float() |
| |
| |
| cutoffs = torch.arange(min_val, max_val, increment, device=ss.device, dtype=ss.dtype) |
| n_cutoffs = len(cutoffs) |
| |
| if n_cutoffs == 0: |
| |
| return 0.0, 0.0, 0.0, min_val |
| |
| |
| |
| ss_expanded = ss.unsqueeze(0) |
| cutoffs_expanded = cutoffs.unsqueeze(1) |
| labels_expanded = labels.unsqueeze(0) |
| |
| |
| predictions = (ss_expanded >= cutoffs_expanded).float() |
| |
| |
| |
| tp = torch.sum(predictions * labels_expanded, dim=1) |
| |
| fp = torch.sum(predictions * (1.0 - labels_expanded), dim=1) |
| |
| fn = torch.sum((1.0 - predictions) * labels_expanded, dim=1) |
| |
| |
| precision_denominator = tp + fp |
| precision = torch.where(precision_denominator != 0, tp / precision_denominator, torch.tensor(0.0, device=ss.device)) |
| |
| recall_denominator = tp + fn |
| recall = torch.where(recall_denominator != 0, tp / recall_denominator, torch.tensor(0.0, device=ss.device)) |
| |
| |
| f1_denominator = precision + recall |
| f1s = torch.where(f1_denominator != 0, (2 * precision * recall) / f1_denominator, torch.tensor(0.0, device=ss.device)) |
| |
| |
| valid_f1s = torch.where(torch.isnan(f1s), torch.tensor(-1.0, device=ss.device), f1s) |
| max_index = torch.argmax(valid_f1s) |
| |
| return f1s[max_index].item(), precision[max_index].item(), recall[max_index].item(), cutoffs[max_index].item() |
|
|
|
|
|
|
| def calculate_robust_roc_auc_multiclass(y_true: np.ndarray, probs: np.ndarray) -> float: |
| """ |
| Robust ROC AUC for multi-class (single-label) tasks. |
| Handles missing classes in y_true by ignoring them in the weighted average. |
| """ |
| |
| if np.isnan(probs).any(): |
| probs = np.nan_to_num(probs, nan=0.0) |
| |
| n_classes = probs.shape[1] |
| try: |
| if n_classes == 2: |
| if len(np.unique(y_true)) == 2: |
| return roc_auc_score(y_true, probs[:, 1]) |
| return -100.0 |
| |
| y_true_onehot = np.eye(n_classes)[y_true] |
| scores = [] |
| weights = [] |
| for i in range(n_classes): |
| |
| if len(np.unique(y_true_onehot[:, i])) == 2: |
| scores.append(roc_auc_score(y_true_onehot[:, i], probs[:, i])) |
| weights.append(np.sum(y_true_onehot[:, i])) |
| |
| if not scores: |
| return -100.0 |
| |
| return float(np.average(scores, weights=weights)) |
| except Exception: |
| return -100.0 |
|
|
|
|
| def calculate_robust_pr_auc_multiclass(y_true: np.ndarray, probs: np.ndarray) -> float: |
| """ |
| Robust PR AUC for multi-class (single-label) tasks. |
| """ |
| |
| if np.isnan(probs).any(): |
| probs = np.nan_to_num(probs, nan=0.0) |
|
|
| n_classes = probs.shape[1] |
| try: |
| if n_classes == 2: |
| if len(np.unique(y_true)) == 2: |
| precision, recall, _ = precision_recall_curve(y_true, probs[:, 1]) |
| return auc(recall, precision) |
| return -100.0 |
|
|
| y_true_onehot = np.eye(n_classes)[y_true] |
| scores = [] |
| weights = [] |
| for i in range(n_classes): |
| if len(np.unique(y_true_onehot[:, i])) == 2: |
| precision, recall, _ = precision_recall_curve(y_true_onehot[:, i], probs[:, i]) |
| scores.append(auc(recall, precision)) |
| weights.append(np.sum(y_true_onehot[:, i])) |
| |
| if not scores: |
| return -100.0 |
| |
| return float(np.average(scores, weights=weights)) |
| except Exception: |
| return -100.0 |
|
|
|
|
| def calculate_robust_roc_auc_multilabel(y_true: np.ndarray, probs: np.ndarray) -> float: |
| """ |
| Robust ROC AUC for multi-label tasks (macro average). |
| """ |
| if np.isnan(probs).any(): |
| probs = np.nan_to_num(probs, nan=0.0) |
|
|
| scores = [] |
| try: |
| for i in range(y_true.shape[1]): |
| if len(np.unique(y_true[:, i])) == 2: |
| scores.append(roc_auc_score(y_true[:, i], probs[:, i])) |
| |
| if not scores: |
| return -100.0 |
| return float(np.mean(scores)) |
| except Exception: |
| return -100.0 |
|
|
|
|
| def calculate_robust_pr_auc_multilabel(y_true: np.ndarray, probs: np.ndarray) -> float: |
| """ |
| Robust PR AUC for multi-label tasks (macro average). |
| """ |
| if np.isnan(probs).any(): |
| probs = np.nan_to_num(probs, nan=0.0) |
|
|
| scores = [] |
| try: |
| for i in range(y_true.shape[1]): |
| if len(np.unique(y_true[:, i])) == 2: |
| precision, recall, _ = precision_recall_curve(y_true[:, i], probs[:, i]) |
| scores.append(auc(recall, precision)) |
| |
| if not scores: |
| return -100.0 |
| return float(np.mean(scores)) |
| except Exception: |
| return -100.0 |
|
|
|
|
| def compute_single_label_classification_metrics(p: EvalPrediction) -> dict[str, float]: |
| """ |
| Compute comprehensive metrics for single-label classification tasks. |
| |
| Args: |
| p: EvalPrediction object containing model predictions and ground truth labels |
| |
| Returns: |
| Dictionary with the following metrics (all rounded to 5 decimal places): |
| - f1: F1 score (weighted average) |
| - precision: Precision score (weighted average) |
| - recall: Recall score (weighted average) |
| - accuracy: Overall accuracy |
| - mcc: Matthews Correlation Coefficient |
| - roc_auc: Area Under ROC Curve (weighted average) |
| - pr_auc: Area Under Precision-Recall Curve (weighted average) |
| |
| Note: |
| - Handles both binary and multi-class cases |
| - For binary case: uses 0.5 threshold on probabilities |
| - For multi-class: uses argmax for class prediction |
| - Prints confusion matrix for detailed error analysis |
| - Uses weighted averaging for multi-class metrics |
| - Handles AUC calculation for both binary and multi-class cases |
| """ |
| logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions |
| labels = p.label_ids[1] if isinstance(p.label_ids, tuple) else p.label_ids |
|
|
| y_pred = logits.argmax(axis=-1).flatten() |
| y_true = labels.flatten().astype(int) |
| probs = softmax(logits) |
|
|
| |
| roc_auc = calculate_robust_roc_auc_multiclass(y_true, probs) |
| |
| |
| pr_auc = calculate_robust_pr_auc_multiclass(y_true, probs) |
| |
| cm = confusion_matrix(y_true, y_pred) |
| print("\nConfusion Matrix:") |
| print(cm) |
|
|
| f1 = f1_score(y_true, y_pred, average='macro', zero_division=0) |
| precision = precision_score(y_true, y_pred, average='macro', zero_division=0) |
| recall = recall_score(y_true, y_pred, average='macro', zero_division=0) |
| accuracy = accuracy_score(y_true, y_pred) |
| mcc = matthews_corrcoef(y_true, y_pred) |
|
|
| return { |
| 'f1': round(f1, 5), |
| 'precision': round(precision, 5), |
| 'recall': round(recall, 5), |
| 'accuracy': round(accuracy, 5), |
| 'mcc': round(mcc, 5), |
| 'roc_auc': round(roc_auc, 5), |
| 'pr_auc': round(pr_auc, 5) |
| } |
|
|
|
|
| def compute_tokenwise_classification_metrics(p: EvalPrediction) -> dict[str, float]: |
| """ |
| Compute metrics for token-level classification tasks. |
| |
| Args: |
| p: EvalPrediction object containing model predictions and ground truth labels |
| |
| Returns: |
| Dictionary containing the following metrics (all rounded to 5 decimal places): |
| - accuracy: Overall accuracy |
| - f1: F1 score (macro average) |
| - precision: Precision score (macro average) |
| - recall: Recall score (macro average) |
| - mcc: Matthews Correlation Coefficient |
| - roc_auc: Area Under ROC Curve (weighted average) |
| - pr_auc: Area Under Precision-Recall Curve (weighted average) |
| |
| Note: |
| - Handles special token padding (-100) by filtering before metric calculation |
| - Uses macro averaging for multi-class metrics |
| - Converts predictions to class labels using argmax |
| - Handles AUC calculation for both binary and multi-class cases |
| """ |
| logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions |
| labels = p.label_ids |
| |
| y_pred = logits.argmax(axis=-1).flatten() |
| y_true = labels.flatten() |
| valid_indices = y_true != -100 |
| y_pred = y_pred[valid_indices] |
| y_true = y_true[valid_indices] |
|
|
| cm = confusion_matrix(y_true, y_pred) |
| print("\nConfusion Matrix:") |
| print(cm) |
|
|
| f1 = f1_score(y_true, y_pred, average='macro', zero_division=0) |
| precision = precision_score(y_true, y_pred, average='macro', zero_division=0) |
| recall = recall_score(y_true, y_pred, average='macro', zero_division=0) |
| accuracy = accuracy_score(y_true, y_pred) |
| mcc = matthews_corrcoef(y_true, y_pred) |
| |
| |
| probs = softmax(logits) |
| probs = probs.reshape(-1, probs.shape[-1]) |
| probs = probs[valid_indices] |
| |
| |
| roc_auc = calculate_robust_roc_auc_multiclass(y_true, probs) |
| |
| |
| pr_auc = calculate_robust_pr_auc_multiclass(y_true, probs) |
| |
| return { |
| 'accuracy': round(accuracy, 5), |
| 'f1': round(f1, 5), |
| 'precision': round(precision, 5), |
| 'recall': round(recall, 5), |
| 'mcc': round(mcc, 5), |
| 'roc_auc': round(roc_auc, 5), |
| 'pr_auc': round(pr_auc, 5) |
| } |
|
|
|
|
| def compute_multi_label_classification_metrics(p: EvalPrediction) -> dict[str, float]: |
| """ |
| Compute comprehensive metrics for multi-label classification tasks. |
| |
| Args: |
| p: EvalPrediction object containing model predictions and ground truth labels |
| |
| Returns: |
| Dictionary containing the following metrics (all rounded to 5 decimal places): |
| - accuracy: Overall accuracy |
| - f1: F1 score (optimized across thresholds) |
| - precision: Precision score (at optimal threshold) |
| - recall: Recall score (at optimal threshold) |
| - hamming_loss: Proportion of wrong labels |
| - threshold: Optimal classification threshold |
| - mcc: Matthews Correlation Coefficient |
| - roc_auc: Area Under ROC Curve (macro average) |
| - pr_auc: Area Under Precision-Recall Curve (macro average) |
| |
| Note: |
| - Converts inputs to PyTorch tensors |
| - Applies softmax to raw predictions |
| - Uses threshold optimization for best F1 score |
| - Handles multi-class ROC AUC using one-vs-rest |
| - All metrics are computed on flattened predictions |
| """ |
| preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions |
| labels = p.label_ids[1] if isinstance(p.label_ids, tuple) else p.label_ids |
|
|
| |
| if not isinstance(preds, torch.Tensor): |
| preds = torch.tensor(preds) |
| if not isinstance(labels, torch.Tensor): |
| y_true = torch.tensor(labels, dtype=torch.int) |
| else: |
| y_true = labels.int() |
|
|
| probs = preds.sigmoid() |
| y_pred = (probs > 0.5).int() |
|
|
| |
| probs_flat = probs.flatten() |
| y_true_flat = y_true.flatten() |
| f1, prec, recall, thres = max_metrics(probs_flat, y_true_flat) |
| |
| y_pred_flat, y_true_flat = y_pred.flatten().numpy(), y_true.flatten().numpy() |
| |
| accuracy = accuracy_score(y_pred_flat, y_true_flat) |
| hamming = hamming_loss(y_pred_flat, y_true_flat) |
| mcc = matthews_corrcoef(y_true_flat, y_pred_flat) |
| |
| |
| |
| roc_auc = calculate_robust_roc_auc_multilabel(y_true.numpy(), probs.numpy()) |
| |
| |
| pr_auc = calculate_robust_pr_auc_multilabel(y_true.numpy(), probs.numpy()) |
|
|
| return { |
| 'accuracy': round(accuracy, 5), |
| 'f1': round(f1, 5), |
| 'precision': round(prec, 5), |
| 'recall': round(recall, 5), |
| 'hamming_loss': round(hamming, 5), |
| 'threshold': round(thres, 5), |
| 'mcc': round(mcc, 5), |
| 'roc_auc': round(roc_auc, 5), |
| 'pr_auc': round(pr_auc, 5) |
| } |
|
|
|
|
| def compute_regression_metrics(p: EvalPrediction) -> dict[str, float]: |
| """ |
| Compute comprehensive metrics for regression tasks. |
| |
| Args: |
| p: EvalPrediction object containing model predictions and ground truth values |
| |
| Returns: |
| Dictionary containing the following metrics (all rounded to 5 decimal places): |
| - r_squared: Coefficient of determination (R²) |
| - spearman_rho: Spearman rank correlation coefficient |
| - spear_pval: P-value for Spearman correlation |
| - pearson_rho: Pearson correlation coefficient |
| - pear_pval: P-value for Pearson correlation |
| - mse: Mean Squared Error |
| - mae: Mean Absolute Error |
| - rmse: Root Mean Squared Error |
| |
| Note: |
| - Handles both raw predictions and tuple predictions |
| - Flattens inputs to 1D arrays |
| - Includes both correlation and error metrics |
| - P-values indicate statistical significance of correlations |
| - RMSE is calculated as square root of MSE |
| """ |
| preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions |
| labels = p.label_ids[1] if isinstance(p.label_ids, tuple) else p.label_ids |
|
|
| y_pred = np.array(preds).flatten() |
| y_true = np.array(labels).flatten() |
|
|
| if np.isnan(y_true).any(): |
| print("y_true Nans were cast to 0") |
| y_true = np.where(np.isnan(y_true), 0, y_true) |
| if np.isnan(y_pred).any(): |
| print("y_pred Nans were cast to 0") |
| y_pred = np.where(np.isnan(y_pred), 0, y_pred) |
|
|
| try: |
| spearman_rho, spear_pval = spearmanr(y_pred, y_true) |
| pearson_rho, pear_pval = pearsonr(y_pred, y_true) |
| except: |
| spearman_rho = -100.0 |
| spear_pval = -100.0 |
| pearson_rho = -100.0 |
| pear_pval = -100.0 |
|
|
| r2 = r2_score(y_true, y_pred) |
| mse = mean_squared_error(y_true, y_pred) |
| mae = mean_absolute_error(y_true, y_pred) |
| rmse = np.sqrt(mse) |
|
|
| return { |
| 'r_squared': round(r2, 5), |
| 'spearman_rho': round(spearman_rho, 5), |
| 'spear_pval': round(spear_pval, 5), |
| 'pearson_rho': round(pearson_rho, 5), |
| 'pear_pval': round(pear_pval, 5), |
| 'mse': round(mse, 5), |
| 'mae': round(mae, 5), |
| 'rmse': round(rmse, 5), |
| } |
|
|
|
|
| def compute_tokenwise_regression_metrics(p: EvalPrediction) -> dict[str, float]: |
| """ |
| Compute regression metrics tokenwise, ignoring label positions equal to -100. |
| |
| Compatible with HF Trainer `compute_metrics` API. |
| """ |
| preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions |
| labels = p.label_ids[1] if isinstance(p.label_ids, tuple) else p.label_ids |
|
|
| y_pred = np.array(preds) |
| y_true = np.array(labels) |
|
|
| |
| if y_pred.ndim == y_true.ndim + 1 and y_pred.shape[-1] == 1: |
| y_pred = np.squeeze(y_pred, axis=-1) |
|
|
| |
| valid_mask = (y_true != -100) |
| y_true = y_true[valid_mask].astype(float) |
| y_pred = y_pred[valid_mask].astype(float) |
|
|
| if y_true.size == 0: |
| return { |
| 'r_squared': -100.0, |
| 'spearman_rho': -100.0, |
| 'spear_pval': -100.0, |
| 'pearson_rho': -100.0, |
| 'pear_pval': -100.0, |
| 'mse': -100.0, |
| 'mae': -100.0, |
| 'rmse': -100.0, |
| } |
|
|
| if np.isnan(y_true).any(): |
| print("y_true Nans were cast to 0") |
| y_true = np.where(np.isnan(y_true), 0, y_true) |
| if np.isnan(y_pred).any(): |
| print("y_pred Nans were cast to 0") |
| y_pred = np.where(np.isnan(y_pred), 0, y_pred) |
|
|
| try: |
| spearman_rho, spear_pval = spearmanr(y_pred, y_true) |
| pearson_rho, pear_pval = pearsonr(y_pred, y_true) |
| except Exception: |
| spearman_rho = -100.0 |
| spear_pval = -100.0 |
| pearson_rho = -100.0 |
| pear_pval = -100.0 |
|
|
| r2 = r2_score(y_true, y_pred) |
| mse = mean_squared_error(y_true, y_pred) |
| mae = mean_absolute_error(y_true, y_pred) |
| rmse = np.sqrt(mse) |
|
|
| return { |
| 'r_squared': round(float(r2), 5), |
| 'spearman_rho': round(float(spearman_rho), 5), |
| 'spear_pval': round(float(spear_pval), 5), |
| 'pearson_rho': round(float(pearson_rho), 5), |
| 'pear_pval': round(float(pear_pval), 5), |
| 'mse': round(float(mse), 5), |
| 'mae': round(float(mae), 5), |
| 'rmse': round(float(rmse), 5), |
| } |
|
|
|
|
| def get_compute_metrics(task_type: str, tokenwise: bool = False): |
| if task_type == 'singlelabel': |
| compute_metrics = compute_single_label_classification_metrics |
| elif task_type == 'multilabel': |
| compute_metrics = compute_multi_label_classification_metrics |
| elif task_type == 'sigmoid_regression': |
| |
| compute_metrics = compute_tokenwise_regression_metrics if tokenwise else compute_regression_metrics |
| elif not task_type == 'regression' and tokenwise: |
| compute_metrics = compute_tokenwise_classification_metrics |
| elif task_type == 'regression' and not tokenwise: |
| compute_metrics = compute_regression_metrics |
| elif task_type == 'regression' and tokenwise: |
| compute_metrics = compute_tokenwise_regression_metrics |
| else: |
| raise ValueError(f'Task type {task_type} not supported') |
| return compute_metrics |
|
|
|
|
| if __name__ == "__main__": |
| |
|
|
| print("Running tests for metrics functions...") |
| |
| |
| print("\n--- compute_single_label_classification_metrics (Binary) ---") |
| |
| |
| predictions = np.array([[2.0, -1.0], [-1.0, 2.0]]) |
| label_ids = np.array([0, 1]) |
| p = EvalPrediction(predictions=predictions, label_ids=label_ids) |
| metrics = compute_single_label_classification_metrics(p) |
| print(metrics) |
|
|
| print("\n--- compute_single_label_classification_metrics (Multi-class) ---") |
| |
| predictions = np.array([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 2.0]]) |
| label_ids = np.array([0, 1, 2]) |
| p = EvalPrediction(predictions=predictions, label_ids=label_ids) |
| metrics = compute_single_label_classification_metrics(p) |
| print(metrics) |
|
|
| |
| print("\n--- compute_tokenwise_classification_metrics ---") |
| |
| |
| |
| |
| predictions = np.array([[[2.0, -1.0], [-1.0, 2.0], [2.0, -1.0]]]) |
| label_ids = np.array([[0, 1, -100]]) |
| p = EvalPrediction(predictions=predictions, label_ids=label_ids) |
| metrics = compute_tokenwise_classification_metrics(p) |
| print(metrics) |
|
|
| |
| print("\n--- compute_multi_label_classification_metrics ---") |
| |
| |
| |
| |
| predictions = np.array([[5.0, -5.0, 5.0], [-5.0, 5.0, -5.0]]) |
| label_ids = np.array([[1, 0, 1], [0, 1, 0]]) |
| p = EvalPrediction(predictions=predictions, label_ids=label_ids) |
| metrics = compute_multi_label_classification_metrics(p) |
| print(metrics) |
|
|
| |
| print("\n--- compute_regression_metrics ---") |
| predictions = np.array([1.0, 2.0, 3.0]) |
| label_ids = np.array([1.1, 1.9, 3.2]) |
| p = EvalPrediction(predictions=predictions, label_ids=label_ids) |
| metrics = compute_regression_metrics(p) |
| print(metrics) |
|
|
| |
| print("\n--- compute_tokenwise_regression_metrics ---") |
| |
| |
| predictions = np.array([[1.0, 2.0, 5.0]]) |
| label_ids = np.array([[1.1, 1.9, -100.0]]) |
| p = EvalPrediction(predictions=predictions, label_ids=label_ids) |
| metrics = compute_tokenwise_regression_metrics(p) |
| print(metrics) |
|
|
|
|