# Uploaded to the Hugging Face Hub by nikraf via huggingface_hub (revision 714cf46, verified).
import torch
import numpy as np
from sklearn.metrics import (
r2_score,
mean_squared_error,
mean_absolute_error,
f1_score,
precision_score,
recall_score,
roc_auc_score,
precision_recall_curve,
auc,
matthews_corrcoef,
confusion_matrix,
hamming_loss,
accuracy_score,
make_scorer,
)
from scipy.stats import pearsonr, spearmanr
from transformers import EvalPrediction
def softmax(x: np.ndarray) -> np.ndarray:
    """Numerically stable softmax over the last axis.

    Subtracting the per-row maximum before exponentiating prevents
    overflow (NaN/inf) for large logits; the shift cancels out in the
    normalization, so the result is mathematically unchanged.

    Args:
        x: Array of logits; softmax is applied along the last axis.

    Returns:
        Array of the same shape with values in (0, 1) that sum to 1
        along the last axis.
    """
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)
def regression_scorer():
    """Build a combined regression score: Spearman rho multiplied by R^2.

    Returns:
        A callable ``(y_true, y_pred) -> float`` suitable for wrapping
        with sklearn's ``make_scorer``.
    """
    def dual_score(y_true, y_pred):
        # Reward both rank agreement (Spearman) and variance explained (R^2).
        rank_corr = spearmanr(y_true, y_pred).correlation
        return rank_corr * r2_score(y_true, y_pred)
    return dual_score
def classification_scorer():
    """Build a classification score based on the Matthews correlation coefficient.

    Returns:
        A callable ``(y_true, y_pred) -> float`` that simply delegates
        to sklearn's ``matthews_corrcoef``.
    """
    def mcc_scorer(y_true, y_pred):
        return matthews_corrcoef(y_true, y_pred)

    return mcc_scorer
def get_classification_scorer():
    """Return an sklearn scorer wrapping the MCC score (higher is better)."""
    score_fn = classification_scorer()
    return make_scorer(score_fn, greater_is_better=True)
def get_regression_scorer():
    """Return an sklearn scorer wrapping the Spearman*R^2 score (higher is better)."""
    score_fn = regression_scorer()
    return make_scorer(score_fn, greater_is_better=True)
def calculate_max_metrics(ss: torch.Tensor, labels: torch.Tensor, cutoff: float) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """Compute binary F1/precision/recall at a single decision threshold.

    Args:
        ss: Prediction scores (any shape broadcastable to ``labels``).
        labels: Ground-truth binary labels (0 or 1).
        cutoff: Scores ``>= cutoff`` are treated as positive predictions.

    Returns:
        Tuple of scalar tensors ``(f1, precision, recall)``.

    Note:
        Zero denominators (no predicted positives, no actual positives,
        or precision + recall == 0) yield 0 rather than NaN.
    """
    scores = ss.float()
    truth = labels.float()

    predicted_pos = scores >= cutoff
    actual_pos = truth == 1.0
    actual_neg = truth == 0.0

    tp = torch.sum(predicted_pos & actual_pos)
    fp = torch.sum(predicted_pos & actual_neg)
    fn = torch.sum((scores < cutoff) & actual_pos)

    zero = torch.tensor(0.0)
    prec_den = tp + fp
    precision = torch.where(prec_den != 0, tp / prec_den, zero)
    rec_den = tp + fn
    recall = torch.where(rec_den != 0, tp / rec_den, zero)
    f1_den = precision + recall
    f1 = torch.where(f1_den != 0, (2 * precision * recall) / f1_den, zero)
    return f1, precision, recall
def max_metrics(ss: torch.Tensor, labels: torch.Tensor, increment: float = 0.01) -> tuple[float, float, float, float]:
    """Scan decision thresholds and return the metrics at the best F1.

    All thresholds are evaluated at once via broadcasting: predictions
    form a ``(n_cutoffs, n_samples)`` matrix from which TP/FP/FN counts
    are reduced per cutoff.

    Args:
        ss: Prediction scores; NaNs are replaced by 0 and values are
            clamped to [-1, 1] before scanning.
        labels: Ground-truth binary labels (0 or 1).
        increment: Step between consecutive candidate cutoffs.

    Returns:
        ``(f1, precision, recall, cutoff)`` as Python floats, taken at
        the first cutoff that maximizes F1. Returns all zeros (with the
        starting cutoff) when no cutoffs can be generated.

    Note:
        - Cutoffs run from the minimum score up to (excluding) 1; if the
          minimum is already >= 1, the scan starts at 0 instead.
        - NaN F1 values are ranked as -1 when locating the maximum.
    """
    # Sanitize scores: drop NaNs, then restrict to the expected range.
    scores = torch.nan_to_num(ss, nan=0.0)
    scores = torch.clamp(scores, -1.0, 1.0)

    start = scores.min().item()
    stop = 1
    if start >= stop:
        # Degenerate case: every score sits at the top of the range.
        start = 0

    scores = scores.float()
    truth = labels.float()

    cutoffs = torch.arange(start, stop, increment, device=scores.device, dtype=scores.dtype)
    if len(cutoffs) == 0:
        # Nothing to scan.
        return 0.0, 0.0, 0.0, start

    # Broadcast to (n_cutoffs, n_samples): row i holds predictions at cutoffs[i].
    preds = (scores.unsqueeze(0) >= cutoffs.unsqueeze(1)).float()
    truth_row = truth.unsqueeze(0)

    # Per-cutoff confusion counts, reduced over the sample axis.
    tp = torch.sum(preds * truth_row, dim=1)
    fp = torch.sum(preds * (1.0 - truth_row), dim=1)
    fn = torch.sum((1.0 - preds) * truth_row, dim=1)

    zero = torch.tensor(0.0, device=scores.device)
    prec_den = tp + fp
    precision = torch.where(prec_den != 0, tp / prec_den, zero)
    rec_den = tp + fn
    recall = torch.where(rec_den != 0, tp / rec_den, zero)
    f1_den = precision + recall
    f1s = torch.where(f1_den != 0, (2 * precision * recall) / f1_den, zero)

    # Rank NaN F1 values below every real score before taking the argmax.
    ranked = torch.where(torch.isnan(f1s), torch.tensor(-1.0, device=scores.device), f1s)
    best = torch.argmax(ranked)
    return f1s[best].item(), precision[best].item(), recall[best].item(), cutoffs[best].item()
def calculate_robust_roc_auc_multiclass(y_true: np.ndarray, probs: np.ndarray) -> float:
    """Robust ROC AUC for multi-class (single-label) tasks.

    Classes absent from ``y_true`` are skipped; the remaining per-class
    one-vs-rest AUCs are averaged weighted by class prevalence. Returns
    the sentinel -100.0 when the score is undefined or scoring raises.
    """
    # Sanitize probabilities before handing them to sklearn.
    if np.isnan(probs).any():
        probs = np.nan_to_num(probs, nan=0.0)
    n_classes = probs.shape[1]
    try:
        if n_classes == 2:
            # Binary: score the positive-class column, if both classes appear.
            if len(np.unique(y_true)) != 2:
                return -100.0
            return roc_auc_score(y_true, probs[:, 1])
        onehot = np.eye(n_classes)[y_true]
        per_class = [
            (roc_auc_score(onehot[:, c], probs[:, c]), np.sum(onehot[:, c]))
            for c in range(n_classes)
            # Skip classes without both positive and negative samples.
            if len(np.unique(onehot[:, c])) == 2
        ]
        if not per_class:
            return -100.0
        class_scores, class_weights = zip(*per_class)
        return float(np.average(class_scores, weights=class_weights))
    except Exception:
        return -100.0
def calculate_robust_pr_auc_multiclass(y_true: np.ndarray, probs: np.ndarray) -> float:
    """Robust PR AUC for multi-class (single-label) tasks.

    Integrates the precision-recall curve per class (one-vs-rest),
    skipping classes missing from ``y_true``, and averages weighted by
    class prevalence. Returns -100.0 when undefined or on error.
    """
    if np.isnan(probs).any():
        probs = np.nan_to_num(probs, nan=0.0)
    n_classes = probs.shape[1]
    try:
        if n_classes == 2:
            # Binary: integrate the PR curve of the positive-class column.
            if len(np.unique(y_true)) != 2:
                return -100.0
            prec, rec, _ = precision_recall_curve(y_true, probs[:, 1])
            return auc(rec, prec)
        onehot = np.eye(n_classes)[y_true]
        class_scores = []
        class_weights = []
        for c in range(n_classes):
            col = onehot[:, c]
            # Skip classes without both positive and negative samples.
            if len(np.unique(col)) != 2:
                continue
            prec, rec, _ = precision_recall_curve(col, probs[:, c])
            class_scores.append(auc(rec, prec))
            class_weights.append(np.sum(col))
        if not class_scores:
            return -100.0
        return float(np.average(class_scores, weights=class_weights))
    except Exception:
        return -100.0
def calculate_robust_roc_auc_multilabel(y_true: np.ndarray, probs: np.ndarray) -> float:
    """Robust macro-averaged ROC AUC for multi-label tasks.

    Labels that are entirely positive or entirely negative in ``y_true``
    are skipped. Returns -100.0 when no label is scorable or on error.
    """
    if np.isnan(probs).any():
        probs = np.nan_to_num(probs, nan=0.0)
    try:
        per_label = [
            roc_auc_score(y_true[:, c], probs[:, c])
            for c in range(y_true.shape[1])
            # Only labels with both classes present have a defined AUC.
            if len(np.unique(y_true[:, c])) == 2
        ]
        if not per_label:
            return -100.0
        return float(np.mean(per_label))
    except Exception:
        return -100.0
def calculate_robust_pr_auc_multilabel(y_true: np.ndarray, probs: np.ndarray) -> float:
    """Robust macro-averaged PR AUC for multi-label tasks.

    Labels that are entirely positive or entirely negative in ``y_true``
    are skipped. Returns -100.0 when no label is scorable or on error.
    """
    if np.isnan(probs).any():
        probs = np.nan_to_num(probs, nan=0.0)
    per_label = []
    try:
        for c in range(y_true.shape[1]):
            col = y_true[:, c]
            # Only labels with both classes present have a defined PR curve.
            if len(np.unique(col)) != 2:
                continue
            prec, rec, _ = precision_recall_curve(col, probs[:, c])
            per_label.append(auc(rec, prec))
        if not per_label:
            return -100.0
        return float(np.mean(per_label))
    except Exception:
        return -100.0
def compute_single_label_classification_metrics(p: EvalPrediction) -> dict[str, float]:
    """
    Compute classification metrics for single-label (binary or multi-class) tasks.

    Args:
        p: EvalPrediction with model logits in ``predictions`` and integer
           class labels in ``label_ids``. Either may arrive as a tuple, in
           which case ``predictions[0]`` / ``label_ids[1]`` are used.

    Returns:
        Dictionary with the following metrics (all rounded to 5 decimal places):
            - f1 / precision / recall: macro-averaged scores
            - accuracy: overall accuracy
            - mcc: Matthews correlation coefficient
            - roc_auc / pr_auc: prevalence-weighted one-vs-rest AUCs; the
              sentinel -100.0 when undefined (e.g. a class missing from y_true)

    Note:
        - Class predictions come from argmax over the logits for both the
          binary and multi-class cases.
        - AUC metrics use softmax probabilities via
          calculate_robust_roc_auc_multiclass / calculate_robust_pr_auc_multiclass.
        - Prints the confusion matrix to stdout for detailed error analysis.
    """
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids[1] if isinstance(p.label_ids, tuple) else p.label_ids
    y_pred = logits.argmax(axis=-1).flatten()
    y_true = labels.flatten().astype(int)
    probs = softmax(logits)
    # Calculate ROC AUC
    roc_auc = calculate_robust_roc_auc_multiclass(y_true, probs)
    # Calculate PR AUC (true AUC of Precision-Recall curve)
    pr_auc = calculate_robust_pr_auc_multiclass(y_true, probs)
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(cm)
    # Macro averaging weights every class equally regardless of support.
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    return {
        'f1': round(f1, 5),
        'precision': round(precision, 5),
        'recall': round(recall, 5),
        'accuracy': round(accuracy, 5),
        'mcc': round(mcc, 5),
        'roc_auc': round(roc_auc, 5),
        'pr_auc': round(pr_auc, 5)
    }
def compute_tokenwise_classification_metrics(p: EvalPrediction) -> dict[str, float]:
    """
    Compute metrics for token-level classification tasks.

    Args:
        p: EvalPrediction with per-token logits in ``predictions`` and
           per-token integer labels in ``label_ids``.

    Returns:
        Dictionary with the following metrics (all rounded to 5 decimal places):
            - accuracy: overall token accuracy
            - f1 / precision / recall: macro-averaged scores
            - mcc: Matthews correlation coefficient
            - roc_auc / pr_auc: prevalence-weighted one-vs-rest AUCs; the
              sentinel -100.0 when undefined

    Note:
        - Positions labeled -100 (padding / special tokens) are filtered
          out before every metric is computed.
        - Per-token class predictions use argmax over the last logits axis.
        - Prints the confusion matrix to stdout for detailed error analysis.
    """
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids
    # Compute f1 score
    y_pred = logits.argmax(axis=-1).flatten()
    y_true = labels.flatten()
    valid_indices = y_true != -100
    y_pred = y_pred[valid_indices]
    y_true = y_true[valid_indices]
    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(cm)
    f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    # Calculate probabilities for AUC metrics
    probs = softmax(logits)
    probs = probs.reshape(-1, probs.shape[-1])  # Flatten to (n_samples, n_classes)
    probs = probs[valid_indices]  # Filter by valid indices
    # Calculate ROC AUC
    roc_auc = calculate_robust_roc_auc_multiclass(y_true, probs)
    # Calculate PR AUC (true AUC of Precision-Recall curve)
    pr_auc = calculate_robust_pr_auc_multiclass(y_true, probs)
    return {
        'accuracy': round(accuracy, 5),
        'f1': round(f1, 5),
        'precision': round(precision, 5),
        'recall': round(recall, 5),
        'mcc': round(mcc, 5),
        'roc_auc': round(roc_auc, 5),
        'pr_auc': round(pr_auc, 5)
    }
def compute_multi_label_classification_metrics(p: EvalPrediction) -> dict[str, float]:
    """
    Compute comprehensive metrics for multi-label classification tasks.

    Args:
        p: EvalPrediction with per-label logits in ``predictions`` and
           binary label matrices in ``label_ids``.

    Returns:
        Dictionary with the following metrics (all rounded to 5 decimal places):
            - accuracy: overall accuracy on flattened 0.5-threshold predictions
            - f1 / precision / recall: scores at the F1-optimal threshold
              found by max_metrics
            - hamming_loss: proportion of wrong labels (0.5 threshold)
            - threshold: the F1-optimal classification threshold
            - mcc: Matthews correlation coefficient (0.5 threshold)
            - roc_auc / pr_auc: macro-averaged over labels; -100.0 when undefined

    Note:
        - Applies a sigmoid (not softmax) to the raw logits, since labels
          are independent in the multi-label setting.
        - f1/precision/recall use the optimized threshold, while accuracy,
          hamming_loss, and mcc are computed at the fixed 0.5 threshold.
        - accuracy_score/hamming_loss are called as (y_pred, y_true);
          both are symmetric in their arguments, so the result is unaffected.
        - AUC metrics operate on the unflattened (n_samples, n_labels) arrays.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids[1] if isinstance(p.label_ids, tuple) else p.label_ids
    # Convert to tensors efficiently, avoiding unnecessary numpy round-trip
    if not isinstance(preds, torch.Tensor):
        preds = torch.tensor(preds)
    if not isinstance(labels, torch.Tensor):
        y_true = torch.tensor(labels, dtype=torch.int)
    else:
        y_true = labels.int()
    probs = preds.sigmoid()
    y_pred = (probs > 0.5).int()
    # Flatten before max_metrics for efficiency - max_metrics expects flattened tensors
    probs_flat = probs.flatten()
    y_true_flat = y_true.flatten()
    f1, prec, recall, thres = max_metrics(probs_flat, y_true_flat)
    y_pred_flat, y_true_flat = y_pred.flatten().numpy(), y_true.flatten().numpy()
    accuracy = accuracy_score(y_pred_flat, y_true_flat)
    hamming = hamming_loss(y_pred_flat, y_true_flat)
    mcc = matthews_corrcoef(y_true_flat, y_pred_flat)
    # Calculate ROC AUC for multilabel case
    # Use unflattened arrays for macro averaging
    roc_auc = calculate_robust_roc_auc_multilabel(y_true.numpy(), probs.numpy())
    # Calculate PR AUC for multilabel case (true AUC of Precision-Recall curve)
    pr_auc = calculate_robust_pr_auc_multilabel(y_true.numpy(), probs.numpy())
    return {
        'accuracy': round(accuracy, 5),
        'f1': round(f1, 5),
        'precision': round(prec, 5),
        'recall': round(recall, 5),
        'hamming_loss': round(hamming, 5),
        'threshold': round(thres, 5),
        'mcc': round(mcc, 5),
        'roc_auc': round(roc_auc, 5),
        'pr_auc': round(pr_auc, 5)
    }
def compute_regression_metrics(p: EvalPrediction) -> dict[str, float]:
    """
    Compute comprehensive metrics for regression tasks.

    Args:
        p: EvalPrediction with predictions and ground-truth values; either
           may arrive as a tuple, in which case ``predictions[0]`` /
           ``label_ids[1]`` are used.

    Returns:
        Dictionary with the following metrics (all rounded to 5 decimal places):
            - r_squared: coefficient of determination (R^2)
            - spearman_rho / spear_pval: Spearman correlation and p-value
            - pearson_rho / pear_pval: Pearson correlation and p-value
            - mse / mae / rmse: error metrics (rmse = sqrt(mse))
        Correlation fields are -100.0 when scipy raises (e.g. constant input).

    Note:
        - Inputs are flattened to 1-D arrays; NaNs are replaced with 0
          (with a printed warning).
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids[1] if isinstance(p.label_ids, tuple) else p.label_ids
    y_pred = np.array(preds).flatten()
    y_true = np.array(labels).flatten()
    if np.isnan(y_true).any():
        print("y_true Nans were cast to 0")
        y_true = np.where(np.isnan(y_true), 0, y_true)
    if np.isnan(y_pred).any():
        print("y_pred Nans were cast to 0")
        y_pred = np.where(np.isnan(y_pred), 0, y_pred)
    try:
        spearman_rho, spear_pval = spearmanr(y_pred, y_true)
        pearson_rho, pear_pval = pearsonr(y_pred, y_true)
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception to match
        # compute_tokenwise_regression_metrics.
        spearman_rho = -100.0
        spear_pval = -100.0
        pearson_rho = -100.0
        pear_pval = -100.0
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    # Cast to plain Python floats before rounding, mirroring the tokenwise variant.
    return {
        'r_squared': round(float(r2), 5),
        'spearman_rho': round(float(spearman_rho), 5),
        'spear_pval': round(float(spear_pval), 5),
        'pearson_rho': round(float(pearson_rho), 5),
        'pear_pval': round(float(pear_pval), 5),
        'mse': round(float(mse), 5),
        'mae': round(float(mae), 5),
        'rmse': round(float(rmse), 5),
    }
def compute_tokenwise_regression_metrics(p: EvalPrediction) -> dict[str, float]:
    """
    Compute regression metrics tokenwise, ignoring label positions equal to -100.
    Compatible with HF Trainer `compute_metrics` API.

    Args:
        p: EvalPrediction with per-token predictions and labels; either may
           arrive as a tuple, in which case ``predictions[0]`` /
           ``label_ids[1]`` are used.

    Returns:
        Dictionary with r_squared, spearman_rho, spear_pval, pearson_rho,
        pear_pval, mse, mae, rmse (all rounded to 5 decimal places).
        Every field is -100.0 when no valid (non -100) positions remain;
        correlation fields are -100.0 when scipy raises.

    Note:
        - A trailing prediction dimension of size 1 is squeezed so that
          predictions align with labels before masking.
        - NaNs in either array are replaced with 0 (with a printed warning).
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    labels = p.label_ids[1] if isinstance(p.label_ids, tuple) else p.label_ids
    y_pred = np.array(preds)
    y_true = np.array(labels)
    # If predictions have an extra trailing dim of size 1, squeeze it
    if y_pred.ndim == y_true.ndim + 1 and y_pred.shape[-1] == 1:
        y_pred = np.squeeze(y_pred, axis=-1)
    # Flatten to align and filter by valid positions (labels != -100)
    valid_mask = (y_true != -100)
    y_true = y_true[valid_mask].astype(float)
    y_pred = y_pred[valid_mask].astype(float)
    if y_true.size == 0:
        # No scoreable positions at all: return the sentinel for every metric.
        return {
            'r_squared': -100.0,
            'spearman_rho': -100.0,
            'spear_pval': -100.0,
            'pearson_rho': -100.0,
            'pear_pval': -100.0,
            'mse': -100.0,
            'mae': -100.0,
            'rmse': -100.0,
        }
    if np.isnan(y_true).any():
        print("y_true Nans were cast to 0")
        y_true = np.where(np.isnan(y_true), 0, y_true)
    if np.isnan(y_pred).any():
        print("y_pred Nans were cast to 0")
        y_pred = np.where(np.isnan(y_pred), 0, y_pred)
    try:
        spearman_rho, spear_pval = spearmanr(y_pred, y_true)
        pearson_rho, pear_pval = pearsonr(y_pred, y_true)
    except Exception:
        spearman_rho = -100.0
        spear_pval = -100.0
        pearson_rho = -100.0
        pear_pval = -100.0
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    return {
        'r_squared': round(float(r2), 5),
        'spearman_rho': round(float(spearman_rho), 5),
        'spear_pval': round(float(spear_pval), 5),
        'pearson_rho': round(float(pearson_rho), 5),
        'pear_pval': round(float(pear_pval), 5),
        'mse': round(float(mse), 5),
        'mae': round(float(mae), 5),
        'rmse': round(float(rmse), 5),
    }
def get_compute_metrics(task_type: str, tokenwise: bool = False):
    """Select the metrics callable for a task type.

    Args:
        task_type: 'singlelabel', 'multilabel', 'regression',
            'sigmoid_regression', or any other string (the latter is
            scored as tokenwise classification when ``tokenwise`` is True).
        tokenwise: Whether metrics are computed per token.

    Returns:
        A function compatible with the HF Trainer ``compute_metrics`` API.

    Raises:
        ValueError: For an unrecognized, non-tokenwise task type.
    """
    if task_type == 'singlelabel':
        return compute_single_label_classification_metrics
    if task_type == 'multilabel':
        return compute_multi_label_classification_metrics
    if task_type in ('regression', 'sigmoid_regression'):
        # sigmoid_regression is scored exactly like plain regression.
        return compute_tokenwise_regression_metrics if tokenwise else compute_regression_metrics
    if tokenwise:
        # Every remaining tokenwise task is scored as tokenwise classification.
        return compute_tokenwise_classification_metrics
    raise ValueError(f'Task type {task_type} not supported')
if __name__ == "__main__":
# py -m metrics
print("Running tests for metrics functions...")
# Test compute_single_label_classification_metrics
print("\n--- compute_single_label_classification_metrics (Binary) ---")
# 2 samples, 2 classes.
# Logits: Sample 0 -> class 0 (high, low), Sample 1 -> class 1 (low, high)
predictions = np.array([[2.0, -1.0], [-1.0, 2.0]])
label_ids = np.array([0, 1])
p = EvalPrediction(predictions=predictions, label_ids=label_ids)
metrics = compute_single_label_classification_metrics(p)
print(metrics)
print("\n--- compute_single_label_classification_metrics (Multi-class) ---")
# 3 samples, 3 classes.
predictions = np.array([[2.0, 0.0, 0.0], [0.0, 2.0, 0.0], [0.0, 0.0, 2.0]])
label_ids = np.array([0, 1, 2])
p = EvalPrediction(predictions=predictions, label_ids=label_ids)
metrics = compute_single_label_classification_metrics(p)
print(metrics)
# Test compute_tokenwise_classification_metrics
print("\n--- compute_tokenwise_classification_metrics ---")
# 1 sample, 3 tokens, 2 classes.
# Token 0: pred 0, label 0
# Token 1: pred 1, label 1
# Token 2: pred 0, label -100 (ignored)
predictions = np.array([[[2.0, -1.0], [-1.0, 2.0], [2.0, -1.0]]])
label_ids = np.array([[0, 1, -100]])
p = EvalPrediction(predictions=predictions, label_ids=label_ids)
metrics = compute_tokenwise_classification_metrics(p)
print(metrics)
# Test compute_multi_label_classification_metrics
print("\n--- compute_multi_label_classification_metrics ---")
# 2 samples, 3 classes
# Sample 0: pred [1, 0, 1], label [1, 0, 1]
# Sample 1: pred [0, 1, 0], label [0, 1, 0]
# Logits need to be high for 1, low for 0.
predictions = np.array([[5.0, -5.0, 5.0], [-5.0, 5.0, -5.0]])
label_ids = np.array([[1, 0, 1], [0, 1, 0]])
p = EvalPrediction(predictions=predictions, label_ids=label_ids)
metrics = compute_multi_label_classification_metrics(p)
print(metrics)
# Test compute_regression_metrics
print("\n--- compute_regression_metrics ---")
predictions = np.array([1.0, 2.0, 3.0])
label_ids = np.array([1.1, 1.9, 3.2])
p = EvalPrediction(predictions=predictions, label_ids=label_ids)
metrics = compute_regression_metrics(p)
print(metrics)
# Test compute_tokenwise_regression_metrics
print("\n--- compute_tokenwise_regression_metrics ---")
# 1 sample, 3 tokens
# Token 2 is ignored (-100)
predictions = np.array([[1.0, 2.0, 5.0]])
label_ids = np.array([[1.1, 1.9, -100.0]])
p = EvalPrediction(predictions=predictions, label_ids=label_ids)
metrics = compute_tokenwise_regression_metrics(p)
print(metrics)