# Source: bootstrap/src/qualivec/evaluation.py
# Uploaded by akhil-vaidya ("Upload 26 files", commit f133a92, verified)
"""Evaluation utilities for QualiVec."""
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, Union, Any
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
class Evaluator:
    """Handles evaluation for QualiVec.

    Provides point-estimate metrics (accuracy, macro-averaged
    precision/recall/F1, per-class breakdowns, confusion matrix),
    bootstrap percentile confidence intervals, and matplotlib/seaborn
    plotting helpers for the results.
    """

    def __init__(self, verbose: bool = True):
        """Initialize the evaluator.

        Args:
            verbose: Whether to print status messages and show progress bars.
        """
        self.verbose = verbose

    def evaluate(self,
                 true_labels: List[str],
                 predicted_labels: List[str],
                 class_names: Optional[List[str]] = None) -> Dict[str, Any]:
        """Evaluate predictions against true labels.

        Args:
            true_labels: List of true class labels.
            predicted_labels: List of predicted class labels.
            class_names: Class names used to order per-class metrics and the
                confusion matrix. Defaults to the sorted union of labels seen
                in ``true_labels`` and ``predicted_labels``.

        Returns:
            Dictionary with keys ``accuracy``, ``precision_macro``,
            ``recall_macro``, ``f1_macro``, ``class_metrics`` (per-class
            precision/recall/f1/support dicts), ``confusion_matrix``,
            ``confusion_matrix_labels``, and ``n_samples``.

        Raises:
            ValueError: If the two label lists differ in length.
        """
        if len(true_labels) != len(predicted_labels):
            raise ValueError(f"Length mismatch: {len(true_labels)} true labels vs {len(predicted_labels)} predictions")
        if self.verbose:
            print(f"Evaluating {len(true_labels)} predictions")

        accuracy = accuracy_score(true_labels, predicted_labels)

        # Sorted union keeps the ordering deterministic even when the two
        # lists cover different label sets.
        if class_names is None:
            class_names = sorted(set(true_labels) | set(predicted_labels))

        # Macro-averaged metrics. zero_division=0 yields 0.0 (and silences
        # sklearn's UndefinedMetricWarning) for classes with no samples on
        # one side — the numeric results are unchanged.
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            true_labels, predicted_labels, average='macro', zero_division=0
        )

        # Per-class metrics, aligned with class_names via labels=.
        precision, recall, f1, support = precision_recall_fscore_support(
            true_labels, predicted_labels, labels=class_names,
            average=None, zero_division=0
        )
        class_metrics = {
            "precision": dict(zip(class_names, precision)),
            "recall": dict(zip(class_names, recall)),
            "f1": dict(zip(class_names, f1)),
            "support": dict(zip(class_names, support)),
        }

        # Rows are true labels, columns are predictions (sklearn convention).
        cm = confusion_matrix(true_labels, predicted_labels, labels=class_names)

        results = {
            "accuracy": accuracy,
            "precision_macro": precision_macro,
            "recall_macro": recall_macro,
            "f1_macro": f1_macro,
            "class_metrics": class_metrics,
            "confusion_matrix": cm,
            "confusion_matrix_labels": class_names,
            "n_samples": len(true_labels)
        }
        if self.verbose:
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision (macro): {precision_macro:.4f}")
            print(f"Recall (macro): {recall_macro:.4f}")
            print(f"F1 (macro): {f1_macro:.4f}")
        return results

    def bootstrap_evaluate(self,
                           true_labels: List[str],
                           predicted_labels: List[str],
                           n_iterations: int = 1000,
                           confidence_levels: Tuple[float, ...] = (0.9, 0.95, 0.99),
                           random_seed: Optional[int] = None) -> Dict[str, Any]:
        """Evaluate with bootstrap percentile confidence intervals.

        Args:
            true_labels: List of true class labels.
            predicted_labels: List of predicted class labels.
            n_iterations: Number of bootstrap resamples.
            confidence_levels: Confidence levels to compute intervals for.
                (Immutable tuple default — avoids the mutable-default pitfall.)
            random_seed: Seed for reproducibility of the resampling.

        Returns:
            Dictionary with ``point_estimates``, ``confidence_intervals``
            (metric -> level -> (lower, upper)), ``bootstrap_distribution``
            (metric -> list of per-iteration values), ``n_iterations``,
            and ``n_samples``.

        Raises:
            ValueError: If the two label lists differ in length.
        """
        if len(true_labels) != len(predicted_labels):
            raise ValueError(f"Length mismatch: {len(true_labels)} true labels vs {len(predicted_labels)} predictions")
        if self.verbose:
            print(f"Running bootstrap evaluation with {n_iterations} iterations")

        # Local Generator instead of np.random.seed(): seeding the global
        # NumPy RNG is a process-wide side effect that can perturb any other
        # code relying on np.random.
        rng = np.random.default_rng(random_seed)

        # Point estimates on the full (non-resampled) data. Prints the
        # summary once when verbose.
        original_results = self.evaluate(true_labels, predicted_labels)

        bootstrap_metrics: Dict[str, List[float]] = {
            "accuracy": [],
            "precision_macro": [],
            "recall_macro": [],
            "f1_macro": []
        }
        n_samples = len(true_labels)
        for _ in tqdm(range(n_iterations), disable=not self.verbose):
            # Sample indices with replacement.
            indices = rng.integers(0, n_samples, size=n_samples)
            bootstrap_true = [true_labels[i] for i in indices]
            bootstrap_pred = [predicted_labels[i] for i in indices]
            # Compute metrics directly instead of calling self.evaluate():
            # the original printed the full metric summary on EVERY iteration
            # when verbose, and computed an unused confusion matrix and
            # per-class breakdown per resample.
            bootstrap_metrics["accuracy"].append(
                accuracy_score(bootstrap_true, bootstrap_pred)
            )
            p_macro, r_macro, f_macro, _ = precision_recall_fscore_support(
                bootstrap_true, bootstrap_pred, average='macro', zero_division=0
            )
            bootstrap_metrics["precision_macro"].append(p_macro)
            bootstrap_metrics["recall_macro"].append(r_macro)
            bootstrap_metrics["f1_macro"].append(f_macro)

        # Empirical (percentile-method) confidence intervals.
        confidence_intervals: Dict[str, Dict[float, Tuple[float, float]]] = {}
        for metric, values in bootstrap_metrics.items():
            confidence_intervals[metric] = {}
            for level in confidence_levels:
                lower = np.percentile(values, (1 - level) / 2 * 100)
                upper = np.percentile(values, (1 + level) / 2 * 100)
                confidence_intervals[metric][level] = (lower, upper)

        results = {
            "point_estimates": {
                "accuracy": original_results["accuracy"],
                "precision_macro": original_results["precision_macro"],
                "recall_macro": original_results["recall_macro"],
                "f1_macro": original_results["f1_macro"]
            },
            "confidence_intervals": confidence_intervals,
            "bootstrap_distribution": bootstrap_metrics,
            "n_iterations": n_iterations,
            "n_samples": n_samples
        }
        if self.verbose:
            print("Bootstrap evaluation complete")
            print(f"Accuracy: {results['point_estimates']['accuracy']:.4f}")
            for level in confidence_levels:
                lower, upper = results['confidence_intervals']['accuracy'][level]
                print(f" {level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})")
        return results

    def plot_confusion_matrix(self,
                              confusion_matrix: np.ndarray,
                              class_names: List[str],
                              figsize: Tuple[int, int] = (10, 8),
                              title: str = "Confusion Matrix"):
        """Plot a confusion matrix as a seaborn heatmap.

        NOTE: the parameter name shadows the imported
        ``sklearn.metrics.confusion_matrix`` inside this method; kept for
        backward compatibility with keyword callers.

        Args:
            confusion_matrix: Confusion matrix as a 2-D numpy array
                (rows = true labels, columns = predictions).
            class_names: Axis tick labels, in the same order as the matrix.
            figsize: Figure size as (width, height).
            title: Plot title.
        """
        plt.figure(figsize=figsize)
        sns.heatmap(
            confusion_matrix,
            annot=True,
            fmt="d",  # integer counts in each cell
            cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names
        )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title(title)
        plt.tight_layout()
        plt.show()

    def plot_bootstrap_distributions(self, bootstrap_results: Dict[str, Any], figsize: Tuple[int, int] = (12, 8)):
        """Plot bootstrap distributions for the four headline metrics.

        Args:
            bootstrap_results: Results dict from :meth:`bootstrap_evaluate`.
            figsize: Figure size as (width, height).
        """
        metrics = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]
        plt.figure(figsize=figsize)
        for i, metric in enumerate(metrics):
            plt.subplot(2, 2, i + 1)
            values = bootstrap_results["bootstrap_distribution"][metric]
            sns.histplot(values, kde=True)
            # Mark the full-data point estimate.
            point_est = bootstrap_results["point_estimates"][metric]
            plt.axvline(point_est, color='red', linestyle='--', label=f'Point est: {point_est:.4f}')
            # Mark each confidence interval's bounds.
            for level, (lower, upper) in bootstrap_results["confidence_intervals"][metric].items():
                plt.axvline(lower, color='green', linestyle=':',
                            label=f'{level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})')
                plt.axvline(upper, color='green', linestyle=':')
            plt.title(f"{metric.replace('_', ' ').title()}")
            if i == 0:  # Only add legend to first subplot to avoid clutter
                plt.legend(loc='best')
        plt.tight_layout()
        plt.show()