# Source: bootstrap/src/qualivec/evaluation.py
# Uploaded by akhil-vaidya ("Upload 26 files", commit f133a92, verified)
"""Evaluation utilities for QualiVec."""
import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, Union, Any
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
class Evaluator:
    """Handles evaluation for QualiVec.

    Provides point-estimate metrics (accuracy, macro-averaged
    precision/recall/F1, per-class breakdowns, confusion matrix),
    bootstrap percentile confidence intervals, and matplotlib/seaborn
    plotting helpers for the results.
    """

    def __init__(self, verbose: bool = True):
        """Initialize the evaluator.

        Args:
            verbose: Whether to print status messages and show progress bars.
        """
        self.verbose = verbose

    def evaluate(self,
                 true_labels: List[str],
                 predicted_labels: List[str],
                 class_names: Optional[List[str]] = None) -> Dict[str, Any]:
        """Evaluate predictions against true labels.

        Args:
            true_labels: List of true class labels.
            predicted_labels: List of predicted class labels.
            class_names: Class names used to order per-class metrics and the
                confusion matrix. Defaults to the sorted union of labels seen
                in ``true_labels`` and ``predicted_labels``.

        Returns:
            Dictionary with keys ``accuracy``, ``precision_macro``,
            ``recall_macro``, ``f1_macro``, ``class_metrics`` (per-class
            precision/recall/f1/support dicts), ``confusion_matrix``,
            ``confusion_matrix_labels``, and ``n_samples``.

        Raises:
            ValueError: If the two label lists differ in length.
        """
        if len(true_labels) != len(predicted_labels):
            raise ValueError(f"Length mismatch: {len(true_labels)} true labels vs {len(predicted_labels)} predictions")
        if self.verbose:
            print(f"Evaluating {len(true_labels)} predictions")

        accuracy = accuracy_score(true_labels, predicted_labels)

        # Sorted union keeps the ordering deterministic even when the two
        # lists cover different label sets.
        if class_names is None:
            class_names = sorted(set(true_labels) | set(predicted_labels))

        # Macro-averaged metrics. zero_division=0 yields 0.0 (and silences
        # sklearn's UndefinedMetricWarning) for classes with no samples on
        # one side — the numeric results are unchanged.
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            true_labels, predicted_labels, average='macro', zero_division=0
        )

        # Per-class metrics, aligned with class_names via labels=.
        precision, recall, f1, support = precision_recall_fscore_support(
            true_labels, predicted_labels, labels=class_names,
            average=None, zero_division=0
        )
        class_metrics = {
            "precision": dict(zip(class_names, precision)),
            "recall": dict(zip(class_names, recall)),
            "f1": dict(zip(class_names, f1)),
            "support": dict(zip(class_names, support)),
        }

        # Rows are true labels, columns are predictions (sklearn convention).
        cm = confusion_matrix(true_labels, predicted_labels, labels=class_names)

        results = {
            "accuracy": accuracy,
            "precision_macro": precision_macro,
            "recall_macro": recall_macro,
            "f1_macro": f1_macro,
            "class_metrics": class_metrics,
            "confusion_matrix": cm,
            "confusion_matrix_labels": class_names,
            "n_samples": len(true_labels)
        }
        if self.verbose:
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision (macro): {precision_macro:.4f}")
            print(f"Recall (macro): {recall_macro:.4f}")
            print(f"F1 (macro): {f1_macro:.4f}")
        return results

    def bootstrap_evaluate(self,
                           true_labels: List[str],
                           predicted_labels: List[str],
                           n_iterations: int = 1000,
                           confidence_levels: Tuple[float, ...] = (0.9, 0.95, 0.99),
                           random_seed: Optional[int] = None) -> Dict[str, Any]:
        """Evaluate with bootstrap percentile confidence intervals.

        Args:
            true_labels: List of true class labels.
            predicted_labels: List of predicted class labels.
            n_iterations: Number of bootstrap resamples.
            confidence_levels: Confidence levels to compute intervals for.
                (Immutable tuple default — avoids the mutable-default pitfall.)
            random_seed: Seed for reproducibility of the resampling.

        Returns:
            Dictionary with ``point_estimates``, ``confidence_intervals``
            (metric -> level -> (lower, upper)), ``bootstrap_distribution``
            (metric -> list of per-iteration values), ``n_iterations``,
            and ``n_samples``.

        Raises:
            ValueError: If the two label lists differ in length.
        """
        if len(true_labels) != len(predicted_labels):
            raise ValueError(f"Length mismatch: {len(true_labels)} true labels vs {len(predicted_labels)} predictions")
        if self.verbose:
            print(f"Running bootstrap evaluation with {n_iterations} iterations")

        # Local Generator instead of np.random.seed(): seeding the global
        # NumPy RNG is a process-wide side effect that can perturb any other
        # code relying on np.random.
        rng = np.random.default_rng(random_seed)

        # Point estimates on the full (non-resampled) data. Prints the
        # summary once when verbose.
        original_results = self.evaluate(true_labels, predicted_labels)

        bootstrap_metrics: Dict[str, List[float]] = {
            "accuracy": [],
            "precision_macro": [],
            "recall_macro": [],
            "f1_macro": []
        }
        n_samples = len(true_labels)
        for _ in tqdm(range(n_iterations), disable=not self.verbose):
            # Sample indices with replacement.
            indices = rng.integers(0, n_samples, size=n_samples)
            bootstrap_true = [true_labels[i] for i in indices]
            bootstrap_pred = [predicted_labels[i] for i in indices]
            # Compute metrics directly instead of calling self.evaluate():
            # the original printed the full metric summary on EVERY iteration
            # when verbose, and computed an unused confusion matrix and
            # per-class breakdown per resample.
            bootstrap_metrics["accuracy"].append(
                accuracy_score(bootstrap_true, bootstrap_pred)
            )
            p_macro, r_macro, f_macro, _ = precision_recall_fscore_support(
                bootstrap_true, bootstrap_pred, average='macro', zero_division=0
            )
            bootstrap_metrics["precision_macro"].append(p_macro)
            bootstrap_metrics["recall_macro"].append(r_macro)
            bootstrap_metrics["f1_macro"].append(f_macro)

        # Empirical (percentile-method) confidence intervals.
        confidence_intervals: Dict[str, Dict[float, Tuple[float, float]]] = {}
        for metric, values in bootstrap_metrics.items():
            confidence_intervals[metric] = {}
            for level in confidence_levels:
                lower = np.percentile(values, (1 - level) / 2 * 100)
                upper = np.percentile(values, (1 + level) / 2 * 100)
                confidence_intervals[metric][level] = (lower, upper)

        results = {
            "point_estimates": {
                "accuracy": original_results["accuracy"],
                "precision_macro": original_results["precision_macro"],
                "recall_macro": original_results["recall_macro"],
                "f1_macro": original_results["f1_macro"]
            },
            "confidence_intervals": confidence_intervals,
            "bootstrap_distribution": bootstrap_metrics,
            "n_iterations": n_iterations,
            "n_samples": n_samples
        }
        if self.verbose:
            print("Bootstrap evaluation complete")
            print(f"Accuracy: {results['point_estimates']['accuracy']:.4f}")
            for level in confidence_levels:
                lower, upper = results['confidence_intervals']['accuracy'][level]
                print(f" {level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})")
        return results

    def plot_confusion_matrix(self,
                              confusion_matrix: np.ndarray,
                              class_names: List[str],
                              figsize: Tuple[int, int] = (10, 8),
                              title: str = "Confusion Matrix"):
        """Plot a confusion matrix as a seaborn heatmap.

        NOTE: the parameter name shadows the imported
        ``sklearn.metrics.confusion_matrix`` inside this method; kept for
        backward compatibility with keyword callers.

        Args:
            confusion_matrix: Confusion matrix as a 2-D numpy array
                (rows = true labels, columns = predictions).
            class_names: Axis tick labels, in the same order as the matrix.
            figsize: Figure size as (width, height).
            title: Plot title.
        """
        plt.figure(figsize=figsize)
        sns.heatmap(
            confusion_matrix,
            annot=True,
            fmt="d",  # integer counts in each cell
            cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names
        )
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title(title)
        plt.tight_layout()
        plt.show()

    def plot_bootstrap_distributions(self, bootstrap_results: Dict[str, Any], figsize: Tuple[int, int] = (12, 8)):
        """Plot bootstrap distributions for the four headline metrics.

        Args:
            bootstrap_results: Results dict from :meth:`bootstrap_evaluate`.
            figsize: Figure size as (width, height).
        """
        metrics = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]
        plt.figure(figsize=figsize)
        for i, metric in enumerate(metrics):
            plt.subplot(2, 2, i + 1)
            values = bootstrap_results["bootstrap_distribution"][metric]
            sns.histplot(values, kde=True)
            # Mark the full-data point estimate.
            point_est = bootstrap_results["point_estimates"][metric]
            plt.axvline(point_est, color='red', linestyle='--', label=f'Point est: {point_est:.4f}')
            # Mark each confidence interval's bounds.
            for level, (lower, upper) in bootstrap_results["confidence_intervals"][metric].items():
                plt.axvline(lower, color='green', linestyle=':',
                            label=f'{level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})')
                plt.axvline(upper, color='green', linestyle=':')
            plt.title(f"{metric.replace('_', ' ').title()}")
            if i == 0:  # Only add legend to first subplot to avoid clutter
                plt.legend(loc='best')
        plt.tight_layout()
        plt.show()