"""Evaluation utilities for QualiVec."""

import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, Union, Any
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm


class Evaluator:
    """Handles evaluation for QualiVec."""
    
    def __init__(self, verbose: bool = True):
        """Initialize the evaluator.

        

        Args:

            verbose: Whether to print status messages.

        """
        self.verbose = verbose
    
    def evaluate(self,
                 true_labels: List[str],
                 predicted_labels: List[str],
                 class_names: Optional[List[str]] = None) -> Dict[str, Any]:
        """Evaluate predictions against true labels.

        Args:
            true_labels: List of true class labels.
            predicted_labels: List of predicted class labels.
            class_names: List of class names for detailed metrics.

        Returns:
            Dictionary with evaluation metrics.
        """
        if len(true_labels) != len(predicted_labels):
            raise ValueError(f"Length mismatch: {len(true_labels)} true labels vs {len(predicted_labels)} predictions")
        
        if self.verbose:
            print(f"Evaluating {len(true_labels)} predictions")
        
        # Calculate metrics
        accuracy = accuracy_score(true_labels, predicted_labels)
        
        # If class_names not provided, use unique values from true and predicted
        if class_names is None:
            class_names = sorted(set(true_labels) | set(predicted_labels))
        
        # Calculate precision, recall, F1 (macro average)
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
            true_labels, predicted_labels, average='macro'
        )
        
        # Calculate per-class metrics
        precision, recall, f1, support = precision_recall_fscore_support(
            true_labels, predicted_labels, labels=class_names, average=None
        )
        
        # Create class-wise metrics
        class_metrics = {
            "precision": {cls: p for cls, p in zip(class_names, precision)},
            "recall": {cls: r for cls, r in zip(class_names, recall)},
            "f1": {cls: f for cls, f in zip(class_names, f1)},
            "support": {cls: s for cls, s in zip(class_names, support)}
        }
        
        # Create confusion matrix
        cm = confusion_matrix(true_labels, predicted_labels, labels=class_names)
        
        # Compile results
        results = {
            "accuracy": accuracy,
            "precision_macro": precision_macro,
            "recall_macro": recall_macro,
            "f1_macro": f1_macro,
            "class_metrics": class_metrics,
            "confusion_matrix": cm,
            "confusion_matrix_labels": class_names,
            "n_samples": len(true_labels)
        }
        
        if self.verbose:
            print(f"Accuracy: {accuracy:.4f}")
            print(f"Precision (macro): {precision_macro:.4f}")
            print(f"Recall (macro): {recall_macro:.4f}")
            print(f"F1 (macro): {f1_macro:.4f}")
        
        return results
    
    def bootstrap_evaluate(self,
                           true_labels: List[str],
                           predicted_labels: List[str],
                           n_iterations: int = 1000,
                           confidence_levels: List[float] = [0.9, 0.95, 0.99],
                           random_seed: Optional[int] = None) -> Dict[str, Any]:
        """Evaluate with bootstrap confidence intervals.

        Args:
            true_labels: List of true class labels.
            predicted_labels: List of predicted class labels.
            n_iterations: Number of bootstrap iterations.
            confidence_levels: Confidence levels to compute.
            random_seed: Random seed for reproducibility.

        Returns:
            Dictionary with evaluation metrics and confidence intervals.
        """
        if len(true_labels) != len(predicted_labels):
            raise ValueError(f"Length mismatch: {len(true_labels)} true labels vs {len(predicted_labels)} predictions")
        
        if self.verbose:
            print(f"Running bootstrap evaluation with {n_iterations} iterations")
        
        # Set random seed
        if random_seed is not None:
            np.random.seed(random_seed)
        
        # Initialize storage for bootstrap results
        bootstrap_metrics = {
            "accuracy": [],
            "precision_macro": [],
            "recall_macro": [],
            "f1_macro": []
        }
        
        # Original evaluation (point estimates on the full sample)
        original_results = self.evaluate(true_labels, predicted_labels)
        
        # Run bootstrap iterations
        n_samples = len(true_labels)
        
        # Temporarily silence self.evaluate so per-iteration metrics do not
        # flood the console or break the tqdm progress bar
        saved_verbose = self.verbose
        self.verbose = False
        
        for _ in tqdm(range(n_iterations), disable=not saved_verbose):
            # Sample with replacement
            indices = np.random.choice(n_samples, size=n_samples, replace=True)
            
            # Get bootstrap sample
            bootstrap_true = [true_labels[i] for i in indices]
            bootstrap_pred = [predicted_labels[i] for i in indices]
            
            # Evaluate the resample
            results = self.evaluate(bootstrap_true, bootstrap_pred)
            
            # Store results
            bootstrap_metrics["accuracy"].append(results["accuracy"])
            bootstrap_metrics["precision_macro"].append(results["precision_macro"])
            bootstrap_metrics["recall_macro"].append(results["recall_macro"])
            bootstrap_metrics["f1_macro"].append(results["f1_macro"])
        
        # Restore verbosity after the resampling loop
        self.verbose = saved_verbose
        
        # Calculate confidence intervals
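        # Percentile bootstrap: for a confidence level of, e.g., 0.95 the
        # interval spans the 2.5th to 97.5th percentiles of the bootstrap
        # distribution of each metric.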
        confidence_intervals = {}
        
        for metric, values in bootstrap_metrics.items():
            confidence_intervals[metric] = {}
            for level in confidence_levels:
                lower_percentile = (1 - level) / 2 * 100
                upper_percentile = (1 + level) / 2 * 100
                
                lower = np.percentile(values, lower_percentile)
                upper = np.percentile(values, upper_percentile)
                
                confidence_intervals[metric][level] = (lower, upper)
        
        # Combine results
        results = {
            "point_estimates": {
                "accuracy": original_results["accuracy"],
                "precision_macro": original_results["precision_macro"],
                "recall_macro": original_results["recall_macro"],
                "f1_macro": original_results["f1_macro"]
            },
            "confidence_intervals": confidence_intervals,
            "bootstrap_distribution": bootstrap_metrics,
            "n_iterations": n_iterations,
            "n_samples": n_samples
        }
        
        if self.verbose:
            print(f"Bootstrap evaluation complete")
            print(f"Accuracy: {results['point_estimates']['accuracy']:.4f}")
            for level in confidence_levels:
                lower, upper = results['confidence_intervals']['accuracy'][level]
                print(f"  {level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})")
        
        return results
    
    def plot_confusion_matrix(self,
                              confusion_matrix: np.ndarray,
                              class_names: List[str],
                              figsize: Tuple[int, int] = (10, 8),
                              title: str = "Confusion Matrix"):
        """Plot a confusion matrix.

        Args:
            confusion_matrix: Confusion matrix as numpy array.
            class_names: List of class names.
            figsize: Figure size as (width, height).
            title: Plot title.
        """
        plt.figure(figsize=figsize)
        
        # Create heatmap
        sns.heatmap(
            confusion_matrix, 
            annot=True, 
            fmt="d", 
            cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names
        )
        
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title(title)
        plt.tight_layout()
        plt.show()
    
    def plot_bootstrap_distributions(self, bootstrap_results: Dict[str, Any], figsize: Tuple[int, int] = (12, 8)):
        """Plot bootstrap distributions for key metrics.

        

        Args:

            bootstrap_results: Results from bootstrap_evaluate.

            figsize: Figure size as (width, height).

        """
        metrics = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]
        
        plt.figure(figsize=figsize)
        
        for i, metric in enumerate(metrics):
            plt.subplot(2, 2, i+1)
            
            # Get distribution data
            values = bootstrap_results["bootstrap_distribution"][metric]
            
            # Plot histogram
            sns.histplot(values, kde=True)
            
            # Add point estimate
            point_est = bootstrap_results["point_estimates"][metric]
            plt.axvline(point_est, color='red', linestyle='--', label=f'Point est: {point_est:.4f}')
            
            # Add confidence intervals
            for level, (lower, upper) in bootstrap_results["confidence_intervals"][metric].items():
                plt.axvline(lower, color='green', linestyle=':', 
                          label=f'{level*100:.0f}% CI: ({lower:.4f}, {upper:.4f})')
                plt.axvline(upper, color='green', linestyle=':')
            
            plt.title(f"{metric.replace('_', ' ').title()}")
            
            if i == 0:  # Only add legend to first plot
                plt.legend(loc='best')
        
        plt.tight_layout()
        plt.show()
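

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the QualiVec API). The labels below
# are hypothetical placeholders; a real workflow would pass in its own gold
# labels and model predictions. A minimal run, assuming an interactive
# matplotlib backend for the plots:
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    evaluator = Evaluator(verbose=True)

    # Hypothetical gold labels and predictions for a three-class task
    y_true = ["positive", "negative", "neutral", "positive", "negative", "neutral"]
    y_pred = ["positive", "negative", "positive", "positive", "neutral", "neutral"]

    # Point metrics and per-class breakdown
    metrics = evaluator.evaluate(y_true, y_pred)

    # Bootstrap confidence intervals (small iteration count to keep the demo fast)
    boot = evaluator.bootstrap_evaluate(y_true, y_pred, n_iterations=200, random_seed=42)

    # Visualize the confusion matrix and the bootstrap distributions
    evaluator.plot_confusion_matrix(
        metrics["confusion_matrix"], metrics["confusion_matrix_labels"]
    )
    evaluator.plot_bootstrap_distributions(boot)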