File size: 15,697 Bytes

8eab354

"""
Evaluation metrics for signature verification.
"""

import torch
import numpy as np
from typing import List, Tuple, Dict, Optional
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns


class SignatureVerificationMetrics:
    """
    Comprehensive metrics for signature verification evaluation.
    """
    
    def __init__(self, threshold: float = 0.5):
        """
        Initialize metrics calculator.
        
        Args:
            threshold: Similarity threshold for binary classification
        """
        self.threshold = threshold
        self.reset()
    
    def reset(self):
        """Reset all stored predictions and labels."""
        self.predictions = []
        self.labels = []
        self.similarities = []
    
    def update(self, 
               similarities: np.ndarray, 
               labels: np.ndarray):
        """
        Update metrics with new predictions.
        
        Args:
            similarities: Similarity scores
            labels: Ground truth labels (1 for genuine, 0 for forged)
        """
        self.similarities.extend(similarities)
        self.labels.extend(labels)
        
        # Convert similarities to binary predictions
        predictions = (similarities >= self.threshold).astype(int)
        self.predictions.extend(predictions)
    
    def compute_metrics(self) -> Dict[str, float]:
        """
        Compute all evaluation metrics.
        
        Returns:
            Dictionary of metrics
        """
        if not self.predictions or not self.labels:
            raise ValueError("No predictions or labels available. Call update() first.")
        
        similarities = np.array(self.similarities)
        labels = np.array(self.labels)
        predictions = np.array(self.predictions)
        
        # Basic classification metrics
        accuracy = accuracy_score(labels, predictions)
        precision = precision_score(labels, predictions, zero_division=0)
        recall = recall_score(labels, predictions, zero_division=0)
        f1 = f1_score(labels, predictions, zero_division=0)
        
        # ROC AUC
        try:
            roc_auc = roc_auc_score(labels, similarities)
        except ValueError:
            roc_auc = 0.0
        
        # Precision-Recall AUC
        try:
            precision_vals, recall_vals, _ = precision_recall_curve(labels, similarities)
            pr_auc = np.trapz(precision_vals, recall_vals)
        except ValueError:
            pr_auc = 0.0
        
        # Confusion matrix
        cm = confusion_matrix(labels, predictions)
        tn, fp, fn, tp = cm.ravel() if cm.size == 4 else (0, 0, 0, 0)
        
        # Additional metrics
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        
        # Equal Error Rate (EER)
        eer = self._compute_eer(labels, similarities)
        
        # False Acceptance Rate (FAR) and False Rejection Rate (FRR)
        far = fp / (fp + tn) if (fp + tn) > 0 else 0.0
        frr = fn / (fn + tp) if (fn + tp) > 0 else 0.0
        
        metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'roc_auc': roc_auc,
            'pr_auc': pr_auc,
            'specificity': specificity,
            'sensitivity': sensitivity,
            'eer': eer,
            'far': far,
            'frr': frr,
            'threshold': self.threshold
        }
        
        return metrics
    
    def _compute_eer(self, labels: np.ndarray, similarities: np.ndarray) -> float:
        """
        Compute Equal Error Rate (EER).
        
        Args:
            labels: Ground truth labels
            similarities: Similarity scores
            
        Returns:
            Equal Error Rate
        """
        try:
            fpr, tpr, thresholds = roc_curve(labels, similarities)
            fnr = 1 - tpr
            eer_threshold = thresholds[np.nanargmin(np.absolute((fnr - fpr)))]
            eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
            return float(eer)
        except (ValueError, IndexError):
            return 0.0
    
    def plot_roc_curve(self, save_path: Optional[str] = None):
        """
        Plot ROC curve.
        
        Args:
            save_path: Path to save the plot
        """
        if not self.similarities or not self.labels:
            raise ValueError("No data available for plotting.")
        
        similarities = np.array(self.similarities)
        labels = np.array(self.labels)
        
        fpr, tpr, _ = roc_curve(labels, similarities)
        roc_auc = roc_auc_score(labels, similarities)
        
        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")
        plt.grid(True)
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_precision_recall_curve(self, save_path: Optional[str] = None):
        """
        Plot Precision-Recall curve.
        
        Args:
            save_path: Path to save the plot
        """
        if not self.similarities or not self.labels:
            raise ValueError("No data available for plotting.")
        
        similarities = np.array(self.similarities)
        labels = np.array(self.labels)
        
        precision, recall, _ = precision_recall_curve(labels, similarities)
        pr_auc = np.trapz(precision, recall)
        
        plt.figure(figsize=(8, 6))
        plt.plot(recall, precision, color='darkorange', lw=2, label=f'PR curve (AUC = {pr_auc:.2f})')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.legend(loc="lower left")
        plt.grid(True)
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_confusion_matrix(self, save_path: Optional[str] = None):
        """
        Plot confusion matrix.
        
        Args:
            save_path: Path to save the plot
        """
        if not self.predictions or not self.labels:
            raise ValueError("No data available for plotting.")
        
        cm = confusion_matrix(self.labels, self.predictions)
        
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                   xticklabels=['Forged', 'Genuine'], 
                   yticklabels=['Forged', 'Genuine'])
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()
    
    def plot_similarity_distribution(self, save_path: Optional[str] = None):
        """
        Plot distribution of similarity scores for genuine and forged pairs.
        
        Args:
            save_path: Path to save the plot
        """
        if not self.similarities or not self.labels:
            raise ValueError("No data available for plotting.")
        
        similarities = np.array(self.similarities)
        labels = np.array(self.labels)
        
        genuine_similarities = similarities[labels == 1]
        forged_similarities = similarities[labels == 0]
        
        plt.figure(figsize=(10, 6))
        plt.hist(genuine_similarities, bins=50, alpha=0.7, label='Genuine', color='green')
        plt.hist(forged_similarities, bins=50, alpha=0.7, label='Forged', color='red')
        plt.axvline(self.threshold, color='black', linestyle='--', label=f'Threshold = {self.threshold}')
        plt.xlabel('Similarity Score')
        plt.ylabel('Frequency')
        plt.title('Distribution of Similarity Scores')
        plt.legend()
        plt.grid(True, alpha=0.3)
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()


class ThresholdOptimizer:
    """
    Optimize threshold for signature verification.
    """
    
    def __init__(self, metric: str = 'f1_score'):
        """
        Initialize threshold optimizer.
        
        Args:
            metric: Metric to optimize ('f1_score', 'accuracy', 'eer')
        """
        self.metric = metric
        self.best_threshold = 0.5
        self.best_score = 0.0
    
    def optimize(self, 
                 similarities: np.ndarray, 
                 labels: np.ndarray,
                 threshold_range: Tuple[float, float] = (0.0, 1.0),
                 num_thresholds: int = 100) -> Dict[str, float]:
        """
        Optimize threshold for given metric.
        
        Args:
            similarities: Similarity scores
            labels: Ground truth labels
            threshold_range: Range of thresholds to test
            num_thresholds: Number of thresholds to test
            
        Returns:
            Dictionary with best threshold and score
        """
        thresholds = np.linspace(threshold_range[0], threshold_range[1], num_thresholds)
        scores = []
        
        for threshold in thresholds:
            predictions = (similarities >= threshold).astype(int)
            
            if self.metric == 'f1_score':
                score = f1_score(labels, predictions, zero_division=0)
            elif self.metric == 'accuracy':
                score = accuracy_score(labels, predictions)
            elif self.metric == 'eer':
                # Compute EER for this threshold
                fpr, tpr, _ = roc_curve(labels, similarities)
                fnr = 1 - tpr
                try:
                    score = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
                except (ValueError, IndexError):
                    score = 1.0
            else:
                raise ValueError(f"Unsupported metric: {self.metric}")
            
            scores.append(score)
        
        # Find best threshold
        if self.metric == 'eer':
            best_idx = np.argmin(scores)
        else:
            best_idx = np.argmax(scores)
        
        self.best_threshold = thresholds[best_idx]
        self.best_score = scores[best_idx]
        
        return {
            'best_threshold': self.best_threshold,
            'best_score': self.best_score,
            'thresholds': thresholds,
            'scores': scores
        }
    
    def plot_threshold_analysis(self, 
                               similarities: np.ndarray, 
                               labels: np.ndarray,
                               save_path: Optional[str] = None):
        """
        Plot threshold analysis.
        
        Args:
            similarities: Similarity scores
            labels: Ground truth labels
            save_path: Path to save the plot
        """
        result = self.optimize(similarities, labels)
        
        plt.figure(figsize=(10, 6))
        plt.plot(result['thresholds'], result['scores'], 'b-', linewidth=2)
        plt.axvline(self.best_threshold, color='red', linestyle='--', 
                   label=f'Best threshold = {self.best_threshold:.3f}')
        plt.xlabel('Threshold')
        plt.ylabel(f'{self.metric.upper()}')
        plt.title(f'Threshold Optimization - {self.metric.upper()}')
        plt.legend()
        plt.grid(True, alpha=0.3)
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()


class CrossValidationEvaluator:
    """
    Cross-validation evaluator for signature verification.
    """
    
    def __init__(self, 
                 model: torch.nn.Module,
                 k_folds: int = 5,
                 threshold: float = 0.5):
        """
        Initialize cross-validation evaluator.
        
        Args:
            model: Model to evaluate
            k_folds: Number of folds for cross-validation
            threshold: Similarity threshold
        """
        self.model = model
        self.k_folds = k_folds
        self.threshold = threshold
        self.results = []
    
    def evaluate(self, 
                 data_pairs: List[Tuple[str, str, int]],
                 preprocessor,
                 batch_size: int = 32) -> Dict[str, float]:
        """
        Perform k-fold cross-validation.
        
        Args:
            data_pairs: List of (signature1_path, signature2_path, label) tuples
            preprocessor: Image preprocessor
            batch_size: Batch size for evaluation
            
        Returns:
            Average metrics across all folds
        """
        from sklearn.model_selection import KFold
        
        kf = KFold(n_splits=self.k_folds, shuffle=True, random_state=42)
        data_pairs = np.array(data_pairs)
        
        fold_metrics = []
        
        for fold, (train_idx, val_idx) in enumerate(kf.split(data_pairs)):
            print(f"Evaluating fold {fold + 1}/{self.k_folds}")
            
            val_pairs = data_pairs[val_idx]
            
            # Evaluate on validation set
            fold_metrics.append(self._evaluate_fold(val_pairs, preprocessor, batch_size))
        
        # Compute average metrics
        avg_metrics = {}
        for metric in fold_metrics[0].keys():
            avg_metrics[metric] = np.mean([fold[metric] for fold in fold_metrics])
            avg_metrics[f'{metric}_std'] = np.std([fold[metric] for fold in fold_metrics])
        
        self.results = fold_metrics
        return avg_metrics
    
    def _evaluate_fold(self, 
                      val_pairs: np.ndarray,
                      preprocessor,
                      batch_size: int) -> Dict[str, float]:
        """
        Evaluate a single fold.
        
        Args:
            val_pairs: Validation pairs
            preprocessor: Image preprocessor
            batch_size: Batch size
            
        Returns:
            Metrics for this fold
        """
        self.model.eval()
        similarities = []
        labels = []
        
        with torch.no_grad():
            for i in range(0, len(val_pairs), batch_size):
                batch_pairs = val_pairs[i:i+batch_size]
                
                for sig1_path, sig2_path, label in batch_pairs:
                    # Load and preprocess images
                    sig1 = preprocessor.preprocess_image(sig1_path)
                    sig2 = preprocessor.preprocess_image(sig2_path)
                    
                    # Add batch dimension
                    sig1 = sig1.unsqueeze(0)
                    sig2 = sig2.unsqueeze(0)
                    
                    # Compute similarity
                    similarity = self.model(sig1, sig2)
                    similarities.append(similarity.item())
                    labels.append(label)
        
        # Compute metrics
        similarities = np.array(similarities)
        labels = np.array(labels)
        
        metrics_calculator = SignatureVerificationMetrics(threshold=self.threshold)
        metrics_calculator.update(similarities, labels)
        
        return metrics_calculator.compute_metrics()