Spaces:

songhieng
/

MLOps-Platforms

Sleeping

File size: 16,347 Bytes

7e825f9

"""

Model Evaluator Module

======================



Provides comprehensive model evaluation with visualization

support for confusion matrices, learning curves, and metrics.

"""

import os
# Set environment variables before transformers import
os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL', '3')
os.environ.setdefault('TRANSFORMERS_NO_TF', '1')

import json
import logging
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
import numpy as np

import torch
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    roc_curve,
    auc,
    precision_recall_curve
)
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend for server use
import seaborn as sns

logger = logging.getLogger(__name__)


@dataclass
class EvaluationResults:
    """Container for evaluation results."""
    accuracy: float = 0.0
    precision: float = 0.0
    recall: float = 0.0
    f1: float = 0.0
    support: int = 0
    confusion_matrix: Optional[np.ndarray] = None
    classification_report: str = ""
    predictions: Optional[List[int]] = None
    probabilities: Optional[List[float]] = None
    true_labels: Optional[List[int]] = None
    
    def to_dict(self) -> dict:
        return {
            "accuracy": self.accuracy,
            "precision": self.precision,
            "recall": self.recall,
            "f1": self.f1,
            "support": self.support,
            "classification_report": self.classification_report
        }


class ModelEvaluator:
    """

    Comprehensive model evaluation with visualization support.

    """
    
    def __init__(self, model=None, tokenizer=None, label_names: List[str] = None):
        """

        Initialize evaluator.

        

        Args:

            model: Trained model (optional, can be loaded later)

            tokenizer: Tokenizer (optional, can be loaded later)

            label_names: List of label names for display

        """
        self.model = model
        self.tokenizer = tokenizer
        self.label_names = label_names or ["Class 0", "Class 1"]
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    def load_model(self, model_path: str) -> bool:
        """

        Load model and tokenizer from path.

        

        Args:

            model_path: Path to saved model directory

            

        Returns:

            True if successful, False otherwise

        """
        try:
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
            self.model.to(self.device)
            self.model.eval()
            
            logger.info(f"Model loaded from {model_path}")
            return True
            
        except Exception as e:
            logger.error(f"Failed to load model: {str(e)}")
            return False
    
    def predict(self, texts: List[str], batch_size: int = 16, 

                max_length: int = 256) -> Tuple[List[int], List[float]]:
        """

        Make predictions on a list of texts.

        

        Args:

            texts: List of texts to predict

            batch_size: Batch size for inference

            max_length: Maximum sequence length

            

        Returns:

            Tuple of (predictions, probabilities)

        """
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model and tokenizer must be loaded first")
        
        self.model.eval()
        all_predictions = []
        all_probabilities = []
        
        with torch.no_grad():
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i + batch_size]
                
                encodings = self.tokenizer(
                    batch_texts,
                    truncation=True,
                    padding=True,
                    max_length=max_length,
                    return_tensors="pt"
                )
                
                encodings = {k: v.to(self.device) for k, v in encodings.items()}
                
                outputs = self.model(**encodings)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
                preds = torch.argmax(probs, dim=-1)
                
                all_predictions.extend(preds.cpu().numpy().tolist())
                # Get probability of positive class (class 1)
                all_probabilities.extend(probs[:, 1].cpu().numpy().tolist())
        
        return all_predictions, all_probabilities
    
    def evaluate(self, texts: List[str], true_labels: List[int],

                 batch_size: int = 16) -> EvaluationResults:
        """

        Evaluate model on a dataset.

        

        Args:

            texts: List of texts

            true_labels: True labels

            batch_size: Batch size for inference

            

        Returns:

            EvaluationResults object

        """
        predictions, probabilities = self.predict(texts, batch_size)
        
        # Calculate metrics
        accuracy = accuracy_score(true_labels, predictions)
        precision, recall, f1, support = precision_recall_fscore_support(
            true_labels, predictions, average='weighted', zero_division=0
        )
        
        cm = confusion_matrix(true_labels, predictions)
        report = classification_report(
            true_labels, predictions,
            target_names=self.label_names,
            zero_division=0
        )
        
        results = EvaluationResults(
            accuracy=accuracy,
            precision=precision,
            recall=recall,
            f1=f1,
            support=len(true_labels),
            confusion_matrix=cm,
            classification_report=report,
            predictions=predictions,
            probabilities=probabilities,
            true_labels=true_labels
        )
        
        return results
    
    def plot_confusion_matrix(self, results: EvaluationResults,

                             figsize: Tuple[int, int] = (8, 6),

                             cmap: str = "Blues") -> plt.Figure:
        """

        Plot confusion matrix.

        

        Args:

            results: EvaluationResults object

            figsize: Figure size

            cmap: Color map

            

        Returns:

            Matplotlib figure

        """
        fig, ax = plt.subplots(figsize=figsize)
        
        sns.heatmap(
            results.confusion_matrix,
            annot=True,
            fmt='d',
            cmap=cmap,
            xticklabels=self.label_names,
            yticklabels=self.label_names,
            ax=ax
        )
        
        ax.set_xlabel('Predicted Label', fontsize=12)
        ax.set_ylabel('True Label', fontsize=12)
        ax.set_title('Confusion Matrix', fontsize=14)
        
        plt.tight_layout()
        return fig
    
    def plot_roc_curve(self, results: EvaluationResults,

                       figsize: Tuple[int, int] = (8, 6)) -> plt.Figure:
        """

        Plot ROC curve for binary classification.

        

        Args:

            results: EvaluationResults object

            figsize: Figure size

            

        Returns:

            Matplotlib figure

        """
        if results.probabilities is None or results.true_labels is None:
            raise ValueError("Probabilities and true labels required for ROC curve")
        
        fpr, tpr, thresholds = roc_curve(results.true_labels, results.probabilities)
        roc_auc = auc(fpr, tpr)
        
        fig, ax = plt.subplots(figsize=figsize)
        
        ax.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {roc_auc:.3f})')
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
                label='Random classifier')
        
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate', fontsize=12)
        ax.set_ylabel('True Positive Rate', fontsize=12)
        ax.set_title('Receiver Operating Characteristic (ROC) Curve', fontsize=14)
        ax.legend(loc='lower right')
        ax.grid(True, alpha=0.3)
        
        plt.tight_layout()
        return fig
    
    def plot_precision_recall_curve(self, results: EvaluationResults,

                                   figsize: Tuple[int, int] = (8, 6)) -> plt.Figure:
        """

        Plot precision-recall curve.

        

        Args:

            results: EvaluationResults object

            figsize: Figure size

            

        Returns:

            Matplotlib figure

        """
        if results.probabilities is None or results.true_labels is None:
            raise ValueError("Probabilities and true labels required")
        
        precision, recall, thresholds = precision_recall_curve(
            results.true_labels, results.probabilities
        )
        
        fig, ax = plt.subplots(figsize=figsize)
        
        ax.plot(recall, precision, color='blue', lw=2)
        ax.fill_between(recall, precision, alpha=0.2, color='blue')
        
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('Recall', fontsize=12)
        ax.set_ylabel('Precision', fontsize=12)
        ax.set_title('Precision-Recall Curve', fontsize=14)
        ax.grid(True, alpha=0.3)
        
        plt.tight_layout()
        return fig
    
    def plot_training_history(self, metrics_history: List[Dict],

                             figsize: Tuple[int, int] = (12, 4)) -> plt.Figure:
        """

        Plot training history (loss and metrics over epochs).

        

        Args:

            metrics_history: List of metric dictionaries

            figsize: Figure size

            

        Returns:

            Matplotlib figure

        """
        if not metrics_history:
            raise ValueError("No metrics history to plot")
        
        fig, axes = plt.subplots(1, 3, figsize=figsize)
        
        # Extract data
        epochs = [m.get('epoch', i) for i, m in enumerate(metrics_history)]
        train_loss = [m.get('train_loss', 0) for m in metrics_history]
        eval_loss = [m.get('eval_loss', 0) for m in metrics_history]
        accuracy = [m.get('accuracy', 0) for m in metrics_history]
        f1 = [m.get('f1', 0) for m in metrics_history]
        
        # Loss plot
        if any(train_loss):
            axes[0].plot(epochs, train_loss, 'b-', label='Train Loss', marker='o')
        if any(eval_loss):
            axes[0].plot(epochs, eval_loss, 'r-', label='Eval Loss', marker='s')
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Loss')
        axes[0].set_title('Training & Validation Loss')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)
        
        # Accuracy plot
        if any(accuracy):
            axes[1].plot(epochs, accuracy, 'g-', marker='o')
            axes[1].set_xlabel('Epoch')
            axes[1].set_ylabel('Accuracy')
            axes[1].set_title('Accuracy over Training')
            axes[1].grid(True, alpha=0.3)
        
        # F1 score plot
        if any(f1):
            axes[2].plot(epochs, f1, 'm-', marker='o')
            axes[2].set_xlabel('Epoch')
            axes[2].set_ylabel('F1 Score')
            axes[2].set_title('F1 Score over Training')
            axes[2].grid(True, alpha=0.3)
        
        plt.tight_layout()
        return fig
    
    def plot_class_distribution(self, labels: List[int],

                               figsize: Tuple[int, int] = (8, 5)) -> plt.Figure:
        """

        Plot class distribution in dataset.

        

        Args:

            labels: List of labels

            figsize: Figure size

            

        Returns:

            Matplotlib figure

        """
        unique, counts = np.unique(labels, return_counts=True)
        
        fig, ax = plt.subplots(figsize=figsize)
        
        colors = plt.cm.Set3(np.linspace(0, 1, len(unique)))
        bars = ax.bar(
            [self.label_names[i] if i < len(self.label_names) else f"Class {i}" 
             for i in unique],
            counts,
            color=colors
        )
        
        # Add value labels on bars
        for bar, count in zip(bars, counts):
            height = bar.get_height()
            ax.annotate(f'{count}',
                       xy=(bar.get_x() + bar.get_width() / 2, height),
                       xytext=(0, 3),
                       textcoords="offset points",
                       ha='center', va='bottom',
                       fontsize=12, fontweight='bold')
        
        ax.set_xlabel('Class', fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.set_title('Class Distribution', fontsize=14)
        ax.grid(True, alpha=0.3, axis='y')
        
        plt.tight_layout()
        return fig
    
    def generate_report(self, results: EvaluationResults,

                       output_path: Optional[str] = None) -> str:
        """

        Generate a text report of evaluation results.

        

        Args:

            results: EvaluationResults object

            output_path: Optional path to save the report

            

        Returns:

            Report string

        """
        report = []
        report.append("=" * 60)
        report.append("MODEL EVALUATION REPORT")
        report.append("=" * 60)
        report.append("")
        report.append("OVERALL METRICS:")
        report.append(f"  Accuracy:  {results.accuracy:.4f} ({results.accuracy*100:.2f}%)")
        report.append(f"  Precision: {results.precision:.4f}")
        report.append(f"  Recall:    {results.recall:.4f}")
        report.append(f"  F1 Score:  {results.f1:.4f}")
        report.append(f"  Samples:   {results.support}")
        report.append("")
        report.append("CLASSIFICATION REPORT:")
        report.append(results.classification_report)
        report.append("")
        report.append("CONFUSION MATRIX:")
        if results.confusion_matrix is not None:
            report.append(str(results.confusion_matrix))
        report.append("")
        report.append("=" * 60)
        
        report_str = "\n".join(report)
        
        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report_str)
            logger.info(f"Report saved to {output_path}")
        
        return report_str
    
    def save_results(self, results: EvaluationResults, output_dir: str):
        """

        Save evaluation results to files.

        

        Args:

            results: EvaluationResults object

            output_dir: Output directory

        """
        os.makedirs(output_dir, exist_ok=True)
        
        # Save metrics as JSON
        metrics_path = os.path.join(output_dir, "evaluation_metrics.json")
        with open(metrics_path, 'w', encoding='utf-8') as f:
            json.dump(results.to_dict(), f, indent=2, ensure_ascii=False)
        
        # Save confusion matrix as image
        try:
            fig = self.plot_confusion_matrix(results)
            fig.savefig(os.path.join(output_dir, "confusion_matrix.png"), dpi=150)
            plt.close(fig)
        except Exception as e:
            logger.warning(f"Could not save confusion matrix: {e}")
        
        # Save text report
        report_path = os.path.join(output_dir, "evaluation_report.txt")
        self.generate_report(results, report_path)
        
        logger.info(f"Results saved to {output_dir}")


def create_evaluator(label_names: List[str] = None) -> ModelEvaluator:
    """Factory function to create a ModelEvaluator instance."""
    return ModelEvaluator(label_names=label_names)