# Source: MLOps-Platforms / src/mlops/evaluator.py
# Upload metadata: songhieng — "Upload 72 files" — commit 7e825f9 (verified)
"""
Model Evaluator Module
======================
Provides comprehensive model evaluation with visualization
support for confusion matrices, learning curves, and metrics.
"""
import os
# Set environment variables before transformers import
os.environ.setdefault('TF_CPP_MIN_LOG_LEVEL', '3')
os.environ.setdefault('TRANSFORMERS_NO_TF', '1')
import json
import logging
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
import numpy as np
import torch
from sklearn.metrics import (
accuracy_score,
precision_recall_fscore_support,
confusion_matrix,
classification_report,
roc_curve,
auc,
precision_recall_curve
)
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg') # Non-interactive backend for server use
import seaborn as sns
logger = logging.getLogger(__name__)
@dataclass
class EvaluationResults:
    """Container for evaluation results.

    Scalar metrics are weighted averages over classes (see
    ModelEvaluator.evaluate); the raw per-sample outputs are kept so
    curves (ROC, PR) can be plotted later without re-running inference.
    """
    accuracy: float = 0.0
    precision: float = 0.0
    recall: float = 0.0
    f1: float = 0.0
    # Number of evaluated samples (not sklearn's per-class support).
    support: int = 0
    confusion_matrix: Optional[np.ndarray] = None
    classification_report: str = ""
    predictions: Optional[List[int]] = None
    probabilities: Optional[List[float]] = None
    true_labels: Optional[List[int]] = None

    def to_dict(self) -> dict:
        """Return a JSON-serializable summary of the results.

        The confusion matrix is converted to nested lists (or None) so the
        dict can be passed straight to json.dump.
        """
        return {
            "accuracy": self.accuracy,
            "precision": self.precision,
            "recall": self.recall,
            "f1": self.f1,
            "support": self.support,
            "classification_report": self.classification_report,
            # Previously omitted, which silently dropped the matrix from
            # the saved evaluation_metrics.json.
            "confusion_matrix": (
                self.confusion_matrix.tolist()
                if self.confusion_matrix is not None else None
            )
        }
class ModelEvaluator:
    """
    Comprehensive model evaluation with visualization support.

    Wraps a sequence-classification model/tokenizer pair and provides
    batched inference, sklearn metrics, matplotlib/seaborn figures, and
    text/JSON reporting.  The module selects the non-interactive Agg
    backend at import time, so plotting is safe on a headless server.
    """

    def __init__(self, model=None, tokenizer=None, label_names: List[str] = None):
        """
        Initialize evaluator.

        Args:
            model: Trained model (optional, can be loaded later via load_model)
            tokenizer: Matching tokenizer (optional, can be loaded later)
            label_names: Display names for the class labels; defaults to
                generic binary names when omitted.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.label_names = label_names or ["Class 0", "Class 1"]
        # Prefer GPU when available; predict() moves each batch here.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def load_model(self, model_path: str) -> bool:
        """
        Load model and tokenizer from path.

        Args:
            model_path: Path to a saved model directory (save_pretrained layout)

        Returns:
            True if successful, False otherwise (the error is logged, not raised).
        """
        try:
            # Imported lazily so the module is importable without transformers.
            from transformers import AutoTokenizer, AutoModelForSequenceClassification
            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
            self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
            self.model.to(self.device)
            self.model.eval()
            logger.info(f"Model loaded from {model_path}")
            return True
        except Exception as e:
            logger.error(f"Failed to load model: {str(e)}")
            return False

    def predict(self, texts: List[str], batch_size: int = 16,
                max_length: int = 256) -> Tuple[List[int], List[float]]:
        """
        Make predictions on a list of texts.

        Args:
            texts: List of texts to predict
            batch_size: Batch size for inference
            max_length: Maximum tokenized sequence length

        Returns:
            Tuple of (predicted class ids, positive-class probabilities)

        Raises:
            ValueError: If no model/tokenizer has been set or loaded.
        """
        if self.model is None or self.tokenizer is None:
            raise ValueError("Model and tokenizer must be loaded first")
        self.model.eval()
        all_predictions = []
        all_probabilities = []
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch_texts = texts[start:start + batch_size]
                encodings = self.tokenizer(
                    batch_texts,
                    truncation=True,
                    padding=True,
                    max_length=max_length,
                    return_tensors="pt"
                )
                encodings = {k: v.to(self.device) for k, v in encodings.items()}
                outputs = self.model(**encodings)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
                preds = torch.argmax(probs, dim=-1)
                all_predictions.extend(preds.cpu().numpy().tolist())
                # Probability of the positive class (index 1).  Guard against
                # single-logit heads, which would otherwise raise IndexError.
                pos_idx = 1 if probs.size(-1) > 1 else 0
                all_probabilities.extend(probs[:, pos_idx].cpu().numpy().tolist())
        return all_predictions, all_probabilities

    def evaluate(self, texts: List[str], true_labels: List[int],
                 batch_size: int = 16,
                 max_length: int = 256) -> "EvaluationResults":
        """
        Evaluate model on a labeled dataset.

        Args:
            texts: List of texts
            true_labels: True labels aligned with texts
            batch_size: Batch size for inference
            max_length: Maximum tokenized sequence length (forwarded to predict)

        Returns:
            EvaluationResults with metrics, confusion matrix, and raw outputs.
        """
        predictions, probabilities = self.predict(texts, batch_size, max_length)
        accuracy = accuracy_score(true_labels, predictions)
        # Weighted averaging so class imbalance is reflected in the summary.
        precision, recall, f1, _ = precision_recall_fscore_support(
            true_labels, predictions, average='weighted', zero_division=0
        )
        cm = confusion_matrix(true_labels, predictions)
        # classification_report raises ValueError when target_names does not
        # match the number of classes actually present; fall back to sklearn's
        # default numeric names in that case instead of crashing.
        n_classes = len(set(true_labels) | set(predictions))
        names = self.label_names if len(self.label_names) == n_classes else None
        report = classification_report(
            true_labels, predictions,
            target_names=names,
            zero_division=0
        )
        return EvaluationResults(
            accuracy=accuracy,
            precision=precision,
            recall=recall,
            f1=f1,
            support=len(true_labels),
            confusion_matrix=cm,
            classification_report=report,
            predictions=predictions,
            probabilities=probabilities,
            true_labels=true_labels
        )

    def plot_confusion_matrix(self, results: "EvaluationResults",
                              figsize: Tuple[int, int] = (8, 6),
                              cmap: str = "Blues") -> "plt.Figure":
        """
        Plot the confusion matrix as an annotated heatmap.

        Args:
            results: EvaluationResults object (confusion_matrix must be set)
            figsize: Figure size
            cmap: Color map name

        Returns:
            Matplotlib figure (caller is responsible for closing it)
        """
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(
            results.confusion_matrix,
            annot=True,
            fmt='d',  # integer cell counts
            cmap=cmap,
            xticklabels=self.label_names,
            yticklabels=self.label_names,
            ax=ax
        )
        ax.set_xlabel('Predicted Label', fontsize=12)
        ax.set_ylabel('True Label', fontsize=12)
        ax.set_title('Confusion Matrix', fontsize=14)
        plt.tight_layout()
        return fig

    def plot_roc_curve(self, results: "EvaluationResults",
                       figsize: Tuple[int, int] = (8, 6)) -> "plt.Figure":
        """
        Plot ROC curve for binary classification.

        Args:
            results: EvaluationResults object with probabilities/true_labels
            figsize: Figure size

        Returns:
            Matplotlib figure

        Raises:
            ValueError: If probabilities or true labels are missing.
        """
        if results.probabilities is None or results.true_labels is None:
            raise ValueError("Probabilities and true labels required for ROC curve")
        fpr, tpr, _ = roc_curve(results.true_labels, results.probabilities)
        roc_auc = auc(fpr, tpr)
        fig, ax = plt.subplots(figsize=figsize)
        ax.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {roc_auc:.3f})')
        # Diagonal reference line: performance of a random classifier.
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
                label='Random classifier')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate', fontsize=12)
        ax.set_ylabel('True Positive Rate', fontsize=12)
        ax.set_title('Receiver Operating Characteristic (ROC) Curve', fontsize=14)
        ax.legend(loc='lower right')
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        return fig

    def plot_precision_recall_curve(self, results: "EvaluationResults",
                                    figsize: Tuple[int, int] = (8, 6)) -> "plt.Figure":
        """
        Plot precision-recall curve.

        Args:
            results: EvaluationResults object with probabilities/true_labels
            figsize: Figure size

        Returns:
            Matplotlib figure

        Raises:
            ValueError: If probabilities or true labels are missing.
        """
        if results.probabilities is None or results.true_labels is None:
            raise ValueError("Probabilities and true labels required")
        precision, recall, _ = precision_recall_curve(
            results.true_labels, results.probabilities
        )
        fig, ax = plt.subplots(figsize=figsize)
        ax.plot(recall, precision, color='blue', lw=2)
        ax.fill_between(recall, precision, alpha=0.2, color='blue')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('Recall', fontsize=12)
        ax.set_ylabel('Precision', fontsize=12)
        ax.set_title('Precision-Recall Curve', fontsize=14)
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        return fig

    def plot_training_history(self, metrics_history: List[Dict],
                              figsize: Tuple[int, int] = (12, 4)) -> "plt.Figure":
        """
        Plot training history (loss, accuracy, F1 over epochs).

        Args:
            metrics_history: List of per-epoch metric dicts; recognized keys
                are 'epoch', 'train_loss', 'eval_loss', 'accuracy', 'f1'
                (missing keys default to the list index / 0).
            figsize: Figure size

        Returns:
            Matplotlib figure with three side-by-side panels

        Raises:
            ValueError: If metrics_history is empty.
        """
        if not metrics_history:
            raise ValueError("No metrics history to plot")
        fig, axes = plt.subplots(1, 3, figsize=figsize)
        # Extract series; fall back to the record index when 'epoch' is absent.
        epochs = [m.get('epoch', i) for i, m in enumerate(metrics_history)]
        train_loss = [m.get('train_loss', 0) for m in metrics_history]
        eval_loss = [m.get('eval_loss', 0) for m in metrics_history]
        accuracy = [m.get('accuracy', 0) for m in metrics_history]
        f1 = [m.get('f1', 0) for m in metrics_history]
        # Panel 1: losses (each series drawn only if it has a nonzero value).
        if any(train_loss):
            axes[0].plot(epochs, train_loss, 'b-', label='Train Loss', marker='o')
        if any(eval_loss):
            axes[0].plot(epochs, eval_loss, 'r-', label='Eval Loss', marker='s')
        axes[0].set_xlabel('Epoch')
        axes[0].set_ylabel('Loss')
        axes[0].set_title('Training & Validation Loss')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)
        # Panel 2: accuracy.
        if any(accuracy):
            axes[1].plot(epochs, accuracy, 'g-', marker='o')
        axes[1].set_xlabel('Epoch')
        axes[1].set_ylabel('Accuracy')
        axes[1].set_title('Accuracy over Training')
        axes[1].grid(True, alpha=0.3)
        # Panel 3: F1 score.
        if any(f1):
            axes[2].plot(epochs, f1, 'm-', marker='o')
        axes[2].set_xlabel('Epoch')
        axes[2].set_ylabel('F1 Score')
        axes[2].set_title('F1 Score over Training')
        axes[2].grid(True, alpha=0.3)
        plt.tight_layout()
        return fig

    def plot_class_distribution(self, labels: List[int],
                                figsize: Tuple[int, int] = (8, 5)) -> "plt.Figure":
        """
        Plot class distribution in a dataset as a bar chart.

        Args:
            labels: List of integer labels
            figsize: Figure size

        Returns:
            Matplotlib figure
        """
        unique, counts = np.unique(labels, return_counts=True)
        fig, ax = plt.subplots(figsize=figsize)
        colors = plt.cm.Set3(np.linspace(0, 1, len(unique)))
        bars = ax.bar(
            # Fall back to a generic name for labels beyond label_names.
            [self.label_names[i] if i < len(self.label_names) else f"Class {i}"
             for i in unique],
            counts,
            color=colors
        )
        # Annotate each bar with its count, just above the bar top.
        for bar, count in zip(bars, counts):
            height = bar.get_height()
            ax.annotate(f'{count}',
                        xy=(bar.get_x() + bar.get_width() / 2, height),
                        xytext=(0, 3),
                        textcoords="offset points",
                        ha='center', va='bottom',
                        fontsize=12, fontweight='bold')
        ax.set_xlabel('Class', fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.set_title('Class Distribution', fontsize=14)
        ax.grid(True, alpha=0.3, axis='y')
        plt.tight_layout()
        return fig

    def generate_report(self, results: "EvaluationResults",
                        output_path: Optional[str] = None) -> str:
        """
        Generate a plain-text report of evaluation results.

        Args:
            results: EvaluationResults object
            output_path: Optional path to also save the report to disk

        Returns:
            Report string
        """
        report = []
        report.append("=" * 60)
        report.append("MODEL EVALUATION REPORT")
        report.append("=" * 60)
        report.append("")
        report.append("OVERALL METRICS:")
        report.append(f"  Accuracy:  {results.accuracy:.4f} ({results.accuracy*100:.2f}%)")
        report.append(f"  Precision: {results.precision:.4f}")
        report.append(f"  Recall:    {results.recall:.4f}")
        report.append(f"  F1 Score:  {results.f1:.4f}")
        report.append(f"  Samples:   {results.support}")
        report.append("")
        report.append("CLASSIFICATION REPORT:")
        report.append(results.classification_report)
        report.append("")
        report.append("CONFUSION MATRIX:")
        if results.confusion_matrix is not None:
            report.append(str(results.confusion_matrix))
        report.append("")
        report.append("=" * 60)
        report_str = "\n".join(report)
        if output_path:
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(report_str)
            logger.info(f"Report saved to {output_path}")
        return report_str

    def save_results(self, results: "EvaluationResults", output_dir: str):
        """
        Save evaluation results (JSON metrics, confusion-matrix PNG, text
        report) into a directory, creating it if necessary.

        Args:
            results: EvaluationResults object
            output_dir: Output directory
        """
        os.makedirs(output_dir, exist_ok=True)
        # Metrics as JSON.
        metrics_path = os.path.join(output_dir, "evaluation_metrics.json")
        with open(metrics_path, 'w', encoding='utf-8') as f:
            json.dump(results.to_dict(), f, indent=2, ensure_ascii=False)
        # Confusion matrix as an image; plotting failures are non-fatal so
        # the JSON/text outputs are still produced.
        try:
            fig = self.plot_confusion_matrix(results)
            fig.savefig(os.path.join(output_dir, "confusion_matrix.png"), dpi=150)
            plt.close(fig)
        except Exception as e:
            logger.warning(f"Could not save confusion matrix: {e}")
        # Text report.
        report_path = os.path.join(output_dir, "evaluation_report.txt")
        self.generate_report(results, report_path)
        logger.info(f"Results saved to {output_dir}")
def create_evaluator(label_names: List[str] = None) -> ModelEvaluator:
    """Factory helper: build a fresh ModelEvaluator.

    Args:
        label_names: Optional display names for the class labels; passed
            through to the ModelEvaluator constructor.

    Returns:
        A newly constructed ModelEvaluator instance.
    """
    evaluator = ModelEvaluator(label_names=label_names)
    return evaluator