# zenith-backend/core/diagnostics/ml_performance_analyzer.py
# (Hugging Face upload-page residue converted to comments: uploaded by
#  "teoat" via huggingface_hub, commit 4ae946d verified.)
#!/usr/bin/env python3
"""
ML Model Performance Analyzer
Comprehensive machine learning model performance monitoring and analysis
"""
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional
import numpy as np
@dataclass
class ModelPerformanceMetric:
    """ML model performance metric data structure.

    One metric observation for one model on one dataset split, plus a
    qualitative status derived from configured thresholds.
    """

    model_name: str
    metric_type: str  # accuracy, precision, recall, f1, auc, rmse, mae, r2
    value: float
    threshold: Optional[float]  # reference cutoff used for grading; None if unknown
    status: str  # excellent, good, acceptable, poor, critical
    timestamp: datetime
    dataset_type: str  # training, validation, test, production
    details: Dict[str, Any]  # metric-specific extras (e.g. confusion matrix)

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization"""
        # datetime is not JSON-serializable, so replace it with its
        # ISO-8601 string form after asdict() flattens the fields.
        result = asdict(self)
        result["timestamp"] = self.timestamp.isoformat()
        return result
@dataclass
class ModelDriftAnalysis:
    """Model drift analysis results"""

    model_name: str
    drift_detected: bool
    drift_type: str  # concept_drift, data_drift, covariate_shift
    drift_score: float  # max of the individual drift scores, in [0, 1]
    baseline_performance: float
    current_performance: float
    performance_drop: float  # relative drop vs. baseline (fraction)
    timestamp: datetime
    recommendations: List[str]  # suggested remediation steps

    def to_dict(self) -> Dict[str, Any]:
        # Same serialization convention as ModelPerformanceMetric:
        # timestamps become ISO-8601 strings.
        result = asdict(self)
        result["timestamp"] = self.timestamp.isoformat()
        return result
class MLModelPerformanceAnalyzer:
    """Comprehensive ML model performance monitoring and analysis.

    Collects per-model performance metrics, detects drift against
    baselines, and produces aggregate reports. All state is held in
    memory on the instance.
    """

    def __init__(self):
        # When this analyzer instance was created.
        self.start_time = datetime.now()
        # Accumulated ModelPerformanceMetric records across all analyses.
        self.performance_metrics = []
        # Accumulated ModelDriftAnalysis results.
        self.drift_analyses = []
        # Reserved for per-model baseline snapshots — not populated in this file.
        self.model_baselines = {}
        # Performance thresholds for different model types.
        # NOTE: rmse/mae are error metrics — LOWER values are better there.
        self.thresholds = {
            "classification": {
                "accuracy": {
                    "excellent": 0.95,
                    "good": 0.90,
                    "acceptable": 0.80,
                    "poor": 0.70,
                },
                "precision": {
                    "excellent": 0.95,
                    "good": 0.90,
                    "acceptable": 0.80,
                    "poor": 0.70,
                },
                "recall": {
                    "excellent": 0.95,
                    "good": 0.90,
                    "acceptable": 0.80,
                    "poor": 0.70,
                },
                "f1": {
                    "excellent": 0.95,
                    "good": 0.90,
                    "acceptable": 0.80,
                    "poor": 0.70,
                },
                "auc": {
                    "excellent": 0.95,
                    "good": 0.90,
                    "acceptable": 0.80,
                    "poor": 0.70,
                },
            },
            "regression": {
                "rmse": {
                    "excellent": 0.05,
                    "good": 0.10,
                    "acceptable": 0.20,
                    "poor": 0.30,
                },
                "mae": {
                    "excellent": 0.05,
                    "good": 0.10,
                    "acceptable": 0.20,
                    "poor": 0.30,
                },
                "r2": {
                    "excellent": 0.95,
                    "good": 0.90,
                    "acceptable": 0.80,
                    "poor": 0.70,
                },
            },
            "anomaly_detection": {
                "precision": {
                    "excellent": 0.90,
                    "good": 0.80,
                    "acceptable": 0.70,
                    "poor": 0.60,
                },
                "recall": {
                    "excellent": 0.90,
                    "good": 0.80,
                    "acceptable": 0.70,
                    "poor": 0.60,
                },
                "f1": {
                    "excellent": 0.90,
                    "good": 0.80,
                    "acceptable": 0.70,
                    "poor": 0.60,
                },
            },
        }
        # Drift detection thresholds (fractions, compared against drift scores).
        self.drift_thresholds = {
            "performance_drop": 0.10,  # 10% drop in performance
            "data_drift": 0.15,  # 15% data distribution change
            "concept_drift": 0.20,  # 20% concept change
        }
async def analyze_model_performance(
self,
model_name: str,
model_type: str,
predictions: List[Any],
actuals: List[Any],
dataset_type: str = "production",
) -> List[ModelPerformanceMetric]:
"""Analyze model performance metrics"""
metrics = []
current_time = datetime.now()
try:
# Convert to numpy arrays for calculations
y_pred = np.array(predictions)
y_true = np.array(actuals)
if model_type == "classification":
# Classification metrics
from sklearn.metrics import (
accuracy_score,
confusion_matrix,
f1_score,
precision_score,
recall_score,
roc_auc_score,
)
# Basic metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(
y_true, y_pred, average="weighted", zero_division=0
)
recall = recall_score(
y_true, y_pred, average="weighted", zero_division=0
)
f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)
# Add metrics
metrics.extend(
[
self._create_metric(
model_name,
"accuracy",
accuracy,
model_type,
dataset_type,
current_time,
),
self._create_metric(
model_name,
"precision",
precision,
model_type,
dataset_type,
current_time,
),
self._create_metric(
model_name,
"recall",
recall,
model_type,
dataset_type,
current_time,
),
self._create_metric(
model_name, "f1", f1, model_type, dataset_type, current_time
),
]
)
# AUC if binary classification
if len(np.unique(y_true)) == 2:
try:
auc = roc_auc_score(y_true, y_pred)
metrics.append(
self._create_metric(
model_name,
"auc",
auc,
model_type,
dataset_type,
current_time,
)
)
except Exception:
pass
# Confusion matrix details
cm = confusion_matrix(y_true, y_pred)
details = {
"confusion_matrix": cm.tolist(),
"true_positives": int(cm[1, 1]) if cm.shape[0] > 1 else 0,
"false_positives": int(cm[0, 1]) if cm.shape[0] > 1 else 0,
"true_negatives": int(cm[0, 0]) if cm.shape[0] > 1 else 0,
"false_negatives": int(cm[1, 0]) if cm.shape[0] > 1 else 0,
"total_predictions": len(y_pred),
"unique_classes": len(np.unique(y_true)),
}
elif model_type == "regression":
# Regression metrics
from sklearn.metrics import (
mean_absolute_error,
mean_squared_error,
r2_score,
)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
metrics.extend(
[
self._create_metric(
model_name,
"rmse",
rmse,
model_type,
dataset_type,
current_time,
),
self._create_metric(
model_name,
"mae",
mae,
model_type,
dataset_type,
current_time,
),
self._create_metric(
model_name, "r2", r2, model_type, dataset_type, current_time
),
]
)
details = {
"mse": mse,
"total_predictions": len(y_pred),
"target_range": [float(np.min(y_true)), float(np.max(y_true))],
"prediction_range": [float(np.min(y_pred)), float(np.max(y_pred))],
}
elif model_type == "anomaly_detection":
# Anomaly detection metrics
from sklearn.metrics import f1_score, precision_score, recall_score
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)
metrics.extend(
[
self._create_metric(
model_name,
"precision",
precision,
model_type,
dataset_type,
current_time,
),
self._create_metric(
model_name,
"recall",
recall,
model_type,
dataset_type,
current_time,
),
self._create_metric(
model_name, "f1", f1, model_type, dataset_type, current_time
),
]
)
details = {
"total_predictions": len(y_pred),
"anomalies_detected": int(np.sum(y_pred)),
"actual_anomalies": int(np.sum(y_true)),
"true_positives": int(np.sum((y_pred == 1) & (y_true == 1))),
"false_positives": int(np.sum((y_pred == 1) & (y_true == 0))),
}
# Add details to all metrics
for metric in metrics:
metric.details.update(details)
# Store metrics
self.performance_metrics.extend(metrics)
except Exception as e:
# Add error metric
metrics.append(
ModelPerformanceMetric(
model_name=model_name,
metric_type="error",
value=0.0,
threshold=None,
status="critical",
timestamp=current_time,
dataset_type=dataset_type,
details={"error": str(e)},
)
)
return metrics
def _create_metric(
self,
model_name: str,
metric_type: str,
value: float,
model_type: str,
dataset_type: str,
timestamp: datetime,
) -> ModelPerformanceMetric:
"""Create a performance metric with status determination"""
# Get threshold for this metric type
thresholds = self.thresholds.get(model_type, {}).get(metric_type, {})
if thresholds:
if value >= thresholds.get("excellent", 1.0):
status = "excellent"
elif value >= thresholds.get("good", 0.8):
status = "good"
elif value >= thresholds.get("acceptable", 0.6):
status = "acceptable"
elif value >= thresholds.get("poor", 0.4):
status = "poor"
else:
status = "critical"
threshold = thresholds.get("acceptable", 0.5)
else:
status = "unknown"
threshold = None
return ModelPerformanceMetric(
model_name=model_name,
metric_type=metric_type,
value=value,
threshold=threshold,
status=status,
timestamp=timestamp,
dataset_type=dataset_type,
details={},
)
async def detect_model_drift(
self,
model_name: str,
current_predictions: List[Any],
current_actuals: List[Any],
baseline_predictions: List[Any] = None,
baseline_actuals: List[Any] = None,
) -> ModelDriftAnalysis:
"""Detect model drift using various techniques"""
current_time = datetime.now()
try:
# Get current performance
current_metrics = await self.analyze_model_performance(
model_name,
"classification",
current_predictions,
current_actuals,
"production",
)
# Get baseline performance
if baseline_predictions and baseline_actuals:
baseline_metrics = await self.analyze_model_performance(
model_name,
"classification",
baseline_predictions,
baseline_actuals,
"training",
)
else:
# Use stored baseline if available
baseline_metrics = [
m
for m in self.performance_metrics
if m.model_name == model_name and m.dataset_type == "training"
]
if not baseline_metrics:
return ModelDriftAnalysis(
model_name=model_name,
drift_detected=False,
drift_type="no_baseline",
drift_score=0.0,
baseline_performance=0.0,
current_performance=0.0,
performance_drop=0.0,
timestamp=current_time,
recommendations=["Establish baseline performance metrics"],
)
# Calculate performance drop
baseline_perf = np.mean(
[
m.value
for m in baseline_metrics
if m.metric_type in ["accuracy", "f1", "r2"]
]
)
current_perf = np.mean(
[
m.value
for m in current_metrics
if m.metric_type in ["accuracy", "f1", "r2"]
]
)
performance_drop = (
(baseline_perf - current_perf) / baseline_perf
if baseline_perf > 0
else 0
)
# Detect performance drift
performance_drift = (
performance_drop > self.drift_thresholds["performance_drop"]
)
# Detect data drift (simplified - in real implementation would use statistical tests)
data_drift_score = self._calculate_data_drift(
current_predictions, baseline_predictions
)
data_drift = data_drift_score > self.drift_thresholds["data_drift"]
# Detect concept drift
concept_drift_score = self._calculate_concept_drift(
current_predictions,
current_actuals,
baseline_predictions,
baseline_actuals,
)
concept_drift = concept_drift_score > self.drift_thresholds["concept_drift"]
# Determine overall drift
drift_detected = performance_drift or data_drift or concept_drift
# Determine drift type
if concept_drift:
drift_type = "concept_drift"
elif data_drift:
drift_type = "data_drift"
elif performance_drift:
drift_type = "performance_drift"
else:
drift_type = "no_drift"
# Calculate overall drift score
drift_score = max(performance_drop, data_drift_score, concept_drift_score)
# Generate recommendations
recommendations = []
if drift_detected:
if performance_drift:
recommendations.append("Retrain model with recent data")
if data_drift:
recommendations.append("Update data preprocessing pipeline")
if concept_drift:
recommendations.append(
"Review model features and target definition"
)
recommendations.append("Monitor model performance closely")
recommendations.append("Consider ensemble methods for robustness")
return ModelDriftAnalysis(
model_name=model_name,
drift_detected=drift_detected,
drift_type=drift_type,
drift_score=drift_score,
baseline_performance=baseline_perf,
current_performance=current_perf,
performance_drop=performance_drop,
timestamp=current_time,
recommendations=recommendations,
)
except Exception as e:
return ModelDriftAnalysis(
model_name=model_name,
drift_detected=False,
drift_type="error",
drift_score=0.0,
baseline_performance=0.0,
current_performance=0.0,
performance_drop=0.0,
timestamp=current_time,
recommendations=[f"Error in drift detection: {str(e)}"],
)
def _calculate_data_drift(
self, current_data: List[Any], baseline_data: List[Any]
) -> float:
"""Calculate data drift score (simplified)"""
try:
if not baseline_data:
return 0.0
# Convert to numpy arrays
current_arr = np.array(current_data)
baseline_arr = np.array(baseline_data)
# Simple statistical comparison
current_mean = np.mean(current_arr)
baseline_mean = np.mean(baseline_arr)
current_std = np.std(current_arr)
baseline_std = np.std(baseline_arr)
# Calculate drift score based on mean and std differences
mean_diff = abs(current_mean - baseline_mean) / (abs(baseline_mean) + 1e-8)
std_diff = abs(current_std - baseline_std) / (abs(baseline_std) + 1e-8)
drift_score = (mean_diff + std_diff) / 2
return min(drift_score, 1.0)
except Exception:
return 0.0
def _calculate_concept_drift(
self,
current_pred: List[Any],
current_true: List[Any],
baseline_pred: List[Any],
baseline_true: List[Any],
) -> float:
"""Calculate concept drift score (simplified)"""
try:
if not baseline_pred or not baseline_true:
return 0.0
# Calculate error rates
current_error = np.mean(np.array(current_pred) != np.array(current_true))
baseline_error = np.mean(np.array(baseline_pred) != np.array(baseline_true))
# Concept drift is change in error rate
concept_drift = abs(current_error - baseline_error)
return min(concept_drift, 1.0)
except Exception:
return 0.0
async def analyze_feature_importance(
self, model_name: str, feature_names: List[str], feature_importance: List[float]
) -> Dict[str, Any]:
"""Analyze feature importance and stability"""
try:
# Sort features by importance
feature_data = list(zip(feature_names, feature_importance))
feature_data.sort(key=lambda x: x[1], reverse=True)
# Calculate statistics
importance_values = [imp for _, imp in feature_data]
analysis = {
"model_name": model_name,
"total_features": len(feature_names),
"top_features": feature_data[:10],
"feature_importance_stats": {
"mean": np.mean(importance_values),
"std": np.std(importance_values),
"min": np.min(importance_values),
"max": np.max(importance_values),
"median": np.median(importance_values),
},
"importance_distribution": {
"high_importance": len(
[imp for imp in importance_values if imp > 0.1]
),
"medium_importance": len(
[imp for imp in importance_values if 0.05 <= imp <= 0.1]
),
"low_importance": len(
[imp for imp in importance_values if imp < 0.05]
),
},
"recommendations": [],
}
# Generate recommendations
if (
analysis["importance_distribution"]["low_importance"]
> len(feature_names) * 0.5
):
analysis["recommendations"].append(
"Consider feature selection to reduce dimensionality"
)
if analysis["feature_importance_stats"]["std"] > 0.3:
analysis["recommendations"].append(
"Feature importance is highly skewed - consider regularization"
)
if len(feature_names) > 100:
analysis["recommendations"].append(
"High-dimensional data - consider dimensionality reduction"
)
return analysis
except Exception as e:
return {
"model_name": model_name,
"error": str(e),
"recommendations": ["Fix feature importance analysis"],
}
async def generate_model_performance_report(self) -> Dict[str, Any]:
"""Generate comprehensive ML model performance report"""
current_time = datetime.now()
# Group metrics by model
models = {}
for metric in self.performance_metrics:
if metric.model_name not in models:
models[metric.model_name] = []
models[metric.model_name].append(metric)
# Analyze each model
model_analyses = {}
for model_name, model_metrics in models.items():
# Group by dataset type
training_metrics = [
m for m in model_metrics if m.dataset_type == "training"
]
validation_metrics = [
m for m in model_metrics if m.dataset_type == "validation"
]
test_metrics = [m for m in model_metrics if m.dataset_type == "test"]
production_metrics = [
m for m in model_metrics if m.dataset_type == "production"
]
# Calculate overall scores
def calculate_score(metrics_list):
if not metrics_list:
return 0.0
score_map = {
"excellent": 100,
"good": 80,
"acceptable": 60,
"poor": 40,
"critical": 20,
"unknown": 50,
}
scores = [score_map.get(m.status, 50) for m in metrics_list]
return np.mean(scores)
model_analyses[model_name] = {
"overall_score": calculate_score(model_metrics),
"training_score": calculate_score(training_metrics),
"validation_score": calculate_score(validation_metrics),
"test_score": calculate_score(test_metrics),
"production_score": calculate_score(production_metrics),
"metrics_count": {
"total": len(model_metrics),
"training": len(training_metrics),
"validation": len(validation_metrics),
"test": len(test_metrics),
"production": len(production_metrics),
},
"latest_metrics": {
metric.metric_type: metric.to_dict()
for metric in sorted(
model_metrics, key=lambda x: x.timestamp, reverse=True
)[:10]
},
"performance_trend": self._analyze_performance_trend(model_metrics),
"critical_issues": [
m.to_dict() for m in model_metrics if m.status == "critical"
],
}
# Analyze drift across all models
drift_summary = {
"total_models_analyzed": len(self.drift_analyses),
"models_with_drift": len(
[d for d in self.drift_analyses if d.drift_detected]
),
"drift_types": {
drift_type: len(
[d for d in self.drift_analyses if d.drift_type == drift_type]
)
for drift_type in set(d.drift_type for d in self.drift_analyses)
},
"recent_drift": [
d.to_dict()
for d in sorted(
self.drift_analyses, key=lambda x: x.timestamp, reverse=True
)[:5]
],
}
# Generate recommendations
recommendations = []
# Model-specific recommendations
for model_name, analysis in model_analyses.items():
if analysis["production_score"] < 60:
recommendations.append(
f"CRITICAL: {model_name} performance is poor in production - immediate action required"
)
elif analysis["production_score"] < 80:
recommendations.append(
f"WARNING: {model_name} performance degradation detected - consider retraining"
)
if analysis["critical_issues"]:
recommendations.append(
f"Address {len(analysis['critical_issues'])} critical issues in {model_name}"
)
# General recommendations
if drift_summary["models_with_drift"] > 0:
recommendations.append("Implement automated model retraining pipeline")
if len(model_analyses) > 5:
recommendations.append("Consider model ensemble for improved robustness")
# Calculate overall ML system health
overall_scores = [
analysis["overall_score"] for analysis in model_analyses.values()
]
ml_system_health = np.mean(overall_scores) if overall_scores else 0.0
report = {
"ml_system_health": ml_system_health,
"analysis_timestamp": current_time.isoformat(),
"summary": {
"total_models": len(model_analyses),
"models_with_critical_issues": len(
[m for m in model_analyses.values() if m["critical_issues"]]
),
"average_production_score": (
np.mean([m["production_score"] for m in model_analyses.values()])
if model_analyses
else 0.0
),
"drift_detection_active": len(self.drift_analyses) > 0,
},
"model_analyses": model_analyses,
"drift_analysis": drift_summary,
"recommendations": recommendations,
"next_steps": [
"1. Address critical model performance issues immediately",
"2. Implement automated model monitoring and drift detection",
"3. Schedule regular model retraining based on drift detection",
"4. Consider A/B testing for model improvements",
"5. Establish model governance and versioning procedures",
],
}
return report
def _analyze_performance_trend(
self, metrics: List[ModelPerformanceMetric]
) -> Dict[str, Any]:
"""Analyze performance trend over time"""
if len(metrics) < 2:
return {"trend": "insufficient_data", "direction": "unknown"}
# Sort by timestamp
sorted_metrics = sorted(metrics, key=lambda x: x.timestamp)
# Calculate trend for each metric type
trends = {}
for metric_type in set(m.metric_type for m in metrics):
type_metrics = [m for m in sorted_metrics if m.metric_type == metric_type]
if len(type_metrics) >= 2:
# Simple linear regression
values = [m.value for m in type_metrics]
n = len(values)
x = list(range(n))
# Calculate slope
sum_x = sum(x)
sum_y = sum(values)
sum_xy = sum(x[i] * values[i] for i in range(n))
sum_x2 = sum(x[i] * x[i] for i in range(n))
if n * sum_x2 - sum_x * sum_x != 0:
slope = (n * sum_xy - sum_x * sum_y) / (n * sum_x2 - sum_x * sum_x)
if abs(slope) < 0.01:
trend = "stable"
elif slope > 0:
trend = "improving"
else:
trend = "degrading"
trends[metric_type] = {
"trend": trend,
"slope": slope,
"recent_value": values[-1],
"change_percent": (
((values[-1] - values[0]) / values[0] * 100)
if values[0] != 0
else 0
),
}
return trends
# Global ML performance analyzer instance
# Module-level singleton created at import time; importers share one
# in-memory metric/drift history through this object.
ml_performance_analyzer = MLModelPerformanceAnalyzer()