Spaces:
Running
Running
Add 05_model_validation.ipynb
Browse files
notebooks/05_model_validation.ipynb
ADDED
|
@@ -0,0 +1,574 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"id": "360397d9",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# 05 - Model Validation\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"## CyberForge AI - Production Validation & Safety\n",
|
| 11 |
+
"\n",
|
| 12 |
+
"This notebook validates trained models for production deployment:\n",
|
| 13 |
+
"- Performance metrics and benchmarks\n",
|
| 14 |
+
"- Edge case testing\n",
|
| 15 |
+
"- Failure analysis and recovery\n",
|
| 16 |
+
"- Continuous operation safety checks\n",
|
| 17 |
+
"\n",
|
| 18 |
+
"### Validation Requirements:\n",
|
| 19 |
+
"- All models must pass accuracy thresholds\n",
|
| 20 |
+
"- Inference time must meet real-time requirements\n",
|
| 21 |
+
"- Edge cases must not cause crashes\n",
|
| 22 |
+
"- Memory usage must be within bounds"
|
| 23 |
+
]
|
| 24 |
+
},
|
| 25 |
+
{
|
| 26 |
+
"cell_type": "code",
|
| 27 |
+
"execution_count": null,
|
| 28 |
+
"id": "781bbd3c",
|
| 29 |
+
"metadata": {},
|
| 30 |
+
"outputs": [],
|
| 31 |
+
"source": [
|
| 32 |
+
"import json\n",
|
| 33 |
+
"import pandas as pd\n",
|
| 34 |
+
"import numpy as np\n",
|
| 35 |
+
"from pathlib import Path\n",
|
| 36 |
+
"from typing import Dict, List, Any, Optional\n",
|
| 37 |
+
"import time\n",
|
| 38 |
+
"import traceback\n",
|
| 39 |
+
"import joblib\n",
|
| 40 |
+
"import warnings\n",
|
| 41 |
+
"warnings.filterwarnings('ignore')\n",
|
| 42 |
+
"\n",
|
| 43 |
+
"from sklearn.metrics import (\n",
|
| 44 |
+
" accuracy_score, precision_score, recall_score, f1_score,\n",
|
| 45 |
+
" confusion_matrix, classification_report, roc_auc_score\n",
|
| 46 |
+
")\n",
|
| 47 |
+
"\n",
|
| 48 |
+
"# Configuration\n",
|
| 49 |
+
"config_path = Path(\"../notebook_config.json\")\n",
|
| 50 |
+
"with open(config_path) as f:\n",
|
| 51 |
+
" CONFIG = json.load(f)\n",
|
| 52 |
+
"\n",
|
| 53 |
+
"MODELS_DIR = Path(CONFIG[\"datasets_dir\"]).parent / \"models\"\n",
|
| 54 |
+
"VALIDATION_DIR = MODELS_DIR.parent / \"validation\"\n",
|
| 55 |
+
"VALIDATION_DIR.mkdir(exist_ok=True)\n",
|
| 56 |
+
"\n",
|
| 57 |
+
"print(f\"β Configuration loaded\")\n",
|
| 58 |
+
"print(f\"β Models from: {MODELS_DIR}\")\n",
|
| 59 |
+
"print(f\"β Validation output: {VALIDATION_DIR}\")"
|
| 60 |
+
]
|
| 61 |
+
},
|
| 62 |
+
{
|
| 63 |
+
"cell_type": "markdown",
|
| 64 |
+
"id": "a3afcfd1",
|
| 65 |
+
"metadata": {},
|
| 66 |
+
"source": [
|
| 67 |
+
"## 1. Validation Thresholds"
|
| 68 |
+
]
|
| 69 |
+
},
|
| 70 |
+
{
|
| 71 |
+
"cell_type": "code",
|
| 72 |
+
"execution_count": null,
|
| 73 |
+
"id": "74553e0c",
|
| 74 |
+
"metadata": {},
|
| 75 |
+
"outputs": [],
|
| 76 |
+
"source": [
|
| 77 |
+
"class ValidationThresholds:\n",
|
| 78 |
+
" \"\"\"Production-ready thresholds for model validation\"\"\"\n",
|
| 79 |
+
" \n",
|
| 80 |
+
" # Performance thresholds\n",
|
| 81 |
+
" MIN_ACCURACY = 0.80\n",
|
| 82 |
+
" MIN_PRECISION = 0.75\n",
|
| 83 |
+
" MIN_RECALL = 0.70\n",
|
| 84 |
+
" MIN_F1 = 0.75\n",
|
| 85 |
+
" \n",
|
| 86 |
+
" # Latency thresholds (milliseconds)\n",
|
| 87 |
+
" MAX_INFERENCE_TIME_MS = 100\n",
|
| 88 |
+
" MAX_BATCH_INFERENCE_TIME_MS = 500\n",
|
| 89 |
+
" \n",
|
| 90 |
+
" # Resource thresholds\n",
|
| 91 |
+
" MAX_MODEL_SIZE_MB = 100\n",
|
| 92 |
+
" MAX_MEMORY_MB = 500\n",
|
| 93 |
+
" \n",
|
| 94 |
+
" # Stability thresholds\n",
|
| 95 |
+
" MIN_CONSISTENCY_SCORE = 0.95 # Same input should give same output\n",
|
| 96 |
+
" MAX_EDGE_CASE_FAILURE_RATE = 0.05\n",
|
| 97 |
+
" \n",
|
| 98 |
+
" @classmethod\n",
|
| 99 |
+
" def check_performance(cls, metrics: Dict) -> Dict[str, bool]:\n",
|
| 100 |
+
" \"\"\"Check if metrics pass thresholds\"\"\"\n",
|
| 101 |
+
" return {\n",
|
| 102 |
+
" 'accuracy': metrics.get('accuracy', 0) >= cls.MIN_ACCURACY,\n",
|
| 103 |
+
" 'precision': metrics.get('precision', 0) >= cls.MIN_PRECISION,\n",
|
| 104 |
+
" 'recall': metrics.get('recall', 0) >= cls.MIN_RECALL,\n",
|
| 105 |
+
" 'f1': metrics.get('f1', 0) >= cls.MIN_F1,\n",
|
| 106 |
+
" 'inference_time': metrics.get('inference_time_ms', 999) <= cls.MAX_INFERENCE_TIME_MS\n",
|
| 107 |
+
" }\n",
|
| 108 |
+
"\n",
|
| 109 |
+
"print(\"β Validation Thresholds loaded\")\n",
|
| 110 |
+
"print(f\" Min Accuracy: {ValidationThresholds.MIN_ACCURACY}\")\n",
|
| 111 |
+
"print(f\" Max Inference: {ValidationThresholds.MAX_INFERENCE_TIME_MS}ms\")"
|
| 112 |
+
]
|
| 113 |
+
},
|
| 114 |
+
{
|
| 115 |
+
"cell_type": "markdown",
|
| 116 |
+
"id": "8e96d341",
|
| 117 |
+
"metadata": {},
|
| 118 |
+
"source": [
|
| 119 |
+
"## 2. Load Models and Registry"
|
| 120 |
+
]
|
| 121 |
+
},
|
| 122 |
+
{
|
| 123 |
+
"cell_type": "code",
|
| 124 |
+
"execution_count": null,
|
| 125 |
+
"id": "3a0f6d54",
|
| 126 |
+
"metadata": {},
|
| 127 |
+
"outputs": [],
|
| 128 |
+
"source": [
|
| 129 |
+
"# Load model registry\n",
|
| 130 |
+
"registry_path = MODELS_DIR / \"model_registry.json\"\n",
|
| 131 |
+
"\n",
|
| 132 |
+
"if registry_path.exists():\n",
|
| 133 |
+
" with open(registry_path) as f:\n",
|
| 134 |
+
" registry = json.load(f)\n",
|
| 135 |
+
" print(f\"β Loaded registry with {len(registry.get('models', {}))} models\")\n",
|
| 136 |
+
"else:\n",
|
| 137 |
+
" print(\"β No model registry. Run 03_model_training.ipynb first.\")\n",
|
| 138 |
+
" registry = {'models': {}}\n",
|
| 139 |
+
"\n",
|
| 140 |
+
"# List available models\n",
|
| 141 |
+
"print(\"\\nAvailable models:\")\n",
|
| 142 |
+
"for name, info in registry.get('models', {}).items():\n",
|
| 143 |
+
" print(f\" - {name}: {info['best_model']} (F1: {info['f1_score']:.4f})\")"
|
| 144 |
+
]
|
| 145 |
+
},
|
| 146 |
+
{
|
| 147 |
+
"cell_type": "markdown",
|
| 148 |
+
"id": "91e2c16f",
|
| 149 |
+
"metadata": {},
|
| 150 |
+
"source": [
|
| 151 |
+
"## 3. Model Validator"
|
| 152 |
+
]
|
| 153 |
+
},
|
| 154 |
+
{
|
| 155 |
+
"cell_type": "code",
|
| 156 |
+
"execution_count": null,
|
| 157 |
+
"id": "9042d66f",
|
| 158 |
+
"metadata": {},
|
| 159 |
+
"outputs": [],
|
| 160 |
+
"source": [
|
| 161 |
+
"class ModelValidator:\n",
|
| 162 |
+
" \"\"\"\n",
|
| 163 |
+
" Comprehensive model validation for production readiness.\n",
|
| 164 |
+
" \"\"\"\n",
|
| 165 |
+
" \n",
|
| 166 |
+
" def __init__(self, models_dir: Path):\n",
|
| 167 |
+
" self.models_dir = models_dir\n",
|
| 168 |
+
" self.validation_results = {}\n",
|
| 169 |
+
" \n",
|
| 170 |
+
" def load_model_artifacts(self, model_name: str, model_type: str) -> Dict:\n",
|
| 171 |
+
" \"\"\"Load model and associated artifacts\"\"\"\n",
|
| 172 |
+
" model_dir = self.models_dir / model_name\n",
|
| 173 |
+
" \n",
|
| 174 |
+
" artifacts = {}\n",
|
| 175 |
+
" \n",
|
| 176 |
+
" # Load model\n",
|
| 177 |
+
" model_path = model_dir / f\"{model_type}.pkl\"\n",
|
| 178 |
+
" if model_path.exists():\n",
|
| 179 |
+
" artifacts['model'] = joblib.load(model_path)\n",
|
| 180 |
+
" artifacts['model_size_mb'] = model_path.stat().st_size / (1024 * 1024)\n",
|
| 181 |
+
" \n",
|
| 182 |
+
" # Load scaler\n",
|
| 183 |
+
" scaler_path = model_dir / \"scaler.pkl\"\n",
|
| 184 |
+
" if scaler_path.exists():\n",
|
| 185 |
+
" artifacts['scaler'] = joblib.load(scaler_path)\n",
|
| 186 |
+
" \n",
|
| 187 |
+
" # Load metadata\n",
|
| 188 |
+
" metadata_path = model_dir / f\"{model_type}_metadata.json\"\n",
|
| 189 |
+
" if metadata_path.exists():\n",
|
| 190 |
+
" with open(metadata_path) as f:\n",
|
| 191 |
+
" artifacts['metadata'] = json.load(f)\n",
|
| 192 |
+
" \n",
|
| 193 |
+
" return artifacts\n",
|
| 194 |
+
" \n",
|
| 195 |
+
" def validate_performance(self, model, X_test, y_test, scaler=None) -> Dict:\n",
|
| 196 |
+
" \"\"\"Validate model performance metrics\"\"\"\n",
|
| 197 |
+
" # Scale if needed\n",
|
| 198 |
+
" if scaler:\n",
|
| 199 |
+
" X_test = scaler.transform(X_test)\n",
|
| 200 |
+
" \n",
|
| 201 |
+
" # Predictions\n",
|
| 202 |
+
" start = time.time()\n",
|
| 203 |
+
" y_pred = model.predict(X_test)\n",
|
| 204 |
+
" inference_time = (time.time() - start) / len(X_test) * 1000\n",
|
| 205 |
+
" \n",
|
| 206 |
+
" # Metrics\n",
|
| 207 |
+
" metrics = {\n",
|
| 208 |
+
" 'accuracy': accuracy_score(y_test, y_pred),\n",
|
| 209 |
+
" 'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),\n",
|
| 210 |
+
" 'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),\n",
|
| 211 |
+
" 'f1': f1_score(y_test, y_pred, average='weighted', zero_division=0),\n",
|
| 212 |
+
" 'inference_time_ms': inference_time,\n",
|
| 213 |
+
" 'samples_tested': len(y_test)\n",
|
| 214 |
+
" }\n",
|
| 215 |
+
" \n",
|
| 216 |
+
" # Check thresholds\n",
|
| 217 |
+
" metrics['passed_thresholds'] = ValidationThresholds.check_performance(metrics)\n",
|
| 218 |
+
" metrics['all_passed'] = all(metrics['passed_thresholds'].values())\n",
|
| 219 |
+
" \n",
|
| 220 |
+
" return metrics\n",
|
| 221 |
+
" \n",
|
| 222 |
+
" def validate_edge_cases(self, model, scaler=None) -> Dict:\n",
|
| 223 |
+
" \"\"\"Test model behavior on edge cases\"\"\"\n",
|
| 224 |
+
" results = {\n",
|
| 225 |
+
" 'tests_run': 0,\n",
|
| 226 |
+
" 'tests_passed': 0,\n",
|
| 227 |
+
" 'errors': []\n",
|
| 228 |
+
" }\n",
|
| 229 |
+
" \n",
|
| 230 |
+
" # Get expected feature count\n",
|
| 231 |
+
" if hasattr(model, 'n_features_in_'):\n",
|
| 232 |
+
" n_features = model.n_features_in_\n",
|
| 233 |
+
" else:\n",
|
| 234 |
+
" n_features = 10 # Default\n",
|
| 235 |
+
" \n",
|
| 236 |
+
" edge_cases = [\n",
|
| 237 |
+
" ('zeros', np.zeros((1, n_features))),\n",
|
| 238 |
+
" ('ones', np.ones((1, n_features))),\n",
|
| 239 |
+
" ('large_values', np.ones((1, n_features)) * 1e6),\n",
|
| 240 |
+
" ('negative', -np.ones((1, n_features))),\n",
|
| 241 |
+
" ('mixed', np.random.randn(1, n_features) * 100),\n",
|
| 242 |
+
" ]\n",
|
| 243 |
+
" \n",
|
| 244 |
+
" for case_name, X in edge_cases:\n",
|
| 245 |
+
" results['tests_run'] += 1\n",
|
| 246 |
+
" try:\n",
|
| 247 |
+
" if scaler:\n",
|
| 248 |
+
" X = scaler.transform(X)\n",
|
| 249 |
+
" pred = model.predict(X)\n",
|
| 250 |
+
" \n",
|
| 251 |
+
" # Check prediction is valid\n",
|
| 252 |
+
" if pred is not None and len(pred) == 1:\n",
|
| 253 |
+
" results['tests_passed'] += 1\n",
|
| 254 |
+
" else:\n",
|
| 255 |
+
" results['errors'].append(f\"{case_name}: Invalid prediction shape\")\n",
|
| 256 |
+
" \n",
|
| 257 |
+
" except Exception as e:\n",
|
| 258 |
+
" results['errors'].append(f\"{case_name}: {str(e)}\")\n",
|
| 259 |
+
" \n",
|
| 260 |
+
" results['pass_rate'] = results['tests_passed'] / max(results['tests_run'], 1)\n",
|
| 261 |
+
" return results\n",
|
| 262 |
+
" \n",
|
| 263 |
+
" def validate_consistency(self, model, scaler=None, n_runs: int = 10) -> Dict:\n",
|
| 264 |
+
" \"\"\"Test prediction consistency (same input = same output)\"\"\"\n",
|
| 265 |
+
" if hasattr(model, 'n_features_in_'):\n",
|
| 266 |
+
" n_features = model.n_features_in_\n",
|
| 267 |
+
" else:\n",
|
| 268 |
+
" n_features = 10\n",
|
| 269 |
+
" \n",
|
| 270 |
+
" # Fixed input\n",
|
| 271 |
+
" np.random.seed(42)\n",
|
| 272 |
+
" X = np.random.randn(1, n_features)\n",
|
| 273 |
+
" \n",
|
| 274 |
+
" if scaler:\n",
|
| 275 |
+
" X = scaler.transform(X)\n",
|
| 276 |
+
" \n",
|
| 277 |
+
" predictions = []\n",
|
| 278 |
+
" for _ in range(n_runs):\n",
|
| 279 |
+
" pred = model.predict(X)[0]\n",
|
| 280 |
+
" predictions.append(pred)\n",
|
| 281 |
+
" \n",
|
| 282 |
+
" unique_preds = len(set(predictions))\n",
|
| 283 |
+
" consistency = 1.0 if unique_preds == 1 else 1.0 / unique_preds\n",
|
| 284 |
+
" \n",
|
| 285 |
+
" return {\n",
|
| 286 |
+
" 'consistency_score': consistency,\n",
|
| 287 |
+
" 'unique_predictions': unique_preds,\n",
|
| 288 |
+
" 'is_consistent': unique_preds == 1\n",
|
| 289 |
+
" }\n",
|
| 290 |
+
" \n",
|
| 291 |
+
" def validate_latency(self, model, scaler=None, n_samples: int = 100) -> Dict:\n",
|
| 292 |
+
" \"\"\"Validate inference latency\"\"\"\n",
|
| 293 |
+
" if hasattr(model, 'n_features_in_'):\n",
|
| 294 |
+
" n_features = model.n_features_in_\n",
|
| 295 |
+
" else:\n",
|
| 296 |
+
" n_features = 10\n",
|
| 297 |
+
" \n",
|
| 298 |
+
" X = np.random.randn(n_samples, n_features)\n",
|
| 299 |
+
" if scaler:\n",
|
| 300 |
+
" X = scaler.transform(X)\n",
|
| 301 |
+
" \n",
|
| 302 |
+
" # Single sample latency\n",
|
| 303 |
+
" single_times = []\n",
|
| 304 |
+
" for i in range(min(10, n_samples)):\n",
|
| 305 |
+
" start = time.time()\n",
|
| 306 |
+
" model.predict(X[i:i+1])\n",
|
| 307 |
+
" single_times.append((time.time() - start) * 1000)\n",
|
| 308 |
+
" \n",
|
| 309 |
+
" # Batch latency\n",
|
| 310 |
+
" start = time.time()\n",
|
| 311 |
+
" model.predict(X)\n",
|
| 312 |
+
" batch_time = (time.time() - start) * 1000\n",
|
| 313 |
+
" \n",
|
| 314 |
+
" return {\n",
|
| 315 |
+
" 'single_mean_ms': np.mean(single_times),\n",
|
| 316 |
+
" 'single_max_ms': np.max(single_times),\n",
|
| 317 |
+
" 'single_std_ms': np.std(single_times),\n",
|
| 318 |
+
" 'batch_total_ms': batch_time,\n",
|
| 319 |
+
" 'batch_per_sample_ms': batch_time / n_samples,\n",
|
| 320 |
+
" 'meets_latency_target': np.mean(single_times) <= ValidationThresholds.MAX_INFERENCE_TIME_MS\n",
|
| 321 |
+
" }\n",
|
| 322 |
+
"\n",
|
| 323 |
+
"validator = ModelValidator(MODELS_DIR)\n",
|
| 324 |
+
"print(\"β Model Validator initialized\")"
|
| 325 |
+
]
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"cell_type": "markdown",
|
| 329 |
+
"id": "53cddd40",
|
| 330 |
+
"metadata": {},
|
| 331 |
+
"source": [
|
| 332 |
+
"## 4. Run Validation Suite"
|
| 333 |
+
]
|
| 334 |
+
},
|
| 335 |
+
{
|
| 336 |
+
"cell_type": "code",
|
| 337 |
+
"execution_count": null,
|
| 338 |
+
"id": "ccc75859",
|
| 339 |
+
"metadata": {},
|
| 340 |
+
"outputs": [],
|
| 341 |
+
"source": [
|
| 342 |
+
"# Run validation on all models\n",
|
| 343 |
+
"validation_results = {}\n",
|
| 344 |
+
"\n",
|
| 345 |
+
"print(\"Running validation suite...\\n\")\n",
|
| 346 |
+
"\n",
|
| 347 |
+
"for model_name, model_info in registry.get('models', {}).items():\n",
|
| 348 |
+
" print(f\"{'='*50}\")\n",
|
| 349 |
+
" print(f\"Validating: {model_name}\")\n",
|
| 350 |
+
" print(f\"{'='*50}\")\n",
|
| 351 |
+
" \n",
|
| 352 |
+
" # Load model artifacts\n",
|
| 353 |
+
" artifacts = validator.load_model_artifacts(model_name, model_info['best_model'])\n",
|
| 354 |
+
" \n",
|
| 355 |
+
" if 'model' not in artifacts:\n",
|
| 356 |
+
" print(f\" β Model not found\")\n",
|
| 357 |
+
" continue\n",
|
| 358 |
+
" \n",
|
| 359 |
+
" model = artifacts['model']\n",
|
| 360 |
+
" scaler = artifacts.get('scaler')\n",
|
| 361 |
+
" \n",
|
| 362 |
+
" results = {\n",
|
| 363 |
+
" 'model_name': model_name,\n",
|
| 364 |
+
" 'model_type': model_info['best_model'],\n",
|
| 365 |
+
" 'model_size_mb': artifacts.get('model_size_mb', 0)\n",
|
| 366 |
+
" }\n",
|
| 367 |
+
" \n",
|
| 368 |
+
" # Edge case validation\n",
|
| 369 |
+
" print(\"\\n Edge Case Testing...\")\n",
|
| 370 |
+
" edge_results = validator.validate_edge_cases(model, scaler)\n",
|
| 371 |
+
" results['edge_cases'] = edge_results\n",
|
| 372 |
+
" print(f\" Pass rate: {edge_results['pass_rate']:.2%}\")\n",
|
| 373 |
+
" if edge_results['errors']:\n",
|
| 374 |
+
" for err in edge_results['errors'][:2]:\n",
|
| 375 |
+
" print(f\" β {err}\")\n",
|
| 376 |
+
" \n",
|
| 377 |
+
" # Consistency validation\n",
|
| 378 |
+
" print(\"\\n Consistency Testing...\")\n",
|
| 379 |
+
" consistency_results = validator.validate_consistency(model, scaler)\n",
|
| 380 |
+
" results['consistency'] = consistency_results\n",
|
| 381 |
+
" print(f\" Consistent: {consistency_results['is_consistent']}\")\n",
|
| 382 |
+
" \n",
|
| 383 |
+
" # Latency validation\n",
|
| 384 |
+
" print(\"\\n Latency Testing...\")\n",
|
| 385 |
+
" latency_results = validator.validate_latency(model, scaler)\n",
|
| 386 |
+
" results['latency'] = latency_results\n",
|
| 387 |
+
" print(f\" Single inference: {latency_results['single_mean_ms']:.3f}ms\")\n",
|
| 388 |
+
" print(f\" Meets target: {latency_results['meets_latency_target']}\")\n",
|
| 389 |
+
" \n",
|
| 390 |
+
" # Overall validation status\n",
|
| 391 |
+
" passed = (\n",
|
| 392 |
+
" edge_results['pass_rate'] >= (1 - ValidationThresholds.MAX_EDGE_CASE_FAILURE_RATE) and\n",
|
| 393 |
+
" consistency_results['is_consistent'] and\n",
|
| 394 |
+
" latency_results['meets_latency_target']\n",
|
| 395 |
+
" )\n",
|
| 396 |
+
" \n",
|
| 397 |
+
" results['validation_passed'] = passed\n",
|
| 398 |
+
" validation_results[model_name] = results\n",
|
| 399 |
+
" \n",
|
| 400 |
+
" status = \"β PASSED\" if passed else \"β FAILED\"\n",
|
| 401 |
+
" print(f\"\\n Status: {status}\")\n",
|
| 402 |
+
"\n",
|
| 403 |
+
"print(f\"\\n\\nβ Validation complete for {len(validation_results)} models\")"
|
| 404 |
+
]
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
"cell_type": "markdown",
|
| 408 |
+
"id": "45e71432",
|
| 409 |
+
"metadata": {},
|
| 410 |
+
"source": [
|
| 411 |
+
"## 5. Generate Validation Report"
|
| 412 |
+
]
|
| 413 |
+
},
|
| 414 |
+
{
|
| 415 |
+
"cell_type": "code",
|
| 416 |
+
"execution_count": null,
|
| 417 |
+
"id": "24bbe906",
|
| 418 |
+
"metadata": {},
|
| 419 |
+
"outputs": [],
|
| 420 |
+
"source": [
|
| 421 |
+
"class ValidationReporter:\n",
|
| 422 |
+
" \"\"\"Generate validation reports for documentation\"\"\"\n",
|
| 423 |
+
" \n",
|
| 424 |
+
" @staticmethod\n",
|
| 425 |
+
" def generate_report(results: Dict) -> str:\n",
|
| 426 |
+
" \"\"\"Generate markdown validation report\"\"\"\n",
|
| 427 |
+
" lines = [\n",
|
| 428 |
+
" \"# CyberForge Model Validation Report\",\n",
|
| 429 |
+
" \"\",\n",
|
| 430 |
+
" f\"**Generated:** {time.strftime('%Y-%m-%d %H:%M:%S')}\",\n",
|
| 431 |
+
" f\"**Models Validated:** {len(results)}\",\n",
|
| 432 |
+
" \"\",\n",
|
| 433 |
+
" \"## Summary\",\n",
|
| 434 |
+
" \"\",\n",
|
| 435 |
+
" \"| Model | Type | Size (MB) | Edge Cases | Consistency | Latency (ms) | Status |\",\n",
|
| 436 |
+
" \"|-------|------|-----------|------------|-------------|--------------|--------|\"\n",
|
| 437 |
+
" ]\n",
|
| 438 |
+
" \n",
|
| 439 |
+
" for name, data in results.items():\n",
|
| 440 |
+
" status = \"β Pass\" if data.get('validation_passed') else \"β Fail\"\n",
|
| 441 |
+
" edge = f\"{data.get('edge_cases', {}).get('pass_rate', 0):.0%}\"\n",
|
| 442 |
+
" consist = \"β\" if data.get('consistency', {}).get('is_consistent') else \"β\"\n",
|
| 443 |
+
" latency = f\"{data.get('latency', {}).get('single_mean_ms', 999):.2f}\"\n",
|
| 444 |
+
" \n",
|
| 445 |
+
" lines.append(\n",
|
| 446 |
+
" f\"| {name} | {data.get('model_type', 'N/A')} | \"\n",
|
| 447 |
+
" f\"{data.get('model_size_mb', 0):.2f} | {edge} | {consist} | {latency} | {status} |\"\n",
|
| 448 |
+
" )\n",
|
| 449 |
+
" \n",
|
| 450 |
+
" lines.extend([\n",
|
| 451 |
+
" \"\",\n",
|
| 452 |
+
" \"## Validation Thresholds\",\n",
|
| 453 |
+
" \"\",\n",
|
| 454 |
+
" f\"- Min Accuracy: {ValidationThresholds.MIN_ACCURACY}\",\n",
|
| 455 |
+
" f\"- Max Inference Time: {ValidationThresholds.MAX_INFERENCE_TIME_MS}ms\",\n",
|
| 456 |
+
" f\"- Max Edge Case Failure Rate: {ValidationThresholds.MAX_EDGE_CASE_FAILURE_RATE:.0%}\",\n",
|
| 457 |
+
" f\"- Min Consistency Score: {ValidationThresholds.MIN_CONSISTENCY_SCORE}\",\n",
|
| 458 |
+
" ])\n",
|
| 459 |
+
" \n",
|
| 460 |
+
" return \"\\n\".join(lines)\n",
|
| 461 |
+
"\n",
|
| 462 |
+
"# Generate report\n",
|
| 463 |
+
"report = ValidationReporter.generate_report(validation_results)\n",
|
| 464 |
+
"\n",
|
| 465 |
+
"report_path = VALIDATION_DIR / \"validation_report.md\"\n",
|
| 466 |
+
"with open(report_path, 'w') as f:\n",
|
| 467 |
+
" f.write(report)\n",
|
| 468 |
+
"\n",
|
| 469 |
+
"print(f\"β Report saved to: {report_path}\")\n",
|
| 470 |
+
"print(\"\\n\" + report)"
|
| 471 |
+
]
|
| 472 |
+
},
|
| 473 |
+
{
|
| 474 |
+
"cell_type": "markdown",
|
| 475 |
+
"id": "a0a647bc",
|
| 476 |
+
"metadata": {},
|
| 477 |
+
"source": [
|
| 478 |
+
"## 6. Save Validation Results"
|
| 479 |
+
]
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"cell_type": "code",
|
| 483 |
+
"execution_count": null,
|
| 484 |
+
"id": "8b52d7da",
|
| 485 |
+
"metadata": {},
|
| 486 |
+
"outputs": [],
|
| 487 |
+
"source": [
|
| 488 |
+
"# Save detailed validation results\n",
|
| 489 |
+
"results_path = VALIDATION_DIR / \"validation_results.json\"\n",
|
| 490 |
+
"\n",
|
| 491 |
+
"# Make results JSON-serializable\n",
|
| 492 |
+
"serializable_results = {}\n",
|
| 493 |
+
"for name, data in validation_results.items():\n",
|
| 494 |
+
" serializable_results[name] = {\n",
|
| 495 |
+
" k: v if not isinstance(v, np.floating) else float(v)\n",
|
| 496 |
+
" for k, v in data.items()\n",
|
| 497 |
+
" }\n",
|
| 498 |
+
"\n",
|
| 499 |
+
"with open(results_path, 'w') as f:\n",
|
| 500 |
+
" json.dump({\n",
|
| 501 |
+
" 'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),\n",
|
| 502 |
+
" 'thresholds': {\n",
|
| 503 |
+
" 'min_accuracy': ValidationThresholds.MIN_ACCURACY,\n",
|
| 504 |
+
" 'max_inference_time_ms': ValidationThresholds.MAX_INFERENCE_TIME_MS,\n",
|
| 505 |
+
" 'max_edge_case_failure_rate': ValidationThresholds.MAX_EDGE_CASE_FAILURE_RATE\n",
|
| 506 |
+
" },\n",
|
| 507 |
+
" 'results': serializable_results\n",
|
| 508 |
+
" }, f, indent=2, default=str)\n",
|
| 509 |
+
"\n",
|
| 510 |
+
"print(f\"β Results saved to: {results_path}\")"
|
| 511 |
+
]
|
| 512 |
+
},
|
| 513 |
+
{
|
| 514 |
+
"cell_type": "markdown",
|
| 515 |
+
"id": "e4b142e1",
|
| 516 |
+
"metadata": {},
|
| 517 |
+
"source": [
|
| 518 |
+
"## 7. Summary"
|
| 519 |
+
]
|
| 520 |
+
},
|
| 521 |
+
{
|
| 522 |
+
"cell_type": "code",
|
| 523 |
+
"execution_count": null,
|
| 524 |
+
"id": "e9532d95",
|
| 525 |
+
"metadata": {},
|
| 526 |
+
"outputs": [],
|
| 527 |
+
"source": [
|
| 528 |
+
"# Calculate summary stats\n",
|
| 529 |
+
"passed_count = sum(1 for r in validation_results.values() if r.get('validation_passed'))\n",
|
| 530 |
+
"total_count = len(validation_results)\n",
|
| 531 |
+
"\n",
|
| 532 |
+
"print(\"\\n\" + \"=\" * 60)\n",
|
| 533 |
+
"print(\"MODEL VALIDATION COMPLETE\")\n",
|
| 534 |
+
"print(\"=\" * 60)\n",
|
| 535 |
+
"\n",
|
| 536 |
+
"print(f\"\"\"\n",
|
| 537 |
+
"β
Validation Summary:\n",
|
| 538 |
+
" - Models validated: {total_count}\n",
|
| 539 |
+
" - Models passed: {passed_count}\n",
|
| 540 |
+
" - Models failed: {total_count - passed_count}\n",
|
| 541 |
+
" - Pass rate: {passed_count/max(total_count,1):.0%}\n",
|
| 542 |
+
"\n",
|
| 543 |
+
"π Validation Checks:\n",
|
| 544 |
+
" β Edge case handling\n",
|
| 545 |
+
" β Prediction consistency\n",
|
| 546 |
+
" β Inference latency\n",
|
| 547 |
+
" β Model size limits\n",
|
| 548 |
+
"\n",
|
| 549 |
+
"π Output Files:\n",
|
| 550 |
+
" - Report: {VALIDATION_DIR}/validation_report.md\n",
|
| 551 |
+
" - Results: {VALIDATION_DIR}/validation_results.json\n",
|
| 552 |
+
"\n",
|
| 553 |
+
"Models Ready for Production:\"\"\")\n",
|
| 554 |
+
"\n",
|
| 555 |
+
"for name, data in validation_results.items():\n",
|
| 556 |
+
" status = \"β\" if data.get('validation_passed') else \"β\"\n",
|
| 557 |
+
" print(f\" {status} {name}\")\n",
|
| 558 |
+
"\n",
|
| 559 |
+
"print(f\"\"\"\n",
|
| 560 |
+
"Next step:\n",
|
| 561 |
+
" β 06_backend_integration.ipynb\n",
|
| 562 |
+
"\"\"\")\n",
|
| 563 |
+
"print(\"=\" * 60)"
|
| 564 |
+
]
|
| 565 |
+
}
|
| 566 |
+
],
|
| 567 |
+
"metadata": {
|
| 568 |
+
"language_info": {
|
| 569 |
+
"name": "python"
|
| 570 |
+
}
|
| 571 |
+
},
|
| 572 |
+
"nbformat": 4,
|
| 573 |
+
"nbformat_minor": 5
|
| 574 |
+
}
|