Che237 committed on
Commit
a1f6dc3
·
verified ·
1 Parent(s): 3d2a643

Add 03_model_training.ipynb

Browse files
Files changed (1) hide show
  1. notebooks/03_model_training.ipynb +691 -0
notebooks/03_model_training.ipynb ADDED
@@ -0,0 +1,691 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "eae17b13",
6
+ "metadata": {},
7
+ "source": [
8
+ "# 03 - Model Training\n",
9
+ "\n",
10
+ "## CyberForge AI - Lightweight Security Models\n",
11
+ "\n",
12
+ "This notebook trains production-ready ML models optimized for:\n",
13
+ "- Real-time inference\n",
14
+ "- Backend API integration\n",
15
+ "- Agentic AI workflows\n",
16
+ "\n",
17
+ "### Model Categories:\n",
18
+ "1. **Risk Scoring** - Website security risk assessment\n",
19
+ "2. **Threat Classification** - Malware, phishing, anomaly detection\n",
20
+ "3. **Behavioral Analysis** - Pattern-based threat detection\n",
21
+ "\n",
22
+ "### Backend Alignment:\n",
23
+ "- Models compatible with mlService.js\n",
24
+ "- Output format matches ThreatService expectations\n",
25
+ "- Inference time < 100ms for real-time use"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "id": "473944d7",
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "import json\n",
36
+ "import pandas as pd\n",
37
+ "import numpy as np\n",
38
+ "from pathlib import Path\n",
39
+ "from typing import Dict, List, Any, Optional, Tuple\n",
40
+ "import time\n",
41
+ "import warnings\n",
42
+ "warnings.filterwarnings('ignore')\n",
43
+ "\n",
44
+ "# ML Libraries\n",
45
+ "from sklearn.model_selection import train_test_split, cross_val_score\n",
46
+ "from sklearn.preprocessing import StandardScaler, LabelEncoder\n",
47
+ "from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix\n",
48
+ "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
49
+ "from sklearn.linear_model import LogisticRegression\n",
50
+ "from sklearn.svm import SVC\n",
51
+ "import joblib\n",
52
+ "\n",
53
+ "# Configuration\n",
54
+ "config_path = Path(\"../notebook_config.json\")\n",
55
+ "with open(config_path) as f:\n",
56
+ " CONFIG = json.load(f)\n",
57
+ "\n",
58
+ "DATASETS_DIR = Path(CONFIG[\"datasets_dir\"])\n",
59
+ "FEATURES_DIR = DATASETS_DIR / \"features\"\n",
60
+ "MODELS_DIR = DATASETS_DIR.parent / \"models\"\n",
61
+ "MODELS_DIR.mkdir(exist_ok=True)\n",
62
+ "\n",
63
+ "print(f\"✓ Configuration loaded\")\n",
63
+ "print(f\"✓ Features from: {FEATURES_DIR}\")\n",
64
+ "print(f\"✓ Models output: {MODELS_DIR}\")"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "markdown",
70
+ "id": "fe015229",
71
+ "metadata": {},
72
+ "source": [
73
+ "## 1. Load Feature-Engineered Data"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": null,
79
+ "id": "46797075",
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "# Load feature manifest\n",
84
+ "feature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
85
+ "\n",
86
+ "if feature_manifest_path.exists():\n",
87
+ " with open(feature_manifest_path) as f:\n",
88
+ " feature_manifest = json.load(f)\n",
89
+ " print(f\"✓ Loaded {len(feature_manifest)} feature datasets\")\n",
90
+ "else:\n",
91
+ " print(\"⚠ No feature manifest. Run 02_feature_engineering.ipynb first.\")\n",
92
+ " feature_manifest = []\n",
93
+ "\n",
94
+ "# Load datasets\n",
95
+ "datasets = {}\n",
96
+ "print(\"\\nLoading feature datasets:\")\n",
97
+ "\n",
98
+ "for entry in feature_manifest:\n",
99
+ " name = entry['name']\n",
100
+ " path = Path(\"..\") / entry['path']\n",
101
+ " \n",
102
+ " if path.exists() and entry.get('has_labels', False):\n",
103
+ " df = pd.read_parquet(path)\n",
104
+ " datasets[name] = df\n",
105
+ " print(f\" ✓ {name}: {len(df)} samples, {len(df.columns)} features\")\n",
106
+ " else:\n",
107
+ " print(f\" ⚠ {name}: No labels or file missing\")\n",
108
+ "\n",
109
+ "print(f\"\\n✓ Loaded {len(datasets)} datasets with labels for training\")"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "markdown",
114
+ "id": "71c83005",
115
+ "metadata": {},
116
+ "source": [
117
+ "## 2. Model Configuration"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": null,
123
+ "id": "600086b6",
124
+ "metadata": {},
125
+ "outputs": [],
126
+ "source": [
127
+ "class ModelConfig:\n",
128
+ " \"\"\"\n",
129
+ " Model configurations optimized for production.\n",
130
+ " Models are lightweight for fast inference.\n",
131
+ " \"\"\"\n",
132
+ " \n",
133
+ " # Model definitions\n",
134
+ " MODELS = {\n",
135
+ " 'random_forest': {\n",
136
+ " 'class': RandomForestClassifier,\n",
137
+ " 'params': {\n",
138
+ " 'n_estimators': 100,\n",
139
+ " 'max_depth': 10,\n",
140
+ " 'min_samples_split': 5,\n",
141
+ " 'min_samples_leaf': 2,\n",
142
+ " 'n_jobs': -1,\n",
143
+ " 'random_state': 42\n",
144
+ " },\n",
145
+ " 'inference_time_target': 50 # ms\n",
146
+ " },\n",
147
+ " 'gradient_boosting': {\n",
148
+ " 'class': GradientBoostingClassifier,\n",
149
+ " 'params': {\n",
150
+ " 'n_estimators': 50,\n",
151
+ " 'max_depth': 5,\n",
152
+ " 'learning_rate': 0.1,\n",
153
+ " 'random_state': 42\n",
154
+ " },\n",
155
+ " 'inference_time_target': 30 # ms\n",
156
+ " },\n",
157
+ " 'logistic_regression': {\n",
158
+ " 'class': LogisticRegression,\n",
159
+ " 'params': {\n",
160
+ " 'max_iter': 1000,\n",
161
+ " 'random_state': 42\n",
162
+ " },\n",
163
+ " 'inference_time_target': 5 # ms\n",
164
+ " }\n",
165
+ " }\n",
166
+ " \n",
167
+ " # Dataset to model mapping\n",
168
+ " TASK_MODELS = {\n",
169
+ " 'phishing_detection': ['random_forest', 'gradient_boosting'],\n",
170
+ " 'malware_detection': ['random_forest', 'gradient_boosting'],\n",
171
+ " 'anomaly_detection': ['random_forest'],\n",
172
+ " 'web_attack_detection': ['random_forest', 'gradient_boosting'],\n",
173
+ " 'threat_intelligence': ['logistic_regression', 'random_forest'],\n",
174
+ " 'vulnerability_assessment': ['gradient_boosting']\n",
175
+ " }\n",
176
+ " \n",
177
+ " @classmethod\n",
178
+ " def get_models_for_task(cls, task_name: str) -> List[str]:\n",
179
+ " \"\"\"Get recommended models for a task\"\"\"\n",
180
+ " # Match partial task names\n",
181
+ " for key, models in cls.TASK_MODELS.items():\n",
182
+ " if key in task_name.lower():\n",
183
+ " return models\n",
184
+ " return ['random_forest'] # Default\n",
185
+ "\n",
186
+ "print(\"✓ Model Configuration loaded\")\n",
187
+ "print(f\" Available models: {list(ModelConfig.MODELS.keys())}\")"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "markdown",
192
+ "id": "ad7d2f43",
193
+ "metadata": {},
194
+ "source": [
195
+ "## 3. Training Pipeline"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": null,
201
+ "id": "d9b11805",
202
+ "metadata": {},
203
+ "outputs": [],
204
+ "source": [
205
+ "class CyberForgeTrainer:\n",
206
+ " \"\"\"\n",
207
+ " Training pipeline for CyberForge security models.\n",
208
+ " Optimized for production deployment and fast inference.\n",
209
+ " \"\"\"\n",
210
+ " \n",
211
+ " def __init__(self):\n",
212
+ " self.scaler = StandardScaler()\n",
213
+ " self.label_encoder = LabelEncoder()\n",
214
+ " self.trained_models = {}\n",
215
+ " self.training_metrics = {}\n",
216
+ " \n",
217
+ " def prepare_data(self, df: pd.DataFrame, label_col: str = 'label', \n",
218
+ " test_size: float = 0.2) -> Tuple:\n",
219
+ " \"\"\"Prepare data for training\"\"\"\n",
220
+ " # Separate features and labels\n",
221
+ " y = df[label_col]\n",
222
+ " X = df.drop(columns=[label_col])\n",
223
+ " \n",
224
+ " # Keep only numeric columns\n",
225
+ " X = X.select_dtypes(include=[np.number]).fillna(0)\n",
226
+ " \n",
227
+ " # Encode labels if needed\n",
228
+ " if y.dtype == 'object':\n",
229
+ " y = self.label_encoder.fit_transform(y)\n",
230
+ " \n",
231
+ " # Scale features\n",
232
+ " X_scaled = self.scaler.fit_transform(X)\n",
233
+ " \n",
234
+ " # Split\n",
235
+ " X_train, X_test, y_train, y_test = train_test_split(\n",
236
+ " X_scaled, y, test_size=test_size, random_state=42, stratify=y\n",
237
+ " )\n",
238
+ " \n",
239
+ " return X_train, X_test, y_train, y_test, X.columns.tolist()\n",
240
+ " \n",
241
+ " def train_model(self, X_train, y_train, model_type: str) -> Any:\n",
242
+ " \"\"\"Train a single model\"\"\"\n",
243
+ " config = ModelConfig.MODELS.get(model_type)\n",
244
+ " if not config:\n",
245
+ " raise ValueError(f\"Unknown model type: {model_type}\")\n",
246
+ " \n",
247
+ " model = config['class'](**config['params'])\n",
248
+ " \n",
249
+ " start_time = time.time()\n",
250
+ " model.fit(X_train, y_train)\n",
251
+ " train_time = time.time() - start_time\n",
252
+ " \n",
253
+ " return model, train_time\n",
254
+ " \n",
255
+ " def evaluate_model(self, model, X_test, y_test) -> Dict:\n",
256
+ " \"\"\"Evaluate model performance\"\"\"\n",
257
+ " # Predictions\n",
258
+ " start_time = time.time()\n",
259
+ " y_pred = model.predict(X_test)\n",
260
+ " inference_time = (time.time() - start_time) / len(X_test) * 1000 # ms per sample\n",
261
+ " \n",
262
+ " # Probabilities if available\n",
263
+ " if hasattr(model, 'predict_proba'):\n",
264
+ " y_proba = model.predict_proba(X_test)\n",
265
+ " else:\n",
266
+ " y_proba = None\n",
267
+ " \n",
268
+ " # Metrics\n",
269
+ " accuracy = accuracy_score(y_test, y_pred)\n",
270
+ " f1 = f1_score(y_test, y_pred, average='weighted')\n",
271
+ " \n",
272
+ " return {\n",
273
+ " 'accuracy': accuracy,\n",
274
+ " 'f1_score': f1,\n",
275
+ " 'inference_time_ms': inference_time,\n",
276
+ " 'predictions': y_pred,\n",
277
+ " 'probabilities': y_proba\n",
278
+ " }\n",
279
+ " \n",
280
+ " def train_for_dataset(self, df: pd.DataFrame, dataset_name: str) -> Dict:\n",
281
+ " \"\"\"Train all recommended models for a dataset\"\"\"\n",
282
+ " print(f\"\\n{'='*50}\")\n",
283
+ " print(f\"Training models for: {dataset_name}\")\n",
284
+ " print(f\"{'='*50}\")\n",
285
+ " \n",
286
+ " # Prepare data\n",
287
+ " X_train, X_test, y_train, y_test, feature_names = self.prepare_data(df)\n",
288
+ " print(f\" Data: {len(X_train)} train, {len(X_test)} test samples\")\n",
289
+ " print(f\" Features: {len(feature_names)}\")\n",
290
+ " \n",
291
+ " # Get recommended models\n",
292
+ " model_types = ModelConfig.get_models_for_task(dataset_name)\n",
293
+ " \n",
294
+ " results = {}\n",
295
+ " best_model = None\n",
296
+ " best_score = 0\n",
297
+ " \n",
298
+ " for model_type in model_types:\n",
299
+ " print(f\"\\n Training: {model_type}\")\n",
300
+ " \n",
301
+ " # Train\n",
302
+ " model, train_time = self.train_model(X_train, y_train, model_type)\n",
303
+ " print(f\" Training time: {train_time:.2f}s\")\n",
304
+ " \n",
305
+ " # Evaluate\n",
306
+ " metrics = self.evaluate_model(model, X_test, y_test)\n",
307
+ " print(f\" Accuracy: {metrics['accuracy']:.4f}\")\n",
308
+ " print(f\" F1 Score: {metrics['f1_score']:.4f}\")\n",
309
+ " print(f\" Inference: {metrics['inference_time_ms']:.3f}ms/sample\")\n",
310
+ " \n",
311
+ " results[model_type] = {\n",
312
+ " 'model': model,\n",
313
+ " 'metrics': metrics,\n",
314
+ " 'train_time': train_time,\n",
315
+ " 'feature_names': feature_names\n",
316
+ " }\n",
317
+ " \n",
318
+ " # Track best\n",
319
+ " if metrics['f1_score'] > best_score:\n",
320
+ " best_score = metrics['f1_score']\n",
321
+ " best_model = model_type\n",
322
+ " \n",
323
+ " print(f\"\\n ✓ Best model: {best_model} (F1: {best_score:.4f})\")\n",
324
+ " \n",
325
+ " # Store results\n",
326
+ " self.trained_models[dataset_name] = {\n",
327
+ " 'models': results,\n",
328
+ " 'best_model': best_model,\n",
329
+ " 'scaler': self.scaler,\n",
330
+ " 'label_encoder': self.label_encoder if hasattr(self.label_encoder, 'classes_') else None\n",
331
+ " }\n",
332
+ " \n",
333
+ " return results\n",
334
+ "\n",
335
+ "trainer = CyberForgeTrainer()\n",
336
+ "print(\"✓ CyberForge Trainer initialized\")"
337
+ ]
338
+ },
339
+ {
340
+ "cell_type": "markdown",
341
+ "id": "828ef403",
342
+ "metadata": {},
343
+ "source": [
344
+ "## 4. Train Models"
345
+ ]
346
+ },
347
+ {
348
+ "cell_type": "code",
349
+ "execution_count": null,
350
+ "id": "e662de72",
351
+ "metadata": {},
352
+ "outputs": [],
353
+ "source": [
354
+ "# Train models for each dataset\n",
355
+ "all_results = {}\n",
356
+ "\n",
357
+ "for name, df in datasets.items():\n",
358
+ " if 'label' not in df.columns:\n",
359
+ " print(f\"⚠ Skipping {name}: no label column\")\n",
360
+ " continue\n",
361
+ " \n",
362
+ " try:\n",
363
+ " results = trainer.train_for_dataset(df, name)\n",
364
+ " all_results[name] = results\n",
365
+ " except Exception as e:\n",
366
+ " print(f\"⚠ Error training {name}: {e}\")\n",
367
+ "\n",
368
+ "print(f\"\\n\\n✓ Trained models for {len(all_results)} datasets\")"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "markdown",
373
+ "id": "ba9c2c2c",
374
+ "metadata": {},
375
+ "source": [
376
+ "## 5. Model Serialization for Backend"
377
+ ]
378
+ },
379
+ {
380
+ "cell_type": "code",
381
+ "execution_count": null,
382
+ "id": "2edd4ef9",
383
+ "metadata": {},
384
+ "outputs": [],
385
+ "source": [
386
+ "class ModelSerializer:\n",
387
+ " \"\"\"\n",
388
+ " Serialize models for backend integration.\n",
389
+ " Outputs format compatible with mlService.js\n",
390
+ " \"\"\"\n",
391
+ " \n",
392
+ " def __init__(self, models_dir: Path):\n",
393
+ " self.models_dir = models_dir\n",
394
+ " \n",
395
+ " def save_model(self, dataset_name: str, model_data: Dict) -> Dict:\n",
396
+ " \"\"\"Save a trained model with metadata\"\"\"\n",
397
+ " model_dir = self.models_dir / dataset_name\n",
398
+ " model_dir.mkdir(exist_ok=True)\n",
399
+ " \n",
400
+ " saved_files = {}\n",
401
+ " \n",
402
+ " for model_type, data in model_data['models'].items():\n",
403
+ " model = data['model']\n",
404
+ " metrics = data['metrics']\n",
405
+ " \n",
406
+ " # Save model\n",
407
+ " model_path = model_dir / f\"{model_type}.pkl\"\n",
408
+ " joblib.dump(model, model_path)\n",
409
+ " \n",
410
+ " # Save metadata\n",
411
+ " metadata = {\n",
412
+ " 'model_type': model_type,\n",
413
+ " 'dataset': dataset_name,\n",
414
+ " 'accuracy': float(metrics['accuracy']),\n",
415
+ " 'f1_score': float(metrics['f1_score']),\n",
416
+ " 'inference_time_ms': float(metrics['inference_time_ms']),\n",
417
+ " 'feature_names': data['feature_names'],\n",
418
+ " 'version': '1.0.0',\n",
419
+ " 'framework': 'sklearn'\n",
420
+ " }\n",
421
+ " \n",
422
+ " metadata_path = model_dir / f\"{model_type}_metadata.json\"\n",
423
+ " with open(metadata_path, 'w') as f:\n",
424
+ " json.dump(metadata, f, indent=2)\n",
425
+ " \n",
426
+ " saved_files[model_type] = {\n",
427
+ " 'model_path': str(model_path),\n",
428
+ " 'metadata_path': str(metadata_path)\n",
429
+ " }\n",
430
+ " \n",
431
+ " # Save scaler\n",
432
+ " if model_data.get('scaler'):\n",
433
+ " scaler_path = model_dir / \"scaler.pkl\"\n",
434
+ " joblib.dump(model_data['scaler'], scaler_path)\n",
435
+ " saved_files['scaler'] = str(scaler_path)\n",
436
+ " \n",
437
+ " # Save label encoder\n",
438
+ " if model_data.get('label_encoder'):\n",
439
+ " encoder_path = model_dir / \"label_encoder.pkl\"\n",
440
+ " joblib.dump(model_data['label_encoder'], encoder_path)\n",
441
+ " saved_files['label_encoder'] = str(encoder_path)\n",
442
+ " \n",
443
+ " return saved_files\n",
444
+ " \n",
445
+ " def create_model_registry(self, trained_models: Dict) -> Dict:\n",
446
+ " \"\"\"Create a model registry for backend use\"\"\"\n",
447
+ " registry = {\n",
448
+ " 'version': '1.0.0',\n",
449
+ " 'models': {}\n",
450
+ " }\n",
451
+ " \n",
452
+ " for dataset_name, model_data in trained_models.items():\n",
453
+ " best_model = model_data['best_model']\n",
454
+ " best_metrics = model_data['models'][best_model]['metrics']\n",
455
+ " \n",
456
+ " registry['models'][dataset_name] = {\n",
457
+ " 'best_model': best_model,\n",
458
+ " 'model_path': f\"models/{dataset_name}/{best_model}.pkl\",\n",
459
+ " 'metadata_path': f\"models/{dataset_name}/{best_model}_metadata.json\",\n",
460
+ " 'scaler_path': f\"models/{dataset_name}/scaler.pkl\",\n",
461
+ " 'accuracy': float(best_metrics['accuracy']),\n",
462
+ " 'f1_score': float(best_metrics['f1_score']),\n",
463
+ " 'inference_time_ms': float(best_metrics['inference_time_ms']),\n",
464
+ " 'available_models': list(model_data['models'].keys())\n",
465
+ " }\n",
466
+ " \n",
467
+ " return registry\n",
468
+ "\n",
469
+ "serializer = ModelSerializer(MODELS_DIR)\n",
470
+ "print(\"✓ Model Serializer initialized\")"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": null,
476
+ "id": "b9a2b692",
477
+ "metadata": {},
478
+ "outputs": [],
479
+ "source": [
480
+ "# Save all trained models\n",
481
+ "print(\"Saving trained models...\\n\")\n",
482
+ "\n",
483
+ "for dataset_name, model_data in trainer.trained_models.items():\n",
484
+ " print(f\" Saving: {dataset_name}\")\n",
485
+ " saved = serializer.save_model(dataset_name, model_data)\n",
486
+ " print(f\" ✓ Saved {len(saved)} files\")\n",
487
+ "\n",
488
+ "# Create model registry\n",
489
+ "registry = serializer.create_model_registry(trainer.trained_models)\n",
490
+ "registry_path = MODELS_DIR / \"model_registry.json\"\n",
491
+ "with open(registry_path, 'w') as f:\n",
492
+ " json.dump(registry, f, indent=2)\n",
493
+ "\n",
494
+ "print(f\"\\n✓ Model registry saved to: {registry_path}\")"
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "markdown",
499
+ "id": "c87fde7e",
500
+ "metadata": {},
501
+ "source": [
502
+ "## 6. Inference API for Backend"
503
+ ]
504
+ },
505
+ {
506
+ "cell_type": "code",
507
+ "execution_count": null,
508
+ "id": "5db8ef76",
509
+ "metadata": {},
510
+ "outputs": [],
511
+ "source": [
512
+ "class ModelInferenceAPI:\n",
513
+ " \"\"\"\n",
514
+ " Inference API compatible with backend mlService.js\n",
515
+ " Provides fast, standardized predictions.\n",
516
+ " \"\"\"\n",
517
+ " \n",
518
+ " def __init__(self, models_dir: Path):\n",
519
+ " self.models_dir = models_dir\n",
520
+ " self.loaded_models = {}\n",
521
+ " self.registry = self._load_registry()\n",
522
+ " \n",
523
+ " def _load_registry(self) -> Dict:\n",
524
+ " registry_path = self.models_dir / \"model_registry.json\"\n",
525
+ " if registry_path.exists():\n",
526
+ " with open(registry_path) as f:\n",
527
+ " return json.load(f)\n",
528
+ " return {'models': {}}\n",
529
+ " \n",
530
+ " def load_model(self, task_name: str) -> bool:\n",
531
+ " \"\"\"Load a model for inference\"\"\"\n",
532
+ " if task_name in self.loaded_models:\n",
533
+ " return True\n",
534
+ " \n",
535
+ " task_config = self.registry['models'].get(task_name)\n",
536
+ " if not task_config:\n",
537
+ " return False\n",
538
+ " \n",
539
+ " model_path = self.models_dir / task_name / f\"{task_config['best_model']}.pkl\"\n",
540
+ " scaler_path = self.models_dir / task_name / \"scaler.pkl\"\n",
541
+ " \n",
542
+ " if model_path.exists():\n",
543
+ " self.loaded_models[task_name] = {\n",
544
+ " 'model': joblib.load(model_path),\n",
545
+ " 'scaler': joblib.load(scaler_path) if scaler_path.exists() else None\n",
546
+ " }\n",
547
+ " return True\n",
548
+ " \n",
549
+ " return False\n",
550
+ " \n",
551
+ " def predict(self, task_name: str, features: Dict) -> Dict:\n",
552
+ " \"\"\"Make a prediction\"\"\"\n",
553
+ " if not self.load_model(task_name):\n",
554
+ " return {'error': f'Model not found: {task_name}'}\n",
555
+ " \n",
556
+ " model_data = self.loaded_models[task_name]\n",
557
+ " model = model_data['model']\n",
558
+ " scaler = model_data['scaler']\n",
559
+ " \n",
560
+ " # Convert features to array\n",
561
+ " X = np.array([list(features.values())])\n",
562
+ " \n",
563
+ " # Scale if scaler available\n",
564
+ " if scaler:\n",
565
+ " X = scaler.transform(X)\n",
566
+ " \n",
567
+ " # Predict\n",
568
+ " start_time = time.time()\n",
569
+ " prediction = model.predict(X)[0]\n",
570
+ " \n",
571
+ " # Get probability if available\n",
572
+ " confidence = 0.5\n",
573
+ " if hasattr(model, 'predict_proba'):\n",
574
+ " proba = model.predict_proba(X)[0]\n",
575
+ " confidence = float(max(proba))\n",
576
+ " \n",
577
+ " inference_time = (time.time() - start_time) * 1000\n",
578
+ " \n",
579
+ " return {\n",
580
+ " 'prediction': int(prediction),\n",
581
+ " 'confidence': confidence,\n",
582
+ " 'inference_time_ms': inference_time,\n",
583
+ " 'model': task_name\n",
584
+ " }\n",
585
+ " \n",
586
+ " def batch_predict(self, task_name: str, features_list: List[Dict]) -> List[Dict]:\n",
587
+ " \"\"\"Batch predictions\"\"\"\n",
588
+ " return [self.predict(task_name, f) for f in features_list]\n",
589
+ "\n",
590
+ "# Save inference API code\n",
591
+ "inference_api_code = '''\n",
592
+ "# CyberForge Model Inference API\n",
593
+ "# Compatible with backend mlService.js\n",
594
+ "\n",
595
+ "import joblib\n",
596
+ "import numpy as np\n",
597
+ "from pathlib import Path\n",
598
+ "import json\n",
599
+ "import time\n",
600
+ "\n",
601
+ "class CyberForgeInference:\n",
602
+ " def __init__(self, models_dir: str):\n",
603
+ " self.models_dir = Path(models_dir)\n",
604
+ " self.loaded_models = {}\n",
605
+ " with open(self.models_dir / \"model_registry.json\") as f:\n",
606
+ " self.registry = json.load(f)\n",
607
+ " \n",
608
+ " def predict(self, task: str, features: dict) -> dict:\n",
609
+ " if task not in self.loaded_models:\n",
610
+ " cfg = self.registry[\"models\"][task]\n",
611
+ " self.loaded_models[task] = {\n",
612
+ " \"model\": joblib.load(self.models_dir / task / f\"{cfg['best_model']}.pkl\"),\n",
613
+ " \"scaler\": joblib.load(self.models_dir / task / \"scaler.pkl\")\n",
614
+ " }\n",
615
+ " \n",
616
+ " m = self.loaded_models[task]\n",
617
+ " X = np.array([list(features.values())])\n",
618
+ " X = m[\"scaler\"].transform(X)\n",
619
+ " \n",
620
+ " pred = m[\"model\"].predict(X)[0]\n",
621
+ " conf = float(max(m[\"model\"].predict_proba(X)[0]))\n",
622
+ " \n",
623
+ " return {\"prediction\": int(pred), \"confidence\": conf, \"task\": task}\n",
624
+ "'''\n",
625
+ "\n",
626
+ "inference_path = MODELS_DIR / \"inference.py\"\n",
627
+ "with open(inference_path, 'w') as f:\n",
628
+ " f.write(inference_api_code)\n",
629
+ "\n",
630
+ "print(f\"✓ Inference API saved to: {inference_path}\")"
631
+ ]
632
+ },
633
+ {
634
+ "cell_type": "markdown",
635
+ "id": "e4d50734",
636
+ "metadata": {},
637
+ "source": [
638
+ "## 7. Summary"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "code",
643
+ "execution_count": null,
644
+ "id": "6a634cc3",
645
+ "metadata": {},
646
+ "outputs": [],
647
+ "source": [
648
+ "print(\"\\n\" + \"=\" * 60)\n",
649
+ "print(\"MODEL TRAINING COMPLETE\")\n",
650
+ "print(\"=\" * 60)\n",
651
+ "\n",
652
+ "total_models = sum(len(m['models']) for m in trainer.trained_models.values())\n",
653
+ "\n",
654
+ "print(f\"\"\"\n",
655
+ "🤖 Training Summary:\n",
656
+ " - Datasets trained: {len(trainer.trained_models)}\n",
657
+ " - Total models: {total_models}\n",
658
+ " - Output directory: {MODELS_DIR}\n",
659
+ "\n",
660
+ "📊 Model Performance:\"\"\")\n",
661
+ "\n",
662
+ "for dataset, data in trainer.trained_models.items():\n",
663
+ " best = data['best_model']\n",
664
+ " metrics = data['models'][best]['metrics']\n",
665
+ " print(f\" {dataset}:\")\n",
666
+ " print(f\" Best: {best}\")\n",
667
+ " print(f\" Accuracy: {metrics['accuracy']:.4f}\")\n",
668
+ " print(f\" F1: {metrics['f1_score']:.4f}\")\n",
669
+ " print(f\" Inference: {metrics['inference_time_ms']:.3f}ms\")\n",
670
+ "\n",
671
+ "print(f\"\"\"\n",
672
+ "📁 Output Files:\n",
673
+ " - Model files: {MODELS_DIR}/<dataset>/<model>.pkl\n",
674
+ " - Registry: {MODELS_DIR}/model_registry.json\n",
675
+ " - Inference API: {MODELS_DIR}/inference.py\n",
676
+ "\n",
677
+ "Next step:\n",
678
+ " → 04_agent_intelligence.ipynb\n",
679
+ "\"\"\")\n",
680
+ "print(\"=\" * 60)"
681
+ ]
682
+ }
683
+ ],
684
+ "metadata": {
685
+ "language_info": {
686
+ "name": "python"
687
+ }
688
+ },
689
+ "nbformat": 4,
690
+ "nbformat_minor": 5
691
+ }