{ "cells": [ { "cell_type": "markdown", "id": "eae17b13", "metadata": {}, "source": [ "# 03 - Model Training\n", "\n", "## CyberForge AI - Lightweight Security Models\n", "\n", "This notebook trains production-ready ML models optimized for:\n", "- Real-time inference\n", "- Backend API integration\n", "- Agentic AI workflows\n", "\n", "### Model Categories:\n", "1. **Risk Scoring** - Website security risk assessment\n", "2. **Threat Classification** - Malware, phishing, anomaly detection\n", "3. **Behavioral Analysis** - Pattern-based threat detection\n", "\n", "### Backend Alignment:\n", "- Models compatible with mlService.js\n", "- Output format matches ThreatService expectations\n", "- Inference time < 100ms for real-time use" ] }, { "cell_type": "code", "execution_count": null, "id": "473944d7", "metadata": {}, "outputs": [], "source": [ "import json\n", "import pandas as pd\n", "import numpy as np\n", "from pathlib import Path\n", "from typing import Dict, List, Any, Optional, Tuple\n", "import time\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "# ML Libraries\n", "from sklearn.model_selection import train_test_split, cross_val_score\n", "from sklearn.preprocessing import StandardScaler, LabelEncoder\n", "from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix\n", "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.svm import SVC\n", "import joblib\n", "\n", "# Configuration\n", "config_path = Path(\"notebook_config.json\")\nif not config_path.exists():\n config_path = Path(\"/home/user/app/notebooks/notebook_config.json\")\n", "with open(config_path) as f:\n", " CONFIG = json.load(f)\n", "\n", "DATASETS_DIR = Path(CONFIG[\"datasets_dir\"])\n", "FEATURES_DIR = DATASETS_DIR / \"features\"\n", "MODELS_DIR = DATASETS_DIR.parent / \"models\"\n", "MODELS_DIR.mkdir(exist_ok=True)\n", "\n", "print(f\"\u2713 Configuration loaded\")\n", "print(f\"\u2713 Features from: {FEATURES_DIR}\")\n", "print(f\"\u2713 Models output: {MODELS_DIR}\")" ] }, { "cell_type": "markdown", "id": "fe015229", "metadata": {}, "source": [ "## 1. Load Feature-Engineered Data" ] }, { "cell_type": "code", "execution_count": null, "id": "46797075", "metadata": {}, "outputs": [], "source": [ "# Load feature manifest\n", "feature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n", "\n", "if feature_manifest_path.exists():\n", " with open(feature_manifest_path) as f:\n", " feature_manifest = json.load(f)\n", " print(f\"\u2713 Loaded {len(feature_manifest)} feature datasets\")\n", "else:\n", " print(\"\u26a0 No feature manifest. 
Run 02_feature_engineering.ipynb first.\")\n", " feature_manifest = []\n", "\n", "# Load datasets - be lenient about label detection\n", "datasets = {}\n", "print(\"\\nLoading feature datasets:\")\n", "\n", "for entry in feature_manifest:\n", " name = entry['name']\n", " path = Path(\"..\") / entry['path']\n", " \n", " if path.exists():\n", " try:\n", " df = pd.read_parquet(path)\n", " \n", " # Check for a label column under several common (lowercase) names\n", " label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n", " 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n", " has_label = any(col.lower() in label_candidates for col in df.columns)\n", " \n", " # Even without explicit labels the dataset is still usable - labels are synthesized later\n", " datasets[name] = df\n", " label_status = \"with labels\" if has_label else \"without explicit labels (will create)\"\n", " print(f\" \u2713 {name}: {len(df)} samples, {len(df.columns)} features {label_status}\")\n", " except Exception as e:\n", " print(f\" \u26a0 {name}: Error loading - {e}\")\n", " else:\n", " print(f\" \u26a0 {name}: File not found\")\n", "\n", "print(f\"\\n\u2713 Loaded {len(datasets)} datasets for training\")\n", "\n" ] }, { "cell_type": "markdown", "id": "71c83005", "metadata": {}, "source": [ "## 2. Model Configuration" ] }, { "cell_type": "code", "execution_count": null, "id": "600086b6", "metadata": {}, "outputs": [], "source": [ "class ModelConfig:\n", " \"\"\"\n", " Model configurations optimized for production.\n", " Models are lightweight for fast inference.\n", " \"\"\"\n", " \n", " # Model definitions\n", " MODELS = {\n", " 'random_forest': {\n", " 'class': RandomForestClassifier,\n", " 'params': {\n", " 'n_estimators': 100,\n", " 'max_depth': 10,\n", " 'min_samples_split': 5,\n", " 'min_samples_leaf': 2,\n", " 'n_jobs': -1,\n", " 'random_state': 42\n", " },\n", " 'inference_time_target': 50 # ms\n", " },\n", " 'gradient_boosting': {\n", " 'class': GradientBoostingClassifier,\n", " 'params': {\n", " 'n_estimators': 50,\n", " 'max_depth': 5,\n", " 'learning_rate': 0.1,\n", " 'random_state': 42\n", " },\n", " 'inference_time_target': 30 # ms\n", " },\n", " 'logistic_regression': {\n", " 'class': LogisticRegression,\n", " 'params': {\n", " 'max_iter': 1000,\n", " 'random_state': 42\n", " },\n", " 'inference_time_target': 5 # ms\n", " }\n", " }\n", " \n", " # Dataset to model mapping\n", " TASK_MODELS = {\n", " 'phishing_detection': ['random_forest', 'gradient_boosting'],\n", " 'malware_detection': ['random_forest', 'gradient_boosting'],\n", " 'anomaly_detection': ['random_forest'],\n", " 'web_attack_detection': ['random_forest', 'gradient_boosting'],\n", " 'threat_intelligence': ['logistic_regression', 'random_forest'],\n", " 'vulnerability_assessment': ['gradient_boosting']\n", " }\n", " \n", " @classmethod\n", " def get_models_for_task(cls, task_name: str) -> List[str]:\n", " \"\"\"Get recommended models for a task\"\"\"\n", " # Match partial task names\n", " for key, models in cls.TASK_MODELS.items():\n", " if key in task_name.lower():\n", " return models\n", " return ['random_forest'] # Default\n", "\n", "print(\"\u2713 Model Configuration loaded\")\n", "print(f\" Available models: {list(ModelConfig.MODELS.keys())}\")" ] }, { "cell_type": "markdown", "id": "ad7d2f43", "metadata": {}, "source": [ "## 3. 
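Training Pipeline" ] }, { "cell_type": "markdown", "id": "1a2b3c4d", "metadata": {}, "source": [ "Before defining the full pipeline, here is a minimal sketch of the train \u2192 evaluate \u2192 latency-check loop that the trainer automates. It runs on synthetic data from `make_classification` (not one of the real feature datasets), so the numbers are illustrative only; the 100ms budget comes from the intro above." ] }, { "cell_type": "code", "execution_count": null, "id": "5e6f7a8b", "metadata": {}, "outputs": [], "source": [ "# Minimal sketch (synthetic data only): the train -> evaluate -> latency\n", "# pattern the trainer below automates. Numbers here are illustrative.\n", "from sklearn.datasets import make_classification\n", "\n", "X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=42)\n", "X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.2, random_state=42)\n", "\n", "demo_model = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, random_state=42)\n", "demo_model.fit(X_tr, y_tr)\n", "\n", "start = time.time()\n", "y_hat = demo_model.predict(X_te)\n", "per_sample_ms = (time.time() - start) / len(X_te) * 1000\n", "\n", "print(f\"Demo accuracy: {accuracy_score(y_te, y_hat):.4f}\")\n", "print(f\"Demo latency: {per_sample_ms:.3f}ms/sample (target: < 100ms)\")" ] }, { "cell_type": "markdown", "id": "9c0d1e2f", "metadata": {}, "source": [ "The next cell defines `CyberForgeTrainer`, the class that implements the full 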
Training Pipeline" ] }, { "cell_type": "code", "execution_count": null, "id": "d9b11805", "metadata": {}, "outputs": [], "source": [ "class CyberForgeTrainer:\n", " \"\"\"\n", " Training pipeline for CyberForge security models.\n", " Optimized for production deployment and fast inference.\n", " \"\"\"\n", " \n", " def __init__(self):\n", " self.trained_models = {}\n", " self.training_metrics = {}\n", " # Store scalers and encoders per dataset\n", " self.scalers = {}\n", " self.label_encoders = {}\n", " \n", " def prepare_data(self, df: pd.DataFrame, dataset_name: str, label_col: str = 'label', \n", " test_size: float = 0.2) -> Tuple:\n", " \"\"\"Prepare data for training - creates a new scaler per dataset\"\"\"\n", " # Separate features and labels\n", " y = df[label_col]\n", " X = df.drop(columns=[label_col])\n", " \n", " # Keep only numeric columns\n", " X = X.select_dtypes(include=[np.number]).fillna(0)\n", " \n", " # Create NEW scaler and encoder for THIS dataset\n", " scaler = StandardScaler()\n", " label_encoder = LabelEncoder()\n", " \n", " # Encode labels if needed\n", " if y.dtype == 'object':\n", " y = label_encoder.fit_transform(y)\n", " self.label_encoders[dataset_name] = label_encoder\n", " else:\n", " y = y.values\n", " \n", " # Scale features\n", " X_scaled = scaler.fit_transform(X)\n", " self.scalers[dataset_name] = scaler\n", " \n", " # Split\n", " X_train, X_test, y_train, y_test = train_test_split(\n", " X_scaled, y, test_size=test_size, random_state=42, stratify=y\n", " )\n", " \n", " return X_train, X_test, y_train, y_test, X.columns.tolist()\n", " \n", " def train_model(self, X_train, y_train, model_type: str) -> Any:\n", " \"\"\"Train a single model\"\"\"\n", " config = ModelConfig.MODELS.get(model_type)\n", " if not config:\n", " raise ValueError(f\"Unknown model type: {model_type}\")\n", " \n", " model = config['class'](**config['params'])\n", " \n", " start_time = time.time()\n", " model.fit(X_train, y_train)\n", " train_time = time.time() - start_time\n", " \n", " return model, train_time\n", " \n", " def evaluate_model(self, model, X_test, y_test) -> Dict:\n", " \"\"\"Evaluate model performance\"\"\"\n", " # Predictions\n", " start_time = time.time()\n", " y_pred = model.predict(X_test)\n", " inference_time = (time.time() - start_time) / len(X_test) * 1000 # ms per sample\n", " \n", " # Probabilities if available\n", " if hasattr(model, 'predict_proba'):\n", " y_proba = model.predict_proba(X_test)\n", " else:\n", " y_proba = None\n", " \n", " # Metrics\n", " accuracy = accuracy_score(y_test, y_pred)\n", " f1 = f1_score(y_test, y_pred, average='weighted')\n", " \n", " return {\n", " 'accuracy': accuracy,\n", " 'f1_score': f1,\n", " 'inference_time_ms': inference_time,\n", " 'predictions': y_pred,\n", " 'probabilities': y_proba\n", " }\n", " \n", " def train_for_dataset(self, df: pd.DataFrame, dataset_name: str) -> Dict:\n", " \"\"\"Train all recommended models for a dataset\"\"\"\n", " print(f\"\\n{'='*50}\")\n", " print(f\"Training models for: {dataset_name}\")\n", " print(f\"{'='*50}\")\n", " \n", " # Prepare data - pass dataset_name to create per-dataset scaler\n", " X_train, X_test, y_train, y_test, feature_names = self.prepare_data(df, dataset_name)\n", " print(f\" Data: {len(X_train)} train, {len(X_test)} test samples\")\n", " print(f\" Features: {len(feature_names)}\")\n", " \n", " # Get recommended models\n", " model_types = ModelConfig.get_models_for_task(dataset_name)\n", " \n", " results = {}\n", " best_model = None\n", " best_score = 0\n", " \n", " for 
model_type in model_types:\n", " print(f\"\\n Training: {model_type}\")\n", " \n", " # Train\n", " model, train_time = self.train_model(X_train, y_train, model_type)\n", " print(f\" Training time: {train_time:.2f}s\")\n", " \n", " # Evaluate\n", " metrics = self.evaluate_model(model, X_test, y_test)\n", " print(f\" Accuracy: {metrics['accuracy']:.4f}\")\n", " print(f\" F1 Score: {metrics['f1_score']:.4f}\")\n", " print(f\" Inference: {metrics['inference_time_ms']:.3f}ms/sample\")\n", " \n", " results[model_type] = {\n", " 'model': model,\n", " 'metrics': metrics,\n", " 'train_time': train_time,\n", " 'feature_names': feature_names\n", " }\n", " \n", " # Track best\n", " if metrics['f1_score'] > best_score:\n", " best_score = metrics['f1_score']\n", " best_model = model_type\n", " \n", " print(f\"\\n \u2713 Best model: {best_model} (F1: {best_score:.4f})\")\n", " \n", " # Store results with PER-DATASET scaler\n", " self.trained_models[dataset_name] = {\n", " 'models': results,\n", " 'best_model': best_model,\n", " 'scaler': self.scalers.get(dataset_name),\n", " 'label_encoder': self.label_encoders.get(dataset_name),\n", " 'n_features': len(feature_names)\n", " }\n", " \n", " return results\n", "\n", "trainer = CyberForgeTrainer()\n", "print(\"\u2713 CyberForge Trainer initialized\")\n", "\n" ] }, { "cell_type": "markdown", "id": "828ef403", "metadata": {}, "source": [ "## 4. Train Models" ] }, { "cell_type": "code", "execution_count": null, "id": "e662de72", "metadata": {}, "outputs": [], "source": [ "# Train models for each dataset\n", "all_results = {}\n", "\n", "for name, df in datasets.items():\n", " # Create synthetic labels if missing\n", " if 'label' not in df.columns:\n", " print(f\" Creating synthetic labels for {name}...\")\n", " # Heuristic labels by dataset type: anomaly datasets flag roughly the top\n", " # 10% of mean feature scores; other threat types split at the median score.\n", " numeric_cols = df.select_dtypes(include=[np.number])\n", " if 'anomaly' in name.lower():\n", " # Anomaly/normal labels (~10% anomalies)\n", " if len(numeric_cols.columns) > 0:\n", " scores = numeric_cols.mean(axis=1)\n", " df['label'] = (scores > scores.quantile(0.9)).astype(int)\n", " else:\n", " df['label'] = (np.random.random(len(df)) > 0.9).astype(int)\n", " elif any(k in name.lower() for k in ('phishing', 'malware', 'attack')):\n", " # Binary labels from the mean feature score (higher values = more suspicious)\n", " if len(numeric_cols.columns) > 0:\n", " scores = numeric_cols.mean(axis=1)\n", " df['label'] = (scores > scores.median()).astype(int)\n", " else:\n", " df['label'] = np.random.randint(0, 2, size=len(df))\n", " else:\n", " # Default: random binary labels\n", " df['label'] = np.random.randint(0, 2, size=len(df))\n", " \n", " 
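# NOTE: synthetic labels are heuristic stand-ins so the pipeline runs end-to-end;\n", " # replace them with real ground-truth labels before production training.\n", " 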
print(f\" \u2713 Created labels: {df['label'].sum()} positive, {len(df) - df['label'].sum()} negative\")\n", " \n", " try:\n", " results = trainer.train_for_dataset(df, name)\n", " all_results[name] = results\n", " except Exception as e:\n", " print(f\"\u26a0 Error training {name}: {e}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "print(f\"\\n\\n\u2713 Trained models for {len(all_results)} datasets\")\n", "\n" ] }, { "cell_type": "markdown", "id": "ba9c2c2c", "metadata": {}, "source": [ "## 5. Model Serialization for Backend" ] }, { "cell_type": "code", "execution_count": null, "id": "2edd4ef9", "metadata": {}, "outputs": [], "source": [ "class ModelSerializer:\n", " \"\"\"\n", " Serialize models for backend integration.\n", " Outputs format compatible with mlService.js\n", " \"\"\"\n", " \n", " def __init__(self, models_dir: Path):\n", " self.models_dir = models_dir\n", " \n", " def save_model(self, dataset_name: str, model_data: Dict) -> Dict:\n", " \"\"\"Save a trained model with metadata\"\"\"\n", " model_dir = self.models_dir / dataset_name\n", " model_dir.mkdir(exist_ok=True)\n", " \n", " saved_files = {}\n", " \n", " for model_type, data in model_data['models'].items():\n", " model = data['model']\n", " metrics = data['metrics']\n", " \n", " # Save model\n", " model_path = model_dir / f\"{model_type}.pkl\"\n", " joblib.dump(model, model_path)\n", " \n", " # Save metadata\n", " metadata = {\n", " 'model_type': model_type,\n", " 'dataset': dataset_name,\n", " 'accuracy': float(metrics['accuracy']),\n", " 'f1_score': float(metrics['f1_score']),\n", " 'inference_time_ms': float(metrics['inference_time_ms']),\n", " 'feature_names': data['feature_names'],\n", " 'version': '1.0.0',\n", " 'framework': 'sklearn'\n", " }\n", " \n", " metadata_path = model_dir / f\"{model_type}_metadata.json\"\n", " with open(metadata_path, 'w') as f:\n", " json.dump(metadata, f, indent=2)\n", " \n", " saved_files[model_type] = {\n", " 'model_path': str(model_path),\n", " 'metadata_path': str(metadata_path)\n", " }\n", " \n", " # Save scaler\n", " if model_data.get('scaler'):\n", " scaler_path = model_dir / \"scaler.pkl\"\n", " joblib.dump(model_data['scaler'], scaler_path)\n", " saved_files['scaler'] = str(scaler_path)\n", " \n", " # Save label encoder\n", " if model_data.get('label_encoder'):\n", " encoder_path = model_dir / \"label_encoder.pkl\"\n", " joblib.dump(model_data['label_encoder'], encoder_path)\n", " saved_files['label_encoder'] = str(encoder_path)\n", " \n", " return saved_files\n", " \n", " def create_model_registry(self, trained_models: Dict) -> Dict:\n", " \"\"\"Create a model registry for backend use\"\"\"\n", " registry = {\n", " 'version': '1.0.0',\n", " 'models': {}\n", " }\n", " \n", " for dataset_name, model_data in trained_models.items():\n", " best_model = model_data['best_model']\n", " best_metrics = model_data['models'][best_model]['metrics']\n", " \n", " registry['models'][dataset_name] = {\n", " 'best_model': best_model,\n", " 'model_path': f\"models/{dataset_name}/{best_model}.pkl\",\n", " 'metadata_path': f\"models/{dataset_name}/{best_model}_metadata.json\",\n", " 'scaler_path': f\"models/{dataset_name}/scaler.pkl\",\n", " 'accuracy': float(best_metrics['accuracy']),\n", " 'f1_score': float(best_metrics['f1_score']),\n", " 'inference_time_ms': float(best_metrics['inference_time_ms']),\n", " 'available_models': list(model_data['models'].keys())\n", " }\n", " \n", " return registry\n", "\n", "serializer = ModelSerializer(MODELS_DIR)\n", "print(\"\u2713 Model 
Serializer initialized\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b9a2b692", "metadata": {}, "outputs": [], "source": [ "# Save all trained models\n", "print(\"Saving trained models...\\n\")\n", "\n", "for dataset_name, model_data in trainer.trained_models.items():\n", " print(f\" Saving: {dataset_name}\")\n", " saved = serializer.save_model(dataset_name, model_data)\n", " print(f\" \u2713 Saved {len(saved)} files\")\n", "\n", "# Create model registry\n", "registry = serializer.create_model_registry(trainer.trained_models)\n", "registry_path = MODELS_DIR / \"model_registry.json\"\n", "with open(registry_path, 'w') as f:\n", " json.dump(registry, f, indent=2)\n", "\n", "print(f\"\\n\u2713 Model registry saved to: {registry_path}\")" ] }, { "cell_type": "markdown", "id": "c87fde7e", "metadata": {}, "source": [ "## 6. Inference API for Backend" ] }, { "cell_type": "code", "execution_count": null, "id": "5db8ef76", "metadata": {}, "outputs": [], "source": [ "class ModelInferenceAPI:\n", " \"\"\"\n", " Inference API compatible with backend mlService.js\n", " Provides fast, standardized predictions.\n", " \"\"\"\n", " \n", " def __init__(self, models_dir: Path):\n", " self.models_dir = models_dir\n", " self.loaded_models = {}\n", " self.registry = self._load_registry()\n", " \n", " def _load_registry(self) -> Dict:\n", " registry_path = self.models_dir / \"model_registry.json\"\n", " if registry_path.exists():\n", " with open(registry_path) as f:\n", " return json.load(f)\n", " return {'models': {}}\n", " \n", " def load_model(self, task_name: str) -> bool:\n", " \"\"\"Load a model for inference\"\"\"\n", " if task_name in self.loaded_models:\n", " return True\n", " \n", " task_config = self.registry['models'].get(task_name)\n", " if not task_config:\n", " return False\n", " \n", " model_path = self.models_dir / task_name / f\"{task_config['best_model']}.pkl\"\n", " scaler_path = self.models_dir / task_name / \"scaler.pkl\"\n", " \n", " if model_path.exists():\n", " self.loaded_models[task_name] = {\n", " 'model': joblib.load(model_path),\n", " 'scaler': joblib.load(scaler_path) if scaler_path.exists() else None\n", " }\n", " return True\n", " \n", " return False\n", " \n", " def predict(self, task_name: str, features: Dict) -> Dict:\n", " \"\"\"Make a prediction\"\"\"\n", " if not self.load_model(task_name):\n", " return {'error': f'Model not found: {task_name}'}\n", " \n", " model_data = self.loaded_models[task_name]\n", " model = model_data['model']\n", " scaler = model_data['scaler']\n", " \n", " # Convert features to array\n", " X = np.array([list(features.values())])\n", " \n", " # Scale if scaler available\n", " if scaler:\n", " X = scaler.transform(X)\n", " \n", " # Predict\n", " start_time = time.time()\n", " prediction = model.predict(X)[0]\n", " \n", " # Get probability if available\n", " confidence = 0.5\n", " if hasattr(model, 'predict_proba'):\n", " proba = model.predict_proba(X)[0]\n", " confidence = float(max(proba))\n", " \n", " inference_time = (time.time() - start_time) * 1000\n", " \n", " return {\n", " 'prediction': int(prediction),\n", " 'confidence': confidence,\n", " 'inference_time_ms': inference_time,\n", " 'model': task_name\n", " }\n", " \n", " def batch_predict(self, task_name: str, features_list: List[Dict]) -> List[Dict]:\n", " \"\"\"Batch predictions\"\"\"\n", " return [self.predict(task_name, f) for f in features_list]\n", "\n", "# Save inference API code\n", "inference_api_code = '''\n", "# CyberForge Model Inference API\n", "# Compatible 
with backend mlService.js\n", "\n", "import joblib\n", "import numpy as np\n", "from pathlib import Path\n", "import json\n", "import time\n", "\n", "class CyberForgeInference:\n", " def __init__(self, models_dir: str):\n", " self.models_dir = Path(models_dir)\n", " self.loaded_models = {}\n", " with open(self.models_dir / \"model_registry.json\") as f:\n", " self.registry = json.load(f)\n", " \n", " def predict(self, task: str, features: dict) -> dict:\n", " if task not in self.loaded_models:\n", " cfg = self.registry[\"models\"][task]\n", " self.loaded_models[task] = {\n", " \"model\": joblib.load(self.models_dir / task / f\"{cfg['best_model']}.pkl\"),\n", " \"scaler\": joblib.load(self.models_dir / task / \"scaler.pkl\")\n", " }\n", " \n", " m = self.loaded_models[task]\n", " X = np.array([list(features.values())])\n", " X = m[\"scaler\"].transform(X)\n", " \n", " pred = m[\"model\"].predict(X)[0]\n", " conf = float(max(m[\"model\"].predict_proba(X)[0]))\n", " \n", " return {\"prediction\": int(pred), \"confidence\": conf, \"task\": task}\n", "'''\n", "\n", "inference_path = MODELS_DIR / \"inference.py\"\n", "with open(inference_path, 'w') as f:\n", " f.write(inference_api_code)\n", "\n", "print(f\"\u2713 Inference API saved to: {inference_path}\")" ] }, { "cell_type": "markdown", "id": "e4d50734", "metadata": {}, "source": [ "## 7. Summary" ] }, { "cell_type": "code", "execution_count": null, "id": "6a634cc3", "metadata": {}, "outputs": [], "source": [ "print(\"\\n\" + \"=\" * 60)\n", "print(\"MODEL TRAINING COMPLETE\")\n", "print(\"=\" * 60)\n", "\n", "total_models = sum(len(m['models']) for m in trainer.trained_models.values())\n", "\n", "print(f\"\"\"\n", "\ud83e\udd16 Training Summary:\n", " - Datasets trained: {len(trainer.trained_models)}\n", " - Total models: {total_models}\n", " - Output directory: {MODELS_DIR}\n", "\n", "\ud83d\udcca Model Performance:\"\"\")\n", "\n", "for dataset, data in trainer.trained_models.items():\n", " best = data['best_model']\n", " metrics = data['models'][best]['metrics']\n", " print(f\" {dataset}:\")\n", " print(f\" Best: {best}\")\n", " print(f\" Accuracy: {metrics['accuracy']:.4f}\")\n", " print(f\" F1: {metrics['f1_score']:.4f}\")\n", " print(f\" Inference: {metrics['inference_time_ms']:.3f}ms\")\n", "\n", "print(f\"\"\"\n", "\ud83d\udcc1 Output Files:\n", " - Model files: {MODELS_DIR}/<dataset>/<model_type>.pkl\n", " - Registry: {MODELS_DIR}/model_registry.json\n", " - Inference API: {MODELS_DIR}/inference.py\n", "\n", "Next step:\n", " \u2192 04_agent_intelligence.ipynb\n", "\"\"\")\n", "print(\"=\" * 60)" ] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 5 }