Spaces:

Che237
/

cyberforge

Running

App Files Files Community

Che237 committed 21 days ago

Commit

b8b5a3f

verified ·

1 Parent(s): 4b93b4a

Add 05_model_validation.ipynb

Browse files

Files changed (1) hide show

notebooks/05_model_validation.ipynb +574 -0

notebooks/05_model_validation.ipynb ADDED Viewed

	@@ -0,0 +1,574 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "360397d9",
+   "metadata": {},
+   "source": [
+    "# 05 - Model Validation\n",
+    "\n",
+    "## CyberForge AI - Production Validation & Safety\n",
+    "\n",
+    "This notebook validates trained models for production deployment:\n",
+    "- Performance metrics and benchmarks\n",
+    "- Edge case testing\n",
+    "- Failure analysis and recovery\n",
+    "- Continuous operation safety checks\n",
+    "\n",
+    "### Validation Requirements:\n",
+    "- All models must pass accuracy thresholds\n",
+    "- Inference time must meet real-time requirements\n",
+    "- Edge cases must not cause crashes\n",
+    "- Memory usage must be within bounds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "781bbd3c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from pathlib import Path\n",
+    "from typing import Dict, List, Any, Optional\n",
+    "import time\n",
+    "import traceback\n",
+    "import joblib\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')\n",
+    "\n",
+    "from sklearn.metrics import (\n",
+    "    accuracy_score, precision_score, recall_score, f1_score,\n",
+    "    confusion_matrix, classification_report, roc_auc_score\n",
+    ")\n",
+    "\n",
+    "# Configuration\n",
+    "# The path is resolved relative to the working directory the notebook runs in.\n",
+    "config_path = Path(\"../notebook_config.json\")\n",
+    "with open(config_path, encoding=\"utf-8\") as f:\n",
+    "    CONFIG = json.load(f)\n",
+    "\n",
+    "# Models and validation output are siblings of the configured datasets directory.\n",
+    "MODELS_DIR = Path(CONFIG[\"datasets_dir\"]).parent / \"models\"\n",
+    "VALIDATION_DIR = MODELS_DIR.parent / \"validation\"\n",
+    "# parents=True creates missing ancestors too, so a fresh checkout does not raise FileNotFoundError.\n",
+    "VALIDATION_DIR.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "print(f\"✓ Configuration loaded\")\n",
+    "print(f\"✓ Models from: {MODELS_DIR}\")\n",
+    "print(f\"✓ Validation output: {VALIDATION_DIR}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a3afcfd1",
+   "metadata": {},
+   "source": [
+    "## 1. Validation Thresholds"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74553e0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ValidationThresholds:\n",
+    "    \"\"\"Single source of truth for the pass/fail limits applied to every model.\"\"\"\n",
+    "\n",
+    "    # Minimum classification quality\n",
+    "    MIN_ACCURACY = 0.80\n",
+    "    MIN_PRECISION = 0.75\n",
+    "    MIN_RECALL = 0.70\n",
+    "    MIN_F1 = 0.75\n",
+    "\n",
+    "    # Latency ceilings (milliseconds)\n",
+    "    MAX_INFERENCE_TIME_MS = 100\n",
+    "    MAX_BATCH_INFERENCE_TIME_MS = 500\n",
+    "\n",
+    "    # Resource ceilings (megabytes)\n",
+    "    MAX_MODEL_SIZE_MB = 100\n",
+    "    MAX_MEMORY_MB = 500\n",
+    "\n",
+    "    # Stability limits\n",
+    "    MIN_CONSISTENCY_SCORE = 0.95  # Same input should give same output\n",
+    "    MAX_EDGE_CASE_FAILURE_RATE = 0.05\n",
+    "\n",
+    "    @classmethod\n",
+    "    def check_performance(cls, metrics: Dict) -> Dict[str, bool]:\n",
+    "        \"\"\"Compare a metrics dict against the class thresholds.\n",
+    "\n",
+    "        A missing quality metric counts as 0 and a missing\n",
+    "        'inference_time_ms' counts as 999, so absent values fail.\n",
+    "        Returns one verdict per check, keyed by check name.\n",
+    "        \"\"\"\n",
+    "        floors = (\n",
+    "            ('accuracy', cls.MIN_ACCURACY),\n",
+    "            ('precision', cls.MIN_PRECISION),\n",
+    "            ('recall', cls.MIN_RECALL),\n",
+    "            ('f1', cls.MIN_F1),\n",
+    "        )\n",
+    "        verdicts = {}\n",
+    "        for key, floor in floors:\n",
+    "            verdicts[key] = metrics.get(key, 0) >= floor\n",
+    "\n",
+    "        # Latency is the only check where lower is better\n",
+    "        latency_ms = metrics.get('inference_time_ms', 999)\n",
+    "        verdicts['inference_time'] = latency_ms <= cls.MAX_INFERENCE_TIME_MS\n",
+    "        return verdicts\n",
+    "\n",
+    "print(\"✓ Validation Thresholds loaded\")\n",
+    "print(f\"   Min Accuracy: {ValidationThresholds.MIN_ACCURACY}\")\n",
+    "print(f\"   Max Inference: {ValidationThresholds.MAX_INFERENCE_TIME_MS}ms\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8e96d341",
+   "metadata": {},
+   "source": [
+    "## 2. Load Models and Registry"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a0f6d54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Locate the registry written by the training notebook\n",
+    "registry_path = MODELS_DIR / \"model_registry.json\"\n",
+    "\n",
+    "if not registry_path.exists():\n",
+    "    # Fall back to an empty registry so the remaining cells still run\n",
+    "    print(\"⚠ No model registry. Run 03_model_training.ipynb first.\")\n",
+    "    registry = {'models': {}}\n",
+    "else:\n",
+    "    with open(registry_path) as f:\n",
+    "        registry = json.load(f)\n",
+    "    print(f\"✓ Loaded registry with {len(registry.get('models', {}))} models\")\n",
+    "\n",
+    "# Show each registered model with its best estimator and F1 score\n",
+    "print(\"\\nAvailable models:\")\n",
+    "for name, info in registry.get('models', {}).items():\n",
+    "    print(f\"  - {name}: {info['best_model']} (F1: {info['f1_score']:.4f})\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "91e2c16f",
+   "metadata": {},
+   "source": [
+    "## 3. Model Validator"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9042d66f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ModelValidator:\n",
+    "    \"\"\"\n",
+    "    Comprehensive model validation for production readiness.\n",
+    "\n",
+    "    Offers artifact loading plus four independent checks (performance,\n",
+    "    edge cases, consistency, latency); each check returns a plain dict.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    # Input width assumed when a model does not expose n_features_in_\n",
+    "    DEFAULT_N_FEATURES = 10\n",
+    "\n",
+    "    def __init__(self, models_dir: Path):\n",
+    "        \"\"\"Store the directory that contains one sub-directory per model.\"\"\"\n",
+    "        self.models_dir = models_dir\n",
+    "        self.validation_results = {}\n",
+    "\n",
+    "    @classmethod\n",
+    "    def _n_features(cls, model) -> int:\n",
+    "        \"\"\"Return model.n_features_in_ when present, else DEFAULT_N_FEATURES.\"\"\"\n",
+    "        return getattr(model, 'n_features_in_', cls.DEFAULT_N_FEATURES)\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def _is_finite(pred) -> bool:\n",
+    "        \"\"\"Return False only when a float/complex prediction holds NaN or infinity.\"\"\"\n",
+    "        arr = np.asarray(pred)\n",
+    "        if arr.dtype.kind in 'fc':\n",
+    "            return bool(np.isfinite(arr).all())\n",
+    "        return True  # labels, ints and bools cannot be non-finite\n",
+    "\n",
+    "    def load_model_artifacts(self, model_name: str, model_type: str) -> Dict:\n",
+    "        \"\"\"Load the model, optional scaler and optional metadata for one model.\n",
+    "\n",
+    "        Looks in models_dir/model_name for '<model_type>.pkl', 'scaler.pkl'\n",
+    "        and '<model_type>_metadata.json'. Files that do not exist are left\n",
+    "        out of the returned dict, so callers must check for the 'model' key.\n",
+    "        \"\"\"\n",
+    "        model_dir = self.models_dir / model_name\n",
+    "\n",
+    "        artifacts = {}\n",
+    "\n",
+    "        # Load model\n",
+    "        # NOTE: joblib.load unpickles arbitrary objects; only use it on trusted model files.\n",
+    "        model_path = model_dir / f\"{model_type}.pkl\"\n",
+    "        if model_path.exists():\n",
+    "            artifacts['model'] = joblib.load(model_path)\n",
+    "            artifacts['model_size_mb'] = model_path.stat().st_size / (1024 * 1024)\n",
+    "\n",
+    "        # Load scaler\n",
+    "        scaler_path = model_dir / \"scaler.pkl\"\n",
+    "        if scaler_path.exists():\n",
+    "            artifacts['scaler'] = joblib.load(scaler_path)\n",
+    "\n",
+    "        # Load metadata\n",
+    "        metadata_path = model_dir / f\"{model_type}_metadata.json\"\n",
+    "        if metadata_path.exists():\n",
+    "            with open(metadata_path, encoding='utf-8') as f:\n",
+    "                artifacts['metadata'] = json.load(f)\n",
+    "\n",
+    "        return artifacts\n",
+    "\n",
+    "    def validate_performance(self, model, X_test, y_test, scaler=None) -> Dict:\n",
+    "        \"\"\"Score the model on held-out data and compare against thresholds.\n",
+    "\n",
+    "        Returns accuracy, weighted precision/recall/F1, the mean per-sample\n",
+    "        inference time in milliseconds, the sample count, and the per-check\n",
+    "        and overall threshold verdicts.\n",
+    "\n",
+    "        Raises:\n",
+    "            ValueError: if X_test is empty (per-sample timing is undefined).\n",
+    "        \"\"\"\n",
+    "        n_samples = len(X_test)\n",
+    "        if n_samples == 0:\n",
+    "            raise ValueError(\"X_test is empty; cannot validate performance\")\n",
+    "\n",
+    "        # Scale if needed\n",
+    "        if scaler is not None:\n",
+    "            X_test = scaler.transform(X_test)\n",
+    "\n",
+    "        # Predictions (perf_counter is monotonic and high-resolution, unlike time.time)\n",
+    "        start = time.perf_counter()\n",
+    "        y_pred = model.predict(X_test)\n",
+    "        inference_time = (time.perf_counter() - start) / n_samples * 1000\n",
+    "\n",
+    "        # Metrics\n",
+    "        metrics = {\n",
+    "            'accuracy': accuracy_score(y_test, y_pred),\n",
+    "            'precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),\n",
+    "            'recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),\n",
+    "            'f1': f1_score(y_test, y_pred, average='weighted', zero_division=0),\n",
+    "            'inference_time_ms': inference_time,\n",
+    "            'samples_tested': len(y_test)\n",
+    "        }\n",
+    "\n",
+    "        # Check thresholds\n",
+    "        metrics['passed_thresholds'] = ValidationThresholds.check_performance(metrics)\n",
+    "        metrics['all_passed'] = all(metrics['passed_thresholds'].values())\n",
+    "\n",
+    "        return metrics\n",
+    "\n",
+    "    def validate_edge_cases(self, model, scaler=None) -> Dict:\n",
+    "        \"\"\"Test model behavior on edge cases.\n",
+    "\n",
+    "        Feeds five single-row inputs (zeros, ones, large, negative, mixed)\n",
+    "        through the optional scaler and the model. A case passes when the\n",
+    "        model returns exactly one prediction and that prediction is finite.\n",
+    "        Exceptions are caught and recorded as errors, not raised.\n",
+    "        \"\"\"\n",
+    "        results = {\n",
+    "            'tests_run': 0,\n",
+    "            'tests_passed': 0,\n",
+    "            'errors': []\n",
+    "        }\n",
+    "\n",
+    "        n_features = self._n_features(model)\n",
+    "\n",
+    "        # Private, seeded generator: the 'mixed' case is reproducible between runs\n",
+    "        # and the global NumPy random state is left untouched.\n",
+    "        rng = np.random.RandomState(0)\n",
+    "\n",
+    "        edge_cases = [\n",
+    "            ('zeros', np.zeros((1, n_features))),\n",
+    "            ('ones', np.ones((1, n_features))),\n",
+    "            ('large_values', np.ones((1, n_features)) * 1e6),\n",
+    "            ('negative', -np.ones((1, n_features))),\n",
+    "            ('mixed', rng.randn(1, n_features) * 100),\n",
+    "        ]\n",
+    "\n",
+    "        for case_name, X in edge_cases:\n",
+    "            results['tests_run'] += 1\n",
+    "            try:\n",
+    "                if scaler is not None:\n",
+    "                    X = scaler.transform(X)\n",
+    "                pred = model.predict(X)\n",
+    "\n",
+    "                # Check prediction is valid\n",
+    "                if pred is None or len(pred) != 1:\n",
+    "                    results['errors'].append(f\"{case_name}: Invalid prediction shape\")\n",
+    "                elif not self._is_finite(pred):\n",
+    "                    results['errors'].append(f\"{case_name}: Non-finite prediction\")\n",
+    "                else:\n",
+    "                    results['tests_passed'] += 1\n",
+    "\n",
+    "            except Exception as e:\n",
+    "                results['errors'].append(f\"{case_name}: {str(e)}\")\n",
+    "\n",
+    "        results['pass_rate'] = results['tests_passed'] / max(results['tests_run'], 1)\n",
+    "        return results\n",
+    "\n",
+    "    def validate_consistency(self, model, scaler=None, n_runs: int = 10) -> Dict:\n",
+    "        \"\"\"Test prediction consistency (same input = same output).\n",
+    "\n",
+    "        Predicts one fixed input n_runs times and counts distinct outputs.\n",
+    "        The score is 1 / (number of distinct outputs).\n",
+    "\n",
+    "        Raises:\n",
+    "            ValueError: if n_runs is less than 1.\n",
+    "        \"\"\"\n",
+    "        if n_runs < 1:\n",
+    "            raise ValueError(\"n_runs must be at least 1\")\n",
+    "\n",
+    "        n_features = self._n_features(model)\n",
+    "\n",
+    "        # Fixed input. A private RandomState(42) gives the same draw as\n",
+    "        # np.random.seed(42) would, without reseeding the global generator.\n",
+    "        X = np.random.RandomState(42).randn(1, n_features)\n",
+    "\n",
+    "        if scaler is not None:\n",
+    "            X = scaler.transform(X)\n",
+    "\n",
+    "        # Key on the repr of the plain-Python value so array-valued (unhashable)\n",
+    "        # and NaN (never equal to itself) predictions are compared correctly.\n",
+    "        seen = set()\n",
+    "        for _ in range(n_runs):\n",
+    "            pred = model.predict(X)[0]\n",
+    "            seen.add(repr(np.asarray(pred).tolist()))\n",
+    "\n",
+    "        unique_preds = len(seen)\n",
+    "\n",
+    "        return {\n",
+    "            'consistency_score': 1.0 / unique_preds,\n",
+    "            'unique_predictions': unique_preds,\n",
+    "            'is_consistent': unique_preds == 1\n",
+    "        }\n",
+    "\n",
+    "    def validate_latency(self, model, scaler=None, n_samples: int = 100) -> Dict:\n",
+    "        \"\"\"Validate inference latency.\n",
+    "\n",
+    "        Times up to 10 single-row predictions and one batch prediction over\n",
+    "        n_samples random rows. All reported times are in milliseconds.\n",
+    "\n",
+    "        Raises:\n",
+    "            ValueError: if n_samples is less than 1.\n",
+    "        \"\"\"\n",
+    "        if n_samples < 1:\n",
+    "            raise ValueError(\"n_samples must be at least 1\")\n",
+    "\n",
+    "        n_features = self._n_features(model)\n",
+    "\n",
+    "        # Private generator so the timing input does not disturb global random state\n",
+    "        X = np.random.RandomState(0).randn(n_samples, n_features)\n",
+    "        if scaler is not None:\n",
+    "            X = scaler.transform(X)\n",
+    "\n",
+    "        # Untimed warm-up so one-off first-call overhead does not skew the measurements\n",
+    "        model.predict(X[:1])\n",
+    "\n",
+    "        # Single sample latency\n",
+    "        single_times = []\n",
+    "        for i in range(min(10, n_samples)):\n",
+    "            start = time.perf_counter()\n",
+    "            model.predict(X[i:i+1])\n",
+    "            single_times.append((time.perf_counter() - start) * 1000)\n",
+    "\n",
+    "        # Batch latency\n",
+    "        start = time.perf_counter()\n",
+    "        model.predict(X)\n",
+    "        batch_time = (time.perf_counter() - start) * 1000\n",
+    "\n",
+    "        single_mean = float(np.mean(single_times))\n",
+    "\n",
+    "        return {\n",
+    "            'single_mean_ms': single_mean,\n",
+    "            'single_max_ms': float(np.max(single_times)),\n",
+    "            'single_std_ms': float(np.std(single_times)),\n",
+    "            'batch_total_ms': batch_time,\n",
+    "            'batch_per_sample_ms': batch_time / n_samples,\n",
+    "            # Plain bool (not numpy.bool_) so the value serializes as JSON true/false\n",
+    "            'meets_latency_target': bool(single_mean <= ValidationThresholds.MAX_INFERENCE_TIME_MS)\n",
+    "        }\n",
+    "\n",
+    "# One validator instance, rooted at the models directory\n",
+    "validator = ModelValidator(models_dir=MODELS_DIR)\n",
+    "print(\"✓ Model Validator initialized\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53cddd40",
+   "metadata": {},
+   "source": [
+    "## 4. Run Validation Suite"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ccc75859",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run validation on all models\n",
+    "# Each registry entry is validated independently; a crash inside one model's\n",
+    "# checks is recorded as a failure for that model instead of aborting the suite.\n",
+    "validation_results = {}\n",
+    "\n",
+    "print(\"Running validation suite...\\n\")\n",
+    "\n",
+    "for model_name, model_info in registry.get('models', {}).items():\n",
+    "    print(f\"{'='*50}\")\n",
+    "    print(f\"Validating: {model_name}\")\n",
+    "    print(f\"{'='*50}\")\n",
+    "\n",
+    "    # Load model artifacts\n",
+    "    artifacts = validator.load_model_artifacts(model_name, model_info['best_model'])\n",
+    "\n",
+    "    if 'model' not in artifacts:\n",
+    "        print(f\"  ⚠ Model not found\")\n",
+    "        continue\n",
+    "\n",
+    "    model = artifacts['model']\n",
+    "    scaler = artifacts.get('scaler')\n",
+    "\n",
+    "    # Model size check: MAX_MODEL_SIZE_MB is part of the pass criteria\n",
+    "    model_size_mb = artifacts.get('model_size_mb', 0)\n",
+    "    size_ok = bool(model_size_mb <= ValidationThresholds.MAX_MODEL_SIZE_MB)\n",
+    "\n",
+    "    results = {\n",
+    "        'model_name': model_name,\n",
+    "        'model_type': model_info['best_model'],\n",
+    "        'model_size_mb': model_size_mb,\n",
+    "        'size_within_limit': size_ok\n",
+    "    }\n",
+    "    print(f\"\\n  Model size: {model_size_mb:.2f}MB (limit: {ValidationThresholds.MAX_MODEL_SIZE_MB}MB)\")\n",
+    "\n",
+    "    try:\n",
+    "        # Edge case validation\n",
+    "        print(\"\\n  Edge Case Testing...\")\n",
+    "        edge_results = validator.validate_edge_cases(model, scaler)\n",
+    "        results['edge_cases'] = edge_results\n",
+    "        print(f\"    Pass rate: {edge_results['pass_rate']:.2%}\")\n",
+    "        if edge_results['errors']:\n",
+    "            for err in edge_results['errors'][:2]:\n",
+    "                print(f\"    ⚠ {err}\")\n",
+    "\n",
+    "        # Consistency validation\n",
+    "        print(\"\\n  Consistency Testing...\")\n",
+    "        consistency_results = validator.validate_consistency(model, scaler)\n",
+    "        results['consistency'] = consistency_results\n",
+    "        print(f\"    Consistent: {consistency_results['is_consistent']}\")\n",
+    "\n",
+    "        # Latency validation\n",
+    "        print(\"\\n  Latency Testing...\")\n",
+    "        latency_results = validator.validate_latency(model, scaler)\n",
+    "        results['latency'] = latency_results\n",
+    "        print(f\"    Single inference: {latency_results['single_mean_ms']:.3f}ms\")\n",
+    "        print(f\"    Meets target: {latency_results['meets_latency_target']}\")\n",
+    "\n",
+    "        # Overall validation status.\n",
+    "        # bool(...) because an `and` chain returns its last operand, which may be a\n",
+    "        # numpy.bool_ that json.dump would otherwise write as the string \"True\".\n",
+    "        passed = bool(\n",
+    "            size_ok and\n",
+    "            edge_results['pass_rate'] >= (1 - ValidationThresholds.MAX_EDGE_CASE_FAILURE_RATE) and\n",
+    "            consistency_results['is_consistent'] and\n",
+    "            latency_results['meets_latency_target']\n",
+    "        )\n",
+    "    except Exception as exc:\n",
+    "        # A check that crashes is a validation failure; keep the details for the saved results\n",
+    "        results['error'] = f\"{type(exc).__name__}: {exc}\"\n",
+    "        results['traceback'] = traceback.format_exc()\n",
+    "        print(f\"    ✗ Validation crashed: {results['error']}\")\n",
+    "        passed = False\n",
+    "\n",
+    "    results['validation_passed'] = passed\n",
+    "    validation_results[model_name] = results\n",
+    "\n",
+    "    status = \"✓ PASSED\" if passed else \"✗ FAILED\"\n",
+    "    print(f\"\\n  Status: {status}\")\n",
+    "\n",
+    "print(f\"\\n\\n✓ Validation complete for {len(validation_results)} models\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "45e71432",
+   "metadata": {},
+   "source": [
+    "## 5. Generate Validation Report"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24bbe906",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class ValidationReporter:\n",
+    "    \"\"\"Renders validation results as a Markdown document.\"\"\"\n",
+    "\n",
+    "    @staticmethod\n",
+    "    def generate_report(results: Dict) -> str:\n",
+    "        \"\"\"Build the Markdown report for all validated models.\n",
+    "\n",
+    "        `results` maps model name to its result dict. Fields missing from a\n",
+    "        result fall back to defaults (0, 'N/A', 999) instead of raising.\n",
+    "        The report has a header, one summary-table row per model and the\n",
+    "        list of thresholds; it is returned as one newline-joined string.\n",
+    "        \"\"\"\n",
+    "        header = [\n",
+    "            \"# CyberForge Model Validation Report\",\n",
+    "            \"\",\n",
+    "            f\"**Generated:** {time.strftime('%Y-%m-%d %H:%M:%S')}\",\n",
+    "            f\"**Models Validated:** {len(results)}\",\n",
+    "            \"\",\n",
+    "            \"## Summary\",\n",
+    "            \"\",\n",
+    "            \"| Model | Type | Size (MB) | Edge Cases | Consistency | Latency (ms) | Status |\",\n",
+    "            \"|-------|------|-----------|------------|-------------|--------------|--------|\"\n",
+    "        ]\n",
+    "\n",
+    "        def table_row(name, data):\n",
+    "            \"\"\"Format one model's results as a single Markdown table row.\"\"\"\n",
+    "            status = \"✓ Pass\" if data.get('validation_passed') else \"✗ Fail\"\n",
+    "            edge = f\"{data.get('edge_cases', {}).get('pass_rate', 0):.0%}\"\n",
+    "            consist = \"✓\" if data.get('consistency', {}).get('is_consistent') else \"✗\"\n",
+    "            latency = f\"{data.get('latency', {}).get('single_mean_ms', 999):.2f}\"\n",
+    "            return (\n",
+    "                f\"| {name} | {data.get('model_type', 'N/A')} | \"\n",
+    "                f\"{data.get('model_size_mb', 0):.2f} | {edge} | {consist} | {latency} | {status} |\"\n",
+    "            )\n",
+    "\n",
+    "        rows = [table_row(name, data) for name, data in results.items()]\n",
+    "\n",
+    "        thresholds = [\n",
+    "            \"\",\n",
+    "            \"## Validation Thresholds\",\n",
+    "            \"\",\n",
+    "            f\"- Min Accuracy: {ValidationThresholds.MIN_ACCURACY}\",\n",
+    "            f\"- Max Inference Time: {ValidationThresholds.MAX_INFERENCE_TIME_MS}ms\",\n",
+    "            f\"- Max Edge Case Failure Rate: {ValidationThresholds.MAX_EDGE_CASE_FAILURE_RATE:.0%}\",\n",
+    "            f\"- Min Consistency Score: {ValidationThresholds.MIN_CONSISTENCY_SCORE}\",\n",
+    "        ]\n",
+    "\n",
+    "        return \"\\n\".join(header + rows + thresholds)\n",
+    "\n",
+    "# Generate report\n",
+    "report = ValidationReporter.generate_report(validation_results)\n",
+    "\n",
+    "report_path = VALIDATION_DIR / \"validation_report.md\"\n",
+    "# The report contains non-ASCII glyphs (✓/✗). Write UTF-8 explicitly so a\n",
+    "# platform default such as cp1252 cannot raise UnicodeEncodeError.\n",
+    "with open(report_path, 'w', encoding='utf-8') as f:\n",
+    "    f.write(report)\n",
+    "\n",
+    "print(f\"✓ Report saved to: {report_path}\")\n",
+    "print(\"\\n\" + report)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a0a647bc",
+   "metadata": {},
+   "source": [
+    "## 6. Save Validation Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8b52d7da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save detailed validation results\n",
+    "results_path = VALIDATION_DIR / \"validation_results.json\"\n",
+    "\n",
+    "\n",
+    "def _to_jsonable(value):\n",
+    "    \"\"\"Recursively convert NumPy scalars/arrays and containers to plain JSON types.\n",
+    "\n",
+    "    The per-check result dicts are nested, so a top-level conversion is not\n",
+    "    enough: a nested numpy.bool_ would otherwise fall through to default=str\n",
+    "    and be written as the string \"True\"/\"False\" instead of JSON true/false.\n",
+    "    \"\"\"\n",
+    "    if isinstance(value, dict):\n",
+    "        return {k: _to_jsonable(v) for k, v in value.items()}\n",
+    "    if isinstance(value, (list, tuple, set)):\n",
+    "        return [_to_jsonable(v) for v in value]\n",
+    "    if isinstance(value, np.ndarray):\n",
+    "        return _to_jsonable(value.tolist())\n",
+    "    if isinstance(value, np.generic):\n",
+    "        return value.item()  # numpy scalar -> matching Python scalar\n",
+    "    return value\n",
+    "\n",
+    "\n",
+    "# Make results JSON-serializable\n",
+    "serializable_results = {\n",
+    "    name: _to_jsonable(data)\n",
+    "    for name, data in validation_results.items()\n",
+    "}\n",
+    "\n",
+    "with open(results_path, 'w', encoding='utf-8') as f:\n",
+    "    json.dump({\n",
+    "        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),\n",
+    "        'thresholds': {\n",
+    "            'min_accuracy': ValidationThresholds.MIN_ACCURACY,\n",
+    "            'max_inference_time_ms': ValidationThresholds.MAX_INFERENCE_TIME_MS,\n",
+    "            'max_edge_case_failure_rate': ValidationThresholds.MAX_EDGE_CASE_FAILURE_RATE\n",
+    "        },\n",
+    "        'results': serializable_results\n",
+    "    }, f, indent=2, default=str)  # default=str remains as a last resort for unknown types\n",
+    "\n",
+    "print(f\"✓ Results saved to: {results_path}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e4b142e1",
+   "metadata": {},
+   "source": [
+    "## 7. Summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9532d95",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tally the outcomes across every validated model\n",
+    "total_count = len(validation_results)\n",
+    "passed_count = len([r for r in validation_results.values() if r.get('validation_passed')])\n",
+    "\n",
+    "# Banner\n",
+    "print(\"\\n\" + \"=\" * 60)\n",
+    "print(\"MODEL VALIDATION COMPLETE\")\n",
+    "print(\"=\" * 60)\n",
+    "\n",
+    "# Headline numbers, the checks performed and where the outputs were written\n",
+    "print(f\"\"\"\n",
+    "✅ Validation Summary:\n",
+    "   - Models validated: {total_count}\n",
+    "   - Models passed: {passed_count}\n",
+    "   - Models failed: {total_count - passed_count}\n",
+    "   - Pass rate: {passed_count/max(total_count,1):.0%}\n",
+    "\n",
+    "📊 Validation Checks:\n",
+    "   ✓ Edge case handling\n",
+    "   ✓ Prediction consistency\n",
+    "   ✓ Inference latency\n",
+    "   ✓ Model size limits\n",
+    "\n",
+    "📁 Output Files:\n",
+    "   - Report: {VALIDATION_DIR}/validation_report.md\n",
+    "   - Results: {VALIDATION_DIR}/validation_results.json\n",
+    "\n",
+    "Models Ready for Production:\"\"\")\n",
+    "\n",
+    "# One line per model, marked with its pass/fail outcome\n",
+    "for name, data in validation_results.items():\n",
+    "    status = \"✗\"\n",
+    "    if data.get('validation_passed'):\n",
+    "        status = \"✓\"\n",
+    "    print(f\"   {status} {name}\")\n",
+    "\n",
+    "print(f\"\"\"\n",
+    "Next step:\n",
+    "  → 06_backend_integration.ipynb\n",
+    "\"\"\")\n",
+    "print(\"=\" * 60)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}