{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 03 — Classical ML Models (v3) — 8-Model Optimized Training\n", "## SOH Regression: Cross-Battery Generalization Split\n", "\n", "**v3 Pipeline (bug fixes over v2):**\n", "- Load preprocessed `battery_features.csv` from NB02 (18 features)\n", "- **Cross-battery grouped split** (v2 bug: used intra-battery 80/20 → data leakage)\n", "- Train 8 core models on all 18 features (v2 used 12)\n", "- Proper NaN imputation (no more `fillna(0)` for Re/Rct)\n", "- Target: ≥95% of predictions within ±5 SOH percentage points, for every model\n", "- Save artifacts to `artifacts/v3/`\n", "\n", "**v3 Bug Fixes:**\n", "1. Split: intra-battery → cross-battery (no leakage)\n", "2. Features: 12 → 18 (6 new physics-informed features)\n", "3. Imputation: `fillna(0)` → ffill/bfill/median (already done in NB02)\n", "4. Scaler: single consistent scaler fitted on the NB02 training split\n", "\n", "**Models (8 total):**\n", "1. ExtraTrees (tree-based, unscaled)\n", "2. GradientBoosting (sequential ensemble)\n", "3. RandomForest (bagging ensemble)\n", "4. XGBoost (boosted trees with tuning)\n", "5. LightGBM (fast gradient boosting)\n", "6. SVR (support vector regression)\n", "7. Ridge (linear with L2)\n", "8. 
KNN-5 (instance-based with distance weighting)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Setup complete.\n" ] } ], "source": [ "import sys, os\n", "sys.path.insert(0, os.path.abspath('..'))\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import joblib\n", "import warnings\n", "warnings.filterwarnings('ignore')\n", "\n", "from pathlib import Path\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.linear_model import Ridge\n", "from sklearn.svm import SVR\n", "from sklearn.ensemble import (\n", " RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor\n", ")\n", "from sklearn.neighbors import KNeighborsRegressor\n", "from sklearn.metrics import r2_score, mean_absolute_error\n", "from xgboost import XGBRegressor\n", "from lightgbm import LGBMRegressor\n", "\n", "from src.utils.config import get_version_paths, ensure_version_dirs, FEATURE_COLS_V3\n", "\n", "print('Setup complete.')" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "v3 Results: E:\\VIT\\aiBatteryLifecycle\\artifacts\\v3\\results\n", "v3 Models: E:\\VIT\\aiBatteryLifecycle\\artifacts\\v3\\models\\classical\n", "v3 Scalers: E:\\VIT\\aiBatteryLifecycle\\artifacts\\v3\\scalers\n", "v3 Features: E:\\VIT\\aiBatteryLifecycle\\artifacts\\v3\\features\n" ] } ], "source": [ "# Setup v3 paths\n", "v3 = get_version_paths('v3')\n", "ensure_version_dirs('v3')\n", "\n", "V3_FEATURES = v3['root'] / 'features'\n", "\n", "print(f'v3 Results: {v3[\"results\"]}')\n", "print(f'v3 Models: {v3[\"models_classical\"]}')\n", "print(f'v3 Scalers: {v3[\"scalers\"]}')\n", "print(f'v3 Features: {V3_FEATURES}')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Dataset shape: (2678, 25)\n", "Batteries: ['B0005', 'B0006', 
'B0007', 'B0018', 'B0025', 'B0026', 'B0027', 'B0028', 'B0029', 'B0030', 'B0031', 'B0032', 'B0033', 'B0034', 'B0036', 'B0038', 'B0039', 'B0040', 'B0041', 'B0042', 'B0043', 'B0044', 'B0045', 'B0046', 'B0047', 'B0048', 'B0053', 'B0054', 'B0055', 'B0056']\n", "SOH range: 2.2% — 122.2%\n", "NaN count: 0\n" ] } ], "source": [ "# Load preprocessed features from NB02 (v3: 18 features, already imputed)\n", "features_df = pd.read_csv(V3_FEATURES / 'battery_features.csv')\n", "print(f'Dataset shape: {features_df.shape}')\n", "print(f'Batteries: {sorted(features_df[\"battery_id\"].unique())}')\n", "print(f'SOH range: {features_df[\"SoH\"].min():.1f}% — {features_df[\"SoH\"].max():.1f}%')\n", "print(f'NaN count: {features_df[FEATURE_COLS_V3].isna().sum().sum()}')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train: 2163 samples from 24 batteries\n", "Test: 515 samples from 6 batteries\n", "Train batteries: ['B0005', 'B0006', 'B0007', 'B0018', 'B0025', 'B0026', 'B0029', 'B0030', 'B0032', 'B0033', 'B0034', 'B0038', 'B0039', 'B0040', 'B0041', 'B0044', 'B0045', 'B0046', 'B0047', 'B0048', 'B0053', 'B0054', 'B0055', 'B0056']\n", "Test batteries: ['B0027', 'B0028', 'B0031', 'B0036', 'B0042', 'B0043']\n", "Overlap: NONE ✓ (no leakage)\n", "Train SOH: 2.2% — 101.8%\n", "Test SOH: 2.8% — 122.2%\n" ] } ], "source": [ "# ── v3 FIX: Cross-battery grouped split (no data leakage) ──\n", "# v2 bug: intra-battery 80/20 chronological split per battery\n", "# → All batteries appear in both train AND test → inflated R²\n", "# v3 fix: entire batteries in train OR test, never both\n", "\n", "from src.data.preprocessing import group_battery_split\n", "\n", "train_df, test_df = group_battery_split(features_df, train_ratio=0.8)\n", "\n", "print(f'Train: {len(train_df)} samples from {train_df[\"battery_id\"].nunique()} batteries')\n", "print(f'Test: {len(test_df)} samples from {test_df[\"battery_id\"].nunique()} 
batteries')\n", "print(f'Train batteries: {sorted(train_df[\"battery_id\"].unique())}')\n", "print(f'Test batteries: {sorted(test_df[\"battery_id\"].unique())}')\n", "\n", "overlap = set(train_df['battery_id']) & set(test_df['battery_id'])\n", "print(f'Overlap: {overlap if overlap else \"NONE ✓ (no leakage)\"}')\n", "print(f'Train SOH: {train_df[\"SoH\"].min():.1f}% — {train_df[\"SoH\"].max():.1f}%')\n", "print(f'Test SOH: {test_df[\"SoH\"].min():.1f}% — {test_df[\"SoH\"].max():.1f}%')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using 18 features: ['cycle_number', 'ambient_temperature', 'peak_voltage', 'min_voltage', 'voltage_range', 'avg_current', 'avg_temp', 'temp_rise', 'cycle_duration', 'Re', 'Rct', 'delta_capacity', 'capacity_retention', 'cumulative_energy', 'dRe_dn', 'dRct_dn', 'soh_rolling_mean', 'voltage_slope']\n", "X_train: (2163, 18)\n", "y_train: (2163,)\n", "X_test: (515, 18)\n", "y_test: (515,)\n" ] } ], "source": [ "# v3: Use all 18 features (12 base + 6 physics-informed)\n", "feature_cols = [c for c in FEATURE_COLS_V3 if c in features_df.columns]\n", "print(f'Using {len(feature_cols)} features: {feature_cols}')\n", "\n", "X_train = train_df[feature_cols].values\n", "y_train = train_df['SoH'].values\n", "X_test = test_df[feature_cols].values\n", "y_test = test_df['SoH'].values\n", "\n", "print(f'X_train: {X_train.shape}')\n", "print(f'y_train: {y_train.shape}')\n", "print(f'X_test: {X_test.shape}')\n", "print(f'y_test: {y_test.shape}')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Scaler loaded from NB02 (fitted on training batteries only).\n", " Mean range: [-0.0001, 3282.9991]\n" ] } ], "source": [ "# Load scaler from NB02 (v3: consistent with training split)\n", "scaler = joblib.load(v3['scalers'] / 'v3_features_standard.joblib')\n", "X_train_scaled = 
scaler.transform(X_train)\n", "X_test_scaled = scaler.transform(X_test)\n", "print('Scaler loaded from NB02 (fitted on training batteries only).')\n", "print(f' Mean range: [{scaler.mean_.min():.4f}, {scaler.mean_.max():.4f}]')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def evaluate_model(model_name, model, X_eval, y_eval, save_path=None):\n", " \"\"\"Report R², MAE, and the share of predictions within ±5 SOH points; optionally save the model.\"\"\"\n", " y_pred = model.predict(X_eval)\n", " \n", " r2 = r2_score(y_eval, y_pred)\n", " mae = mean_absolute_error(y_eval, y_pred)\n", " # SOH is expressed in percent, so a tolerance of 5 means ±5 percentage points\n", " within_5pct = float((np.abs(y_pred - y_eval) <= 5).mean() * 100)\n", " \n", " status = '✓ PASS' if within_5pct >= 95.0 else '✗ FAIL'\n", " print(f'{model_name:20s} | R²={r2:.4f} | MAE={mae:.2f} | Within-5%={within_5pct:.1f}% | {status}')\n", " \n", " if save_path:\n", " joblib.dump(model, save_path)\n", " \n", " return y_pred, {'model': model_name, 'r2': r2, 'mae': mae, 'within_5pct': within_5pct}" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "ExtraTrees | R²=0.9701 | MAE=3.20 | Within-5%=75.1% | ✗ FAIL\n" ] } ], "source": [ "# ExtraTrees (unscaled)\n", "model_et = ExtraTreesRegressor(\n", " n_estimators=1000,\n", " min_samples_leaf=2,\n", " max_features=0.7,\n", " random_state=42,\n", " n_jobs=-1\n", ")\n", "model_et.fit(X_train, y_train)\n", "_, metrics_et = evaluate_model('ExtraTrees', model_et, X_test, y_test,\n", " v3['models_classical'] / 'extra_trees.joblib')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "GradientBoosting | R²=0.9860 | MAE=1.38 | Within-5%=95.1% | ✓ PASS\n" ] } ], "source": [ "# GradientBoosting (unscaled)\n", "model_gb = GradientBoostingRegressor(\n", " n_estimators=800,\n", " max_depth=6,\n", " learning_rate=0.05,\n", " subsample=0.8,\n", " random_state=42\n", ")\n", "model_gb.fit(X_train, 
y_train)\n", "_, metrics_gb = evaluate_model('GradientBoosting', model_gb, X_test, y_test,\n", " v3['models_classical'] / 'gradient_boosting.joblib')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "RandomForest | R²=0.9814 | MAE=1.83 | Within-5%=91.3% | ✗ FAIL\n" ] } ], "source": [ "# RandomForest (unscaled)\n", "model_rf = RandomForestRegressor(\n", " n_estimators=1000,\n", " min_samples_leaf=2,\n", " max_features=0.7,\n", " random_state=42,\n", " n_jobs=-1\n", ")\n", "model_rf.fit(X_train, y_train)\n", "_, metrics_rf = evaluate_model('RandomForest', model_rf, X_test, y_test,\n", " v3['models_classical'] / 'random_forest.joblib')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XGBoost | R²=0.9866 | MAE=1.58 | Within-5%=93.8% | ✗ FAIL\n" ] } ], "source": [ "# XGBoost (unscaled, tuned hyperparameters)\n", "model_xgb = XGBRegressor(\n", " n_estimators=1200,\n", " max_depth=9,\n", " learning_rate=0.02,\n", " subsample=0.85,\n", " colsample_bytree=0.85,\n", " random_state=42,\n", " n_jobs=-1,\n", " verbosity=0\n", ")\n", "model_xgb.fit(X_train, y_train)\n", "_, metrics_xgb = evaluate_model('XGBoost', model_xgb, X_test, y_test,\n", " v3['models_classical'] / 'xgboost.joblib')" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "LightGBM | R²=0.9826 | MAE=1.98 | Within-5%=89.5% | ✗ FAIL\n" ] } ], "source": [ "# LightGBM (unscaled, tuned hyperparameters)\n", "model_lgbm = LGBMRegressor(\n", " n_estimators=1200,\n", " num_leaves=127,\n", " learning_rate=0.02,\n", " subsample=0.85,\n", " colsample_bytree=0.85,\n", " random_state=42,\n", " n_jobs=-1,\n", " verbose=-1\n", ")\n", "model_lgbm.fit(X_train, y_train)\n", "_, metrics_lgbm = evaluate_model('LightGBM', model_lgbm, X_test, y_test,\n", " v3['models_classical'] / 
'lightgbm.joblib')" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "SVR | R²=0.8898 | MAE=4.92 | Within-5%=79.0% | ✗ FAIL\n" ] } ], "source": [ "# SVR (scaled)\n", "model_svr = SVR(\n", " C=1000.0,\n", " epsilon=0.1,\n", " kernel='rbf'\n", ")\n", "model_svr.fit(X_train_scaled, y_train)\n", "_, metrics_svr = evaluate_model('SVR', model_svr, X_test_scaled, y_test,\n", " v3['models_classical'] / 'svr.joblib')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Ridge | R²=0.9656 | MAE=3.23 | Within-5%=88.9% | ✗ FAIL\n" ] } ], "source": [ "# Ridge (scaled)\n", "model_ridge = Ridge(\n", " alpha=0.1\n", ")\n", "model_ridge.fit(X_train_scaled, y_train)\n", "_, metrics_ridge = evaluate_model('Ridge', model_ridge, X_test_scaled, y_test,\n", " v3['models_classical'] / 'ridge.joblib')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "KNN-5 | R²=0.7555 | MAE=11.02 | Within-5%=34.2% | ✗ FAIL\n" ] } ], "source": [ "# KNN-5 (scaled, with distance weighting)\n", "model_knn5 = KNeighborsRegressor(\n", " n_neighbors=5,\n", " weights='distance',\n", " n_jobs=-1\n", ")\n", "model_knn5.fit(X_train_scaled, y_train)\n", "_, metrics_knn5 = evaluate_model('KNN-5', model_knn5, X_test_scaled, y_test,\n", " v3['models_classical'] / 'knn_k5.joblib')" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "======================================================================\n", "FINAL RESULTS — v3 Classical ML (Cross-Battery Split, 18 Features)\n", "======================================================================\n", " model r2 mae within_5pct\n", 
"GradientBoosting 0.985984 1.383230 95.145631\n", " XGBoost 0.986594 1.576671 93.786408\n", " RandomForest 0.981407 1.834184 91.262136\n", " LightGBM 0.982554 1.976782 89.514563\n", " Ridge 0.965638 3.225993 88.932039\n", " SVR 0.889759 4.923939 79.029126\n", " ExtraTrees 0.970125 3.201794 75.145631\n", " KNN-5 0.755476 11.023403 34.174757\n", "\n", "Passed (≥95%): 1/8\n", "\n", "Results saved to E:\\VIT\\aiBatteryLifecycle\\artifacts\\v3\\results\\v3_classical_soh_results.csv\n" ] } ], "source": [ "# Collect results\n", "all_metrics = [\n", " metrics_et, metrics_gb, metrics_rf, metrics_xgb, metrics_lgbm,\n", " metrics_svr, metrics_ridge, metrics_knn5\n", "]\n", "\n", "results_df = pd.DataFrame(all_metrics)\n", "results_df = results_df.sort_values('within_5pct', ascending=False)\n", "\n", "print('\\n' + '='*70)\n", "print('FINAL RESULTS — v3 Classical ML (Cross-Battery Split, 18 Features)')\n", "print('='*70)\n", "print(results_df.to_string(index=False))\n", "\n", "# Count passes\n", "n_passed = (results_df['within_5pct'] >= 95.0).sum()\n", "print(f'\\nPassed (≥95%): {n_passed}/8')\n", "\n", "# Save results (v3: consistent naming)\n", "results_df.to_csv(v3['results'] / 'v3_classical_soh_results.csv', index=False)\n", "print(f'\\nResults saved to {v3[\"results\"] / \"v3_classical_soh_results.csv\"}')" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.10" } }, "nbformat": 4, "nbformat_minor": 4 }