{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983",
      "metadata": {
        "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "import pickle\n",
        "import warnings\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "from pathlib import Path\n",
        "from scipy import stats\n",
        "from sklearn.svm import SVC\n",
        "from sklearn.model_selection import GridSearchCV\n",
        "from time import time\n",
        "\n",
        "from sklearn.model_selection import (\n",
        "    StratifiedKFold, cross_validate\n",
        ")\n",
        "from sklearn.pipeline import Pipeline\n",
        "from sklearn.model_selection import cross_val_score\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.metrics import (\n",
        "    accuracy_score, precision_score, recall_score, f1_score,\n",
        "    classification_report, confusion_matrix\n",
        ")\n",
        "from sklearn.linear_model import LogisticRegression\n",
        "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
        "from sklearn.neighbors import KNeighborsClassifier\n",
        "from sklearn.naive_bayes import GaussianNB\n",
        "from sklearn.ensemble import (\n",
        "    RandomForestClassifier,\n",
        "    VotingClassifier,\n",
        "    BaggingClassifier,\n",
        "    StackingClassifier,\n",
        ")\n",
        "import xgboost as xgb\n",
        "import lightgbm as lgb\n",
        "import pickle\n",
        "warnings.filterwarnings('ignore')\n",
        "np.random.seed(42)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d",
      "metadata": {
        "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d"
      },
      "outputs": [],
      "source": [
        "REPO_ROOT    = os.path.abspath(os.path.join(os.getcwd(), '..'))\n",
        "DATA_DIR     = os.path.join(REPO_ROOT, 'Datasets_all')\n",
        "OUT_DIR      = Path('models')\n",
        "OUT_DIR.mkdir(exist_ok=True)\n",
        "\n",
        "RANDOM_STATE = 42\n",
        "N_SPLITS     = 5\n",
        "CHAMPION_F1  = 0.6484   # Score from A5b"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "id": "26dc4267-d9d1-4481-90af-7da28143b033",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "26dc4267-d9d1-4481-90af-7da28143b033",
        "outputId": "494d8880-3d67-4cdc-f9b1-545751653d5a"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Movement features shape: (2094, 43)\n",
            "Weak link scores shape: (2096, 17)\n",
            "Shape after duplicate removal: (2094, 38)\n",
            "Weakest Link class distribution:\n",
            "WeakestLink\n",
            "LeftArmFallForward              616\n",
            "RightArmFallForward             458\n",
            "RightKneeMovesOutward           274\n",
            "RightShoulderElevation          245\n",
            "ExcessiveForwardLean            128\n",
            "ForwardHead                     109\n",
            "LeftAsymmetricalWeightShift      80\n",
            "LeftShoulderElevation            55\n",
            "LeftKneeMovesOutward             54\n",
            "RightKneeMovesInward             45\n",
            "RightAsymmetricalWeightShift     20\n",
            "LeftHeelRises                     7\n",
            "LeftKneeMovesInward               3\n",
            "RightHeelRises                    2\n",
            "Name: count, dtype: int64\n"
          ]
        }
      ],
      "source": [
        "movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))\n",
        "weaklink_scores_df   = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))\n",
        "\n",
        "print('Movement features shape:', movement_features_df.shape)\n",
        "print('Weak link scores shape:', weaklink_scores_df.shape)\n",
        "\n",
        "DUPLICATE_NASM_COLS = [\n",
        "    'No_1_NASM_Deviation',\n",
        "    'No_2_NASM_Deviation',\n",
        "    'No_3_NASM_Deviation',\n",
        "    'No_4_NASM_Deviation',\n",
        "    'No_5_NASM_Deviation',\n",
        "]\n",
        "\n",
        "movement_features_df = movement_features_df.drop(columns=DUPLICATE_NASM_COLS)\n",
        "print('Shape after duplicate removal:', movement_features_df.shape)\n",
        "\n",
        "weaklink_categories = [\n",
        "    'ExcessiveForwardLean', 'ForwardHead', 'LeftArmFallForward',\n",
        "    'LeftAsymmetricalWeightShift', 'LeftHeelRises', 'LeftKneeMovesInward',\n",
        "    'LeftKneeMovesOutward', 'LeftShoulderElevation', 'RightArmFallForward',\n",
        "    'RightAsymmetricalWeightShift', 'RightHeelRises', 'RightKneeMovesInward',\n",
        "    'RightKneeMovesOutward', 'RightShoulderElevation',\n",
        "]\n",
        "\n",
        "weaklink_scores_df['WeakestLink'] = (\n",
        "    weaklink_scores_df[weaklink_categories].idxmax(axis=1)\n",
        ")\n",
        "print('Weakest Link class distribution:')\n",
        "print(weaklink_scores_df['WeakestLink'].value_counts())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
        "outputId": "fa4dacb3-82fd-410e-c3b2-942cd53eed8c"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Merged dataset shape: (2094, 39)\n",
            "Feature matrix shape : (2094, 36)\n",
            "Number of features   : 36\n",
            "Number of classes    : 14\n"
          ]
        }
      ],
      "source": [
        "# Merge Datasets\n",
        "target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()\n",
        "merged_df = movement_features_df.merge(target_df, on='ID', how='inner')\n",
        "print('Merged dataset shape:', merged_df.shape)\n",
        "\n",
        "EXCLUDE_COLS    = ['ID', 'WeakestLink', 'EstimatedScore']\n",
        "feature_columns = [c for c in merged_df.columns if c not in EXCLUDE_COLS]\n",
        "\n",
        "X = merged_df[feature_columns].values\n",
        "y = merged_df['WeakestLink'].values\n",
        "\n",
        "print(f'Feature matrix shape : {X.shape}')\n",
        "print(f'Number of features   : {len(feature_columns)}')\n",
        "print(f'Number of classes    : {len(np.unique(y))}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 9,
      "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf",
      "metadata": {
        "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf"
      },
      "outputs": [],
      "source": [
        "C_range     = [2**i for i in range(-5, 10, 4)]\n",
        "gamma_range = [2**i for i in range(-10, 4, 4)]\n",
        "\n",
        "svm_param_grid = [\n",
        "    {'svm__kernel': ['rbf'],    'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__class_weight': ['balanced']},\n",
        "    {'svm__kernel': ['poly'],   'svm__C': C_range, 'svm__gamma': gamma_range, 'svm__degree': [2, 3], 'svm__class_weight': ['balanced']},\n",
        "    {'svm__kernel': ['linear'], 'svm__C': C_range, 'svm__class_weight': ['balanced']},\n",
        "]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 10,
      "id": "qBUGqPVmp-TH",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "qBUGqPVmp-TH",
        "outputId": "f3b9186e-5f25-4b14-a380-69df6232fc2b"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Per-fold F1 : [0.5938 0.5981 0.5761 0.6399 0.6123]\n",
            "Mean F1     : 0.6040 +/- 0.0213\n"
          ]
        }
      ],
      "source": [
        "outer_cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)\n",
        "inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)\n",
        "\n",
        "# Pipeline keeps scaler inside each fold\n",
        "svm_pipeline = Pipeline([\n",
        "    ('scaler', StandardScaler()),\n",
        "    ('svm',    SVC(probability=True, random_state=RANDOM_STATE)),\n",
        "])\n",
        "\n",
        "nested_svm = GridSearchCV(\n",
        "    estimator  = svm_pipeline,\n",
        "    param_grid = svm_param_grid,\n",
        "    cv         = inner_cv,\n",
        "    scoring    = 'f1_weighted',\n",
        "    n_jobs     = -1,\n",
        "    verbose    = 0,\n",
        "    refit      = True,\n",
        ")\n",
        "nested_svm_scores = cross_val_score(\n",
        "    nested_svm, X, y,\n",
        "    cv      = outer_cv,\n",
        "    scoring = 'f1_weighted',\n",
        "    n_jobs  = -1,\n",
        ")\n",
        "\n",
        "print(f'Per-fold F1 : {np.round(nested_svm_scores, 4)}')\n",
        "print(f'Mean F1     : {nested_svm_scores.mean():.4f} +/- {nested_svm_scores.std():.4f}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 11,
      "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
        "outputId": "56380093-2371-4284-a3b5-10622ec44adc"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Running CV for Soft Voting champion\n",
            "Per-fold F1 : [0.6316 0.6433 0.6289 0.7063 0.6331]\n",
            "Mean F1     : 0.6486 +/- 0.0292\n"
          ]
        }
      ],
      "source": [
        "\n",
        "soft_voting = VotingClassifier(\n",
        "    estimators=[\n",
        "        ('rf',  RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2, class_weight='balanced_subsample',\n",
        "                                       random_state=RANDOM_STATE, n_jobs=-1)),\n",
        "        ('lr',  LogisticRegression( max_iter=1000, class_weight='balanced',random_state=RANDOM_STATE)),\n",
        "        ('xgb', xgb.XGBClassifier(  n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,\n",
        "                                    colsample_bytree=0.8, random_state=RANDOM_STATE,class_weight='balanced', n_jobs=-1 )),\n",
        "        ('lgb', lgb.LGBMClassifier( n_estimators=200, learning_rate=0.1, class_weight='balanced',subsample=0.8, colsample_bytree=0.8,\n",
        "                                    random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1 )),\n",
        "        ('knn', KNeighborsClassifier(n_neighbors=7)),\n",
        "        ('lda', LinearDiscriminantAnalysis()),\n",
        "    ],\n",
        "    voting='soft',\n",
        "    n_jobs=-1,\n",
        ")\n",
        "sv_pipeline = Pipeline([\n",
        "    ('scaler', StandardScaler()),\n",
        "    ('voting', soft_voting),\n",
        "])\n",
        "\n",
        "print('Running CV for Soft Voting champion')\n",
        "sv_scores = cross_val_score(sv_pipeline, X, y, cv=outer_cv, scoring='f1_weighted', n_jobs=-1)\n",
        "print(f'Per-fold F1 : {np.round(sv_scores, 4)}')\n",
        "print(f'Mean F1     : {sv_scores.mean():.4f} +/- {sv_scores.std():.4f}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 12,
      "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
        "outputId": "3b908043-6c47-428c-f434-abcacd15da08"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "                    Model  F1_mean   F1_std vs_A5b\n",
            "A5 Champion (Soft Voting) 0.648627 0.029224  +0.0%\n",
            "          SVM (Nested CV) 0.604041 0.021310  -6.8%\n"
          ]
        }
      ],
      "source": [
        "CHAMPION_F1 = 0.6484  # A5b reported score\n",
        "\n",
        "results = [\n",
        "    {'Model': 'SVM (Nested CV)',           'F1_mean': nested_svm_scores.mean(), 'F1_std': nested_svm_scores.std(), '_scores': nested_svm_scores},\n",
        "    {'Model': 'A5 Champion (Soft Voting)', 'F1_mean': sv_scores.mean(),         'F1_std': sv_scores.std(),         '_scores': sv_scores},\n",
        "]\n",
        "\n",
        "results_df = pd.DataFrame([{k:v for k,v in r.items() if k != '_scores'} for r in results])\n",
        "results_df = results_df.sort_values('F1_mean', ascending=False).reset_index(drop=True)\n",
        "results_df['vs_A5b'] = results_df['F1_mean'].apply(lambda f: f'{(f - CHAMPION_F1)/CHAMPION_F1*100:+.1f}%')\n",
        "print(results_df[['Model','F1_mean','F1_std','vs_A5b']].to_string(index=False))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 13,
      "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
        "outputId": "8beb76a7-854d-4960-8e9d-1c88850792d5"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "SVM (Nested CV) vs A5 Champion: t=-3.913, p=0.0173  -> Significant\n"
          ]
        }
      ],
      "source": [
        "from scipy import stats\n",
        "\n",
        "def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
        "    k       = len(scores_a)\n",
        "    diff    = scores_a - scores_b\n",
        "    d_bar   = diff.mean()\n",
        "    s_sq    = diff.var(ddof=1)\n",
        "    var_corr = (1/k + n_test/n_train) * s_sq\n",
        "    t_stat  = d_bar / np.sqrt(var_corr)\n",
        "    p_value = 2 * (1 - stats.t.cdf(abs(t_stat), df=k-1))\n",
        "    return float(t_stat), float(p_value)\n",
        "\n",
        "n_total      = len(X)\n",
        "n_test_fold  = n_total // N_SPLITS\n",
        "n_train_fold = n_total - n_test_fold\n",
        "\n",
        "score_map = {r['Model']: r['_scores'] for r in results}\n",
        "sv_f1     = score_map['A5 Champion (Soft Voting)']\n",
        "svm_f1    = score_map['SVM (Nested CV)']\n",
        "\n",
        "t, p = corrected_resampled_ttest(svm_f1, sv_f1, n_train_fold, n_test_fold)\n",
        "sig  = 'Significant' if p < 0.05 else 'Not significant'\n",
        "print(f'SVM (Nested CV) vs A5 Champion: t={t:+.3f}, p={p:.4f}  -> {sig}')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 14,
      "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
        "outputId": "bfd2d8ec-e390-43f5-99bc-bbb517f1935b"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Fitting 5 folds for each of 52 candidates, totalling 260 fits\n",
            "Best params: {'svm__C': 8, 'svm__class_weight': 'balanced', 'svm__gamma': 0.015625, 'svm__kernel': 'rbf'}\n",
            "Model saved to champion_svm.pkl\n"
          ]
        }
      ],
      "source": [
        "final_pipeline = Pipeline([\n",
        "    ('scaler', StandardScaler()),\n",
        "    ('svm',    SVC(probability=True, random_state=RANDOM_STATE)),\n",
        "])\n",
        "\n",
        "final_grid = GridSearchCV(\n",
        "    final_pipeline, svm_param_grid,\n",
        "    cv      = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),\n",
        "    scoring = 'f1_weighted',\n",
        "    n_jobs  = -1, verbose=1,\n",
        ")\n",
        "final_grid.fit(X, y)\n",
        "print(f'Best params: {final_grid.best_params_}')\n",
        "\n",
        "with open(OUT_DIR / 'champion_svm.pkl', 'wb') as f:\n",
        "    pickle.dump(final_grid.best_estimator_, f)\n",
        "print('Model saved to champion_svm.pkl')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "id": "YLYSUEj82IXQ",
      "metadata": {
        "id": "YLYSUEj82IXQ"
      },
      "outputs": [],
      "source": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.11"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
}