Spaces:

Bachstelze
/

github_sync

Sleeping

App Files Files Community

RasaBh committed 8 days ago

Commit

7563458

1 Parent(s): 17f345c

SVM Classifier

Browse files

Files changed (1) hide show

A6/A6_Classification.ipynb +461 -0

A6/A6_Classification.ipynb ADDED Viewed

	@@ -0,0 +1,461 @@

+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983",
+      "metadata": {
+        "id": "2ce2c903-ae90-40ef-a8d9-2b2b89f23983"
+      },
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import pickle\n",
+        "import warnings\n",
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "import matplotlib.pyplot as plt\n",
+        "import seaborn as sns\n",
+        "from pathlib import Path\n",
+        "from scipy import stats\n",
+        "from sklearn.svm import SVC\n",
+        "from sklearn.model_selection import GridSearchCV\n",
+        "from time import time\n",
+        "\n",
+        "from sklearn.model_selection import (\n",
+        "    StratifiedKFold, cross_validate\n",
+        ")\n",
+        "from sklearn.pipeline import Pipeline\n",
+        "from sklearn.model_selection import cross_val_score\n",
+        "from sklearn.preprocessing import StandardScaler\n",
+        "from sklearn.metrics import (\n",
+        "    accuracy_score, precision_score, recall_score, f1_score,\n",
+        "    classification_report, confusion_matrix\n",
+        ")\n",
+        "from sklearn.linear_model import LogisticRegression\n",
+        "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
+        "from sklearn.neighbors import KNeighborsClassifier\n",
+        "from sklearn.naive_bayes import GaussianNB\n",
+        "from sklearn.ensemble import (\n",
+        "    RandomForestClassifier,\n",
+        "    VotingClassifier,\n",
+        "    BaggingClassifier,\n",
+        "    StackingClassifier,\n",
+        ")\n",
+        "import xgboost as xgb\n",
+        "import lightgbm as lgb\n",
+        "import pickle\n",
+        "warnings.filterwarnings('ignore')\n",
+        "np.random.seed(42)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d",
+      "metadata": {
+        "id": "28f4e5d9-23b1-405c-8f84-0dc33448cb2d"
+      },
+      "outputs": [],
+      "source": [
+        "# --- Experiment constants ---\n",
+        "RANDOM_STATE = 42        # seed passed to the CV splitters and the seeded estimators\n",
+        "N_SPLITS     = 5         # fold count for the outer CV and the final grid search\n",
+        "CHAMPION_F1  = 0.6484    # Score from A5b\n",
+        "\n",
+        "# --- Locations ---\n",
+        "# The data folder is resolved against the parent of the current working directory.\n",
+        "REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))\n",
+        "DATA_DIR  = os.path.join(REPO_ROOT, 'Datasets_all')\n",
+        "\n",
+        "# Fitted models are written below ./models; the folder is created if it is missing.\n",
+        "OUT_DIR = Path('models')\n",
+        "OUT_DIR.mkdir(exist_ok=True)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "id": "26dc4267-d9d1-4481-90af-7da28143b033",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "26dc4267-d9d1-4481-90af-7da28143b033",
+        "outputId": "494d8880-3d67-4cdc-f9b1-545751653d5a"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Movement features shape: (2094, 43)\n",
+            "Weak link scores shape: (2096, 17)\n",
+            "Shape after duplicate removal: (2094, 38)\n",
+            "Weakest Link class distribution:\n",
+            "WeakestLink\n",
+            "LeftArmFallForward              616\n",
+            "RightArmFallForward             458\n",
+            "RightKneeMovesOutward           274\n",
+            "RightShoulderElevation          245\n",
+            "ExcessiveForwardLean            128\n",
+            "ForwardHead                     109\n",
+            "LeftAsymmetricalWeightShift      80\n",
+            "LeftShoulderElevation            55\n",
+            "LeftKneeMovesOutward             54\n",
+            "RightKneeMovesInward             45\n",
+            "RightAsymmetricalWeightShift     20\n",
+            "LeftHeelRises                     7\n",
+            "LeftKneeMovesInward               3\n",
+            "RightHeelRises                    2\n",
+            "Name: count, dtype: int64\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Read the two source tables from DATA_DIR.\n",
+        "movement_features_df = pd.read_csv(os.path.join(DATA_DIR, 'aimoscores.csv'))\n",
+        "weaklink_scores_df   = pd.read_csv(os.path.join(DATA_DIR, 'scores_and_weaklink.csv'))\n",
+        "\n",
+        "print('Movement features shape:', movement_features_df.shape)\n",
+        "print('Weak link scores shape:', weaklink_scores_df.shape)\n",
+        "\n",
+        "# No_1 ... No_5 NASM deviation columns are treated as duplicates and removed.\n",
+        "DUPLICATE_NASM_COLS = [f'No_{rank}_NASM_Deviation' for rank in range(1, 6)]\n",
+        "\n",
+        "movement_features_df = movement_features_df.drop(DUPLICATE_NASM_COLS, axis='columns')\n",
+        "print('Shape after duplicate removal:', movement_features_df.shape)\n",
+        "\n",
+        "# Candidate weak-link columns. The order matters: idxmax returns the first\n",
+        "# column in this list when several share the row maximum.\n",
+        "weaklink_categories = [\n",
+        "    'ExcessiveForwardLean',\n",
+        "    'ForwardHead',\n",
+        "    'LeftArmFallForward',\n",
+        "    'LeftAsymmetricalWeightShift',\n",
+        "    'LeftHeelRises',\n",
+        "    'LeftKneeMovesInward',\n",
+        "    'LeftKneeMovesOutward',\n",
+        "    'LeftShoulderElevation',\n",
+        "    'RightArmFallForward',\n",
+        "    'RightAsymmetricalWeightShift',\n",
+        "    'RightHeelRises',\n",
+        "    'RightKneeMovesInward',\n",
+        "    'RightKneeMovesOutward',\n",
+        "    'RightShoulderElevation',\n",
+        "]\n",
+        "\n",
+        "# Target label = name of the highest-scoring weak-link column in each row.\n",
+        "category_scores = weaklink_scores_df.loc[:, weaklink_categories]\n",
+        "weaklink_scores_df['WeakestLink'] = category_scores.idxmax(axis='columns')\n",
+        "\n",
+        "print('Weakest Link class distribution:')\n",
+        "print(weaklink_scores_df['WeakestLink'].value_counts())"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "1f50b04e-0769-4610-b8ac-404b28ada493",
+        "outputId": "fa4dacb3-82fd-410e-c3b2-942cd53eed8c"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Merged dataset shape: (2094, 39)\n",
+            "Feature matrix shape : (2094, 36)\n",
+            "Number of features   : 36\n",
+            "Number of classes    : 14\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Attach the WeakestLink label to the feature table through the shared ID column.\n",
+        "target_df = weaklink_scores_df[['ID', 'WeakestLink']].copy()\n",
+        "merged_df = pd.merge(movement_features_df, target_df, on='ID', how='inner')\n",
+        "print('Merged dataset shape:', merged_df.shape)\n",
+        "\n",
+        "# Everything except the identifier, the label and EstimatedScore is a model input.\n",
+        "EXCLUDE_COLS    = ['ID', 'WeakestLink', 'EstimatedScore']\n",
+        "is_feature      = ~merged_df.columns.isin(EXCLUDE_COLS)\n",
+        "feature_columns = merged_df.columns[is_feature].tolist()\n",
+        "\n",
+        "X = merged_df[feature_columns].to_numpy()\n",
+        "y = merged_df['WeakestLink'].to_numpy()\n",
+        "\n",
+        "print(f'Feature matrix shape : {X.shape}')\n",
+        "print(f'Number of features   : {len(feature_columns)}')\n",
+        "print(f'Number of classes    : {np.unique(y).size}')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf",
+      "metadata": {
+        "id": "e6bbc0b5-f4a2-4911-9ce5-6f3fca74ebdf"
+      },
+      "outputs": [],
+      "source": [
+        "# Coarse log2-spaced search values: the exponent advances in steps of 4.\n",
+        "C_range     = [2**exponent for exponent in range(-5, 10, 4)]    # 2^-5, 2^-1, 2^3, 2^7\n",
+        "gamma_range = [2**exponent for exponent in range(-10, 4, 4)]   # 2^-10, 2^-6, 2^-2, 2^2\n",
+        "\n",
+        "# One sub-grid per kernel so that gamma/degree are only searched where they apply.\n",
+        "# Keys are prefixed with the pipeline step name 'svm'.\n",
+        "svm_param_grid = [\n",
+        "    dict(svm__kernel=['rbf'], svm__C=C_range, svm__gamma=gamma_range,\n",
+        "         svm__class_weight=['balanced']),\n",
+        "    dict(svm__kernel=['poly'], svm__C=C_range, svm__gamma=gamma_range,\n",
+        "         svm__degree=[2, 3], svm__class_weight=['balanced']),\n",
+        "    dict(svm__kernel=['linear'], svm__C=C_range,\n",
+        "         svm__class_weight=['balanced']),\n",
+        "]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 10,
+      "id": "qBUGqPVmp-TH",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "qBUGqPVmp-TH",
+        "outputId": "f3b9186e-5f25-4b14-a380-69df6232fc2b"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Per-fold F1 : [0.5938 0.5981 0.5761 0.6399 0.6123]\n",
+            "Mean F1     : 0.6040 +/- 0.0213\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Nested cross-validation: the outer folds estimate generalisation, while the\n",
+        "# inner folds choose hyper-parameters using each outer training split only.\n",
+        "outer_cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)\n",
+        "inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)\n",
+        "\n",
+        "# Pipeline keeps scaler inside each fold\n",
+        "# Probability estimates are left off for the search: 'f1_weighted' only calls\n",
+        "# predict(), and enabling them makes every SVC fit run an additional internal\n",
+        "# 5-fold calibration, multiplying the cost of the grid search for no gain.\n",
+        "svm_pipeline = Pipeline([\n",
+        "    ('scaler', StandardScaler()),\n",
+        "    ('svm',    SVC(random_state=RANDOM_STATE)),\n",
+        "])\n",
+        "\n",
+        "# Inner loop: exhaustive search over svm_param_grid, refit on the best setting.\n",
+        "nested_svm = GridSearchCV(\n",
+        "    estimator  = svm_pipeline,\n",
+        "    param_grid = svm_param_grid,\n",
+        "    cv         = inner_cv,\n",
+        "    scoring    = 'f1_weighted',\n",
+        "    n_jobs     = -1,\n",
+        "    verbose    = 0,\n",
+        "    refit      = True,\n",
+        ")\n",
+        "\n",
+        "# Outer loop: one weighted-F1 score per outer fold for the tuned estimator.\n",
+        "nested_svm_scores = cross_val_score(\n",
+        "    nested_svm, X, y,\n",
+        "    cv      = outer_cv,\n",
+        "    scoring = 'f1_weighted',\n",
+        "    n_jobs  = -1,\n",
+        ")\n",
+        "\n",
+        "print(f'Per-fold F1 : {np.round(nested_svm_scores, 4)}')\n",
+        "print(f'Mean F1     : {nested_svm_scores.mean():.4f} +/- {nested_svm_scores.std():.4f}')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "34cb620f-02e6-4e4e-9637-ee9b96298fa9",
+        "outputId": "56380093-2371-4284-a3b5-10622ec44adc"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Running CV for Soft Voting champion\n",
+            "Per-fold F1 : [0.6316 0.6433 0.6289 0.7063 0.6331]\n",
+            "Mean F1     : 0.6486 +/- 0.0292\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Reference model: the soft-voting ensemble reported as the A5 champion.\n",
+        "soft_voting = VotingClassifier(\n",
+        "    estimators=[\n",
+        "        ('rf',  RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=5, min_samples_leaf=2,\n",
+        "                                       class_weight='balanced_subsample', random_state=RANDOM_STATE, n_jobs=-1)),\n",
+        "        ('lr',  LogisticRegression(max_iter=1000, class_weight='balanced', random_state=RANDOM_STATE)),\n",
+        "        # XGBClassifier has no class_weight parameter: the keyword used to be\n",
+        "        # forwarded to the booster and ignored, so it is dropped here. Class\n",
+        "        # balancing for XGBoost would need sample_weight at fit time instead.\n",
+        "        ('xgb', xgb.XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1, subsample=0.8,\n",
+        "                                  colsample_bytree=0.8, random_state=RANDOM_STATE, n_jobs=-1)),\n",
+        "        ('lgb', lgb.LGBMClassifier(n_estimators=200, learning_rate=0.1, class_weight='balanced', subsample=0.8,\n",
+        "                                   colsample_bytree=0.8, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1)),\n",
+        "        ('knn', KNeighborsClassifier(n_neighbors=7)),\n",
+        "        ('lda', LinearDiscriminantAnalysis()),\n",
+        "    ],\n",
+        "    voting='soft',\n",
+        "    n_jobs=-1,\n",
+        ")\n",
+        "\n",
+        "# Scaling lives inside the pipeline so it is re-fitted on every training fold.\n",
+        "sv_pipeline = Pipeline([\n",
+        "    ('scaler', StandardScaler()),\n",
+        "    ('voting', soft_voting),\n",
+        "])\n",
+        "\n",
+        "# Same outer folds as the nested SVM run, so the per-fold scores are paired.\n",
+        "print('Running CV for Soft Voting champion')\n",
+        "sv_scores = cross_val_score(sv_pipeline, X, y, cv=outer_cv, scoring='f1_weighted', n_jobs=-1)\n",
+        "print(f'Per-fold F1 : {np.round(sv_scores, 4)}')\n",
+        "print(f'Mean F1     : {sv_scores.mean():.4f} +/- {sv_scores.std():.4f}')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "67dd5a18-3e9a-4342-8917-0f4d4d607f20",
+        "outputId": "3b908043-6c47-428c-f434-abcacd15da08"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "                    Model  F1_mean   F1_std vs_A5b\n",
+            "A5 Champion (Soft Voting) 0.648627 0.029224  +0.0%\n",
+            "          SVM (Nested CV) 0.604041 0.021310  -6.8%\n"
+          ]
+        }
+      ],
+      "source": [
+        "# CHAMPION_F1 (the A5b reference score) is defined once, in the configuration\n",
+        "# cell; it is deliberately not re-declared here so the two cannot drift apart.\n",
+        "\n",
+        "# '_scores' carries the raw per-fold arrays; the significance test reads them later.\n",
+        "results = [\n",
+        "    {'Model': 'SVM (Nested CV)',           'F1_mean': nested_svm_scores.mean(), 'F1_std': nested_svm_scores.std(), '_scores': nested_svm_scores},\n",
+        "    {'Model': 'A5 Champion (Soft Voting)', 'F1_mean': sv_scores.mean(),         'F1_std': sv_scores.std(),         '_scores': sv_scores},\n",
+        "]\n",
+        "\n",
+        "# Summary table: drop the raw arrays, rank by mean F1 (best first).\n",
+        "summary_rows = [{key: value for key, value in row.items() if key != '_scores'} for row in results]\n",
+        "results_df = (\n",
+        "    pd.DataFrame(summary_rows)\n",
+        "    .sort_values('F1_mean', ascending=False)\n",
+        "    .reset_index(drop=True)\n",
+        ")\n",
+        "\n",
+        "# Relative change of each mean F1 against the A5b reference, as a signed percentage.\n",
+        "results_df['vs_A5b'] = results_df['F1_mean'].apply(lambda f1: f'{(f1 - CHAMPION_F1)/CHAMPION_F1*100:+.1f}%')\n",
+        "print(results_df[['Model','F1_mean','F1_std','vs_A5b']].to_string(index=False))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "46b4acac-2e0e-44a9-96e4-ec5bccdb2ed2",
+        "outputId": "8beb76a7-854d-4960-8e9d-1c88850792d5"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "SVM (Nested CV) vs A5 Champion: t=-3.913, p=0.0173  -> Significant\n"
+          ]
+        }
+      ],
+      "source": [
+        "from scipy import stats\n",
+        "\n",
+        "def corrected_resampled_ttest(scores_a, scores_b, n_train, n_test):\n",
+        "    \"\"\"Corrected resampled (Nadeau-Bengio) paired t-test on per-fold scores.\n",
+        "\n",
+        "    The variance of the per-fold differences is inflated by\n",
+        "    (1/k + n_test/n_train) to account for the overlap between the training\n",
+        "    sets of different folds.\n",
+        "\n",
+        "    Parameters\n",
+        "    ----------\n",
+        "    scores_a, scores_b : array-like of shape (k,)\n",
+        "        Scores of the two models on the same k folds, in the same order.\n",
+        "    n_train, n_test : int\n",
+        "        Number of training / test samples in one fold.\n",
+        "\n",
+        "    Returns\n",
+        "    -------\n",
+        "    (t_stat, p_value) : tuple of float\n",
+        "        Two-sided test with k - 1 degrees of freedom. (nan, nan) when fewer\n",
+        "        than two folds are given; (0.0, 1.0) when every difference is zero.\n",
+        "\n",
+        "    Raises\n",
+        "    ------\n",
+        "    ValueError\n",
+        "        If the inputs are not 1-D arrays of equal length.\n",
+        "    \"\"\"\n",
+        "    # Accept plain lists as well as arrays.\n",
+        "    scores_a = np.asarray(scores_a, dtype=float)\n",
+        "    scores_b = np.asarray(scores_b, dtype=float)\n",
+        "    if scores_a.ndim != 1 or scores_a.shape != scores_b.shape:\n",
+        "        raise ValueError('scores_a and scores_b must be 1-D and of equal length')\n",
+        "\n",
+        "    k = scores_a.size\n",
+        "    if k < 2:\n",
+        "        # A sample variance needs at least two paired observations.\n",
+        "        return float('nan'), float('nan')\n",
+        "\n",
+        "    diff     = scores_a - scores_b\n",
+        "    d_bar    = diff.mean()\n",
+        "    s_sq     = diff.var(ddof=1)\n",
+        "    var_corr = (1 / k + n_test / n_train) * s_sq\n",
+        "\n",
+        "    if var_corr == 0:\n",
+        "        # Every fold shows the same difference: avoid dividing by zero.\n",
+        "        if d_bar == 0:\n",
+        "            return 0.0, 1.0\n",
+        "        return float(np.copysign(np.inf, d_bar)), 0.0\n",
+        "\n",
+        "    t_stat  = d_bar / np.sqrt(var_corr)\n",
+        "    # sf() keeps precision in the far tail where 1 - cdf() rounds to zero.\n",
+        "    p_value = 2 * stats.t.sf(abs(t_stat), df=k - 1)\n",
+        "    return float(t_stat), float(p_value)\n",
+        "\n",
+        "# Approximate per-fold sizes of the outer CV (integer division drops the remainder).\n",
+        "n_total      = len(X)\n",
+        "n_test_fold  = n_total // N_SPLITS\n",
+        "n_train_fold = n_total - n_test_fold\n",
+        "\n",
+        "# Per-fold score arrays, looked up by the model name used in the results list.\n",
+        "score_map = dict((entry['Model'], entry['_scores']) for entry in results)\n",
+        "svm_f1    = score_map['SVM (Nested CV)']\n",
+        "sv_f1     = score_map['A5 Champion (Soft Voting)']\n",
+        "\n",
+        "# A negative t means the SVM scored below the soft-voting model on average.\n",
+        "t, p = corrected_resampled_ttest(svm_f1, sv_f1, n_train=n_train_fold, n_test=n_test_fold)\n",
+        "if p < 0.05:\n",
+        "    sig = 'Significant'\n",
+        "else:\n",
+        "    sig = 'Not significant'\n",
+        "print(f'SVM (Nested CV) vs A5 Champion: t={t:+.3f}, p={p:.4f}  -> {sig}')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "809938d4-93cd-4e17-8b15-cf34bea8e9bc",
+        "outputId": "bfd2d8ec-e390-43f5-99bc-bbb517f1935b"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Fitting 5 folds for each of 52 candidates, totalling 260 fits\n",
+            "Best params: {'svm__C': 8, 'svm__class_weight': 'balanced', 'svm__gamma': 0.015625, 'svm__kernel': 'rbf'}\n",
+            "Model saved to champion_svm.pkl\n"
+          ]
+        }
+      ],
+      "source": [
+        "# Final model: tune on the full data set, then persist the refitted best pipeline.\n",
+        "# probability=True stays on here, presumably so the saved model can serve\n",
+        "# predict_proba -- TODO confirm with the consumer of the pickle.\n",
+        "final_pipeline = Pipeline([\n",
+        "    ('scaler', StandardScaler()),\n",
+        "    ('svm',    SVC(probability=True, random_state=RANDOM_STATE)),\n",
+        "])\n",
+        "\n",
+        "final_grid = GridSearchCV(\n",
+        "    final_pipeline, svm_param_grid,\n",
+        "    cv      = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE),\n",
+        "    scoring = 'f1_weighted',\n",
+        "    n_jobs  = -1, verbose=1,\n",
+        ")\n",
+        "final_grid.fit(X, y)\n",
+        "print(f'Best params: {final_grid.best_params_}')\n",
+        "\n",
+        "# Report the path that was actually written (the file lives under OUT_DIR).\n",
+        "model_path = OUT_DIR / 'champion_svm.pkl'\n",
+        "with open(model_path, 'wb') as f:\n",
+        "    pickle.dump(final_grid.best_estimator_, f)\n",
+        "print(f'Model saved to {model_path}')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "YLYSUEj82IXQ",
+      "metadata": {
+        "id": "YLYSUEj82IXQ"
+      },
+      "outputs": [],
+      "source": []
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.11"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+}