{ "cells": [ { "cell_type": "markdown", "id": "f9fd590d-a3c3-4e94-b5ca-59a6ee9b29c3", "metadata": {}, "source": [ "# Detection Layer" ] }, { "cell_type": "code", "execution_count": null, "id": "dddcb2a8-73d3-4476-b88a-bc1494a2c830", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.model_selection import cross_val_score, KFold\n", "from sklearn.impute import KNNImputer\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier\n", "from xgboost import XGBClassifier\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.experimental import enable_iterative_imputer\n", "from imblearn.pipeline import make_pipeline as make_pipeline_imb\n", "from imblearn.over_sampling import SMOTE,SMOTENC\n", "from sklearn.model_selection import train_test_split\n", "from collections import Counter\n", "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", "from sklearn.ensemble import GradientBoostingClassifier\n", "from sklearn.ensemble import VotingClassifier\n", "from sklearn.svm import SVC\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.ensemble import BaggingClassifier\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.ensemble import ExtraTreesClassifier\n", "from deslib.dcs import APosteriori\n", "from deslib.des import KNORAE, KNORAU, KNOP, DESMI\n", "from sklearn.neighbors import LocalOutlierFactor\n", "from sklearn.utils import resample\n", "import warnings\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from sklearn.preprocessing import MinMaxScaler\n", "from imblearn.pipeline import Pipeline\n", "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", "from sklearn.preprocessing import LabelEncoder, PowerTransformer\n", "from collections import defaultdict\n", "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier\n", "from catboost import CatBoostClassifier\n", "from lightgbm import LGBMClassifier\n", "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.neural_network import MLPClassifier\n", "import Orange\n", "from scipy.stats import friedmanchisquare, rankdata\n", "import shap\n", "import scikit_posthocs as sp\n", "from sklearn.feature_selection import SelectFromModel\n", "from IPython.display import FileLink, display\n", "import math\n", "from sklearn.ensemble import RandomForestClassifier\n", "from skopt.space import Integer, Real\n", "from sklearn.model_selection import StratifiedKFold\n", "from skopt import BayesSearchCV\n", "import xgboost as xgb\n", "from imblearn.over_sampling import SMOTE\n", "from sklearn.tree import DecisionTreeClassifier, export_text\n", "from sklearn import tree\n", "from skopt.space import Real, Integer, Categorical\n", "from skopt.callbacks import VerboseCallback\n", "from deslib.des.knora_e import KNORAE\n", "from deslib.des.knora_u import KNORAU\n", "from deslib.des.knop import KNOP\n", "from deslib.des.meta_des import METADES\n", "from deslib.des.des_knn import DESKNN\n", "from deslib.des.des_p import DESP" ] }, { "cell_type": "markdown", "id": "39d6683c-fd3f-4daa-afdc-2e7f83c3fce3", "metadata": {}, "source": [ "### Preparation before training" ] }, { "cell_type": "code", "execution_count": null, "id": "ec17e47a-8d92-498d-8f28-d8259d6ebc4e", "metadata": {}, "outputs": [], "source": [ "# Call Dataset\n", "pd.set_option('display.max_rows', 10)\n", "initial_df = pd.read_csv('3labelv4Classification.csv')\n", "initial_df.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "d3751352-73bc-42c6-b6a5-84a3badf14ff", "metadata": {}, "outputs": [], "source": [ "# All categorical features except for label\n", "cols = initial_df.columns\n", "num_cols = initial_df._get_numeric_data().columns\n", "categorical_features = list(set(cols) - set(num_cols))\n", "categorical_features.remove('depression_category')\n", "\n", "# Label Encode all categorical, but keep missing values\n", "le_initial_df = initial_df.copy()\n", "dropped_labels = le_initial_df['depression_category']\n", "le_initial_df = le_initial_df.drop('depression_category', axis = 1)\n", "\n", "for col in le_initial_df.columns:\n", " if le_initial_df[col].dtype == 'object':\n", " le_initial_df[col] = le_initial_df[col].fillna('missing')\n", "\n", " label_encoder = LabelEncoder()\n", " le_initial_df[col] = label_encoder.fit_transform(le_initial_df[col])\n", "\n", " missing_value_index = np.where(label_encoder.classes_ == 'missing')[0]\n", " \n", " le_initial_df[col] = le_initial_df[col].replace(missing_value_index, np.nan)\n", "\n", "le_initial_df = pd.concat([le_initial_df, dropped_labels], axis = 1)" ] }, { "cell_type": "code", "execution_count": null, "id": "9fa21a95-21f1-4274-86ba-d7977813066b", "metadata": {}, "outputs": [], "source": [ "le_initial_df" ] }, { "cell_type": "code", "execution_count": null, "id": "0ba54a81-d4de-4884-99d3-29867eb7ea40", "metadata": {}, "outputs": [], "source": [ "# Seperate and Combine\n", "le_df_normal = le_initial_df[le_initial_df['depression_category'] == 'normal']\n", "le_df_mild = le_initial_df[le_initial_df['depression_category'] == 'mild']\n", "le_df_moderatesevere = le_initial_df[le_initial_df['depression_category'] == 'moderatesevere']\n", "\n", "le_df_depression = pd.concat([le_df_mild, le_df_moderatesevere], ignore_index = False)\n", "\n", "le_df_depression['depression_category'] = 'depression'\n", "\n", "# Check depression category counts\n", "dataframes = [le_df_normal, le_df_depression]\n", "le_initial_df = pd.concat(dataframes, ignore_index=True)\n", "label_counts = le_initial_df['depression_category'].value_counts()\n", "label_counts" ] }, { "cell_type": "code", "execution_count": null, "id": "c517213f-f6bb-4298-99ee-3b29f6f7d0cb", "metadata": {}, "outputs": [], "source": [ "# Some outlier\n", "threshold = int(0.8 * le_df_normal.shape[1])\n", "le_df_normal = le_df_normal.dropna(thresh = threshold)\n", "threshold = int(0.8 * le_df_depression.shape[1])\n", "le_df_depression = le_df_depression.dropna(thresh = threshold)\n", "\n", "# Check depression category counts\n", "dataframes = [le_df_normal, le_df_depression]\n", "le_initial_df = pd.concat(dataframes, ignore_index=True)\n", "label_counts = le_initial_df['depression_category'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "8c5bdf52-c645-4a11-920a-579e28db1a50", "metadata": {}, "outputs": [], "source": [ "# Imputation\n", "different_le_dfs = [le_df_normal, le_df_depression]\n", "imputed_le_dfs = []\n", "from sklearn.impute import IterativeImputer\n", "for le_df in different_le_dfs:\n", " y = le_df['depression_category']\n", " X = le_df.drop('depression_category', axis = 1)\n", " \n", " imputer = SimpleImputer(strategy='median')\n", " imputed_data = imputer.fit_transform(X)\n", " imputed_df = pd.DataFrame(imputed_data, columns = X.columns)\n", "\n", " imputed_df['depression_category'] = y.reset_index(drop = True)\n", " imputed_le_dfs.append(imputed_df)\n", "\n", "concatenated_le_dfs = pd.concat(imputed_le_dfs, ignore_index = True)\n", "concatenated_le_dfs" ] }, { "cell_type": "code", "execution_count": null, "id": "0e871a5f-beab-4f90-b8c4-e922ef86d0f0", "metadata": {}, "outputs": [], "source": [ "# Full label encode depression category\n", "fully_LE_concatenated_le_dfs = concatenated_le_dfs.copy()\n", "fully_LE_concatenated_le_dfs['depression_category'] = label_encoder.fit_transform(fully_LE_concatenated_le_dfs['depression_category'])\n", "\n", "# The dataset after category connect, imputation, and label encoding\n", "splitted_dataset = fully_LE_concatenated_le_dfs.copy()\n", "splitted_dataset" ] }, { "cell_type": "markdown", "id": "198fdc3c-ccbc-4ac5-8621-c0c5dc771cbb", "metadata": {}, "source": [ "### Setup for training" ] }, { "cell_type": "code", "execution_count": null, "id": "101c5549-7bc3-4573-867b-f09776e254db", "metadata": { "jupyter": { "source_hidden": true } }, "outputs": [], "source": [ "def plot_combined_roc_curve(roc_curves, classifier_names):\n", " plt.figure(figsize=(12, 8))\n", " mean_fpr = np.linspace(0, 1, 100)\n", " colors = plt.cm.get_cmap('tab20', len(classifier_names))\n", " \n", " for i, clf_name in enumerate(classifier_names):\n", " tprs = []\n", " for fpr, tpr in roc_curves[clf_name]:\n", " tprs.append(np.interp(mean_fpr, fpr, tpr))\n", " mean_tpr = np.mean(tprs, axis=0)\n", " mean_tpr[-1] = 1.0\n", " mean_auc = auc(mean_fpr, mean_tpr)\n", " plt.plot(mean_fpr, mean_tpr, color=colors(i), lw=2, linestyle='-', marker='o', markersize=4, \n", " label=f'{clf_name} (AUC = {mean_auc:.3f})')\n", "\n", " plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')\n", " plt.xlim([0.0, 1.0])\n", " plt.ylim([0.0, 1.05])\n", " plt.xlabel('False Positive Rate', fontsize=26)\n", " plt.ylabel('True Positive Rate', fontsize=26)\n", " plt.xticks(fontsize=30)\n", " plt.yticks(fontsize=30)\n", " plt.legend(loc=\"lower right\", fontsize=22, frameon=True, framealpha=0.9)\n", " plt.grid(True)\n", "\n", " filename='bonk.svg'\n", "\n", " plt.savefig(filename, format='svg')\n", " plt.show()\n", "\n", " display(FileLink(filename))\n", "\n", "# Preparation code to make CD diagram from older version of Orange\n", "def compute_CD(avranks, n, alpha=\"0.05\", test=\"nemenyi\"):\n", " \"\"\"\n", " Returns critical difference for Nemenyi or Bonferroni-Dunn test\n", " according to given alpha (either alpha=\"0.05\" or alpha=\"0.1\") for average\n", " ranks and number of tested datasets N. Test can be either \"nemenyi\" for\n", " for Nemenyi two tailed test or \"bonferroni-dunn\" for Bonferroni-Dunn test.\n", "\n", " This function is deprecated and will be removed in Orange 3.34.\n", " \"\"\"\n", " k = len(avranks)\n", " d = {(\"nemenyi\", \"0.05\"): [0, 0, 1.959964, 2.343701, 2.569032, 2.727774,\n", " 2.849705, 2.94832, 3.030879, 3.101730, 3.163684,\n", " 3.218654, 3.268004, 3.312739, 3.353618, 3.39123,\n", " 3.426041, 3.458425, 3.488685, 3.517073,\n", " 3.543799],\n", " (\"nemenyi\", \"0.1\"): [0, 0, 1.644854, 2.052293, 2.291341, 2.459516,\n", " 2.588521, 2.692732, 2.779884, 2.854606, 2.919889,\n", " 2.977768, 3.029694, 3.076733, 3.119693, 3.159199,\n", " 3.195743, 3.229723, 3.261461, 3.291224, 3.319233],\n", " (\"bonferroni-dunn\", \"0.05\"): [0, 0, 1.960, 2.241, 2.394, 2.498, 2.576,\n", " 2.638, 2.690, 2.724, 2.773],\n", " (\"bonferroni-dunn\", \"0.1\"): [0, 0, 1.645, 1.960, 2.128, 2.241, 2.326,\n", " 2.394, 2.450, 2.498, 2.539]}\n", " q = d[(test, alpha)]\n", " cd = q[k] * (k * (k + 1) / (6.0 * n)) ** 0.5\n", " return cd\n", "\n", "\n", "def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None,\n", " width=6, textspace=1, reverse=False, filename=None, **kwargs):\n", " \"\"\"\n", " Draws a CD graph, which is used to display the differences in methods'\n", " performance. See Janez Demsar, Statistical Comparisons of Classifiers over\n", " Multiple Data Sets, 7(Jan):1--30, 2006.\n", "\n", " Needs matplotlib to work.\n", "\n", " The image is ploted on `plt` imported using\n", " `import matplotlib.pyplot as plt`.\n", "\n", " This function is deprecated and will be removed in Orange 3.34.\n", "\n", " Args:\n", " avranks (list of float): average ranks of methods.\n", " names (list of str): names of methods.\n", " cd (float): Critical difference used for statistically significance of\n", " difference between methods.\n", " cdmethod (int, optional): the method that is compared with other methods\n", " If omitted, show pairwise comparison of methods\n", " lowv (int, optional): the lowest shown rank\n", " highv (int, optional): the highest shown rank\n", " width (int, optional): default width in inches (default: 6)\n", " textspace (int, optional): space on figure sides (in inches) for the\n", " method names (default: 1)\n", " reverse (bool, optional): if set to `True`, the lowest rank is on the\n", " right (default: `False`)\n", " filename (str, optional): output file name (with extension). If not\n", " given, the function does not write a file.\n", " \"\"\"\n", " try:\n", " import matplotlib.pyplot as plt\n", " from matplotlib.backends.backend_agg import FigureCanvasAgg\n", " except ImportError:\n", " raise ImportError(\"Function graph_ranks requires matplotlib.\")\n", "\n", " width = float(width)\n", " textspace = float(textspace)\n", "\n", " def nth(l, n):\n", " \"\"\"\n", " Returns only nth elemnt in a list.\n", " \"\"\"\n", " n = lloc(l, n)\n", " return [a[n] for a in l]\n", "\n", " def lloc(l, n):\n", " \"\"\"\n", " List location in list of list structure.\n", " Enable the use of negative locations:\n", " -1 is the last element, -2 second last...\n", " \"\"\"\n", " if n < 0:\n", " return len(l[0]) + n\n", " else:\n", " return n\n", "\n", " def mxrange(lr):\n", " \"\"\"\n", " Multiple xranges. Can be used to traverse matrices.\n", " This function is very slow due to unknown number of\n", " parameters.\n", "\n", " >>> mxrange([3,5])\n", " [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]\n", "\n", " >>> mxrange([[3,5,1],[9,0,-3]])\n", " [(3, 9), (3, 6), (3, 3), (4, 9), (4, 6), (4, 3)]\n", "\n", " \"\"\"\n", " if not len(lr):\n", " yield ()\n", " else:\n", " # it can work with single numbers\n", " index = lr[0]\n", " if isinstance(index, int):\n", " index = [index]\n", " for a in range(*index):\n", " for b in mxrange(lr[1:]):\n", " yield tuple([a] + list(b))\n", "\n", " def print_figure(fig, *args, **kwargs):\n", " canvas = FigureCanvasAgg(fig)\n", " canvas.print_figure(*args, **kwargs)\n", "\n", " sums = avranks\n", "\n", " tempsort = sorted([(a, i) for i, a in enumerate(sums)], reverse=reverse)\n", " ssums = nth(tempsort, 0)\n", " sortidx = nth(tempsort, 1)\n", " nnames = [names[x] for x in sortidx]\n", "\n", " if lowv is None:\n", " lowv = min(1, int(math.floor(min(ssums))))\n", " if highv is None:\n", " highv = max(len(avranks), int(math.ceil(max(ssums))))\n", "\n", " cline = 0.4\n", "\n", " k = len(sums)\n", "\n", " lines = None\n", "\n", " linesblank = 0\n", " scalewidth = width - 2 * textspace\n", "\n", " def rankpos(rank):\n", " if not reverse:\n", " a = rank - lowv\n", " else:\n", " a = highv - rank\n", " return textspace + scalewidth / (highv - lowv) * a\n", "\n", " distanceh = 0.25\n", "\n", " if cd and cdmethod is None:\n", " # get pairs of non significant methods\n", "\n", " def get_lines(sums, hsd):\n", " # get all pairs\n", " lsums = len(sums)\n", " allpairs = [(i, j) for i, j in mxrange([[lsums], [lsums]]) if j > i]\n", " # remove not significant\n", " notSig = [(i, j) for i, j in allpairs\n", " if abs(sums[i] - sums[j]) <= hsd]\n", " # keep only longest\n", "\n", " def no_longer(ij_tuple, notSig):\n", " i, j = ij_tuple\n", " for i1, j1 in notSig:\n", " if (i1 <= i and j1 > j) or (i1 < i and j1 >= j):\n", " return False\n", " return True\n", "\n", " longest = [(i, j) for i, j in notSig if no_longer((i, j), notSig)]\n", "\n", " return longest\n", "\n", " lines = get_lines(ssums, cd)\n", " linesblank = 0.2 + 0.2 + (len(lines) - 1) * 0.1\n", "\n", " # add scale\n", " distanceh = 0.25\n", " cline += distanceh\n", "\n", " # calculate height needed height of an image\n", " minnotsignificant = max(2 * 0.2, linesblank)\n", " height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant\n", "\n", " fig = plt.figure(figsize=(width, height))\n", " fig.set_facecolor('white')\n", " ax = fig.add_axes([0, 0, 1, 1]) # reverse y axis\n", " ax.set_axis_off()\n", "\n", " hf = 1. / height # height factor\n", " wf = 1. / width\n", "\n", " def hfl(l):\n", " return [a * hf for a in l]\n", "\n", " def wfl(l):\n", " return [a * wf for a in l]\n", "\n", "\n", " # Upper left corner is (0,0).\n", " ax.plot([0, 1], [0, 1], c=\"w\")\n", " ax.set_xlim(0, 1)\n", " ax.set_ylim(1, 0)\n", "\n", " def line(l, color='k', **kwargs):\n", " \"\"\"\n", " Input is a list of pairs of points.\n", " \"\"\"\n", " ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)\n", "\n", " def text(x, y, s, *args, **kwargs):\n", " ax.text(wf * x, hf * y, s, fontsize = 14, *args, **kwargs)\n", "\n", " line([(textspace, cline), (width - textspace, cline)], linewidth=0.7)\n", "\n", " bigtick = 0.1\n", " smalltick = 0.05\n", "\n", " tick = None\n", " for a in list(np.arange(lowv, highv, 0.5)) + [highv]:\n", " tick = smalltick\n", " if a == int(a):\n", " tick = bigtick\n", " line([(rankpos(a), cline - tick / 2),\n", " (rankpos(a), cline)],\n", " linewidth=0.7)\n", "\n", " for a in range(lowv, highv + 1):\n", " text(rankpos(a), cline - tick / 2 - 0.05, str(a),\n", " ha=\"center\", va=\"bottom\")\n", "\n", " k = len(ssums)\n", "\n", " for i in range(math.ceil(k / 2)):\n", " chei = cline + minnotsignificant + i * 0.2\n", " line([(rankpos(ssums[i]), cline),\n", " (rankpos(ssums[i]), chei),\n", " (textspace - 0.1, chei)],\n", " linewidth=0.7)\n", " text(textspace - 0.2, chei, nnames[i], ha=\"right\", va=\"center\")\n", "\n", " for i in range(math.ceil(k / 2), k):\n", " chei = cline + minnotsignificant + (k - i - 1) * 0.2\n", " line([(rankpos(ssums[i]), cline),\n", " (rankpos(ssums[i]), chei),\n", " (textspace + scalewidth + 0.1, chei)],\n", " linewidth=0.7)\n", " text(textspace + scalewidth + 0.2, chei, nnames[i],\n", " ha=\"left\", va=\"center\")\n", "\n", " if cd and cdmethod is None:\n", " # upper scale\n", " if not reverse:\n", " begin, end = rankpos(lowv), rankpos(lowv + cd)\n", " else:\n", " begin, end = rankpos(highv), rankpos(highv - cd)\n", "\n", " line([(begin, distanceh), (end, distanceh)], linewidth=0.7)\n", " line([(begin, distanceh + bigtick / 2),\n", " (begin, distanceh - bigtick / 2)],\n", " linewidth=0.7)\n", " line([(end, distanceh + bigtick / 2),\n", " (end, distanceh - bigtick / 2)],\n", " linewidth=0.7)\n", " text((begin + end) / 2, distanceh - 0.05, \"CD\",\n", " ha=\"center\", va=\"bottom\")\n", "\n", " # no-significance lines\n", " def draw_lines(lines, side=0.05, height=0.1):\n", " start = cline + 0.2\n", " for l, r in lines:\n", " line([(rankpos(ssums[l]) - side, start),\n", " (rankpos(ssums[r]) + side, start)],\n", " linewidth=2.5)\n", " start += height\n", "\n", " draw_lines(lines)\n", "\n", " elif cd:\n", " begin = rankpos(avranks[cdmethod] - cd)\n", " end = rankpos(avranks[cdmethod] + cd)\n", " line([(begin, cline), (end, cline)],\n", " linewidth=2.5)\n", " line([(begin, cline + bigtick / 2),\n", " (begin, cline - bigtick / 2)],\n", " linewidth=2.5)\n", " line([(end, cline + bigtick / 2),\n", " (end, cline - bigtick / 2)],\n", " linewidth=2.5)\n", "\n", " if filename:\n", " print_figure(fig, filename, **kwargs)\n", "\n", "def train_evaluate_model(clf, X_train, y_train, X_test, y_test, clf_name='Classifier'):\n", " clf.fit(X_train, y_train)\n", " y_pred = clf.predict(X_test)\n", " \n", " accuracy = accuracy_score(y_test, y_pred)\n", " precision = precision_score(y_test, y_pred, average='weighted')\n", " recall = recall_score(y_test, y_pred, average='weighted')\n", " f1 = f1_score(y_test, y_pred, average='weighted')\n", " conf_matrix = confusion_matrix(y_test, y_pred)\n", " \n", " if hasattr(clf, 'predict_proba'):\n", " y_score = clf.predict_proba(X_test)[:, 1]\n", " else:\n", " y_score = clf.decision_function(X_test)\n", " \n", " fpr, tpr, _ = roc_curve(y_test, y_score)\n", " roc_auc = auc(fpr, tpr)\n", " \n", " print(f'{clf_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')\n", " return accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc" ] }, { "cell_type": "code", "execution_count": null, "id": "3e602f4d-efb2-4ac9-99bb-c6fbeca791cc", "metadata": { "jupyter": { "source_hidden": true } }, "outputs": [], "source": [ "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "id": "773b2524-af11-464f-97a9-f4add85be0d2", "metadata": {}, "source": [ "### Training (classic/static)\n", "In order to run classical/static, make sure to uncomment the one you need. \"Post Training\" is after one of these classical/static is done." ] }, { "cell_type": "markdown", "id": "f5b8873c-13d3-43b7-8902-14b73e8d5409", "metadata": {}, "source": [ "#### Classical Classifiers" ] }, { "cell_type": "code", "execution_count": null, "id": "264648a0-b0d8-4a63-9167-cea3a4ed1e18", "metadata": {}, "outputs": [], "source": [ "# Optimized Classifiers\n", "# classifiers = {\n", "# 'DT': DecisionTreeClassifier(\n", "# random_state=0, \n", "# criterion='gini', \n", "# max_depth=6, \n", "# min_samples_leaf=10, \n", "# min_samples_split=9\n", "# ),\n", "# 'LR': LogisticRegression(\n", "# random_state=0, \n", "# C=0.09659168435718246, \n", "# max_iter=100, \n", "# solver='lbfgs'\n", "# ),\n", "# 'NB': GaussianNB(\n", "# var_smoothing=0.0058873326349240295\n", "# ),\n", "# 'KN': KNeighborsClassifier(\n", "# metric='manhattan', \n", "# n_neighbors=8, \n", "# weights='uniform'\n", "# ),\n", "# 'MLP': MLPClassifier(\n", "# random_state=0, \n", "# max_iter=1000, \n", "# alpha=0.0003079393718075164, \n", "# hidden_layer_sizes=195, \n", "# learning_rate_init=0.0001675266159417717\n", "# ),\n", "# 'SVC': SVC(probability=True, kernel = 'rbf', C = 0.95, gamma = 'scale')}\n", "\n", "# Default classifiers\n", "# classifiers = {\n", "# 'DecisionTree': DecisionTreeClassifier(random_state=0),\n", "# 'LogisticRegression': LogisticRegression(max_iter=1000, random_state=0),\n", "# 'NaiveBayes': GaussianNB(),\n", "# 'KNeighbors': KNeighborsClassifier(),\n", "# 'MLP': MLPClassifier(max_iter=1000, random_state=0),\n", "# 'SVC': SVC(probability=True, random_state=0)\n", "# }\n", "\n", "# Main\n", "# Initialize\n", "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n", "conf_matrices = defaultdict(list)\n", "roc_curves = defaultdict(list)\n", "roc_aucs = defaultdict(list)\n", "accuracy_scores = defaultdict(list)\n", "precision_scores = defaultdict(list)\n", "recall_scores = defaultdict(list)\n", "f1_scores = defaultdict(list)\n", "\n", "# Loop over 10 different random states\n", "for random_state in range(10):\n", " print(f\"Processing for Random State: {random_state}\")\n", "\n", " # Splitting the data\n", " X = splitted_dataset.drop('depression_category', axis=1)\n", " y = splitted_dataset['depression_category']\n", " \n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n", " \n", " # Identify outliers in the training dataset\n", " lof = LocalOutlierFactor()\n", " yhat = lof.fit_predict(X_train)\n", " # Select all rows that are not outliers\n", " mask = yhat != -1\n", " X_train, y_train = X_train[mask], y_train[mask]\n", " \n", " original_columns = X.columns.tolist()\n", "\n", " # SMOTE\n", " smote = SMOTE(random_state=random_state)\n", " X_res, y_res = smote.fit_resample(X_train, y_train)\n", "\n", " print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n", "\n", " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n", " \n", " sampling_strategy_undersample = {0: 372}\n", " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n", " X_test, y_test = rus.fit_resample(X_test, y_test)\n", " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n", "\n", " # Normalization\n", " scaler = MinMaxScaler()\n", " \n", " X_res = scaler.fit_transform(X_res)\n", " X_test = scaler.transform(X_test)\n", " \n", " X_res = pd.DataFrame(X_res, columns=original_columns)\n", " X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", " # Correlation Feat Analysis\n", " corr_df = X_res.copy()\n", " corr_df['target'] = y_res\n", " \n", " corr_mat = corr_df.corr()\n", " target_correlation = corr_mat['target'].drop('target')\n", " top_features = target_correlation.abs().sort_values(ascending=False).head(200).index.tolist()\n", " \n", " # Only take top features\n", " X_res_fi = X_res[top_features]\n", " X_test_fi = X_test[top_features]\n", "\n", " # Evaluate classifiers\n", " for clf_name, clf in classifiers.items():\n", " # Ensure the random state for classifiers is consistent\n", " if hasattr(clf, 'random_state'):\n", " clf.set_params(random_state=random_state)\n", " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n", " metric_sums[clf_name]['accuracy'] += accuracy\n", " metric_sums[clf_name]['precision'] += precision\n", " metric_sums[clf_name]['recall'] += recall\n", " metric_sums[clf_name]['f1'] += f1\n", " conf_matrices[clf_name].append(conf_matrix)\n", " roc_curves[clf_name].append((fpr, tpr))\n", " roc_aucs[clf_name].append(roc_auc)\n", " accuracy_scores[clf_name].append(accuracy)\n", " precision_scores[clf_name].append(precision)\n", " recall_scores[clf_name].append(recall)\n", " f1_scores[clf_name].append(f1)" ] }, { "cell_type": "markdown", "id": "0ec7e67e-a086-48a7-83de-26b54ef03899", "metadata": {}, "source": [ "#### Static Classifiers" ] }, { "cell_type": "code", "execution_count": null, "id": "e9e20dd2-bdd8-4626-be89-8bb866212de9", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Initialize\n", "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n", "conf_matrices = defaultdict(list)\n", "roc_curves = defaultdict(list)\n", "roc_aucs = defaultdict(list)\n", "accuracy_scores = defaultdict(list)\n", "precision_scores = defaultdict(list)\n", "recall_scores = defaultdict(list)\n", "f1_scores = defaultdict(list)\n", "\n", "# Optimized Classifiers\n", "classifiers = {\n", " 'RF': RandomForestClassifier(n_estimators=143, criterion='entropy', max_depth=15, random_state=0),\n", " 'XGB': XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n", " 'GB': GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.05),\n", " # 'AB': AdaBoostClassifier(n_estimators=400, learning_rate=0.1),\n", " # 'CB': CatBoostClassifier(depth = 3, iterations = 168, learning_rate = 0.1, verbose = 0),\n", " # 'LGBM': LGBMClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200) \n", "}\n", "\n", "# Default Classifiers\n", "# classifiers = {\n", "# 'RandomForest': RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=7, random_state=0),\n", "# 'XGBoost': XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n", "# 'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=0),\n", "# 'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=0),\n", "# 'CatBoost': CatBoostClassifier(n_estimators=100, verbose=0, random_state=0),\n", "# 'LightGBM': LGBMClassifier(n_estimators=100, random_state=0)\n", "# }\n", "\n", "# voting_clf = VotingClassifier(estimators=[\n", "# ('rf', classifiers['RF']),\n", "# ('xgb', classifiers['XGB']),\n", "# ('gb', classifiers['GB']),\n", "# ('ada', classifiers['AB']),\n", "# ('cat', classifiers['CB']),\n", "# ('lgbm', classifiers['LGBM'])\n", "# ], voting='soft', n_jobs=1)\n", "\n", "# classifiers['Vot'] = voting_clf\n", "\n", "# Define the number of features for each classifier\n", "num_features = {\n", " 'RF': 150,\n", " 'XGB': 150,\n", " 'GB': 150,\n", " # 'AB': 150,\n", " # 'CB': 150,\n", " # 'LGBM': 150,\n", " # 'Vot': 150\n", "}\n", "\n", "for random_state in range(10):\n", " print(f\"Processing for Random State: {random_state}\")\n", "\n", " X = splitted_dataset.drop('depression_category', axis=1)\n", " y = splitted_dataset['depression_category']\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n", " \n", " lof = LocalOutlierFactor()\n", " yhat = lof.fit_predict(X_train)\n", " mask = yhat != -1\n", " X_train, y_train = X_train[mask], y_train[mask]\n", " \n", " original_columns = X.columns.tolist()\n", "\n", " smote = SMOTE(random_state=random_state)\n", " X_res, y_res = smote.fit_resample(X_train, y_train)\n", "\n", " print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n", "\n", " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n", " \n", " sampling_strategy_undersample = {0: 372}\n", " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n", " X_test, y_test = rus.fit_resample(X_test, y_test)\n", "\n", " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n", "\n", " scaler = MinMaxScaler()\n", " \n", " X_res = scaler.fit_transform(X_res)\n", " X_test = scaler.transform(X_test)\n", " \n", " X_res = pd.DataFrame(X_res, columns=original_columns)\n", " X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", " log_reg = LogisticRegression(C=0.09659168435718246, max_iter=100, solver='lbfgs', random_state=random_state)\n", " log_reg.fit(X_res, y_res)\n", " selector = SelectFromModel(log_reg, prefit=True)\n", " \n", " importance = np.abs(log_reg.coef_[0])\n", " indices = np.argsort(importance)[::-1]\n", " important_features = [original_columns[i] for i in indices]\n", " \n", " for clf_name, clf in classifiers.items():\n", " num_top_features = num_features[clf_name]\n", " selected_features = important_features[:num_top_features]\n", " \n", " X_res_fi = pd.DataFrame(X_res, columns=original_columns)[selected_features]\n", " X_test_fi = pd.DataFrame(X_test, columns=original_columns)[selected_features]\n", "\n", " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(\n", " clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name\n", " )\n", " metric_sums[clf_name]['accuracy'] += accuracy\n", " metric_sums[clf_name]['precision'] += precision\n", " metric_sums[clf_name]['recall'] += recall\n", " metric_sums[clf_name]['f1'] += f1\n", " conf_matrices[clf_name].append(conf_matrix)\n", " roc_curves[clf_name].append((fpr, tpr))\n", " roc_aucs[clf_name].append(roc_auc)\n", " accuracy_scores[clf_name].append(accuracy)\n", " precision_scores[clf_name].append(precision)\n", " recall_scores[clf_name].append(recall)\n", " f1_scores[clf_name].append(f1)" ] }, { "cell_type": "markdown", "id": "c0158f58-e6d5-4adf-90e3-f1892294ad24", "metadata": {}, "source": [ "### Post Training (classic/static)\n", "Only run after one of the training methods above are done" ] }, { "cell_type": "code", "execution_count": null, "id": "c2ab5543-ad6e-4463-8d9a-35fe3e4e8eb4", "metadata": {}, "outputs": [], "source": [ "print('\\nAverage Metrics over 10 Random States:')\n", "for clf_name, metrics in metric_sums.items():\n", " avg_accuracy = metrics['accuracy'] / 10\n", " avg_precision = metrics['precision'] / 10\n", " avg_recall = metrics['recall'] / 10\n", " avg_f1 = metrics['f1'] / 10\n", " std_accuracy = np.std(accuracy_scores[clf_name])\n", " std_precision = np.std(precision_scores[clf_name])\n", " std_recall = np.std(recall_scores[clf_name])\n", " std_f1 = np.std(f1_scores[clf_name])\n", " avg_auc = np.mean(roc_aucs[clf_name])\n", " print(f'{clf_name} - Accuracy: {avg_accuracy:.4f} ± {std_accuracy:.4f}, Precision: {avg_precision:.4f} ± {std_precision:.4f}, Recall: {avg_recall:.4f} ± {std_recall:.4f}, F1-Score: {avg_f1:.4f} ± {std_f1:.4f}, AUC: {avg_auc:.4f}')" ] }, { "cell_type": "code", "execution_count": null, "id": "21d5c872-3452-41ca-8f7c-5f4ceb184fba", "metadata": {}, "outputs": [], "source": [ "# Plot ROC Curves for each classifier in one graph\n", "plot_combined_roc_curve(roc_curves, classifiers.keys())" ] }, { "cell_type": "code", "execution_count": null, "id": "26f7a715-efc8-44a6-8c53-e59b6268e551", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# FN Curve\n", "df = pd.DataFrame(accuracy_scores)\n", "scores = [df[col].values for col in df.columns]\n", "stat, p = friedmanchisquare(*scores)\n", "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n", "ranks = df.rank(axis=1, method='average', ascending=False)\n", "average_ranks = ranks.mean().values\n", "n_datasets = df.shape[0]\n", "alpha = 0.05\n", "cd = compute_CD(average_ranks, n_datasets, alpha='0.05')\n", "print(f'Critical Difference: {cd}')\n", "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n", "plt.figure(figsize=(14, 8))\n", "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n", "plt.text(0.5, 1.19, f'Friedman-Nemenyi: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=14)\n", "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=14)\n", "plt.tight_layout()" ] }, { "cell_type": "markdown", "id": "88072d69-8bb5-46ab-9c79-6990db7cc8d2", "metadata": {}, "source": [ "### Hyperparameter optimization (classic/static)" ] }, { "cell_type": "code", "execution_count": null, "id": "3a4a1546-c24f-4349-bf1b-75ba937700be", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Hyperparameter optimization classic\n", "search_spaces = {\n", " 'DecisionTree': {\n", " 'criterion': Categorical(['gini', 'entropy']),\n", " 'max_depth': Integer(1, 20),\n", " 'min_samples_split': Integer(2, 10),\n", " 'min_samples_leaf': Integer(1, 10)\n", " },\n", " 'LogisticRegression': {\n", " 'C': Real(1e-6, 1e+6, prior='log-uniform'),\n", " 'solver': Categorical(['lbfgs', 'liblinear']),\n", " 'max_iter': Integer(100, 1000)\n", " },\n", " 'NaiveBayes': {\n", " 'var_smoothing': Real(1e-9, 1e-2, prior='log-uniform')\n", " },\n", " 'KNeighbors': {\n", " 'n_neighbors': Integer(1, 30),\n", " 'weights': Categorical(['uniform', 'distance']),\n", " 'metric': Categorical(['euclidean', 'manhattan', 'minkowski'])\n", " },\n", " 'MLP': {\n", " 'hidden_layer_sizes': Integer(50, 200),\n", " 'alpha': Real(1e-6, 1e-2, prior='log-uniform'),\n", " 'learning_rate_init': Real(1e-4, 1e-2, prior='log-uniform')\n", " },\n", " 'SVC': {\n", " 'C': [0.1, 1, 10, 100, 1000],\n", " 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n", " 'kernel': ['rbf']\n", " }\n", "}\n", "\n", "classifiers = {\n", " 'DecisionTree': DecisionTreeClassifier(random_state=0),\n", " 'LogisticRegression': LogisticRegression(max_iter=1000, random_state=0),\n", " 'NaiveBayes': GaussianNB(),\n", " 'KNeighbors': KNeighborsClassifier(),\n", " 'MLP': MLPClassifier(max_iter=1000, random_state=0),\n", " 'SVC': SVC(probability=True, random_state=0)\n", "}\n", "\n", "top_features_count = {\n", " 'DecisionTree': 200,\n", " 'LogisticRegression': 200,\n", " 'NaiveBayes': 200,\n", " 'KNeighbors': 200,\n", " 'MLP': 200,\n", " 'SVC': 200\n", "}\n", "\n", "random_state = 0\n", "print(f\"Processing for Random State: {random_state}\")\n", "\n", "X = splitted_dataset.drop('depression_category', axis=1)\n", "y = splitted_dataset['depression_category']\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n", "\n", "lof = LocalOutlierFactor()\n", "yhat = lof.fit_predict(X_train)\n", "mask = yhat != -1\n", "X_train, y_train = X_train[mask], y_train[mask]\n", "\n", "original_columns = X.columns.tolist()\n", "\n", "smote = SMOTE(random_state=random_state)\n", "X_res, y_res = smote.fit_resample(X_train, y_train)\n", "\n", "print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n", "\n", "print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n", "\n", "sampling_strategy_undersample = {0: 372}\n", "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n", "X_test, y_test = rus.fit_resample(X_test, y_test)\n", "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n", "\n", "scaler = MinMaxScaler()\n", "\n", "X_res = scaler.fit_transform(X_res)\n", "X_test = scaler.transform(X_test)\n", "\n", "X_res = pd.DataFrame(X_res, columns=original_columns)\n", "X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", "corr_df = X_res.copy()\n", "corr_df['target'] = y_res\n", "\n", "corr_mat = corr_df.corr()\n", "target_correlation = corr_mat['target'].drop('target')\n", "\n", "for clf_name, clf in classifiers.items():\n", " print(f\"Optimizing {clf_name}\")\n", " \n", " top_features = target_correlation.abs().sort_values(ascending=False).head(top_features_count[clf_name]).index.tolist()\n", " \n", " X_res_fi = X_res[top_features]\n", " X_test_fi = X_test[top_features]\n", " \n", " opt = BayesSearchCV(clf, search_spaces[clf_name], n_iter=30, cv=3, random_state=random_state, n_jobs=-1, verbose = 30)\n", " opt.fit(X_res_fi, y_res)\n", " \n", " best_clf = opt.best_estimator_\n", " best_params = opt.best_params_\n", "\n", " print(f\"Best parameters for {clf_name}: {best_params}\")\n", " \n", " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(best_clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n", " print(f\"Best results for {clf_name}:\")\n", " print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, AUC: {roc_auc:.4f}')\n", " print(conf_matrix)\n", " print() " ] }, { "cell_type": "code", "execution_count": null, "id": "2f473cc7-577d-4dcd-a8b1-be7f33200fbc", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Hyperparameter optimization static\n", "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n", "conf_matrices = defaultdict(list)\n", "accuracy_scores = defaultdict(list)\n", "precision_scores = defaultdict(list)\n", "recall_scores = defaultdict(list)\n", "f1_scores = defaultdict(list)\n", "\n", "classifiers = {\n", " 'RandomForest': RandomForestClassifier(),\n", " 'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),\n", " 'AdaBoost': AdaBoostClassifier(),\n", " 'GradientBoosting': GradientBoostingClassifier(),\n", " 'CatBoost': CatBoostClassifier(verbose=0),\n", " 'LightGBM': LGBMClassifier()\n", "}\n", "\n", "num_features = {\n", " 'RandomForest': 150,\n", " 'XGBoost': 150,\n", " 'GradientBoosting': 150,\n", " 'AdaBoost': 150,\n", " 'CatBoost': 150,\n", " 'LightGBM': 150,\n", "}\n", "\n", "search_spaces = {\n", " 'RandomForest': {\n", " 'n_estimators': [100, 200, 300],\n", " 'criterion': ['gini', 'entropy'],\n", " 'max_depth': [None, 7, 15],\n", " 'bootstrap': [True, False]\n", " },\n", " 'XGBoost': {\n", " 'n_estimators': [100, 200, 300],\n", " 'max_depth': [5, 10],\n", " 'learning_rate': [0.01, 0.1, 0.2],\n", " 'gamma': [0, 0.2, 0.4],\n", " },\n", " 'GradientBoosting': {\n", " 'n_estimators': [100, 200, 300],\n", " 'learning_rate': [0.01, 0.1, 0.2],\n", " 'max_depth': [5, 10],\n", " 'subsample': [0.7, 0.9, 1.0],\n", " },\n", " 'AdaBoost': {\n", " 'n_estimators': [100, 200, 300],\n", " 'learning_rate': [0.1, 0.5, 1.0],\n", " 'algorithm': ['SAMME', 'SAMME.R']\n", " },\n", " 'CatBoost': {\n", " 'iterations': [100, 200, 300],\n", " 'depth': [5, 7, 9],\n", " 'learning_rate': [0.01, 0.1, 0.2],\n", " },\n", " 'LightGBM': {\n", " 'n_estimators': [100, 200, 300],\n", " 'num_leaves': [31, 63, 127],\n", " 'learning_rate': [0.01, 0.1, 0.2],\n", " 'subsample': [0.7, 0.9, 1.0],\n", " }\n", "}\n", "\n", "def hyperparameter_optimization(clf, search_space, X, y):\n", " combined_results = []\n", " for random_state in range(3):\n", " cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)\n", " opt = BayesSearchCV(clf, search_space, n_iter=30, cv=cv, random_state=random_state, n_jobs=-1, verbose=0)\n", " opt.fit(X, y)\n", " combined_results.append(opt.best_params_)\n", " best_params = pd.DataFrame(combined_results).mode().iloc[0].to_dict()\n", " return best_params\n", "\n", "for random_state in range(9,10):\n", " print(f\"Processing for Random State: {random_state}\")\n", "\n", " X = splitted_dataset.drop('depression_category', axis=1)\n", " y = splitted_dataset['depression_category']\n", " \n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n", " \n", " lof = LocalOutlierFactor()\n", " yhat = lof.fit_predict(X_train)\n", " mask = yhat != -1\n", " X_train, y_train = X_train[mask], y_train[mask]\n", " \n", " original_columns = X.columns.tolist()\n", "\n", " smote = SMOTE(random_state=random_state)\n", " X_res, y_res = smote.fit_resample(X_train, y_train)\n", "\n", " print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n", "\n", " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n", " \n", " sampling_strategy_undersample = {0: 372}\n", " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n", " X_test, y_test = rus.fit_resample(X_test, y_test)\n", "\n", " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n", "\n", " scaler = MinMaxScaler()\n", " \n", " X_res = scaler.fit_transform(X_res)\n", " X_test = scaler.transform(X_test)\n", " \n", " X_res = pd.DataFrame(X_res, columns=original_columns)\n", " X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", " log_reg = LogisticRegression(C=0.09659168435718246, max_iter=100, solver='lbfgs', random_state=random_state)\n", " log_reg.fit(X_res, y_res)\n", " selector = SelectFromModel(log_reg, prefit=True)\n", " \n", " importance = np.abs(log_reg.coef_[0])\n", " indices = np.argsort(importance)[::-1]\n", " important_features = [original_columns[i] for i in indices[:300]]\n", "\n", " for clf_name, clf in classifiers.items():\n", " print(f\"Optimizing {clf_name}\")\n", " num_top_features = num_features[clf_name]\n", " selected_features = important_features[:num_top_features]\n", " \n", " X_res_fi = pd.DataFrame(X_res, columns=original_columns)[selected_features]\n", " \n", " best_params = hyperparameter_optimization(clf, search_spaces[clf_name], X_res_fi, y_res)\n", " if 'n_estimators' in best_params:\n", " best_params['n_estimators'] = int(best_params['n_estimators'])\n", " if 'max_depth' in best_params:\n", " best_params['max_depth'] = int(best_params['max_depth'])\n", " if 'iterations' in best_params:\n", " best_params['iterations'] = int(best_params['iterations'])\n", " clf.set_params(**best_params)\n", " print(f\"Best parameters for {clf_name}: {best_params}\")\n", "\n", " X_test_fi = pd.DataFrame(X_test, columns=original_columns)[selected_features]\n", " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n", " metric_sums[clf_name]['accuracy'] += accuracy\n", " metric_sums[clf_name]['precision'] += precision\n", " metric_sums[clf_name]['recall'] += recall\n", " metric_sums[clf_name]['f1'] += f1\n", " conf_matrices[clf_name].append(conf_matrix)\n", " accuracy_scores[clf_name].append(accuracy)\n", " precision_scores[clf_name].append(precision)\n", " recall_scores[clf_name].append(recall)\n", " f1_scores[clf_name].append(f1)" ] }, { "cell_type": "markdown", "id": "65df93d4-12d3-4d68-b402-633354849dff", "metadata": {}, "source": [ "### DES Training (all)" ] }, { "cell_type": "code", "execution_count": null, "id": "709bb1f8-249e-4409-a537-c6bbe9a399f3", "metadata": { "scrolled": true }, "outputs": [], "source": [ "metric_sums_des = {\n", " 'KNORAE': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'KNORAU': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'KNOP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'DESMI': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'METADES': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'DESKNN': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'DESP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'FIRE-KNORA-U': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'FIRE-KNORA-E': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'FIRE-METADES': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'FIRE-DESKNN': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'FIRE-DESP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", " 'FIRE-KNOP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n", "}\n", "\n", "conf_matrices_des = {\n", " 'KNORAE': [],\n", " 'KNORAU': [],\n", " 'KNOP': [],\n", " 'DESMI': [],\n", " 'METADES': [],\n", " 'DESKNN': [],\n", " 'DESP': [],\n", " 'FIRE-KNORA-U': [],\n", " 'FIRE-KNORA-E': [],\n", " 'FIRE-METADES': [],\n", " 'FIRE-DESKNN': [],\n", " 'FIRE-DESP': [],\n", " 'FIRE-KNOP': [],\n", "}\n", "\n", "roc_curves = defaultdict(list)\n", "roc_aucs = defaultdict(list)\n", "accuracy_scores = defaultdict(list)\n", "precision_scores = defaultdict(list)\n", "recall_scores = defaultdict(list)\n", "f1_scores = defaultdict(list)\n", "feature_importance_runs = []\n", "\n", "# Uncomment wanted combinations\n", "base_classifiers = {\n", " # 'DecisionTree': DecisionTreeClassifier(\n", " # random_state=0, \n", " # criterion='gini', \n", " # max_depth=6, \n", " # min_samples_leaf=10, \n", " # min_samples_split=9\n", " # ),\n", " # 'LogisticRegression': LogisticRegression(\n", " # random_state=0, \n", " # C=0.09659168435718246, \n", " # max_iter=100, \n", " # solver='lbfgs'\n", " # ),\n", " # 'NaiveBayes': GaussianNB(\n", " # var_smoothing=0.0058873326349240295\n", " # ),\n", " # 'KNeighbors': KNeighborsClassifier(\n", " # metric='manhattan', \n", " # n_neighbors=15, \n", " # weights='uniform'\n", " # ),\n", " # 'MLP': MLPClassifier(\n", " # random_state=0, \n", " # max_iter=1000, \n", " # alpha=0.0003079393718075164, \n", " # hidden_layer_sizes=195, \n", " # learning_rate_init=0.0001675266159417717\n", " # ),\n", " # 'SVC': SVC(probability=True, kernel = 'rbf', C = 1.5, gamma = 'auto'),\n", " # 'RF': RandomForestClassifier(n_estimators=143, criterion='entropy', max_depth=15, random_state=0),\n", " 'XGB': XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n", " 'GB': GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.05),\n", " 'AB': AdaBoostClassifier(n_estimators=400, learning_rate=0.1),\n", " # 'CB': CatBoostClassifier(depth = 3, iterations = 168, learning_rate = 0.1, verbose = 0),\n", " # 'LGBM': LGBMClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200) \n", "}\n", "\n", "random_state = 0\n", "\n", "for random_state in range(10):\n", " print(f\"Processing for Random State: {random_state}\")\n", "\n", " X = splitted_dataset.drop('depression_category', axis=1)\n", " y = splitted_dataset['depression_category']\n", " \n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n", " \n", " lof = LocalOutlierFactor()\n", " yhat = lof.fit_predict(X_train)\n", " mask = yhat != -1\n", " X_train, y_train = X_train[mask], y_train[mask]\n", " \n", " original_columns = X.columns.tolist()\n", "\n", " smote = SMOTE(random_state=random_state)\n", " X_res, y_res = smote.fit_resample(X_train, y_train)\n", "\n", " print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n", " sampling_strategy_undersample = {0: 372}\n", " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n", " X_test, y_test = rus.fit_resample(X_test, y_test) \n", "\n", " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n", "\n", " scaler = MinMaxScaler()\n", " X_res = scaler.fit_transform(X_res)\n", " X_test = scaler.transform(X_test)\n", " \n", " X_res = pd.DataFrame(X_res, columns=original_columns)\n", " X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", " xgb_fs = XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=random_state)\n", " xgb_fs.fit(X_res, y_res)\n", "\n", " feature_importances = xgb_fs.feature_importances_\n", " indices = np.argsort(feature_importances)[::-1]\n", " top_50_features = [original_columns[i] for i in indices[:50]]\n", " current_run_features = {original_columns[i]: feature_importances[i] for i in indices[:50]}\n", " \n", " feature_importance_runs.append(current_run_features)\n", "\n", " X_res_fi = X_res[top_50_features]\n", " X_test_fi = X_test[top_50_features]\n", " \n", " model_pool = list(base_classifiers.values())\n", " \n", " for clf in model_pool:\n", " clf.fit(X_res_fi, y_res)\n", " \n", " des_models = {\n", " 'KNORAE': KNORAE(pool_classifiers=model_pool, random_state=random_state),\n", " 'KNORAU': KNORAU(pool_classifiers=model_pool, random_state=random_state),\n", " 'DESMI': DESMI(pool_classifiers=model_pool, random_state=random_state),\n", " 'METADES': METADES(pool_classifiers=model_pool, random_state=random_state),\n", " 'DESKNN': DESKNN(pool_classifiers=model_pool, random_state=random_state),\n", " 'DESP': DESP(pool_classifiers=model_pool, random_state=random_state),\n", " 'KNOP': KNOP(pool_classifiers=model_pool, random_state=random_state, k=9),\n", " 'FIRE-KNORA-U': KNORAU(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n", " 'FIRE-KNORA-E': KNORAE(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n", " 'FIRE-METADES': METADES(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n", " 'FIRE-DESKNN': DESKNN(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n", " 'FIRE-DESP': DESP(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n", " 'FIRE-KNOP': KNOP(pool_classifiers=model_pool, DFP=True, k=40, random_state = random_state)\n", " }\n", "\n", " for des_name, des_model in des_models.items():\n", " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(\n", " des_model, X_res_fi, y_res, X_test_fi, y_test, clf_name=des_name\n", " )\n", " metric_sums_des[des_name]['accuracy'] += accuracy\n", " metric_sums_des[des_name]['precision'] += precision\n", " metric_sums_des[des_name]['recall'] += recall\n", " metric_sums_des[des_name]['f1'] += f1\n", " conf_matrices_des[des_name].append(conf_matrix)\n", " roc_curves[des_name].append((fpr, tpr))\n", " roc_aucs[des_name].append(roc_auc)\n", " accuracy_scores[des_name].append(accuracy)\n", " precision_scores[des_name].append(precision)\n", " recall_scores[des_name].append(recall)\n", " f1_scores[des_name].append(f1)\n", "\n", " print(f'Confusion Matrix for {des_name} at Random State {random_state}:\\n{conf_matrix}\\n')" ] }, { "cell_type": "code", "execution_count": null, "id": "34b0ead4-1d8c-4d0a-b0c7-71cd71f6ab7d", "metadata": {}, "outputs": [], "source": [ "def plot_combined_roc_curve(roc_curves, classifier_names):\n", " plt.figure(figsize=(12, 8))\n", " mean_fpr = np.linspace(0, 1, 100)\n", " colors = plt.cm.get_cmap('tab20', len(classifier_names))\n", " \n", " for i, clf_name in enumerate(classifier_names):\n", " tprs = []\n", " for fpr, tpr in roc_curves[clf_name]:\n", " tprs.append(np.interp(mean_fpr, fpr, tpr))\n", " mean_tpr = np.mean(tprs, axis=0)\n", " mean_tpr[-1] = 1.0\n", " mean_auc = auc(mean_fpr, mean_tpr)\n", " plt.plot(mean_fpr, mean_tpr, color=colors(i), lw=2, linestyle='-', marker='o', markersize=4, \n", " label=f'{clf_name} (AUC = {mean_auc:.3f})')\n", "\n", " plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')\n", " plt.xlim([0.0, 1.0])\n", " plt.ylim([0.0, 1.05])\n", " plt.xlabel('False Positive Rate', fontsize=26)\n", " plt.ylabel('True Positive Rate', fontsize=26)\n", " plt.xticks(fontsize=30) # Increase x-axis numbers font size\n", " plt.yticks(fontsize=30) # Increase y-axis numbers font size\n", " plt.legend(loc=\"center left\", bbox_to_anchor=(1.05, 0.5), fontsize=26, frameon=True, framealpha=0.9) # Place legend beside the plot\n", " plt.grid(True)\n", "\n", " filename='bonk.svg'\n", "\n", " plt.savefig(filename, format='svg', bbox_inches = 'tight')\n", " plt.show()\n", "\n", " display(FileLink(filename))" ] }, { "cell_type": "code", "execution_count": null, "id": "4cfd8d73-26fd-4dbc-a6ca-92a9643e1d27", "metadata": {}, "outputs": [], "source": [ "print('\\nAverage Metrics over 10 Random States:')\n", "for des_name, metrics in metric_sums_des.items():\n", " avg_accuracy = metrics['accuracy'] / 10\n", " avg_precision = metrics['precision'] / 10\n", " avg_recall = metrics['recall'] / 10\n", " avg_f1 = metrics['f1'] / 10\n", " std_accuracy = np.std(accuracy_scores[des_name])\n", " std_precision = np.std(precision_scores[des_name])\n", " std_recall = np.std(recall_scores[des_name])\n", " std_f1 = np.std(f1_scores[des_name])\n", " avg_auc = np.mean(roc_aucs[des_name])\n", " print(f'{des_name} - Accuracy: {avg_accuracy:.4f} ± {std_accuracy:.4f}, Precision: {avg_precision:.4f} ± {std_precision:.4f}, Recall: {avg_recall:.4f} ± {std_recall:.4f}, F1-Score: {avg_f1:.4f} ± {std_f1:.4f}, AUC: {avg_auc:.4f}')\n", "\n", "plot_combined_roc_curve(roc_curves, list(des_models.keys()))" ] }, { "cell_type": "code", "execution_count": null, "id": "32780b70-dae5-4f7d-9ae4-128dc782fad4", "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(accuracy_scores)\n", "scores = [df[col].values for col in df.columns]\n", "\n", "stat, p = friedmanchisquare(*scores)\n", "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n", "\n", "ranks = df.rank(axis=1, method='average', ascending=False)\n", "average_ranks = ranks.mean().values\n", "\n", "n_datasets = df.shape[0]\n", "alpha = 0.05\n", "\n", "cd = compute_CD(average_ranks, n_datasets, alpha='0.05')\n", "print(f'Critical Difference: {cd}')\n", "\n", "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n", "\n", "plt.figure(figsize=(14, 10))\n", "\n", "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n", "plt.xlabel('Classifiers')\n", "\n", "plt.text(0.5, 1.19, f'Friedman-Nemenyi: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n", "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n", "\n", "plt.tight_layout()" ] }, { "cell_type": "markdown", "id": "f25a2b29-129f-4314-b2f9-8aff9615d5d7", "metadata": {}, "source": [ "### Shap (will mostly be exported files)" ] }, { "cell_type": "code", "execution_count": null, "id": "ed3c43d0-68d2-4aac-ac44-25a28dc6dc75", "metadata": {}, "outputs": [], "source": [ "# Example with XGB\n", "\n", "random_state = 2\n", "print(f\"Processing for Random State: {random_state}\")\n", "\n", "X = splitted_dataset.drop('depression_category', axis=1)\n", "y = splitted_dataset['depression_category']\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n", "\n", "lof = LocalOutlierFactor()\n", "yhat = lof.fit_predict(X_train)\n", "mask = yhat != -1\n", "X_train, y_train = X_train[mask], y_train[mask]\n", "\n", "original_columns = X.columns.tolist()\n", "\n", "smote = SMOTE(random_state=random_state)\n", "X_res, y_res = smote.fit_resample(X_train, y_train)\n", "\n", "print(f\"Number of training labels after SMOTE: {y_res.value_counts()}\")\n", "print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n", "\n", "sampling_strategy_undersample = {0: 372}\n", "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n", "X_test, y_test = rus.fit_resample(X_test, y_test)\n", "\n", "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n", "\n", "# Normalization\n", "# scaler = MinMaxScaler()\n", "# X_res = scaler.fit_transform(X_res)\n", "# X_test = scaler.transform(X_test)\n", "\n", "X_res = pd.DataFrame(X_res, columns=original_columns)\n", "X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", "# Train XGBoost model on all features\n", "model = xgb.XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=random_state)\n", "model.fit(X_res, y_res)\n", "\n", "y_pred = model.predict(X_test)\n", "\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(f'Accuracy: {accuracy:.4f}')\n", "\n", "explainer = shap.Explainer(model, X_res)\n", "shap_values = explainer(X_res)" ] }, { "cell_type": "code", "execution_count": null, "id": "2cb9c929-4e86-42df-bda2-48730004c154", "metadata": {}, "outputs": [], "source": [ "def plot_shap_waterfall(instance_index, filename):\n", " shap_value = shap_values[instance_index]\n", " plt.figure(figsize=(14, 8))\n", " \n", " shap.plots.waterfall(shap_value, show=False)\n", " \n", " ax = plt.gca()\n", " \n", " ax.tick_params(axis='both', which='major', labelsize=16)\n", " ax.set_xlabel(ax.get_xlabel(), fontsize=20)\n", " ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n", " \n", " plt.tight_layout()\n", "\n", " plt.savefig(filename, format='svg')\n", " \n", " plt.close()\n", "\n", "plot_shap_waterfall(0, \"waterfall_plot_instance_0.svg\")\n", "plot_shap_waterfall(562, \"waterfall_plot_instance_562.svg\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e111213e-caea-48a0-adb7-c6b2362eed4f", "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "plt.figure(figsize=(14, 8))\n", "shap.summary_plot(\n", " shap_values,\n", " X_res,\n", " plot_type=\"bar\",\n", " feature_names=original_columns,\n", " show=False\n", ")\n", "\n", "ax = plt.gca()\n", "ax.tick_params(axis='both', which='major', labelsize=16)\n", "ax.set_xlabel(ax.get_xlabel(), fontsize=16)\n", "ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n", "\n", "plt.savefig(\"shap_summary_plot.svg\", format='svg')\n", "plt.close()" ] }, { "cell_type": "code", "execution_count": null, "id": "4ae84637-a018-49a1-ad4e-522aa937bfad", "metadata": {}, "outputs": [], "source": [ "shap.initjs()\n", "\n", "fig, ax = plt.subplots(figsize=(14, 8))\n", "\n", "shap.summary_plot(\n", " shap_values,\n", " X_res,\n", " plot_type=\"dot\",\n", " feature_names=original_columns,\n", " show=False\n", ")\n", "\n", "ax.tick_params(axis='both', which='major', labelsize=16)\n", "ax.set_xlabel(ax.get_xlabel(), fontsize=16)\n", "ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n", "\n", "fig.savefig(\"shap_summary_dot_plot.svg\", format='svg', bbox_inches='tight')\n", "\n", "plt.close(fig)" ] }, { "cell_type": "code", "execution_count": null, "id": "05f0de1c-3850-4d77-9184-d593451022c1", "metadata": {}, "outputs": [], "source": [ "from sklearn.tree import DecisionTreeClassifier, export_text\n", "from sklearn import tree\n", "\n", "random_state = 5\n", "\n", "X = splitted_dataset.drop('depression_category', axis=1)\n", "y = splitted_dataset['depression_category']\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n", "\n", "lof = LocalOutlierFactor()\n", "yhat = lof.fit_predict(X_train)\n", "mask = yhat != -1\n", "X_train, y_train = X_train[mask], y_train[mask]\n", "\n", "original_columns = X.columns.tolist()\n", "\n", "smote = SMOTE(random_state=random_state)\n", "X_res, y_res = smote.fit_resample(X_train, y_train)\n", "\n", "print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n", "print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n", "sampling_strategy_undersample = {0: 372}\n", "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n", "X_test, y_test = rus.fit_resample(X_test, y_test)\n", "\n", "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n", "\n", "# Normalization\n", "# scaler = MinMaxScaler()\n", "# X_res = scaler.fit_transform(X_res)\n", "# X_test = scaler.transform(X_test)\n", "\n", "X_res = pd.DataFrame(X_res, columns=original_columns)\n", "X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", "decision_tree_model = DecisionTreeClassifier(\n", " random_state=0, \n", " criterion='gini', \n", " max_depth=6, \n", " min_samples_leaf=10, \n", " min_samples_split=9\n", ")\n", "decision_tree_model.fit(X_res, y_res)\n", "\n", "plt.figure(figsize=(20, 14))\n", "tree.plot_tree(\n", " decision_tree_model, \n", " feature_names=original_columns, \n", " class_names=['depression', 'normal'],\n", " filled=True, \n", " rounded=True, \n", " fontsize=10,\n", " max_depth = 3\n", ")\n", "\n", "plt.savefig(\"decision_tree_plot.svg\", format='svg')\n", "plt.close()\n", "\n", "print(\"Decision Tree plot saved as 'decision_tree_plot.svg'\")" ] }, { "cell_type": "code", "execution_count": null, "id": "b46a92a6-f370-4df4-ac29-8f382fc1820b", "metadata": { "scrolled": true }, "outputs": [], "source": [ "tree_rules = export_text(decision_tree_model, feature_names=original_columns, max_depth=50)\n", "print(\"Decision rules for the tree (up to depth 3):\")\n", "print(tree_rules)\n", "\n", "node_indicator = decision_tree_model.decision_path(X_test)\n", "\n", "sample_id = 0\n", "node_index = node_indicator.indices[node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]\n", "\n", "print(f\"\\nDecision path for sample {sample_id}:\")\n", "for node_id in node_index:\n", " if X_test.iloc[sample_id, decision_tree_model.tree_.feature[node_id]] <= decision_tree_model.tree_.threshold[node_id]:\n", " threshold_sign = \"<=\"\n", " else:\n", " threshold_sign = \">\"\n", " print(f\"Node {node_id}: (X_test[{sample_id}, {decision_tree_model.tree_.feature[node_id]}] = {X_test.iloc[sample_id, decision_tree_model.tree_.feature[node_id]]}) \"\n", " f\"{threshold_sign} {decision_tree_model.tree_.threshold[node_id]}\")\n", "\n", "# Get prediction for a specific test sample\n", "predicted_class = decision_tree_model.predict([X_test.iloc[sample_id]])\n", "print(f\"\\nPredicted class for test sample {sample_id}: {predicted_class}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }