fdabbiras committed
Commit 33e757b · verified · 1 Parent(s): fb90326

Upload 4 files

dataPreprocessing.ipynb ADDED
@@ -0,0 +1,1203 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "37cfe918-891e-4c07-914b-a5c42ff01f12",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Data Import"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "db8bb49a-2e2d-4408-b571-44c7547b463b",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "# Libraries\n",
19
+ "import pandas as pd\n",
20
+ "import numpy as np\n",
21
+ "import random as rnd\n",
22
+ "import seaborn as sns\n",
23
+ "import matplotlib.pyplot as plt\n",
24
+ "%matplotlib inline"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 2,
30
+ "id": "8d470c8b-9188-45ba-9356-ec22c1b146bf",
31
+ "metadata": {
32
+ "scrolled": true
33
+ },
34
+ "outputs": [
35
+ {
36
+ "name": "stdout",
37
+ "output_type": "stream",
38
+ "text": [
39
+ "<class 'pandas.core.frame.DataFrame'>\n",
40
+ "RangeIndex: 3005 entries, 0 to 3004\n",
41
+ "Columns: 820 entries, ID to CASEDIF\n",
42
+ "dtypes: float64(93), int64(6), object(721)\n",
43
+ "memory usage: 18.8+ MB\n"
44
+ ]
45
+ }
46
+ ],
47
+ "source": [
48
+ "# Load up data\n",
49
+ "pd.set_option('display.max_rows', 10)\n",
50
+ "df = pd.read_csv('./data.csv')\n",
51
+ "df.info()"
52
+ ]
53
+ },
54
+ {
55
+ "cell_type": "markdown",
56
+ "id": "6c6fe342-0591-48e4-9d5f-d981d62d50c9",
57
+ "metadata": {},
58
+ "source": [
59
+ "# Feature Deletion"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": 3,
65
+ "id": "6508cb2e-4304-4e2c-a101-79e7caeaffc2",
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "# Drop features with more than 25% missing values\n",
70
+ "empty_values = df.isna().sum()\n",
71
+ "total_rows = len(df)\n",
72
+ "empty_percentages = (empty_values / total_rows) * 100\n",
73
+ "filtered_empty_values_ab25per = empty_percentages[empty_percentages > 25]\n",
74
+ "cols_to_drop = filtered_empty_values_ab25per.index.tolist()\n",
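+ "# Note: the 25% cutoff is a judgment call; len(cols_to_drop) shows how many columns it removes\n",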
75
+ "df = df.drop(columns = cols_to_drop)"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 4,
81
+ "id": "f67860b6-5837-49de-a970-50f40a7e4bc7",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "name": "stdout",
86
+ "output_type": "stream",
87
+ "text": [
88
+ "<class 'pandas.core.frame.DataFrame'>\n",
89
+ "RangeIndex: 3005 entries, 0 to 3004\n",
90
+ "Columns: 556 entries, ID to CASEDIF\n",
91
+ "dtypes: float64(49), int64(6), object(501)\n",
92
+ "memory usage: 12.7+ MB\n"
93
+ ]
94
+ }
95
+ ],
96
+ "source": [
97
+ "df.info()"
98
+ ]
99
+ },
100
+ {
101
+ "cell_type": "code",
102
+ "execution_count": 5,
103
+ "id": "2020f8b3-bda8-459b-819c-f601c4aa6a53",
104
+ "metadata": {},
105
+ "outputs": [],
106
+ "source": [
107
+ "# Medicine features, except those with high correlation with depression\n",
108
+ "medicine_cols_to_keep = ['PSYCHOTHERAGENTS', 'ANTIDEPRESSANTS', 'SSRIANTIDEPRESSA',\n",
109
+ " 'ANXIOLYTICSSEDAT', 'ANTICONVULSANTS', 'BENZODIAZEPINES']\n",
110
+ "medicine_cols = [\n",
111
+ " \"antiinfectives\",\n",
112
+ " \"amebicides\",\n",
113
+ " \"antifungals\",\n",
114
+ " \"antimalarialagen\",\n",
115
+ " \"antituberagents\",\n",
116
+ " \"cephalosporins\",\n",
117
+ " \"leprostatics\",\n",
118
+ " \"macrolidederivat\",\n",
119
+ " \"miscantibiotics\",\n",
120
+ " \"penicillins\",\n",
121
+ " \"quinolones\",\n",
122
+ " \"sulfonamides\",\n",
123
+ " \"tetracyclines\",\n",
124
+ " \"urinaryantiinfec\",\n",
125
+ " \"antihyplipagents\",\n",
126
+ " \"antineoplastics\",\n",
127
+ " \"alkylatingagents\",\n",
128
+ " \"antimetabolites\",\n",
129
+ " \"hormonesantineop\",\n",
130
+ " \"miscantineoplast\",\n",
131
+ " \"biologicals\",\n",
132
+ " \"recombinanthuman\",\n",
133
+ " \"cardiovascularag\",\n",
134
+ " \"angiotensinconve\",\n",
135
+ " \"antiadrenergperi\",\n",
136
+ " \"antiadrenergcent\",\n",
137
+ " \"antianginalagent\",\n",
138
+ " \"antiarrhythmicag\",\n",
139
+ " \"betaadrenergicbl\",\n",
140
+ " \"calciumchannelbl\",\n",
141
+ " \"diuretics\",\n",
142
+ " \"inotropicagents\",\n",
143
+ " \"misccardiovascul\",\n",
144
+ " \"peripheralvasodi\",\n",
145
+ " \"vasodilators\",\n",
146
+ " \"vasopressors\",\n",
147
+ " \"antihypertensive\",\n",
148
+ " \"angiotensiniiinh\",\n",
149
+ " \"centralnervoussy\",\n",
150
+ " \"analgesics\",\n",
151
+ " \"miscanalgesics\",\n",
152
+ " \"narcanalgs\",\n",
153
+ " \"nonsteroidalanti\",\n",
154
+ " \"salicylates\",\n",
155
+ " \"analgesiccombina\",\n",
156
+ " \"anticonvulsants\",\n",
157
+ " \"antiemeticantive\",\n",
158
+ " \"antiparkinsonage\",\n",
159
+ " \"anxiolyticssedat\",\n",
160
+ " \"barbiturates\",\n",
161
+ " \"benzodiazepines\",\n",
162
+ " \"miscanxiolyticss\",\n",
163
+ " \"cnsstimulants\",\n",
164
+ " \"musclerelaxants\",\n",
165
+ " \"miscantidepressa\",\n",
166
+ " \"miscantipsychoti\",\n",
167
+ " \"psychothercombin\",\n",
168
+ " \"misccentralnervo\",\n",
169
+ " \"coagulationmodif\",\n",
170
+ " \"anticoagulants\",\n",
171
+ " \"antiplateletagen\",\n",
172
+ " \"misccoagulationm\",\n",
173
+ " \"gastrointestinal\",\n",
174
+ " \"antacids\",\n",
175
+ " \"anticholsantispa\",\n",
176
+ " \"antidiarrheals\",\n",
177
+ " \"digestiveenzymes\",\n",
178
+ " \"gallstonesolubil\",\n",
179
+ " \"gistimulants\",\n",
180
+ " \"h2antagonists\",\n",
181
+ " \"laxatives\",\n",
182
+ " \"miscgiagents\",\n",
183
+ " \"hormones\",\n",
184
+ " \"adrenalcorticals\",\n",
185
+ " \"antidiabeticagen\",\n",
186
+ " \"mischormones\",\n",
187
+ " \"sexhormones\",\n",
188
+ " \"contraceptives\",\n",
189
+ " \"thyroiddrugs\",\n",
190
+ " \"immunosuppressiv\",\n",
191
+ " \"miscagents\",\n",
192
+ " \"antidotes\",\n",
193
+ " \"chelatingagents\",\n",
194
+ " \"cholinergicmuscl\",\n",
195
+ " \"localinjectablea\",\n",
196
+ " \"miscuncategorize\",\n",
197
+ " \"genitourinarytra\",\n",
198
+ " \"nutritionalprods\",\n",
199
+ " \"ironproducts\",\n",
200
+ " \"mineralsandelect\",\n",
201
+ " \"vitamins\",\n",
202
+ " \"vitaminmineral\",\n",
203
+ " \"respiratoryagent\",\n",
204
+ " \"antihistamines\",\n",
205
+ " \"antitussives\",\n",
206
+ " \"bronchodilators\",\n",
207
+ " \"methylxanthines\",\n",
208
+ " \"decongestants\",\n",
209
+ " \"expectorants\",\n",
210
+ " \"miscrespiratorya\",\n",
211
+ " \"respiratoryinhal\",\n",
212
+ " \"upperrespiratory\",\n",
213
+ " \"topicalagents\",\n",
214
+ " \"dermatologicalag\",\n",
215
+ " \"topicalantiinfec\",\n",
216
+ " \"topicalsteroids\",\n",
217
+ " \"topicalanestheti\",\n",
218
+ " \"misctopicalagent\",\n",
219
+ " \"topicalacneagent\",\n",
220
+ " \"mouthandthroatpr\",\n",
221
+ " \"ophthalpreparati\",\n",
222
+ " \"oticpreparations\",\n",
223
+ " \"vaginalpreparati\",\n",
224
+ " \"loopdiuretics\",\n",
225
+ " \"potassiumsparing\",\n",
226
+ " \"thiazidediuretic\",\n",
227
+ " \"carbonicanhydras\",\n",
228
+ " \"firstgenerationc\",\n",
229
+ " \"thirdgenerationc\",\n",
230
+ " \"ophthalantiinfec\",\n",
231
+ " \"ophthalglaucomaa\",\n",
232
+ " \"ophthalsteroids\",\n",
233
+ " \"ophthalsteroidsw\",\n",
234
+ " \"ophthalantiinfla\",\n",
235
+ " \"miscophthalagent\",\n",
236
+ " \"oticsteroidswith\",\n",
237
+ " \"miscoticagents\",\n",
238
+ " \"hmgcoareductasei\",\n",
239
+ " \"miscantihyplipag\",\n",
240
+ " \"skelmuscrels\",\n",
241
+ " \"adrenergicbronch\",\n",
242
+ " \"bronchodilatorco\",\n",
243
+ " \"androgensandanab\",\n",
244
+ " \"estrogens\",\n",
245
+ " \"progestins\",\n",
246
+ " \"sexhormonecombin\",\n",
247
+ " \"narcanalgcombina\",\n",
248
+ " \"antirheumatics\",\n",
249
+ " \"antimigraineagen\",\n",
250
+ " \"antigoutagents\",\n",
251
+ " \"fiveht3receptora\",\n",
252
+ " \"phenthiazantieme\",\n",
253
+ " \"anticholantiemet\",\n",
254
+ " \"miscantiemetics\",\n",
255
+ " \"hydantoinanticon\",\n",
256
+ " \"barbiturateantic\",\n",
257
+ " \"benzodiazepinean\",\n",
258
+ " \"miscanticonvulsa\",\n",
259
+ " \"anticholantipark\",\n",
260
+ " \"ssriantidepressa\",\n",
261
+ " \"tricyclicantidep\",\n",
262
+ " \"phenthiazantipsy\",\n",
263
+ " \"plateletaggregat\",\n",
264
+ " \"sulfonylureas\",\n",
265
+ " \"nonsulfonylureas\",\n",
266
+ " \"insulin\",\n",
267
+ " \"alphaglucosidase\",\n",
268
+ " \"bisphosphonates\",\n",
269
+ " \"alternativemeds\",\n",
270
+ " \"nutraceuticals\",\n",
271
+ " \"herbalproducts\",\n",
272
+ " \"penicillinaseres\",\n",
273
+ " \"aminopenicillins\",\n",
274
+ " \"betalactamaseinh\",\n",
275
+ " \"adamantaneantivi\",\n",
276
+ " \"purinenucleoside\",\n",
277
+ " \"miscantituberage\",\n",
278
+ " \"polyenes\",\n",
279
+ " \"azoleantifungals\",\n",
280
+ " \"miscantifungals\",\n",
281
+ " \"antimalarialquin\",\n",
282
+ " \"miscantimalarial\",\n",
283
+ " \"lincomycinderiva\",\n",
284
+ " \"fibricacidderiva\",\n",
285
+ " \"psychotheragents\",\n",
286
+ " \"leukotrienemodif\",\n",
287
+ " \"nasallubricants\",\n",
288
+ " \"nasalsteroids\",\n",
289
+ " \"nasalantihistami\",\n",
290
+ " \"nasalpreparation\",\n",
291
+ " \"antidepressants\",\n",
292
+ " \"monoamineoxidase\",\n",
293
+ " \"antipsychotics\",\n",
294
+ " \"bileacidsequestr\",\n",
295
+ " \"anorexiants\",\n",
296
+ " \"immunologicagent\",\n",
297
+ " \"monoclonalantibo\",\n",
298
+ " \"heparins\",\n",
299
+ " \"coumarinsandinda\",\n",
300
+ " \"impotenceagents\",\n",
301
+ " \"urinaryantispasm\",\n",
302
+ " \"urinaryphmodifie\",\n",
303
+ " \"miscgenitourinar\",\n",
304
+ " \"ophthalantihista\",\n",
305
+ " \"miscvaginalagent\",\n",
306
+ " \"antipsoriatics\",\n",
307
+ " \"thiazolidinedion\",\n",
308
+ " \"protonpumpinhibi\",\n",
309
+ " \"cardioselectiveb\",\n",
310
+ " \"noncardioselecti\",\n",
311
+ " \"dopaminergicanti\",\n",
312
+ " \"fiveaminosalic\",\n",
313
+ " \"cox2inhibitors\",\n",
314
+ " \"meglitinides\",\n",
315
+ " \"fivealphareducti\",\n",
316
+ " \"antihyperuricemi\",\n",
317
+ " \"topicalantibioti\",\n",
318
+ " \"topicalantifunga\",\n",
319
+ " \"inhaledcorticost\",\n",
320
+ " \"mastcellstabiliz\",\n",
321
+ " \"anticholbronchod\",\n",
322
+ " \"glucocorticoids\",\n",
323
+ " \"mineralocorticoi\",\n",
324
+ " \"agentsforpulmona\",\n",
325
+ " \"macrolides\",\n",
326
+ " \"ketolides\",\n",
327
+ " \"phenylpiperazine\",\n",
328
+ " \"tetracyclicantid\",\n",
329
+ " \"ssnriantidepress\",\n",
330
+ " \"miscantidiabetic\",\n",
331
+ " \"dibenzazepineant\",\n",
332
+ " \"cholinergicagoni\",\n",
333
+ " \"cholinesterasein\",\n",
334
+ " \"antidiabeticcomb\",\n",
335
+ " \"cholesterolabsor\",\n",
336
+ " \"antihyplipcombin\",\n",
337
+ " \"smokingcessation\",\n",
338
+ " \"othersupplements\"\n",
339
+ "]\n",
340
+ "\n",
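+ "# Dataset column names are uppercase, so uppercase this list before filtering\n",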
341
+ "medicine_cols_to_drop = [x.upper() for x in medicine_cols]\n",
342
+ " \n",
343
+ "medicine_cols_to_drop = [element for element in medicine_cols_to_drop if element not in medicine_cols_to_keep]\n",
344
+ "\n",
345
+ "df = df.drop(columns = medicine_cols_to_drop)"
346
+ ]
347
+ },
348
+ {
349
+ "cell_type": "code",
350
+ "execution_count": 6,
351
+ "id": "1b6f4c04-d4af-48cb-941e-d206c5fea6ff",
352
+ "metadata": {},
353
+ "outputs": [
354
+ {
355
+ "name": "stdout",
356
+ "output_type": "stream",
357
+ "text": [
358
+ "<class 'pandas.core.frame.DataFrame'>\n",
359
+ "RangeIndex: 3005 entries, 0 to 3004\n",
360
+ "Columns: 334 entries, ID to CASEDIF\n",
361
+ "dtypes: float64(49), int64(6), object(279)\n",
362
+ "memory usage: 7.7+ MB\n"
363
+ ]
364
+ }
365
+ ],
366
+ "source": [
367
+ "df.info()"
368
+ ]
369
+ },
370
+ {
371
+ "cell_type": "code",
372
+ "execution_count": 7,
373
+ "id": "a282ca80-1dfd-41a6-8773-67fbc747e254",
374
+ "metadata": {
375
+ "scrolled": true
376
+ },
377
+ "outputs": [],
378
+ "source": [
379
+ "# Drop identifier and survey-administration features\n",
380
+ "useless_cols = ['ID', 'FI_ID', 'PATH', 'VERSION', 'INT_START',\n",
381
+ " 'WEIGHT_SEL', 'WEIGHT_ADJ', 'STRATUM', \n",
382
+ " 'CLUSTER']\n",
383
+ "\n",
384
+ "df = df.drop(columns = useless_cols)"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 8,
390
+ "id": "33bce8f0-e10f-4adf-8775-a36b202e41ab",
391
+ "metadata": {},
392
+ "outputs": [
393
+ {
394
+ "name": "stdout",
395
+ "output_type": "stream",
396
+ "text": [
397
+ "<class 'pandas.core.frame.DataFrame'>\n",
398
+ "RangeIndex: 3005 entries, 0 to 3004\n",
399
+ "Columns: 325 entries, GENDER to CASEDIF\n",
400
+ "dtypes: float64(47), int64(1), object(277)\n",
401
+ "memory usage: 7.5+ MB\n"
402
+ ]
403
+ }
404
+ ],
405
+ "source": [
406
+ "df.info()"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "markdown",
411
+ "id": "d8130d58-98ce-456e-8992-545b737fae14",
412
+ "metadata": {},
413
+ "source": [
414
+ "# Depression Scale Creation and Entry Cleaning"
415
+ ]
416
+ },
417
+ {
418
+ "cell_type": "code",
419
+ "execution_count": 9,
420
+ "id": "4315d3d1-daa1-4add-a891-8b420b21faba",
421
+ "metadata": {},
422
+ "outputs": [
423
+ {
424
+ "name": "stdout",
425
+ "output_type": "stream",
426
+ "text": [
427
+ "RESTLES 222\n",
428
+ "CONFIDNT 216\n",
429
+ "WORRY 212\n",
430
+ "RELAXED 211\n",
431
+ "FLTEFF 13\n",
432
+ "NOTGETGO 10\n",
433
+ "FLTDEP 9\n",
434
+ "NOSLEEP 7\n",
435
+ "NOTEAT 7\n",
436
+ "dtype: int64\n"
437
+ ]
438
+ }
439
+ ],
440
+ "source": [
441
+ "df_pure = df\n",
442
+ "\n",
443
+ "symptoms_array = [\n",
444
+ " \"NOTGETGO\", # Difficulty getting going\n",
445
+ " \"FLTDEP\", # Feeling of deep sadness or emptiness\n",
446
+ " \"NOSLEEP\", # Insomnia or sleeping too much\n",
447
+ " \"RESTLES\", # Feeling restless\n",
448
+ " \"NOTEAT\", # Changes in appetite or weight\n",
449
+ " \"CONFIDNT\", # Lack of confidence\n",
450
+ " \"FLTEFF\", # Feeling things are out of control\n",
451
+ " \"RELAXED\", # Unable to feel relaxed\n",
452
+ " \"WORRY\" # Worrying thoughts\n",
453
+ "]\n",
454
+ "\n",
455
+ "df_phq9 = df_pure[symptoms_array]\n",
456
+ "\n",
457
+ "missing_counts = df_phq9.isna().sum()\n",
458
+ "missing_counts_sorted = missing_counts.sort_values(ascending=False)\n",
459
+ "\n",
460
+ "# Print the sorted counts\n",
461
+ "print(missing_counts_sorted)"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": 10,
467
+ "id": "98932270-aeae-4758-a9d9-5234087e6734",
468
+ "metadata": {},
469
+ "outputs": [
470
+ {
471
+ "name": "stdout",
472
+ "output_type": "stream",
473
+ "text": [
474
+ "<class 'pandas.core.frame.DataFrame'>\n",
475
+ "RangeIndex: 3005 entries, 0 to 3004\n",
476
+ "Data columns (total 9 columns):\n",
477
+ " # Column Non-Null Count Dtype \n",
478
+ "--- ------ -------------- ----- \n",
479
+ " 0 NOTGETGO 2995 non-null object\n",
480
+ " 1 FLTDEP 2996 non-null object\n",
481
+ " 2 NOSLEEP 2998 non-null object\n",
482
+ " 3 RESTLES 2783 non-null object\n",
483
+ " 4 NOTEAT 2998 non-null object\n",
484
+ " 5 CONFIDNT 2789 non-null object\n",
485
+ " 6 FLTEFF 2992 non-null object\n",
486
+ " 7 RELAXED 2794 non-null object\n",
487
+ " 8 WORRY 2793 non-null object\n",
488
+ "dtypes: object(9)\n",
489
+ "memory usage: 211.4+ KB\n"
490
+ ]
491
+ }
492
+ ],
493
+ "source": [
494
+ "df_phq9.info()"
495
+ ]
496
+ },
497
+ {
498
+ "cell_type": "code",
499
+ "execution_count": 11,
500
+ "id": "7a9e9234-32bd-4d21-8017-db9ef7354bd9",
501
+ "metadata": {},
502
+ "outputs": [
503
+ {
504
+ "name": "stdout",
505
+ "output_type": "stream",
506
+ "text": [
507
+ "<class 'pandas.core.frame.DataFrame'>\n",
508
+ "Index: 2763 entries, 0 to 3004\n",
509
+ "Data columns (total 9 columns):\n",
510
+ " # Column Non-Null Count Dtype \n",
511
+ "--- ------ -------------- ----- \n",
512
+ " 0 NOTGETGO 2763 non-null object\n",
513
+ " 1 FLTDEP 2763 non-null object\n",
514
+ " 2 NOSLEEP 2763 non-null object\n",
515
+ " 3 RESTLES 2763 non-null object\n",
516
+ " 4 NOTEAT 2763 non-null object\n",
517
+ " 5 CONFIDNT 2763 non-null object\n",
518
+ " 6 FLTEFF 2763 non-null object\n",
519
+ " 7 RELAXED 2763 non-null object\n",
520
+ " 8 WORRY 2763 non-null object\n",
521
+ "dtypes: object(9)\n",
522
+ "memory usage: 215.9+ KB\n"
523
+ ]
524
+ }
525
+ ],
526
+ "source": [
527
+ "# Drop all entries that have a missing value in any of these features\n",
528
+ "df_phq9_2 = df_phq9.copy()\n",
529
+ "df_phq9_2 = df_phq9_2.dropna()\n",
530
+ "df_phq9_2.info() # drops 242 entries (3005 -> 2763)"
531
+ ]
532
+ },
533
+ {
534
+ "cell_type": "code",
535
+ "execution_count": 12,
536
+ "id": "5ba060e8-46e9-4f4f-a680-33660116e35e",
537
+ "metadata": {},
538
+ "outputs": [],
539
+ "source": [
540
+ "# Build the depression scale\n",
541
+ "from sklearn.preprocessing import LabelEncoder\n",
542
+ "label_encoder = LabelEncoder()\n",
543
+ "\n",
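+ "# Note: LabelEncoder sorts classes alphabetically, which tracks severity here on the\n",
+ "# assumption that responses carry numeric prefixes like '(0) ...' through '(3) ...'\n",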
544
+ "for col in df_phq9_2.columns:\n",
545
+ " df_phq9_2.loc[:, col] = label_encoder.fit_transform(df_phq9_2[col])\n",
546
+ "\n",
547
+ "# CONFIDNT and RELAXED are positively worded, so reverse their 0-3 coding\n",
548
+ "df_phq9_2.loc[:, 'CONFIDNT'] = df_phq9_2['CONFIDNT'].apply(lambda x: 3 - x)\n",
549
+ "df_phq9_2.loc[:, 'RELAXED'] = df_phq9_2['RELAXED'].apply(lambda x: 3 - x)\n",
550
+ "\n",
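+ "# Nine items scored 0-3 give a total in the 0-27 range, matching the bands below\n",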
551
+ "df_phq9_2['total_sum'] = df_phq9_2.sum(axis = 1)"
552
+ ]
553
+ },
554
+ {
555
+ "cell_type": "code",
556
+ "execution_count": 13,
557
+ "id": "c7684ba0-4a07-4cf4-a7a6-3d2087e5e125",
558
+ "metadata": {},
559
+ "outputs": [],
560
+ "source": [
561
+ "# Categorize depression\n",
562
+ "def categorize_score(score):\n",
563
+ " if 0 <= score <= 4:\n",
564
+ " return 'Normal'\n",
565
+ " elif 5 <= score <= 9:\n",
566
+ " return 'Mild'\n",
567
+ " elif 10 <= score <= 27:\n",
568
+ " return 'ModerateSevere'\n",
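+ "\n",
+ "# Quick optional sanity check of the band edges\n",
+ "assert categorize_score(4) == 'Normal' and categorize_score(5) == 'Mild' and categorize_score(10) == 'ModerateSevere'\n",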
569
+ "\n",
570
+ "# Apply the categorization function to the 'total_sum' column\n",
571
+ "df_phq9_2['depression_category'] = df_phq9_2['total_sum'].apply(categorize_score)"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "code",
576
+ "execution_count": 14,
577
+ "id": "eb79389e-0a1e-4f8f-b27e-efb320610038",
578
+ "metadata": {},
579
+ "outputs": [],
580
+ "source": [
581
+ "# Separate by category\n",
582
+ "df_phq9_normal = df_phq9_2[df_phq9_2['depression_category'] == 'Normal']\n",
583
+ "df_phq9_mild = df_phq9_2[df_phq9_2['depression_category'] == 'Mild']\n",
584
+ "df_phq9_moderatesevere = df_phq9_2[df_phq9_2['depression_category'] == 'ModerateSevere']"
585
+ ]
586
+ },
587
+ {
588
+ "cell_type": "code",
589
+ "execution_count": 15,
590
+ "id": "c03612d6-788b-4cdd-85f6-8e8e7c541956",
591
+ "metadata": {
592
+ "scrolled": true
593
+ },
594
+ "outputs": [
595
+ {
596
+ "name": "stdout",
597
+ "output_type": "stream",
598
+ "text": [
599
+ "<class 'pandas.core.frame.DataFrame'>\n",
600
+ "Index: 1308 entries, 0 to 3003\n",
601
+ "Data columns (total 11 columns):\n",
602
+ " # Column Non-Null Count Dtype \n",
603
+ "--- ------ -------------- ----- \n",
604
+ " 0 NOTGETGO 1308 non-null object\n",
605
+ " 1 FLTDEP 1308 non-null object\n",
606
+ " 2 NOSLEEP 1308 non-null object\n",
607
+ " 3 RESTLES 1308 non-null object\n",
608
+ " 4 NOTEAT 1308 non-null object\n",
609
+ " 5 CONFIDNT 1308 non-null object\n",
610
+ " 6 FLTEFF 1308 non-null object\n",
611
+ " 7 RELAXED 1308 non-null object\n",
612
+ " 8 WORRY 1308 non-null object\n",
613
+ " 9 total_sum 1308 non-null object\n",
614
+ " 10 depression_category 1308 non-null object\n",
615
+ "dtypes: object(11)\n",
616
+ "memory usage: 122.6+ KB\n",
617
+ "<class 'pandas.core.frame.DataFrame'>\n",
618
+ "Index: 940 entries, 2 to 3004\n",
619
+ "Data columns (total 11 columns):\n",
620
+ " # Column Non-Null Count Dtype \n",
621
+ "--- ------ -------------- ----- \n",
622
+ " 0 NOTGETGO 940 non-null object\n",
623
+ " 1 FLTDEP 940 non-null object\n",
624
+ " 2 NOSLEEP 940 non-null object\n",
625
+ " 3 RESTLES 940 non-null object\n",
626
+ " 4 NOTEAT 940 non-null object\n",
627
+ " 5 CONFIDNT 940 non-null object\n",
628
+ " 6 FLTEFF 940 non-null object\n",
629
+ " 7 RELAXED 940 non-null object\n",
630
+ " 8 WORRY 940 non-null object\n",
631
+ " 9 total_sum 940 non-null object\n",
632
+ " 10 depression_category 940 non-null object\n",
633
+ "dtypes: object(11)\n",
634
+ "memory usage: 88.1+ KB\n",
635
+ "<class 'pandas.core.frame.DataFrame'>\n",
636
+ "Index: 515 entries, 3 to 3000\n",
637
+ "Data columns (total 11 columns):\n",
638
+ " # Column Non-Null Count Dtype \n",
639
+ "--- ------ -------------- ----- \n",
640
+ " 0 NOTGETGO 515 non-null object\n",
641
+ " 1 FLTDEP 515 non-null object\n",
642
+ " 2 NOSLEEP 515 non-null object\n",
643
+ " 3 RESTLES 515 non-null object\n",
644
+ " 4 NOTEAT 515 non-null object\n",
645
+ " 5 CONFIDNT 515 non-null object\n",
646
+ " 6 FLTEFF 515 non-null object\n",
647
+ " 7 RELAXED 515 non-null object\n",
648
+ " 8 WORRY 515 non-null object\n",
649
+ " 9 total_sum 515 non-null object\n",
650
+ " 10 depression_category 515 non-null object\n",
651
+ "dtypes: object(11)\n",
652
+ "memory usage: 48.3+ KB\n"
653
+ ]
654
+ }
655
+ ],
656
+ "source": [
657
+ "df_phq9_normal.info()\n",
658
+ "df_phq9_mild.info()\n",
659
+ "df_phq9_moderatesevere.info()"
660
+ ]
661
+ },
662
+ {
663
+ "cell_type": "code",
664
+ "execution_count": 16,
665
+ "id": "7b8ab44d-aab2-4b5c-ba3e-e1eabd3d7239",
666
+ "metadata": {},
667
+ "outputs": [],
668
+ "source": [
669
+ "# Map the category labels back onto the original dataset\n",
670
+ "mild_indices = df_phq9_mild.index\n",
671
+ "\n",
672
+ "df_mild_connection = df_pure.loc[mild_indices].copy() # .copy() avoids SettingWithCopyWarning\n",
673
+ "df_mild_connection['depression_category'] = 'mild'\n",
674
+ "df_mild_connection\n",
675
+ "\n",
676
+ "moderatesevere_indices = df_phq9_moderatesevere.index\n",
677
+ "\n",
678
+ "df_moderatesevere_connection = df_pure.loc[moderatesevere_indices].copy()\n",
679
+ "df_moderatesevere_connection['depression_category'] = 'moderatesevere'\n",
680
+ "df_moderatesevere_connection\n",
681
+ "\n",
682
+ "normal_indices = df_phq9_normal.index\n",
683
+ "\n",
684
+ "df_normal_connection = df_pure.loc[normal_indices].copy()\n",
685
+ "df_normal_connection['depression_category'] = 'normal'\n",
686
+ "df_normal_connection\n",
687
+ "\n",
688
+ "df_appended = pd.concat([df_normal_connection, df_mild_connection, df_moderatesevere_connection], ignore_index=False)\n",
689
+ "\n",
690
+ "symptoms_array = [\n",
691
+ " \"NOTGETGO\", # Difficulty getting going\n",
692
+ " \"FLTDEP\", # Feeling of deep sadness or emptiness\n",
693
+ " \"NOSLEEP\", # Insomnia or sleeping too much\n",
694
+ " \"RESTLES\", # Feeling restless\n",
695
+ " \"NOTEAT\", # Changes in appetite or weight\n",
696
+ " \"CONFIDNT\", # Lack of confidence\n",
697
+ " \"UNCNTRL\", # Feeling things are out of control\n",
698
+ " \"RELAXED\", # Unable to feel relaxed\n",
699
+ " \"WORRY\" # Worrying thoughts\n",
700
+ "]\n",
701
+ "\n",
702
+ "df_appended = df_appended.drop(columns = symptoms_array)\n",
703
+ "df_appended['total_sum'] = df_phq9_2['total_sum']"
704
+ ]
705
+ },
706
+ {
707
+ "cell_type": "code",
708
+ "execution_count": 17,
709
+ "id": "d6d0ae13-8b6a-466f-af99-571509ef1201",
710
+ "metadata": {
711
+ "scrolled": true
712
+ },
713
+ "outputs": [
714
+ {
715
+ "data": {
716
+ "text/html": [
717
+ "<div>\n",
718
+ "<style scoped>\n",
719
+ " .dataframe tbody tr th:only-of-type {\n",
720
+ " vertical-align: middle;\n",
721
+ " }\n",
722
+ "\n",
723
+ " .dataframe tbody tr th {\n",
724
+ " vertical-align: top;\n",
725
+ " }\n",
726
+ "\n",
727
+ " .dataframe thead th {\n",
728
+ " text-align: right;\n",
729
+ " }\n",
730
+ "</style>\n",
731
+ "<table border=\"1\" class=\"dataframe\">\n",
732
+ " <thead>\n",
733
+ " <tr style=\"text-align: right;\">\n",
734
+ " <th></th>\n",
735
+ " <th>GENDER</th>\n",
736
+ " <th>AGE</th>\n",
737
+ " <th>AGEGRP</th>\n",
738
+ " <th>DEGREE_RECODE</th>\n",
739
+ " <th>EDUC</th>\n",
740
+ " <th>RACE_RECODE</th>\n",
741
+ " <th>HISPANIC</th>\n",
742
+ " <th>ETHGRP</th>\n",
743
+ " <th>MILITARY</th>\n",
744
+ " <th>JAIL</th>\n",
745
+ " <th>...</th>\n",
746
+ " <th>IWLOC5</th>\n",
747
+ " <th>IWLOC6</th>\n",
748
+ " <th>STRUCTQ</th>\n",
749
+ " <th>BUILD</th>\n",
750
+ " <th>OTBUILD</th>\n",
751
+ " <th>COMBUILD</th>\n",
752
+ " <th>CASECOMP</th>\n",
753
+ " <th>CASEDIF</th>\n",
754
+ " <th>depression_category</th>\n",
755
+ " <th>total_sum</th>\n",
756
+ " </tr>\n",
757
+ " </thead>\n",
758
+ " <tbody>\n",
759
+ " <tr>\n",
760
+ " <th>0</th>\n",
761
+ " <td>(2) female</td>\n",
762
+ " <td>62</td>\n",
763
+ " <td>(1) 57-64</td>\n",
764
+ " <td>(5) masters</td>\n",
765
+ " <td>(4) bachelors or more</td>\n",
766
+ " <td>(1) white/caucasian</td>\n",
767
+ " <td>(0) no</td>\n",
768
+ " <td>(1) white</td>\n",
769
+ " <td>(0) no</td>\n",
770
+ " <td>(0) no</td>\n",
771
+ " <td>...</td>\n",
772
+ " <td>(1) 1 (quiet)</td>\n",
773
+ " <td>(1) 1 (no smell)</td>\n",
774
+ " <td>(02) detached single family house</td>\n",
775
+ " <td>(4) very well kept</td>\n",
776
+ " <td>(3) fairly well kept (needs cosmetic work)</td>\n",
777
+ " <td>(3) average</td>\n",
778
+ " <td>(11) eleventh case or more</td>\n",
779
+ " <td>(2) somewhat difficult</td>\n",
780
+ " <td>normal</td>\n",
781
+ " <td>2</td>\n",
782
+ " </tr>\n",
783
+ " <tr>\n",
784
+ " <th>1</th>\n",
785
+ " <td>(2) female</td>\n",
786
+ " <td>79</td>\n",
787
+ " <td>(3) 75-85</td>\n",
788
+ " <td>(2) high school diploma/equivalency</td>\n",
789
+ " <td>(3) voc cert/some college/assoc</td>\n",
790
+ " <td>(1) white/caucasian</td>\n",
791
+ " <td>(0) no</td>\n",
792
+ " <td>(1) white</td>\n",
793
+ " <td>(0) no</td>\n",
794
+ " <td>(0) no</td>\n",
795
+ " <td>...</td>\n",
796
+ " <td>(1) 1 (quiet)</td>\n",
797
+ " <td>(1) 1 (no smell)</td>\n",
798
+ " <td>(02) detached single family house</td>\n",
799
+ " <td>(4) very well kept</td>\n",
800
+ " <td>(3) fairly well kept (needs cosmetic work)</td>\n",
801
+ " <td>(4) above average</td>\n",
802
+ " <td>(11) eleventh case or more</td>\n",
803
+ " <td>(3) not very difficult</td>\n",
804
+ " <td>normal</td>\n",
805
+ " <td>3</td>\n",
806
+ " </tr>\n",
807
+ " <tr>\n",
808
+ " <th>17</th>\n",
809
+ " <td>(1) male</td>\n",
810
+ " <td>58</td>\n",
811
+ " <td>(1) 57-64</td>\n",
812
+ " <td>(4) bachelors</td>\n",
813
+ " <td>(4) bachelors or more</td>\n",
814
+ " <td>(1) white/caucasian</td>\n",
815
+ " <td>(0) no</td>\n",
816
+ " <td>(1) white</td>\n",
817
+ " <td>(0) no</td>\n",
818
+ " <td>(0) no</td>\n",
819
+ " <td>...</td>\n",
820
+ " <td>(1) 1 (quiet)</td>\n",
821
+ " <td>(1) 1 (no smell)</td>\n",
822
+ " <td>(02) detached single family house</td>\n",
823
+ " <td>(3) fairly well kept (needs cosmetic work)</td>\n",
824
+ " <td>(3) fairly well kept (needs cosmetic work)</td>\n",
825
+ " <td>(3) average</td>\n",
826
+ " <td>(11) eleventh case or more</td>\n",
827
+ " <td>(2) somewhat difficult</td>\n",
828
+ " <td>normal</td>\n",
829
+ " <td>2</td>\n",
830
+ " </tr>\n",
831
+ " <tr>\n",
832
+ " <th>24</th>\n",
833
+ " <td>(1) male</td>\n",
834
+ " <td>79</td>\n",
835
+ " <td>(3) 75-85</td>\n",
836
+ " <td>(5) masters</td>\n",
837
+ " <td>(4) bachelors or more</td>\n",
838
+ " <td>(1) white/caucasian</td>\n",
839
+ " <td>(0) no</td>\n",
840
+ " <td>(1) white</td>\n",
841
+ " <td>(1) yes</td>\n",
842
+ " <td>(0) no</td>\n",
843
+ " <td>...</td>\n",
844
+ " <td>(1) 1 (quiet)</td>\n",
845
+ " <td>(1) 1 (no smell)</td>\n",
846
+ " <td>(02) detached single family house</td>\n",
847
+ " <td>(4) very well kept</td>\n",
848
+ " <td>(4) very well kept</td>\n",
849
+ " <td>(5) far above average</td>\n",
850
+ " <td>(08) eighth case</td>\n",
851
+ " <td>(2) somewhat difficult</td>\n",
852
+ " <td>normal</td>\n",
853
+ " <td>4</td>\n",
854
+ " </tr>\n",
855
+ " <tr>\n",
856
+ " <th>26</th>\n",
857
+ " <td>(2) female</td>\n",
858
+ " <td>68</td>\n",
859
+ " <td>(2) 65-74</td>\n",
860
+ " <td>(6) law, md or phd</td>\n",
861
+ " <td>(4) bachelors or more</td>\n",
862
+ " <td>(1) white/caucasian</td>\n",
863
+ " <td>(0) no</td>\n",
864
+ " <td>(1) white</td>\n",
865
+ " <td>(0) no</td>\n",
866
+ " <td>(0) no</td>\n",
867
+ " <td>...</td>\n",
868
+ " <td>(1) 1 (quiet)</td>\n",
869
+ " <td>(1) 1 (no smell)</td>\n",
870
+ " <td>(02) detached single family house</td>\n",
871
+ " <td>(4) very well kept</td>\n",
872
+ " <td>(3) fairly well kept (needs cosmetic work)</td>\n",
873
+ " <td>(4) above average</td>\n",
874
+ " <td>(11) eleventh case or more</td>\n",
875
+ " <td>(2) somewhat difficult</td>\n",
876
+ " <td>normal</td>\n",
877
+ " <td>3</td>\n",
878
+ " </tr>\n",
879
+ " <tr>\n",
880
+ " <th>...</th>\n",
881
+ " <td>...</td>\n",
882
+ " <td>...</td>\n",
883
+ " <td>...</td>\n",
884
+ " <td>...</td>\n",
885
+ " <td>...</td>\n",
886
+ " <td>...</td>\n",
887
+ " <td>...</td>\n",
888
+ " <td>...</td>\n",
889
+ " <td>...</td>\n",
890
+ " <td>...</td>\n",
891
+ " <td>...</td>\n",
892
+ " <td>...</td>\n",
893
+ " <td>...</td>\n",
894
+ " <td>...</td>\n",
895
+ " <td>...</td>\n",
896
+ " <td>...</td>\n",
897
+ " <td>...</td>\n",
898
+ " <td>...</td>\n",
899
+ " <td>...</td>\n",
900
+ " <td>...</td>\n",
901
+ " <td>...</td>\n",
902
+ " </tr>\n",
903
+ " <tr>\n",
904
+ " <th>2988</th>\n",
905
+ " <td>(2) female</td>\n",
906
+ " <td>61</td>\n",
907
+ " <td>(1) 57-64</td>\n",
908
+ " <td>(1) none</td>\n",
909
+ " <td>(1) &lt; hs</td>\n",
910
+ " <td>(1) white/caucasian</td>\n",
911
+ " <td>(1) yes</td>\n",
912
+ " <td>(3) hispanic, non-black</td>\n",
913
+ " <td>(0) no</td>\n",
914
+ " <td>(0) no</td>\n",
915
+ " <td>...</td>\n",
916
+ " <td>(1) 1 (quiet)</td>\n",
917
+ " <td>(1) 1 (no smell)</td>\n",
918
+ " <td>(01) trailer</td>\n",
919
+ " <td>(3) fairly well kept (needs cosmetic work)</td>\n",
920
+ " <td>(1) very poorly kept (needs major repairs)</td>\n",
921
+ " <td>(4) above average</td>\n",
922
+ " <td>(11) eleventh case or more</td>\n",
923
+ " <td>(3) not very difficult</td>\n",
924
+ " <td>moderatesevere</td>\n",
925
+ " <td>14</td>\n",
926
+ " </tr>\n",
927
+ " <tr>\n",
928
+ " <th>2991</th>\n",
929
+ " <td>(2) female</td>\n",
930
+ " <td>70</td>\n",
931
+ " <td>(2) 65-74</td>\n",
932
+ " <td>(1) none</td>\n",
933
+ " <td>(1) &lt; hs</td>\n",
934
+ " <td>(3) asian, pacific islander, american indian o...</td>\n",
935
+ " <td>(1) yes</td>\n",
936
+ " <td>(3) hispanic, non-black</td>\n",
937
+ " <td>(0) no</td>\n",
938
+ " <td>(0) no</td>\n",
939
+ " <td>...</td>\n",
940
+ " <td>(1) 1 (quiet)</td>\n",
941
+ " <td>(1) 1 (no smell)</td>\n",
942
+ " <td>(02) detached single family house</td>\n",
943
+ " <td>NaN</td>\n",
944
+ " <td>NaN</td>\n",
945
+ " <td>NaN</td>\n",
946
+ " <td>(11) eleventh case or more</td>\n",
947
+ " <td>(3) not very difficult</td>\n",
948
+ " <td>moderatesevere</td>\n",
949
+ " <td>11</td>\n",
950
+ " </tr>\n",
951
+ " <tr>\n",
952
+ " <th>2992</th>\n",
953
+ " <td>(2) female</td>\n",
954
+ " <td>70</td>\n",
955
+ " <td>(2) 65-74</td>\n",
956
+ " <td>(3) associates</td>\n",
957
+ " <td>(3) voc cert/some college/assoc</td>\n",
958
+ " <td>(1) white/caucasian</td>\n",
959
+ " <td>(0) no</td>\n",
960
+ " <td>(1) white</td>\n",
961
+ " <td>(0) no</td>\n",
962
+ " <td>(0) no</td>\n",
963
+ " <td>...</td>\n",
964
+ " <td>(1) 1 (quiet)</td>\n",
965
+ " <td>(4) 4</td>\n",
966
+ " <td>(02) detached single family house</td>\n",
967
+ " <td>(3) fairly well kept (needs cosmetic work)</td>\n",
968
+ " <td>(3) fairly well kept (needs cosmetic work)</td>\n",
969
+ " <td>(3) average</td>\n",
970
+ " <td>(11) eleventh case or more</td>\n",
971
+ " <td>(4) not at all difficult</td>\n",
972
+ " <td>moderatesevere</td>\n",
973
+ " <td>10</td>\n",
974
+ " </tr>\n",
975
+ " <tr>\n",
976
+ " <th>2993</th>\n",
977
+ " <td>(1) male</td>\n",
978
+ " <td>63</td>\n",
979
+ " <td>(1) 57-64</td>\n",
980
+ " <td>(4) bachelors</td>\n",
981
+ " <td>(4) bachelors or more</td>\n",
982
+ " <td>(1) white/caucasian</td>\n",
983
+ " <td>(0) no</td>\n",
984
+ " <td>(1) white</td>\n",
985
+ " <td>(0) no</td>\n",
986
+ " <td>(0) no</td>\n",
987
+ " <td>...</td>\n",
988
+ " <td>(2) 2</td>\n",
989
+ " <td>(1) 1 (no smell)</td>\n",
990
+ " <td>(02) detached single family house</td>\n",
991
+ " <td>(4) very well kept</td>\n",
992
+ " <td>(4) very well kept</td>\n",
993
+ " <td>(3) average</td>\n",
994
+ " <td>(07) seventh case</td>\n",
995
+ " <td>(4) not at all difficult</td>\n",
996
+ " <td>moderatesevere</td>\n",
997
+ " <td>11</td>\n",
998
+ " </tr>\n",
999
+ " <tr>\n",
1000
+ " <th>3000</th>\n",
1001
+ " <td>(2) female</td>\n",
1002
+ " <td>73</td>\n",
1003
+ " <td>(2) 65-74</td>\n",
1004
+ " <td>(3) associates</td>\n",
1005
+ " <td>(3) voc cert/some college/assoc</td>\n",
1006
+ " <td>(1) white/caucasian</td>\n",
1007
+ " <td>(0) no</td>\n",
1008
+ " <td>(1) white</td>\n",
1009
+ " <td>(0) no</td>\n",
1010
+ " <td>(0) no</td>\n",
1011
+ " <td>...</td>\n",
1012
+ " <td>(2) 2</td>\n",
1013
+ " <td>(2) 2</td>\n",
1014
+ " <td>(02) detached single family house</td>\n",
1015
+ " <td>(4) very well kept</td>\n",
1016
+ " <td>(4) very well kept</td>\n",
1017
+ " <td>(3) average</td>\n",
1018
+ " <td>(05) fifth case</td>\n",
1019
+ " <td>(3) not very difficult</td>\n",
1020
+ " <td>moderatesevere</td>\n",
1021
+ " <td>17</td>\n",
1022
+ " </tr>\n",
1023
+ " </tbody>\n",
1024
+ "</table>\n",
1025
+ "<p>2763 rows × 318 columns</p>\n",
1026
+ "</div>"
1027
+ ],
1028
+ "text/plain": [
1029
+ " GENDER AGE AGEGRP DEGREE_RECODE \\\n",
1030
+ "0 (2) female 62 (1) 57-64 (5) masters \n",
1031
+ "1 (2) female 79 (3) 75-85 (2) high school diploma/equivalency \n",
1032
+ "17 (1) male 58 (1) 57-64 (4) bachelors \n",
1033
+ "24 (1) male 79 (3) 75-85 (5) masters \n",
1034
+ "26 (2) female 68 (2) 65-74 (6) law, md or phd \n",
1035
+ "... ... ... ... ... \n",
1036
+ "2988 (2) female 61 (1) 57-64 (1) none \n",
1037
+ "2991 (2) female 70 (2) 65-74 (1) none \n",
1038
+ "2992 (2) female 70 (2) 65-74 (3) associates \n",
1039
+ "2993 (1) male 63 (1) 57-64 (4) bachelors \n",
1040
+ "3000 (2) female 73 (2) 65-74 (3) associates \n",
1041
+ "\n",
1042
+ " EDUC \\\n",
1043
+ "0 (4) bachelors or more \n",
1044
+ "1 (3) voc cert/some college/assoc \n",
1045
+ "17 (4) bachelors or more \n",
1046
+ "24 (4) bachelors or more \n",
1047
+ "26 (4) bachelors or more \n",
1048
+ "... ... \n",
1049
+ "2988 (1) < hs \n",
1050
+ "2991 (1) < hs \n",
1051
+ "2992 (3) voc cert/some college/assoc \n",
1052
+ "2993 (4) bachelors or more \n",
1053
+ "3000 (3) voc cert/some college/assoc \n",
1054
+ "\n",
1055
+ " RACE_RECODE HISPANIC \\\n",
1056
+ "0 (1) white/caucasian (0) no \n",
1057
+ "1 (1) white/caucasian (0) no \n",
1058
+ "17 (1) white/caucasian (0) no \n",
1059
+ "24 (1) white/caucasian (0) no \n",
1060
+ "26 (1) white/caucasian (0) no \n",
1061
+ "... ... ... \n",
1062
+ "2988 (1) white/caucasian (1) yes \n",
1063
+ "2991 (3) asian, pacific islander, american indian o... (1) yes \n",
1064
+ "2992 (1) white/caucasian (0) no \n",
1065
+ "2993 (1) white/caucasian (0) no \n",
1066
+ "3000 (1) white/caucasian (0) no \n",
1067
+ "\n",
1068
+ " ETHGRP MILITARY JAIL ... IWLOC5 \\\n",
1069
+ "0 (1) white (0) no (0) no ... (1) 1 (quiet) \n",
1070
+ "1 (1) white (0) no (0) no ... (1) 1 (quiet) \n",
1071
+ "17 (1) white (0) no (0) no ... (1) 1 (quiet) \n",
1072
+ "24 (1) white (1) yes (0) no ... (1) 1 (quiet) \n",
1073
+ "26 (1) white (0) no (0) no ... (1) 1 (quiet) \n",
1074
+ "... ... ... ... ... ... \n",
1075
+ "2988 (3) hispanic, non-black (0) no (0) no ... (1) 1 (quiet) \n",
1076
+ "2991 (3) hispanic, non-black (0) no (0) no ... (1) 1 (quiet) \n",
1077
+ "2992 (1) white (0) no (0) no ... (1) 1 (quiet) \n",
1078
+ "2993 (1) white (0) no (0) no ... (2) 2 \n",
1079
+ "3000 (1) white (0) no (0) no ... (2) 2 \n",
1080
+ "\n",
1081
+ " IWLOC6 STRUCTQ \\\n",
1082
+ "0 (1) 1 (no smell) (02) detached single family house \n",
1083
+ "1 (1) 1 (no smell) (02) detached single family house \n",
1084
+ "17 (1) 1 (no smell) (02) detached single family house \n",
1085
+ "24 (1) 1 (no smell) (02) detached single family house \n",
1086
+ "26 (1) 1 (no smell) (02) detached single family house \n",
1087
+ "... ... ... \n",
1088
+ "2988 (1) 1 (no smell) (01) trailer \n",
1089
+ "2991 (1) 1 (no smell) (02) detached single family house \n",
1090
+ "2992 (4) 4 (02) detached single family house \n",
1091
+ "2993 (1) 1 (no smell) (02) detached single family house \n",
1092
+ "3000 (2) 2 (02) detached single family house \n",
1093
+ "\n",
1094
+ " BUILD \\\n",
1095
+ "0 (4) very well kept \n",
1096
+ "1 (4) very well kept \n",
1097
+ "17 (3) fairly well kept (needs cosmetic work) \n",
1098
+ "24 (4) very well kept \n",
1099
+ "26 (4) very well kept \n",
1100
+ "... ... \n",
1101
+ "2988 (3) fairly well kept (needs cosmetic work) \n",
1102
+ "2991 NaN \n",
1103
+ "2992 (3) fairly well kept (needs cosmetic work) \n",
1104
+ "2993 (4) very well kept \n",
1105
+ "3000 (4) very well kept \n",
1106
+ "\n",
1107
+ " OTBUILD COMBUILD \\\n",
1108
+ "0 (3) fairly well kept (needs cosmetic work) (3) average \n",
1109
+ "1 (3) fairly well kept (needs cosmetic work) (4) above average \n",
1110
+ "17 (3) fairly well kept (needs cosmetic work) (3) average \n",
1111
+ "24 (4) very well kept (5) far above average \n",
1112
+ "26 (3) fairly well kept (needs cosmetic work) (4) above average \n",
1113
+ "... ... ... \n",
1114
+ "2988 (1) very poorly kept (needs major repairs) (4) above average \n",
1115
+ "2991 NaN NaN \n",
1116
+ "2992 (3) fairly well kept (needs cosmetic work) (3) average \n",
1117
+ "2993 (4) very well kept (3) average \n",
1118
+ "3000 (4) very well kept (3) average \n",
1119
+ "\n",
1120
+ " CASECOMP CASEDIF \\\n",
1121
+ "0 (11) eleventh case or more (2) somewhat difficult \n",
1122
+ "1 (11) eleventh case or more (3) not very difficult \n",
1123
+ "17 (11) eleventh case or more (2) somewhat difficult \n",
1124
+ "24 (08) eighth case (2) somewhat difficult \n",
1125
+ "26 (11) eleventh case or more (2) somewhat difficult \n",
1126
+ "... ... ... \n",
1127
+ "2988 (11) eleventh case or more (3) not very difficult \n",
1128
+ "2991 (11) eleventh case or more (3) not very difficult \n",
1129
+ "2992 (11) eleventh case or more (4) not at all difficult \n",
1130
+ "2993 (07) seventh case (4) not at all difficult \n",
1131
+ "3000 (05) fifth case (3) not very difficult \n",
1132
+ "\n",
1133
+ " depression_category total_sum \n",
1134
+ "0 normal 2 \n",
1135
+ "1 normal 3 \n",
1136
+ "17 normal 2 \n",
1137
+ "24 normal 4 \n",
1138
+ "26 normal 3 \n",
1139
+ "... ... ... \n",
1140
+ "2988 moderatesevere 14 \n",
1141
+ "2991 moderatesevere 11 \n",
1142
+ "2992 moderatesevere 10 \n",
1143
+ "2993 moderatesevere 11 \n",
1144
+ "3000 moderatesevere 17 \n",
1145
+ "\n",
1146
+ "[2763 rows x 318 columns]"
1147
+ ]
1148
+ },
1149
+ "execution_count": 17,
1150
+ "metadata": {},
1151
+ "output_type": "execute_result"
1152
+ }
1153
+ ],
1154
+ "source": [
1155
+ "df_appended"
1156
+ ]
1157
+ },
1158
+ {
1159
+ "cell_type": "code",
1160
+ "execution_count": 18,
1161
+ "id": "a25493c0-42b4-4ce4-8fb2-c3579ec28253",
1162
+ "metadata": {},
1163
+ "outputs": [],
1164
+ "source": [
1165
+ "# Export for regression\n",
1166
+ "df_appended.to_csv('3labelv4Regression.csv', index = False)"
1167
+ ]
1168
+ },
1169
+ {
1170
+ "cell_type": "code",
1171
+ "execution_count": 19,
1172
+ "id": "4946608b-95e0-4752-b35b-a895d0a85763",
1173
+ "metadata": {},
1174
+ "outputs": [],
1175
+ "source": [
1176
+ "# Export for classification\n",
1177
+ "df_appended_classification = df_appended.drop('total_sum', axis = 1)\n",
1178
+ "df_appended_classification.to_csv('3labelv4Classification.csv', index = False)"
1179
+ ]
1180
+ }
1181
+ ],
1182
+ "metadata": {
1183
+ "kernelspec": {
1184
+ "display_name": "Python 3 (ipykernel)",
1185
+ "language": "python",
1186
+ "name": "python3"
1187
+ },
1188
+ "language_info": {
1189
+ "codemirror_mode": {
1190
+ "name": "ipython",
1191
+ "version": 3
1192
+ },
1193
+ "file_extension": ".py",
1194
+ "mimetype": "text/x-python",
1195
+ "name": "python",
1196
+ "nbconvert_exporter": "python",
1197
+ "pygments_lexer": "ipython3",
1198
+ "version": "3.12.3"
1199
+ }
1200
+ },
1201
+ "nbformat": 4,
1202
+ "nbformat_minor": 5
1203
+ }
detectionLayer.ipynb ADDED
@@ -0,0 +1,1799 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "f9fd590d-a3c3-4e94-b5ca-59a6ee9b29c3",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Detection Layer"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "dddcb2a8-73d3-4476-b88a-bc1494a2c830",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pandas as pd\n",
19
+ "import numpy as np\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import seaborn as sns\n",
22
+ "from sklearn.model_selection import cross_val_score, KFold\n",
23
+ "from sklearn.impute import KNNImputer\n",
24
+ "from sklearn.pipeline import make_pipeline\n",
25
+ "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier\n",
26
+ "from xgboost import XGBClassifier\n",
27
+ "from sklearn.impute import SimpleImputer\n",
28
+ "from sklearn.experimental import enable_iterative_imputer\n",
29
+ "from imblearn.pipeline import make_pipeline as make_pipeline_imb\n",
30
+ "from imblearn.over_sampling import SMOTE,SMOTENC\n",
31
+ "from sklearn.model_selection import train_test_split\n",
32
+ "from collections import Counter\n",
33
+ "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
34
+ "from sklearn.ensemble import GradientBoostingClassifier\n",
35
+ "from sklearn.ensemble import VotingClassifier\n",
36
+ "from sklearn.svm import SVC\n",
37
+ "from sklearn.linear_model import LogisticRegression\n",
38
+ "from sklearn.tree import DecisionTreeClassifier\n",
39
+ "from sklearn.ensemble import BaggingClassifier\n",
40
+ "from sklearn.neighbors import KNeighborsClassifier\n",
41
+ "from sklearn.ensemble import ExtraTreesClassifier\n",
42
+ "from deslib.dcs import APosteriori\n",
43
+ "from deslib.des import KNORAE, KNORAU, KNOP, DESMI\n",
44
+ "from sklearn.neighbors import LocalOutlierFactor\n",
45
+ "from sklearn.utils import resample\n",
46
+ "import warnings\n",
47
+ "from imblearn.over_sampling import RandomOverSampler\n",
48
+ "from imblearn.under_sampling import RandomUnderSampler\n",
49
+ "from sklearn.preprocessing import MinMaxScaler\n",
50
+ "from imblearn.pipeline import Pipeline\n",
51
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
52
+ "from sklearn.preprocessing import LabelEncoder, PowerTransformer\n",
53
+ "from collections import defaultdict\n",
54
+ "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier\n",
55
+ "from catboost import CatBoostClassifier\n",
56
+ "from lightgbm import LGBMClassifier\n",
57
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc\n",
58
+ "from sklearn.naive_bayes import GaussianNB\n",
59
+ "from sklearn.neural_network import MLPClassifier\n",
60
+ "import Orange\n",
61
+ "from scipy.stats import friedmanchisquare, rankdata\n",
62
+ "import shap\n",
63
+ "import scikit_posthocs as sp\n",
64
+ "from sklearn.feature_selection import SelectFromModel\n",
65
+ "from IPython.display import FileLink, display\n",
66
+ "import math\n",
67
+ "from sklearn.ensemble import RandomForestClassifier\n",
68
+ "from skopt.space import Integer, Real\n",
69
+ "from sklearn.model_selection import StratifiedKFold\n",
70
+ "from skopt import BayesSearchCV\n",
71
+ "import xgboost as xgb\n",
72
+ "from imblearn.over_sampling import SMOTE\n",
73
+ "from sklearn.tree import DecisionTreeClassifier, export_text\n",
74
+ "from sklearn import tree\n",
75
+ "from skopt.space import Real, Integer, Categorical\n",
76
+ "from skopt.callbacks import VerboseCallback\n",
77
+ "from deslib.des.knora_e import KNORAE\n",
78
+ "from deslib.des.knora_u import KNORAU\n",
79
+ "from deslib.des.knop import KNOP\n",
80
+ "from deslib.des.meta_des import METADES\n",
81
+ "from deslib.des.des_knn import DESKNN\n",
82
+ "from deslib.des.des_p import DESP"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "markdown",
87
+ "id": "39d6683c-fd3f-4daa-afdc-2e7f83c3fce3",
88
+ "metadata": {},
89
+ "source": [
90
+ "### Preparation before training"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "id": "ec17e47a-8d92-498d-8f28-d8259d6ebc4e",
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "# Load the dataset\n",
101
+ "pd.set_option('display.max_rows', 10)\n",
102
+ "initial_df = pd.read_csv('3labelv4Classification.csv')\n",
103
+ "initial_df.info()"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "d3751352-73bc-42c6-b6a5-84a3badf14ff",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "# All categorical features except for label\n",
114
+ "cols = initial_df.columns\n",
115
+ "num_cols = initial_df._get_numeric_data().columns\n",
116
+ "categorical_features = list(set(cols) - set(num_cols))\n",
117
+ "categorical_features.remove('depression_category')\n",
118
+ "\n",
119
+ "# Label-encode all categorical features while preserving missing values\n",
120
+ "le_initial_df = initial_df.copy()\n",
121
+ "dropped_labels = le_initial_df['depression_category']\n",
122
+ "le_initial_df = le_initial_df.drop('depression_category', axis = 1)\n",
123
+ "\n",
124
+ "for col in le_initial_df.columns:\n",
125
+ " if le_initial_df[col].dtype == 'object':\n",
126
+ " le_initial_df[col] = le_initial_df[col].fillna('missing')\n",
127
+ "\n",
128
+ " label_encoder = LabelEncoder()\n",
129
+ " le_initial_df[col] = label_encoder.fit_transform(le_initial_df[col])\n",
130
+ "\n",
131
+ " missing_value_index = np.where(label_encoder.classes_ == 'missing')[0]\n",
132
+ " \n",
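+ " # Restore NaN where 'missing' was encoded (this upcasts the column to float)\n",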
133
+ " le_initial_df[col] = le_initial_df[col].replace(missing_value_index, np.nan)\n",
134
+ "\n",
135
+ "le_initial_df = pd.concat([le_initial_df, dropped_labels], axis = 1)"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "9fa21a95-21f1-4274-86ba-d7977813066b",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "le_initial_df"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "id": "0ba54a81-d4de-4884-99d3-29867eb7ea40",
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "# Separate and combine\n",
156
+ "le_df_normal = le_initial_df[le_initial_df['depression_category'] == 'normal']\n",
157
+ "le_df_mild = le_initial_df[le_initial_df['depression_category'] == 'mild']\n",
158
+ "le_df_moderatesevere = le_initial_df[le_initial_df['depression_category'] == 'moderatesevere']\n",
159
+ "\n",
160
+ "le_df_depression = pd.concat([le_df_mild, le_df_moderatesevere], ignore_index = False)\n",
161
+ "\n",
162
+ "le_df_depression['depression_category'] = 'depression'\n",
163
+ "\n",
164
+ "# Check depression category counts\n",
165
+ "dataframes = [le_df_normal, le_df_depression]\n",
166
+ "le_initial_df = pd.concat(dataframes, ignore_index=True)\n",
167
+ "label_counts = le_initial_df['depression_category'].value_counts()\n",
168
+ "label_counts"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "code",
173
+ "execution_count": null,
174
+ "id": "c517213f-f6bb-4298-99ee-3b29f6f7d0cb",
175
+ "metadata": {},
176
+ "outputs": [],
177
+ "source": [
178
+ "# Drop rows missing more than 20% of their feature values\n",
179
+ "threshold = int(0.8 * le_df_normal.shape[1])\n",
180
+ "le_df_normal = le_df_normal.dropna(thresh = threshold)\n",
181
+ "threshold = int(0.8 * le_df_depression.shape[1])\n",
182
+ "le_df_depression = le_df_depression.dropna(thresh = threshold)\n",
183
+ "\n",
184
+ "# Check depression category counts\n",
185
+ "dataframes = [le_df_normal, le_df_depression]\n",
186
+ "le_initial_df = pd.concat(dataframes, ignore_index=True)\n",
187
+ "label_counts = le_initial_df['depression_category'].value_counts()"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": null,
193
+ "id": "8c5bdf52-c645-4a11-920a-579e28db1a50",
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "# Imputation\n",
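+ "# Note: imputing each class separately uses the label during preprocessing;\n",
+ "# acceptable for exploration, but it would leak label information across a train/test split\n",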
198
+ "different_le_dfs = [le_df_normal, le_df_depression]\n",
199
+ "imputed_le_dfs = []\n",
200
+ "from sklearn.impute import IterativeImputer\n",
201
+ "for le_df in different_le_dfs:\n",
202
+ " y = le_df['depression_category']\n",
203
+ " X = le_df.drop('depression_category', axis = 1)\n",
204
+ " \n",
205
+ " imputer = SimpleImputer(strategy='median')\n",
206
+ " imputed_data = imputer.fit_transform(X)\n",
207
+ " imputed_df = pd.DataFrame(imputed_data, columns = X.columns)\n",
208
+ "\n",
209
+ " imputed_df['depression_category'] = y.reset_index(drop = True)\n",
210
+ " imputed_le_dfs.append(imputed_df)\n",
211
+ "\n",
212
+ "concatenated_le_dfs = pd.concat(imputed_le_dfs, ignore_index = True)\n",
213
+ "concatenated_le_dfs"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "0e871a5f-beab-4f90-b8c4-e922ef86d0f0",
220
+ "metadata": {},
221
+ "outputs": [],
222
+ "source": [
223
+ "# Label-encode the depression category as well\n",
224
+ "fully_LE_concatenated_le_dfs = concatenated_le_dfs.copy()\n",
225
+ "fully_LE_concatenated_le_dfs['depression_category'] = label_encoder.fit_transform(fully_LE_concatenated_le_dfs['depression_category'])\n",
226
+ "\n",
227
+ "# The dataset after class merging, imputation, and label encoding\n",
228
+ "splitted_dataset = fully_LE_concatenated_le_dfs.copy()\n",
229
+ "splitted_dataset"
230
+ ]
231
+ },
232
+ {
233
+ "cell_type": "markdown",
234
+ "id": "198fdc3c-ccbc-4ac5-8621-c0c5dc771cbb",
235
+ "metadata": {},
236
+ "source": [
237
+ "### Setup for training"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": null,
243
+ "id": "101c5549-7bc3-4573-867b-f09776e254db",
244
+ "metadata": {
245
+ "jupyter": {
246
+ "source_hidden": true
247
+ }
248
+ },
249
+ "outputs": [],
250
+ "source": [
251
+ "def plot_combined_roc_curve(roc_curves, classifier_names):\n",
252
+ " plt.figure(figsize=(12, 8))\n",
253
+ " mean_fpr = np.linspace(0, 1, 100)\n",
254
+ " colors = plt.cm.get_cmap('tab20', len(classifier_names))\n",
255
+ " \n",
256
+ " for i, clf_name in enumerate(classifier_names):\n",
257
+ " tprs = []\n",
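+ " # Interpolate each fold's TPR onto a common FPR grid so the curves can be averaged\n",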
258
+ " for fpr, tpr in roc_curves[clf_name]:\n",
259
+ " tprs.append(np.interp(mean_fpr, fpr, tpr))\n",
260
+ " mean_tpr = np.mean(tprs, axis=0)\n",
261
+ " mean_tpr[-1] = 1.0\n",
262
+ " mean_auc = auc(mean_fpr, mean_tpr)\n",
263
+ " plt.plot(mean_fpr, mean_tpr, color=colors(i), lw=2, linestyle='-', marker='o', markersize=4, \n",
264
+ " label=f'{clf_name} (AUC = {mean_auc:.3f})')\n",
265
+ "\n",
266
+ " plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')\n",
267
+ " plt.xlim([0.0, 1.0])\n",
268
+ " plt.ylim([0.0, 1.05])\n",
269
+ " plt.xlabel('False Positive Rate', fontsize=26)\n",
270
+ " plt.ylabel('True Positive Rate', fontsize=26)\n",
271
+ " plt.xticks(fontsize=30)\n",
272
+ " plt.yticks(fontsize=30)\n",
273
+ " plt.legend(loc=\"lower right\", fontsize=22, frameon=True, framealpha=0.9)\n",
274
+ " plt.grid(True)\n",
275
+ "\n",
276
+ " filename='bonk.svg'\n",
277
+ "\n",
278
+ " plt.savefig(filename, format='svg')\n",
279
+ " plt.show()\n",
280
+ "\n",
281
+ " display(FileLink(filename))\n",
282
+ "# Helpers to draw CD diagrams, ported from an older version of Orange\n",
283
+ "# Preparation code to make CD diagram from older version of Orange\n",
284
+ "def compute_CD(avranks, n, alpha=\"0.05\", test=\"nemenyi\"):\n",
285
+ " \"\"\"\n",
286
+ " Returns critical difference for Nemenyi or Bonferroni-Dunn test\n",
287
+ " according to given alpha (either alpha=\"0.05\" or alpha=\"0.1\") for average\n",
288
+ " ranks and number of tested datasets N. Test can be either \"nemenyi\" for\n",
289
+ " the Nemenyi two-tailed test or \"bonferroni-dunn\" for the Bonferroni-Dunn test.\n",
290
+ "\n",
291
+ " This function is deprecated and will be removed in Orange 3.34.\n",
292
+ " \"\"\"\n",
293
+ " k = len(avranks)\n",
294
+ " d = {(\"nemenyi\", \"0.05\"): [0, 0, 1.959964, 2.343701, 2.569032, 2.727774,\n",
295
+ " 2.849705, 2.94832, 3.030879, 3.101730, 3.163684,\n",
296
+ " 3.218654, 3.268004, 3.312739, 3.353618, 3.39123,\n",
297
+ " 3.426041, 3.458425, 3.488685, 3.517073,\n",
298
+ " 3.543799],\n",
299
+ " (\"nemenyi\", \"0.1\"): [0, 0, 1.644854, 2.052293, 2.291341, 2.459516,\n",
300
+ " 2.588521, 2.692732, 2.779884, 2.854606, 2.919889,\n",
301
+ " 2.977768, 3.029694, 3.076733, 3.119693, 3.159199,\n",
302
+ " 3.195743, 3.229723, 3.261461, 3.291224, 3.319233],\n",
303
+ " (\"bonferroni-dunn\", \"0.05\"): [0, 0, 1.960, 2.241, 2.394, 2.498, 2.576,\n",
304
+ " 2.638, 2.690, 2.724, 2.773],\n",
305
+ " (\"bonferroni-dunn\", \"0.1\"): [0, 0, 1.645, 1.960, 2.128, 2.241, 2.326,\n",
306
+ " 2.394, 2.450, 2.498, 2.539]}\n",
307
+ " q = d[(test, alpha)]\n",
308
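+ " # Nemenyi critical difference: CD = q_alpha * sqrt(k * (k + 1) / (6 * N)) (Demsar, 2006)\n",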
+ " cd = q[k] * (k * (k + 1) / (6.0 * n)) ** 0.5\n",
309
+ " return cd\n",
310
+ "\n",
311
+ "\n",
312
+ "def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None,\n",
313
+ " width=6, textspace=1, reverse=False, filename=None, **kwargs):\n",
314
+ " \"\"\"\n",
315
+ " Draws a CD graph, which is used to display the differences in methods'\n",
316
+ " Multiple Data Sets, Journal of Machine Learning Research, 7(Jan):1--30, 2006.\n",
317
+ " Multiple Data Sets, 7(Jan):1--30, 2006.\n",
318
+ "\n",
319
+ " Needs matplotlib to work.\n",
320
+ "\n",
321
+ " The image is plotted on `plt` imported using\n",
322
+ " `import matplotlib.pyplot as plt`.\n",
323
+ "\n",
324
+ " This function is deprecated and will be removed in Orange 3.34.\n",
325
+ "\n",
326
+ " Args:\n",
327
+ " avranks (list of float): average ranks of methods.\n",
328
+ " names (list of str): names of methods.\n",
329
+ " cd (float): Critical difference used for statistically significance of\n",
330
+ " difference between methods.\n",
331
+ " cdmethod (int, optional): the method that is compared with other methods\n",
332
+ " If omitted, show pairwise comparison of methods\n",
333
+ " lowv (int, optional): the lowest shown rank\n",
334
+ " highv (int, optional): the highest shown rank\n",
335
+ " width (int, optional): default width in inches (default: 6)\n",
336
+ " textspace (int, optional): space on figure sides (in inches) for the\n",
337
+ " method names (default: 1)\n",
338
+ " reverse (bool, optional): if set to `True`, the lowest rank is on the\n",
339
+ " right (default: `False`)\n",
340
+ " filename (str, optional): output file name (with extension). If not\n",
341
+ " given, the function does not write a file.\n",
342
+ " \"\"\"\n",
343
+ " try:\n",
344
+ " import math\n",
+ " import matplotlib.pyplot as plt\n",
345
+ " from matplotlib.backends.backend_agg import FigureCanvasAgg\n",
346
+ " except ImportError:\n",
347
+ " raise ImportError(\"Function graph_ranks requires matplotlib.\")\n",
348
+ "\n",
349
+ " width = float(width)\n",
350
+ " textspace = float(textspace)\n",
351
+ "\n",
352
+ " def nth(l, n):\n",
353
+ " \"\"\"\n",
354
+ " Returns only the nth element in a list.\n",
355
+ " \"\"\"\n",
356
+ " n = lloc(l, n)\n",
357
+ " return [a[n] for a in l]\n",
358
+ "\n",
359
+ " def lloc(l, n):\n",
360
+ " \"\"\"\n",
361
+ " List location in list of list structure.\n",
362
+ " Enable the use of negative locations:\n",
363
+ " -1 is the last element, -2 second last...\n",
364
+ " \"\"\"\n",
365
+ " if n < 0:\n",
366
+ " return len(l[0]) + n\n",
367
+ " else:\n",
368
+ " return n\n",
369
+ "\n",
370
+ " def mxrange(lr):\n",
371
+ " \"\"\"\n",
372
+ " Multiple xranges. Can be used to traverse matrices.\n",
373
+ " This function is very slow due to the unknown number of\n",
374
+ " parameters.\n",
375
+ "\n",
376
+ " >>> mxrange([2,3])\n",
377
+ " [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]\n",
378
+ "\n",
379
+ " >>> mxrange([[3,5,1],[9,0,-3]])\n",
380
+ " [(3, 9), (3, 6), (3, 3), (4, 9), (4, 6), (4, 3)]\n",
381
+ "\n",
382
+ " \"\"\"\n",
383
+ " if not len(lr):\n",
384
+ " yield ()\n",
385
+ " else:\n",
386
+ " # it can work with single numbers\n",
387
+ " index = lr[0]\n",
388
+ " if isinstance(index, int):\n",
389
+ " index = [index]\n",
390
+ " for a in range(*index):\n",
391
+ " for b in mxrange(lr[1:]):\n",
392
+ " yield tuple([a] + list(b))\n",
393
+ "\n",
394
+ " def print_figure(fig, *args, **kwargs):\n",
395
+ " canvas = FigureCanvasAgg(fig)\n",
396
+ " canvas.print_figure(*args, **kwargs)\n",
397
+ "\n",
398
+ " sums = avranks\n",
399
+ "\n",
400
+ " tempsort = sorted([(a, i) for i, a in enumerate(sums)], reverse=reverse)\n",
401
+ " ssums = nth(tempsort, 0)\n",
402
+ " sortidx = nth(tempsort, 1)\n",
403
+ " nnames = [names[x] for x in sortidx]\n",
404
+ "\n",
405
+ " if lowv is None:\n",
406
+ " lowv = min(1, int(math.floor(min(ssums))))\n",
407
+ " if highv is None:\n",
408
+ " highv = max(len(avranks), int(math.ceil(max(ssums))))\n",
409
+ "\n",
410
+ " cline = 0.4\n",
411
+ "\n",
412
+ " k = len(sums)\n",
413
+ "\n",
414
+ " lines = None\n",
415
+ "\n",
416
+ " linesblank = 0\n",
417
+ " scalewidth = width - 2 * textspace\n",
418
+ "\n",
419
+ " def rankpos(rank):\n",
420
+ " if not reverse:\n",
421
+ " a = rank - lowv\n",
422
+ " else:\n",
423
+ " a = highv - rank\n",
424
+ " return textspace + scalewidth / (highv - lowv) * a\n",
425
+ "\n",
426
+ " distanceh = 0.25\n",
427
+ "\n",
428
+ " if cd and cdmethod is None:\n",
429
+ " # get pairs of non significant methods\n",
430
+ "\n",
431
+ " def get_lines(sums, hsd):\n",
432
+ " # get all pairs\n",
433
+ " lsums = len(sums)\n",
434
+ " allpairs = [(i, j) for i, j in mxrange([[lsums], [lsums]]) if j > i]\n",
435
+ " # remove not significant\n",
436
+ " notSig = [(i, j) for i, j in allpairs\n",
437
+ " if abs(sums[i] - sums[j]) <= hsd]\n",
438
+ " # keep only longest\n",
439
+ "\n",
440
+ " def no_longer(ij_tuple, notSig):\n",
441
+ " i, j = ij_tuple\n",
442
+ " for i1, j1 in notSig:\n",
443
+ " if (i1 <= i and j1 > j) or (i1 < i and j1 >= j):\n",
444
+ " return False\n",
445
+ " return True\n",
446
+ "\n",
447
+ " longest = [(i, j) for i, j in notSig if no_longer((i, j), notSig)]\n",
448
+ "\n",
449
+ " return longest\n",
450
+ "\n",
451
+ " lines = get_lines(ssums, cd)\n",
452
+ " linesblank = 0.2 + 0.2 + (len(lines) - 1) * 0.1\n",
453
+ "\n",
454
+ " # add scale\n",
455
+ " distanceh = 0.25\n",
456
+ " cline += distanceh\n",
457
+ "\n",
458
+ " # calculate the height needed for the image\n",
459
+ " minnotsignificant = max(2 * 0.2, linesblank)\n",
460
+ " height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant\n",
461
+ "\n",
462
+ " fig = plt.figure(figsize=(width, height))\n",
463
+ " fig.set_facecolor('white')\n",
464
+ " ax = fig.add_axes([0, 0, 1, 1]) # reverse y axis\n",
465
+ " ax.set_axis_off()\n",
466
+ "\n",
467
+ " hf = 1. / height # height factor\n",
468
+ " wf = 1. / width\n",
469
+ "\n",
470
+ " def hfl(l):\n",
471
+ " return [a * hf for a in l]\n",
472
+ "\n",
473
+ " def wfl(l):\n",
474
+ " return [a * wf for a in l]\n",
475
+ "\n",
476
+ "\n",
477
+ " # Upper left corner is (0,0).\n",
478
+ " ax.plot([0, 1], [0, 1], c=\"w\")\n",
479
+ " ax.set_xlim(0, 1)\n",
480
+ " ax.set_ylim(1, 0)\n",
481
+ "\n",
482
+ " def line(l, color='k', **kwargs):\n",
483
+ " \"\"\"\n",
484
+ " Input is a list of pairs of points.\n",
485
+ " \"\"\"\n",
486
+ " ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)\n",
487
+ "\n",
488
+ " def text(x, y, s, *args, **kwargs):\n",
489
+ " ax.text(wf * x, hf * y, s, fontsize = 14, *args, **kwargs)\n",
490
+ "\n",
491
+ " line([(textspace, cline), (width - textspace, cline)], linewidth=0.7)\n",
492
+ "\n",
493
+ " bigtick = 0.1\n",
494
+ " smalltick = 0.05\n",
495
+ "\n",
496
+ " tick = None\n",
497
+ " for a in list(np.arange(lowv, highv, 0.5)) + [highv]:\n",
498
+ " tick = smalltick\n",
499
+ " if a == int(a):\n",
500
+ " tick = bigtick\n",
501
+ " line([(rankpos(a), cline - tick / 2),\n",
502
+ " (rankpos(a), cline)],\n",
503
+ " linewidth=0.7)\n",
504
+ "\n",
505
+ " for a in range(lowv, highv + 1):\n",
506
+ " text(rankpos(a), cline - tick / 2 - 0.05, str(a),\n",
507
+ " ha=\"center\", va=\"bottom\")\n",
508
+ "\n",
509
+ " k = len(ssums)\n",
510
+ "\n",
511
+ " for i in range(math.ceil(k / 2)):\n",
512
+ " chei = cline + minnotsignificant + i * 0.2\n",
513
+ " line([(rankpos(ssums[i]), cline),\n",
514
+ " (rankpos(ssums[i]), chei),\n",
515
+ " (textspace - 0.1, chei)],\n",
516
+ " linewidth=0.7)\n",
517
+ " text(textspace - 0.2, chei, nnames[i], ha=\"right\", va=\"center\")\n",
518
+ "\n",
519
+ " for i in range(math.ceil(k / 2), k):\n",
520
+ " chei = cline + minnotsignificant + (k - i - 1) * 0.2\n",
521
+ " line([(rankpos(ssums[i]), cline),\n",
522
+ " (rankpos(ssums[i]), chei),\n",
523
+ " (textspace + scalewidth + 0.1, chei)],\n",
524
+ " linewidth=0.7)\n",
525
+ " text(textspace + scalewidth + 0.2, chei, nnames[i],\n",
526
+ " ha=\"left\", va=\"center\")\n",
527
+ "\n",
528
+ " if cd and cdmethod is None:\n",
529
+ " # upper scale\n",
530
+ " if not reverse:\n",
531
+ " begin, end = rankpos(lowv), rankpos(lowv + cd)\n",
532
+ " else:\n",
533
+ " begin, end = rankpos(highv), rankpos(highv - cd)\n",
534
+ "\n",
535
+ " line([(begin, distanceh), (end, distanceh)], linewidth=0.7)\n",
536
+ " line([(begin, distanceh + bigtick / 2),\n",
537
+ " (begin, distanceh - bigtick / 2)],\n",
538
+ " linewidth=0.7)\n",
539
+ " line([(end, distanceh + bigtick / 2),\n",
540
+ " (end, distanceh - bigtick / 2)],\n",
541
+ " linewidth=0.7)\n",
542
+ " text((begin + end) / 2, distanceh - 0.05, \"CD\",\n",
543
+ " ha=\"center\", va=\"bottom\")\n",
544
+ "\n",
545
+ " # no-significance lines\n",
546
+ " def draw_lines(lines, side=0.05, height=0.1):\n",
547
+ " start = cline + 0.2\n",
548
+ " for l, r in lines:\n",
549
+ " line([(rankpos(ssums[l]) - side, start),\n",
550
+ " (rankpos(ssums[r]) + side, start)],\n",
551
+ " linewidth=2.5)\n",
552
+ " start += height\n",
553
+ "\n",
554
+ " draw_lines(lines)\n",
555
+ "\n",
556
+ " elif cd:\n",
557
+ " begin = rankpos(avranks[cdmethod] - cd)\n",
558
+ " end = rankpos(avranks[cdmethod] + cd)\n",
559
+ " line([(begin, cline), (end, cline)],\n",
560
+ " linewidth=2.5)\n",
561
+ " line([(begin, cline + bigtick / 2),\n",
562
+ " (begin, cline - bigtick / 2)],\n",
563
+ " linewidth=2.5)\n",
564
+ " line([(end, cline + bigtick / 2),\n",
565
+ " (end, cline - bigtick / 2)],\n",
566
+ " linewidth=2.5)\n",
567
+ "\n",
568
+ " if filename:\n",
569
+ " print_figure(fig, filename, **kwargs)\n",
570
+ "\n",
571
+ "def train_evaluate_model(clf, X_train, y_train, X_test, y_test, clf_name='Classifier'):\n",
572
+ " clf.fit(X_train, y_train)\n",
573
+ " y_pred = clf.predict(X_test)\n",
574
+ " \n",
575
+ " accuracy = accuracy_score(y_test, y_pred)\n",
576
+ " precision = precision_score(y_test, y_pred, average='weighted')\n",
577
+ " recall = recall_score(y_test, y_pred, average='weighted')\n",
578
+ " f1 = f1_score(y_test, y_pred, average='weighted')\n",
579
+ " conf_matrix = confusion_matrix(y_test, y_pred)\n",
580
+ " \n",
581
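+ " # Assumes binary classification: column 1 of predict_proba holds the positive-class probability\n",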
+ " if hasattr(clf, 'predict_proba'):\n",
582
+ " y_score = clf.predict_proba(X_test)[:, 1]\n",
583
+ " else:\n",
584
+ " y_score = clf.decision_function(X_test)\n",
585
+ " \n",
586
+ " fpr, tpr, _ = roc_curve(y_test, y_score)\n",
587
+ " roc_auc = auc(fpr, tpr)\n",
588
+ " \n",
589
+ " print(f'{clf_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')\n",
590
+ " return accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc"
591
+ ]
592
+ },
593
+ {
594
+ "cell_type": "code",
595
+ "execution_count": null,
596
+ "id": "3e602f4d-efb2-4ac9-99bb-c6fbeca791cc",
597
+ "metadata": {
598
+ "jupyter": {
599
+ "source_hidden": true
600
+ }
601
+ },
602
+ "outputs": [],
603
+ "source": [
604
+ "warnings.filterwarnings('ignore')"
605
+ ]
606
+ },
607
+ {
608
+ "cell_type": "markdown",
609
+ "id": "773b2524-af11-464f-97a9-f4add85be0d2",
610
+ "metadata": {},
611
+ "source": [
612
+ "### Training (classic/static)\n",
613
+ "To run the classical or static experiments, uncomment the classifier set you need. Run \"Post Training\" once one of these classical/static runs has finished."
614
+ ]
615
+ },
616
+ {
617
+ "cell_type": "markdown",
618
+ "id": "f5b8873c-13d3-43b7-8902-14b73e8d5409",
619
+ "metadata": {},
620
+ "source": [
621
+ "#### Classical Classifiers"
622
+ ]
623
+ },
624
+ {
625
+ "cell_type": "code",
626
+ "execution_count": null,
627
+ "id": "264648a0-b0d8-4a63-9167-cea3a4ed1e18",
628
+ "metadata": {},
629
+ "outputs": [],
630
+ "source": [
631
+ "# Optimized Classifiers\n",
632
+ "# classifiers = {\n",
633
+ "# 'DT': DecisionTreeClassifier(\n",
634
+ "# random_state=0, \n",
635
+ "# criterion='gini', \n",
636
+ "# max_depth=6, \n",
637
+ "# min_samples_leaf=10, \n",
638
+ "# min_samples_split=9\n",
639
+ "# ),\n",
640
+ "# 'LR': LogisticRegression(\n",
641
+ "# random_state=0, \n",
642
+ "# C=0.09659168435718246, \n",
643
+ "# max_iter=100, \n",
644
+ "# solver='lbfgs'\n",
645
+ "# ),\n",
646
+ "# 'NB': GaussianNB(\n",
647
+ "# var_smoothing=0.0058873326349240295\n",
648
+ "# ),\n",
649
+ "# 'KN': KNeighborsClassifier(\n",
650
+ "# metric='manhattan', \n",
651
+ "# n_neighbors=8, \n",
652
+ "# weights='uniform'\n",
653
+ "# ),\n",
654
+ "# 'MLP': MLPClassifier(\n",
655
+ "# random_state=0, \n",
656
+ "# max_iter=1000, \n",
657
+ "# alpha=0.0003079393718075164, \n",
658
+ "# hidden_layer_sizes=195, \n",
659
+ "# learning_rate_init=0.0001675266159417717\n",
660
+ "# ),\n",
661
+ "# 'SVC': SVC(probability=True, kernel = 'rbf', C = 0.95, gamma = 'scale')}\n",
662
+ "\n",
663
+ "# Default classifiers\n",
664
+ "# classifiers = {\n",
665
+ "# 'DecisionTree': DecisionTreeClassifier(random_state=0),\n",
666
+ "# 'LogisticRegression': LogisticRegression(max_iter=1000, random_state=0),\n",
667
+ "# 'NaiveBayes': GaussianNB(),\n",
668
+ "# 'KNeighbors': KNeighborsClassifier(),\n",
669
+ "# 'MLP': MLPClassifier(max_iter=1000, random_state=0),\n",
670
+ "# 'SVC': SVC(probability=True, random_state=0)\n",
671
+ "# }\n",
672
+ "\n",
673
+ "# Main\n",
674
+ "# Initialize\n",
675
+ "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n",
676
+ "conf_matrices = defaultdict(list)\n",
677
+ "roc_curves = defaultdict(list)\n",
678
+ "roc_aucs = defaultdict(list)\n",
679
+ "accuracy_scores = defaultdict(list)\n",
680
+ "precision_scores = defaultdict(list)\n",
681
+ "recall_scores = defaultdict(list)\n",
682
+ "f1_scores = defaultdict(list)\n",
683
+ "\n",
684
+ "# Loop over 10 different random states\n",
685
+ "for random_state in range(10):\n",
686
+ " print(f\"Processing for Random State: {random_state}\")\n",
687
+ "\n",
688
+ " # Splitting the data\n",
689
+ " X = splitted_dataset.drop('depression_category', axis=1)\n",
690
+ " y = splitted_dataset['depression_category']\n",
691
+ " \n",
692
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
693
+ " \n",
694
+ " # Identify outliers in the training dataset\n",
695
+ " lof = LocalOutlierFactor()\n",
696
+ " yhat = lof.fit_predict(X_train)\n",
697
+ " # Select all rows that are not outliers\n",
698
+ " mask = yhat != -1\n",
699
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
700
+ " \n",
701
+ " original_columns = X.columns.tolist()\n",
702
+ "\n",
703
+ " # SMOTE\n",
704
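+ " # Oversample only the training split so no synthetic samples leak into the test set\n",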
+ " smote = SMOTE(random_state=random_state)\n",
705
+ " X_res, y_res = smote.fit_resample(X_train, y_train)\n",
706
+ "\n",
707
+ " print(f\"Number of training labels after SMOTE: {y_res.value_counts()}\")\n",
708
+ "\n",
709
+ " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n",
710
+ " \n",
711
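+ " # Downsample class 0 in the test split to 372 samples\n",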
+ " sampling_strategy_undersample = {0: 372}\n",
712
+ " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
713
+ " X_test, y_test = rus.fit_resample(X_test, y_test)\n",
714
+ " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
715
+ "\n",
716
+ " # Normalization\n",
717
+ " scaler = MinMaxScaler()\n",
718
+ " \n",
719
+ " X_res = scaler.fit_transform(X_res)\n",
720
+ " X_test = scaler.transform(X_test)\n",
721
+ " \n",
722
+ " X_res = pd.DataFrame(X_res, columns=original_columns)\n",
723
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
724
+ "\n",
725
+ " # Correlation Feat Analysis\n",
726
+ " corr_df = X_res.copy()\n",
727
+ " corr_df['target'] = y_res\n",
728
+ " \n",
729
+ " corr_mat = corr_df.corr()\n",
730
+ " target_correlation = corr_mat['target'].drop('target')\n",
731
+ " top_features = target_correlation.abs().sort_values(ascending=False).head(200).index.tolist()\n",
732
+ " \n",
733
+ " # Only take top features\n",
734
+ " X_res_fi = X_res[top_features]\n",
735
+ " X_test_fi = X_test[top_features]\n",
736
+ "\n",
737
+ " # Evaluate classifiers\n",
738
+ " for clf_name, clf in classifiers.items():\n",
739
+ " # Ensure the random state for classifiers is consistent\n",
740
+ " if hasattr(clf, 'random_state'):\n",
741
+ " clf.set_params(random_state=random_state)\n",
742
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n",
743
+ " metric_sums[clf_name]['accuracy'] += accuracy\n",
744
+ " metric_sums[clf_name]['precision'] += precision\n",
745
+ " metric_sums[clf_name]['recall'] += recall\n",
746
+ " metric_sums[clf_name]['f1'] += f1\n",
747
+ " conf_matrices[clf_name].append(conf_matrix)\n",
748
+ " roc_curves[clf_name].append((fpr, tpr))\n",
749
+ " roc_aucs[clf_name].append(roc_auc)\n",
750
+ " accuracy_scores[clf_name].append(accuracy)\n",
751
+ " precision_scores[clf_name].append(precision)\n",
752
+ " recall_scores[clf_name].append(recall)\n",
753
+ " f1_scores[clf_name].append(f1)"
754
+ ]
755
+ },
756
+ {
757
+ "cell_type": "markdown",
758
+ "id": "0ec7e67e-a086-48a7-83de-26b54ef03899",
759
+ "metadata": {},
760
+ "source": [
761
+ "#### Static Ensemble Classifiers"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": null,
767
+ "id": "e9e20dd2-bdd8-4626-be89-8bb866212de9",
768
+ "metadata": {
769
+ "scrolled": true
770
+ },
771
+ "outputs": [],
772
+ "source": [
773
+ "# Initialize\n",
774
+ "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n",
775
+ "conf_matrices = defaultdict(list)\n",
776
+ "roc_curves = defaultdict(list)\n",
777
+ "roc_aucs = defaultdict(list)\n",
778
+ "accuracy_scores = defaultdict(list)\n",
779
+ "precision_scores = defaultdict(list)\n",
780
+ "recall_scores = defaultdict(list)\n",
781
+ "f1_scores = defaultdict(list)\n",
782
+ "\n",
783
+ "# Optimized Classifiers\n",
784
+ "classifiers = {\n",
785
+ " 'RF': RandomForestClassifier(n_estimators=143, criterion='entropy', max_depth=15, random_state=0),\n",
786
+ " 'XGB': XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n",
787
+ " 'GB': GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.05),\n",
788
+ " # 'AB': AdaBoostClassifier(n_estimators=400, learning_rate=0.1),\n",
789
+ " # 'CB': CatBoostClassifier(depth = 3, iterations = 168, learning_rate = 0.1, verbose = 0),\n",
790
+ " # 'LGBM': LGBMClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200) \n",
791
+ "}\n",
792
+ "\n",
793
+ "# Default Classifiers\n",
794
+ "# classifiers = {\n",
795
+ "# 'RandomForest': RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=7, random_state=0),\n",
796
+ "# 'XGBoost': XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n",
797
+ "# 'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=0),\n",
798
+ "# 'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=0),\n",
799
+ "# 'CatBoost': CatBoostClassifier(n_estimators=100, verbose=0, random_state=0),\n",
800
+ "# 'LightGBM': LGBMClassifier(n_estimators=100, random_state=0)\n",
801
+ "# }\n",
802
+ "\n",
803
+ "# voting_clf = VotingClassifier(estimators=[\n",
804
+ "# ('rf', classifiers['RF']),\n",
805
+ "# ('xgb', classifiers['XGB']),\n",
806
+ "# ('gb', classifiers['GB']),\n",
807
+ "# ('ada', classifiers['AB']),\n",
808
+ "# ('cat', classifiers['CB']),\n",
809
+ "# ('lgbm', classifiers['LGBM'])\n",
810
+ "# ], voting='soft', n_jobs=1)\n",
811
+ "\n",
812
+ "# classifiers['Vot'] = voting_clf\n",
813
+ "\n",
814
+ "# Define the number of features for each classifier\n",
815
+ "num_features = {\n",
816
+ " 'RF': 150,\n",
817
+ " 'XGB': 150,\n",
818
+ " 'GB': 150,\n",
819
+ " # 'AB': 150,\n",
820
+ " # 'CB': 150,\n",
821
+ " # 'LGBM': 150,\n",
822
+ " # 'Vot': 150\n",
823
+ "}\n",
824
+ "\n",
825
+ "for random_state in range(10):\n",
826
+ " print(f\"Processing for Random State: {random_state}\")\n",
827
+ "\n",
828
+ " X = splitted_dataset.drop('depression_category', axis=1)\n",
829
+ " y = splitted_dataset['depression_category']\n",
830
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
831
+ " \n",
832
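+ " # Drop training rows flagged as outliers (-1) by Local Outlier Factor\n",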
+ " lof = LocalOutlierFactor()\n",
833
+ " yhat = lof.fit_predict(X_train)\n",
834
+ " mask = yhat != -1\n",
835
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
836
+ " \n",
837
+ " original_columns = X.columns.tolist()\n",
838
+ "\n",
839
+ " smote = SMOTE(random_state=random_state)\n",
840
+ " X_res, y_res = smote.fit_resample(X_train, y_train)\n",
841
+ "\n",
842
+ " print(f\"Number of training labels after SMOTE: {y_res.value_counts()}\")\n",
843
+ "\n",
844
+ " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n",
845
+ " \n",
846
+ " sampling_strategy_undersample = {0: 372}\n",
847
+ " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
848
+ " X_test, y_test = rus.fit_resample(X_test, y_test)\n",
849
+ "\n",
850
+ " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
851
+ "\n",
852
+ " scaler = MinMaxScaler()\n",
853
+ " \n",
854
+ " X_res = scaler.fit_transform(X_res)\n",
855
+ " X_test = scaler.transform(X_test)\n",
856
+ " \n",
857
+ " X_res = pd.DataFrame(X_res, columns=original_columns)\n",
858
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
859
+ "\n",
860
+ " log_reg = LogisticRegression(C=0.09659168435718246, max_iter=100, solver='lbfgs', random_state=random_state)\n",
861
+ " log_reg.fit(X_res, y_res)\n",
862
+ " selector = SelectFromModel(log_reg, prefit=True)\n",
863
+ " \n",
864
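+ " # Rank features by absolute logistic-regression coefficient (the SelectFromModel above is unused)\n",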
+ " importance = np.abs(log_reg.coef_[0])\n",
865
+ " indices = np.argsort(importance)[::-1]\n",
866
+ " important_features = [original_columns[i] for i in indices]\n",
867
+ " \n",
868
+ " for clf_name, clf in classifiers.items():\n",
869
+ " num_top_features = num_features[clf_name]\n",
870
+ " selected_features = important_features[:num_top_features]\n",
871
+ " \n",
872
+ " X_res_fi = pd.DataFrame(X_res, columns=original_columns)[selected_features]\n",
873
+ " X_test_fi = pd.DataFrame(X_test, columns=original_columns)[selected_features]\n",
874
+ "\n",
875
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(\n",
876
+ " clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name\n",
877
+ " )\n",
878
+ " metric_sums[clf_name]['accuracy'] += accuracy\n",
879
+ " metric_sums[clf_name]['precision'] += precision\n",
880
+ " metric_sums[clf_name]['recall'] += recall\n",
881
+ " metric_sums[clf_name]['f1'] += f1\n",
882
+ " conf_matrices[clf_name].append(conf_matrix)\n",
883
+ " roc_curves[clf_name].append((fpr, tpr))\n",
884
+ " roc_aucs[clf_name].append(roc_auc)\n",
885
+ " accuracy_scores[clf_name].append(accuracy)\n",
886
+ " precision_scores[clf_name].append(precision)\n",
887
+ " recall_scores[clf_name].append(recall)\n",
888
+ " f1_scores[clf_name].append(f1)"
889
+ ]
890
+ },
891
+ {
892
+ "cell_type": "markdown",
893
+ "id": "c0158f58-e6d5-4adf-90e3-f1892294ad24",
894
+ "metadata": {},
895
+ "source": [
896
+ "### Post Training (classic/static)\n",
897
+ "Only run it after one of the training methods above has finished"
898
+ ]
899
+ },
900
+ {
901
+ "cell_type": "code",
902
+ "execution_count": null,
903
+ "id": "c2ab5543-ad6e-4463-8d9a-35fe3e4e8eb4",
904
+ "metadata": {},
905
+ "outputs": [],
906
+ "source": [
907
+ "print('\\nAverage Metrics over 10 Random States:')\n",
908
+ "for clf_name, metrics in metric_sums.items():\n",
909
+ " avg_accuracy = metrics['accuracy'] / 10\n",
910
+ " avg_precision = metrics['precision'] / 10\n",
911
+ " avg_recall = metrics['recall'] / 10\n",
912
+ " avg_f1 = metrics['f1'] / 10\n",
913
+ " std_accuracy = np.std(accuracy_scores[clf_name])\n",
914
+ " std_precision = np.std(precision_scores[clf_name])\n",
915
+ " std_recall = np.std(recall_scores[clf_name])\n",
916
+ " std_f1 = np.std(f1_scores[clf_name])\n",
917
+ " avg_auc = np.mean(roc_aucs[clf_name])\n",
918
+ " print(f'{clf_name} - Accuracy: {avg_accuracy:.4f} ± {std_accuracy:.4f}, Precision: {avg_precision:.4f} ± {std_precision:.4f}, Recall: {avg_recall:.4f} ± {std_recall:.4f}, F1-Score: {avg_f1:.4f} ± {std_f1:.4f}, AUC: {avg_auc:.4f}')"
919
+ ]
920
+ },
921
+ {
922
+ "cell_type": "code",
923
+ "execution_count": null,
924
+ "id": "21d5c872-3452-41ca-8f7c-5f4ceb184fba",
925
+ "metadata": {},
926
+ "outputs": [],
927
+ "source": [
928
+ "# Plot ROC Curves for each classifier in one graph\n",
929
+ "plot_combined_roc_curve(roc_curves, classifiers.keys())"
930
+ ]
931
+ },
932
+ {
933
+ "cell_type": "code",
934
+ "execution_count": null,
935
+ "id": "26f7a715-efc8-44a6-8c53-e59b6268e551",
936
+ "metadata": {
937
+ "scrolled": true
938
+ },
939
+ "outputs": [],
940
+ "source": [
941
+ "# Friedman-Nemenyi test and critical-difference (CD) diagram\n",
942
+ "df = pd.DataFrame(accuracy_scores)\n",
943
+ "scores = [df[col].values for col in df.columns]\n",
944
+ "stat, p = friedmanchisquare(*scores)\n",
945
+ "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n",
946
+ "ranks = df.rank(axis=1, method='average', ascending=False)\n",
947
+ "average_ranks = ranks.mean().values\n",
948
+ "n_datasets = df.shape[0]\n",
949
+ "alpha = 0.05\n",
950
+ "cd = compute_CD(average_ranks, n_datasets, alpha='0.05')\n",
951
+ "print(f'Critical Difference: {cd}')\n",
952
+ "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n",
953
+ "plt.figure(figsize=(14, 8))\n",
954
+ "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n",
955
+ "plt.text(0.5, 1.19, f'Friedman-Nemenyi: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=14)\n",
956
+ "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=14)\n",
957
+ "plt.tight_layout()"
958
+ ]
959
+ },
960
+ {
961
+ "cell_type": "markdown",
962
+ "id": "88072d69-8bb5-46ab-9c79-6990db7cc8d2",
963
+ "metadata": {},
964
+ "source": [
965
+ "### Hyperparameter optimization (classic/static)"
966
+ ]
967
+ },
968
+ {
969
+ "cell_type": "code",
970
+ "execution_count": null,
971
+ "id": "3a4a1546-c24f-4349-bf1b-75ba937700be",
972
+ "metadata": {
973
+ "scrolled": true
974
+ },
975
+ "outputs": [],
976
+ "source": [
977
+ "# Hyperparameter optimization classic\n",
978
+ "search_spaces = {\n",
979
+ " 'DecisionTree': {\n",
980
+ " 'criterion': Categorical(['gini', 'entropy']),\n",
981
+ " 'max_depth': Integer(1, 20),\n",
982
+ " 'min_samples_split': Integer(2, 10),\n",
983
+ " 'min_samples_leaf': Integer(1, 10)\n",
984
+ " },\n",
985
+ " 'LogisticRegression': {\n",
986
+ " 'C': Real(1e-6, 1e+6, prior='log-uniform'),\n",
987
+ " 'solver': Categorical(['lbfgs', 'liblinear']),\n",
988
+ " 'max_iter': Integer(100, 1000)\n",
989
+ " },\n",
990
+ " 'NaiveBayes': {\n",
991
+ " 'var_smoothing': Real(1e-9, 1e-2, prior='log-uniform')\n",
992
+ " },\n",
993
+ " 'KNeighbors': {\n",
994
+ " 'n_neighbors': Integer(1, 30),\n",
995
+ " 'weights': Categorical(['uniform', 'distance']),\n",
996
+ " 'metric': Categorical(['euclidean', 'manhattan', 'minkowski'])\n",
997
+ " },\n",
998
+ " 'MLP': {\n",
999
+ " 'hidden_layer_sizes': Integer(50, 200),\n",
1000
+ " 'alpha': Real(1e-6, 1e-2, prior='log-uniform'),\n",
1001
+ " 'learning_rate_init': Real(1e-4, 1e-2, prior='log-uniform')\n",
1002
+ " },\n",
1003
+ " 'SVC': {\n",
1004
+ " 'C': [0.1, 1, 10, 100, 1000],\n",
1005
+ " 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n",
1006
+ " 'kernel': ['rbf']\n",
1007
+ " }\n",
1008
+ "}\n",
1009
+ "\n",
1010
+ "classifiers = {\n",
1011
+ " 'DecisionTree': DecisionTreeClassifier(random_state=0),\n",
1012
+ " 'LogisticRegression': LogisticRegression(max_iter=1000, random_state=0),\n",
1013
+ " 'NaiveBayes': GaussianNB(),\n",
1014
+ " 'KNeighbors': KNeighborsClassifier(),\n",
1015
+ " 'MLP': MLPClassifier(max_iter=1000, random_state=0),\n",
1016
+ " 'SVC': SVC(probability=True, random_state=0)\n",
1017
+ "}\n",
1018
+ "\n",
1019
+ "top_features_count = {\n",
1020
+ " 'DecisionTree': 200,\n",
1021
+ " 'LogisticRegression': 200,\n",
1022
+ " 'NaiveBayes': 200,\n",
1023
+ " 'KNeighbors': 200,\n",
1024
+ " 'MLP': 200,\n",
1025
+ " 'SVC': 200\n",
1026
+ "}\n",
1027
+ "\n",
1028
+ "random_state = 0\n",
1029
+ "print(f\"Processing for Random State: {random_state}\")\n",
1030
+ "\n",
1031
+ "X = splitted_dataset.drop('depression_category', axis=1)\n",
1032
+ "y = splitted_dataset['depression_category']\n",
1033
+ "\n",
1034
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
1035
+ "\n",
1036
+ "lof = LocalOutlierFactor()\n",
1037
+ "yhat = lof.fit_predict(X_train)\n",
1038
+ "mask = yhat != -1\n",
1039
+ "X_train, y_train = X_train[mask], y_train[mask]\n",
1040
+ "\n",
1041
+ "original_columns = X.columns.tolist()\n",
1042
+ "\n",
1043
+ "smote = SMOTE(random_state=random_state)\n",
1044
+ "X_res, y_res = smote.fit_resample(X_train, y_train)\n",
1045
+ "\n",
1046
+ "print(f\"Number of training labels after SMOTE: {y_res.value_counts()}\")\n",
1047
+ "\n",
1048
+ "print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n",
1049
+ "\n",
1050
+ "sampling_strategy_undersample = {0: 372}\n",
1051
+ "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
1052
+ "X_test, y_test = rus.fit_resample(X_test, y_test)\n",
1053
+ "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
1054
+ "\n",
1055
+ "scaler = MinMaxScaler()\n",
1056
+ "\n",
1057
+ "X_res = scaler.fit_transform(X_res)\n",
1058
+ "X_test = scaler.transform(X_test)\n",
1059
+ "\n",
1060
+ "X_res = pd.DataFrame(X_res, columns=original_columns)\n",
1061
+ "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
1062
+ "\n",
1063
+ "corr_df = X_res.copy()\n",
1064
+ "corr_df['target'] = y_res\n",
1065
+ "\n",
1066
+ "corr_mat = corr_df.corr()\n",
1067
+ "target_correlation = corr_mat['target'].drop('target')\n",
1068
+ "\n",
1069
+ "for clf_name, clf in classifiers.items():\n",
1070
+ " print(f\"Optimizing {clf_name}\")\n",
1071
+ " \n",
1072
+ " top_features = target_correlation.abs().sort_values(ascending=False).head(top_features_count[clf_name]).index.tolist()\n",
1073
+ " \n",
1074
+ " X_res_fi = X_res[top_features]\n",
1075
+ " X_test_fi = X_test[top_features]\n",
1076
+ " \n",
1077
+ " opt = BayesSearchCV(clf, search_spaces[clf_name], n_iter=30, cv=3, random_state=random_state, n_jobs=-1, verbose = 30)\n",
1078
+ " opt.fit(X_res_fi, y_res)\n",
1079
+ " \n",
1080
+ " best_clf = opt.best_estimator_\n",
1081
+ " best_params = opt.best_params_\n",
1082
+ "\n",
1083
+ " print(f\"Best parameters for {clf_name}: {best_params}\")\n",
1084
+ " \n",
1085
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(best_clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n",
1086
+ " print(f\"Best results for {clf_name}:\")\n",
1087
+ " print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, AUC: {roc_auc:.4f}')\n",
1088
+ " print(conf_matrix)\n",
1089
+ " print() "
1090
+ ]
1091
+ },
1092
+ {
1093
+ "cell_type": "code",
1094
+ "execution_count": null,
1095
+ "id": "2f473cc7-577d-4dcd-a8b1-be7f33200fbc",
1096
+ "metadata": {
1097
+ "scrolled": true
1098
+ },
1099
+ "outputs": [],
1100
+ "source": [
1101
+ "# Hyperparameter optimization static\n",
1102
+ "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n",
1103
+ "conf_matrices = defaultdict(list)\n",
1104
+ "accuracy_scores = defaultdict(list)\n",
1105
+ "precision_scores = defaultdict(list)\n",
1106
+ "recall_scores = defaultdict(list)\n",
1107
+ "f1_scores = defaultdict(list)\n",
1108
+ "\n",
1109
+ "classifiers = {\n",
1110
+ " 'RandomForest': RandomForestClassifier(),\n",
1111
+ " 'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),\n",
1112
+ " 'AdaBoost': AdaBoostClassifier(),\n",
1113
+ " 'GradientBoosting': GradientBoostingClassifier(),\n",
1114
+ " 'CatBoost': CatBoostClassifier(verbose=0),\n",
1115
+ " 'LightGBM': LGBMClassifier()\n",
1116
+ "}\n",
1117
+ "\n",
1118
+ "num_features = {\n",
1119
+ " 'RandomForest': 150,\n",
1120
+ " 'XGBoost': 150,\n",
1121
+ " 'GradientBoosting': 150,\n",
1122
+ " 'AdaBoost': 150,\n",
1123
+ " 'CatBoost': 150,\n",
1124
+ " 'LightGBM': 150,\n",
1125
+ "}\n",
1126
+ "\n",
1127
+ "search_spaces = {\n",
1128
+ " 'RandomForest': {\n",
1129
+ " 'n_estimators': [100, 200, 300],\n",
1130
+ " 'criterion': ['gini', 'entropy'],\n",
1131
+ " 'max_depth': [None, 7, 15],\n",
1132
+ " 'bootstrap': [True, False]\n",
1133
+ " },\n",
1134
+ " 'XGBoost': {\n",
1135
+ " 'n_estimators': [100, 200, 300],\n",
1136
+ " 'max_depth': [5, 10],\n",
1137
+ " 'learning_rate': [0.01, 0.1, 0.2],\n",
1138
+ " 'gamma': [0, 0.2, 0.4],\n",
1139
+ " },\n",
1140
+ " 'GradientBoosting': {\n",
1141
+ " 'n_estimators': [100, 200, 300],\n",
1142
+ " 'learning_rate': [0.01, 0.1, 0.2],\n",
1143
+ " 'max_depth': [5, 10],\n",
1144
+ " 'subsample': [0.7, 0.9, 1.0],\n",
1145
+ " },\n",
1146
+ " 'AdaBoost': {\n",
1147
+ " 'n_estimators': [100, 200, 300],\n",
1148
+ " 'learning_rate': [0.1, 0.5, 1.0],\n",
1149
+ " 'algorithm': ['SAMME', 'SAMME.R']\n",
1150
+ " },\n",
1151
+ " 'CatBoost': {\n",
1152
+ " 'iterations': [100, 200, 300],\n",
1153
+ " 'depth': [5, 7, 9],\n",
1154
+ " 'learning_rate': [0.01, 0.1, 0.2],\n",
1155
+ " },\n",
1156
+ " 'LightGBM': {\n",
1157
+ " 'n_estimators': [100, 200, 300],\n",
1158
+ " 'num_leaves': [31, 63, 127],\n",
1159
+ " 'learning_rate': [0.01, 0.1, 0.2],\n",
1160
+ " 'subsample': [0.7, 0.9, 1.0],\n",
1161
+ " }\n",
1162
+ "}\n",
1163
+ "\n",
1164
+ "def hyperparameter_optimization(clf, search_space, X, y):\n",
1165
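+ " # Run BayesSearchCV under several CV seeds and keep the modal best parameters\n",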
+ " combined_results = []\n",
1166
+ " for random_state in range(3):\n",
1167
+ " cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)\n",
1168
+ " opt = BayesSearchCV(clf, search_space, n_iter=30, cv=cv, random_state=random_state, n_jobs=-1, verbose=0)\n",
1169
+ " opt.fit(X, y)\n",
1170
+ " combined_results.append(opt.best_params_)\n",
1171
+ " best_params = pd.DataFrame(combined_results).mode().iloc[0].to_dict()\n",
1172
+ " return best_params\n",
1173
+ "\n",
1174
+ "for random_state in range(9, 10): # only random_state 9 is processed here\n",
1175
+ " print(f\"Processing for Random State: {random_state}\")\n",
1176
+ "\n",
1177
+ " X = splitted_dataset.drop('depression_category', axis=1)\n",
1178
+ " y = splitted_dataset['depression_category']\n",
1179
+ " \n",
1180
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
1181
+ " \n",
1182
+ " lof = LocalOutlierFactor()\n",
1183
+ " yhat = lof.fit_predict(X_train)\n",
1184
+ " mask = yhat != -1\n",
1185
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
1186
+ " \n",
1187
+ " original_columns = X.columns.tolist()\n",
1188
+ "\n",
1189
+ " smote = SMOTE(random_state=random_state)\n",
1190
+ " X_res, y_res = smote.fit_resample(X_train, y_train)\n",
1191
+ "\n",
1192
+ " print(f\"Number of training labels after SMOTE: {y_res.value_counts()}\")\n",
1193
+ "\n",
1194
+ " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n",
1195
+ " \n",
1196
+ " sampling_strategy_undersample = {0: 372}\n",
1197
+ " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
1198
+ " X_test, y_test = rus.fit_resample(X_test, y_test)\n",
1199
+ "\n",
1200
+ " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
1201
+ "\n",
1202
+ " scaler = MinMaxScaler()\n",
1203
+ " \n",
1204
+ " X_res = scaler.fit_transform(X_res)\n",
1205
+ " X_test = scaler.transform(X_test)\n",
1206
+ " \n",
1207
+ " X_res = pd.DataFrame(X_res, columns=original_columns)\n",
1208
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
1209
+ "\n",
1210
+ " log_reg = LogisticRegression(C=0.09659168435718246, max_iter=100, solver='lbfgs', random_state=random_state)\n",
1211
+ " log_reg.fit(X_res, y_res)\n",
1212
+ " selector = SelectFromModel(log_reg, prefit=True)\n",
1213
+ " \n",
1214
+ " importance = np.abs(log_reg.coef_[0])\n",
1215
+ " indices = np.argsort(importance)[::-1]\n",
1216
+ " important_features = [original_columns[i] for i in indices[:300]]\n",
1217
+ "\n",
1218
+ " for clf_name, clf in classifiers.items():\n",
1219
+ " print(f\"Optimizing {clf_name}\")\n",
1220
+ " num_top_features = num_features[clf_name]\n",
1221
+ " selected_features = important_features[:num_top_features]\n",
1222
+ " \n",
1223
+ " X_res_fi = pd.DataFrame(X_res, columns=original_columns)[selected_features]\n",
1224
+ " \n",
1225
+ " best_params = hyperparameter_optimization(clf, search_spaces[clf_name], X_res_fi, y_res)\n",
1226
+ " if 'n_estimators' in best_params:\n",
1227
+ " best_params['n_estimators'] = int(best_params['n_estimators'])\n",
1228
+ " if 'max_depth' in best_params:\n",
1229
+ " best_params['max_depth'] = int(best_params['max_depth'])\n",
1230
+ " if 'iterations' in best_params:\n",
1231
+ " best_params['iterations'] = int(best_params['iterations'])\n",
1232
+ " clf.set_params(**best_params)\n",
1233
+ " print(f\"Best parameters for {clf_name}: {best_params}\")\n",
1234
+ "\n",
1235
+ " X_test_fi = pd.DataFrame(X_test, columns=original_columns)[selected_features]\n",
1236
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n",
1237
+ " metric_sums[clf_name]['accuracy'] += accuracy\n",
1238
+ " metric_sums[clf_name]['precision'] += precision\n",
1239
+ " metric_sums[clf_name]['recall'] += recall\n",
1240
+ " metric_sums[clf_name]['f1'] += f1\n",
1241
+ " conf_matrices[clf_name].append(conf_matrix)\n",
1242
+ " accuracy_scores[clf_name].append(accuracy)\n",
1243
+ " precision_scores[clf_name].append(precision)\n",
1244
+ " recall_scores[clf_name].append(recall)\n",
1245
+ " f1_scores[clf_name].append(f1)"
1246
+ ]
1247
+ },
1248
+ {
1249
+ "cell_type": "markdown",
1250
+ "id": "65df93d4-12d3-4d68-b402-633354849dff",
1251
+ "metadata": {},
1252
+ "source": [
1253
+ "### DES (Dynamic Ensemble Selection) Training (all)"
1254
+ ]
1255
+ },
1256
+ {
1257
+ "cell_type": "code",
1258
+ "execution_count": null,
1259
+ "id": "709bb1f8-249e-4409-a537-c6bbe9a399f3",
1260
+ "metadata": {
1261
+ "scrolled": true
1262
+ },
1263
+ "outputs": [],
1264
+ "source": [
1265
+ "metric_sums_des = {\n",
1266
+ " 'KNORAE': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1267
+ " 'KNORAU': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1268
+ " 'KNOP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1269
+ " 'DESMI': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1270
+ " 'METADES': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1271
+ " 'DESKNN': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1272
+ " 'DESP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1273
+ " 'FIRE-KNORA-U': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1274
+ " 'FIRE-KNORA-E': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1275
+ " 'FIRE-METADES': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1276
+ " 'FIRE-DESKNN': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1277
+ " 'FIRE-DESP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1278
+ " 'FIRE-KNOP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1279
+ "}\n",
1280
+ "\n",
1281
+ "conf_matrices_des = {\n",
1282
+ " 'KNORAE': [],\n",
1283
+ " 'KNORAU': [],\n",
1284
+ " 'KNOP': [],\n",
1285
+ " 'DESMI': [],\n",
1286
+ " 'METADES': [],\n",
1287
+ " 'DESKNN': [],\n",
1288
+ " 'DESP': [],\n",
1289
+ " 'FIRE-KNORA-U': [],\n",
1290
+ " 'FIRE-KNORA-E': [],\n",
1291
+ " 'FIRE-METADES': [],\n",
1292
+ " 'FIRE-DESKNN': [],\n",
1293
+ " 'FIRE-DESP': [],\n",
1294
+ " 'FIRE-KNOP': [],\n",
1295
+ "}\n",
1296
+ "\n",
1297
+ "roc_curves = defaultdict(list)\n",
1298
+ "roc_aucs = defaultdict(list)\n",
1299
+ "accuracy_scores = defaultdict(list)\n",
1300
+ "precision_scores = defaultdict(list)\n",
1301
+ "recall_scores = defaultdict(list)\n",
1302
+ "f1_scores = defaultdict(list)\n",
1303
+ "feature_importance_runs = []\n",
1304
+ "\n",
1305
+ "# Uncomment the base-classifier combinations you want in the pool\n",
1306
+ "base_classifiers = {\n",
1307
+ " # 'DecisionTree': DecisionTreeClassifier(\n",
1308
+ " # random_state=0, \n",
1309
+ " # criterion='gini', \n",
1310
+ " # max_depth=6, \n",
1311
+ " # min_samples_leaf=10, \n",
1312
+ " # min_samples_split=9\n",
1313
+ " # ),\n",
1314
+ " # 'LogisticRegression': LogisticRegression(\n",
1315
+ " # random_state=0, \n",
1316
+ " # C=0.09659168435718246, \n",
1317
+ " # max_iter=100, \n",
1318
+ " # solver='lbfgs'\n",
1319
+ " # ),\n",
1320
+ " # 'NaiveBayes': GaussianNB(\n",
1321
+ " # var_smoothing=0.0058873326349240295\n",
1322
+ " # ),\n",
1323
+ " # 'KNeighbors': KNeighborsClassifier(\n",
1324
+ " # metric='manhattan', \n",
1325
+ " # n_neighbors=15, \n",
1326
+ " # weights='uniform'\n",
1327
+ " # ),\n",
1328
+ " # 'MLP': MLPClassifier(\n",
1329
+ " # random_state=0, \n",
1330
+ " # max_iter=1000, \n",
1331
+ " # alpha=0.0003079393718075164, \n",
1332
+ " # hidden_layer_sizes=195, \n",
1333
+ " # learning_rate_init=0.0001675266159417717\n",
1334
+ " # ),\n",
1335
+ " # 'SVC': SVC(probability=True, kernel = 'rbf', C = 1.5, gamma = 'auto'),\n",
1336
+ " # 'RF': RandomForestClassifier(n_estimators=143, criterion='entropy', max_depth=15, random_state=0),\n",
1337
+ " 'XGB': XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n",
1338
+ " 'GB': GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.05),\n",
1339
+ " 'AB': AdaBoostClassifier(n_estimators=400, learning_rate=0.1),\n",
1340
+ " # 'CB': CatBoostClassifier(depth = 3, iterations = 168, learning_rate = 0.1, verbose = 0),\n",
1341
+ " # 'LGBM': LGBMClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200) \n",
1342
+ "}\n",
1343
+ "\n",
1344
1345
+ "\n",
1346
+ "for random_state in range(10):\n",
1347
+ " print(f\"Processing for Random State: {random_state}\")\n",
1348
+ "\n",
1349
+ " X = splitted_dataset.drop('depression_category', axis=1)\n",
1350
+ " y = splitted_dataset['depression_category']\n",
1351
+ " \n",
1352
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
1353
+ " \n",
1354
+ " lof = LocalOutlierFactor()\n",
1355
+ " yhat = lof.fit_predict(X_train)\n",
1356
+ " mask = yhat != -1\n",
1357
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
1358
+ " \n",
1359
+ " original_columns = X.columns.tolist()\n",
1360
+ "\n",
1361
+ " smote = SMOTE(random_state=random_state)\n",
1362
+ " X_res, y_res = smote.fit_resample(X_train, y_train)\n",
1363
+ "\n",
1364
+ " print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n",
1365
+ " sampling_strategy_undersample = {0: 372}\n",
1366
+ " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
1367
+ " X_test, y_test = rus.fit_resample(X_test, y_test) \n",
1368
+ "\n",
1369
+ " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
1370
+ "\n",
1371
+ " scaler = MinMaxScaler()\n",
1372
+ " X_res = scaler.fit_transform(X_res)\n",
1373
+ " X_test = scaler.transform(X_test)\n",
1374
+ " \n",
1375
+ " X_res = pd.DataFrame(X_res, columns=original_columns)\n",
1376
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
1377
+ "\n",
1378
+ " xgb_fs = XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=random_state)\n",
1379
+ " xgb_fs.fit(X_res, y_res)\n",
1380
+ "\n",
1381
+ " feature_importances = xgb_fs.feature_importances_\n",
1382
+ " indices = np.argsort(feature_importances)[::-1]\n",
1383
+ " top_50_features = [original_columns[i] for i in indices[:50]]\n",
1384
+ " current_run_features = {original_columns[i]: feature_importances[i] for i in indices[:50]}\n",
1385
+ " \n",
1386
+ " feature_importance_runs.append(current_run_features)\n",
1387
+ "\n",
1388
+ " X_res_fi = X_res[top_50_features]\n",
1389
+ " X_test_fi = X_test[top_50_features]\n",
1390
+ " \n",
1391
+ " model_pool = list(base_classifiers.values())\n",
1392
+ " \n",
1393
+ " for clf in model_pool:\n",
1394
+ " clf.fit(X_res_fi, y_res)\n",
1395
+ " \n",
1396
+ " des_models = {\n",
1397
+ " 'KNORAE': KNORAE(pool_classifiers=model_pool, random_state=random_state),\n",
1398
+ " 'KNORAU': KNORAU(pool_classifiers=model_pool, random_state=random_state),\n",
1399
+ " 'DESMI': DESMI(pool_classifiers=model_pool, random_state=random_state),\n",
1400
+ " 'METADES': METADES(pool_classifiers=model_pool, random_state=random_state),\n",
1401
+ " 'DESKNN': DESKNN(pool_classifiers=model_pool, random_state=random_state),\n",
1402
+ " 'DESP': DESP(pool_classifiers=model_pool, random_state=random_state),\n",
1403
+ " 'KNOP': KNOP(pool_classifiers=model_pool, random_state=random_state, k=9),\n",
1404
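+ " # DFP=True enables Dynamic Frienemy Pruning, i.e. the FIRE-DES variants\n",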
+ " 'FIRE-KNORA-U': KNORAU(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1405
+ " 'FIRE-KNORA-E': KNORAE(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1406
+ " 'FIRE-METADES': METADES(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1407
+ " 'FIRE-DESKNN': DESKNN(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1408
+ " 'FIRE-DESP': DESP(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1409
+ " 'FIRE-KNOP': KNOP(pool_classifiers=model_pool, DFP=True, k=40, random_state = random_state)\n",
1410
+ " }\n",
1411
+ "\n",
1412
+ " for des_name, des_model in des_models.items():\n",
1413
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(\n",
1414
+ " des_model, X_res_fi, y_res, X_test_fi, y_test, clf_name=des_name\n",
1415
+ " )\n",
1416
+ " metric_sums_des[des_name]['accuracy'] += accuracy\n",
1417
+ " metric_sums_des[des_name]['precision'] += precision\n",
1418
+ " metric_sums_des[des_name]['recall'] += recall\n",
1419
+ " metric_sums_des[des_name]['f1'] += f1\n",
1420
+ " conf_matrices_des[des_name].append(conf_matrix)\n",
1421
+ " roc_curves[des_name].append((fpr, tpr))\n",
1422
+ " roc_aucs[des_name].append(roc_auc)\n",
1423
+ " accuracy_scores[des_name].append(accuracy)\n",
1424
+ " precision_scores[des_name].append(precision)\n",
1425
+ " recall_scores[des_name].append(recall)\n",
1426
+ " f1_scores[des_name].append(f1)\n",
1427
+ "\n",
1428
+ " print(f'Confusion Matrix for {des_name} at Random State {random_state}:\\n{conf_matrix}\\n')"
1429
+ ]
1430
+ },
1431
+ {
1432
+ "cell_type": "code",
1433
+ "execution_count": null,
1434
+ "id": "34b0ead4-1d8c-4d0a-b0c7-71cd71f6ab7d",
1435
+ "metadata": {},
1436
+ "outputs": [],
1437
+ "source": [
1438
+ "def plot_combined_roc_curve(roc_curves, classifier_names):\n",
1439
+ " plt.figure(figsize=(12, 8))\n",
1440
+ " mean_fpr = np.linspace(0, 1, 100)\n",
1441
+ " colors = plt.cm.get_cmap('tab20', len(classifier_names))\n",
1442
+ " \n",
1443
+ " for i, clf_name in enumerate(classifier_names):\n",
1444
+ " tprs = []\n",
1445
+ " for fpr, tpr in roc_curves[clf_name]:\n",
1446
+ " tprs.append(np.interp(mean_fpr, fpr, tpr))\n",
1447
+ " mean_tpr = np.mean(tprs, axis=0)\n",
1448
+ " mean_tpr[-1] = 1.0\n",
1449
+ " mean_auc = auc(mean_fpr, mean_tpr)\n",
1450
+ " plt.plot(mean_fpr, mean_tpr, color=colors(i), lw=2, linestyle='-', marker='o', markersize=4, \n",
1451
+ " label=f'{clf_name} (AUC = {mean_auc:.3f})')\n",
1452
+ "\n",
1453
+ " plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')\n",
1454
+ " plt.xlim([0.0, 1.0])\n",
1455
+ " plt.ylim([0.0, 1.05])\n",
1456
+ " plt.xlabel('False Positive Rate', fontsize=26)\n",
1457
+ " plt.ylabel('True Positive Rate', fontsize=26)\n",
1458
+ " plt.xticks(fontsize=30) # Increase x-axis numbers font size\n",
1459
+ " plt.yticks(fontsize=30) # Increase y-axis numbers font size\n",
1460
+ " plt.legend(loc=\"center left\", bbox_to_anchor=(1.05, 0.5), fontsize=26, frameon=True, framealpha=0.9) # Place legend beside the plot\n",
1461
+ " plt.grid(True)\n",
1462
+ "\n",
1463
+ " filename='combined_roc_curves_des.svg'\n",
1464
+ "\n",
1465
+ " plt.savefig(filename, format='svg', bbox_inches = 'tight')\n",
1466
+ " plt.show()\n",
1467
+ "\n",
1468
+ " display(FileLink(filename))"
1469
+ ]
1470
+ },
1471
+ {
1472
+ "cell_type": "code",
1473
+ "execution_count": null,
1474
+ "id": "4cfd8d73-26fd-4dbc-a6ca-92a9643e1d27",
1475
+ "metadata": {},
1476
+ "outputs": [],
1477
+ "source": [
1478
+ "print('\\nAverage Metrics over 10 Random States:')\n",
1479
+ "for des_name, metrics in metric_sums_des.items():\n",
1480
+ " avg_accuracy = metrics['accuracy'] / 10\n",
1481
+ " avg_precision = metrics['precision'] / 10\n",
1482
+ " avg_recall = metrics['recall'] / 10\n",
1483
+ " avg_f1 = metrics['f1'] / 10\n",
1484
+ " std_accuracy = np.std(accuracy_scores[des_name])\n",
1485
+ " std_precision = np.std(precision_scores[des_name])\n",
1486
+ " std_recall = np.std(recall_scores[des_name])\n",
1487
+ " std_f1 = np.std(f1_scores[des_name])\n",
1488
+ " avg_auc = np.mean(roc_aucs[des_name])\n",
1489
+ " print(f'{des_name} - Accuracy: {avg_accuracy:.4f} ± {std_accuracy:.4f}, Precision: {avg_precision:.4f} ± {std_precision:.4f}, Recall: {avg_recall:.4f} ± {std_recall:.4f}, F1-Score: {avg_f1:.4f} ± {std_f1:.4f}, AUC: {avg_auc:.4f}')\n",
1490
+ "\n",
1491
+ "plot_combined_roc_curve(roc_curves, list(des_models.keys()))"
1492
+ ]
1493
+ },
1494
+ {
1495
+ "cell_type": "code",
1496
+ "execution_count": null,
1497
+ "id": "32780b70-dae5-4f7d-9ae4-128dc782fad4",
1498
+ "metadata": {},
1499
+ "outputs": [],
1500
+ "source": [
1501
+ "df = pd.DataFrame(accuracy_scores)\n",
1502
+ "scores = [df[col].values for col in df.columns]\n",
1503
+ "\n",
1504
+ "stat, p = friedmanchisquare(*scores)\n",
1505
+ "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n",
1506
+ "\n",
1507
+ "ranks = df.rank(axis=1, method='average', ascending=False)\n",
1508
+ "average_ranks = ranks.mean().values\n",
1509
+ "\n",
1510
+ "n_datasets = df.shape[0]\n",
1511
+ "alpha = 0.05\n",
1512
+ "\n",
1513
+ "cd = compute_CD(average_ranks, n_datasets, alpha='0.05')\n",
1514
+ "print(f'Critical Difference: {cd}')\n",
1515
+ "\n",
1516
+ "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n",
1517
+ "\n",
1518
+ "plt.figure(figsize=(14, 10))\n",
1519
+ "\n",
1520
+ "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n",
1521
+ "plt.xlabel('Classifiers')\n",
1522
+ "\n",
1523
+ "plt.text(0.5, 1.19, f'Friedman-Nemenyi: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n",
1524
+ "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n",
1525
+ "\n",
1526
+ "plt.tight_layout()"
1527
+ ]
1528
+ },
1529
+ {
1530
+ "cell_type": "markdown",
1531
+ "id": "f25a2b29-129f-4314-b2f9-8aff9615d5d7",
1532
+ "metadata": {},
1533
+ "source": [
1534
+ "### SHAP (outputs are mostly exported files)"
1535
+ ]
1536
+ },
1537
+ {
1538
+ "cell_type": "code",
1539
+ "execution_count": null,
1540
+ "id": "ed3c43d0-68d2-4aac-ac44-25a28dc6dc75",
1541
+ "metadata": {},
1542
+ "outputs": [],
1543
+ "source": [
1544
+ "# Example with XGB\n",
1545
+ "\n",
1546
+ "random_state = 2\n",
1547
+ "print(f\"Processing for Random State: {random_state}\")\n",
1548
+ "\n",
1549
+ "X = splitted_dataset.drop('depression_category', axis=1)\n",
1550
+ "y = splitted_dataset['depression_category']\n",
1551
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
1552
+ "\n",
1553
+ "lof = LocalOutlierFactor()\n",
1554
+ "yhat = lof.fit_predict(X_train)\n",
1555
+ "mask = yhat != -1\n",
1556
+ "X_train, y_train = X_train[mask], y_train[mask]\n",
1557
+ "\n",
1558
+ "original_columns = X.columns.tolist()\n",
1559
+ "\n",
1560
+ "smote = SMOTE(random_state=random_state)\n",
1561
+ "X_res, y_res = smote.fit_resample(X_train, y_train)\n",
1562
+ "\n",
1563
+ "print(f\"Number of training labels after SMOTE: {y_res.value_counts()}\")\n",
1564
+ "print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n",
1565
+ "\n",
1566
+ "sampling_strategy_undersample = {0: 372}\n",
1567
+ "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
1568
+ "X_test, y_test = rus.fit_resample(X_test, y_test)\n",
1569
+ "\n",
1570
+ "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
1571
+ "\n",
1572
+ "# Normalization\n",
1573
+ "# scaler = MinMaxScaler()\n",
1574
+ "# X_res = scaler.fit_transform(X_res)\n",
1575
+ "# X_test = scaler.transform(X_test)\n",
1576
+ "\n",
1577
+ "X_res = pd.DataFrame(X_res, columns=original_columns)\n",
1578
+ "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
1579
+ "\n",
1580
+ "# Train XGBoost model on all features\n",
1581
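+ "# xgb is assumed to be the xgboost module, imported earlier as: import xgboost as xgb\n",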
+ "model = xgb.XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=random_state)\n",
1582
+ "model.fit(X_res, y_res)\n",
1583
+ "\n",
1584
+ "y_pred = model.predict(X_test)\n",
1585
+ "\n",
1586
+ "accuracy = accuracy_score(y_test, y_pred)\n",
1587
+ "print(f'Accuracy: {accuracy:.4f}')\n",
1588
+ "\n",
1589
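+ "# shap.Explainer auto-selects TreeExplainer for tree models such as XGBoost\n",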
+ "explainer = shap.Explainer(model, X_res)\n",
1590
+ "shap_values = explainer(X_res)"
1591
+ ]
1592
+ },
1593
+ {
1594
+ "cell_type": "code",
1595
+ "execution_count": null,
1596
+ "id": "2cb9c929-4e86-42df-bda2-48730004c154",
1597
+ "metadata": {},
1598
+ "outputs": [],
1599
+ "source": [
1600
+ "def plot_shap_waterfall(instance_index, filename):\n",
1601
+ " shap_value = shap_values[instance_index]\n",
1602
+ " plt.figure(figsize=(14, 8))\n",
1603
+ " \n",
1604
+ " shap.plots.waterfall(shap_value, show=False)\n",
1605
+ " \n",
1606
+ " ax = plt.gca()\n",
1607
+ " \n",
1608
+ " ax.tick_params(axis='both', which='major', labelsize=16)\n",
1609
+ " ax.set_xlabel(ax.get_xlabel(), fontsize=20)\n",
1610
+ " ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n",
1611
+ " \n",
1612
+ " plt.tight_layout()\n",
1613
+ "\n",
1614
+ " plt.savefig(filename, format='svg')\n",
1615
+ " \n",
1616
+ " plt.close()\n",
1617
+ "\n",
1618
+ "plot_shap_waterfall(0, \"waterfall_plot_instance_0.svg\")\n",
1619
+ "plot_shap_waterfall(562, \"waterfall_plot_instance_562.svg\")"
1620
+ ]
1621
+ },
1622
+ {
1623
+ "cell_type": "code",
1624
+ "execution_count": null,
1625
+ "id": "e111213e-caea-48a0-adb7-c6b2362eed4f",
1626
+ "metadata": {},
1627
+ "outputs": [],
1628
+ "source": [
1629
+ "import matplotlib.pyplot as plt\n",
1630
+ "\n",
1631
+ "plt.figure(figsize=(14, 8))\n",
1632
+ "shap.summary_plot(\n",
1633
+ " shap_values,\n",
1634
+ " X_res,\n",
1635
+ " plot_type=\"bar\",\n",
1636
+ " feature_names=original_columns,\n",
1637
+ " show=False\n",
1638
+ ")\n",
1639
+ "\n",
1640
+ "ax = plt.gca()\n",
1641
+ "ax.tick_params(axis='both', which='major', labelsize=16)\n",
1642
+ "ax.set_xlabel(ax.get_xlabel(), fontsize=16)\n",
1643
+ "ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n",
1644
+ "\n",
1645
+ "plt.savefig(\"shap_summary_plot.svg\", format='svg')\n",
1646
+ "plt.close()"
1647
+ ]
1648
+ },
1649
+ {
1650
+ "cell_type": "code",
1651
+ "execution_count": null,
1652
+ "id": "4ae84637-a018-49a1-ad4e-522aa937bfad",
1653
+ "metadata": {},
1654
+ "outputs": [],
1655
+ "source": [
1656
+ "shap.initjs()\n",
1657
+ "\n",
1658
+ "fig, ax = plt.subplots(figsize=(14, 8))\n",
1659
+ "\n",
1660
+ "shap.summary_plot(\n",
1661
+ " shap_values,\n",
1662
+ " X_res,\n",
1663
+ " plot_type=\"dot\",\n",
1664
+ " feature_names=original_columns,\n",
1665
+ " show=False\n",
1666
+ ")\n",
1667
+ "\n",
1668
+ "ax.tick_params(axis='both', which='major', labelsize=16)\n",
1669
+ "ax.set_xlabel(ax.get_xlabel(), fontsize=16)\n",
1670
+ "ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n",
1671
+ "\n",
1672
+ "fig.savefig(\"shap_summary_dot_plot.svg\", format='svg', bbox_inches='tight')\n",
1673
+ "\n",
1674
+ "plt.close(fig)"
1675
+ ]
1676
+ },
1677
+ {
1678
+ "cell_type": "code",
1679
+ "execution_count": null,
1680
+ "id": "05f0de1c-3850-4d77-9184-d593451022c1",
1681
+ "metadata": {},
1682
+ "outputs": [],
1683
+ "source": [
1684
+ "from sklearn.tree import DecisionTreeClassifier, export_text\n",
1685
+ "from sklearn import tree\n",
1686
+ "\n",
1687
+ "random_state = 5\n",
1688
+ "\n",
1689
+ "X = splitted_dataset.drop('depression_category', axis=1)\n",
1690
+ "y = splitted_dataset['depression_category']\n",
1691
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
1692
+ "\n",
1693
+ "lof = LocalOutlierFactor()\n",
1694
+ "yhat = lof.fit_predict(X_train)\n",
1695
+ "mask = yhat != -1\n",
1696
+ "X_train, y_train = X_train[mask], y_train[mask]\n",
1697
+ "\n",
1698
+ "original_columns = X.columns.tolist()\n",
1699
+ "\n",
1700
+ "smote = SMOTE(random_state=random_state)\n",
1701
+ "X_res, y_res = smote.fit_resample(X_train, y_train)\n",
1702
+ "\n",
1703
+ "print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n",
1704
+ "print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n",
1705
+ "sampling_strategy_undersample = {0: 372}\n",
1706
+ "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
1707
+ "X_test, y_test = rus.fit_resample(X_test, y_test)\n",
1708
+ "\n",
1709
+ "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
1710
+ "\n",
1711
+ "# Normalization\n",
1712
+ "# scaler = MinMaxScaler()\n",
1713
+ "# X_res = scaler.fit_transform(X_res)\n",
1714
+ "# X_test = scaler.transform(X_test)\n",
1715
+ "\n",
1716
+ "X_res = pd.DataFrame(X_res, columns=original_columns)\n",
1717
+ "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
1718
+ "\n",
1719
+ "decision_tree_model = DecisionTreeClassifier(\n",
1720
+ " random_state=0, \n",
1721
+ " criterion='gini', \n",
1722
+ " max_depth=6, \n",
1723
+ " min_samples_leaf=10, \n",
1724
+ " min_samples_split=9\n",
1725
+ ")\n",
1726
+ "decision_tree_model.fit(X_res, y_res)\n",
1727
+ "\n",
1728
+ "plt.figure(figsize=(20, 14))\n",
1729
+ "tree.plot_tree(\n",
1730
+ " decision_tree_model, \n",
1731
+ " feature_names=original_columns, \n",
1732
+ " class_names=['depression', 'normal'],\n",
1733
+ " filled=True, \n",
1734
+ " rounded=True, \n",
1735
+ " fontsize=10,\n",
1736
+ " max_depth = 3\n",
1737
+ ")\n",
1738
+ "\n",
1739
+ "plt.savefig(\"decision_tree_plot.svg\", format='svg')\n",
1740
+ "plt.close()\n",
1741
+ "\n",
1742
+ "print(\"Decision Tree plot saved as 'decision_tree_plot.svg'\")"
1743
+ ]
1744
+ },
1745
+ {
1746
+ "cell_type": "code",
1747
+ "execution_count": null,
1748
+ "id": "b46a92a6-f370-4df4-ac29-8f382fc1820b",
1749
+ "metadata": {
1750
+ "scrolled": true
1751
+ },
1752
+ "outputs": [],
1753
+ "source": [
1754
+ "tree_rules = export_text(decision_tree_model, feature_names=original_columns, max_depth=50)\n",
1755
+ "print(\"Decision rules for the tree (up to depth 3):\")\n",
1756
+ "print(tree_rules)\n",
1757
+ "\n",
1758
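+ "# decision_path returns a sparse indicator matrix of every node each sample visits on its way to a leaf.\n",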
+ "node_indicator = decision_tree_model.decision_path(X_test)\n",
1759
+ "\n",
1760
+ "sample_id = 0\n",
1761
+ "node_index = node_indicator.indices[node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]\n",
1762
+ "\n",
1763
+ "print(f\"\\nDecision path for sample {sample_id}:\")\n",
1764
+ "for node_id in node_index:\n",
1765
+ " if X_test.iloc[sample_id, decision_tree_model.tree_.feature[node_id]] <= decision_tree_model.tree_.threshold[node_id]:\n",
1766
+ " threshold_sign = \"<=\"\n",
1767
+ " else:\n",
1768
+ " threshold_sign = \">\"\n",
1769
+ " print(f\"Node {node_id}: (X_test[{sample_id}, {decision_tree_model.tree_.feature[node_id]}] = {X_test.iloc[sample_id, decision_tree_model.tree_.feature[node_id]]}) \"\n",
1770
+ " f\"{threshold_sign} {decision_tree_model.tree_.threshold[node_id]}\")\n",
1771
+ "\n",
1772
+ "# Get prediction for a specific test sample\n",
1773
+ "predicted_class = decision_tree_model.predict([X_test.iloc[sample_id]])\n",
1774
+ "print(f\"\\nPredicted class for test sample {sample_id}: {predicted_class}\")"
1775
+ ]
1776
+ }
1777
+ ],
1778
+ "metadata": {
1779
+ "kernelspec": {
1780
+ "display_name": "Python 3 (ipykernel)",
1781
+ "language": "python",
1782
+ "name": "python3"
1783
+ },
1784
+ "language_info": {
1785
+ "codemirror_mode": {
1786
+ "name": "ipython",
1787
+ "version": 3
1788
+ },
1789
+ "file_extension": ".py",
1790
+ "mimetype": "text/x-python",
1791
+ "name": "python",
1792
+ "nbconvert_exporter": "python",
1793
+ "pygments_lexer": "ipython3",
1794
+ "version": "3.12.3"
1795
+ }
1796
+ },
1797
+ "nbformat": 4,
1798
+ "nbformat_minor": 5
1799
+ }
regressionLayer.ipynb ADDED
@@ -0,0 +1,962 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "c4849e1e-1928-4e8b-a12f-37c786b50aca",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Regression Layer"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "dddcb2a8-73d3-4476-b88a-bc1494a2c830",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pandas as pd\n",
19
+ "import numpy as np\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import seaborn as sns\n",
22
+ "from sklearn.model_selection import cross_val_score, KFold\n",
23
+ "from sklearn.impute import KNNImputer\n",
24
+ "from sklearn.pipeline import make_pipeline\n",
25
+ "from xgboost import XGBClassifier\n",
26
+ "from sklearn.impute import SimpleImputer\n",
27
+ "from sklearn.experimental import enable_iterative_imputer\n",
28
+ "from imblearn.pipeline import make_pipeline as make_pipeline_imb\n",
29
+ "from imblearn.over_sampling import SMOTE,SMOTENC\n",
30
+ "from sklearn.model_selection import train_test_split\n",
31
+ "from collections import Counter\n",
32
+ "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
33
+ "from sklearn.svm import SVC\n",
34
+ "from sklearn.linear_model import LogisticRegression\n",
35
+ "from sklearn.neighbors import LocalOutlierFactor\n",
36
+ "from sklearn.utils import resample\n",
37
+ "import warnings\n",
38
+ "from imblearn.over_sampling import RandomOverSampler\n",
39
+ "from imblearn.under_sampling import RandomUnderSampler\n",
40
+ "from sklearn.preprocessing import MinMaxScaler\n",
41
+ "from imblearn.pipeline import Pipeline\n",
42
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
43
+ "from sklearn.preprocessing import LabelEncoder, PowerTransformer\n",
44
+ "from collections import defaultdict\n",
45
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc\n",
46
+ "from sklearn.naive_bayes import GaussianNB\n",
47
+ "from sklearn.neural_network import MLPClassifier\n",
48
+ "import Orange\n",
49
+ "from scipy.stats import friedmanchisquare, rankdata\n",
50
+ "import shap\n",
51
+ "import scikit_posthocs as sp\n",
52
+ "from sklearn.feature_selection import SelectFromModel\n",
53
+ "from IPython.display import FileLink, display\n",
54
+ "import math\n",
55
+ "from sklearn.ensemble import RandomForestClassifier\n",
56
+ "from skopt.space import Integer, Real\n",
57
+ "from sklearn.model_selection import StratifiedKFold\n",
58
+ "from skopt import BayesSearchCV\n",
59
+ "import xgboost as xgb\n",
60
+ "from imblearn.over_sampling import SMOTE\n",
61
+ "from sklearn.tree import DecisionTreeClassifier, export_text\n",
62
+ "from sklearn import tree\n",
63
+ "from skopt.space import Real, Integer, Categorical\n",
64
+ "from skopt.callbacks import VerboseCallback\n",
65
+ "from sklearn.model_selection import train_test_split\n",
66
+ "from sklearn.preprocessing import MinMaxScaler\n",
67
+ "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, VotingRegressor\n",
68
+ "from catboost import CatBoostRegressor\n",
69
+ "from xgboost import XGBRegressor\n",
70
+ "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
71
+ "from sklearn.feature_selection import SelectFromModel\n",
72
+ "from sklearn.neighbors import LocalOutlierFactor\n",
73
+ "from lightgbm import LGBMRegressor\n",
74
+ "from IPython.display import display, FileLink"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "markdown",
79
+ "id": "39d6683c-fd3f-4daa-afdc-2e7f83c3fce3",
80
+ "metadata": {},
81
+ "source": [
82
+ "### Preparation before training"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "code",
87
+ "execution_count": null,
88
+ "id": "ec17e47a-8d92-498d-8f28-d8259d6ebc4e",
89
+ "metadata": {},
90
+ "outputs": [],
91
+ "source": [
92
+ "# Call Dataset\n",
93
+ "pd.set_option('display.max_rows', 10)\n",
94
+ "initial_df = pd.read_csv('3labelv4Regression.csv')\n",
95
+ "initial_df.info()"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "id": "d3751352-73bc-42c6-b6a5-84a3badf14ff",
102
+ "metadata": {},
103
+ "outputs": [],
104
+ "source": [
105
+ "# All categorical features except for label\n",
106
+ "cols = initial_df.columns\n",
107
+ "num_cols = initial_df._get_numeric_data().columns\n",
108
+ "categorical_features = list(set(cols) - set(num_cols))\n",
109
+ "categorical_features.remove('depression_category')\n",
110
+ "\n",
111
+ "# Label Encode all categorical, but keep missing values\n",
112
+ "le_initial_df = initial_df.copy()\n",
113
+ "dropped_labels = le_initial_df['depression_category']\n",
114
+ "le_initial_df = le_initial_df.drop('depression_category', axis = 1)\n",
115
+ "\n",
116
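+ "# NaNs are temporarily tagged with the string 'missing' so LabelEncoder can fit; they are mapped back to NaN right after encoding.\n",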
+ "for col in le_initial_df.columns:\n",
117
+ " if le_initial_df[col].dtype == 'object':\n",
118
+ " le_initial_df[col] = le_initial_df[col].fillna('missing')\n",
119
+ "\n",
120
+ " label_encoder = LabelEncoder()\n",
121
+ " le_initial_df[col] = label_encoder.fit_transform(le_initial_df[col])\n",
122
+ "\n",
123
+ " missing_value_index = np.where(label_encoder.classes_ == 'missing')[0]\n",
124
+ " \n",
125
+ " le_initial_df[col] = le_initial_df[col].replace(missing_value_index, np.nan)\n",
126
+ "\n",
127
+ "le_initial_df = pd.concat([le_initial_df, dropped_labels], axis = 1)"
128
+ ]
129
+ },
130
+ {
131
+ "cell_type": "code",
132
+ "execution_count": null,
133
+ "id": "9fa21a95-21f1-4274-86ba-d7977813066b",
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": [
137
+ "le_initial_df"
138
+ ]
139
+ },
140
+ {
141
+ "cell_type": "code",
142
+ "execution_count": null,
143
+ "id": "0ba54a81-d4de-4884-99d3-29867eb7ea40",
144
+ "metadata": {},
145
+ "outputs": [],
146
+ "source": [
147
+ "# Seperate and Combine\n",
148
+ "le_df_normal = le_initial_df[le_initial_df['depression_category'] == 'normal']\n",
149
+ "le_df_mild = le_initial_df[le_initial_df['depression_category'] == 'mild']\n",
150
+ "le_df_moderatesevere = le_initial_df[le_initial_df['depression_category'] == 'moderatesevere']\n",
151
+ "\n",
152
+ "le_df_depression = pd.concat([le_df_normal, le_df_mild, le_df_moderatesevere], ignore_index = False)\n",
153
+ "\n",
154
+ "le_df_depression['depression_category'] = 'depression'\n",
155
+ "\n",
156
+ "# Check depression category counts\n",
157
+ "dataframes = [le_df_normal, le_df_mild, le_df_moderatesevere]\n",
158
+ "le_initial_df = pd.concat(dataframes, ignore_index=True)\n",
159
+ "label_counts = le_initial_df['depression_category'].value_counts()\n",
160
+ "label_counts"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": null,
166
+ "id": "c517213f-f6bb-4298-99ee-3b29f6f7d0cb",
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": [
170
+ "# Some outlier.\n",
171
+ "# threshold = int(0.8 * le_df_normal.shape[1])\n",
172
+ "# le_df_normal = le_df_normal.dropna(thresh = threshold)\n",
173
+ "# threshold = int(0.8 * le_df_depression.shape[1])\n",
174
+ "# le_df_depression = le_df_depression.dropna(thresh = threshold)\n",
175
+ "\n",
176
+ "# Check depression category counts\n",
177
+ "dataframes = [le_df_normal, le_df_mild, le_df_moderatesevere]\n",
178
+ "le_initial_df = pd.concat(dataframes, ignore_index=True)\n",
179
+ "label_counts = le_initial_df['depression_category'].value_counts()"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": null,
185
+ "id": "8c5bdf52-c645-4a11-920a-579e28db1a50",
186
+ "metadata": {},
187
+ "outputs": [],
188
+ "source": [
189
+ "# Imputation\n",
190
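+ "# Each depression category is imputed separately, so medians are computed only from rows of the same class.\n",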
+ "different_le_dfs = [le_df_normal, le_df_mild, le_df_moderatesevere]\n",
191
+ "imputed_le_dfs = []\n",
192
+ "from sklearn.impute import IterativeImputer\n",
193
+ "for le_df in different_le_dfs:\n",
194
+ " y = le_df['depression_category']\n",
195
+ " X = le_df.drop('depression_category', axis = 1)\n",
196
+ " \n",
197
+ " imputer = SimpleImputer(strategy='median')\n",
198
+ " imputed_data = imputer.fit_transform(X)\n",
199
+ " imputed_df = pd.DataFrame(imputed_data, columns = X.columns)\n",
200
+ "\n",
201
+ " imputed_df['depression_category'] = y.reset_index(drop = True)\n",
202
+ " imputed_le_dfs.append(imputed_df)\n",
203
+ "\n",
204
+ "concatenated_le_dfs = pd.concat(imputed_le_dfs, ignore_index = True)\n",
205
+ "concatenated_le_dfs"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": null,
211
+ "id": "0e871a5f-beab-4f90-b8c4-e922ef86d0f0",
212
+ "metadata": {},
213
+ "outputs": [],
214
+ "source": [
215
+ "# Full label encode depression category\n",
216
+ "fully_LE_concatenated_le_dfs = concatenated_le_dfs.copy()\n",
217
+ "fully_LE_concatenated_le_dfs['depression_category'] = label_encoder.fit_transform(fully_LE_concatenated_le_dfs['depression_category'])\n",
218
+ "\n",
219
+ "# The dataset after category connect, imputation, and label encoding\n",
220
+ "splitted_dataset = fully_LE_concatenated_le_dfs.copy()\n",
221
+ "splitted_dataset = splitted_dataset.drop('depression_category', axis = 1)\n",
222
+ "splitted_dataset"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": null,
228
+ "id": "3e602f4d-efb2-4ac9-99bb-c6fbeca791cc",
229
+ "metadata": {},
230
+ "outputs": [],
231
+ "source": [
232
+ "warnings.filterwarnings('ignore')"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "markdown",
237
+ "id": "68563dc9-38c9-4ea5-8c08-9de904f58f43",
238
+ "metadata": {},
239
+ "source": [
240
+ "### Regression Training"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "execution_count": null,
246
+ "id": "5e339a41-e484-4a11-84f0-770cbaacb09d",
247
+ "metadata": {
248
+ "scrolled": true
249
+ },
250
+ "outputs": [],
251
+ "source": [
252
+ "# Optimized parameters\n",
253
+ "regressors = {\n",
254
+ " 'CBR': CatBoostRegressor(verbose=0, iterations=2000, learning_rate=0.01, depth=5),\n",
255
+ " 'XGBR': XGBRegressor(learning_rate=0.04403027347366962, max_depth=3, n_estimators=238),\n",
256
+ " 'LGBMR': LGBMRegressor(learning_rate=0.02904035023286438, num_leaves=20, n_estimators=170),\n",
257
+ " 'GBR': GradientBoostingRegressor(learning_rate=0.03, max_depth=2, n_estimators=700),\n",
258
+ " 'RFR': RandomForestRegressor(max_depth=12, max_features=1.0, n_estimators=300),\n",
259
+ " 'ETR': ExtraTreesRegressor(max_depth=12, max_features=1.0, n_estimators=64),\n",
260
+ " 'ABR': AdaBoostRegressor(learning_rate=0.29915504677867777, n_estimators=92)\n",
261
+ "}\n",
262
+ "\n",
263
+ "# Default parameters\n",
264
+ "# regressors = {\n",
265
+ "# 'CBR': CatBoostRegressor(verbose=0),\n",
266
+ "# 'XGBR': XGBRegressor(),\n",
267
+ "# 'LGBMR': LGBMRegressor(),\n",
268
+ "# 'GBR': GradientBoostingRegressor(),\n",
269
+ "# 'RFR': RandomForestRegressor(),\n",
270
+ "# 'ETR': ExtraTreesRegressor(),\n",
271
+ "# 'ABR': AdaBoostRegressor()\n",
272
+ "# }\n",
273
+ "\n",
274
+ "voting_regressor = VotingRegressor(estimators=[\n",
275
+ " ('cbr', regressors['CBR']),\n",
276
+ " ('xgbr', regressors['XGBR']),\n",
277
+ " ('gbr', regressors['GBR']),\n",
278
+ " ('abr', regressors['ABR'])\n",
279
+ "])\n",
280
+ "\n",
281
+ "regressors['Voting'] = voting_regressor\n",
282
+ "\n",
283
+ "metric_sums = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n",
284
+ "metric_stds = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n",
285
+ "rmse_scores = {name: [] for name in regressors.keys()}\n",
286
+ "mae_scores = {name: [] for name in regressors.keys()}\n",
287
+ "r2_scores = {name: [] for name in regressors.keys()}\n",
288
+ "\n",
289
+ "for random_state in range(10):\n",
290
+ " print(f'Processing for Random State: {random_state}')\n",
291
+ "\n",
292
+ " X = splitted_dataset.drop('total_sum', axis=1)\n",
293
+ " y = splitted_dataset['total_sum']\n",
294
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n",
295
+ "\n",
296
+ " lof = LocalOutlierFactor()\n",
297
+ " yhat = lof.fit_predict(X_train)\n",
298
+ "\n",
299
+ " mask = yhat != -1\n",
300
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
301
+ "\n",
302
+ " original_columns = X.columns.tolist()\n",
303
+ "\n",
304
+ " print(f\"Number of training labels after outlier removal: {len(y_train)}\")\n",
305
+ " print(f\"Number of test labels: {len(y_test)}\")\n",
306
+ "\n",
307
+ " scaler = MinMaxScaler()\n",
308
+ " X_train = scaler.fit_transform(X_train)\n",
309
+ " X_test = scaler.transform(X_test)\n",
310
+ " \n",
311
+ " X_train = pd.DataFrame(X_train, columns=original_columns)\n",
312
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
313
+ "\n",
314
+ " # Feature selection using XGBRegressor\n",
315
+ " xgb = XGBRegressor(random_state=random_state)\n",
316
+ " xgb.fit(X_train, y_train)\n",
317
+ " selector = SelectFromModel(xgb, prefit=True)\n",
318
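+ " # Note: 'selector' is unused below; features are instead ranked manually via feature_importances_.\n",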
+ "\n",
319
+ " importance = np.abs(xgb.feature_importances_)\n",
320
+ " indices = np.argsort(importance)[::-1]\n",
321
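+ " # Keep only the 50 features with the highest XGBoost feature_importances_ for all downstream regressors.\n",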
+ " important_features = [original_columns[i] for i in indices[:50]]\n",
322
+ "\n",
323
+ " for reg_name, reg in regressors.items():\n",
324
+ " selected_features = important_features\n",
325
+ " \n",
326
+ " X_train_fi = pd.DataFrame(X_train, columns=original_columns)[selected_features]\n",
327
+ " X_test_fi = pd.DataFrame(X_test, columns=original_columns)[selected_features]\n",
328
+ "\n",
329
+ " reg.fit(X_train_fi, y_train)\n",
330
+ " y_pred = reg.predict(X_test_fi)\n",
331
+ "\n",
332
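+ " # Round predictions to whole numbers, since the target total_sum appears to be an integer questionnaire score.\n",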
+ " y_pred = np.round(y_pred)\n",
333
+ " \n",
334
+ " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
335
+ " mae = mean_absolute_error(y_test, y_pred)\n",
336
+ " r2 = r2_score(y_test, y_pred)\n",
337
+ "\n",
338
+ " metric_sums[reg_name]['rmse'] += rmse\n",
339
+ " metric_sums[reg_name]['mae'] += mae\n",
340
+ " metric_sums[reg_name]['r2'] += r2\n",
341
+ " rmse_scores[reg_name].append(rmse)\n",
342
+ " mae_scores[reg_name].append(mae)\n",
343
+ " r2_scores[reg_name].append(r2)"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": null,
349
+ "id": "0929bf2c-fd41-4e0e-8070-4a6317f3efab",
350
+ "metadata": {},
351
+ "outputs": [],
352
+ "source": [
353
+ "# Calculate and print the average metrics and their standard deviations\n",
354
+ "for reg_name in regressors.keys():\n",
355
+ " avg_rmse = metric_sums[reg_name]['rmse'] / 10\n",
356
+ " avg_mae = metric_sums[reg_name]['mae'] / 10\n",
357
+ " avg_r2 = metric_sums[reg_name]['r2'] / 10\n",
358
+ " std_rmse = np.std(rmse_scores[reg_name])\n",
359
+ " std_mae = np.std(mae_scores[reg_name])\n",
360
+ " std_r2 = np.std(r2_scores[reg_name])\n",
361
+ " \n",
362
+ " print(f\"Regressor: {reg_name}\")\n",
363
+ " print(f\"Average RMSE: {avg_rmse} ± {std_rmse}\")\n",
364
+ " print(f\"Average MAE: {avg_mae} ± {std_mae}\")\n",
365
+ " print(f\"Average R2: {avg_r2} ± {std_r2}\")\n",
366
+ " print(\"------\")"
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "markdown",
371
+ "id": "07f62248-70b4-479b-b782-3aec714111f9",
372
+ "metadata": {},
373
+ "source": [
374
+ "### Shap n FN"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "code",
379
+ "execution_count": null,
380
+ "id": "f6ce4639-53d9-4dcd-a49c-80555326f197",
381
+ "metadata": {},
382
+ "outputs": [],
383
+ "source": [
384
+ "# Preparation code to make CD diagram from older version of Orange\n",
385
+ "def compute_CD(avranks, n, alpha=\"0.05\", test=\"nemenyi\"):\n",
386
+ " \"\"\"\n",
387
+ " Returns critical difference for Nemenyi or Bonferroni-Dunn test\n",
388
+ " according to given alpha (either alpha=\"0.05\" or alpha=\"0.1\") for average\n",
389
+ " ranks and number of tested datasets N. Test can be either \"nemenyi\" for\n",
390
+ " for Nemenyi two tailed test or \"bonferroni-dunn\" for Bonferroni-Dunn test.\n",
391
+ "\n",
392
+ " This function is deprecated and will be removed in Orange 3.34.\n",
393
+ " \"\"\"\n",
394
+ " k = len(avranks)\n",
395
+ " d = {(\"nemenyi\", \"0.05\"): [0, 0, 1.959964, 2.343701, 2.569032, 2.727774,\n",
396
+ " 2.849705, 2.94832, 3.030879, 3.101730, 3.163684,\n",
397
+ " 3.218654, 3.268004, 3.312739, 3.353618, 3.39123,\n",
398
+ " 3.426041, 3.458425, 3.488685, 3.517073,\n",
399
+ " 3.543799],\n",
400
+ " (\"nemenyi\", \"0.1\"): [0, 0, 1.644854, 2.052293, 2.291341, 2.459516,\n",
401
+ " 2.588521, 2.692732, 2.779884, 2.854606, 2.919889,\n",
402
+ " 2.977768, 3.029694, 3.076733, 3.119693, 3.159199,\n",
403
+ " 3.195743, 3.229723, 3.261461, 3.291224, 3.319233],\n",
404
+ " (\"bonferroni-dunn\", \"0.05\"): [0, 0, 1.960, 2.241, 2.394, 2.498, 2.576,\n",
405
+ " 2.638, 2.690, 2.724, 2.773],\n",
406
+ " (\"bonferroni-dunn\", \"0.1\"): [0, 0, 1.645, 1.960, 2.128, 2.241, 2.326,\n",
407
+ " 2.394, 2.450, 2.498, 2.539]}\n",
408
+ " q = d[(test, alpha)]\n",
409
+ " cd = q[k] * (k * (k + 1) / (6.0 * n)) ** 0.5\n",
410
+ " return cd\n",
411
+ "\n",
412
+ "\n",
413
+ "def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None,\n",
414
+ " width=6, textspace=1, reverse=False, filename=None, **kwargs):\n",
415
+ " \"\"\"\n",
416
+ " Draws a CD graph, which is used to display the differences in methods'\n",
417
+ " performance. See Janez Demsar, Statistical Comparisons of Classifiers over\n",
418
+ " Multiple Data Sets, 7(Jan):1--30, 2006.\n",
419
+ "\n",
420
+ " Needs matplotlib to work.\n",
421
+ "\n",
422
+ " The image is ploted on `plt` imported using\n",
423
+ " `import matplotlib.pyplot as plt`.\n",
424
+ "\n",
425
+ " This function is deprecated and will be removed in Orange 3.34.\n",
426
+ "\n",
427
+ " Args:\n",
428
+ " avranks (list of float): average ranks of methods.\n",
429
+ " names (list of str): names of methods.\n",
430
+ " cd (float): Critical difference used for statistically significance of\n",
431
+ " difference between methods.\n",
432
+ " cdmethod (int, optional): the method that is compared with other methods\n",
433
+ " If omitted, show pairwise comparison of methods\n",
434
+ " lowv (int, optional): the lowest shown rank\n",
435
+ " highv (int, optional): the highest shown rank\n",
436
+ " width (int, optional): default width in inches (default: 6)\n",
437
+ " textspace (int, optional): space on figure sides (in inches) for the\n",
438
+ " method names (default: 1)\n",
439
+ " reverse (bool, optional): if set to `True`, the lowest rank is on the\n",
440
+ " right (default: `False`)\n",
441
+ " filename (str, optional): output file name (with extension). If not\n",
442
+ " given, the function does not write a file.\n",
443
+ " \"\"\"\n",
444
+ " try:\n",
445
+ " import matplotlib.pyplot as plt\n",
446
+ " from matplotlib.backends.backend_agg import FigureCanvasAgg\n",
447
+ " except ImportError:\n",
448
+ " raise ImportError(\"Function graph_ranks requires matplotlib.\")\n",
449
+ "\n",
450
+ " width = float(width)\n",
451
+ " textspace = float(textspace)\n",
452
+ "\n",
453
+ " def nth(l, n):\n",
454
+ " \"\"\"\n",
455
+ " Returns only nth elemnt in a list.\n",
456
+ " \"\"\"\n",
457
+ " n = lloc(l, n)\n",
458
+ " return [a[n] for a in l]\n",
459
+ "\n",
460
+ " def lloc(l, n):\n",
461
+ " \"\"\"\n",
462
+ " List location in list of list structure.\n",
463
+ " Enable the use of negative locations:\n",
464
+ " -1 is the last element, -2 second last...\n",
465
+ " \"\"\"\n",
466
+ " if n < 0:\n",
467
+ " return len(l[0]) + n\n",
468
+ " else:\n",
469
+ " return n\n",
470
+ "\n",
471
+ " def mxrange(lr):\n",
472
+ " \"\"\"\n",
473
+ " Multiple xranges. Can be used to traverse matrices.\n",
474
+ " This function is very slow due to unknown number of\n",
475
+ " parameters.\n",
476
+ "\n",
477
+ " >>> mxrange([3,5])\n",
478
+ " [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]\n",
479
+ "\n",
480
+ " >>> mxrange([[3,5,1],[9,0,-3]])\n",
481
+ " [(3, 9), (3, 6), (3, 3), (4, 9), (4, 6), (4, 3)]\n",
482
+ "\n",
483
+ " \"\"\"\n",
484
+ " if not len(lr):\n",
485
+ " yield ()\n",
486
+ " else:\n",
487
+ " # it can work with single numbers\n",
488
+ " index = lr[0]\n",
489
+ " if isinstance(index, int):\n",
490
+ " index = [index]\n",
491
+ " for a in range(*index):\n",
492
+ " for b in mxrange(lr[1:]):\n",
493
+ " yield tuple([a] + list(b))\n",
494
+ "\n",
495
+ " def print_figure(fig, *args, **kwargs):\n",
496
+ " canvas = FigureCanvasAgg(fig)\n",
497
+ " canvas.print_figure(*args, **kwargs)\n",
498
+ "\n",
499
+ " sums = avranks\n",
500
+ "\n",
501
+ " tempsort = sorted([(a, i) for i, a in enumerate(sums)], reverse=reverse)\n",
502
+ " ssums = nth(tempsort, 0)\n",
503
+ " sortidx = nth(tempsort, 1)\n",
504
+ " nnames = [names[x] for x in sortidx]\n",
505
+ "\n",
506
+ " if lowv is None:\n",
507
+ " lowv = min(1, int(math.floor(min(ssums))))\n",
508
+ " if highv is None:\n",
509
+ " highv = max(len(avranks), int(math.ceil(max(ssums))))\n",
510
+ "\n",
511
+ " cline = 0.4\n",
512
+ "\n",
513
+ " k = len(sums)\n",
514
+ "\n",
515
+ " lines = None\n",
516
+ "\n",
517
+ " linesblank = 0\n",
518
+ " scalewidth = width - 2 * textspace\n",
519
+ "\n",
520
+ " def rankpos(rank):\n",
521
+ " if not reverse:\n",
522
+ " a = rank - lowv\n",
523
+ " else:\n",
524
+ " a = highv - rank\n",
525
+ " return textspace + scalewidth / (highv - lowv) * a\n",
526
+ "\n",
527
+ " distanceh = 0.25\n",
528
+ "\n",
529
+ " if cd and cdmethod is None:\n",
530
+ " # get pairs of non significant methods\n",
531
+ "\n",
532
+ " def get_lines(sums, hsd):\n",
533
+ " # get all pairs\n",
534
+ " lsums = len(sums)\n",
535
+ " allpairs = [(i, j) for i, j in mxrange([[lsums], [lsums]]) if j > i]\n",
536
+ " # remove not significant\n",
537
+ " notSig = [(i, j) for i, j in allpairs\n",
538
+ " if abs(sums[i] - sums[j]) <= hsd]\n",
539
+ " # keep only longest\n",
540
+ "\n",
541
+ " def no_longer(ij_tuple, notSig):\n",
542
+ " i, j = ij_tuple\n",
543
+ " for i1, j1 in notSig:\n",
544
+ " if (i1 <= i and j1 > j) or (i1 < i and j1 >= j):\n",
545
+ " return False\n",
546
+ " return True\n",
547
+ "\n",
548
+ " longest = [(i, j) for i, j in notSig if no_longer((i, j), notSig)]\n",
549
+ "\n",
550
+ " return longest\n",
551
+ "\n",
552
+ " lines = get_lines(ssums, cd)\n",
553
+ " linesblank = 0.2 + 0.2 + (len(lines) - 1) * 0.1\n",
554
+ "\n",
555
+ " # add scale\n",
556
+ " distanceh = 0.25\n",
557
+ " cline += distanceh\n",
558
+ "\n",
559
+ " # calculate height needed height of an image\n",
560
+ " minnotsignificant = max(2 * 0.2, linesblank)\n",
561
+ " height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant\n",
562
+ "\n",
563
+ " fig = plt.figure(figsize=(width, height))\n",
564
+ " fig.set_facecolor('white')\n",
565
+ " ax = fig.add_axes([0, 0, 1, 1]) # reverse y axis\n",
566
+ " ax.set_axis_off()\n",
567
+ "\n",
568
+ " hf = 1. / height # height factor\n",
569
+ " wf = 1. / width\n",
570
+ "\n",
571
+ " def hfl(l):\n",
572
+ " return [a * hf for a in l]\n",
573
+ "\n",
574
+ " def wfl(l):\n",
575
+ " return [a * wf for a in l]\n",
576
+ "\n",
577
+ "\n",
578
+ " # Upper left corner is (0,0).\n",
579
+ " ax.plot([0, 1], [0, 1], c=\"w\")\n",
580
+ " ax.set_xlim(0, 1)\n",
581
+ " ax.set_ylim(1, 0)\n",
582
+ "\n",
583
+ " def line(l, color='k', **kwargs):\n",
584
+ " \"\"\"\n",
585
+ " Input is a list of pairs of points.\n",
586
+ " \"\"\"\n",
587
+ " ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)\n",
588
+ "\n",
589
+ " def text(x, y, s, *args, **kwargs):\n",
590
+ " ax.text(wf * x, hf * y, s, fontsize = 14, *args, **kwargs)\n",
591
+ "\n",
592
+ " line([(textspace, cline), (width - textspace, cline)], linewidth=0.7)\n",
593
+ "\n",
594
+ " bigtick = 0.1\n",
595
+ " smalltick = 0.05\n",
596
+ "\n",
597
+ " tick = None\n",
598
+ " for a in list(np.arange(lowv, highv, 0.5)) + [highv]:\n",
599
+ " tick = smalltick\n",
600
+ " if a == int(a):\n",
601
+ " tick = bigtick\n",
602
+ " line([(rankpos(a), cline - tick / 2),\n",
603
+ " (rankpos(a), cline)],\n",
604
+ " linewidth=0.7)\n",
605
+ "\n",
606
+ " for a in range(lowv, highv + 1):\n",
607
+ " text(rankpos(a), cline - tick / 2 - 0.05, str(a),\n",
608
+ " ha=\"center\", va=\"bottom\")\n",
609
+ "\n",
610
+ " k = len(ssums)\n",
611
+ "\n",
612
+ " for i in range(math.ceil(k / 2)):\n",
613
+ " chei = cline + minnotsignificant + i * 0.2\n",
614
+ " line([(rankpos(ssums[i]), cline),\n",
615
+ " (rankpos(ssums[i]), chei),\n",
616
+ " (textspace - 0.1, chei)],\n",
617
+ " linewidth=0.7)\n",
618
+ " text(textspace - 0.2, chei, nnames[i], ha=\"right\", va=\"center\")\n",
619
+ "\n",
620
+ " for i in range(math.ceil(k / 2), k):\n",
621
+ " chei = cline + minnotsignificant + (k - i - 1) * 0.2\n",
622
+ " line([(rankpos(ssums[i]), cline),\n",
623
+ " (rankpos(ssums[i]), chei),\n",
624
+ " (textspace + scalewidth + 0.1, chei)],\n",
625
+ " linewidth=0.7)\n",
626
+ " text(textspace + scalewidth + 0.2, chei, nnames[i],\n",
627
+ " ha=\"left\", va=\"center\")\n",
628
+ "\n",
629
+ " if cd and cdmethod is None:\n",
630
+ " # upper scale\n",
631
+ " if not reverse:\n",
632
+ " begin, end = rankpos(lowv), rankpos(lowv + cd)\n",
633
+ " else:\n",
634
+ " begin, end = rankpos(highv), rankpos(highv - cd)\n",
635
+ "\n",
636
+ " line([(begin, distanceh), (end, distanceh)], linewidth=0.7)\n",
637
+ " line([(begin, distanceh + bigtick / 2),\n",
638
+ " (begin, distanceh - bigtick / 2)],\n",
639
+ " linewidth=0.7)\n",
640
+ " line([(end, distanceh + bigtick / 2),\n",
641
+ " (end, distanceh - bigtick / 2)],\n",
642
+ " linewidth=0.7)\n",
643
+ " text((begin + end) / 2, distanceh - 0.05, \"CD\",\n",
644
+ " ha=\"center\", va=\"bottom\")\n",
645
+ "\n",
646
+ " # no-significance lines\n",
647
+ " def draw_lines(lines, side=0.05, height=0.1):\n",
648
+ " start = cline + 0.2\n",
649
+ " for l, r in lines:\n",
650
+ " line([(rankpos(ssums[l]) - side, start),\n",
651
+ " (rankpos(ssums[r]) + side, start)],\n",
652
+ " linewidth=2.5)\n",
653
+ " start += height\n",
654
+ "\n",
655
+ " draw_lines(lines)\n",
656
+ "\n",
657
+ " elif cd:\n",
658
+ " begin = rankpos(avranks[cdmethod] - cd)\n",
659
+ " end = rankpos(avranks[cdmethod] + cd)\n",
660
+ " line([(begin, cline), (end, cline)],\n",
661
+ " linewidth=2.5)\n",
662
+ " line([(begin, cline + bigtick / 2),\n",
663
+ " (begin, cline - bigtick / 2)],\n",
664
+ " linewidth=2.5)\n",
665
+ " line([(end, cline + bigtick / 2),\n",
666
+ " (end, cline - bigtick / 2)],\n",
667
+ " linewidth=2.5)\n",
668
+ "\n",
669
+ " if filename:\n",
670
+ " print_figure(fig, filename, **kwargs)"
671
+ ]
672
+ },
673
+ {
674
+ "cell_type": "code",
675
+ "execution_count": null,
676
+ "id": "eeeb1424-63f5-4338-bdc9-5e36e8e4ac09",
677
+ "metadata": {},
678
+ "outputs": [],
679
+ "source": [
680
+ "# FN\n",
681
+ "df = pd.DataFrame(rmse_scores)\n",
682
+ "df\n",
683
+ "\n",
684
+ "scores = [df[col].values for col in df.columns]\n",
685
+ "\n",
686
+ "stat, p = friedmanchisquare(*scores)\n",
687
+ "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n",
688
+ "\n",
689
+ "ranks = df.rank(axis=1, method='average')\n",
690
+ "average_ranks = ranks.mean().values\n",
691
+ "\n",
692
+ "n_datasets = df.shape[0]\n",
693
+ "alpha = 0.05\n",
694
+ "\n",
695
+ "from scikit_posthocs import posthoc_nemenyi_friedman\n",
696
+ "cd = np.sqrt((len(df.columns) * (len(df.columns) + 1)) / (6 * n_datasets)) * np.sqrt(2 / alpha)\n",
697
+ "print(f'Critical Difference: {cd}')\n",
698
+ "\n",
699
+ "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n",
700
+ "\n",
701
+ "plt.figure(figsize=(16, 10))\n",
702
+ "\n",
703
+ "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n",
704
+ "plt.xlabel('Classifiers')\n",
705
+ "\n",
706
+ "plt.text(0.5, 1.19, f'Friedman-Nemenyi: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n",
707
+ "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n",
708
+ "\n",
709
+ "plt.tight_layout()"
710
+ ]
711
+ },
712
+ {
713
+ "cell_type": "code",
714
+ "execution_count": null,
715
+ "id": "d88fdb63-9223-48f5-b4c3-78c5ab76938b",
716
+ "metadata": {},
717
+ "outputs": [],
718
+ "source": [
719
+ "# SHAP\n",
720
+ "import shap\n",
721
+ "import matplotlib.pyplot as plt\n",
722
+ "import pandas as pd\n",
723
+ "from sklearn.model_selection import train_test_split\n",
724
+ "from sklearn.preprocessing import MinMaxScaler\n",
725
+ "from catboost import CatBoostRegressor\n",
726
+ "from sklearn.feature_selection import SelectFromModel\n",
727
+ "from sklearn.neighbors import LocalOutlierFactor\n",
728
+ "from xgboost import XGBRegressor\n",
729
+ "\n",
730
+ "cbr = CatBoostRegressor(verbose=0, iterations=2000, learning_rate=0.01, depth=5)\n",
731
+ "\n",
732
+ "X = splitted_dataset.drop('total_sum', axis=1)\n",
733
+ "y = splitted_dataset['total_sum']\n",
734
+ "random_state = 0\n",
735
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n",
736
+ "\n",
737
+ "lof = LocalOutlierFactor()\n",
738
+ "yhat = lof.fit_predict(X_train)\n",
739
+ "\n",
740
+ "mask = yhat != -1\n",
741
+ "X_train, y_train = X_train[mask], y_train[mask]\n",
742
+ "\n",
743
+ "original_columns = X.columns.tolist()\n",
744
+ "\n",
745
+ "X_train = pd.DataFrame(X_train, columns=original_columns)\n",
746
+ "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
747
+ "\n",
748
+ "xgb = XGBRegressor(random_state=random_state)\n",
749
+ "xgb.fit(X_train, y_train)\n",
750
+ "selector = SelectFromModel(xgb, prefit=True)\n",
751
+ "\n",
752
+ "importance = np.abs(xgb.feature_importances_)\n",
753
+ "indices = np.argsort(importance)[::-1]\n",
754
+ "important_features = [original_columns[i] for i in indices[:50]]\n",
755
+ "\n",
756
+ "X_train_fi = X_train[important_features]\n",
757
+ "X_test_fi = X_test[important_features]\n",
758
+ "\n",
759
+ "cbr.fit(X_train_fi, y_train)\n",
760
+ "\n",
761
+ "# Compute SHAP values using shap.Explainer\n",
762
+ "explainer = shap.Explainer(cbr, X_train_fi)\n",
763
+ "shap_values = explainer(X_train_fi)\n",
764
+ "plt.figure(figsize=(12, 8))\n",
765
+ "shap.summary_plot(shap_values, X_train_fi, plot_type=\"bar\", feature_names=important_features, show=False)\n",
766
+ "plt.savefig(\"shap_summary_plot.svg\", format='svg') # Save the plot as SVG\n",
767
+ "plt.close()\n",
768
+ "\n",
769
+ "display(FileLink(\"shap_summary_plot.svg\"))\n",
770
+ "\n",
771
+ "sorted_indices = np.argsort(y_train.values)\n",
772
+ "low_value_index = sorted_indices[0]\n",
773
+ "high_value_index = sorted_indices[-1]\n",
774
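+ "# The training instances with the lowest and highest target values are natural candidates for the waterfall plots below.\n",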
+ "\n",
775
+ "print(f\"Array position with low target value: {low_value_index}, Target Value: {y_train.values[low_value_index]}\")\n",
776
+ "print(f\"Array position with high target value: {high_value_index}, Target Value: {y_train.values[high_value_index]}\")"
777
+ ]
778
+ },
779
+ {
780
+ "cell_type": "code",
781
+ "execution_count": null,
782
+ "id": "f6cce3ab-2480-4e87-a4ad-36054e12553a",
783
+ "metadata": {},
784
+ "outputs": [],
785
+ "source": [
786
+ "# Function to plot SHAP waterfall plot for a specific instance and save as SVG\n",
787
+ "def plot_shap_waterfall(instance_index, filename):\n",
788
+ " shap_value = shap_values[instance_index]\n",
789
+ " \n",
790
+ " plt.figure(figsize=(14, 8))\n",
791
+ " \n",
792
+ " shap.plots.waterfall(shap_value, show=False)\n",
793
+ " \n",
794
+ " plt.tight_layout()\n",
795
+ "\n",
796
+ " plt.savefig(filename, format='svg')\n",
797
+ " \n",
798
+ " plt.close()\n",
799
+ "\n",
800
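+ "# The hard-coded indices below are assumed to match the low/high target positions printed by the previous cell.\n",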
+ "plot_shap_waterfall(482, \"waterfall_plot_instance_0.svg\")\n",
801
+ "plot_shap_waterfall(70, \"waterfall_plot_instance_1.svg\")\n",
802
+ "\n",
803
+ "display(FileLink(\"waterfall_plot_instance_0.svg\"))\n",
804
+ "display(FileLink(\"waterfall_plot_instance_1.svg\"))"
805
+ ]
806
+ },
807
+ {
808
+ "cell_type": "markdown",
809
+ "id": "176f6725-ac13-444e-8d6e-fe3cc7a63b95",
810
+ "metadata": {},
811
+ "source": [
812
+ "### Hyperparameter Optimization"
813
+ ]
814
+ },
815
+ {
816
+ "cell_type": "code",
817
+ "execution_count": null,
818
+ "id": "213e4d7c-5719-47bc-831d-46809d65fb49",
819
+ "metadata": {},
820
+ "outputs": [],
821
+ "source": [
822
+ "# Models\n",
823
+ "regressors = {\n",
824
+ " 'CBR': CatBoostRegressor(verbose=0),\n",
825
+ " #'XGBR': XGBRegressor(),\n",
826
+ " #'LGBMR': LGBMRegressor(),\n",
827
+ " # 'GBR': GradientBoostingRegressor(),\n",
828
+ " #'RFR': RandomForestRegressor(),\n",
829
+ " # 'ETR': ExtraTreesRegressor(),\n",
830
+ " # 'ABR': AdaBoostRegressor()\n",
831
+ "}\n",
832
+ "\n",
833
+ "# Define parameter grids for the models\n",
834
+ "param_grids = {\n",
835
+ " 'CBR': {\n",
836
+ " 'iterations': Integer(100, 500),\n",
837
+ " 'learning_rate': Real(0.01, 0.1),\n",
838
+ " 'depth': Integer(3, 10),\n",
839
+ " },\n",
840
+ " # 'GBR': {\n",
841
+ " # 'n_estimators': Integer(50, 300),\n",
842
+ " # 'learning_rate': Real(0.01, 0.1),\n",
843
+ " # 'max_depth': Integer(3, 10)\n",
844
+ " # },\n",
845
+ " # 'RFR': {\n",
846
+ " # 'n_estimators': Integer(50, 300),\n",
847
+ " # 'max_depth': Integer(3, 20)\n",
848
+ " # },\n",
849
+ " # 'XGBR': {\n",
850
+ " # 'n_estimators': Integer(50, 300),\n",
851
+ " # 'learning_rate': Real(0.01, 0.1),\n",
852
+ " # 'max_depth': Integer(3, 10),\n",
853
+ " # },\n",
854
+ " # 'LGBMR': {\n",
855
+ " # 'n_estimators': Integer(50, 300),\n",
856
+ " # 'learning_rate': Real(0.01, 0.1),\n",
857
+ " # 'num_leaves': Integer(20, 50),\n",
858
+ " # },\n",
859
+ " # 'ETR': {\n",
860
+ " # 'n_estimators': Integer(50, 300),\n",
861
+ " # 'max_depth': Integer(3, 20)\n",
862
+ " # },\n",
863
+ "# 'ABR': {\n",
864
+ "# 'n_estimators': Integer(50, 300),\n",
865
+ "# 'learning_rate': Real(0.01, 1.0)\n",
866
+ "# }\n",
867
+ "}\n",
868
+ "\n",
869
+ "# Function to perform hyperparameter tuning\n",
870
+ "def hyperparameter_tuning(model, param_grid, X_train, y_train):\n",
871
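+ " # 50 rounds of Bayesian optimisation, each candidate scored with 5-fold CV on negative MSE.\n",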
+ " bayes_search = BayesSearchCV(model, search_spaces=param_grid, n_iter=50, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=0)\n",
872
+ " bayes_search.fit(X_train, y_train)\n",
873
+ " return bayes_search.best_estimator_\n",
874
+ "\n",
875
+ "metric_sums = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n",
876
+ "metric_stds = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n",
877
+ "rmse_scores = {name: [] for name in regressors.keys()}\n",
878
+ "mae_scores = {name: [] for name in regressors.keys()}\n",
879
+ "r2_scores = {name: [] for name in regressors.keys()}\n",
880
+ "\n",
881
+ "random_state = 5\n",
882
+ "print(f'Processing for Random State: {random_state}')\n",
883
+ "\n",
884
+ "X = splitted_dataset.drop('total_sum', axis=1)\n",
885
+ "y = splitted_dataset['total_sum']\n",
886
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n",
887
+ "\n",
888
+ "lof = LocalOutlierFactor()\n",
889
+ "yhat = lof.fit_predict(X_train)\n",
890
+ "\n",
891
+ "mask = yhat != -1\n",
892
+ "X_train, y_train = X_train[mask], y_train[mask]\n",
893
+ "\n",
894
+ "original_columns = X.columns.tolist()\n",
895
+ "\n",
896
+ "print(f\"Number of training labels after outlier removal: {len(y_train)}\")\n",
897
+ "print(f\"Number of test labels: {len(y_test)}\")\n",
898
+ "\n",
899
+ "scaler = MinMaxScaler()\n",
900
+ "X_train = scaler.fit_transform(X_train)\n",
901
+ "X_test = scaler.transform(X_test)\n",
902
+ "\n",
903
+ "X_train = pd.DataFrame(X_train, columns=original_columns)\n",
904
+ "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
905
+ "\n",
906
+ "xgb = XGBRegressor(random_state=random_state)\n",
907
+ "xgb.fit(X_train, y_train)\n",
908
+ "selector = SelectFromModel(xgb, prefit=True)\n",
909
+ "\n",
910
+ "importance = np.abs(xgb.feature_importances_)\n",
911
+ "indices = np.argsort(importance)[::-1]\n",
912
+ "important_features = [original_columns[i] for i in indices[:50]]\n",
913
+ "\n",
914
+ "X_train = X_train[important_features]\n",
915
+ "X_test = X_test[important_features]\n",
916
+ "\n",
917
+ "best_models = {}\n",
918
+ "for model_name, param_grid in param_grids.items():\n",
919
+ " if model_name == 'GBR':\n",
920
+ " model = GradientBoostingRegressor()\n",
921
+ " elif model_name == 'RFR':\n",
922
+ " model = RandomForestRegressor()\n",
923
+ " elif model_name == 'XGBR':\n",
924
+ " model = XGBRegressor()\n",
925
+ " elif model_name == 'LGBMR':\n",
926
+ " model = LGBMRegressor()\n",
927
+ " elif model_name == 'ETR':\n",
928
+ " model = ExtraTreesRegressor()\n",
929
+ " elif model_name == 'ABR':\n",
930
+ " model = AdaBoostRegressor()\n",
931
+ " elif model_name == 'CBR':\n",
932
+ " model = CatBoostRegressor(verbose=0)\n",
933
+ " \n",
934
+ " print(f\"Optimizing {model_name}...\")\n",
935
+ " best_model = hyperparameter_tuning(model, param_grid, X_train, y_train)\n",
936
+ " best_models[model_name] = best_model\n",
937
+ " print(f\"Best parameters for {model_name}: {best_model.get_params()}\")"
938
+ ]
939
+ }
940
+ ],
941
+ "metadata": {
942
+ "kernelspec": {
943
+ "display_name": "Python 3 (ipykernel)",
944
+ "language": "python",
945
+ "name": "python3"
946
+ },
947
+ "language_info": {
948
+ "codemirror_mode": {
949
+ "name": "ipython",
950
+ "version": 3
951
+ },
952
+ "file_extension": ".py",
953
+ "mimetype": "text/x-python",
954
+ "name": "python",
955
+ "nbconvert_exporter": "python",
956
+ "pygments_lexer": "ipython3",
957
+ "version": "3.12.3"
958
+ }
959
+ },
960
+ "nbformat": 4,
961
+ "nbformat_minor": 5
962
+ }
severityPredictionLayer.ipynb ADDED
@@ -0,0 +1,1858 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "83e18217-bbe8-4a9f-bab9-9be999d44761",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Severity Prediction Layer"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "id": "dddcb2a8-73d3-4476-b88a-bc1494a2c830",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import pandas as pd\n",
19
+ "import numpy as np\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import seaborn as sns\n",
22
+ "from sklearn.model_selection import cross_val_score, KFold\n",
23
+ "from sklearn.impute import KNNImputer\n",
24
+ "from sklearn.pipeline import make_pipeline\n",
25
+ "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier\n",
26
+ "from xgboost import XGBClassifier\n",
27
+ "from sklearn.impute import SimpleImputer\n",
28
+ "from sklearn.experimental import enable_iterative_imputer\n",
29
+ "from imblearn.pipeline import make_pipeline as make_pipeline_imb\n",
30
+ "from imblearn.over_sampling import SMOTE,SMOTENC\n",
31
+ "from sklearn.model_selection import train_test_split\n",
32
+ "from collections import Counter\n",
33
+ "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
34
+ "from sklearn.ensemble import GradientBoostingClassifier\n",
35
+ "from sklearn.ensemble import VotingClassifier\n",
36
+ "from sklearn.svm import SVC\n",
37
+ "from sklearn.linear_model import LogisticRegression\n",
38
+ "from sklearn.tree import DecisionTreeClassifier\n",
39
+ "from sklearn.ensemble import BaggingClassifier\n",
40
+ "from sklearn.neighbors import KNeighborsClassifier\n",
41
+ "from sklearn.ensemble import ExtraTreesClassifier\n",
42
+ "from deslib.dcs import APosteriori\n",
43
+ "from deslib.des import KNORAE, KNORAU, KNOP, DESMI\n",
44
+ "from sklearn.neighbors import LocalOutlierFactor\n",
45
+ "from sklearn.utils import resample\n",
46
+ "import warnings\n",
47
+ "from imblearn.over_sampling import RandomOverSampler\n",
48
+ "from imblearn.under_sampling import RandomUnderSampler\n",
49
+ "from sklearn.preprocessing import MinMaxScaler\n",
50
+ "from imblearn.pipeline import Pipeline\n",
51
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
52
+ "from sklearn.preprocessing import LabelEncoder, PowerTransformer\n",
53
+ "from collections import defaultdict\n",
54
+ "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier\n",
55
+ "from catboost import CatBoostClassifier\n",
56
+ "from lightgbm import LGBMClassifier\n",
57
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc\n",
58
+ "from sklearn.naive_bayes import GaussianNB\n",
59
+ "from sklearn.neural_network import MLPClassifier\n",
60
+ "import Orange\n",
61
+ "from scipy.stats import friedmanchisquare, rankdata\n",
62
+ "import shap\n",
63
+ "import scikit_posthocs as sp\n",
64
+ "from sklearn.feature_selection import SelectFromModel\n",
65
+ "from IPython.display import FileLink, display\n",
66
+ "import math\n",
67
+ "from sklearn.ensemble import RandomForestClassifier\n",
68
+ "from skopt.space import Integer, Real\n",
69
+ "from sklearn.model_selection import StratifiedKFold\n",
70
+ "from skopt import BayesSearchCV\n",
71
+ "import xgboost as xgb\n",
72
+ "from imblearn.over_sampling import SMOTE\n",
73
+ "from sklearn.tree import DecisionTreeClassifier, export_text\n",
74
+ "from sklearn import tree\n",
75
+ "from skopt.space import Real, Integer, Categorical\n",
76
+ "from skopt.callbacks import VerboseCallback\n",
77
+ "from deslib.des.knora_e import KNORAE\n",
78
+ "from deslib.des.knora_u import KNORAU\n",
79
+ "from deslib.des.knop import KNOP\n",
80
+ "from deslib.des.meta_des import METADES\n",
81
+ "from deslib.des.des_knn import DESKNN\n",
82
+ "from deslib.des.des_p import DESP"
83
+ ]
84
+ },
85
+ {
86
+ "cell_type": "markdown",
87
+ "id": "39d6683c-fd3f-4daa-afdc-2e7f83c3fce3",
88
+ "metadata": {},
89
+ "source": [
90
+ "### Preparation before training"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "id": "ec17e47a-8d92-498d-8f28-d8259d6ebc4e",
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "# Call Dataset\n",
101
+ "pd.set_option('display.max_rows', 10)\n",
102
+ "initial_df = pd.read_csv('3labelv4Classification.csv')\n",
103
+ "initial_df.info()"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "d3751352-73bc-42c6-b6a5-84a3badf14ff",
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "# All categorical features except for label\n",
114
+ "cols = initial_df.columns\n",
115
+ "num_cols = initial_df._get_numeric_data().columns\n",
116
+ "categorical_features = list(set(cols) - set(num_cols))\n",
117
+ "categorical_features.remove('depression_category')\n",
118
+ "\n",
119
+ "# Label Encode all categorical, but keep missing values\n",
120
+ "le_initial_df = initial_df.copy()\n",
121
+ "dropped_labels = le_initial_df['depression_category']\n",
122
+ "le_initial_df = le_initial_df.drop('depression_category', axis = 1)\n",
123
+ "\n",
124
+ "for col in le_initial_df.columns:\n",
125
+ " if le_initial_df[col].dtype == 'object':\n",
126
+ " le_initial_df[col] = le_initial_df[col].fillna('missing')\n",
127
+ "\n",
128
+ " label_encoder = LabelEncoder()\n",
129
+ " le_initial_df[col] = label_encoder.fit_transform(le_initial_df[col])\n",
130
+ "\n",
131
+ " missing_value_index = np.where(label_encoder.classes_ == 'missing')[0]\n",
132
+ " \n",
133
+ " le_initial_df[col] = le_initial_df[col].replace(missing_value_index, np.nan)\n",
134
+ "\n",
135
+ "le_initial_df = pd.concat([le_initial_df, dropped_labels], axis = 1)"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "9fa21a95-21f1-4274-86ba-d7977813066b",
142
+ "metadata": {},
143
+ "outputs": [],
144
+ "source": [
145
+ "le_initial_df"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": null,
151
+ "id": "0ba54a81-d4de-4884-99d3-29867eb7ea40",
152
+ "metadata": {},
153
+ "outputs": [],
154
+ "source": [
155
+ "# Seperate and Combine\n",
156
+ "le_df_mild = le_initial_df[le_initial_df['depression_category'] == 'mild']\n",
157
+ "le_df_moderatesevere = le_initial_df[le_initial_df['depression_category'] == 'moderatesevere']\n",
158
+ "\n",
159
+ "# Check depression category counts\n",
160
+ "dataframes = [le_df_mild, le_df_moderatesevere]\n",
161
+ "le_initial_df = pd.concat(dataframes, ignore_index=True)\n",
162
+ "label_counts = le_initial_df['depression_category'].value_counts()\n",
163
+ "label_counts"
164
+ ]
165
+ },
166
+ {
167
+ "cell_type": "code",
168
+ "execution_count": null,
169
+ "id": "8c5bdf52-c645-4a11-920a-579e28db1a50",
170
+ "metadata": {},
171
+ "outputs": [],
172
+ "source": [
173
+ "# Imputation\n",
174
+ "different_le_dfs = [le_df_mild, le_df_moderatesevere]\n",
175
+ "imputed_le_dfs = []\n",
176
+ "from sklearn.impute import IterativeImputer\n",
177
+ "for le_df in different_le_dfs:\n",
178
+ " y = le_df['depression_category']\n",
179
+ " X = le_df.drop('depression_category', axis = 1)\n",
180
+ " \n",
181
+ " imputer = SimpleImputer(strategy='median')\n",
182
+ " imputed_data = imputer.fit_transform(X)\n",
183
+ " imputed_df = pd.DataFrame(imputed_data, columns = X.columns)\n",
184
+ "\n",
185
+ " imputed_df['depression_category'] = y.reset_index(drop = True)\n",
186
+ " imputed_le_dfs.append(imputed_df)\n",
187
+ "\n",
188
+ "concatenated_le_dfs = pd.concat(imputed_le_dfs, ignore_index = True)\n",
189
+ "concatenated_le_dfs"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "0e871a5f-beab-4f90-b8c4-e922ef86d0f0",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "# Full label encode depression category\n",
200
+ "fully_LE_concatenated_le_dfs = concatenated_le_dfs.copy()\n",
201
+ "fully_LE_concatenated_le_dfs['depression_category'] = label_encoder.fit_transform(fully_LE_concatenated_le_dfs['depression_category'])\n",
202
+ "\n",
203
+ "# The dataset after category connect, imputation, and label encoding\n",
204
+ "splitted_dataset = fully_LE_concatenated_le_dfs.copy()\n",
205
+ "splitted_dataset"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "markdown",
210
+ "id": "198fdc3c-ccbc-4ac5-8621-c0c5dc771cbb",
211
+ "metadata": {},
212
+ "source": [
213
+ "### Setup for training"
214
+ ]
215
+ },
216
+ {
217
+ "cell_type": "code",
218
+ "execution_count": null,
219
+ "id": "101c5549-7bc3-4573-867b-f09776e254db",
220
+ "metadata": {
221
+ "jupyter": {
222
+ "source_hidden": true
223
+ }
224
+ },
225
+ "outputs": [],
226
+ "source": [
227
+ "def plot_combined_roc_curve(roc_curves, classifier_names):\n",
228
+ " plt.figure(figsize=(12, 8))\n",
229
+ " mean_fpr = np.linspace(0, 1, 100)\n",
230
+ " colors = plt.cm.get_cmap('tab20', len(classifier_names))\n",
231
+ " \n",
232
+ " for i, clf_name in enumerate(classifier_names):\n",
233
+ " tprs = []\n",
234
+ " for fpr, tpr in roc_curves[clf_name]:\n",
235
+ " tprs.append(np.interp(mean_fpr, fpr, tpr))\n",
236
+ " mean_tpr = np.mean(tprs, axis=0)\n",
237
+ " mean_tpr[-1] = 1.0\n",
238
+ " mean_auc = auc(mean_fpr, mean_tpr)\n",
239
+ " plt.plot(mean_fpr, mean_tpr, color=colors(i), lw=2, linestyle='-', marker='o', markersize=4, \n",
240
+ " label=f'{clf_name} (AUC = {mean_auc:.3f})')\n",
241
+ "\n",
242
+ " plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')\n",
243
+ " plt.xlim([0.0, 1.0])\n",
244
+ " plt.ylim([0.0, 1.05])\n",
245
+ " plt.xlabel('False Positive Rate', fontsize=26)\n",
246
+ " plt.ylabel('True Positive Rate', fontsize=26)\n",
247
+ " plt.xticks(fontsize=30)\n",
248
+ " plt.yticks(fontsize=30)\n",
249
+ " plt.legend(loc=\"lower right\", fontsize=22, frameon=True, framealpha=0.9)\n",
250
+ " plt.grid(True)\n",
251
+ "\n",
252
+ " filename='bonk.svg'\n",
253
+ "\n",
254
+ " plt.savefig(filename, format='svg')\n",
255
+ " plt.show()\n",
256
+ "\n",
257
+ " display(FileLink(filename))\n",
258
+ "\n",
259
+ "# Preparation code to make CD diagram from older version of Orange\n",
260
+ "def compute_CD(avranks, n, alpha=\"0.05\", test=\"nemenyi\"):\n",
261
+ " \"\"\"\n",
262
+ " Returns critical difference for Nemenyi or Bonferroni-Dunn test\n",
263
+ " according to given alpha (either alpha=\"0.05\" or alpha=\"0.1\") for average\n",
264
+ " ranks and number of tested datasets N. Test can be either \"nemenyi\" for\n",
265
+ " for Nemenyi two tailed test or \"bonferroni-dunn\" for Bonferroni-Dunn test.\n",
266
+ "\n",
267
+ " This function is deprecated and will be removed in Orange 3.34.\n",
268
+ " \"\"\"\n",
269
+ " k = len(avranks)\n",
270
+ " d = {(\"nemenyi\", \"0.05\"): [0, 0, 1.959964, 2.343701, 2.569032, 2.727774,\n",
271
+ " 2.849705, 2.94832, 3.030879, 3.101730, 3.163684,\n",
272
+ " 3.218654, 3.268004, 3.312739, 3.353618, 3.39123,\n",
273
+ " 3.426041, 3.458425, 3.488685, 3.517073,\n",
274
+ " 3.543799],\n",
275
+ " (\"nemenyi\", \"0.1\"): [0, 0, 1.644854, 2.052293, 2.291341, 2.459516,\n",
276
+ " 2.588521, 2.692732, 2.779884, 2.854606, 2.919889,\n",
277
+ " 2.977768, 3.029694, 3.076733, 3.119693, 3.159199,\n",
278
+ " 3.195743, 3.229723, 3.261461, 3.291224, 3.319233],\n",
279
+ " (\"bonferroni-dunn\", \"0.05\"): [0, 0, 1.960, 2.241, 2.394, 2.498, 2.576,\n",
280
+ " 2.638, 2.690, 2.724, 2.773],\n",
281
+ " (\"bonferroni-dunn\", \"0.1\"): [0, 0, 1.645, 1.960, 2.128, 2.241, 2.326,\n",
282
+ " 2.394, 2.450, 2.498, 2.539]}\n",
283
+ " q = d[(test, alpha)]\n",
284
+ " cd = q[k] * (k * (k + 1) / (6.0 * n)) ** 0.5\n",
285
+ " return cd\n",
286
+ "\n",
287
+ "\n",
288
+ "def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None,\n",
289
+ " width=6, textspace=1, reverse=False, filename=None, **kwargs):\n",
290
+ " \"\"\"\n",
291
+ " Draws a CD graph, which is used to display the differences in methods'\n",
292
+ " performance. See Janez Demsar, Statistical Comparisons of Classifiers over\n",
293
+ " Multiple Data Sets, 7(Jan):1--30, 2006.\n",
294
+ "\n",
295
+ " Needs matplotlib to work.\n",
296
+ "\n",
297
+ " The image is ploted on `plt` imported using\n",
298
+ " `import matplotlib.pyplot as plt`.\n",
299
+ "\n",
300
+ " This function is deprecated and will be removed in Orange 3.34.\n",
301
+ "\n",
302
+ " Args:\n",
303
+ " avranks (list of float): average ranks of methods.\n",
304
+ " names (list of str): names of methods.\n",
305
+ " cd (float): Critical difference used for statistically significance of\n",
306
+ " difference between methods.\n",
307
+ " cdmethod (int, optional): the method that is compared with other methods\n",
308
+ " If omitted, show pairwise comparison of methods\n",
309
+ " lowv (int, optional): the lowest shown rank\n",
310
+ " highv (int, optional): the highest shown rank\n",
311
+ " width (int, optional): default width in inches (default: 6)\n",
312
+ " textspace (int, optional): space on figure sides (in inches) for the\n",
313
+ " method names (default: 1)\n",
314
+ " reverse (bool, optional): if set to `True`, the lowest rank is on the\n",
315
+ " right (default: `False`)\n",
316
+ " filename (str, optional): output file name (with extension). If not\n",
317
+ " given, the function does not write a file.\n",
318
+ " \"\"\"\n",
319
+ " try:\n",
320
+ " import matplotlib.pyplot as plt\n",
321
+ " from matplotlib.backends.backend_agg import FigureCanvasAgg\n",
322
+ " except ImportError:\n",
323
+ " raise ImportError(\"Function graph_ranks requires matplotlib.\")\n",
324
+ "\n",
325
+ " width = float(width)\n",
326
+ " textspace = float(textspace)\n",
327
+ "\n",
328
+ " def nth(l, n):\n",
329
+ " \"\"\"\n",
330
+ " Returns only nth elemnt in a list.\n",
331
+ " \"\"\"\n",
332
+ " n = lloc(l, n)\n",
333
+ " return [a[n] for a in l]\n",
334
+ "\n",
335
+ " def lloc(l, n):\n",
336
+ " \"\"\"\n",
337
+ " List location in list of list structure.\n",
338
+ " Enable the use of negative locations:\n",
339
+ " -1 is the last element, -2 second last...\n",
340
+ " \"\"\"\n",
341
+ " if n < 0:\n",
342
+ " return len(l[0]) + n\n",
343
+ " else:\n",
344
+ " return n\n",
345
+ "\n",
346
+ " def mxrange(lr):\n",
347
+ " \"\"\"\n",
348
+ " Multiple xranges. Can be used to traverse matrices.\n",
349
+ " This function is very slow due to unknown number of\n",
350
+ " parameters.\n",
351
+ "\n",
352
+ " >>> mxrange([3,5])\n",
353
+ " [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]\n",
354
+ "\n",
355
+ " >>> mxrange([[3,5,1],[9,0,-3]])\n",
356
+ " [(3, 9), (3, 6), (3, 3), (4, 9), (4, 6), (4, 3)]\n",
357
+ "\n",
358
+ " \"\"\"\n",
359
+ " if not len(lr):\n",
360
+ " yield ()\n",
361
+ " else:\n",
362
+ " # it can work with single numbers\n",
363
+ " index = lr[0]\n",
364
+ " if isinstance(index, int):\n",
365
+ " index = [index]\n",
366
+ " for a in range(*index):\n",
367
+ " for b in mxrange(lr[1:]):\n",
368
+ " yield tuple([a] + list(b))\n",
369
+ "\n",
370
+ " def print_figure(fig, *args, **kwargs):\n",
371
+ " canvas = FigureCanvasAgg(fig)\n",
372
+ " canvas.print_figure(*args, **kwargs)\n",
373
+ "\n",
374
+ " sums = avranks\n",
375
+ "\n",
376
+ " tempsort = sorted([(a, i) for i, a in enumerate(sums)], reverse=reverse)\n",
377
+ " ssums = nth(tempsort, 0)\n",
378
+ " sortidx = nth(tempsort, 1)\n",
379
+ " nnames = [names[x] for x in sortidx]\n",
380
+ "\n",
381
+ " if lowv is None:\n",
382
+ " lowv = min(1, int(math.floor(min(ssums))))\n",
383
+ " if highv is None:\n",
384
+ " highv = max(len(avranks), int(math.ceil(max(ssums))))\n",
385
+ "\n",
386
+ " cline = 0.4\n",
387
+ "\n",
388
+ " k = len(sums)\n",
389
+ "\n",
390
+ " lines = None\n",
391
+ "\n",
392
+ " linesblank = 0\n",
393
+ " scalewidth = width - 2 * textspace\n",
394
+ "\n",
395
+ " def rankpos(rank):\n",
396
+ " if not reverse:\n",
397
+ " a = rank - lowv\n",
398
+ " else:\n",
399
+ " a = highv - rank\n",
400
+ " return textspace + scalewidth / (highv - lowv) * a\n",
401
+ "\n",
402
+ " distanceh = 0.25\n",
403
+ "\n",
404
+ " if cd and cdmethod is None:\n",
405
+ " # get pairs of non significant methods\n",
406
+ "\n",
407
+ " def get_lines(sums, hsd):\n",
408
+ " # get all pairs\n",
409
+ " lsums = len(sums)\n",
410
+ " allpairs = [(i, j) for i, j in mxrange([[lsums], [lsums]]) if j > i]\n",
411
+ " # remove not significant\n",
412
+ " notSig = [(i, j) for i, j in allpairs\n",
413
+ " if abs(sums[i] - sums[j]) <= hsd]\n",
414
+ " # keep only longest\n",
415
+ "\n",
416
+ " def no_longer(ij_tuple, notSig):\n",
417
+ " i, j = ij_tuple\n",
418
+ " for i1, j1 in notSig:\n",
419
+ " if (i1 <= i and j1 > j) or (i1 < i and j1 >= j):\n",
420
+ " return False\n",
421
+ " return True\n",
422
+ "\n",
423
+ " longest = [(i, j) for i, j in notSig if no_longer((i, j), notSig)]\n",
424
+ "\n",
425
+ " return longest\n",
426
+ "\n",
427
+ " lines = get_lines(ssums, cd)\n",
428
+ " linesblank = 0.2 + 0.2 + (len(lines) - 1) * 0.1\n",
429
+ "\n",
430
+ " # add scale\n",
431
+ " distanceh = 0.25\n",
432
+ " cline += distanceh\n",
433
+ "\n",
434
+ " # calculate height needed height of an image\n",
435
+ " minnotsignificant = max(2 * 0.2, linesblank)\n",
436
+ " height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant\n",
437
+ "\n",
438
+ " fig = plt.figure(figsize=(width, height))\n",
439
+ " fig.set_facecolor('white')\n",
440
+ " ax = fig.add_axes([0, 0, 1, 1]) # reverse y axis\n",
441
+ " ax.set_axis_off()\n",
442
+ "\n",
443
+ " hf = 1. / height # height factor\n",
444
+ " wf = 1. / width\n",
445
+ "\n",
446
+ " def hfl(l):\n",
447
+ " return [a * hf for a in l]\n",
448
+ "\n",
449
+ " def wfl(l):\n",
450
+ " return [a * wf for a in l]\n",
451
+ "\n",
452
+ "\n",
453
+ " # Upper left corner is (0,0).\n",
454
+ " ax.plot([0, 1], [0, 1], c=\"w\")\n",
455
+ " ax.set_xlim(0, 1)\n",
456
+ " ax.set_ylim(1, 0)\n",
457
+ "\n",
458
+ " def line(l, color='k', **kwargs):\n",
459
+ " \"\"\"\n",
460
+ " Input is a list of pairs of points.\n",
461
+ " \"\"\"\n",
462
+ " ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)\n",
463
+ "\n",
464
+ " def text(x, y, s, *args, **kwargs):\n",
465
+ " ax.text(wf * x, hf * y, s, fontsize = 14, *args, **kwargs)\n",
466
+ "\n",
467
+ " line([(textspace, cline), (width - textspace, cline)], linewidth=0.7)\n",
468
+ "\n",
469
+ " bigtick = 0.1\n",
470
+ " smalltick = 0.05\n",
471
+ "\n",
472
+ " tick = None\n",
473
+ " for a in list(np.arange(lowv, highv, 0.5)) + [highv]:\n",
474
+ " tick = smalltick\n",
475
+ " if a == int(a):\n",
476
+ " tick = bigtick\n",
477
+ " line([(rankpos(a), cline - tick / 2),\n",
478
+ " (rankpos(a), cline)],\n",
479
+ " linewidth=0.7)\n",
480
+ "\n",
481
+ " for a in range(lowv, highv + 1):\n",
482
+ " text(rankpos(a), cline - tick / 2 - 0.05, str(a),\n",
483
+ " ha=\"center\", va=\"bottom\")\n",
484
+ "\n",
485
+ " k = len(ssums)\n",
486
+ "\n",
487
+ " for i in range(math.ceil(k / 2)):\n",
488
+ " chei = cline + minnotsignificant + i * 0.2\n",
489
+ " line([(rankpos(ssums[i]), cline),\n",
490
+ " (rankpos(ssums[i]), chei),\n",
491
+ " (textspace - 0.1, chei)],\n",
492
+ " linewidth=0.7)\n",
493
+ " text(textspace - 0.2, chei, nnames[i], ha=\"right\", va=\"center\")\n",
494
+ "\n",
495
+ " for i in range(math.ceil(k / 2), k):\n",
496
+ " chei = cline + minnotsignificant + (k - i - 1) * 0.2\n",
497
+ " line([(rankpos(ssums[i]), cline),\n",
498
+ " (rankpos(ssums[i]), chei),\n",
499
+ " (textspace + scalewidth + 0.1, chei)],\n",
500
+ " linewidth=0.7)\n",
501
+ " text(textspace + scalewidth + 0.2, chei, nnames[i],\n",
502
+ " ha=\"left\", va=\"center\")\n",
503
+ "\n",
504
+ " if cd and cdmethod is None:\n",
505
+ " # upper scale\n",
506
+ " if not reverse:\n",
507
+ " begin, end = rankpos(lowv), rankpos(lowv + cd)\n",
508
+ " else:\n",
509
+ " begin, end = rankpos(highv), rankpos(highv - cd)\n",
510
+ "\n",
511
+ " line([(begin, distanceh), (end, distanceh)], linewidth=0.7)\n",
512
+ " line([(begin, distanceh + bigtick / 2),\n",
513
+ " (begin, distanceh - bigtick / 2)],\n",
514
+ " linewidth=0.7)\n",
515
+ " line([(end, distanceh + bigtick / 2),\n",
516
+ " (end, distanceh - bigtick / 2)],\n",
517
+ " linewidth=0.7)\n",
518
+ " text((begin + end) / 2, distanceh - 0.05, \"CD\",\n",
519
+ " ha=\"center\", va=\"bottom\")\n",
520
+ "\n",
521
+ " # no-significance lines\n",
522
+ " def draw_lines(lines, side=0.05, height=0.1):\n",
523
+ " start = cline + 0.2\n",
524
+ " for l, r in lines:\n",
525
+ " line([(rankpos(ssums[l]) - side, start),\n",
526
+ " (rankpos(ssums[r]) + side, start)],\n",
527
+ " linewidth=2.5)\n",
528
+ " start += height\n",
529
+ "\n",
530
+ " draw_lines(lines)\n",
531
+ "\n",
532
+ " elif cd:\n",
533
+ " begin = rankpos(avranks[cdmethod] - cd)\n",
534
+ " end = rankpos(avranks[cdmethod] + cd)\n",
535
+ " line([(begin, cline), (end, cline)],\n",
536
+ " linewidth=2.5)\n",
537
+ " line([(begin, cline + bigtick / 2),\n",
538
+ " (begin, cline - bigtick / 2)],\n",
539
+ " linewidth=2.5)\n",
540
+ " line([(end, cline + bigtick / 2),\n",
541
+ " (end, cline - bigtick / 2)],\n",
542
+ " linewidth=2.5)\n",
543
+ "\n",
544
+ " if filename:\n",
545
+ " print_figure(fig, filename, **kwargs)\n",
546
+ "\n",
547
+ "def train_evaluate_model(clf, X_train, y_train, X_test, y_test, clf_name='Classifier'):\n",
548
+ " clf.fit(X_train, y_train)\n",
549
+ " y_pred = clf.predict(X_test)\n",
550
+ " \n",
551
+ " accuracy = accuracy_score(y_test, y_pred)\n",
552
+ " precision = precision_score(y_test, y_pred, average='weighted')\n",
553
+ " recall = recall_score(y_test, y_pred, average='weighted')\n",
554
+ " f1 = f1_score(y_test, y_pred, average='weighted')\n",
555
+ " conf_matrix = confusion_matrix(y_test, y_pred)\n",
556
+ " \n",
557
+ " if hasattr(clf, 'predict_proba'):\n",
558
+ " y_score = clf.predict_proba(X_test)[:, 1]\n",
559
+ " else:\n",
560
+ " y_score = clf.decision_function(X_test)\n",
561
+ " \n",
562
+ " fpr, tpr, _ = roc_curve(y_test, y_score)\n",
563
+ " roc_auc = auc(fpr, tpr)\n",
564
+ " \n",
565
+ " print(f'{clf_name} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}')\n",
566
+ " return accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc"
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "code",
571
+ "execution_count": null,
572
+ "id": "3e602f4d-efb2-4ac9-99bb-c6fbeca791cc",
573
+ "metadata": {
574
+ "jupyter": {
575
+ "source_hidden": true
576
+ }
577
+ },
578
+ "outputs": [],
579
+ "source": [
580
+ "warnings.filterwarnings('ignore')"
581
+ ]
582
+ },
583
+ {
584
+ "cell_type": "markdown",
585
+ "id": "773b2524-af11-464f-97a9-f4add85be0d2",
586
+ "metadata": {},
587
+ "source": [
588
+ "### Training (classic/static)\n",
589
+ "In order to run classical/static, make sure to uncomment the one you need. \"Post Training\" is after one of these classical/static is done."
590
+ ]
591
+ },
592
+ {
593
+ "cell_type": "markdown",
594
+ "id": "f5b8873c-13d3-43b7-8902-14b73e8d5409",
595
+ "metadata": {},
596
+ "source": [
597
+ "#### Classical Classifiers"
598
+ ]
599
+ },
600
+ {
601
+ "cell_type": "code",
602
+ "execution_count": null,
603
+ "id": "264648a0-b0d8-4a63-9167-cea3a4ed1e18",
604
+ "metadata": {
605
+ "scrolled": true
606
+ },
607
+ "outputs": [],
608
+ "source": [
609
+ "# Optimized Classifiers Det\n",
610
+ "# classifiers = {\n",
611
+ "# 'DT': DecisionTreeClassifier(\n",
612
+ "# random_state=0, \n",
613
+ "# criterion='gini', \n",
614
+ "# max_depth=6, \n",
615
+ "# min_samples_leaf=10, \n",
616
+ "# min_samples_split=9\n",
617
+ "# ),\n",
618
+ "# 'LR': LogisticRegression(\n",
619
+ "# random_state=0, \n",
620
+ "# C=0.09659168435718246, \n",
621
+ "# max_iter=100, \n",
622
+ "# solver='lbfgs'\n",
623
+ "# ),\n",
624
+ "# 'NB': GaussianNB(\n",
625
+ "# var_smoothing=0.0058873326349240295\n",
626
+ "# ),\n",
627
+ "# 'KN': KNeighborsClassifier(\n",
628
+ "# metric='manhattan', \n",
629
+ "# n_neighbors=8, \n",
630
+ "# weights='uniform'\n",
631
+ "# ),\n",
632
+ "# 'MLP': MLPClassifier(\n",
633
+ "# random_state=0, \n",
634
+ "# max_iter=1000, \n",
635
+ "# alpha=0.0003079393718075164, \n",
636
+ "# hidden_layer_sizes=195, \n",
637
+ "# learning_rate_init=0.0001675266159417717\n",
638
+ "# ),\n",
639
+ "# 'SVC': SVC(probability=True, kernel = 'rbf', C = 0.95, gamma = 'scale')}\n",
640
+ "\n",
641
+ "# Optimized Classifiers Sevpred\n",
642
+ "# classifiers = {\n",
643
+ "# 'DT': DecisionTreeClassifier(\n",
644
+ "# random_state=0, \n",
645
+ "# criterion='entropy', \n",
646
+ "# max_depth=20, \n",
647
+ "# min_samples_leaf=8, \n",
648
+ "# min_samples_split=6\n",
649
+ "# ),\n",
650
+ "# 'LR': LogisticRegression(\n",
651
+ "# random_state=0, \n",
652
+ "# C=2.2095350994035026, \n",
653
+ "# max_iter=152, \n",
654
+ "# solver='lbfgs'\n",
655
+ "# ),\n",
656
+ "# 'NB': GaussianNB(\n",
657
+ "# var_smoothing=0.00995456588724228\n",
658
+ "# ),\n",
659
+ "# 'KN': KNeighborsClassifier(\n",
660
+ "# metric='manhattan', \n",
661
+ "# n_neighbors=1, \n",
662
+ "# weights='uniform'\n",
663
+ "# ),\n",
664
+ "# 'MLP': MLPClassifier(\n",
665
+ "# random_state=0, \n",
666
+ "# max_iter=1000, \n",
667
+ "# alpha=8.512480164062713e-06, \n",
668
+ "# hidden_layer_sizes=87, \n",
669
+ "# learning_rate_init=0.002859975932024275\n",
670
+ "# ),\n",
671
+ "# 'SVC': SVC(\n",
672
+ "# probability=True, \n",
673
+ "# kernel='rbf', \n",
674
+ "# C=100, \n",
675
+ "# gamma=0.1\n",
676
+ "# )\n",
677
+ "# }\n",
678
+ "\n",
679
+ "# Default classifiers\n",
680
+ "# classifiers = {\n",
681
+ "# 'DecisionTree': DecisionTreeClassifier(random_state=0),\n",
682
+ "# 'LogisticRegression': LogisticRegression(max_iter=1000, random_state=0),\n",
683
+ "# 'NaiveBayes': GaussianNB(),\n",
684
+ "# 'KNeighbors': KNeighborsClassifier(),\n",
685
+ "# 'MLP': MLPClassifier(max_iter=1000, random_state=0),\n",
686
+ "# 'SVC': SVC(probability=True, random_state=0)\n",
687
+ "# }\n",
688
+ "\n",
689
+ "# Main\n",
690
+ "# Initialize\n",
691
+ "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n",
692
+ "conf_matrices = defaultdict(list)\n",
693
+ "roc_curves = defaultdict(list)\n",
694
+ "roc_aucs = defaultdict(list)\n",
695
+ "accuracy_scores = defaultdict(list)\n",
696
+ "precision_scores = defaultdict(list)\n",
697
+ "recall_scores = defaultdict(list)\n",
698
+ "f1_scores = defaultdict(list)\n",
699
+ "\n",
700
+ "# Loop over 10 different random states\n",
701
+ "for random_state in range(10):\n",
702
+ " print(f\"Processing for Random State: {random_state}\")\n",
703
+ "\n",
704
+ " # Splitting the data\n",
705
+ " X = splitted_dataset.drop('depression_category', axis=1)\n",
706
+ " y = splitted_dataset['depression_category']\n",
707
+ " \n",
708
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
709
+ " \n",
710
+ " # Identify outliers in the training dataset\n",
711
+ " lof = LocalOutlierFactor()\n",
712
+ " yhat = lof.fit_predict(X_train)\n",
713
+ " # Select all rows that are not outliers\n",
714
+ " mask = yhat != -1\n",
715
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
716
+ " \n",
717
+ " original_columns = X.columns.tolist()\n",
718
+ "\n",
719
+ " # SMOTE\n",
720
+ " smote = SMOTE(random_state=random_state)\n",
721
+ " X_res, y_res = smote.fit_resample(X_train, y_train)\n",
722
+ "\n",
723
+ " print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n",
724
+ "\n",
725
+ " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n",
726
+ " \n",
727
+ " sampling_strategy_undersample = {0: 372}\n",
728
+ " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
729
+ " X_test, y_test = rus.fit_resample(X_test, y_test)\n",
730
+ " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
731
+ "\n",
732
+ " # Normalization\n",
733
+ " scaler = MinMaxScaler()\n",
734
+ " \n",
735
+ " X_res = scaler.fit_transform(X_res)\n",
736
+ " X_test = scaler.transform(X_test)\n",
737
+ " \n",
738
+ " X_res = pd.DataFrame(X_res, columns=original_columns)\n",
739
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
740
+ "\n",
741
+ " # Correlation Feat Analysis\n",
742
+ " corr_df = X_res.copy()\n",
743
+ " corr_df['target'] = y_res\n",
744
+ " \n",
745
+ " corr_mat = corr_df.corr()\n",
746
+ " target_correlation = corr_mat['target'].drop('target')\n",
747
+ " top_features = target_correlation.abs().sort_values(ascending=False).head(200).index.tolist()\n",
748
+ " \n",
749
+ " # Only take top features\n",
750
+ " X_res_fi = X_res[top_features]\n",
751
+ " X_test_fi = X_test[top_features]\n",
752
+ "\n",
753
+ " # Evaluate classifiers\n",
754
+ " for clf_name, clf in classifiers.items():\n",
755
+ " # Ensure the random state for classifiers is consistent\n",
756
+ " if hasattr(clf, 'random_state'):\n",
757
+ " clf.set_params(random_state=random_state)\n",
758
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n",
759
+ " metric_sums[clf_name]['accuracy'] += accuracy\n",
760
+ " metric_sums[clf_name]['precision'] += precision\n",
761
+ " metric_sums[clf_name]['recall'] += recall\n",
762
+ " metric_sums[clf_name]['f1'] += f1\n",
763
+ " conf_matrices[clf_name].append(conf_matrix)\n",
764
+ " roc_curves[clf_name].append((fpr, tpr))\n",
765
+ " roc_aucs[clf_name].append(roc_auc)\n",
766
+ " accuracy_scores[clf_name].append(accuracy)\n",
767
+ " precision_scores[clf_name].append(precision)\n",
768
+ " recall_scores[clf_name].append(recall)\n",
769
+ " f1_scores[clf_name].append(f1)"
770
+ ]
771
+ },
772
+ {
773
+ "cell_type": "markdown",
774
+ "id": "0ec7e67e-a086-48a7-83de-26b54ef03899",
775
+ "metadata": {},
776
+ "source": [
777
+ "#### Static Classifiers"
778
+ ]
779
+ },
780
+ {
781
+ "cell_type": "code",
782
+ "execution_count": null,
783
+ "id": "e9e20dd2-bdd8-4626-be89-8bb866212de9",
784
+ "metadata": {
785
+ "scrolled": true
786
+ },
787
+ "outputs": [],
788
+ "source": [
789
+ "# Initialize\n",
790
+ "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n",
791
+ "conf_matrices = defaultdict(list)\n",
792
+ "roc_curves = defaultdict(list)\n",
793
+ "roc_aucs = defaultdict(list)\n",
794
+ "accuracy_scores = defaultdict(list)\n",
795
+ "precision_scores = defaultdict(list)\n",
796
+ "recall_scores = defaultdict(list)\n",
797
+ "f1_scores = defaultdict(list)\n",
798
+ "\n",
799
+ "# Optimized Classifiers Detection\n",
800
+ "# classifiers = {\n",
801
+ "# 'RF': RandomForestClassifier(n_estimators=143, criterion='entropy', max_depth=15, random_state=0),\n",
802
+ "# 'XGB': XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n",
803
+ "# 'GB': GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.05),\n",
804
+ " # 'AB': AdaBoostClassifier(n_estimators=400, learning_rate=0.1),\n",
805
+ " # 'CB': CatBoostClassifier(depth = 3, iterations = 168, learning_rate = 0.1, verbose = 0),\n",
806
+ " # 'LGBM': LGBMClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200) \n",
807
+ "# }\n",
808
+ "\n",
809
+ "\n",
810
+ "# Optimized Classifiers SevPred\n",
811
+ "# classifiers = {\n",
812
+ "# 'RF': RandomForestClassifier(\n",
813
+ "# n_estimators=300, \n",
814
+ "# criterion='entropy', \n",
815
+ "# max_depth=15, \n",
816
+ "# bootstrap=False, \n",
817
+ "# random_state=0\n",
818
+ "# ),\n",
819
+ "# 'XGB': XGBClassifier(\n",
820
+ "# n_estimators=300, \n",
821
+ "# max_depth=5, \n",
822
+ "# learning_rate=0.2, \n",
823
+ "# gamma=0.0, \n",
824
+ "# use_label_encoder=False, \n",
825
+ "# eval_metric='mlogloss', \n",
826
+ "# random_state=0\n",
827
+ "# ),\n",
828
+ "# 'GB': GradientBoostingClassifier(\n",
829
+ "# n_estimators=100, \n",
830
+ "# max_depth=5, \n",
831
+ "# learning_rate=0.1, \n",
832
+ "# subsample=0.7\n",
833
+ "# ),\n",
834
+ "# 'AB': AdaBoostClassifier(\n",
835
+ "# n_estimators=300, \n",
836
+ "# learning_rate=0.5, \n",
837
+ "# algorithm='SAMME'\n",
838
+ "# ),\n",
839
+ "# 'CB': CatBoostClassifier(\n",
840
+ "# depth=4,\n",
841
+ "# iterations=180,\n",
842
+ "# learning_rate=0.09,\n",
843
+ "# verbose=0\n",
844
+ "# ),\n",
845
+ "# 'LGBM': LGBMClassifier(\n",
846
+ "# learning_rate=0.08,\n",
847
+ "# max_depth=4,\n",
848
+ "# n_estimators=220\n",
849
+ "# )\n",
850
+ "# }\n",
851
+ "\n",
852
+ "\n",
853
+ "# Default Classifiers\n",
854
+ "# classifiers = {\n",
855
+ "# 'RandomForest': RandomForestClassifier(n_estimators=100, criterion='entropy', max_depth=7, random_state=0),\n",
856
+ "# 'XGBoost': XGBClassifier(n_estimators=100, max_depth=7, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n",
857
+ "# 'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=0),\n",
858
+ "# 'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=0),\n",
859
+ "# 'CatBoost': CatBoostClassifier(n_estimators=100, verbose=0, random_state=0),\n",
860
+ "# 'LightGBM': LGBMClassifier(n_estimators=100, random_state=0)\n",
861
+ "# }\n",
862
+ "\n",
863
+ "# voting_clf = VotingClassifier(estimators=[\n",
864
+ "# ('rf', classifiers['RF']),\n",
865
+ "# ('xgb', classifiers['XGB']),\n",
866
+ "# ('gb', classifiers['GB']),\n",
867
+ "# ('ada', classifiers['AB']),\n",
868
+ "# ('cat', classifiers['CB']),\n",
869
+ "# ('lgbm', classifiers['LGBM'])\n",
870
+ "# ], voting='soft', n_jobs=1)\n",
871
+ "\n",
872
+ "# classifiers['Vot'] = voting_clf\n",
873
+ "\n",
874
+ "# num_features = {\n",
875
+ "# 'RF': 150,\n",
876
+ "# 'XGB': 150,\n",
877
+ "# 'GB': 150,\n",
878
+ " # 'AB': 150,\n",
879
+ " # 'CB': 150,\n",
880
+ " # 'LGBM': 150,\n",
881
+ " # 'Vot': 150\n",
882
+ "# }\n",
883
+ "\n",
884
+ "for random_state in range(10):\n",
885
+ " print(f\"Processing for Random State: {random_state}\")\n",
886
+ "\n",
887
+ " X = splitted_dataset.drop('depression_category', axis=1)\n",
888
+ " y = splitted_dataset['depression_category']\n",
889
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
890
+ " \n",
891
+ " lof = LocalOutlierFactor()\n",
892
+ " yhat = lof.fit_predict(X_train)\n",
893
+ " mask = yhat != -1\n",
894
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
895
+ " \n",
896
+ " original_columns = X.columns.tolist()\n",
897
+ "\n",
898
+ " smote = SMOTE(random_state=random_state)\n",
899
+ " X_res, y_res = smote.fit_resample(X_train, y_train)\n",
900
+ "\n",
901
+ " print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n",
902
+ "\n",
903
+ " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n",
904
+ " \n",
905
+ " sampling_strategy_undersample = {0: 155}\n",
906
+ " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
907
+ " X_test, y_test = rus.fit_resample(X_test, y_test)\n",
908
+ "\n",
909
+ " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
910
+ "\n",
911
+ " scaler = MinMaxScaler()\n",
912
+ " \n",
913
+ " X_res = scaler.fit_transform(X_res)\n",
914
+ " X_test = scaler.transform(X_test)\n",
915
+ " \n",
916
+ " X_res = pd.DataFrame(X_res, columns=original_columns)\n",
917
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
918
+ "\n",
919
+ " log_reg = LogisticRegression(C=0.09659168435718246, max_iter=100, solver='lbfgs', random_state=random_state)\n",
920
+ " log_reg.fit(X_res, y_res)\n",
921
+ " selector = SelectFromModel(log_reg, prefit=True)\n",
922
+ " \n",
923
+ " importance = np.abs(log_reg.coef_[0])\n",
924
+ " indices = np.argsort(importance)[::-1]\n",
925
+ " important_features = [original_columns[i] for i in indices]\n",
926
+ " \n",
927
+ " for clf_name, clf in classifiers.items():\n",
928
+ " num_top_features = num_features[clf_name]\n",
929
+ " selected_features = important_features[:num_top_features]\n",
930
+ " \n",
931
+ " X_res_fi = pd.DataFrame(X_res, columns=original_columns)[selected_features]\n",
932
+ " X_test_fi = pd.DataFrame(X_test, columns=original_columns)[selected_features]\n",
933
+ "\n",
934
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(\n",
935
+ " clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name\n",
936
+ " )\n",
937
+ " metric_sums[clf_name]['accuracy'] += accuracy\n",
938
+ " metric_sums[clf_name]['precision'] += precision\n",
939
+ " metric_sums[clf_name]['recall'] += recall\n",
940
+ " metric_sums[clf_name]['f1'] += f1\n",
941
+ " conf_matrices[clf_name].append(conf_matrix)\n",
942
+ " roc_curves[clf_name].append((fpr, tpr))\n",
943
+ " roc_aucs[clf_name].append(roc_auc)\n",
944
+ " accuracy_scores[clf_name].append(accuracy)\n",
945
+ " precision_scores[clf_name].append(precision)\n",
946
+ " recall_scores[clf_name].append(recall)\n",
947
+ " f1_scores[clf_name].append(f1)"
948
+ ]
949
+ },
950
+ {
951
+ "cell_type": "markdown",
952
+ "id": "c0158f58-e6d5-4adf-90e3-f1892294ad24",
953
+ "metadata": {},
954
+ "source": [
955
+ "### Post Training (classic/static)\n",
956
+ "Only run after one of the training methods above are done"
957
+ ]
958
+ },
959
+ {
960
+ "cell_type": "code",
961
+ "execution_count": null,
962
+ "id": "c2ab5543-ad6e-4463-8d9a-35fe3e4e8eb4",
963
+ "metadata": {},
964
+ "outputs": [],
965
+ "source": [
966
+ "print('\\nAverage Metrics over 10 Random States:')\n",
967
+ "for clf_name, metrics in metric_sums.items():\n",
968
+ " avg_accuracy = metrics['accuracy'] / 10\n",
969
+ " avg_precision = metrics['precision'] / 10\n",
970
+ " avg_recall = metrics['recall'] / 10\n",
971
+ " avg_f1 = metrics['f1'] / 10\n",
972
+ " std_accuracy = np.std(accuracy_scores[clf_name])\n",
973
+ " std_precision = np.std(precision_scores[clf_name])\n",
974
+ " std_recall = np.std(recall_scores[clf_name])\n",
975
+ " std_f1 = np.std(f1_scores[clf_name])\n",
976
+ " avg_auc = np.mean(roc_aucs[clf_name])\n",
977
+ " print(f'{clf_name} - Accuracy: {avg_accuracy:.4f} ± {std_accuracy:.4f}, Precision: {avg_precision:.4f} ± {std_precision:.4f}, Recall: {avg_recall:.4f} ± {std_recall:.4f}, F1-Score: {avg_f1:.4f} ± {std_f1:.4f}, AUC: {avg_auc:.4f}')"
978
+ ]
979
+ },
980
+ {
981
+ "cell_type": "code",
982
+ "execution_count": null,
983
+ "id": "21d5c872-3452-41ca-8f7c-5f4ceb184fba",
984
+ "metadata": {},
985
+ "outputs": [],
986
+ "source": [
987
+ "# Plot ROC Curves for each classifier in one graph\n",
988
+ "plot_combined_roc_curve(roc_curves, classifiers.keys())"
989
+ ]
990
+ },
991
+ {
992
+ "cell_type": "code",
993
+ "execution_count": null,
994
+ "id": "26f7a715-efc8-44a6-8c53-e59b6268e551",
995
+ "metadata": {
996
+ "scrolled": true
997
+ },
998
+ "outputs": [],
999
+ "source": [
1000
+ "# FN Curve\n",
1001
+ "df = pd.DataFrame(accuracy_scores)\n",
1002
+ "scores = [df[col].values for col in df.columns]\n",
1003
+ "stat, p = friedmanchisquare(*scores)\n",
1004
+ "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n",
1005
+ "ranks = df.rank(axis=1, method='average', ascending=False)\n",
1006
+ "average_ranks = ranks.mean().values\n",
1007
+ "n_datasets = df.shape[0]\n",
1008
+ "alpha = 0.05\n",
1009
+ "cd = compute_CD(average_ranks, n_datasets, alpha='0.05')\n",
1010
+ "print(f'Critical Difference: {cd}')\n",
1011
+ "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n",
1012
+ "plt.figure(figsize=(14, 8))\n",
1013
+ "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n",
1014
+ "plt.text(0.5, 1.19, f'Friedman-Nemenyi: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=14)\n",
1015
+ "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=14)\n",
1016
+ "plt.tight_layout()"
1017
+ ]
1018
+ },
1019
+ {
1020
+ "cell_type": "markdown",
1021
+ "id": "88072d69-8bb5-46ab-9c79-6990db7cc8d2",
1022
+ "metadata": {},
1023
+ "source": [
1024
+ "### Hyperparameter optimization (classic/static)"
1025
+ ]
1026
+ },
1027
+ {
1028
+ "cell_type": "code",
1029
+ "execution_count": null,
1030
+ "id": "3a4a1546-c24f-4349-bf1b-75ba937700be",
1031
+ "metadata": {
1032
+ "scrolled": true
1033
+ },
1034
+ "outputs": [],
1035
+ "source": [
1036
+ "# Hyperparameter optimization classic\n",
1037
+ "search_spaces = {\n",
1038
+ " 'DecisionTree': {\n",
1039
+ " 'criterion': Categorical(['gini', 'entropy']),\n",
1040
+ " 'max_depth': Integer(1, 20),\n",
1041
+ " 'min_samples_split': Integer(2, 10),\n",
1042
+ " 'min_samples_leaf': Integer(1, 10)\n",
1043
+ " },\n",
1044
+ " 'LogisticRegression': {\n",
1045
+ " 'C': Real(1e-6, 1e+6, prior='log-uniform'),\n",
1046
+ " 'solver': Categorical(['lbfgs', 'liblinear']),\n",
1047
+ " 'max_iter': Integer(100, 1000)\n",
1048
+ " },\n",
1049
+ " 'NaiveBayes': {\n",
1050
+ " 'var_smoothing': Real(1e-9, 1e-2, prior='log-uniform')\n",
1051
+ " },\n",
1052
+ " 'KNeighbors': {\n",
1053
+ " 'n_neighbors': Integer(1, 30),\n",
1054
+ " 'weights': Categorical(['uniform', 'distance']),\n",
1055
+ " 'metric': Categorical(['euclidean', 'manhattan', 'minkowski'])\n",
1056
+ " },\n",
1057
+ " 'MLP': {\n",
1058
+ " 'hidden_layer_sizes': Integer(50, 200),\n",
1059
+ " 'alpha': Real(1e-6, 1e-2, prior='log-uniform'),\n",
1060
+ " 'learning_rate_init': Real(1e-4, 1e-2, prior='log-uniform')\n",
1061
+ " },\n",
1062
+ " 'SVC': {\n",
1063
+ " 'C': [0.1, 1, 10, 100, 1000],\n",
1064
+ " 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],\n",
1065
+ " 'kernel': ['rbf']\n",
1066
+ " }\n",
1067
+ "}\n",
1068
+ "\n",
1069
+ "classifiers = {\n",
1070
+ " 'DecisionTree': DecisionTreeClassifier(random_state=0),\n",
1071
+ " 'LogisticRegression': LogisticRegression(max_iter=1000, random_state=0),\n",
1072
+ " 'NaiveBayes': GaussianNB(),\n",
1073
+ " 'KNeighbors': KNeighborsClassifier(),\n",
1074
+ " 'MLP': MLPClassifier(max_iter=1000, random_state=0),\n",
1075
+ " 'SVC': SVC(probability=True, random_state=0)\n",
1076
+ "}\n",
1077
+ "\n",
1078
+ "top_features_count = {\n",
1079
+ " 'DecisionTree': 200,\n",
1080
+ " 'LogisticRegression': 200,\n",
1081
+ " 'NaiveBayes': 200,\n",
1082
+ " 'KNeighbors': 200,\n",
1083
+ " 'MLP': 200,\n",
1084
+ " 'SVC': 200\n",
1085
+ "}\n",
1086
+ "\n",
1087
+ "random_state = 0\n",
1088
+ "print(f\"Processing for Random State: {random_state}\")\n",
1089
+ "\n",
1090
+ "X = splitted_dataset.drop('depression_category', axis=1)\n",
1091
+ "y = splitted_dataset['depression_category']\n",
1092
+ "\n",
1093
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
1094
+ "\n",
1095
+ "lof = LocalOutlierFactor()\n",
1096
+ "yhat = lof.fit_predict(X_train)\n",
1097
+ "mask = yhat != -1\n",
1098
+ "X_train, y_train = X_train[mask], y_train[mask]\n",
1099
+ "\n",
1100
+ "original_columns = X.columns.tolist()\n",
1101
+ "\n",
1102
+ "smote = SMOTE(random_state=random_state)\n",
1103
+ "X_res, y_res = smote.fit_resample(X_train, y_train)\n",
1104
+ "\n",
1105
+ "print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n",
1106
+ "\n",
1107
+ "print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n",
1108
+ "\n",
1109
+ "sampling_strategy_undersample = {0: 155}\n",
1110
+ "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
1111
+ "X_test, y_test = rus.fit_resample(X_test, y_test)\n",
1112
+ "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
1113
+ "\n",
1114
+ "scaler = MinMaxScaler()\n",
1115
+ "\n",
1116
+ "X_res = scaler.fit_transform(X_res)\n",
1117
+ "X_test = scaler.transform(X_test)\n",
1118
+ "\n",
1119
+ "X_res = pd.DataFrame(X_res, columns=original_columns)\n",
1120
+ "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
1121
+ "\n",
1122
+ "corr_df = X_res.copy()\n",
1123
+ "corr_df['target'] = y_res\n",
1124
+ "\n",
1125
+ "corr_mat = corr_df.corr()\n",
1126
+ "target_correlation = corr_mat['target'].drop('target')\n",
1127
+ "\n",
1128
+ "for clf_name, clf in classifiers.items():\n",
1129
+ " print(f\"Optimizing {clf_name}\")\n",
1130
+ " \n",
1131
+ " top_features = target_correlation.abs().sort_values(ascending=False).head(top_features_count[clf_name]).index.tolist()\n",
1132
+ " \n",
1133
+ " X_res_fi = X_res[top_features]\n",
1134
+ " X_test_fi = X_test[top_features]\n",
1135
+ " \n",
1136
+ " opt = BayesSearchCV(clf, search_spaces[clf_name], n_iter=30, cv=3, random_state=random_state, n_jobs=-1, verbose = 30)\n",
1137
+ " opt.fit(X_res_fi, y_res)\n",
1138
+ " \n",
1139
+ " best_clf = opt.best_estimator_\n",
1140
+ " best_params = opt.best_params_\n",
1141
+ "\n",
1142
+ " print(f\"Best parameters for {clf_name}: {best_params}\")\n",
1143
+ " \n",
1144
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(best_clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n",
1145
+ " print(f\"Best results for {clf_name}:\")\n",
1146
+ " print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}, AUC: {roc_auc:.4f}')\n",
1147
+ " print(conf_matrix)\n",
1148
+ " print()"
1149
+ ]
1150
+ },
1151
+ {
1152
+ "cell_type": "code",
1153
+ "execution_count": null,
1154
+ "id": "2f473cc7-577d-4dcd-a8b1-be7f33200fbc",
1155
+ "metadata": {
1156
+ "scrolled": true
1157
+ },
1158
+ "outputs": [],
1159
+ "source": [
1160
+ "# Hyperparameter optimization static\n",
1161
+ "metric_sums = defaultdict(lambda: {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0})\n",
1162
+ "conf_matrices = defaultdict(list)\n",
1163
+ "accuracy_scores = defaultdict(list)\n",
1164
+ "precision_scores = defaultdict(list)\n",
1165
+ "recall_scores = defaultdict(list)\n",
1166
+ "f1_scores = defaultdict(list)\n",
1167
+ "\n",
1168
+ "classifiers = {\n",
1169
+ " # 'RandomForest': RandomForestClassifier(),\n",
1170
+ " # 'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),\n",
1171
+ " # 'AdaBoost': AdaBoostClassifier(),\n",
1172
+ " # 'GradientBoosting': GradientBoostingClassifier(),\n",
1173
+ " 'CatBoost': CatBoostClassifier(verbose=0),\n",
1174
+ " 'LightGBM': LGBMClassifier()\n",
1175
+ "}\n",
1176
+ "\n",
1177
+ "num_features = {\n",
1178
+ " # 'RandomForest': 150,\n",
1179
+ " # 'XGBoost': 150,\n",
1180
+ " # 'GradientBoosting': 150,\n",
1181
+ " # 'AdaBoost': 150,\n",
1182
+ " 'CatBoost': 150,\n",
1183
+ " 'LightGBM': 150,\n",
1184
+ "}\n",
1185
+ "\n",
1186
+ "search_spaces = {\n",
1187
+ " # 'RandomForest': {\n",
1188
+ " # 'n_estimators': [100, 200, 300],\n",
1189
+ " # 'criterion': ['gini', 'entropy'],\n",
1190
+ " # 'max_depth': [None, 7, 15],\n",
1191
+ " # 'bootstrap': [True, False]\n",
1192
+ " # },\n",
1193
+ " # 'XGBoost': {\n",
1194
+ " # 'n_estimators': [100, 200, 300],\n",
1195
+ " # 'max_depth': [5, 10],\n",
1196
+ " # 'learning_rate': [0.01, 0.1, 0.2],\n",
1197
+ " # 'gamma': [0, 0.2, 0.4],\n",
1198
+ " # },\n",
1199
+ " # 'GradientBoosting': {\n",
1200
+ " # 'n_estimators': [100, 200, 300],\n",
1201
+ " # 'learning_rate': [0.01, 0.1, 0.2],\n",
1202
+ " # 'max_depth': [5, 10],\n",
1203
+ " # 'subsample': [0.7, 0.9, 1.0],\n",
1204
+ " # },\n",
1205
+ " # 'AdaBoost': {\n",
1206
+ " # 'n_estimators': [100, 200, 300],\n",
1207
+ " # 'learning_rate': [0.1, 0.5, 1.0],\n",
1208
+ " # 'algorithm': ['SAMME', 'SAMME.R']\n",
1209
+ " # },\n",
1210
+ " 'CatBoost': {\n",
1211
+ " 'iterations': [100, 200, 300],\n",
1212
+ " 'depth': [5, 7, 9],\n",
1213
+ " # 'learning_rate': [0.01, 0.1, 0.2],\n",
1214
+ " },\n",
1215
+ " 'LightGBM': {\n",
1216
+ " 'n_estimators': [100, 200, 300],\n",
1217
+ " 'num_leaves': [31, 63, 127],\n",
1218
+ " # 'learning_rate': [0.01, 0.1, 0.2],\n",
1219
+ " # 'subsample': [0.7, 0.9, 1.0],\n",
1220
+ " }\n",
1221
+ "}\n",
1222
+ "\n",
1223
+ "def hyperparameter_optimization(clf, search_space, X, y):\n",
1224
+ " combined_results = []\n",
1225
+ " for random_state in range(3):\n",
1226
+ " cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)\n",
1227
+ " opt = BayesSearchCV(clf, search_space, n_iter=30, cv=cv, random_state=random_state, n_jobs=-1, verbose=0)\n",
1228
+ " opt.fit(X, y)\n",
1229
+ " combined_results.append(opt.best_params_)\n",
1230
+ " best_params = pd.DataFrame(combined_results).mode().iloc[0].to_dict()\n",
1231
+ " return best_params\n",
1232
+ "\n",
1233
+ "for random_state in range(9,10):\n",
1234
+ " print(f\"Processing for Random State: {random_state}\")\n",
1235
+ "\n",
1236
+ " X = splitted_dataset.drop('depression_category', axis=1)\n",
1237
+ " y = splitted_dataset['depression_category']\n",
1238
+ " \n",
1239
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
1240
+ " \n",
1241
+ " lof = LocalOutlierFactor()\n",
1242
+ " yhat = lof.fit_predict(X_train)\n",
1243
+ " mask = yhat != -1\n",
1244
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
1245
+ " \n",
1246
+ " original_columns = X.columns.tolist()\n",
1247
+ "\n",
1248
+ " smote = SMOTE(random_state=random_state)\n",
1249
+ " X_res, y_res = smote.fit_resample(X_train, y_train)\n",
1250
+ "\n",
1251
+ " print(f\"Number of training labels after ROS: {y_res.value_counts()}\")\n",
1252
+ "\n",
1253
+ " print(f\"Number of test labels before resampling: {y_test.value_counts()}\") \n",
1254
+ " \n",
1255
+ " sampling_strategy_undersample = {0: 155}\n",
1256
+ " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
1257
+ " X_test, y_test = rus.fit_resample(X_test, y_test)\n",
1258
+ "\n",
1259
+ " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
1260
+ "\n",
1261
+ " scaler = MinMaxScaler()\n",
1262
+ " \n",
1263
+ " X_res = scaler.fit_transform(X_res)\n",
1264
+ " X_test = scaler.transform(X_test)\n",
1265
+ " \n",
1266
+ " X_res = pd.DataFrame(X_res, columns=original_columns)\n",
1267
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
1268
+ "\n",
1269
+ " log_reg = LogisticRegression(C=0.09659168435718246, max_iter=100, solver='lbfgs', random_state=random_state)\n",
1270
+ " log_reg.fit(X_res, y_res)\n",
1271
+ " selector = SelectFromModel(log_reg, prefit=True)\n",
1272
+ " \n",
1273
+ " importance = np.abs(log_reg.coef_[0])\n",
1274
+ " indices = np.argsort(importance)[::-1]\n",
1275
+ " important_features = [original_columns[i] for i in indices[:300]]\n",
1276
+ "\n",
1277
+ " for clf_name, clf in classifiers.items():\n",
1278
+ " print(f\"Optimizing {clf_name}\")\n",
1279
+ " num_top_features = num_features[clf_name]\n",
1280
+ " selected_features = important_features[:num_top_features]\n",
1281
+ " \n",
1282
+ " X_res_fi = pd.DataFrame(X_res, columns=original_columns)[selected_features]\n",
1283
+ " \n",
1284
+ " best_params = hyperparameter_optimization(clf, search_spaces[clf_name], X_res_fi, y_res)\n",
1285
+ " if 'n_estimators' in best_params:\n",
1286
+ " best_params['n_estimators'] = int(best_params['n_estimators'])\n",
1287
+ " if 'max_depth' in best_params:\n",
1288
+ " best_params['max_depth'] = int(best_params['max_depth'])\n",
1289
+ " if 'iterations' in best_params:\n",
1290
+ " best_params['iterations'] = int(best_params['iterations'])\n",
1291
+ " clf.set_params(**best_params)\n",
1292
+ " print(f\"Best parameters for {clf_name}: {best_params}\")\n",
1293
+ "\n",
1294
+ " X_test_fi = pd.DataFrame(X_test, columns=original_columns)[selected_features]\n",
1295
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(clf, X_res_fi, y_res, X_test_fi, y_test, clf_name=clf_name)\n",
1296
+ " metric_sums[clf_name]['accuracy'] += accuracy\n",
1297
+ " metric_sums[clf_name]['precision'] += precision\n",
1298
+ " metric_sums[clf_name]['recall'] += recall\n",
1299
+ " metric_sums[clf_name]['f1'] += f1\n",
1300
+ " conf_matrices[clf_name].append(conf_matrix)\n",
1301
+ " accuracy_scores[clf_name].append(accuracy)\n",
1302
+ " precision_scores[clf_name].append(precision)\n",
1303
+ " recall_scores[clf_name].append(recall)\n",
1304
+ " f1_scores[clf_name].append(f1)"
1305
+ ]
1306
+ },
1307
+ {
1308
+ "cell_type": "markdown",
1309
+ "id": "65df93d4-12d3-4d68-b402-633354849dff",
1310
+ "metadata": {},
1311
+ "source": [
1312
+ "### DES Training (all)"
1313
+ ]
1314
+ },
1315
+ {
1316
+ "cell_type": "code",
1317
+ "execution_count": null,
1318
+ "id": "709bb1f8-249e-4409-a537-c6bbe9a399f3",
1319
+ "metadata": {
1320
+ "scrolled": true
1321
+ },
1322
+ "outputs": [],
1323
+ "source": [
1324
+ "metric_sums_des = {\n",
1325
+ " 'KNORAE': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1326
+ " 'KNORAU': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1327
+ " 'KNOP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1328
+ " 'DESMI': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1329
+ " 'METADES': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1330
+ " 'DESKNN': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1331
+ " 'DESP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1332
+ " 'FIRE-KNORA-U': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1333
+ " 'FIRE-KNORA-E': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1334
+ " 'FIRE-METADES': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1335
+ " 'FIRE-DESKNN': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1336
+ " 'FIRE-DESP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1337
+ " 'FIRE-KNOP': {'accuracy': 0, 'precision': 0, 'recall': 0, 'f1': 0},\n",
1338
+ "}\n",
1339
+ "\n",
1340
+ "conf_matrices_des = {\n",
1341
+ " 'KNORAE': [],\n",
1342
+ " 'KNORAU': [],\n",
1343
+ " 'KNOP': [],\n",
1344
+ " 'DESMI': [],\n",
1345
+ " 'METADES': [],\n",
1346
+ " 'DESKNN': [],\n",
1347
+ " 'DESP': [],\n",
1348
+ " 'FIRE-KNORA-U': [],\n",
1349
+ " 'FIRE-KNORA-E': [],\n",
1350
+ " 'FIRE-METADES': [],\n",
1351
+ " 'FIRE-DESKNN': [],\n",
1352
+ " 'FIRE-DESP': [],\n",
1353
+ " 'FIRE-KNOP': [],\n",
1354
+ "}\n",
1355
+ "\n",
1356
+ "roc_curves = defaultdict(list)\n",
1357
+ "roc_aucs = defaultdict(list)\n",
1358
+ "accuracy_scores = defaultdict(list)\n",
1359
+ "precision_scores = defaultdict(list)\n",
1360
+ "recall_scores = defaultdict(list)\n",
1361
+ "f1_scores = defaultdict(list)\n",
1362
+ "feature_importance_runs = []\n",
1363
+ "\n",
1364
+ "# Uncomment wanted combinations\n",
1365
+ "# base_classifiers = {\n",
1366
+ " # 'DecisionTree': DecisionTreeClassifier(\n",
1367
+ " # random_state=0, \n",
1368
+ " # criterion='gini', \n",
1369
+ " # max_depth=6, \n",
1370
+ " # min_samples_leaf=10, \n",
1371
+ " # min_samples_split=9\n",
1372
+ " # ),\n",
1373
+ " # 'LogisticRegression': LogisticRegression(\n",
1374
+ " # random_state=0, \n",
1375
+ " # C=0.09659168435718246, \n",
1376
+ " # max_iter=100, \n",
1377
+ " # solver='lbfgs'\n",
1378
+ " # ),\n",
1379
+ " # 'NaiveBayes': GaussianNB(\n",
1380
+ " # var_smoothing=0.0058873326349240295\n",
1381
+ " # ),\n",
1382
+ " # 'KNeighbors': KNeighborsClassifier(\n",
1383
+ " # metric='manhattan', \n",
1384
+ " # n_neighbors=15, \n",
1385
+ " # weights='uniform'\n",
1386
+ " # ),\n",
1387
+ " # 'MLP': MLPClassifier(\n",
1388
+ " # random_state=0, \n",
1389
+ " # max_iter=1000, \n",
1390
+ " # alpha=0.0003079393718075164, \n",
1391
+ " # hidden_layer_sizes=195, \n",
1392
+ " # learning_rate_init=0.0001675266159417717\n",
1393
+ " # ),\n",
1394
+ " # 'SVC': SVC(probability=True, kernel = 'rbf', C = 1.5, gamma = 'auto'),\n",
1395
+ " # 'RF': RandomForestClassifier(n_estimators=143, criterion='entropy', max_depth=15, random_state=0),\n",
1396
+ " # 'XGB': XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=0),\n",
1397
+ " # 'GB': GradientBoostingClassifier(n_estimators=300, max_depth=3, learning_rate=0.05),\n",
1398
+ " # 'AB': AdaBoostClassifier(n_estimators=400, learning_rate=0.1),\n",
1399
+ " # 'CB': CatBoostClassifier(depth = 3, iterations = 168, learning_rate = 0.1, verbose = 0),\n",
1400
+ " # 'LGBM': LGBMClassifier(learning_rate = 0.1, max_depth = 3, n_estimators = 200) \n",
1401
+ "# }\n",
1402
+ "\n",
1403
+ "random_state = 0\n",
1404
+ "\n",
1405
+ "for random_state in range(10):\n",
1406
+ " print(f\"Processing for Random State: {random_state}\")\n",
1407
+ "\n",
1408
+ " X = splitted_dataset.drop('depression_category', axis=1)\n",
1409
+ " y = splitted_dataset['depression_category']\n",
1410
+ " \n",
1411
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
1412
+ " \n",
1413
+ " lof = LocalOutlierFactor()\n",
1414
+ " yhat = lof.fit_predict(X_train)\n",
1415
+ " mask = yhat != -1\n",
1416
+ " X_train, y_train = X_train[mask], y_train[mask]\n",
1417
+ " \n",
1418
+ " original_columns = X.columns.tolist()\n",
1419
+ "\n",
1420
+ " smote = SMOTE(random_state=random_state)\n",
1421
+ " X_res, y_res = smote.fit_resample(X_train, y_train)\n",
1422
+ "\n",
1423
+ " print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n",
1424
+ " sampling_strategy_undersample = {0: 155}\n",
1425
+ " rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
1426
+ " X_test, y_test = rus.fit_resample(X_test, y_test) \n",
1427
+ "\n",
1428
+ " print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
1429
+ "\n",
1430
+ " scaler = MinMaxScaler()\n",
1431
+ " X_res = scaler.fit_transform(X_res)\n",
1432
+ " X_test = scaler.transform(X_test)\n",
1433
+ " \n",
1434
+ " X_res = pd.DataFrame(X_res, columns=original_columns)\n",
1435
+ " X_test = pd.DataFrame(X_test, columns=original_columns)\n",
1436
+ "\n",
1437
+ " ada_fs = AdaBoostClassifier(n_estimators=100, random_state = random_state)\n",
1438
+ " ada_fs.fit(X_res, y_res)\n",
1439
+ "\n",
1440
+ " feature_importances = ada_fs.feature_importances_\n",
1441
+ " indices = np.argsort(feature_importances)[::-1]\n",
1442
+ " top_50_features = [original_columns[i] for i in indices[:50]]\n",
1443
+ " current_run_features = {original_columns[i]: feature_importances[i] for i in indices[:50]}\n",
1444
+ " \n",
1445
+ " feature_importance_runs.append(current_run_features)\n",
1446
+ "\n",
1447
+ " X_res_fi = X_res[top_50_features]\n",
1448
+ " X_test_fi = X_test[top_50_features]\n",
1449
+ " \n",
1450
+ " model_pool = list(base_classifiers.values())\n",
1451
+ " \n",
1452
+ " for clf in model_pool:\n",
1453
+ " clf.fit(X_res_fi, y_res)\n",
1454
+ " \n",
1455
+ " des_models = {\n",
1456
+ " 'KNORAE': KNORAE(pool_classifiers=model_pool, random_state=random_state),\n",
1457
+ " 'KNORAU': KNORAU(pool_classifiers=model_pool, random_state=random_state),\n",
1458
+ " 'DESMI': DESMI(pool_classifiers=model_pool, random_state=random_state),\n",
1459
+ " 'METADES': METADES(pool_classifiers=model_pool, random_state=random_state),\n",
1460
+ " 'DESKNN': DESKNN(pool_classifiers=model_pool, random_state=random_state),\n",
1461
+ " 'DESP': DESP(pool_classifiers=model_pool, random_state=random_state),\n",
1462
+ " 'KNOP': KNOP(pool_classifiers=model_pool, random_state=random_state, k=9),\n",
1463
+ " 'FIRE-KNORA-U': KNORAU(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1464
+ " 'FIRE-KNORA-E': KNORAE(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1465
+ " 'FIRE-METADES': METADES(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1466
+ " 'FIRE-DESKNN': DESKNN(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1467
+ " 'FIRE-DESP': DESP(pool_classifiers=model_pool, DFP=True, k=9, random_state = random_state),\n",
1468
+ " 'FIRE-KNOP': KNOP(pool_classifiers=model_pool, DFP=True, k=40, random_state = random_state)\n",
1469
+ " }\n",
1470
+ "\n",
1471
+ " for des_name, des_model in des_models.items():\n",
1472
+ " accuracy, precision, recall, f1, conf_matrix, fpr, tpr, roc_auc = train_evaluate_model(\n",
1473
+ " des_model, X_res_fi, y_res, X_test_fi, y_test, clf_name=des_name\n",
1474
+ " )\n",
1475
+ " metric_sums_des[des_name]['accuracy'] += accuracy\n",
1476
+ " metric_sums_des[des_name]['precision'] += precision\n",
1477
+ " metric_sums_des[des_name]['recall'] += recall\n",
1478
+ " metric_sums_des[des_name]['f1'] += f1\n",
1479
+ " conf_matrices_des[des_name].append(conf_matrix)\n",
1480
+ " roc_curves[des_name].append((fpr, tpr))\n",
1481
+ " roc_aucs[des_name].append(roc_auc)\n",
1482
+ " accuracy_scores[des_name].append(accuracy)\n",
1483
+ " precision_scores[des_name].append(precision)\n",
1484
+ " recall_scores[des_name].append(recall)\n",
1485
+ " f1_scores[des_name].append(f1)\n",
1486
+ "\n",
1487
+ " print(f'Confusion Matrix for {des_name} at Random State {random_state}:\\n{conf_matrix}\\n')"
1488
+ ]
1489
+ },
1490
+ {
1491
+ "cell_type": "code",
1492
+ "execution_count": null,
1493
+ "id": "34b0ead4-1d8c-4d0a-b0c7-71cd71f6ab7d",
1494
+ "metadata": {},
1495
+ "outputs": [],
1496
+ "source": [
1497
+ "def plot_combined_roc_curve(roc_curves, classifier_names):\n",
1498
+ " plt.figure(figsize=(12, 8))\n",
1499
+ " mean_fpr = np.linspace(0, 1, 100)\n",
1500
+ " colors = plt.cm.get_cmap('tab20', len(classifier_names))\n",
1501
+ " \n",
1502
+ " for i, clf_name in enumerate(classifier_names):\n",
1503
+ " tprs = []\n",
1504
+ " for fpr, tpr in roc_curves[clf_name]:\n",
1505
+ " tprs.append(np.interp(mean_fpr, fpr, tpr))\n",
1506
+ " mean_tpr = np.mean(tprs, axis=0)\n",
1507
+ " mean_tpr[-1] = 1.0\n",
1508
+ " mean_auc = auc(mean_fpr, mean_tpr)\n",
1509
+ " plt.plot(mean_fpr, mean_tpr, color=colors(i), lw=2, linestyle='-', marker='o', markersize=4, \n",
1510
+ " label=f'{clf_name} (AUC = {mean_auc:.3f})')\n",
1511
+ "\n",
1512
+ " plt.plot([0, 1], [0, 1], color='grey', lw=2, linestyle='--')\n",
1513
+ " plt.xlim([0.0, 1.0])\n",
1514
+ " plt.ylim([0.0, 1.05])\n",
1515
+ " plt.xlabel('False Positive Rate', fontsize=26)\n",
1516
+ " plt.ylabel('True Positive Rate', fontsize=26)\n",
1517
+ " plt.xticks(fontsize=30) # Increase x-axis numbers font size\n",
1518
+ " plt.yticks(fontsize=30) # Increase y-axis numbers font size\n",
1519
+ " plt.legend(loc=\"center left\", bbox_to_anchor=(1.05, 0.5), fontsize=26, frameon=True, framealpha=0.9) # Place legend beside the plot\n",
1520
+ " plt.grid(True)\n",
1521
+ "\n",
1522
+ " filename='bonk.svg'\n",
1523
+ "\n",
1524
+ " plt.savefig(filename, format='svg', bbox_inches = 'tight')\n",
1525
+ " plt.show()\n",
1526
+ "\n",
1527
+ " display(FileLink(filename))"
1528
+ ]
1529
+ },
1530
+ {
1531
+ "cell_type": "code",
1532
+ "execution_count": null,
1533
+ "id": "4cfd8d73-26fd-4dbc-a6ca-92a9643e1d27",
1534
+ "metadata": {},
1535
+ "outputs": [],
1536
+ "source": [
1537
+ "print('\\nAverage Metrics over 10 Random States:')\n",
1538
+ "for des_name, metrics in metric_sums_des.items():\n",
1539
+ " avg_accuracy = metrics['accuracy'] / 10\n",
1540
+ " avg_precision = metrics['precision'] / 10\n",
1541
+ " avg_recall = metrics['recall'] / 10\n",
1542
+ " avg_f1 = metrics['f1'] / 10\n",
1543
+ " std_accuracy = np.std(accuracy_scores[des_name])\n",
1544
+ " std_precision = np.std(precision_scores[des_name])\n",
1545
+ " std_recall = np.std(recall_scores[des_name])\n",
1546
+ " std_f1 = np.std(f1_scores[des_name])\n",
1547
+ " avg_auc = np.mean(roc_aucs[des_name])\n",
1548
+ " print(f'{des_name} - Accuracy: {avg_accuracy:.4f} ± {std_accuracy:.4f}, Precision: {avg_precision:.4f} ± {std_precision:.4f}, Recall: {avg_recall:.4f} ± {std_recall:.4f}, F1-Score: {avg_f1:.4f} ± {std_f1:.4f}, AUC: {avg_auc:.4f}')\n",
1549
+ "\n",
1550
+ "plot_combined_roc_curve(roc_curves, list(des_models.keys()))"
1551
+ ]
1552
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32780b70-dae5-4f7d-9ae4-128dc782fad4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Friedman test across classifiers (rows = random states), followed by a\n",
+ "# Nemenyi critical-difference diagram of the average ranks\n",
+ "df = pd.DataFrame(accuracy_scores)\n",
+ "scores = [df[col].values for col in df.columns]\n",
+ "\n",
+ "stat, p = friedmanchisquare(*scores)\n",
+ "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n",
+ "\n",
+ "ranks = df.rank(axis=1, method='average', ascending=False)\n",
+ "average_ranks = ranks.mean().values\n",
+ "\n",
+ "n_datasets = df.shape[0]\n",
+ "alpha = '0.05'  # Orange's compute_CD expects the significance level as a string\n",
+ "\n",
+ "cd = compute_CD(average_ranks, n_datasets, alpha=alpha)\n",
+ "print(f'Critical Difference: {cd}')\n",
+ "\n",
+ "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n",
+ "\n",
+ "plt.figure(figsize=(14, 10))\n",
+ "\n",
+ "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n",
+ "plt.xlabel('Classifiers')\n",
+ "\n",
+ "plt.text(0.5, 1.19, f'Friedman statistic: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n",
+ "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n",
+ "\n",
+ "plt.tight_layout()"
+ ]
+ },
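+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "nemenyi-cd-interpretation-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A small interpretive sketch (illustrative, not part of the original analysis):\n",
+ "# under the Nemenyi test, two classifiers differ significantly exactly when\n",
+ "# their average ranks are more than the critical difference apart.\n",
+ "rank_gap = abs(average_ranks[0] - average_ranks[1])\n",
+ "verdict = 'significantly different' if rank_gap > cd else 'not separable at alpha = 0.05'\n",
+ "print(f'{df.columns[0]} vs {df.columns[1]}: rank gap {rank_gap:.3f} vs CD {cd:.3f} -> {verdict}')"
+ ]
+ },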
+ {
+ "cell_type": "markdown",
+ "id": "f25a2b29-129f-4314-b2f9-8aff9615d5d7",
+ "metadata": {},
+ "source": [
+ "### SHAP (plots are mostly exported to files)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ed3c43d0-68d2-4aac-ac44-25a28dc6dc75",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Example with XGBoost\n",
+ "\n",
+ "random_state = 2\n",
+ "print(f\"Processing for Random State: {random_state}\")\n",
+ "\n",
+ "X = splitted_dataset.drop('depression_category', axis=1)\n",
+ "y = splitted_dataset['depression_category']\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
+ "\n",
+ "# Drop training outliers flagged by Local Outlier Factor\n",
+ "lof = LocalOutlierFactor()\n",
+ "yhat = lof.fit_predict(X_train)\n",
+ "mask = yhat != -1\n",
+ "X_train, y_train = X_train[mask], y_train[mask]\n",
+ "\n",
+ "original_columns = X.columns.tolist()\n",
+ "\n",
+ "# Oversample the minority class in the training set\n",
+ "smote = SMOTE(random_state=random_state)\n",
+ "X_res, y_res = smote.fit_resample(X_train, y_train)\n",
+ "\n",
+ "print(f\"Number of training labels after SMOTE: {y_res.value_counts()}\")\n",
+ "print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n",
+ "\n",
+ "# Undersample the majority class in the test set\n",
+ "sampling_strategy_undersample = {0: 155}\n",
+ "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
+ "X_test, y_test = rus.fit_resample(X_test, y_test)\n",
+ "\n",
+ "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
+ "\n",
+ "# Normalization (left disabled in the original run)\n",
+ "# scaler = MinMaxScaler()\n",
+ "# X_res = scaler.fit_transform(X_res)\n",
+ "# X_test = scaler.transform(X_test)\n",
+ "\n",
+ "X_res = pd.DataFrame(X_res, columns=original_columns)\n",
+ "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
+ "\n",
+ "# Train XGBoost model on all features\n",
+ "model = xgb.XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1, use_label_encoder=False, eval_metric='mlogloss', random_state=random_state)\n",
+ "model.fit(X_res, y_res)\n",
+ "\n",
+ "y_pred = model.predict(X_test)\n",
+ "\n",
+ "accuracy = accuracy_score(y_test, y_pred)\n",
+ "print(f'Accuracy: {accuracy:.4f}')\n",
+ "\n",
+ "# Compute SHAP values on the (resampled) training data\n",
+ "explainer = shap.Explainer(model, X_res)\n",
+ "shap_values = explainer(X_res)"
+ ]
+ },
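+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "shap-values-sanity-check",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A quick sanity check (illustrative addition, not in the original run): the\n",
+ "# Explanation object should carry one row of SHAP values per resampled sample.\n",
+ "print(type(shap_values))            # shap.Explanation\n",
+ "print(shap_values.values.shape)     # (n_samples, n_features) for this binary model\n",
+ "print(shap_values.base_values[:3])  # per-sample expected value of the model output"
+ ]
+ },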
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2cb9c929-4e86-42df-bda2-48730004c154",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def plot_shap_waterfall(instance_index, filename):\n",
+ "    # Export a SHAP waterfall plot for a single instance as an SVG file\n",
+ "    shap_value = shap_values[instance_index]\n",
+ "    plt.figure(figsize=(14, 8))\n",
+ "\n",
+ "    shap.plots.waterfall(shap_value, show=False)\n",
+ "\n",
+ "    ax = plt.gca()\n",
+ "\n",
+ "    ax.tick_params(axis='both', which='major', labelsize=16)\n",
+ "    ax.set_xlabel(ax.get_xlabel(), fontsize=20)\n",
+ "    ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n",
+ "\n",
+ "    plt.tight_layout()\n",
+ "\n",
+ "    plt.savefig(filename, format='svg')\n",
+ "\n",
+ "    plt.close()\n",
+ "\n",
+ "plot_shap_waterfall(0, \"waterfall_plot_instance_0.svg\")\n",
+ "plot_shap_waterfall(562, \"waterfall_plot_instance_562.svg\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e111213e-caea-48a0-adb7-c6b2362eed4f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "plt.figure(figsize=(14, 8))\n",
+ "shap.summary_plot(\n",
+ "    shap_values,\n",
+ "    X_res,\n",
+ "    plot_type=\"bar\",\n",
+ "    feature_names=original_columns,\n",
+ "    show=False\n",
+ ")\n",
+ "\n",
+ "ax = plt.gca()\n",
+ "ax.tick_params(axis='both', which='major', labelsize=16)\n",
+ "ax.set_xlabel(ax.get_xlabel(), fontsize=16)\n",
+ "ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n",
+ "\n",
+ "plt.savefig(\"shap_summary_plot.svg\", format='svg')\n",
+ "plt.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4ae84637-a018-49a1-ad4e-522aa937bfad",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "shap.initjs()\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(14, 8))\n",
+ "\n",
+ "shap.summary_plot(\n",
+ "    shap_values,\n",
+ "    X_res,\n",
+ "    plot_type=\"dot\",\n",
+ "    feature_names=original_columns,\n",
+ "    show=False\n",
+ ")\n",
+ "\n",
+ "ax.tick_params(axis='both', which='major', labelsize=16)\n",
+ "ax.set_xlabel(ax.get_xlabel(), fontsize=16)\n",
+ "ax.set_ylabel(ax.get_ylabel(), fontsize=22)\n",
+ "\n",
+ "fig.savefig(\"shap_summary_dot_plot.svg\", format='svg', bbox_inches='tight')\n",
+ "\n",
+ "plt.close(fig)"
+ ]
+ },
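+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "shap-dependence-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A hedged extra (not part of the original export set): a dependence plot for\n",
+ "# whichever feature has the largest mean |SHAP| value; both the feature choice\n",
+ "# and the output filename here are illustrative assumptions.\n",
+ "top_idx = int(np.argmax(np.abs(shap_values.values).mean(axis=0)))\n",
+ "shap.dependence_plot(top_idx, shap_values.values, X_res, feature_names=original_columns, show=False)\n",
+ "plt.savefig(\"shap_dependence_top_feature.svg\", format='svg', bbox_inches='tight')\n",
+ "plt.close()"
+ ]
+ },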
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "05f0de1c-3850-4d77-9184-d593451022c1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.tree import DecisionTreeClassifier, export_text\n",
+ "from sklearn import tree\n",
+ "\n",
+ "random_state = 5\n",
+ "\n",
+ "X = splitted_dataset.drop('depression_category', axis=1)\n",
+ "y = splitted_dataset['depression_category']\n",
+ "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=random_state)\n",
+ "\n",
+ "# Drop training outliers flagged by Local Outlier Factor\n",
+ "lof = LocalOutlierFactor()\n",
+ "yhat = lof.fit_predict(X_train)\n",
+ "mask = yhat != -1\n",
+ "X_train, y_train = X_train[mask], y_train[mask]\n",
+ "\n",
+ "original_columns = X.columns.tolist()\n",
+ "\n",
+ "smote = SMOTE(random_state=random_state)\n",
+ "X_res, y_res = smote.fit_resample(X_train, y_train)\n",
+ "\n",
+ "print(f\"Number of training labels after SMOTE: {y_res.value_counts()}\")\n",
+ "print(f\"Number of test labels before resampling: {y_test.value_counts()}\")\n",
+ "sampling_strategy_undersample = {0: 155}\n",
+ "rus = RandomUnderSampler(sampling_strategy=sampling_strategy_undersample, random_state=random_state)\n",
+ "X_test, y_test = rus.fit_resample(X_test, y_test)\n",
+ "\n",
+ "print(f\"Number of test labels after resampling: {y_test.value_counts()}\")\n",
+ "\n",
+ "# Normalization (left disabled in the original run)\n",
+ "# scaler = MinMaxScaler()\n",
+ "# X_res = scaler.fit_transform(X_res)\n",
+ "# X_test = scaler.transform(X_test)\n",
+ "\n",
+ "X_res = pd.DataFrame(X_res, columns=original_columns)\n",
+ "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
+ "\n",
+ "decision_tree_model = DecisionTreeClassifier(\n",
+ "    random_state=0,\n",
+ "    criterion='gini',\n",
+ "    max_depth=6,\n",
+ "    min_samples_leaf=10,\n",
+ "    min_samples_split=9\n",
+ ")\n",
+ "decision_tree_model.fit(X_res, y_res)\n",
+ "\n",
+ "# Plot only the first three levels of the depth-6 tree for readability\n",
+ "plt.figure(figsize=(20, 14))\n",
+ "tree.plot_tree(\n",
+ "    decision_tree_model,\n",
+ "    feature_names=original_columns,\n",
+ "    class_names=['depression', 'normal'],\n",
+ "    filled=True,\n",
+ "    rounded=True,\n",
+ "    fontsize=10,\n",
+ "    max_depth=3\n",
+ ")\n",
+ "\n",
+ "plt.savefig(\"decision_tree_plot.svg\", format='svg')\n",
+ "plt.close()\n",
+ "\n",
+ "print(\"Decision Tree plot saved as 'decision_tree_plot.svg'\")"
+ ]
+ },
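+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "tree-test-accuracy-sketch",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# A small illustrative check (not in the original notebook): score the fitted\n",
+ "# tree on the resampled test set so the exported plot can be read alongside a\n",
+ "# headline number.\n",
+ "tree_pred = decision_tree_model.predict(X_test)\n",
+ "print(f'Decision tree test accuracy: {accuracy_score(y_test, tree_pred):.4f}')"
+ ]
+ },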
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b46a92a6-f370-4df4-ac29-8f382fc1820b",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "tree_rules = export_text(decision_tree_model, feature_names=original_columns, max_depth=50)\n",
+ "print(\"Decision rules for the full fitted tree:\")\n",
+ "print(tree_rules)\n",
+ "\n",
+ "node_indicator = decision_tree_model.decision_path(X_test)\n",
+ "\n",
+ "sample_id = 0\n",
+ "node_index = node_indicator.indices[node_indicator.indptr[sample_id]:node_indicator.indptr[sample_id + 1]]\n",
+ "\n",
+ "print(f\"\\nDecision path for sample {sample_id}:\")\n",
+ "for node_id in node_index:\n",
+ "    # Leaf nodes carry no split; skip them instead of printing a bogus threshold\n",
+ "    if decision_tree_model.tree_.feature[node_id] < 0:\n",
+ "        continue\n",
+ "    if X_test.iloc[sample_id, decision_tree_model.tree_.feature[node_id]] <= decision_tree_model.tree_.threshold[node_id]:\n",
+ "        threshold_sign = \"<=\"\n",
+ "    else:\n",
+ "        threshold_sign = \">\"\n",
+ "    print(f\"Node {node_id}: (X_test[{sample_id}, {decision_tree_model.tree_.feature[node_id]}] = {X_test.iloc[sample_id, decision_tree_model.tree_.feature[node_id]]}) \"\n",
+ "          f\"{threshold_sign} {decision_tree_model.tree_.threshold[node_id]}\")\n",
+ "\n",
+ "# Get prediction for a specific test sample (a one-row frame keeps feature names)\n",
+ "predicted_class = decision_tree_model.predict(X_test.iloc[[sample_id]])\n",
+ "print(f\"\\nPredicted class for test sample {sample_id}: {predicted_class}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }