Spaces:

fdabbiras
/

MDD

No application file

MDD

File size: 37,859 Bytes

33e757b

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c4849e1e-1928-4e8b-a12f-37c786b50aca",
   "metadata": {},
   "source": [
    "# Regression Layer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dddcb2a8-73d3-4476-b88a-bc1494a2c830",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import cross_val_score, KFold\n",
    "from sklearn.impute import KNNImputer\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from xgboost import XGBClassifier\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from imblearn.pipeline import make_pipeline as make_pipeline_imb\n",
    "from imblearn.over_sampling import SMOTE,SMOTENC\n",
    "from sklearn.model_selection import train_test_split\n",
    "from collections import Counter\n",
    "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.neighbors import LocalOutlierFactor\n",
    "from sklearn.utils import resample\n",
    "import warnings\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from imblearn.pipeline import Pipeline\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
    "from sklearn.preprocessing import LabelEncoder, PowerTransformer\n",
    "from collections import defaultdict\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.neural_network import MLPClassifier\n",
    "import Orange\n",
    "from scipy.stats import friedmanchisquare, rankdata\n",
    "import shap\n",
    "import scikit_posthocs as sp\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "from IPython.display import FileLink, display\n",
    "import math\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from skopt.space import Integer, Real\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from skopt import BayesSearchCV\n",
    "import xgboost as xgb\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from sklearn.tree import DecisionTreeClassifier, export_text\n",
    "from sklearn import tree\n",
    "from skopt.space import Real, Integer, Categorical\n",
    "from skopt.callbacks import VerboseCallback\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, VotingRegressor\n",
    "from catboost import CatBoostRegressor\n",
    "from xgboost import XGBRegressor\n",
    "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "from sklearn.neighbors import LocalOutlierFactor\n",
    "from lightgbm import LGBMRegressor\n",
    "from IPython.display import display, FileLink"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "39d6683c-fd3f-4daa-afdc-2e7f83c3fce3",
   "metadata": {},
   "source": [
    "### Preparation before training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec17e47a-8d92-498d-8f28-d8259d6ebc4e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the regression dataset and print its schema.\n",
    "pd.options.display.max_rows = 10  # limit how many rows a DataFrame repr shows\n",
    "initial_df = pd.read_csv('3labelv4Regression.csv')\n",
    "initial_df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3751352-73bc-42c6-b6a5-84a3badf14ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Categorical (non-numeric) feature names, excluding the label.\n",
    "# Built in column order so the list is the same on every run; the earlier\n",
    "# set difference returned the names in arbitrary order.\n",
    "cols = initial_df.columns\n",
    "num_cols = initial_df._get_numeric_data().columns\n",
    "categorical_features = [\n",
    "    name for name in cols\n",
    "    if name not in num_cols and name != 'depression_category'\n",
    "]\n",
    "\n",
    "# Label-encode every object column while keeping missing values as NaN.\n",
    "# The label column is set aside first and re-attached untouched at the end.\n",
    "le_initial_df = initial_df.copy()\n",
    "dropped_labels = le_initial_df['depression_category']\n",
    "le_initial_df = le_initial_df.drop('depression_category', axis = 1)\n",
    "\n",
    "for col in le_initial_df.columns:\n",
    "    if le_initial_df[col].dtype == 'object':\n",
    "        # Record the real gaps before filling them: masking by position keeps a\n",
    "        # genuine category that happens to be spelled 'missing' from being\n",
    "        # turned into NaN together with the placeholder.\n",
    "        was_missing = le_initial_df[col].isna()\n",
    "\n",
    "        # The placeholder gives the encoder a gap-free column to fit on.\n",
    "        label_encoder = LabelEncoder()\n",
    "        encoded = label_encoder.fit_transform(le_initial_df[col].fillna('missing'))\n",
    "\n",
    "        # Put NaN back at exactly the positions that were missing originally.\n",
    "        le_initial_df[col] = pd.Series(encoded, index=le_initial_df.index).where(~was_missing)\n",
    "\n",
    "le_initial_df = pd.concat([le_initial_df, dropped_labels], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fa21a95-21f1-4274-86ba-d7977813066b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the label-encoded frame (the repr is limited by the display.max_rows option set when loading).\n",
    "le_initial_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0ba54a81-d4de-4884-99d3-29867eb7ea40",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the rows by depression category, then recombine them.\n",
    "category = le_initial_df['depression_category']\n",
    "le_df_normal = le_initial_df.loc[category == 'normal']\n",
    "le_df_mild = le_initial_df.loc[category == 'mild']\n",
    "le_df_moderatesevere = le_initial_df.loc[category == 'moderatesevere']\n",
    "\n",
    "# The same rows, original index kept, relabelled as a single 'depression' class.\n",
    "le_df_depression = pd.concat(\n",
    "    [le_df_normal, le_df_mild, le_df_moderatesevere], ignore_index = False\n",
    ").assign(depression_category = 'depression')\n",
    "\n",
    "# Rebuild the working frame from the three subsets and count rows per category.\n",
    "dataframes = [le_df_normal, le_df_mild, le_df_moderatesevere]\n",
    "le_initial_df = pd.concat(dataframes).reset_index(drop=True)\n",
    "label_counts = le_initial_df['depression_category'].value_counts()\n",
    "label_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c517213f-f6bb-4298-99ee-3b29f6f7d0cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Disabled row filter: would keep only rows whose non-null count reaches 80% of the column count.\n",
    "# threshold = int(0.8 * le_df_normal.shape[1])\n",
    "# le_df_normal = le_df_normal.dropna(thresh = threshold)\n",
    "# threshold = int(0.8 * le_df_depression.shape[1])\n",
    "# le_df_depression = le_df_depression.dropna(thresh = threshold)\n",
    "\n",
    "# Rebuild the combined frame from the per-category subsets and recount the labels.\n",
    "dataframes = [le_df_normal, le_df_mild, le_df_moderatesevere]\n",
    "le_initial_df = pd.concat(dataframes).reset_index(drop=True)\n",
    "label_counts = le_initial_df['depression_category'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8c5bdf52-c645-4a11-920a-579e28db1a50",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Median-impute the features of each depression category separately, then stack the results.\n",
    "different_le_dfs = [le_df_normal, le_df_mild, le_df_moderatesevere]\n",
    "imputed_le_dfs = []\n",
    "from sklearn.impute import IterativeImputer\n",
    "for le_df in different_le_dfs:\n",
    "    X = le_df.drop(columns='depression_category')\n",
    "    y = le_df['depression_category']\n",
    "\n",
    "    # A fresh imputer per subset, so the medians come from that category only.\n",
    "    imputer = SimpleImputer(strategy='median')\n",
    "    imputed_data = imputer.fit_transform(X)\n",
    "\n",
    "    # The imputed array has lost the index; re-attach the label by position.\n",
    "    imputed_df = pd.DataFrame(imputed_data, columns = X.columns).assign(\n",
    "        depression_category = y.reset_index(drop = True)\n",
    "    )\n",
    "    imputed_le_dfs.append(imputed_df)\n",
    "\n",
    "concatenated_le_dfs = pd.concat(imputed_le_dfs, ignore_index = True)\n",
    "concatenated_le_dfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0e871a5f-beab-4f90-b8c4-e922ef86d0f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Label-encode the depression category with an encoder created here.\n",
    "# The earlier version reused `label_encoder`, a variable left over from the\n",
    "# feature-encoding loop, which is only defined when that loop encoded at\n",
    "# least one object column.\n",
    "fully_LE_concatenated_le_dfs = concatenated_le_dfs.copy()\n",
    "label_encoder = LabelEncoder()\n",
    "fully_LE_concatenated_le_dfs['depression_category'] = label_encoder.fit_transform(\n",
    "    fully_LE_concatenated_le_dfs['depression_category']\n",
    ")\n",
    "\n",
    "# Frame used by the regression cells: imputed features without the category column.\n",
    "splitted_dataset = fully_LE_concatenated_le_dfs.drop('depression_category', axis = 1)\n",
    "splitted_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e602f4d-efb2-4ac9-99bb-c6fbeca791cc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ignore every Python warning from this point on.\n",
    "# NOTE(review): a blanket 'ignore' also hides deprecation and convergence warnings; consider narrowing the filter.\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68563dc9-38c9-4ea5-8c08-9de904f58f43",
   "metadata": {},
   "source": [
    "### Regression Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e339a41-e484-4a11-84f0-770cbaacb09d",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Seed for the scikit-learn ensembles, whose fits are otherwise randomised,\n",
    "# so that re-running this cell reproduces the reported metrics.\n",
    "MODEL_SEED = 42\n",
    "\n",
    "# Optimized parameters\n",
    "regressors = {\n",
    "    'CBR': CatBoostRegressor(verbose=0, iterations=2000, learning_rate=0.01, depth=5),\n",
    "    'XGBR': XGBRegressor(learning_rate=0.04403027347366962, max_depth=3, n_estimators=238),\n",
    "    'LGBMR': LGBMRegressor(learning_rate=0.02904035023286438, num_leaves=20, n_estimators=170),\n",
    "    'GBR': GradientBoostingRegressor(learning_rate=0.03, max_depth=2, n_estimators=700, random_state=MODEL_SEED),\n",
    "    'RFR': RandomForestRegressor(max_depth=12, max_features=1.0, n_estimators=300, random_state=MODEL_SEED),\n",
    "    'ETR': ExtraTreesRegressor(max_depth=12, max_features=1.0, n_estimators=64, random_state=MODEL_SEED),\n",
    "    'ABR': AdaBoostRegressor(learning_rate=0.29915504677867777, n_estimators=92, random_state=MODEL_SEED)\n",
    "}\n",
    "\n",
    "# Default parameters\n",
    "# regressors = {\n",
    "#     'CBR': CatBoostRegressor(verbose=0),\n",
    "#     'XGBR': XGBRegressor(),\n",
    "#     'LGBMR': LGBMRegressor(),\n",
    "#     'GBR': GradientBoostingRegressor(),\n",
    "#     'RFR': RandomForestRegressor(),\n",
    "#     'ETR': ExtraTreesRegressor(),\n",
    "#     'ABR': AdaBoostRegressor()\n",
    "# }\n",
    "\n",
    "# Ensemble that averages four of the boosted models above.\n",
    "voting_regressor = VotingRegressor(estimators=[\n",
    "    ('cbr', regressors['CBR']),\n",
    "    ('xgbr', regressors['XGBR']),\n",
    "    ('gbr', regressors['GBR']),\n",
    "    ('abr', regressors['ABR'])\n",
    "])\n",
    "\n",
    "regressors['Voting'] = voting_regressor\n",
    "\n",
    "# Per-model accumulators: running sums plus the raw per-split scores.\n",
    "metric_sums = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n",
    "metric_stds = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n",
    "rmse_scores = {name: [] for name in regressors.keys()}\n",
    "mae_scores = {name: [] for name in regressors.keys()}\n",
    "r2_scores = {name: [] for name in regressors.keys()}\n",
    "\n",
    "# Features, target and column names do not depend on the split: build them once.\n",
    "X = splitted_dataset.drop('total_sum', axis=1)\n",
    "y = splitted_dataset['total_sum']\n",
    "original_columns = X.columns.tolist()\n",
    "\n",
    "for random_state in range(10):\n",
    "    print(f'Processing for Random State: {random_state}')\n",
    "\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n",
    "\n",
    "    # Drop the training rows that LocalOutlierFactor labels -1; the test set is left as is.\n",
    "    lof = LocalOutlierFactor()\n",
    "    yhat = lof.fit_predict(X_train)\n",
    "\n",
    "    mask = yhat != -1\n",
    "    X_train, y_train = X_train[mask], y_train[mask]\n",
    "\n",
    "    print(f\"Number of training labels after outlier removal: {len(y_train)}\")\n",
    "    print(f\"Number of test labels: {len(y_test)}\")\n",
    "\n",
    "    # Fit the scaler on the training rows only, then apply it to both parts.\n",
    "    scaler = MinMaxScaler()\n",
    "    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=original_columns)\n",
    "    X_test = pd.DataFrame(scaler.transform(X_test), columns=original_columns)\n",
    "\n",
    "    # Feature selection using XGBRegressor: keep the 50 highest-importance columns.\n",
    "    # NOTE(review): `xgb` rebinds the `import xgboost as xgb` alias from the imports cell.\n",
    "    xgb = XGBRegressor(random_state=random_state)\n",
    "    xgb.fit(X_train, y_train)\n",
    "\n",
    "    importance = np.abs(xgb.feature_importances_)\n",
    "    indices = np.argsort(importance)[::-1]\n",
    "    important_features = [original_columns[i] for i in indices[:50]]\n",
    "\n",
    "    # The reduced matrices are the same for every regressor, so slice them once per split.\n",
    "    selected_features = important_features\n",
    "    X_train_fi = X_train[selected_features]\n",
    "    X_test_fi = X_test[selected_features]\n",
    "\n",
    "    for reg_name, reg in regressors.items():\n",
    "        reg.fit(X_train_fi, y_train)\n",
    "\n",
    "        # Predictions are rounded to whole numbers before scoring.\n",
    "        y_pred = np.round(reg.predict(X_test_fi))\n",
    "\n",
    "        rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
    "        mae = mean_absolute_error(y_test, y_pred)\n",
    "        r2 = r2_score(y_test, y_pred)\n",
    "\n",
    "        metric_sums[reg_name]['rmse'] += rmse\n",
    "        metric_sums[reg_name]['mae'] += mae\n",
    "        metric_sums[reg_name]['r2'] += r2\n",
    "        rmse_scores[reg_name].append(rmse)\n",
    "        mae_scores[reg_name].append(mae)\n",
    "        r2_scores[reg_name].append(r2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0929bf2c-fd41-4e0e-8070-4a6317f3efab",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the mean and standard deviation of each metric over the evaluated splits.\n",
    "for reg_name in regressors.keys():\n",
    "    # Divide by the number of recorded runs instead of a hard-coded 10, so the\n",
    "    # averages stay correct if the number of random states is changed.\n",
    "    n_runs = len(rmse_scores[reg_name])\n",
    "    if n_runs == 0:\n",
    "        print(f\"Regressor: {reg_name} has no recorded runs; run the training cell first.\")\n",
    "        continue\n",
    "\n",
    "    avg_rmse = metric_sums[reg_name]['rmse'] / n_runs\n",
    "    avg_mae = metric_sums[reg_name]['mae'] / n_runs\n",
    "    avg_r2 = metric_sums[reg_name]['r2'] / n_runs\n",
    "    std_rmse = np.std(rmse_scores[reg_name])\n",
    "    std_mae = np.std(mae_scores[reg_name])\n",
    "    std_r2 = np.std(r2_scores[reg_name])\n",
    "    \n",
    "    print(f\"Regressor: {reg_name}\")\n",
    "    print(f\"Average RMSE: {avg_rmse} ± {std_rmse}\")\n",
    "    print(f\"Average MAE: {avg_mae} ± {std_mae}\")\n",
    "    print(f\"Average R2: {avg_r2} ± {std_r2}\")\n",
    "    print(\"------\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "07f62248-70b4-479b-b782-3aec714111f9",
   "metadata": {},
   "source": [
    "### SHAP and Friedman-Nemenyi test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6ce4639-53d9-4dcd-a49c-80555326f197",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preparation code to make CD diagram from older version of Orange\n",
    "def compute_CD(avranks, n, alpha=\"0.05\", test=\"nemenyi\"):\n",
    "    \"\"\"\n",
    "    Returns critical difference for Nemenyi or Bonferroni-Dunn test\n",
    "    according to given alpha (either alpha=\"0.05\" or alpha=\"0.1\") for average\n",
    "    ranks and number of tested datasets N. Test can be either \"nemenyi\" for\n",
    "    for Nemenyi two tailed test or \"bonferroni-dunn\" for Bonferroni-Dunn test.\n",
    "\n",
    "    This function is deprecated and will be removed in Orange 3.34.\n",
    "    \"\"\"\n",
    "    k = len(avranks)\n",
    "    d = {(\"nemenyi\", \"0.05\"): [0, 0, 1.959964, 2.343701, 2.569032, 2.727774,\n",
    "                               2.849705, 2.94832, 3.030879, 3.101730, 3.163684,\n",
    "                               3.218654, 3.268004, 3.312739, 3.353618, 3.39123,\n",
    "                               3.426041, 3.458425, 3.488685, 3.517073,\n",
    "                               3.543799],\n",
    "         (\"nemenyi\", \"0.1\"): [0, 0, 1.644854, 2.052293, 2.291341, 2.459516,\n",
    "                              2.588521, 2.692732, 2.779884, 2.854606, 2.919889,\n",
    "                              2.977768, 3.029694, 3.076733, 3.119693, 3.159199,\n",
    "                              3.195743, 3.229723, 3.261461, 3.291224, 3.319233],\n",
    "         (\"bonferroni-dunn\", \"0.05\"): [0, 0, 1.960, 2.241, 2.394, 2.498, 2.576,\n",
    "                                       2.638, 2.690, 2.724, 2.773],\n",
    "         (\"bonferroni-dunn\", \"0.1\"): [0, 0, 1.645, 1.960, 2.128, 2.241, 2.326,\n",
    "                                      2.394, 2.450, 2.498, 2.539]}\n",
    "    q = d[(test, alpha)]\n",
    "    cd = q[k] * (k * (k + 1) / (6.0 * n)) ** 0.5\n",
    "    return cd\n",
    "\n",
    "\n",
    "def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None,\n",
    "                width=6, textspace=1, reverse=False, filename=None, **kwargs):\n",
    "    \"\"\"\n",
    "    Draws a CD graph, which is used to display  the differences in methods'\n",
    "    performance. See Janez Demsar, Statistical Comparisons of Classifiers over\n",
    "    Multiple Data Sets, 7(Jan):1--30, 2006.\n",
    "\n",
    "    Needs matplotlib to work.\n",
    "\n",
    "    The image is plotted on `plt` imported using\n",
    "    `import matplotlib.pyplot as plt`.\n",
    "\n",
    "    The body also uses `math` and `np`, which are expected to be imported at\n",
    "    notebook level.\n",
    "\n",
    "    This function is deprecated and will be removed in Orange 3.34.\n",
    "\n",
    "    Args:\n",
    "        avranks (list of float): average ranks of methods.\n",
    "        names (list of str): names of methods.\n",
    "        cd (float): Critical difference used for statistically significance of\n",
    "            difference between methods.\n",
    "        cdmethod (int, optional): the method that is compared with other methods\n",
    "            If omitted, show pairwise comparison of methods\n",
    "        lowv (int, optional): the lowest shown rank\n",
    "        highv (int, optional): the highest shown rank\n",
    "        width (int, optional): default width in inches (default: 6)\n",
    "        textspace (int, optional): space on figure sides (in inches) for the\n",
    "            method names (default: 1)\n",
    "        reverse (bool, optional):  if set to `True`, the lowest rank is on the\n",
    "            right (default: `False`)\n",
    "        filename (str, optional): output file name (with extension). If not\n",
    "            given, the function does not write a file.\n",
    "        **kwargs: forwarded to the canvas `print_figure` call when `filename`\n",
    "            is given.\n",
    "\n",
    "    Returns:\n",
    "        None. The figure is created with `plt.figure` and is not returned.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        import matplotlib.pyplot as plt\n",
    "        from matplotlib.backends.backend_agg import FigureCanvasAgg\n",
    "    except ImportError:\n",
    "        raise ImportError(\"Function graph_ranks requires matplotlib.\")\n",
    "\n",
    "    width = float(width)\n",
    "    textspace = float(textspace)\n",
    "\n",
    "    def nth(l, n):\n",
    "        \"\"\"\n",
    "        Returns the nth element of every item in a list of sequences.\n",
    "        \"\"\"\n",
    "        n = lloc(l, n)\n",
    "        return [a[n] for a in l]\n",
    "\n",
    "    def lloc(l, n):\n",
    "        \"\"\"\n",
    "        List location in list of list structure.\n",
    "        Enable the use of negative locations:\n",
    "        -1 is the last element, -2 second last...\n",
    "        \"\"\"\n",
    "        if n < 0:\n",
    "            return len(l[0]) + n\n",
    "        else:\n",
    "            return n\n",
    "\n",
    "    def mxrange(lr):\n",
    "        \"\"\"\n",
    "        Multiple xranges. Can be used to traverse matrices.\n",
    "        This function is very slow due to unknown number of\n",
    "        parameters.\n",
    "\n",
    "        >>> mxrange([3,5])\n",
    "        [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]\n",
    "\n",
    "        >>> mxrange([[3,5,1],[9,0,-3]])\n",
    "        [(3, 9), (3, 6), (3, 3), (4, 9), (4, 6), (4, 3)]\n",
    "\n",
    "        \"\"\"\n",
    "        if not len(lr):\n",
    "            yield ()\n",
    "        else:\n",
    "            # it can work with single numbers\n",
    "            index = lr[0]\n",
    "            if isinstance(index, int):\n",
    "                index = [index]\n",
    "            for a in range(*index):\n",
    "                for b in mxrange(lr[1:]):\n",
    "                    yield tuple([a] + list(b))\n",
    "\n",
    "    def print_figure(fig, *args, **kwargs):\n",
    "        # Save through an Agg canvas attached to the figure.\n",
    "        canvas = FigureCanvasAgg(fig)\n",
    "        canvas.print_figure(*args, **kwargs)\n",
    "\n",
    "    sums = avranks\n",
    "\n",
    "    # Sort (rank, original position) pairs; ssums are the sorted ranks and\n",
    "    # nnames the method names in that same order.\n",
    "    tempsort = sorted([(a, i) for i, a in enumerate(sums)], reverse=reverse)\n",
    "    ssums = nth(tempsort, 0)\n",
    "    sortidx = nth(tempsort, 1)\n",
    "    nnames = [names[x] for x in sortidx]\n",
    "\n",
    "    # Default axis limits: at most 1 at the low end, at least the number of methods at the high end.\n",
    "    if lowv is None:\n",
    "        lowv = min(1, int(math.floor(min(ssums))))\n",
    "    if highv is None:\n",
    "        highv = max(len(avranks), int(math.ceil(max(ssums))))\n",
    "\n",
    "    # y position (in inches from the top) of the rank axis.\n",
    "    cline = 0.4\n",
    "\n",
    "    k = len(sums)\n",
    "\n",
    "    lines = None\n",
    "\n",
    "    linesblank = 0\n",
    "    scalewidth = width - 2 * textspace\n",
    "\n",
    "    def rankpos(rank):\n",
    "        # Map a rank to an x position in inches between the two text margins.\n",
    "        if not reverse:\n",
    "            a = rank - lowv\n",
    "        else:\n",
    "            a = highv - rank\n",
    "        return textspace + scalewidth / (highv - lowv) * a\n",
    "\n",
    "    distanceh = 0.25\n",
    "\n",
    "    if cd and cdmethod is None:\n",
    "        # get pairs of non significant methods\n",
    "\n",
    "        def get_lines(sums, hsd):\n",
    "            # get all pairs\n",
    "            lsums = len(sums)\n",
    "            allpairs = [(i, j) for i, j in mxrange([[lsums], [lsums]]) if j > i]\n",
    "            # keep the pairs whose rank difference is within hsd (not significantly different)\n",
    "            notSig = [(i, j) for i, j in allpairs\n",
    "                      if abs(sums[i] - sums[j]) <= hsd]\n",
    "            # keep only longest\n",
    "\n",
    "            def no_longer(ij_tuple, notSig):\n",
    "                # True when no other pair in notSig strictly contains (i, j).\n",
    "                i, j = ij_tuple\n",
    "                for i1, j1 in notSig:\n",
    "                    if (i1 <= i and j1 > j) or (i1 < i and j1 >= j):\n",
    "                        return False\n",
    "                return True\n",
    "\n",
    "            longest = [(i, j) for i, j in notSig if no_longer((i, j), notSig)]\n",
    "\n",
    "            return longest\n",
    "\n",
    "        lines = get_lines(ssums, cd)\n",
    "        linesblank = 0.2 + 0.2 + (len(lines) - 1) * 0.1\n",
    "\n",
    "        # add scale\n",
    "        distanceh = 0.25\n",
    "        cline += distanceh\n",
    "\n",
    "    # calculate height needed height of an image\n",
    "    minnotsignificant = max(2 * 0.2, linesblank)\n",
    "    height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant\n",
    "\n",
    "    fig = plt.figure(figsize=(width, height))\n",
    "    fig.set_facecolor('white')\n",
    "    ax = fig.add_axes([0, 0, 1, 1])  # reverse y axis\n",
    "    ax.set_axis_off()\n",
    "\n",
    "    # Factors converting inches to the 0..1 axes coordinates used below.\n",
    "    hf = 1. / height  # height factor\n",
    "    wf = 1. / width\n",
    "\n",
    "    def hfl(l):\n",
    "        return [a * hf for a in l]\n",
    "\n",
    "    def wfl(l):\n",
    "        return [a * wf for a in l]\n",
    "\n",
    "\n",
    "    # Upper left corner is (0,0).\n",
    "    ax.plot([0, 1], [0, 1], c=\"w\")\n",
    "    ax.set_xlim(0, 1)\n",
    "    ax.set_ylim(1, 0)\n",
    "\n",
    "    def line(l, color='k', **kwargs):\n",
    "        \"\"\"\n",
    "        Input is a list of pairs of points.\n",
    "        \"\"\"\n",
    "        ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)\n",
    "\n",
    "    def text(x, y, s, *args, **kwargs):\n",
    "        # Coordinates are given in inches; the font size is fixed at 14.\n",
    "        ax.text(wf * x, hf * y, s, fontsize = 14, *args, **kwargs)\n",
    "\n",
    "    # The rank axis itself.\n",
    "    line([(textspace, cline), (width - textspace, cline)], linewidth=0.7)\n",
    "\n",
    "    bigtick = 0.1\n",
    "    smalltick = 0.05\n",
    "\n",
    "    # Ticks every 0.5 rank: big ticks at whole ranks, small ticks in between.\n",
    "    tick = None\n",
    "    for a in list(np.arange(lowv, highv, 0.5)) + [highv]:\n",
    "        tick = smalltick\n",
    "        if a == int(a):\n",
    "            tick = bigtick\n",
    "        line([(rankpos(a), cline - tick / 2),\n",
    "              (rankpos(a), cline)],\n",
    "             linewidth=0.7)\n",
    "\n",
    "    # Numeric labels above the whole-rank ticks; `tick` carries over from the\n",
    "    # final iteration of the loop above.\n",
    "    for a in range(lowv, highv + 1):\n",
    "        text(rankpos(a), cline - tick / 2 - 0.05, str(a),\n",
    "             ha=\"center\", va=\"bottom\")\n",
    "\n",
    "    k = len(ssums)\n",
    "\n",
    "    # First half of the sorted methods: leader lines and names towards the left margin.\n",
    "    for i in range(math.ceil(k / 2)):\n",
    "        chei = cline + minnotsignificant + i * 0.2\n",
    "        line([(rankpos(ssums[i]), cline),\n",
    "              (rankpos(ssums[i]), chei),\n",
    "              (textspace - 0.1, chei)],\n",
    "             linewidth=0.7)\n",
    "        text(textspace - 0.2, chei, nnames[i], ha=\"right\", va=\"center\")\n",
    "\n",
    "    # Second half: leader lines and names towards the right margin.\n",
    "    for i in range(math.ceil(k / 2), k):\n",
    "        chei = cline + minnotsignificant + (k - i - 1) * 0.2\n",
    "        line([(rankpos(ssums[i]), cline),\n",
    "              (rankpos(ssums[i]), chei),\n",
    "              (textspace + scalewidth + 0.1, chei)],\n",
    "             linewidth=0.7)\n",
    "        text(textspace + scalewidth + 0.2, chei, nnames[i],\n",
    "             ha=\"left\", va=\"center\")\n",
    "\n",
    "    if cd and cdmethod is None:\n",
    "        # upper scale\n",
    "        if not reverse:\n",
    "            begin, end = rankpos(lowv), rankpos(lowv + cd)\n",
    "        else:\n",
    "            begin, end = rankpos(highv), rankpos(highv - cd)\n",
    "\n",
    "        # Bar of length cd with end caps and a \"CD\" label, drawn above the axis.\n",
    "        line([(begin, distanceh), (end, distanceh)], linewidth=0.7)\n",
    "        line([(begin, distanceh + bigtick / 2),\n",
    "              (begin, distanceh - bigtick / 2)],\n",
    "             linewidth=0.7)\n",
    "        line([(end, distanceh + bigtick / 2),\n",
    "              (end, distanceh - bigtick / 2)],\n",
    "             linewidth=0.7)\n",
    "        text((begin + end) / 2, distanceh - 0.05, \"CD\",\n",
    "             ha=\"center\", va=\"bottom\")\n",
    "\n",
    "        # no-significance lines\n",
    "        def draw_lines(lines, side=0.05, height=0.1):\n",
    "            # One thick bar per group, each drawn `height` lower than the previous one.\n",
    "            start = cline + 0.2\n",
    "            for l, r in lines:\n",
    "                line([(rankpos(ssums[l]) - side, start),\n",
    "                      (rankpos(ssums[r]) + side, start)],\n",
    "                     linewidth=2.5)\n",
    "                start += height\n",
    "\n",
    "        draw_lines(lines)\n",
    "\n",
    "    elif cd:\n",
    "        # A reference method was given: draw the interval of +/- cd around its rank on the axis.\n",
    "        begin = rankpos(avranks[cdmethod] - cd)\n",
    "        end = rankpos(avranks[cdmethod] + cd)\n",
    "        line([(begin, cline), (end, cline)],\n",
    "             linewidth=2.5)\n",
    "        line([(begin, cline + bigtick / 2),\n",
    "              (begin, cline - bigtick / 2)],\n",
    "             linewidth=2.5)\n",
    "        line([(end, cline + bigtick / 2),\n",
    "              (end, cline - bigtick / 2)],\n",
    "             linewidth=2.5)\n",
    "\n",
    "    # Write the figure to disk only when a file name was supplied.\n",
    "    if filename:\n",
    "        print_figure(fig, filename, **kwargs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "eeeb1424-63f5-4338-bdc9-5e36e8e4ac09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Friedman test and Nemenyi critical-difference diagram over the per-split RMSE scores.\n",
    "df = pd.DataFrame(rmse_scores)\n",
    "\n",
    "scores = [df[col].values for col in df.columns]\n",
    "\n",
    "stat, p = friedmanchisquare(*scores)\n",
    "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n",
    "\n",
    "# Rank the models within each split (row): the lowest RMSE gets rank 1.\n",
    "ranks = df.rank(axis=1, method='average')\n",
    "average_ranks = ranks.mean().values\n",
    "\n",
    "n_datasets = df.shape[0]\n",
    "alpha = 0.05\n",
    "\n",
    "from scikit_posthocs import posthoc_nemenyi_friedman\n",
    "# Nemenyi CD = q_alpha * sqrt(k * (k + 1) / (6 * N)), with q_alpha taken from the\n",
    "# table in compute_CD. The sqrt(2 / alpha) factor used here before is not that\n",
    "# critical value and overstated the CD.\n",
    "cd = compute_CD(average_ranks, n_datasets, alpha=str(alpha))\n",
    "print(f'Critical Difference: {cd}')\n",
    "\n",
    "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n",
    "\n",
    "# graph_ranks creates its own figure, so no figure is opened here; the extra\n",
    "# plt.figure call only produced an empty plot.\n",
    "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n",
    "plt.xlabel('Classifiers')\n",
    "\n",
    "plt.text(0.5, 1.19, f'Friedman-Nemenyi: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n",
    "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n",
    "\n",
    "plt.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d88fdb63-9223-48f5-b4c3-78c5ab76938b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# SHAP\n",
    "import shap\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "from catboost import CatBoostRegressor\n",
    "from sklearn.feature_selection import SelectFromModel\n",
    "from sklearn.neighbors import LocalOutlierFactor\n",
    "from xgboost import XGBRegressor\n",
    "\n",
    "# Fit CatBoost on one fixed split and explain its predictions with SHAP.\n",
    "cbr = CatBoostRegressor(verbose=0, iterations=2000, learning_rate=0.01, depth=5)\n",
    "\n",
    "random_state = 0\n",
    "y = splitted_dataset['total_sum']\n",
    "X = splitted_dataset.drop('total_sum', axis=1)\n",
    "original_columns = X.columns.tolist()\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n",
    "\n",
    "# Keep only the training rows that LocalOutlierFactor does not label -1.\n",
    "lof = LocalOutlierFactor()\n",
    "yhat = lof.fit_predict(X_train)\n",
    "mask = yhat != -1\n",
    "X_train, y_train = X_train[mask], y_train[mask]\n",
    "\n",
    "X_train = pd.DataFrame(X_train, columns=original_columns)\n",
    "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
    "\n",
    "# Rank the features with an XGBoost model and keep the 50 most important ones.\n",
    "xgb = XGBRegressor(random_state=random_state)\n",
    "xgb.fit(X_train, y_train)\n",
    "selector = SelectFromModel(xgb, prefit=True)\n",
    "\n",
    "importance = np.abs(xgb.feature_importances_)\n",
    "indices = np.argsort(importance)[::-1]\n",
    "important_features = [original_columns[position] for position in indices[:50]]\n",
    "\n",
    "X_train_fi, X_test_fi = X_train[important_features], X_test[important_features]\n",
    "\n",
    "cbr.fit(X_train_fi, y_train)\n",
    "\n",
    "# SHAP values for every training row, with the training features passed to the explainer.\n",
    "explainer = shap.Explainer(cbr, X_train_fi)\n",
    "shap_values = explainer(X_train_fi)\n",
    "\n",
    "# Bar summary plot, written to SVG and offered as a download link.\n",
    "plt.figure(figsize=(12, 8))\n",
    "shap.summary_plot(shap_values, X_train_fi, plot_type=\"bar\", feature_names=important_features, show=False)\n",
    "plt.savefig(\"shap_summary_plot.svg\", format='svg')\n",
    "plt.close()\n",
    "\n",
    "display(FileLink(\"shap_summary_plot.svg\"))\n",
    "\n",
    "# Positions (within the filtered training set) of the smallest and largest target values.\n",
    "sorted_indices = np.argsort(y_train.values)\n",
    "low_value_index, high_value_index = sorted_indices[0], sorted_indices[-1]\n",
    "\n",
    "print(f\"Array position with low target value: {low_value_index}, Target Value: {y_train.values[low_value_index]}\")\n",
    "print(f\"Array position with high target value: {high_value_index}, Target Value: {y_train.values[high_value_index]}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f6cce3ab-2480-4e87-a4ad-36054e12553a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save SHAP waterfall plots for selected training rows as SVG files.\n",
    "def plot_shap_waterfall(instance_index, filename):\n",
    "    \"\"\"Draw the SHAP waterfall plot of one explained row and save it as SVG.\n",
    "\n",
    "    Reads the notebook-level `shap_values`.\n",
    "\n",
    "    Args:\n",
    "        instance_index: position of the row inside `shap_values`.\n",
    "        filename: path of the SVG file to write.\n",
    "    \"\"\"\n",
    "    plt.figure(figsize=(14, 8))\n",
    "    shap.plots.waterfall(shap_values[instance_index], show=False)\n",
    "    plt.tight_layout()\n",
    "    plt.savefig(filename, format='svg')\n",
    "    plt.close()\n",
    "\n",
    "waterfall_targets = [\n",
    "    (482, \"waterfall_plot_instance_0.svg\"),\n",
    "    (70, \"waterfall_plot_instance_1.svg\"),\n",
    "]\n",
    "\n",
    "# Write both plots first, then show the download links in the same order.\n",
    "for row_position, svg_path in waterfall_targets:\n",
    "    plot_shap_waterfall(row_position, svg_path)\n",
    "\n",
    "for _, svg_path in waterfall_targets:\n",
    "    display(FileLink(svg_path))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "176f6725-ac13-444e-8d6e-fe3cc7a63b95",
   "metadata": {},
   "source": [
    "### Hyperparameter Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "213e4d7c-5719-47bc-831d-46809d65fb49",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Models\n",
    "regressors = {\n",
    "    'CBR': CatBoostRegressor(verbose=0),\n",
    "    #'XGBR': XGBRegressor(),\n",
    "    #'LGBMR': LGBMRegressor(),\n",
    "    # 'GBR': GradientBoostingRegressor(),\n",
    "    #'RFR': RandomForestRegressor(),\n",
    "    # 'ETR': ExtraTreesRegressor(),\n",
    "    # 'ABR': AdaBoostRegressor()\n",
    "}\n",
    "\n",
    "# Define parameter grids for the models\n",
    "param_grids = {\n",
    "    'CBR': {\n",
    "        'iterations': Integer(100, 500),\n",
    "        'learning_rate': Real(0.01, 0.1),\n",
    "        'depth': Integer(3, 10),\n",
    "    },\n",
    "    # 'GBR': {\n",
    "    #     'n_estimators': Integer(50, 300),\n",
    "    #     'learning_rate': Real(0.01, 0.1),\n",
    "    #     'max_depth': Integer(3, 10)\n",
    "    # },\n",
    "    # 'RFR': {\n",
    "    #     'n_estimators': Integer(50, 300),\n",
    "    #     'max_depth': Integer(3, 20)\n",
    "    # },\n",
    "    # 'XGBR': {\n",
    "    #     'n_estimators': Integer(50, 300),\n",
    "    #     'learning_rate': Real(0.01, 0.1),\n",
    "    #     'max_depth': Integer(3, 10),\n",
    "    # },\n",
    "    # 'LGBMR': {\n",
    "    #     'n_estimators': Integer(50, 300),\n",
    "    #     'learning_rate': Real(0.01, 0.1),\n",
    "    #     'num_leaves': Integer(20, 50),\n",
    "    # },\n",
    "    # 'ETR': {\n",
    "    #     'n_estimators': Integer(50, 300),\n",
    "    #     'max_depth': Integer(3, 20)\n",
    "    # },\n",
    "#     'ABR': {\n",
    "#         'n_estimators': Integer(50, 300),\n",
    "#         'learning_rate': Real(0.01, 1.0)\n",
    "#     }\n",
    "}\n",
    "\n",
    "# Function to perform hyperparameter tuning\n",
    "def hyperparameter_tuning(model, param_grid, X_train, y_train):\n",
    "    bayes_search = BayesSearchCV(model, search_spaces=param_grid, n_iter=50, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=0)\n",
    "    bayes_search.fit(X_train, y_train)\n",
    "    return bayes_search.best_estimator_\n",
    "\n",
    "metric_sums = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n",
    "metric_stds = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n",
    "rmse_scores = {name: [] for name in regressors.keys()}\n",
    "mae_scores = {name: [] for name in regressors.keys()}\n",
    "r2_scores = {name: [] for name in regressors.keys()}\n",
    "\n",
    "random_state = 5\n",
    "print(f'Processing for Random State: {random_state}')\n",
    "\n",
    "X = splitted_dataset.drop('total_sum', axis=1)\n",
    "y = splitted_dataset['total_sum']\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n",
    "\n",
    "lof = LocalOutlierFactor()\n",
    "yhat = lof.fit_predict(X_train)\n",
    "\n",
    "mask = yhat != -1\n",
    "X_train, y_train = X_train[mask], y_train[mask]\n",
    "\n",
    "original_columns = X.columns.tolist()\n",
    "\n",
    "print(f\"Number of training labels after outlier removal: {len(y_train)}\")\n",
    "print(f\"Number of test labels: {len(y_test)}\")\n",
    "\n",
    "scaler = MinMaxScaler()\n",
    "X_train = scaler.fit_transform(X_train)\n",
    "X_test = scaler.transform(X_test)\n",
    "\n",
    "X_train = pd.DataFrame(X_train, columns=original_columns)\n",
    "X_test = pd.DataFrame(X_test, columns=original_columns)\n",
    "\n",
    "xgb = XGBRegressor(random_state=random_state)\n",
    "xgb.fit(X_train, y_train)\n",
    "selector = SelectFromModel(xgb, prefit=True)\n",
    "\n",
    "importance = np.abs(xgb.feature_importances_)\n",
    "indices = np.argsort(importance)[::-1]\n",
    "important_features = [original_columns[i] for i in indices[:50]]\n",
    "\n",
    "X_train = X_train[important_features]\n",
    "X_test = X_test[important_features]\n",
    "\n",
    "best_models = {}\n",
    "for model_name, param_grid in param_grids.items():\n",
    "    if model_name == 'GBR':\n",
    "        model = GradientBoostingRegressor()\n",
    "    elif model_name == 'RFR':\n",
    "        model = RandomForestRegressor()\n",
    "    elif model_name == 'XGBR':\n",
    "        model = XGBRegressor()\n",
    "    elif model_name == 'LGBMR':\n",
    "        model = LGBMRegressor()\n",
    "    elif model_name == 'ETR':\n",
    "        model = ExtraTreesRegressor()\n",
    "    elif model_name == 'ABR':\n",
    "        model = AdaBoostRegressor()\n",
    "    elif model_name == 'CBR':\n",
    "        model = CatBoostRegressor(verbose=0)\n",
    "    \n",
    "    print(f\"Optimizing {model_name}...\")\n",
    "    best_model = hyperparameter_tuning(model, param_grid, X_train, y_train)\n",
    "    best_models[model_name] = best_model\n",
    "    print(f\"Best parameters for {model_name}: {best_model.get_params()}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}