{ "cells": [ { "cell_type": "markdown", "id": "c4849e1e-1928-4e8b-a12f-37c786b50aca", "metadata": {}, "source": [ "# Regression Layer" ] }, { "cell_type": "code", "execution_count": null, "id": "dddcb2a8-73d3-4476-b88a-bc1494a2c830", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "from sklearn.model_selection import cross_val_score, KFold\n", "from sklearn.impute import KNNImputer\n", "from sklearn.pipeline import make_pipeline\n", "from xgboost import XGBClassifier\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.experimental import enable_iterative_imputer\n", "from imblearn.pipeline import make_pipeline as make_pipeline_imb\n", "from imblearn.over_sampling import SMOTE,SMOTENC\n", "from sklearn.model_selection import train_test_split\n", "from collections import Counter\n", "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", "from sklearn.svm import SVC\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.neighbors import LocalOutlierFactor\n", "from sklearn.utils import resample\n", "import warnings\n", "from imblearn.over_sampling import RandomOverSampler\n", "from imblearn.under_sampling import RandomUnderSampler\n", "from sklearn.preprocessing import MinMaxScaler\n", "from imblearn.pipeline import Pipeline\n", "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n", "from sklearn.preprocessing import LabelEncoder, PowerTransformer\n", "from collections import defaultdict\n", "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.neural_network import MLPClassifier\n", "import Orange\n", "from scipy.stats import friedmanchisquare, rankdata\n", "import shap\n", "import scikit_posthocs as sp\n", "from sklearn.feature_selection import SelectFromModel\n", "from IPython.display import FileLink, display\n", "import math\n", "from sklearn.ensemble import RandomForestClassifier\n", "from skopt.space import Integer, Real\n", "from sklearn.model_selection import StratifiedKFold\n", "from skopt import BayesSearchCV\n", "import xgboost as xgb\n", "from imblearn.over_sampling import SMOTE\n", "from sklearn.tree import DecisionTreeClassifier, export_text\n", "from sklearn import tree\n", "from skopt.space import Real, Integer, Categorical\n", "from skopt.callbacks import VerboseCallback\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import MinMaxScaler\n", "from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor, VotingRegressor\n", "from catboost import CatBoostRegressor\n", "from xgboost import XGBRegressor\n", "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score\n", "from sklearn.feature_selection import SelectFromModel\n", "from sklearn.neighbors import LocalOutlierFactor\n", "from lightgbm import LGBMRegressor\n", "from IPython.display import display, FileLink" ] }, { "cell_type": "markdown", "id": "39d6683c-fd3f-4daa-afdc-2e7f83c3fce3", "metadata": {}, "source": [ "### Preparation before training" ] }, { "cell_type": "code", "execution_count": null, "id": "ec17e47a-8d92-498d-8f28-d8259d6ebc4e", "metadata": {}, "outputs": [], "source": [ "# Call Dataset\n", "pd.set_option('display.max_rows', 10)\n", "initial_df = pd.read_csv('3labelv4Regression.csv')\n", "initial_df.info()" ] }, { "cell_type": "code", "execution_count": null, "id": "d3751352-73bc-42c6-b6a5-84a3badf14ff", "metadata": {}, "outputs": [], "source": [ "# All categorical features except for label\n", "cols = initial_df.columns\n", "num_cols = initial_df._get_numeric_data().columns\n", "categorical_features = list(set(cols) - set(num_cols))\n", "categorical_features.remove('depression_category')\n", "\n", "# Label Encode all categorical, but keep missing values\n", "le_initial_df = initial_df.copy()\n", "dropped_labels = le_initial_df['depression_category']\n", "le_initial_df = le_initial_df.drop('depression_category', axis = 1)\n", "\n", "for col in le_initial_df.columns:\n", " if le_initial_df[col].dtype == 'object':\n", " le_initial_df[col] = le_initial_df[col].fillna('missing')\n", "\n", " label_encoder = LabelEncoder()\n", " le_initial_df[col] = label_encoder.fit_transform(le_initial_df[col])\n", "\n", " missing_value_index = np.where(label_encoder.classes_ == 'missing')[0]\n", " \n", " le_initial_df[col] = le_initial_df[col].replace(missing_value_index, np.nan)\n", "\n", "le_initial_df = pd.concat([le_initial_df, dropped_labels], axis = 1)" ] }, { "cell_type": "code", "execution_count": null, "id": "9fa21a95-21f1-4274-86ba-d7977813066b", "metadata": {}, "outputs": [], "source": [ "le_initial_df" ] }, { "cell_type": "code", "execution_count": null, "id": "0ba54a81-d4de-4884-99d3-29867eb7ea40", "metadata": {}, "outputs": [], "source": [ "# Seperate and Combine\n", "le_df_normal = le_initial_df[le_initial_df['depression_category'] == 'normal']\n", "le_df_mild = le_initial_df[le_initial_df['depression_category'] == 'mild']\n", "le_df_moderatesevere = le_initial_df[le_initial_df['depression_category'] == 'moderatesevere']\n", "\n", "le_df_depression = pd.concat([le_df_normal, le_df_mild, le_df_moderatesevere], ignore_index = False)\n", "\n", "le_df_depression['depression_category'] = 'depression'\n", "\n", "# Check depression category counts\n", "dataframes = [le_df_normal, le_df_mild, le_df_moderatesevere]\n", "le_initial_df = pd.concat(dataframes, ignore_index=True)\n", "label_counts = le_initial_df['depression_category'].value_counts()\n", "label_counts" ] }, { "cell_type": "code", "execution_count": null, "id": "c517213f-f6bb-4298-99ee-3b29f6f7d0cb", "metadata": {}, "outputs": [], "source": [ "# Some outlier.\n", "# threshold = int(0.8 * le_df_normal.shape[1])\n", "# le_df_normal = le_df_normal.dropna(thresh = threshold)\n", "# threshold = int(0.8 * le_df_depression.shape[1])\n", "# le_df_depression = le_df_depression.dropna(thresh = threshold)\n", "\n", "# Check depression category counts\n", "dataframes = [le_df_normal, le_df_mild, le_df_moderatesevere]\n", "le_initial_df = pd.concat(dataframes, ignore_index=True)\n", "label_counts = le_initial_df['depression_category'].value_counts()" ] }, { "cell_type": "code", "execution_count": null, "id": "8c5bdf52-c645-4a11-920a-579e28db1a50", "metadata": {}, "outputs": [], "source": [ "# Imputation\n", "different_le_dfs = [le_df_normal, le_df_mild, le_df_moderatesevere]\n", "imputed_le_dfs = []\n", "from sklearn.impute import IterativeImputer\n", "for le_df in different_le_dfs:\n", " y = le_df['depression_category']\n", " X = le_df.drop('depression_category', axis = 1)\n", " \n", " imputer = SimpleImputer(strategy='median')\n", " imputed_data = imputer.fit_transform(X)\n", " imputed_df = pd.DataFrame(imputed_data, columns = X.columns)\n", "\n", " imputed_df['depression_category'] = y.reset_index(drop = True)\n", " imputed_le_dfs.append(imputed_df)\n", "\n", "concatenated_le_dfs = pd.concat(imputed_le_dfs, ignore_index = True)\n", "concatenated_le_dfs" ] }, { "cell_type": "code", "execution_count": null, "id": "0e871a5f-beab-4f90-b8c4-e922ef86d0f0", "metadata": {}, "outputs": [], "source": [ "# Full label encode depression category\n", "fully_LE_concatenated_le_dfs = concatenated_le_dfs.copy()\n", "fully_LE_concatenated_le_dfs['depression_category'] = label_encoder.fit_transform(fully_LE_concatenated_le_dfs['depression_category'])\n", "\n", "# The dataset after category connect, imputation, and label encoding\n", "splitted_dataset = fully_LE_concatenated_le_dfs.copy()\n", "splitted_dataset = splitted_dataset.drop('depression_category', axis = 1)\n", "splitted_dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "3e602f4d-efb2-4ac9-99bb-c6fbeca791cc", "metadata": {}, "outputs": [], "source": [ "warnings.filterwarnings('ignore')" ] }, { "cell_type": "markdown", "id": "68563dc9-38c9-4ea5-8c08-9de904f58f43", "metadata": {}, "source": [ "### Regression Training" ] }, { "cell_type": "code", "execution_count": null, "id": "5e339a41-e484-4a11-84f0-770cbaacb09d", "metadata": { "scrolled": true }, "outputs": [], "source": [ "# Optimized parameters\n", "regressors = {\n", " 'CBR': CatBoostRegressor(verbose=0, iterations=2000, learning_rate=0.01, depth=5),\n", " 'XGBR': XGBRegressor(learning_rate=0.04403027347366962, max_depth=3, n_estimators=238),\n", " 'LGBMR': LGBMRegressor(learning_rate=0.02904035023286438, num_leaves=20, n_estimators=170),\n", " 'GBR': GradientBoostingRegressor(learning_rate=0.03, max_depth=2, n_estimators=700),\n", " 'RFR': RandomForestRegressor(max_depth=12, max_features=1.0, n_estimators=300),\n", " 'ETR': ExtraTreesRegressor(max_depth=12, max_features=1.0, n_estimators=64),\n", " 'ABR': AdaBoostRegressor(learning_rate=0.29915504677867777, n_estimators=92)\n", "}\n", "\n", "# Default parameters\n", "# regressors = {\n", "# 'CBR': CatBoostRegressor(verbose=0),\n", "# 'XGBR': XGBRegressor(),\n", "# 'LGBMR': LGBMRegressor(),\n", "# 'GBR': GradientBoostingRegressor(),\n", "# 'RFR': RandomForestRegressor(),\n", "# 'ETR': ExtraTreesRegressor(),\n", "# 'ABR': AdaBoostRegressor()\n", "# }\n", "\n", "voting_regressor = VotingRegressor(estimators=[\n", " ('cbr', regressors['CBR']),\n", " ('xgbr', regressors['XGBR']),\n", " ('gbr', regressors['GBR']),\n", " ('abr', regressors['ABR'])\n", "])\n", "\n", "regressors['Voting'] = voting_regressor\n", "\n", "metric_sums = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n", "metric_stds = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n", "rmse_scores = {name: [] for name in regressors.keys()}\n", "mae_scores = {name: [] for name in regressors.keys()}\n", "r2_scores = {name: [] for name in regressors.keys()}\n", "\n", "for random_state in range(10):\n", " print(f'Processing for Random State: {random_state}')\n", "\n", " X = splitted_dataset.drop('total_sum', axis=1)\n", " y = splitted_dataset['total_sum']\n", " X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n", "\n", " lof = LocalOutlierFactor()\n", " yhat = lof.fit_predict(X_train)\n", "\n", " mask = yhat != -1\n", " X_train, y_train = X_train[mask], y_train[mask]\n", "\n", " original_columns = X.columns.tolist()\n", "\n", " print(f\"Number of training labels after outlier removal: {len(y_train)}\")\n", " print(f\"Number of test labels: {len(y_test)}\")\n", "\n", " scaler = MinMaxScaler()\n", " X_train = scaler.fit_transform(X_train)\n", " X_test = scaler.transform(X_test)\n", " \n", " X_train = pd.DataFrame(X_train, columns=original_columns)\n", " X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", " # Feature selection using XGBRegressor\n", " xgb = XGBRegressor(random_state=random_state)\n", " xgb.fit(X_train, y_train)\n", " selector = SelectFromModel(xgb, prefit=True)\n", "\n", " importance = np.abs(xgb.feature_importances_)\n", " indices = np.argsort(importance)[::-1]\n", " important_features = [original_columns[i] for i in indices[:50]]\n", "\n", " for reg_name, reg in regressors.items():\n", " selected_features = important_features\n", " \n", " X_train_fi = pd.DataFrame(X_train, columns=original_columns)[selected_features]\n", " X_test_fi = pd.DataFrame(X_test, columns=original_columns)[selected_features]\n", "\n", " reg.fit(X_train_fi, y_train)\n", " y_pred = reg.predict(X_test_fi)\n", "\n", " y_pred = np.round(y_pred)\n", " \n", " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n", " mae = mean_absolute_error(y_test, y_pred)\n", " r2 = r2_score(y_test, y_pred)\n", "\n", " metric_sums[reg_name]['rmse'] += rmse\n", " metric_sums[reg_name]['mae'] += mae\n", " metric_sums[reg_name]['r2'] += r2\n", " rmse_scores[reg_name].append(rmse)\n", " mae_scores[reg_name].append(mae)\n", " r2_scores[reg_name].append(r2)" ] }, { "cell_type": "code", "execution_count": null, "id": "0929bf2c-fd41-4e0e-8070-4a6317f3efab", "metadata": {}, "outputs": [], "source": [ "# Calculate and print the average metrics and their standard deviations\n", "for reg_name in regressors.keys():\n", " avg_rmse = metric_sums[reg_name]['rmse'] / 10\n", " avg_mae = metric_sums[reg_name]['mae'] / 10\n", " avg_r2 = metric_sums[reg_name]['r2'] / 10\n", " std_rmse = np.std(rmse_scores[reg_name])\n", " std_mae = np.std(mae_scores[reg_name])\n", " std_r2 = np.std(r2_scores[reg_name])\n", " \n", " print(f\"Regressor: {reg_name}\")\n", " print(f\"Average RMSE: {avg_rmse} ± {std_rmse}\")\n", " print(f\"Average MAE: {avg_mae} ± {std_mae}\")\n", " print(f\"Average R2: {avg_r2} ± {std_r2}\")\n", " print(\"------\")" ] }, { "cell_type": "markdown", "id": "07f62248-70b4-479b-b782-3aec714111f9", "metadata": {}, "source": [ "### Shap n FN" ] }, { "cell_type": "code", "execution_count": null, "id": "f6ce4639-53d9-4dcd-a49c-80555326f197", "metadata": {}, "outputs": [], "source": [ "# Preparation code to make CD diagram from older version of Orange\n", "def compute_CD(avranks, n, alpha=\"0.05\", test=\"nemenyi\"):\n", " \"\"\"\n", " Returns critical difference for Nemenyi or Bonferroni-Dunn test\n", " according to given alpha (either alpha=\"0.05\" or alpha=\"0.1\") for average\n", " ranks and number of tested datasets N. Test can be either \"nemenyi\" for\n", " for Nemenyi two tailed test or \"bonferroni-dunn\" for Bonferroni-Dunn test.\n", "\n", " This function is deprecated and will be removed in Orange 3.34.\n", " \"\"\"\n", " k = len(avranks)\n", " d = {(\"nemenyi\", \"0.05\"): [0, 0, 1.959964, 2.343701, 2.569032, 2.727774,\n", " 2.849705, 2.94832, 3.030879, 3.101730, 3.163684,\n", " 3.218654, 3.268004, 3.312739, 3.353618, 3.39123,\n", " 3.426041, 3.458425, 3.488685, 3.517073,\n", " 3.543799],\n", " (\"nemenyi\", \"0.1\"): [0, 0, 1.644854, 2.052293, 2.291341, 2.459516,\n", " 2.588521, 2.692732, 2.779884, 2.854606, 2.919889,\n", " 2.977768, 3.029694, 3.076733, 3.119693, 3.159199,\n", " 3.195743, 3.229723, 3.261461, 3.291224, 3.319233],\n", " (\"bonferroni-dunn\", \"0.05\"): [0, 0, 1.960, 2.241, 2.394, 2.498, 2.576,\n", " 2.638, 2.690, 2.724, 2.773],\n", " (\"bonferroni-dunn\", \"0.1\"): [0, 0, 1.645, 1.960, 2.128, 2.241, 2.326,\n", " 2.394, 2.450, 2.498, 2.539]}\n", " q = d[(test, alpha)]\n", " cd = q[k] * (k * (k + 1) / (6.0 * n)) ** 0.5\n", " return cd\n", "\n", "\n", "def graph_ranks(avranks, names, cd=None, cdmethod=None, lowv=None, highv=None,\n", " width=6, textspace=1, reverse=False, filename=None, **kwargs):\n", " \"\"\"\n", " Draws a CD graph, which is used to display the differences in methods'\n", " performance. See Janez Demsar, Statistical Comparisons of Classifiers over\n", " Multiple Data Sets, 7(Jan):1--30, 2006.\n", "\n", " Needs matplotlib to work.\n", "\n", " The image is ploted on `plt` imported using\n", " `import matplotlib.pyplot as plt`.\n", "\n", " This function is deprecated and will be removed in Orange 3.34.\n", "\n", " Args:\n", " avranks (list of float): average ranks of methods.\n", " names (list of str): names of methods.\n", " cd (float): Critical difference used for statistically significance of\n", " difference between methods.\n", " cdmethod (int, optional): the method that is compared with other methods\n", " If omitted, show pairwise comparison of methods\n", " lowv (int, optional): the lowest shown rank\n", " highv (int, optional): the highest shown rank\n", " width (int, optional): default width in inches (default: 6)\n", " textspace (int, optional): space on figure sides (in inches) for the\n", " method names (default: 1)\n", " reverse (bool, optional): if set to `True`, the lowest rank is on the\n", " right (default: `False`)\n", " filename (str, optional): output file name (with extension). If not\n", " given, the function does not write a file.\n", " \"\"\"\n", " try:\n", " import matplotlib.pyplot as plt\n", " from matplotlib.backends.backend_agg import FigureCanvasAgg\n", " except ImportError:\n", " raise ImportError(\"Function graph_ranks requires matplotlib.\")\n", "\n", " width = float(width)\n", " textspace = float(textspace)\n", "\n", " def nth(l, n):\n", " \"\"\"\n", " Returns only nth elemnt in a list.\n", " \"\"\"\n", " n = lloc(l, n)\n", " return [a[n] for a in l]\n", "\n", " def lloc(l, n):\n", " \"\"\"\n", " List location in list of list structure.\n", " Enable the use of negative locations:\n", " -1 is the last element, -2 second last...\n", " \"\"\"\n", " if n < 0:\n", " return len(l[0]) + n\n", " else:\n", " return n\n", "\n", " def mxrange(lr):\n", " \"\"\"\n", " Multiple xranges. Can be used to traverse matrices.\n", " This function is very slow due to unknown number of\n", " parameters.\n", "\n", " >>> mxrange([3,5])\n", " [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]\n", "\n", " >>> mxrange([[3,5,1],[9,0,-3]])\n", " [(3, 9), (3, 6), (3, 3), (4, 9), (4, 6), (4, 3)]\n", "\n", " \"\"\"\n", " if not len(lr):\n", " yield ()\n", " else:\n", " # it can work with single numbers\n", " index = lr[0]\n", " if isinstance(index, int):\n", " index = [index]\n", " for a in range(*index):\n", " for b in mxrange(lr[1:]):\n", " yield tuple([a] + list(b))\n", "\n", " def print_figure(fig, *args, **kwargs):\n", " canvas = FigureCanvasAgg(fig)\n", " canvas.print_figure(*args, **kwargs)\n", "\n", " sums = avranks\n", "\n", " tempsort = sorted([(a, i) for i, a in enumerate(sums)], reverse=reverse)\n", " ssums = nth(tempsort, 0)\n", " sortidx = nth(tempsort, 1)\n", " nnames = [names[x] for x in sortidx]\n", "\n", " if lowv is None:\n", " lowv = min(1, int(math.floor(min(ssums))))\n", " if highv is None:\n", " highv = max(len(avranks), int(math.ceil(max(ssums))))\n", "\n", " cline = 0.4\n", "\n", " k = len(sums)\n", "\n", " lines = None\n", "\n", " linesblank = 0\n", " scalewidth = width - 2 * textspace\n", "\n", " def rankpos(rank):\n", " if not reverse:\n", " a = rank - lowv\n", " else:\n", " a = highv - rank\n", " return textspace + scalewidth / (highv - lowv) * a\n", "\n", " distanceh = 0.25\n", "\n", " if cd and cdmethod is None:\n", " # get pairs of non significant methods\n", "\n", " def get_lines(sums, hsd):\n", " # get all pairs\n", " lsums = len(sums)\n", " allpairs = [(i, j) for i, j in mxrange([[lsums], [lsums]]) if j > i]\n", " # remove not significant\n", " notSig = [(i, j) for i, j in allpairs\n", " if abs(sums[i] - sums[j]) <= hsd]\n", " # keep only longest\n", "\n", " def no_longer(ij_tuple, notSig):\n", " i, j = ij_tuple\n", " for i1, j1 in notSig:\n", " if (i1 <= i and j1 > j) or (i1 < i and j1 >= j):\n", " return False\n", " return True\n", "\n", " longest = [(i, j) for i, j in notSig if no_longer((i, j), notSig)]\n", "\n", " return longest\n", "\n", " lines = get_lines(ssums, cd)\n", " linesblank = 0.2 + 0.2 + (len(lines) - 1) * 0.1\n", "\n", " # add scale\n", " distanceh = 0.25\n", " cline += distanceh\n", "\n", " # calculate height needed height of an image\n", " minnotsignificant = max(2 * 0.2, linesblank)\n", " height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant\n", "\n", " fig = plt.figure(figsize=(width, height))\n", " fig.set_facecolor('white')\n", " ax = fig.add_axes([0, 0, 1, 1]) # reverse y axis\n", " ax.set_axis_off()\n", "\n", " hf = 1. / height # height factor\n", " wf = 1. / width\n", "\n", " def hfl(l):\n", " return [a * hf for a in l]\n", "\n", " def wfl(l):\n", " return [a * wf for a in l]\n", "\n", "\n", " # Upper left corner is (0,0).\n", " ax.plot([0, 1], [0, 1], c=\"w\")\n", " ax.set_xlim(0, 1)\n", " ax.set_ylim(1, 0)\n", "\n", " def line(l, color='k', **kwargs):\n", " \"\"\"\n", " Input is a list of pairs of points.\n", " \"\"\"\n", " ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)\n", "\n", " def text(x, y, s, *args, **kwargs):\n", " ax.text(wf * x, hf * y, s, fontsize = 14, *args, **kwargs)\n", "\n", " line([(textspace, cline), (width - textspace, cline)], linewidth=0.7)\n", "\n", " bigtick = 0.1\n", " smalltick = 0.05\n", "\n", " tick = None\n", " for a in list(np.arange(lowv, highv, 0.5)) + [highv]:\n", " tick = smalltick\n", " if a == int(a):\n", " tick = bigtick\n", " line([(rankpos(a), cline - tick / 2),\n", " (rankpos(a), cline)],\n", " linewidth=0.7)\n", "\n", " for a in range(lowv, highv + 1):\n", " text(rankpos(a), cline - tick / 2 - 0.05, str(a),\n", " ha=\"center\", va=\"bottom\")\n", "\n", " k = len(ssums)\n", "\n", " for i in range(math.ceil(k / 2)):\n", " chei = cline + minnotsignificant + i * 0.2\n", " line([(rankpos(ssums[i]), cline),\n", " (rankpos(ssums[i]), chei),\n", " (textspace - 0.1, chei)],\n", " linewidth=0.7)\n", " text(textspace - 0.2, chei, nnames[i], ha=\"right\", va=\"center\")\n", "\n", " for i in range(math.ceil(k / 2), k):\n", " chei = cline + minnotsignificant + (k - i - 1) * 0.2\n", " line([(rankpos(ssums[i]), cline),\n", " (rankpos(ssums[i]), chei),\n", " (textspace + scalewidth + 0.1, chei)],\n", " linewidth=0.7)\n", " text(textspace + scalewidth + 0.2, chei, nnames[i],\n", " ha=\"left\", va=\"center\")\n", "\n", " if cd and cdmethod is None:\n", " # upper scale\n", " if not reverse:\n", " begin, end = rankpos(lowv), rankpos(lowv + cd)\n", " else:\n", " begin, end = rankpos(highv), rankpos(highv - cd)\n", "\n", " line([(begin, distanceh), (end, distanceh)], linewidth=0.7)\n", " line([(begin, distanceh + bigtick / 2),\n", " (begin, distanceh - bigtick / 2)],\n", " linewidth=0.7)\n", " line([(end, distanceh + bigtick / 2),\n", " (end, distanceh - bigtick / 2)],\n", " linewidth=0.7)\n", " text((begin + end) / 2, distanceh - 0.05, \"CD\",\n", " ha=\"center\", va=\"bottom\")\n", "\n", " # no-significance lines\n", " def draw_lines(lines, side=0.05, height=0.1):\n", " start = cline + 0.2\n", " for l, r in lines:\n", " line([(rankpos(ssums[l]) - side, start),\n", " (rankpos(ssums[r]) + side, start)],\n", " linewidth=2.5)\n", " start += height\n", "\n", " draw_lines(lines)\n", "\n", " elif cd:\n", " begin = rankpos(avranks[cdmethod] - cd)\n", " end = rankpos(avranks[cdmethod] + cd)\n", " line([(begin, cline), (end, cline)],\n", " linewidth=2.5)\n", " line([(begin, cline + bigtick / 2),\n", " (begin, cline - bigtick / 2)],\n", " linewidth=2.5)\n", " line([(end, cline + bigtick / 2),\n", " (end, cline - bigtick / 2)],\n", " linewidth=2.5)\n", "\n", " if filename:\n", " print_figure(fig, filename, **kwargs)" ] }, { "cell_type": "code", "execution_count": null, "id": "eeeb1424-63f5-4338-bdc9-5e36e8e4ac09", "metadata": {}, "outputs": [], "source": [ "# FN\n", "df = pd.DataFrame(rmse_scores)\n", "df\n", "\n", "scores = [df[col].values for col in df.columns]\n", "\n", "stat, p = friedmanchisquare(*scores)\n", "print(f'Friedman Test Statistic: {stat}, p-value: {p}')\n", "\n", "ranks = df.rank(axis=1, method='average')\n", "average_ranks = ranks.mean().values\n", "\n", "n_datasets = df.shape[0]\n", "alpha = 0.05\n", "\n", "from scikit_posthocs import posthoc_nemenyi_friedman\n", "cd = np.sqrt((len(df.columns) * (len(df.columns) + 1)) / (6 * n_datasets)) * np.sqrt(2 / alpha)\n", "print(f'Critical Difference: {cd}')\n", "\n", "classifiers = [f\"{clf} ({rank:.2f})\" for clf, rank in zip(df.columns, average_ranks)]\n", "\n", "plt.figure(figsize=(16, 10))\n", "\n", "graph_ranks(average_ranks, classifiers, cd=cd, width=6, textspace=1)\n", "plt.xlabel('Classifiers')\n", "\n", "plt.text(0.5, 1.19, f'Friedman-Nemenyi: {stat:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n", "plt.text(0.5, 1.10, f'CD: {cd:.3f}', horizontalalignment='center', transform=plt.gca().transAxes, fontsize=16)\n", "\n", "plt.tight_layout()" ] }, { "cell_type": "code", "execution_count": null, "id": "d88fdb63-9223-48f5-b4c3-78c5ab76938b", "metadata": {}, "outputs": [], "source": [ "# SHAP\n", "import shap\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import MinMaxScaler\n", "from catboost import CatBoostRegressor\n", "from sklearn.feature_selection import SelectFromModel\n", "from sklearn.neighbors import LocalOutlierFactor\n", "from xgboost import XGBRegressor\n", "\n", "cbr = CatBoostRegressor(verbose=0, iterations=2000, learning_rate=0.01, depth=5)\n", "\n", "X = splitted_dataset.drop('total_sum', axis=1)\n", "y = splitted_dataset['total_sum']\n", "random_state = 0\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n", "\n", "lof = LocalOutlierFactor()\n", "yhat = lof.fit_predict(X_train)\n", "\n", "mask = yhat != -1\n", "X_train, y_train = X_train[mask], y_train[mask]\n", "\n", "original_columns = X.columns.tolist()\n", "\n", "X_train = pd.DataFrame(X_train, columns=original_columns)\n", "X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", "xgb = XGBRegressor(random_state=random_state)\n", "xgb.fit(X_train, y_train)\n", "selector = SelectFromModel(xgb, prefit=True)\n", "\n", "importance = np.abs(xgb.feature_importances_)\n", "indices = np.argsort(importance)[::-1]\n", "important_features = [original_columns[i] for i in indices[:50]]\n", "\n", "X_train_fi = X_train[important_features]\n", "X_test_fi = X_test[important_features]\n", "\n", "cbr.fit(X_train_fi, y_train)\n", "\n", "# Compute SHAP values using shap.Explainer\n", "explainer = shap.Explainer(cbr, X_train_fi)\n", "shap_values = explainer(X_train_fi)\n", "plt.figure(figsize=(12, 8))\n", "shap.summary_plot(shap_values, X_train_fi, plot_type=\"bar\", feature_names=important_features, show=False)\n", "plt.savefig(\"shap_summary_plot.svg\", format='svg') # Save the plot as SVG\n", "plt.close()\n", "\n", "display(FileLink(\"shap_summary_plot.svg\"))\n", "\n", "sorted_indices = np.argsort(y_train.values)\n", "low_value_index = sorted_indices[0]\n", "high_value_index = sorted_indices[-1]\n", "\n", "print(f\"Array position with low target value: {low_value_index}, Target Value: {y_train.values[low_value_index]}\")\n", "print(f\"Array position with high target value: {high_value_index}, Target Value: {y_train.values[high_value_index]}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "f6cce3ab-2480-4e87-a4ad-36054e12553a", "metadata": {}, "outputs": [], "source": [ "# Function to plot SHAP waterfall plot for a specific instance and save as SVG\n", "def plot_shap_waterfall(instance_index, filename):\n", " shap_value = shap_values[instance_index]\n", " \n", " plt.figure(figsize=(14, 8))\n", " \n", " shap.plots.waterfall(shap_value, show=False)\n", " \n", " plt.tight_layout()\n", "\n", " plt.savefig(filename, format='svg')\n", " \n", " plt.close()\n", "\n", "plot_shap_waterfall(482, \"waterfall_plot_instance_0.svg\")\n", "plot_shap_waterfall(70, \"waterfall_plot_instance_1.svg\")\n", "\n", "display(FileLink(\"waterfall_plot_instance_0.svg\"))\n", "display(FileLink(\"waterfall_plot_instance_1.svg\"))" ] }, { "cell_type": "markdown", "id": "176f6725-ac13-444e-8d6e-fe3cc7a63b95", "metadata": {}, "source": [ "### Hyperparameter Optimization" ] }, { "cell_type": "code", "execution_count": null, "id": "213e4d7c-5719-47bc-831d-46809d65fb49", "metadata": {}, "outputs": [], "source": [ "# Models\n", "regressors = {\n", " 'CBR': CatBoostRegressor(verbose=0),\n", " #'XGBR': XGBRegressor(),\n", " #'LGBMR': LGBMRegressor(),\n", " # 'GBR': GradientBoostingRegressor(),\n", " #'RFR': RandomForestRegressor(),\n", " # 'ETR': ExtraTreesRegressor(),\n", " # 'ABR': AdaBoostRegressor()\n", "}\n", "\n", "# Define parameter grids for the models\n", "param_grids = {\n", " 'CBR': {\n", " 'iterations': Integer(100, 500),\n", " 'learning_rate': Real(0.01, 0.1),\n", " 'depth': Integer(3, 10),\n", " },\n", " # 'GBR': {\n", " # 'n_estimators': Integer(50, 300),\n", " # 'learning_rate': Real(0.01, 0.1),\n", " # 'max_depth': Integer(3, 10)\n", " # },\n", " # 'RFR': {\n", " # 'n_estimators': Integer(50, 300),\n", " # 'max_depth': Integer(3, 20)\n", " # },\n", " # 'XGBR': {\n", " # 'n_estimators': Integer(50, 300),\n", " # 'learning_rate': Real(0.01, 0.1),\n", " # 'max_depth': Integer(3, 10),\n", " # },\n", " # 'LGBMR': {\n", " # 'n_estimators': Integer(50, 300),\n", " # 'learning_rate': Real(0.01, 0.1),\n", " # 'num_leaves': Integer(20, 50),\n", " # },\n", " # 'ETR': {\n", " # 'n_estimators': Integer(50, 300),\n", " # 'max_depth': Integer(3, 20)\n", " # },\n", "# 'ABR': {\n", "# 'n_estimators': Integer(50, 300),\n", "# 'learning_rate': Real(0.01, 1.0)\n", "# }\n", "}\n", "\n", "# Function to perform hyperparameter tuning\n", "def hyperparameter_tuning(model, param_grid, X_train, y_train):\n", " bayes_search = BayesSearchCV(model, search_spaces=param_grid, n_iter=50, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=0)\n", " bayes_search.fit(X_train, y_train)\n", " return bayes_search.best_estimator_\n", "\n", "metric_sums = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n", "metric_stds = {name: {'rmse': 0, 'mae': 0, 'r2': 0} for name in regressors.keys()}\n", "rmse_scores = {name: [] for name in regressors.keys()}\n", "mae_scores = {name: [] for name in regressors.keys()}\n", "r2_scores = {name: [] for name in regressors.keys()}\n", "\n", "random_state = 5\n", "print(f'Processing for Random State: {random_state}')\n", "\n", "X = splitted_dataset.drop('total_sum', axis=1)\n", "y = splitted_dataset['total_sum']\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)\n", "\n", "lof = LocalOutlierFactor()\n", "yhat = lof.fit_predict(X_train)\n", "\n", "mask = yhat != -1\n", "X_train, y_train = X_train[mask], y_train[mask]\n", "\n", "original_columns = X.columns.tolist()\n", "\n", "print(f\"Number of training labels after outlier removal: {len(y_train)}\")\n", "print(f\"Number of test labels: {len(y_test)}\")\n", "\n", "scaler = MinMaxScaler()\n", "X_train = scaler.fit_transform(X_train)\n", "X_test = scaler.transform(X_test)\n", "\n", "X_train = pd.DataFrame(X_train, columns=original_columns)\n", "X_test = pd.DataFrame(X_test, columns=original_columns)\n", "\n", "xgb = XGBRegressor(random_state=random_state)\n", "xgb.fit(X_train, y_train)\n", "selector = SelectFromModel(xgb, prefit=True)\n", "\n", "importance = np.abs(xgb.feature_importances_)\n", "indices = np.argsort(importance)[::-1]\n", "important_features = [original_columns[i] for i in indices[:50]]\n", "\n", "X_train = X_train[important_features]\n", "X_test = X_test[important_features]\n", "\n", "best_models = {}\n", "for model_name, param_grid in param_grids.items():\n", " if model_name == 'GBR':\n", " model = GradientBoostingRegressor()\n", " elif model_name == 'RFR':\n", " model = RandomForestRegressor()\n", " elif model_name == 'XGBR':\n", " model = XGBRegressor()\n", " elif model_name == 'LGBMR':\n", " model = LGBMRegressor()\n", " elif model_name == 'ETR':\n", " model = ExtraTreesRegressor()\n", " elif model_name == 'ABR':\n", " model = AdaBoostRegressor()\n", " elif model_name == 'CBR':\n", " model = CatBoostRegressor(verbose=0)\n", " \n", " print(f\"Optimizing {model_name}...\")\n", " best_model = hyperparameter_tuning(model, param_grid, X_train, y_train)\n", " best_models[model_name] = best_model\n", " print(f\"Best parameters for {model_name}: {best_model.get_params()}\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }