{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "# import seaborn as sns\n", "# import matplotlib.pyplot as plt\n", "import os\n", "from sklearn.model_selection import train_test_split\n", "from feature_engineering import discretization as dc\n", "\n", "# plt.style.use('seaborn-colorblind')\n", "# %matplotlib inline\n", "#from feature_cleaning import rare_values as ra" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "use_cols = [\n", " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n", " 'Survived'\n", "]\n", "\n", "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFare
003male22.017.2500
111female38.0171.2833
213female26.007.9250
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare\n", "0 0 3 male 22.0 1 7.2500\n", "1 1 1 female 38.0 1 71.2833\n", "2 1 3 female 26.0 0 7.9250" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head(3)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((623, 6), (268, 6))" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note that we include target variable in the X_train \n", "# because we need it to supervise our discretization\n", "# this is not the standard way of using train-test-split\n", "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n", " random_state=0)\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Equal width binning\n", "divides the scope of possible values into N bins of the same width" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.preprocessing import KBinsDiscretizer\n", "enc_equal_width = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform').fit(X_train[['Fare']])" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([array([ 0. 
, 170.7764, 341.5528, 512.3292])], dtype=object)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# every bin has the same width (the edges are evenly spaced)\n", "enc_equal_width.bin_edges_" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0 610\n", "1.0 11\n", "2.0 2\n", "Name: 0, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = enc_equal_width.transform(X_train[['Fare']])\n", "pd.DataFrame(result)[0].value_counts()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Survived Pclass Sex Age SibSp Fare Fare_equal_width\n", "857 1 1 male 51.0 0 26.5500 0.0\n", "52 1 1 female 49.0 1 76.7292 0.0\n", "386 0 3 male 1.0 5 46.9000 0.0\n", "124 0 1 male 54.0 0 77.2875 0.0\n", "578 0 3 female NaN 1 14.4583 0.0\n", "549 1 2 male 8.0 1 36.7500 0.0\n", "118 0 1 male 24.0 0 247.5208 1.0\n", "12 0 3 male 20.0 0 8.0500 0.0\n", "157 0 3 male 30.0 0 8.0500 0.0\n", "127 1 3 male 24.0 0 7.1417 0.0\n" ] } ], "source": [ "# add the new discretized variable\n", "X_train_copy = X_train.copy(deep=True)\n", "X_train_copy['Fare_equal_width'] = enc_equal_width.transform(X_train[['Fare']])\n", "print(X_train_copy.head(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Equal frequency binning\n", "Divides the range of the variable's values into N bins,\n", "where each bin holds (approximately) the same number of observations." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "enc_equal_freq = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='quantile').fit(X_train[['Fare']])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([array([ 0. 
, 8.69303333, 26.2875 , 512.3292 ])],\n", " dtype=object)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check the bin edges\n", "enc_equal_freq.bin_edges_" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2.0 209\n", "0.0 208\n", "1.0 206\n", "Name: 0, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# each bin holds roughly the same number of cases\n", "result = enc_equal_freq.transform(X_train[['Fare']])\n", "pd.DataFrame(result)[0].value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Survived Pclass Sex Age SibSp Fare Fare_equal_freq\n", "857 1 1 male 51.0 0 26.5500 2.0\n", "52 1 1 female 49.0 1 76.7292 2.0\n", "386 0 3 male 1.0 5 46.9000 2.0\n", "124 0 1 male 54.0 0 77.2875 2.0\n", "578 0 3 female NaN 1 14.4583 1.0\n", "549 1 2 male 8.0 1 36.7500 2.0\n", "118 0 1 male 24.0 0 247.5208 2.0\n", "12 0 3 male 20.0 0 8.0500 0.0\n", "157 0 3 male 30.0 0 8.0500 0.0\n", "127 1 3 male 24.0 0 7.1417 0.0\n" ] } ], "source": [ "# add the new discretized variable\n", "X_train_copy = X_train.copy(deep=True)\n", "X_train_copy['Fare_equal_freq'] = enc_equal_freq.transform(X_train[['Fare']])\n", "print(X_train_copy.head(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## K-means binning\n", "Uses k-means clustering to partition the values into bins (one bin per cluster)." ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "collapsed": true }, "outputs": [], "source": [ "enc_kmeans = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='kmeans').fit(X_train[['Fare']])" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([array([ 0. 
, 93.5271531 , 338.08506324, 512.3292 ])],\n", " dtype=object)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# check the bin edges\n", "enc_kmeans.bin_edges_" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.0 587\n", "1.0 34\n", "2.0 2\n", "Name: 0, dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "result = enc_kmeans.transform(X_train[['Fare']])\n", "pd.DataFrame(result)[0].value_counts()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Survived Pclass Sex Age SibSp Fare Fare_kmeans\n", "857 1 1 male 51.0 0 26.5500 0.0\n", "52 1 1 female 49.0 1 76.7292 0.0\n", "386 0 3 male 1.0 5 46.9000 0.0\n", "124 0 1 male 54.0 0 77.2875 0.0\n", "578 0 3 female NaN 1 14.4583 0.0\n", "549 1 2 male 8.0 1 36.7500 0.0\n", "118 0 1 male 24.0 0 247.5208 1.0\n", "12 0 3 male 20.0 0 8.0500 0.0\n", "157 0 3 male 30.0 0 8.0500 0.0\n", "127 1 3 male 24.0 0 7.1417 0.0\n" ] } ], "source": [ "# add the new discretized variable\n", "X_train_copy = X_train.copy(deep=True)\n", "X_train_copy['Fare_kmeans'] = enc_kmeans.transform(X_train[['Fare']])\n", "print(X_train_copy.head(10))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Discretisation with Decision Tree\n", "using a decision tree to identify the optimal splitting points that would determine the bins" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "collapsed": true }, "outputs": [], "source": [ "enc1 = dc.DiscretizeByDecisionTree(col='Fare',max_depth=2).fit(X=X_train,y=y_train)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n", " max_features=None, max_leaf_nodes=None,\n", " min_impurity_decrease=0.0, 
min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n", " splitter='best')" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "enc1.tree_model" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data1 = enc1.transform(data)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Survived Pclass Sex Age SibSp Fare Fare_tree_discret\n", "0 0 3 male 22.0 1 7.2500 0.107143\n", "1 1 1 female 38.0 1 71.2833 0.442308\n", "2 1 3 female 26.0 0 7.9250 0.255319\n", "3 1 1 female 35.0 1 53.1000 0.442308\n", "4 0 3 male 35.0 0 8.0500 0.255319\n", "[0.10714286 0.44230769 0.25531915 0.74626866]\n" ] } ], "source": [ "# see how the new column Fare_tree_discret is distributed\n", "# the values correspond to the probability predicted by the tree\n", "print(data1.head(5))\n", "\n", "# the unique values of the discretized column\n", "print(data1.Fare_tree_discret.unique())" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Fare Fare\n", "Fare_tree_discret \n", "0.107143 0.0000 7.5208\n", "0.255319 7.5500 10.5167\n", "0.442308 11.1333 73.5000\n", "0.746269 75.2500 512.3292\n" ] } ], "source": [ "# see how the bins are cut\n", "# because we use a tree with a max depth of 2, the tree generates at most 2^2=4 bins (one per leaf)\n", "col='Fare'\n", "bins = pd.concat([data1.groupby([col+'_tree_discret'])[col].min(),\n", " data1.groupby([col+'_tree_discret'])[col].max()], axis=1)\n", "print(bins)\n", "\n", "# all values from 0 to 7.5208 in the original variable 'Fare'\n", "# are given the new value 0.107143 in the new column 'Fare_tree_discret',\n", "# and so on" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 
Discretisation with Decision Tree with optimal depth search" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "result ROC-AUC for each depth\n", " depth roc_auc_mean roc_auc_std\n", "0 2 0.662132 0.026253\n", "1 3 0.647950 0.045010\n", "2 4 0.650984 0.035127\n", "3 5 0.651180 0.027663\n", "4 6 0.653961 0.037421\n", "5 7 0.643688 0.033513\n", "optimal_depth: [2]\n" ] } ], "source": [ "# search for the best depth from range 2-7\n", "# we see when depth=2 we get the best roc-auc mean\n", "enc2 = dc.DiscretizeByDecisionTree(col='Fare',max_depth=[2,3,4,5,6,7]).fit(X=X_train,y=y_train)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DecisionTreeClassifier(class_weight=None, criterion='gini',\n", " max_depth=array([2], dtype=int64), max_features=None,\n", " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", " min_impurity_split=None, min_samples_leaf=1,\n", " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", " presort=False, random_state=None, splitter='best')" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# using optimal depth=2 we train the model, same result as last one\n", "enc2.tree_model" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFareFare_tree_discret
003male22.017.25000.107143
111female38.0171.28330.442308
213female26.007.92500.255319
311female35.0153.10000.442308
403male35.008.05000.255319
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare Fare_tree_discret\n", "0 0 3 male 22.0 1 7.2500 0.107143\n", "1 1 1 female 38.0 1 71.2833 0.442308\n", "2 1 3 female 26.0 0 7.9250 0.255319\n", "3 1 1 female 35.0 1 53.1000 0.442308\n", "4 0 3 male 35.0 0 8.0500 0.255319" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data2 = enc2.transform(data)\n", "data2.head(5)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Discretisation with ChiMerge\n", "supervised hierarchical bottom-up (merge) method that locally exploits the chi-square criterion to decide whether two adjacent intervals are similar enough to be merged" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Interval for variable Fare\n", " variable interval flag_0 flag_1\n", "0 Fare -inf,7.875 94.0 28.0\n", "1 Fare 7.875,7.8792 0.0 3.0\n", "2 Fare 7.8792,7.8958 25.0 1.0\n", "3 Fare 7.8958,73.5 245.0 160.0\n", "4 Fare 73.5+ 17.0 50.0\n" ] } ], "source": [ "enc3 = dc.ChiMerge(col='Fare',num_of_bins=5).fit(X=X_train,y='Survived')" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[-0.1, 7.875, 7.8792, 7.8958, 73.5, 512.3292]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# the bins boundary created by ChiMerge\n", "\n", "enc3.bins" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "collapsed": true }, "outputs": [], "source": [ "data3 = enc3.transform(data)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Survived Pclass Sex Age SibSp Fare Fare_chimerge\n", "0 0 3 male 22.0 1 7.2500 (-0.101, 7.875]\n", "1 1 1 female 38.0 1 71.2833 (7.896, 73.5]\n", "2 1 3 female 26.0 0 7.9250 (7.896, 73.5]\n", "3 1 1 female 35.0 1 53.1000 (7.896, 
73.5]\n", "4 0 3 male 35.0 0 8.0500 (7.896, 73.5]\n" ] } ], "source": [ "print(data3.head(5))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(-0.101, 7.875], (7.896, 73.5], (73.5, 512.329], (7.875, 7.879], (7.879, 7.896]]\n", "Categories (5, interval[float64]): [(-0.101, 7.875] < (7.875, 7.879] < (7.879, 7.896] < (7.896, 73.5] < (73.5, 512.329]]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# all values are grouped into 5 intervals\n", "data3.Fare_chimerge.unique()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }