File size: 23,783 Bytes

0ab7b0c

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "# import seaborn as sns\n",
    "# import matplotlib.pyplot as plt\n",
    "import os\n",
    "from sklearn.model_selection import train_test_split\n",
    "from feature_engineering import discretization as dc\n",
    "\n",
    "# plt.style.use('seaborn-colorblind')\n",
    "# %matplotlib inline\n",
    "#from feature_cleaning import rare_values as ra"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "use_cols = [\n",
    "    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
    "    'Survived'\n",
    "]\n",
    "\n",
    "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7.2500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>71.2833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass     Sex   Age  SibSp     Fare\n",
       "0         0       3    male  22.0      1   7.2500\n",
       "1         1       1  female  38.0      1  71.2833\n",
       "2         1       3  female  26.0      0   7.9250"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((623, 6), (268, 6))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Note that we include target variable in the X_train \n",
    "# because we need it to supervise our discretization\n",
    "# this is not the standard way of using train-test-split\n",
    "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
    "                                                    random_state=0)\n",
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Equal width binning\n",
    "divides the scope of possible values into N bins of the same width"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import KBinsDiscretizer\n",
    "enc_equal_width = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform').fit(X_train[['Fare']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([array([  0.    , 170.7764, 341.5528, 512.3292])], dtype=object)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# equal width for every bins\n",
    "enc_equal_width.bin_edges_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    610\n",
       "1.0     11\n",
       "2.0      2\n",
       "Name: 0, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = enc_equal_width.transform(X_train[['Fare']])\n",
    "pd.DataFrame(result)[0].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Survived  Pclass     Sex   Age  SibSp      Fare  Fare_equal_width\n",
      "857         1       1    male  51.0      0   26.5500               0.0\n",
      "52          1       1  female  49.0      1   76.7292               0.0\n",
      "386         0       3    male   1.0      5   46.9000               0.0\n",
      "124         0       1    male  54.0      0   77.2875               0.0\n",
      "578         0       3  female   NaN      1   14.4583               0.0\n",
      "549         1       2    male   8.0      1   36.7500               0.0\n",
      "118         0       1    male  24.0      0  247.5208               1.0\n",
      "12          0       3    male  20.0      0    8.0500               0.0\n",
      "157         0       3    male  30.0      0    8.0500               0.0\n",
      "127         1       3    male  24.0      0    7.1417               0.0\n"
     ]
    }
   ],
   "source": [
    "# add the new discretized variable\n",
    "X_train_copy = X_train.copy(deep=True)\n",
    "X_train_copy['Fare_equal_width'] = enc_equal_width.transform(X_train[['Fare']])\n",
    "print(X_train_copy.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Equal frequency binning\n",
    "divides the scope of possible values of the variable into N bins, \n",
    "where each bin carries the same amount of observations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "enc_equal_freq = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='quantile').fit(X_train[['Fare']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([array([  0.        ,   8.69303333,  26.2875    , 512.3292    ])],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check the bin edges\n",
    "enc_equal_freq.bin_edges_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2.0    209\n",
       "0.0    208\n",
       "1.0    206\n",
       "Name: 0, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# equal number of case for every bins\n",
    "result = enc_equal_freq.transform(X_train[['Fare']])\n",
    "pd.DataFrame(result)[0].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Survived  Pclass     Sex   Age  SibSp      Fare  Fare_equal_freq\n",
      "857         1       1    male  51.0      0   26.5500              2.0\n",
      "52          1       1  female  49.0      1   76.7292              2.0\n",
      "386         0       3    male   1.0      5   46.9000              2.0\n",
      "124         0       1    male  54.0      0   77.2875              2.0\n",
      "578         0       3  female   NaN      1   14.4583              1.0\n",
      "549         1       2    male   8.0      1   36.7500              2.0\n",
      "118         0       1    male  24.0      0  247.5208              2.0\n",
      "12          0       3    male  20.0      0    8.0500              0.0\n",
      "157         0       3    male  30.0      0    8.0500              0.0\n",
      "127         1       3    male  24.0      0    7.1417              0.0\n"
     ]
    }
   ],
   "source": [
    "# add the new discretized variable\n",
    "X_train_copy = X_train.copy(deep=True)\n",
    "X_train_copy['Fare_equal_freq'] = enc_equal_freq.transform(X_train[['Fare']])\n",
    "print(X_train_copy.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## K-means binning\n",
    "using k-means to partition values into clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "enc_kmeans = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='kmeans').fit(X_train[['Fare']])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([array([  0.        ,  93.5271531 , 338.08506324, 512.3292    ])],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check the bin edges\n",
    "enc_kmeans.bin_edges_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    587\n",
       "1.0     34\n",
       "2.0      2\n",
       "Name: 0, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result = enc_kmeans.transform(X_train[['Fare']])\n",
    "pd.DataFrame(result)[0].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Survived  Pclass     Sex   Age  SibSp      Fare  Fare_kmeans\n",
      "857         1       1    male  51.0      0   26.5500          0.0\n",
      "52          1       1  female  49.0      1   76.7292          0.0\n",
      "386         0       3    male   1.0      5   46.9000          0.0\n",
      "124         0       1    male  54.0      0   77.2875          0.0\n",
      "578         0       3  female   NaN      1   14.4583          0.0\n",
      "549         1       2    male   8.0      1   36.7500          0.0\n",
      "118         0       1    male  24.0      0  247.5208          1.0\n",
      "12          0       3    male  20.0      0    8.0500          0.0\n",
      "157         0       3    male  30.0      0    8.0500          0.0\n",
      "127         1       3    male  24.0      0    7.1417          0.0\n"
     ]
    }
   ],
   "source": [
    "# add the new discretized variable\n",
    "X_train_copy = X_train.copy(deep=True)\n",
    "X_train_copy['Fare_kmeans'] = enc_kmeans.transform(X_train[['Fare']])\n",
    "print(X_train_copy.head(10))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Discretisation with Decision Tree\n",
    "using a decision tree to identify the optimal splitting points that would determine the bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "enc1 = dc.DiscretizeByDecisionTree(col='Fare',max_depth=2).fit(X=X_train,y=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n",
       "            max_features=None, max_leaf_nodes=None,\n",
       "            min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "            min_samples_leaf=1, min_samples_split=2,\n",
       "            min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n",
       "            splitter='best')"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "enc1.tree_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data1 = enc1.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Survived  Pclass     Sex   Age  SibSp     Fare  Fare_tree_discret\n",
      "0         0       3    male  22.0      1   7.2500           0.107143\n",
      "1         1       1  female  38.0      1  71.2833           0.442308\n",
      "2         1       3  female  26.0      0   7.9250           0.255319\n",
      "3         1       1  female  35.0      1  53.1000           0.442308\n",
      "4         0       3    male  35.0      0   8.0500           0.255319\n",
      "[0.10714286 0.44230769 0.25531915 0.74626866]\n"
     ]
    }
   ],
   "source": [
    "# see how the new column Fare_tree_discret is distributed\n",
    "# the values are corresponding to the proba of the prediction by the tree\n",
    "print(data1.head(5))\n",
    "\n",
    "# the unique value of the discretisized column\n",
    "print(data1.Fare_tree_discret.unique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                      Fare      Fare\n",
      "Fare_tree_discret                   \n",
      "0.107143            0.0000    7.5208\n",
      "0.255319            7.5500   10.5167\n",
      "0.442308           11.1333   73.5000\n",
      "0.746269           75.2500  512.3292\n"
     ]
    }
   ],
   "source": [
    "# see how the bins are cut\n",
    "# because we use a tree with max-depth of 2, we have at most 2*2=4 bins generated by the tree\n",
    "col='Fare'\n",
    "bins = pd.concat([data1.groupby([col+'_tree_discret'])[col].min(),\n",
    "                  data1.groupby([col+'_tree_discret'])[col].max()], axis=1)\n",
    "print(bins)\n",
    "\n",
    "# all values between 0 to 7.5208 in the original variable 'Fare' \n",
    "# are given new value 0.107143 in the new column 'Fare_tree_discret'\n",
    "# and so on"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Discretisation with Decision Tree with optimal depth search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "result ROC-AUC for each depth\n",
      "   depth  roc_auc_mean  roc_auc_std\n",
      "0      2      0.662132     0.026253\n",
      "1      3      0.647950     0.045010\n",
      "2      4      0.650984     0.035127\n",
      "3      5      0.651180     0.027663\n",
      "4      6      0.653961     0.037421\n",
      "5      7      0.643688     0.033513\n",
      "optimal_depth: [2]\n"
     ]
    }
   ],
   "source": [
    "# search for the best depth from range 2-7\n",
    "# we see when depth=2 we get the best roc-auc mean\n",
    "enc2 = dc.DiscretizeByDecisionTree(col='Fare',max_depth=[2,3,4,5,6,7]).fit(X=X_train,y=y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DecisionTreeClassifier(class_weight=None, criterion='gini',\n",
       "            max_depth=array([2], dtype=int64), max_features=None,\n",
       "            max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
       "            min_impurity_split=None, min_samples_leaf=1,\n",
       "            min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
       "            presort=False, random_state=None, splitter='best')"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# using optimal depth=2 we train the model, same result as last one\n",
    "enc2.tree_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Fare_tree_discret</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>0.107143</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>0.442308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>0.255319</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>0.442308</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>0.255319</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass     Sex   Age  SibSp     Fare  Fare_tree_discret\n",
       "0         0       3    male  22.0      1   7.2500           0.107143\n",
       "1         1       1  female  38.0      1  71.2833           0.442308\n",
       "2         1       3  female  26.0      0   7.9250           0.255319\n",
       "3         1       1  female  35.0      1  53.1000           0.442308\n",
       "4         0       3    male  35.0      0   8.0500           0.255319"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data2 = enc2.transform(data)\n",
    "data2.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Discretisation with ChiMerge\n",
    "supervised hierarchical bottom-up (merge) method that locally exploits the chi-square criterion to decide whether two adjacent intervals are similar enough to be merged"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Interval for variable Fare\n",
      "  variable       interval  flag_0  flag_1\n",
      "0     Fare     -inf,7.875    94.0    28.0\n",
      "1     Fare   7.875,7.8792     0.0     3.0\n",
      "2     Fare  7.8792,7.8958    25.0     1.0\n",
      "3     Fare    7.8958,73.5   245.0   160.0\n",
      "4     Fare          73.5+    17.0    50.0\n"
     ]
    }
   ],
   "source": [
    "enc3 = dc.ChiMerge(col='Fare',num_of_bins=5).fit(X=X_train,y='Survived')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[-0.1, 7.875, 7.8792, 7.8958, 73.5, 512.3292]"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the bins boundary created by ChiMerge\n",
    "\n",
    "enc3.bins"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "data3 = enc3.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Survived  Pclass     Sex   Age  SibSp     Fare    Fare_chimerge\n",
      "0         0       3    male  22.0      1   7.2500  (-0.101, 7.875]\n",
      "1         1       1  female  38.0      1  71.2833    (7.896, 73.5]\n",
      "2         1       3  female  26.0      0   7.9250    (7.896, 73.5]\n",
      "3         1       1  female  35.0      1  53.1000    (7.896, 73.5]\n",
      "4         0       3    male  35.0      0   8.0500    (7.896, 73.5]\n"
     ]
    }
   ],
   "source": [
    "print(data3.head(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(-0.101, 7.875], (7.896, 73.5], (73.5, 512.329], (7.875, 7.879], (7.879, 7.896]]\n",
       "Categories (5, interval[float64]): [(-0.101, 7.875] < (7.875, 7.879] < (7.879, 7.896] < (7.896, 73.5] < (73.5, 512.329]]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# all values are grouped into 5 intervals\n",
    "data3.Fare_chimerge.unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}