{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "# import seaborn as sns\n", "# import matplotlib.pyplot as plt\n", "import os\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import roc_curve, roc_auc_score\n", "\n", "# plt.style.use('seaborn-colorblind')\n", "# %matplotlib inline\n", "#from feature_cleaning import rare_values as ra" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "use_cols = [\n", " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n", " 'Survived'\n", "]\n", "\n", "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFare
003male22.017.2500
111female38.0171.2833
213female26.007.9250
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare\n", "0 0 3 male 22.0 1 7.2500\n", "1 1 1 female 38.0 1 71.2833\n", "2 1 3 female 26.0 0 7.9250" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head(3)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((623, 6), (268, 6))" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note that we include target variable in the X_train \n", "# because we need it to supervise our discretization\n", "# this is not the standard way of using train-test-split\n", "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n", " random_state=0)\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Polynomial Expansion\n", "\n", "generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Pclass SibSp Pclass^2 Pclass SibSp SibSp^2\n", "0 1.0 0.0 1.0 0.0 0.0\n", "1 1.0 1.0 1.0 1.0 1.0\n", "2 3.0 5.0 9.0 15.0 25.0\n", "3 1.0 0.0 1.0 0.0 0.0\n", "4 3.0 1.0 9.0 3.0 1.0\n", "5 2.0 1.0 4.0 2.0 1.0\n" ] } ], "source": [ "# create polynomial combinations of feature 'Pclass','SibSp' with degree 2\n", "from sklearn.preprocessing import PolynomialFeatures\n", "pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])\n", "tmp = pf.transform(X_train[['Pclass','SibSp']])\n", "X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))\n", "print(X_train_copy.head(6))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Learning by Trees\n", "GBDT derived feature + LR" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sample's belonging node of each base tree \n", "' [[ 7. 7. 6. ... 4. 7. 4.]\n", " [ 7. 7. 6. ... 14. 7. 7.]\n", " [11. 11. 11. ... 4. 6. 11.]\n", " ...\n", " [10. 10. 10. ... 4. 6. 10.]\n", " [13. 14. 13. ... 4. 7. 13.]\n", " [ 7. 7. 6. ... 6. 7. 7.]]\n", "AUC for GBDT derived feature + LR: 0.7746130952380953\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. 
Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", " warnings.warn(msg, FutureWarning)\n" ] } ], "source": [ "from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n", "from sklearn.preprocessing import OneHotEncoder\n", "\n", "gbdt = GradientBoostingClassifier(n_estimators=20)\n", "one_hot = OneHotEncoder()\n", "\n", "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n", "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n", "\n", "gbdt.fit(X_train, y_train)\n", "\n", "X_leaf_index = gbdt.apply(X_train)[:, :, 0] # apply return the node index on each tree \n", "print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n", "# fit one-hot encoder\n", "one_hot.fit(X_leaf_index) \n", "X_one_hot = one_hot.transform(X_leaf_index) \n", "\n", "\n", "from sklearn.linear_model import LogisticRegression\n", "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n", "lr.fit(X_one_hot,y_train)\n", "y_pred = lr.predict_proba(\n", " one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]\n", "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n", "print(\"AUC for GBDT derived feature + LR:\", roc_auc_score(y_test, y_pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Learning by Trees\n", "RandomForest derived feature + LR" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "sample's belonging node of each base tree \n", "' [[212 35 79 ... 146 60 46]\n", " [307 165 266 ... 136 132 44]\n", " [285 285 320 ... 301 294 300]\n", " ...\n", " [ 13 177 133 ... 186 169 117]\n", " [190 296 311 ... 282 289 297]\n", " [264 165 243 ... 152 110 314]]\n", "AUC for RandomForest derived feature + LR: 0.759672619047619\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. 
Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n", "If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n", "In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n", " warnings.warn(msg, FutureWarning)\n" ] } ], "source": [ "rf = RandomForestClassifier(n_estimators=20)\n", "one_hot = OneHotEncoder()\n", "\n", "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n", "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n", "\n", "rf.fit(X_train, y_train)\n", "\n", "X_leaf_index = rf.apply(X_train) # apply returns the leaf node index on each tree \n", "print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n", "# fit one-hot encoder\n", "one_hot.fit(X_leaf_index) \n", "X_one_hot = one_hot.transform(X_leaf_index) \n", "\n", "\n", "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n", "lr.fit(X_one_hot,y_train)\n", "y_pred = lr.predict_proba(\n", " one_hot.transform(rf.apply(X_test)))[:,1]\n", "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n", "print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Feature Learning by Trees\n", "GBDT derived feature + Raw feature + LR" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AUC for GBDT derived feature + Raw feature +LR: 0.7603571428571428\n" ] } ], "source": [ "from scipy.sparse import hstack\n", "\n", "X_train_ext = hstack([one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])\n", "X_test_ext = hstack([one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])\n", "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n", "lr.fit(X_train_ext,y_train)\n", "y_pred = lr.predict_proba(X_test_ext)[:,1]\n", "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n", "print(\"AUC for GBDT derived feature + Raw feature +LR:\", roc_auc_score(y_test, y_pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Learning by Trees\n", "RandomForest derived feature + Raw feature + LR" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AUC for RandomForest derived feature + Raw feature + LR: 0.76\n" ] } ], "source": [ "X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])\n", "X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])\n", "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n", "lr.fit(X_train_ext,y_train)\n", "y_pred = lr.predict_proba(X_test_ext)[:,1]\n", "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n", "print(\"AUC for RandomForest derived feature + Raw feature + LR:\", roc_auc_score(y_test, y_pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Learning by Trees\n", "Use only Raw Feature + LR" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AUC for Raw feature + LR: 0.6988690476190476\n" ] } ], "source": [ "lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n", "lr.fit(X_train,y_train)\n", "y_pred = lr.predict_proba(X_test)[:,1]\n", "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, 
y_pred)\n", "print(\"AUC for Raw feature + LR:\", roc_auc_score(y_test, y_pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Learning by Trees\n", "\n", "Use only Raw Feature + GBDT" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AUC for Raw feature + GBDT: 0.7613988095238096\n" ] } ], "source": [ "gbdt = GradientBoostingClassifier(n_estimators=20)\n", "\n", "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n", "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n", "\n", "gbdt.fit(X_train, y_train)\n", "y_pred = gbdt.predict_proba(X_test)[:,1]\n", "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n", "print(\"AUC for Raw feature + GBDT:\", roc_auc_score(y_test, y_pred))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Learning by Trees\n", "\n", "Use only Raw Feature + RF\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "AUC for Raw feature + RF: 0.7235119047619047\n" ] } ], "source": [ "rf = RandomForestClassifier(n_estimators=20)\n", "\n", "X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n", "X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n", "\n", "rf.fit(X_train, y_train)\n", "y_pred = rf.predict_proba(X_test)[:,1]\n", "fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n", "print(\"AUC for Raw feature + RF:\", roc_auc_score(y_test, y_pred))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Without tuning, we can see that GBDT derived feature + LR gets the best result (AUC ≈ 0.775)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }