{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "# import seaborn as sns\n", "# import matplotlib.pyplot as plt\n", "import os\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_selection import SelectFromModel\n", "from sklearn.ensemble import RandomForestClassifier\n", "# plt.style.use('seaborn-colorblind')\n", "# %matplotlib inline\n", "from feature_selection import feature_shuffle\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true }, "outputs": [], "source": [ "from sklearn.datasets import load_breast_cancer\n", "data = load_breast_cancer()\n", "data = pd.DataFrame(np.c_[data['data'], data['target']],\n", " columns= np.append(data['feature_names'], ['target']))" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
mean radiusmean texturemean perimetermean areamean smoothnessmean compactnessmean concavitymean concave pointsmean symmetrymean fractal dimension...worst textureworst perimeterworst areaworst smoothnessworst compactnessworst concavityworst concave pointsworst symmetryworst fractal dimensiontarget
017.9910.38122.801001.00.118400.277600.30010.147100.24190.07871...17.33184.602019.00.16220.66560.71190.26540.46010.118900.0
120.5717.77132.901326.00.084740.078640.08690.070170.18120.05667...23.41158.801956.00.12380.18660.24160.18600.27500.089020.0
219.6921.25130.001203.00.109600.159900.19740.127900.20690.05999...25.53152.501709.00.14440.42450.45040.24300.36130.087580.0
311.4220.3877.58386.10.142500.283900.24140.105200.25970.09744...26.5098.87567.70.20980.86630.68690.25750.66380.173000.0
420.2914.34135.101297.00.100300.132800.19800.104300.18090.05883...16.67152.201575.00.13740.20500.40000.16250.23640.076780.0
\n", "

5 rows × 31 columns

\n", "
" ], "text/plain": [ " mean radius mean texture mean perimeter mean area mean smoothness \\\n", "0 17.99 10.38 122.80 1001.0 0.11840 \n", "1 20.57 17.77 132.90 1326.0 0.08474 \n", "2 19.69 21.25 130.00 1203.0 0.10960 \n", "3 11.42 20.38 77.58 386.1 0.14250 \n", "4 20.29 14.34 135.10 1297.0 0.10030 \n", "\n", " mean compactness mean concavity mean concave points mean symmetry \\\n", "0 0.27760 0.3001 0.14710 0.2419 \n", "1 0.07864 0.0869 0.07017 0.1812 \n", "2 0.15990 0.1974 0.12790 0.2069 \n", "3 0.28390 0.2414 0.10520 0.2597 \n", "4 0.13280 0.1980 0.10430 0.1809 \n", "\n", " mean fractal dimension ... worst texture worst perimeter worst area \\\n", "0 0.07871 ... 17.33 184.60 2019.0 \n", "1 0.05667 ... 23.41 158.80 1956.0 \n", "2 0.05999 ... 25.53 152.50 1709.0 \n", "3 0.09744 ... 26.50 98.87 567.7 \n", "4 0.05883 ... 16.67 152.20 1575.0 \n", "\n", " worst smoothness worst compactness worst concavity worst concave points \\\n", "0 0.1622 0.6656 0.7119 0.2654 \n", "1 0.1238 0.1866 0.2416 0.1860 \n", "2 0.1444 0.4245 0.4504 0.2430 \n", "3 0.2098 0.8663 0.6869 0.2575 \n", "4 0.1374 0.2050 0.4000 0.1625 \n", "\n", " worst symmetry worst fractal dimension target \n", "0 0.4601 0.11890 0.0 \n", "1 0.2750 0.08902 0.0 \n", "2 0.3613 0.08758 0.0 \n", "3 0.6638 0.17300 0.0 \n", "4 0.2364 0.07678 0.0 \n", "\n", "[5 rows x 31 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.head(5)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((455, 30), (114, 30))" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n", " data.target, test_size=0.2,\n", " random_state=0)\n", "X_train.shape, X_test.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Feature Shuffling\n", "Permute the values of each feature, one at a time, 
and measure how much the permutation degrades the accuracy, the roc_auc, or the mse of the machine learning model.\n", "If a variable is important, that is, highly predictive, a random permutation of its values will dramatically worsen any of these metrics." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "auc_drop, selected_features = feature_shuffle.feature_shuffle_rf(X_train=X_train,\n", " y_train=y_train,\n", " random_state=0)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
featureauc_drop
22worst perimeter8.359457e-05
27worst concave points3.134796e-05
23worst area1.110223e-16
12perimeter error1.110223e-16
0mean radius0.000000e+00
16concavity error0.000000e+00
28worst symmetry0.000000e+00
26worst concavity0.000000e+00
25worst compactness0.000000e+00
24worst smoothness0.000000e+00
21worst texture0.000000e+00
20worst radius0.000000e+00
19fractal dimension error0.000000e+00
18symmetry error0.000000e+00
17concave points error0.000000e+00
15compactness error0.000000e+00
1mean texture0.000000e+00
14smoothness error0.000000e+00
13area error0.000000e+00
11texture error0.000000e+00
10radius error0.000000e+00
9mean fractal dimension0.000000e+00
8mean symmetry0.000000e+00
7mean concave points0.000000e+00
6mean concavity0.000000e+00
5mean compactness0.000000e+00
4mean smoothness0.000000e+00
3mean area0.000000e+00
2mean perimeter0.000000e+00
29worst fractal dimension0.000000e+00
\n", "
" ], "text/plain": [ " feature auc_drop\n", "22 worst perimeter 8.359457e-05\n", "27 worst concave points 3.134796e-05\n", "23 worst area 1.110223e-16\n", "12 perimeter error 1.110223e-16\n", "0 mean radius 0.000000e+00\n", "16 concavity error 0.000000e+00\n", "28 worst symmetry 0.000000e+00\n", "26 worst concavity 0.000000e+00\n", "25 worst compactness 0.000000e+00\n", "24 worst smoothness 0.000000e+00\n", "21 worst texture 0.000000e+00\n", "20 worst radius 0.000000e+00\n", "19 fractal dimension error 0.000000e+00\n", "18 symmetry error 0.000000e+00\n", "17 concave points error 0.000000e+00\n", "15 compactness error 0.000000e+00\n", "1 mean texture 0.000000e+00\n", "14 smoothness error 0.000000e+00\n", "13 area error 0.000000e+00\n", "11 texture error 0.000000e+00\n", "10 radius error 0.000000e+00\n", "9 mean fractal dimension 0.000000e+00\n", "8 mean symmetry 0.000000e+00\n", "7 mean concave points 0.000000e+00\n", "6 mean concavity 0.000000e+00\n", "5 mean compactness 0.000000e+00\n", "4 mean smoothness 0.000000e+00\n", "3 mean area 0.000000e+00\n", "2 mean perimeter 0.000000e+00\n", "29 worst fractal dimension 0.000000e+00" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# we select features that have auc_drop > 0\n", "auc_drop" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "22 worst perimeter\n", "27 worst concave points\n", "23 worst area\n", "12 perimeter error\n", "Name: feature, dtype: object" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "selected_features" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", 
"name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }