{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import os\n", "plt.style.use('seaborn-colorblind')\n", "%matplotlib inline\n", "from feature_cleaning import missing_data as ms\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(891, 6)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFare
003male22.017.2500
111female38.0171.2833
213female26.007.9250
311female35.0153.1000
403male35.008.0500
503maleNaN08.4583
601male54.0051.8625
703male2.0321.0750
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare\n", "0 0 3 male 22.0 1 7.2500\n", "1 1 1 female 38.0 1 71.2833\n", "2 1 3 female 26.0 0 7.9250\n", "3 1 1 female 35.0 1 53.1000\n", "4 0 3 male 35.0 0 8.0500\n", "5 0 3 male NaN 0 8.4583\n", "6 0 1 male 54.0 0 51.8625\n", "7 0 3 male 2.0 3 21.0750" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "use_cols = [\n", " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n", " 'Survived'\n", "]\n", "\n", "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n", "print(data.shape)\n", "data.head(8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Missing value checking\n", "Check the total number and proportion of missing values\n", "per variable of a pandas DataFrame" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "result saved at ./output/ missing.csv\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
total missingproportion
Survived00.000000
Pclass00.000000
Sex00.000000
Age1770.198653
SibSp00.000000
Fare00.000000
\n", "
" ], "text/plain": [ " total missing proportion\n", "Survived 0 0.000000\n", "Pclass 0 0.000000\n", "Sex 0 0.000000\n", "Age 177 0.198653\n", "SibSp 0 0.000000\n", "Fare 0 0.000000" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# only the Age variable has missing values (177 cases in total)\n", "# the result is saved to the output dir (if given)\n", "\n", "ms.check_missing(data=data,output_path=r'./output/')" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Listwise deletion \n", "Excluding every case (listwise) that has a missing value" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(714, 6)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# the 177 cases that contain NA have been dropped\n", "data2 = ms.drop_missing(data=data)\n", "data2.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Add a variable to denote NA\n", "Creating an additional variable indicating whether the value was missing for that observation" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 714\n", "1 177\n", "Name: Age_is_NA, dtype: int64\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFareAge_is_NA
003male22.017.25000
111female38.0171.28330
213female26.007.92500
311female35.0153.10000
403male35.008.05000
503maleNaN08.45831
601male54.0051.86250
703male2.0321.07500
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare Age_is_NA\n", "0 0 3 male 22.0 1 7.2500 0\n", "1 1 1 female 38.0 1 71.2833 0\n", "2 1 3 female 26.0 0 7.9250 0\n", "3 1 1 female 35.0 1 53.1000 0\n", "4 0 3 male 35.0 0 8.0500 0\n", "5 0 3 male NaN 0 8.4583 1\n", "6 0 1 male 54.0 0 51.8625 0\n", "7 0 3 male 2.0 3 21.0750 0" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Age_is_NA is created, 0-not missing 1-missing for that observation\n", "data3 = ms.add_var_denote_NA(data=data,NA_col=['Age'])\n", "print(data3.Age_is_NA.value_counts())\n", "data3.head(8)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## Arbitrary Value Imputation\n", "Replacing the NA by arbitrary values" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFareAge_-999
003male22.017.250022.0
111female38.0171.283338.0
213female26.007.925026.0
311female35.0153.100035.0
403male35.008.050035.0
503maleNaN08.4583-999.0
601male54.0051.862554.0
703male2.0321.07502.0
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare Age_-999\n", "0 0 3 male 22.0 1 7.2500 22.0\n", "1 1 1 female 38.0 1 71.2833 38.0\n", "2 1 3 female 26.0 0 7.9250 26.0\n", "3 1 1 female 35.0 1 53.1000 35.0\n", "4 0 3 male 35.0 0 8.0500 35.0\n", "5 0 3 male NaN 0 8.4583 -999.0\n", "6 0 1 male 54.0 0 51.8625 54.0\n", "7 0 3 male 2.0 3 21.0750 2.0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data4 = ms.impute_NA_with_arbitrary(data=data,impute_value=-999,NA_col=['Age'])\n", "data4.head(8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mean/Median/Mode Imputation\n", "Replacing the NA by mean/median/mode of that variable" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "28.0\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFareAge_impute_median
003male22.017.250022.0
111female38.0171.283338.0
213female26.007.925026.0
311female35.0153.100035.0
403male35.008.050035.0
503maleNaN08.458328.0
601male54.0051.862554.0
703male2.0321.07502.0
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare Age_impute_median\n", "0 0 3 male 22.0 1 7.2500 22.0\n", "1 1 1 female 38.0 1 71.2833 38.0\n", "2 1 3 female 26.0 0 7.9250 26.0\n", "3 1 1 female 35.0 1 53.1000 35.0\n", "4 0 3 male 35.0 0 8.0500 35.0\n", "5 0 3 male NaN 0 8.4583 28.0\n", "6 0 1 male 54.0 0 51.8625 54.0\n", "7 0 3 male 2.0 3 21.0750 2.0" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(data.Age.median())\n", "data5 = ms.impute_NA_with_avg(data=data,strategy='median',NA_col=['Age'])\n", "data5.head(8)" ] }, { "cell_type": "markdown", "metadata": { "collapsed": true }, "source": [ "## End of distribution Imputation\n", "Replacing the NA with a value at the far end of the distribution of that variable,\n", "calculated as mean + 3 * std" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFareAge_impute_end_of_distri
003male22.017.250022.00000
111female38.0171.283338.00000
213female26.007.925026.00000
311female35.0153.100035.00000
403male35.008.050035.00000
503maleNaN08.458373.27861
601male54.0051.862554.00000
703male2.0321.07502.00000
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare Age_impute_end_of_distri\n", "0 0 3 male 22.0 1 7.2500 22.00000\n", "1 1 1 female 38.0 1 71.2833 38.00000\n", "2 1 3 female 26.0 0 7.9250 26.00000\n", "3 1 1 female 35.0 1 53.1000 35.00000\n", "4 0 3 male 35.0 0 8.0500 35.00000\n", "5 0 3 male NaN 0 8.4583 73.27861\n", "6 0 1 male 54.0 0 51.8625 54.00000\n", "7 0 3 male 2.0 3 21.0750 2.00000" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data6 = ms.impute_NA_with_end_of_distribution(data=data,NA_col=['Age'])\n", "data6.head(8)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Random Imputation\n", "replacing the NA with random sampling from the pool of available observations of the variable\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
SurvivedPclassSexAgeSibSpFareAge_random
003male22.017.250022.0
111female38.0171.283338.0
213female26.007.925026.0
311female35.0153.100035.0
403male35.008.050035.0
503maleNaN08.458328.0
601male54.0051.862554.0
703male2.0321.07502.0
\n", "
" ], "text/plain": [ " Survived Pclass Sex Age SibSp Fare Age_random\n", "0 0 3 male 22.0 1 7.2500 22.0\n", "1 1 1 female 38.0 1 71.2833 38.0\n", "2 1 3 female 26.0 0 7.9250 26.0\n", "3 1 1 female 35.0 1 53.1000 35.0\n", "4 0 3 male 35.0 0 8.0500 35.0\n", "5 0 3 male NaN 0 8.4583 28.0\n", "6 0 1 male 54.0 0 51.8625 54.0\n", "7 0 3 male 2.0 3 21.0750 2.0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data7 = ms.impute_NA_with_random(data=data,NA_col=['Age'])\n", "data7.head(8)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }