{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": true }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "# import seaborn as sns\n", "# import matplotlib.pyplot as plt\n", "import os\n", "# plt.style.use('seaborn-colorblind')\n", "# %matplotlib inline\n", "from feature_cleaning import rare_values as ra" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Load Dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Variable Pclass label proportion:\n", "3 0.551066\n", "1 0.242424\n", "2 0.206510\n", "Name: Pclass, dtype: float64\n", "Variable SibSp label proportion:\n", "0 0.682379\n", "1 0.234568\n", "2 0.031425\n", "4 0.020202\n", "3 0.017957\n", "8 0.007856\n", "5 0.005612\n", "Name: SibSp, dtype: float64\n" ] } ], "source": [ "use_cols = [\n", " 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n", " 'Survived'\n", "]\n", "\n", "# inspect the label distributions of columns Pclass and SibSp\n", "# SibSp has values 3, 8 and 5 that occur rarely (each under 2%); only 8 and 5 are under 1%\n", "# Pclass has 3 values, and none of them is under 20%\n", "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n", "for i in ['Pclass','SibSp']:\n", " print('Variable',i,'label proportion:')\n", " print(data[i].value_counts()/len(data))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Grouping into one new category\n", "Group the observations that carry rare labels into a single new category ('rare')." ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# create the encoder and fit with our data\n", "enc = ra.GroupingRareValues(cols=['Pclass','SibSp'],threshold=0.01).fit(data)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'col': 'Pclass', 'mapping': 3 3\n", "1 1\n", "2 2\n", "dtype: int64, 'data_type': 
dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n", "1 1\n", "2 2\n", "4 4\n", "3 3\n", "8 rare\n", "5 rare\n", "dtype: object, 'data_type': dtype('int64')}]\n" ] } ], "source": [ "# let's see the mapping\n", "# for SibSp, values 5 & 8 are encoded as 'rare' as they appear in less than 1% of rows (threshold=0.01)\n", "# for Pclass, nothing changed\n", "print(enc.mapping)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# perform transformation\n", "data2 = enc.transform(data)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 608\n", "1 209\n", "2 28\n", "4 18\n", "3 16\n", "rare 12\n", "Name: SibSp, dtype: int64\n" ] } ], "source": [ "# check the result\n", "print(data2.SibSp.value_counts())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Mode Imputation\n", "Replace the rare labels with the most frequent label." ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# create the encoder and fit with our data\n", "enc = ra.ModeImputation(cols=['Pclass','SibSp'],threshold=0.01).fit(data)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[{'col': 'Pclass', 'mapping': 3 3\n", "1 1\n", "2 2\n", "dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n", "1 1\n", "2 2\n", "4 4\n", "3 3\n", "8 0\n", "5 0\n", "dtype: int64, 'data_type': dtype('int64')}]\n" ] } ], "source": [ "# let's see the mapping\n", "# for SibSp, values 5 & 8 are encoded as 0, as label 0 is the most frequent label\n", "# for Pclass, nothing changed\n", "print(enc.mapping)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": true }, "outputs": [], "source": [ "# perform transformation\n", "data3 = enc.transform(data)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": 
{}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 620\n", "1 209\n", "2 28\n", "4 18\n", "3 16\n", "Name: SibSp, dtype: int64\n" ] } ], "source": [ "# check the result\n", "print(data3.SibSp.value_counts())" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.1" } }, "nbformat": 4, "nbformat_minor": 2 }