{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "# import seaborn as sns\n",
    "# import matplotlib.pyplot as plt\n",
    "import os\n",
    "# plt.style.use('seaborn-colorblind')\n",
    "# %matplotlib inline\n",
    "from feature_cleaning import rare_values as ra"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Variable Pclass label proportion:\n",
      "3    0.551066\n",
      "1    0.242424\n",
      "2    0.206510\n",
      "Name: Pclass, dtype: float64\n",
      "Variable SibSp label proportion:\n",
      "0    0.682379\n",
      "1    0.234568\n",
      "2    0.031425\n",
      "4    0.020202\n",
      "3    0.017957\n",
      "8    0.007856\n",
      "5    0.005612\n",
      "Name: SibSp, dtype: float64\n"
     ]
    }
   ],
   "source": [
    "use_cols = [\n",
    "    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
    "    'Survived'\n",
    "]\n",
    "\n",
    "# see column Pclass & SibSp's distributions\n",
    "# SibSp has values 3/8/5 that occur rarely, under 2%\n",
    "# Pclass has 3 values, but no one is under 20%\n",
    "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
    "for i in ['Pclass','SibSp']:\n",
    "    print('Variable',i,'label proportion:')\n",
    "    print(data[i].value_counts()/len(data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Grouping into one new category\n",
    "Grouping the observations that show rare labels into a unique category ('rare')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create the encoder and fit with our data\n",
    "enc = ra.GroupingRareValues(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'col': 'Pclass', 'mapping': 3    3\n",
      "1    1\n",
      "2    2\n",
      "dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0       0\n",
      "1       1\n",
      "2       2\n",
      "4       4\n",
      "3       3\n",
      "8    rare\n",
      "5    rare\n",
      "dtype: object, 'data_type': dtype('int64')}]\n"
     ]
    }
   ],
   "source": [
    "# let's see the mapping\n",
    "# for SibSp, values 5 & 8 are encoded as 'rare' as they appear less than 10%\n",
    "# for Pclass, nothing changed\n",
    "print(enc.mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# perform transformation\n",
    "data2 = enc.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0       608\n",
      "1       209\n",
      "2        28\n",
      "4        18\n",
      "3        16\n",
      "rare     12\n",
      "Name: SibSp, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# check the result\n",
    "print(data2.SibSp.value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mode Imputation\n",
    "Replacing the rare label by most frequent label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# create the encoder and fit with our data\n",
    "enc = ra.ModeImputation(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'col': 'Pclass', 'mapping': 3    3\n",
      "1    1\n",
      "2    2\n",
      "dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0    0\n",
      "1    1\n",
      "2    2\n",
      "4    4\n",
      "3    3\n",
      "8    0\n",
      "5    0\n",
      "dtype: int64, 'data_type': dtype('int64')}]\n"
     ]
    }
   ],
   "source": [
    "# let's see the mapping\n",
    "# for SibSp, values 5 & 8 are encoded as 0, as label 0 is the most frequent label\n",
    "# for Pclass, nothing changed\n",
    "print(enc.mapping)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# perform transformation\n",
    "data3 = enc.transform(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0    620\n",
      "1    209\n",
      "2     28\n",
      "4     18\n",
      "3     16\n",
      "Name: SibSp, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# check the result\n",
    "print(data3.SibSp.value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}