File size: 14,556 Bytes

0ab7b0c

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# from warnings import warn

# 2018.11.17 Created by Eamon.Zhang
# ChiMerge method modeified from https://github.com/tatsumiw/ChiMerge/blob/master/ChiMerge.py
# TODO: add more constraits to the discretized result.
class ChiMerge():
    """
    supervised discretization using the ChiMerge method.
    
    
    Parameters
    ----------
    confidenceVal: number
        default=3.841, correspond to p=0.05 dof=1
    num_of_bins: int
        number of bins after discretize
    col: str
        the column to be performed
        
    """
    
    def __init__(self, col=None, bins=None, confidenceVal=3.841, num_of_bins=10):
        self.col = col
        self._dim = None
        self.confidenceVal = confidenceVal
        self.bins = bins
        self.num_of_bins = num_of_bins


    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        Returns
        -------
        self : encoder
            Returns self.
        """

        self._dim = X.shape[1]

        _, bins = self.chimerge(
            X_in=X,
            y=y,
            confidenceVal=self.confidenceVal,
            col=self.col,
            num_of_bins=self.num_of_bins
        )
        self.bins = bins
        return self
    
    
    def transform(self, X):
            """Perform the transformation to new data.
            Will use the tree model and the column list to discretize the
            column.
            Parameters
            ----------
            X : array-like, shape = [n_samples, n_features]
            Returns
            -------
            X : new dataframe with discretized new column.
            """
    
            if self._dim is None:
                raise ValueError('Must train encoder before it can be used to transform data.')
    
            #  make sure that it is the right size
            if X.shape[1] != self._dim:
                raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
    
            X, _ = self.chimerge(
                X_in=X,
                col=self.col,
                bins=self.bins
            )
    
            return X 

    def chimerge(self, X_in, y=None, confidenceVal=None, num_of_bins=None, col=None, bins=None):
        """
        discretize a variable using ChiMerge

        """

        X = X_in.copy(deep=True)

        if bins is not None:  # transform
            try:
                X[col+'_chimerge'] = pd.cut(X[col],bins=bins,include_lowest=True)
            except Exception as e:
                print(e)
       
        else: # fit
            try:               
                # create an array which save the num of 0/1 samples of the column to be chimerge
                total_num = X.groupby([col])[y].count()
                total_num = pd.DataFrame({'total_num': total_num}) 
                positive_class = X.groupby([col])[y].sum()
                positive_class = pd.DataFrame({'positive_class': positive_class}) 
                regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True,how='inner')  
                regroup.reset_index(inplace=True)
                regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']  
                regroup = regroup.drop('total_num', axis=1)
                np_regroup = np.array(regroup)  
                # merge interval that have 0 pos/neg samples
                i = 0
                while (i <= np_regroup.shape[0] - 2):
                    if ((np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0) or ( np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0)):
                        np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i + 1, 1]  # pos
                        np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i + 1, 2]  # neg
                        np_regroup[i, 0] = np_regroup[i + 1, 0]
                        np_regroup = np.delete(np_regroup, i + 1, 0)
                        i = i - 1
                    i = i + 1
                # calculate chi for neighboring intervals
                # ∑[(yA-yB)²/yB]
                chi_table = np.array([])
                for i in np.arange(np_regroup.shape[0] - 1):
                    chi = (np_regroup[i, 1] * np_regroup[i + 1, 2] - np_regroup[i, 2] * np_regroup[i + 1, 1]) ** 2 \
                      * (np_regroup[i, 1] + np_regroup[i, 2] + np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) / \
                      ((np_regroup[i, 1] + np_regroup[i, 2]) * (np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) * (
                      np_regroup[i, 1] + np_regroup[i + 1, 1]) * (np_regroup[i, 2] + np_regroup[i + 1, 2]))
                    chi_table = np.append(chi_table, chi)
                # merge intervals that have closing chi
                while (1):
                    if (len(chi_table) <= (num_of_bins - 1) and min(chi_table) >= confidenceVal):
                        break
                    chi_min_index = np.argwhere(chi_table == min(chi_table))[0]  
                    np_regroup[chi_min_index, 1] = np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]
                    np_regroup[chi_min_index, 2] = np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]
                    np_regroup[chi_min_index, 0] = np_regroup[chi_min_index + 1, 0]
                    np_regroup = np.delete(np_regroup, chi_min_index + 1, 0)
        
                    if (chi_min_index == np_regroup.shape[0] - 1): 
                        chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
                                                       * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
                                                   ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
                        chi_table = np.delete(chi_table, chi_min_index, axis=0)
        
                    else:
                        chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
                                                   * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
                                                   ((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
                        chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index + 1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index + 1, 1]) ** 2 \
                                                   * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) / \
                                               ((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]))
                        chi_table = np.delete(chi_table, chi_min_index + 1, axis=0)
                result_data = pd.DataFrame()
                result_data['variable'] = [col] * np_regroup.shape[0]
                bins = []
                tmp = []
                for i in np.arange(np_regroup.shape[0]):
                    if i == 0:
                        y = '-inf' + ',' + str(np_regroup[i, 0])
                        #x = np_regroup[i, 0]
                        #list_temp.append(x)
                    elif i == np_regroup.shape[0] - 1:
                        y = str(np_regroup[i - 1, 0]) + '+'
                        #x = 100000000.
                        #list_temp.append(x)
                    else:
                        y = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
                        #x = np_regroup[i, 0]
                        #list_temp.append(x)
                    bins.append(np_regroup[i - 1, 0])
                    tmp.append(y)
                
                #list_temp.append(df[variable].max()+0.1)
                bins.append(X[col].min()-0.1)
                
                result_data['interval'] = tmp  
                result_data['flag_0'] = np_regroup[:, 2] 
                result_data['flag_1'] = np_regroup[:, 1]  
                bins.sort(reverse=False)
                print('Interval for variable %s' % col)
                print(result_data)
                
            except Exception as e:
                print(e)
        
        return X, bins
        
        
        
        
# 2018.11.15 Created by Eamon.Zhang
class DiscretizeByDecisionTree():
    """
    Discretisation with Decision Trees consists of using a decision tree 
    to identify the optimal splitting points that would determine the bins 
    or contiguous intervals:  
        
    1.train a decision tree of limited depth (2, 3 or 4) using the variable 
    we want to discretise to predict the target.
    2.the original variable values are then replaced by the 
    probability returned by the tree.

    Parameters
    ----------
    col: str
      column to discretise
    max_depth: int or list of int
      max depth of the tree. Can be an int or a list of int we want the tree model to search 
      for the optimal depth.
    
    """

    def __init__(self, col=None, max_depth=None, tree_model=None):
        self.col = col
        self._dim = None
        self.max_depth = max_depth
        self.tree_model = tree_model


    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.
        Returns
        -------
        self : encoder
            Returns self.
        """

        self._dim = X.shape[1]

        _, tree = self.discretize(
            X_in=X,
            y=y,
            max_depth=self.max_depth,
            col=self.col,
            tree_model=self.tree_model
        )
        self.tree_model = tree
        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.
        Will use the tree model and the column list to discretize the
        column.
        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
        Returns
        -------
        X : new dataframe with discretized new column.
        """

        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        #  make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))

        X, _ = self.discretize(
            X_in=X,
            col=self.col,
            tree_model=self.tree_model
        )

        return X 


    def discretize(self, X_in, y=None, max_depth=None, tree_model=None, col=None):
        """
        discretize a variable using DecisionTreeClassifier

        """

        X = X_in.copy(deep=True)

        if tree_model is not None:  # transform
            X[col+'_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:,1]

        else: # fit
            if isinstance(max_depth,int):
                tree_model = DecisionTreeClassifier(max_depth=max_depth)
                tree_model.fit(X[col].to_frame(), y)
                # X[col+'_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:,1]
                #print(x.tree_discret.unique())
#                bins = pd.concat( [X.groupby([col+'_tree_discret'])[col].min(),
#                                  X.groupby([col+'_tree_discret'])[col].max()], axis=1)
#                print('bins:')            
#                print(bins)
            
            elif len(max_depth)>1:
                score_ls = [] # here I will store the roc auc
                score_std_ls = [] # here I will store the standard deviation of the roc_auc
                for tree_depth in max_depth:
                    tree_model = DecisionTreeClassifier(max_depth=tree_depth)
                    scores = cross_val_score(tree_model, X[col].to_frame(), y, cv=3, scoring='roc_auc')
                    score_ls.append(np.mean(scores))
                    score_std_ls.append(np.std(scores))
                temp = pd.concat([pd.Series(max_depth), pd.Series(score_ls), pd.Series(score_std_ls)], axis=1)
                temp.columns = ['depth', 'roc_auc_mean', 'roc_auc_std']
                print('result ROC-AUC for each depth')
                print(temp)
                max_roc = temp.roc_auc_mean.max()
                optimal_depth=temp[temp.roc_auc_mean==max_roc]['depth'].values
                print('optimal_depth:',optimal_depth)
                tree_model = DecisionTreeClassifier(max_depth=optimal_depth)
                tree_model.fit(X[col].to_frame(), y)
#                bins = pd.concat( [X.groupby([col+'_tree_discret'])[col].min(),
#                                  X.groupby([col+'_tree_discret'])[col].max()], axis=1)
#                print('bins:')            
#                print(bins)
            else:
                raise ValueError('max_depth of a tree must be an integer or a list')

        return X, tree_model