Spaces:
Build error
Build error
| """ | |
| Discretizers classes, to be used in lime_tabular | |
| """ | |
| import numpy as np | |
| import sklearn | |
| import sklearn.tree | |
| import scipy | |
| from sklearn.utils import check_random_state | |
| from abc import ABCMeta, abstractmethod | |
class BaseDiscretizer():
    """
    Abstract class - Build a class that inherits from this class to implement
    a custom discretizer.
    Method bins() is to be redefined in the child class, as it is the actual
    custom part of the discretizer.
    """

    __metaclass__ = ABCMeta  # abstract class

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
                 data_stats=None):
        """Initializer
        Args:
            data: numpy 2d array
            categorical_features: list of indices (ints) corresponding to the
                categorical columns. These features will not be discretized.
                Everything else will be considered continuous, and will be
                discretized.
            feature_names: list of names (strings) corresponding to the columns
                in the training data.
            labels: target labels, forwarded unchanged to bins(); only needed
                by subclasses whose binning is supervised (e.g. entropy-based)
            random_state: seed or RandomState used for the truncated-normal
                sampling performed by undiscretize()
            data_stats: must have 'means', 'stds', 'mins' and 'maxs', use this
                if you don't want these values to be computed from data
        """
        # Continuous features = every column index not declared categorical.
        self.to_discretize = ([x for x in range(data.shape[1])
                               if x not in categorical_features])
        self.data_stats = data_stats
        self.names = {}    # feature -> list of human-readable bin labels
        self.lambdas = {}  # feature -> callable mapping raw value(s) to bin index
        self.means = {}    # feature -> per-bin mean (used by undiscretize)
        self.stds = {}     # feature -> per-bin std (used by undiscretize)
        self.mins = {}     # feature -> per-bin lower boundary
        self.maxs = {}     # feature -> per-bin upper boundary
        self.random_state = check_random_state(random_state)

        # To override when implementing a custom binning
        bins = self.bins(data, labels)
        bins = [np.unique(x) for x in bins]

        # Read the stats from data_stats if exists
        if data_stats:
            self.means = self.data_stats.get("means")
            self.stds = self.data_stats.get("stds")
            self.mins = self.data_stats.get("mins")
            self.maxs = self.data_stats.get("maxs")

        for feature, qts in zip(self.to_discretize, bins):
            n_bins = qts.shape[0]  # Actually number of borders (= #bins-1)
            boundaries = np.min(data[:, feature]), np.max(data[:, feature])
            name = feature_names[feature]

            # Bin labels: "x <= q0", "q0 < x <= q1", ..., "x > q_last".
            self.names[feature] = ['%s <= %.2f' % (name, qts[0])]
            for i in range(n_bins - 1):
                self.names[feature].append('%.2f < %s <= %.2f' %
                                           (qts[i], name, qts[i + 1]))
            self.names[feature].append('%s > %.2f' % (name, qts[n_bins - 1]))

            # qts is bound as a default argument so each closure keeps its own
            # boundaries instead of sharing the loop variable (late binding).
            self.lambdas[feature] = lambda x, qts=qts: np.searchsorted(qts, x)
            discretized = self.lambdas[feature](data[:, feature])

            # If data stats are provided no need to compute the below set of details
            if data_stats:
                continue

            self.means[feature] = []
            self.stds[feature] = []
            for x in range(n_bins + 1):
                selection = data[discretized == x, feature]
                mean = 0 if len(selection) == 0 else np.mean(selection)
                self.means[feature].append(mean)
                std = 0 if len(selection) == 0 else np.std(selection)
                # Tiny epsilon keeps later divisions by std well-defined,
                # including for empty bins where std would be exactly 0.
                std += 0.00000000001
                self.stds[feature].append(std)
            self.mins[feature] = [boundaries[0]] + qts.tolist()
            self.maxs[feature] = qts.tolist() + [boundaries[1]]

    def bins(self, data, labels):
        """
        To be overridden
        Returns for each feature to discretize the boundaries
        that form each bin of the discretizer
        """
        raise NotImplementedError("Must override bins() method")

    def discretize(self, data):
        """Discretizes the data.
        Args:
            data: numpy 2d or 1d array
        Returns:
            numpy array of same dimension, discretized.
        """
        ret = data.copy()
        for feature in self.lambdas:
            if len(data.shape) == 1:
                # 1d input: a single sample; replace the scalar in place.
                ret[feature] = int(self.lambdas[feature](ret[feature]))
            else:
                ret[:, feature] = self.lambdas[feature](
                    ret[:, feature]).astype(int)
        return ret

    def get_undiscretize_values(self, feature, values):
        """Sample raw values for the given bin indices of one feature.

        Each sample is drawn from a normal distribution fitted to the bin
        (per-bin mean/std) truncated to the bin's [min, max] boundaries.
        """
        mins = np.array(self.mins[feature])[values]
        maxs = np.array(self.maxs[feature])[values]
        means = np.array(self.means[feature])[values]
        stds = np.array(self.stds[feature])[values]

        # Convert the bin boundaries to z-scores for truncnorm.
        minz = (mins - means) / stds
        maxz = (maxs - means) / stds
        min_max_unequal = (minz != maxz)
        # NOTE(review): for degenerate bins (minz == maxz) the z-score itself is
        # returned, not the raw boundary value; this mirrors upstream LIME —
        # confirm before changing.
        ret = minz
        ret[np.where(min_max_unequal)] = scipy.stats.truncnorm.rvs(
            minz[min_max_unequal],
            maxz[min_max_unequal],
            loc=means[min_max_unequal],
            scale=stds[min_max_unequal],
            random_state=self.random_state
        )
        return ret

    def undiscretize(self, data):
        """Map discretized bin indices back to sampled raw values (inverse of
        discretize, up to random sampling within each bin)."""
        ret = data.copy()
        for feature in self.means:
            if len(data.shape) == 1:
                ret[feature] = self.get_undiscretize_values(
                    feature, ret[feature].astype(int).reshape(-1, 1)
                )
            else:
                ret[:, feature] = self.get_undiscretize_values(
                    feature, ret[:, feature].astype(int)
                )
        return ret
class StatsDiscretizer(BaseDiscretizer):
    """
    Class to be used to supply the data stats info when discretize_continuous is true
    """

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None,
                 data_stats=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state,
                                 data_stats=data_stats)

    def bins(self, data, labels):
        """Return per-feature bin boundaries read from data_stats['bins'].

        Features without recorded boundaries are skipped; if no 'bins' entry
        exists at all, an empty list is returned.
        """
        precomputed = self.data_stats.get("bins")
        if precomputed is None:
            return []
        return [np.array(precomputed.get(feature))
                for feature in self.to_discretize
                if precomputed.get(feature) is not None]
class QuartileDiscretizer(BaseDiscretizer):
    """Discretizer that bins each continuous feature at its quartiles."""

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """Return, for each continuous feature, its 25/50/75th percentiles."""
        quartiles = [25, 50, 75]
        return [np.array(np.percentile(data[:, feature], quartiles))
                for feature in self.to_discretize]
class DecileDiscretizer(BaseDiscretizer):
    """Discretizer that bins each continuous feature at its deciles."""

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """Return, for each continuous feature, its 10th..90th percentiles."""
        deciles = [10, 20, 30, 40, 50, 60, 70, 80, 90]
        return [np.array(np.percentile(data[:, feature], deciles))
                for feature in self.to_discretize]
class EntropyDiscretizer(BaseDiscretizer):
    """Discretizer that chooses bin boundaries via entropy-based splits of a
    shallow decision tree fit on the supplied labels (supervised binning)."""

    def __init__(self, data, categorical_features, feature_names, labels=None, random_state=None):
        if labels is None:
            # Fixed: the original raised this message with a backslash
            # line-continuation inside the string literal, which embedded a
            # long run of spaces in the runtime error text.
            raise ValueError('Labels must be not None when using '
                             'EntropyDiscretizer')
        BaseDiscretizer.__init__(self, data, categorical_features,
                                 feature_names, labels=labels,
                                 random_state=random_state)

    def bins(self, data, labels):
        """Return, for each continuous feature, the sorted split thresholds of
        a depth-3 entropy decision tree fit on (feature, labels).

        Falls back to a single median boundary when the tree makes no split
        (e.g. a constant feature).
        """
        bins = []
        for feature in self.to_discretize:
            # Entropy splitting / at most 8 bins so max_depth=3
            dt = sklearn.tree.DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=3,
                                                     random_state=self.random_state)
            x = np.reshape(data[:, feature], (-1, 1))
            dt.fit(x, labels)
            # Internal nodes (children_left > -1) carry the split thresholds;
            # leaves are marked with -1 children.
            qts = dt.tree_.threshold[np.where(dt.tree_.children_left > -1)]
            if qts.shape[0] == 0:
                qts = np.array([np.median(data[:, feature])])
            else:
                qts = np.sort(qts)
            bins.append(qts)
        return bins