# github: https://github.com/niteshsukhwani/MLSMOTE.git
# -*- coding: utf-8 -*-
"""MLSMOTE: synthetic minority over-sampling for multi-label data sets."""

# Importing required Library
import random

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors


def create_dataset(n_sample=1000):
    """
    Create an unevenly distributed sample data set for multilabel
    classification using the make_classification function.

    args
    n_sample: int, number of samples to be created

    return
    X: pandas.DataFrame, feature vector dataframe with 10 features
    y: pandas.DataFrame, one-hot (0/1 integer) target dataframe with one
       column per class, named class_0 ... class_4
    """
    n_classes = 5
    X, y = make_classification(
        n_classes=n_classes,
        class_sep=2,
        # NOTE(review): these weights sum to more than 1; they are kept as
        # they were - confirm that this is intended.
        weights=[0.1, 0.025, 0.205, 0.008, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=10,
        n_clusters_per_class=1,
        n_samples=n_sample,
        random_state=10,
    )
    # A categorical with explicit categories keeps one column per class even
    # when a rare class received no sample; dtype=int gives 0/1 labels
    # regardless of the pandas version.
    labels = pd.Series(pd.Categorical(y, categories=range(n_classes)))
    y = pd.get_dummies(labels, prefix="class", dtype=int)
    return pd.DataFrame(X), y


def get_tail_label(df):
    """
    Give the tail label columns of the given target dataframe.

    A label is a tail label when its imbalance ratio (count of the most
    frequent label divided by the count of this label) is greater than the
    mean imbalance ratio. Labels without any positive instance are skipped:
    their ratio is undefined and there is nothing to over-sample from.

    args
    df: pandas.DataFrame, target label df whose tail labels have to be
        identified (binary 0/1 or boolean columns)

    return
    tail_label: list, a list containing the column names of all tail labels
    """
    # Number of positive (== 1) instances per label.
    counts = (df == 1).sum(axis=0).to_numpy(dtype=float)
    present = counts > 0
    if not present.any():
        return []
    irpl = counts[present].max() / counts[present]  # imbalance ratio per label
    mir = irpl.mean()  # mean imbalance ratio
    columns = df.columns[present]
    return [column for column, ratio in zip(columns, irpl) if ratio > mir]


def get_index(df):
    """
    Give the index of all rows that carry at least one tail label.

    args
    df: pandas.DataFrame, target label df from which the index of the tail
        label rows has to be identified

    return
    index: list, a list containing the (unique) index labels of all rows
           with a tail label
    """
    tail_labels = get_tail_label(df)
    if not tail_labels:
        return []
    has_tail_label = (df[tail_labels] == 1).any(axis=1)
    return list(df.index[has_tail_label].unique())


def get_minority_instace(X, y):
    """
    Give the minority dataframes containing all rows with a tail label.

    args
    X: pandas.DataFrame, the feature vector dataframe
    y: pandas.DataFrame, the target vector dataframe

    return
    X_sub: pandas.DataFrame, the feature vector minority dataframe
    y_sub: pandas.DataFrame, the target vector minority dataframe
    Both results get a fresh 0..n-1 index.
    """
    index = get_index(y)
    X_sub = X[X.index.isin(index)].reset_index(drop=True)
    y_sub = y[y.index.isin(index)].reset_index(drop=True)
    return X_sub, y_sub


# Correctly spelt alias; the original (misspelt) name is kept for callers.
get_minority_instance = get_minority_instace


def nearest_neighbour(X, n_neighbors=5):
    """
    Give the positions of the nearest neighbours of every instance.

    args
    X: np.array, array whose nearest neighbours have to be found
    n_neighbors: int, number of neighbours to return per instance; clamped
                 to the number of rows of X so small inputs do not fail

    return
    indices: np.ndarray of shape (len(X), n_neighbors), row positions of the
             nearest neighbours of each element of X. The query points are
             the fitted points themselves, so the first column is normally
             the instance itself (exact duplicates may change the order).
    """
    n_neighbors = min(n_neighbors, len(X))
    nbs = NearestNeighbors(
        n_neighbors=n_neighbors, metric="euclidean", algorithm="kd_tree"
    ).fit(X)
    _, indices = nbs.kneighbors(X)
    return indices


def MLSMOTE(X, y, n_sample):
    """
    Give the augmented data using the MLSMOTE algorithm.

    Each synthetic instance is built from a randomly chosen reference row
    and one of its nearest neighbours: the features are interpolated on the
    segment between the two, and a label is set when a strict majority of
    the neighbourhood (reference included) carries it.

    Randomness comes from the standard ``random`` module, so call
    ``random.seed`` beforehand for reproducible results.

    args
    X: pandas.DataFrame, input (feature) vector DataFrame, numeric columns
    y: pandas.DataFrame, target vector dataframe (binary labels), with the
       same number of rows as X
    n_sample: int, number of newly generated samples

    return
    new_X: pandas.DataFrame, augmented feature vector data (original rows
           followed by the synthetic ones, re-indexed 0..n-1)
    target: pandas.DataFrame, augmented target vector data, row-aligned
            with new_X

    raises
    ValueError: if X and y differ in length or hold fewer than two rows
    """
    if len(X) != len(y):
        raise ValueError("X and y must contain the same number of rows")
    if len(X) < 2:
        raise ValueError("MLSMOTE needs at least two instances")

    # Work on plain arrays: the neighbour search returns row positions, so
    # every lookup has to be positional and must not depend on the index
    # labels of X and y.
    features = X.to_numpy(dtype=float)
    label_values = y.to_numpy(dtype=float)

    indices2 = nearest_neighbour(features)
    n = len(indices2)
    new_X = np.zeros((n_sample, X.shape[1]))
    target = np.zeros((n_sample, y.shape[1]))
    for i in range(n_sample):
        reference = random.randint(0, n - 1)
        neighbourhood = indices2[reference]
        # Column 0 is skipped because it normally holds the reference itself.
        neighbour = random.choice(neighbourhood[1:])
        # A label is kept when more than half of the neighbourhood has it
        # (for 5 neighbours this is the usual "more than 2" rule).
        votes = label_values[neighbourhood].sum(axis=0)
        target[i] = votes > len(neighbourhood) / 2
        ratio = random.random()
        # Interpolate from the reference towards the neighbour.
        gap = features[neighbour] - features[reference]
        new_X[i] = features[reference] + ratio * gap
    new_X = pd.DataFrame(new_X, columns=X.columns)
    target = pd.DataFrame(target, columns=y.columns)
    # ignore_index avoids duplicated index labels in the augmented frames.
    new_X = pd.concat([X, new_X], axis=0, ignore_index=True)
    target = pd.concat([y, target], axis=0, ignore_index=True)
    return new_X, target


# Keep original MLSMOTE function name for direct use
if __name__ == "__main__":
    # Example of how to use MLSMOTE
    X, y = create_dataset()  # Creating a dataframe
    X_sub, y_sub = get_minority_instace(X, y)  # Minority instances of that dataframe
    X_res, y_res = MLSMOTE(X_sub, y_sub, 100)  # Applying MLSMOTE to augment the dataframe