Spaces:
Runtime error
Runtime error
| # Copyright 2019 Seth V. Neel, Michael J. Kearns, Aaron L. Roth, Zhiwei Steven Wu | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); you may not | |
| # use this file except in compliance with the License. You may obtain a copy of | |
| # the License at http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software distributed | |
| # under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |
| # CONDITIONS OF ANY KIND, either express or implied. See the License for the | |
| # specific language governing permissions and limitations under the License. | |
| """Functions for manipulating and loading input data.""" | |
| import argparse | |
| import numpy as np | |
| import pandas as pd | |
| def setup(): | |
| parser = argparse.ArgumentParser(description='Fairness Data Cleaning') | |
| parser.add_argument( | |
| '-n', | |
| '--name', | |
| type=str, | |
| help='name of the to store the new datasets (Required)') | |
| parser.add_argument('-d', | |
| '--dataset', | |
| type=str, | |
| help='name of the original dataset file (Required)') | |
| parser.add_argument( | |
| '-a', | |
| '--attributes', | |
| type=str, | |
| help= | |
| 'name of the file representing which attributes are protected (unprotected = 0, protected = 1, label = 2) (Required)' | |
| ) | |
| parser.add_argument( | |
| '-c', | |
| '--centered', | |
| default=False, | |
| action='store_true', | |
| required=False, | |
| help='Include this flag to determine whether data should be centered') | |
| args = parser.parse_args() | |
| return [args.name, args.dataset, args.attributes, args.centered] | |
def clean_dataset(dataset, attributes, centered):
    """Clean a dataset, given the filename for the dataset and the filename for the attributes.
    Args:
        :param dataset: Filename for dataset. The dataset should be formatted such that categorical
        variables use one-hot encoding and the label should be 0/1
        :param attributes: Filename for the attributes of the dataset. The file should have each
        column name in a list, and under this list should have 0 for an unprotected attribute,
        1 for a protected attribute, and 2 for the attribute of the label.
        :param centered: boolean flag that determines whether to center the input covariates.
        :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels
    """
    raw = pd.read_csv(dataset)
    roles = pd.read_csv(attributes)

    # The attributes file marks exactly one column with 2: that is the label.
    label_cols = [str(col) for col in roles.columns if roles[col][0] == 2]
    print('label feature: {}'.format(label_cols))
    if len(label_cols) > 1:
        raise ValueError('More than 1 label column used')
    if len(label_cols) < 1:
        raise ValueError('No label column used')
    label_name = label_cols[0]
    y = raw[label_name]

    # Features exclude the label and any stray CSV index column.
    X = raw.loc[:, raw.columns != label_name]
    X = X.loc[:, X.columns != 'Unnamed: 0']

    # Columns marked with 1 are the sensitive (protected) attributes.
    sens_cols = [str(col) for col in roles.columns if roles[col][0] == 1]
    print('sensitive features: {}'.format(sens_cols))
    sens_dict = {col: (1 if col in sens_cols else 0) for col in raw.columns}

    # One-hot coding may expand a sensitive column into several derivatives;
    # sens_dict is updated to track them all.
    X, sens_dict = one_hot_code(X, sens_dict)
    sens_names = [name for name, flag in sens_dict.items() if flag == 1]
    print(
        'there are {} sensitive features including derivative features'.format(
            len(sens_names)))
    X_prime = X[sens_names]

    if centered:
        X = center(X)
        X_prime = center(X_prime)
    return X, X_prime, y
def center(X):
    """Return a copy of ``X`` with every column shifted to zero mean.

    The previous implementation assigned back into ``X`` column by column,
    mutating the caller's DataFrame as a side effect. The vectorized
    subtraction below produces a new DataFrame and leaves the input intact.

    Args:
        :param X: pandas DataFrame of numeric columns.
        :return: new DataFrame where each column has had its mean subtracted.
    """
    # DataFrame - Series broadcasts the per-column means across all rows.
    return X - X.mean()
def array_to_tuple(x):
    """Convert a column ndarray into a hashable tuple; pass anything else through.

    get_baseline() needs hashable values, and ndarrays are not hashable, so a
    shape-(n, 1) array is flattened into a tuple of its first-column elements.

    Args:
        :param x: candidate value, possibly a 2-D numpy column array.
        :return: tuple of the first column's elements if ``x`` is an ndarray,
            otherwise ``x`` unchanged.
    """
    # isinstance is the robust check (also covers ndarray subclasses),
    # unlike the fragile class-name string comparison it replaces.
    if isinstance(x, np.ndarray):
        return tuple(el[0] for el in x)
    return x
def one_hot_code(df1, sens_dict):
    """One-hot encode every string-valued column of ``df1``.

    Binary string columns are mapped in place to 0/1; columns with more than
    two categories are replaced by indicator columns named ``'{col}.{i}'``.
    ``sens_dict`` (column name -> 0/1 protected flag) is updated so derivative
    columns inherit the flag of the column they came from.

    Fixes over the original:
    - ``df1.drop(c, 1)`` used the positional ``axis`` argument removed in
      pandas 2.0; replaced with ``drop(columns=...)``.
    - category order came from ``set`` iteration, which is hash-randomized for
      strings, so encodings differed between runs; categories are now sorted
      for a deterministic encoding.
    - ``df1[c][0]`` assumed a label 0 exists; ``.iloc[0]`` is positional.

    Args:
        :param df1: pandas DataFrame to encode.
        :param sens_dict: dict mapping each column name to its protected flag.
        :return df1, sens_dict: the encoded frame and the updated flag dict.
    """
    for c in list(df1.columns):
        if isinstance(df1[c].iloc[0], str):
            column = df1[c]
            df1 = df1.drop(columns=[c])
            # Sorted for a deterministic encoding across runs.
            unique_values = sorted(set(column))
            n = len(unique_values)
            if n > 2:
                # Expand into one indicator column per category.
                for i, val in enumerate(unique_values):
                    col_name = '{}.{}'.format(c, i)
                    df1[col_name] = [1 if el == val else 0 for el in column]
                    sens_dict[col_name] = sens_dict[c]
                del sens_dict[c]
            else:
                # Binary column: 1 marks the first (sorted) category.
                df1[c] = [1 if el == unique_values[0] else 0 for el in column]
    return df1, sens_dict
def extract_df_from_ds(dataset):
    """Extract data frames from Transformer Data set
    Args:
        :param dataset: aif360 dataset
    Returns:
        :return X, X_prime, y: pandas dataframes of attributes, sensitive attributes, labels
    """
    frame, _ = dataset.convert_to_dataframe()
    # Features are everything except the label column(s).
    features = pd.DataFrame(frame).drop(columns=dataset.label_names)
    # Sensitive attributes are a column subset of the features.
    protected = features[dataset.protected_attribute_names]
    # Labels come from the dataset's label matrix, first column, as a tuple.
    labels = tuple(dataset.labels[:, 0])
    return features, protected, labels
def get_data(dataset):
    # Helper for main method
    """Given name of dataset, load in the three datasets associated from the clean.py file
    :param dataset:
    :return:
    """
    prefix = 'dataset/' + dataset
    X = pd.read_csv(prefix + '_features.csv')
    X_prime = pd.read_csv(prefix + '_protectedfeatures.csv')
    # The labels file has no header; its two columns are the saved row index
    # and the label value.
    labels_df = pd.read_csv(prefix + '_labels.csv', names=['index', 'label'])
    return X, X_prime, labels_df['label']