""" Process demographics data -------- Process DOB, sex, marital status and SIMD data """ import json from utils.common import read_data, correct_column_names def initialize_demo_data(demo_file): """ Load in demographics dataset to correct format -------- :param demo_file: demographics data file name :return: demographics dataframe with correct column names and types """ print('Loading demographic data') # Read in data demo_cols = ['SafeHavenID', 'OBF_DOB', 'SEX', 'MARITAL_STATUS', 'SIMD_2009_QUINTILE', 'SIMD_2009_DECILE', 'SIMD_2009_VIGINTILE', 'SIMD_2012_QUINTILE', 'SIMD_2012_DECILE', 'SIMD_2012_VIGINTILE', 'SIMD_2016_QUINTILE', 'SIMD_2016_DECILE', 'SIMD_2016_VIGINTILE'] demo_types = ['int', 'object', 'str', 'str', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float', 'float'] df = read_data(demo_file, demo_cols, demo_types) # Nulls dropped later in process, only drop duplicates df = df.drop_duplicates() return df def process_sex(df): """ Process sex column in demographics -------- :param df: dataframe to update :return: updated dataframe """ print('One-hot encoding sex') df['sex_bin'] = (df.SEX == 'F').astype(int) return df def main(): # Load in config items with open('../../../config.json') as json_config_file: config = json.load(json_config_file) # Load in data demo_file = config['extract_data_path'] + 'Demographics_Cohort3R.csv' demo = initialize_demo_data(demo_file) # Create binary sex column demo = process_sex(demo) # Drop original columns demo = demo.drop('SEX', axis=1) # Correct column names new_cols = correct_column_names(demo.columns[1:], 'demo') demo.columns = ['SafeHavenID'] + new_cols # Save data demo.to_pickle(config['model_data_path'] + 'demo_proc.pkl') main()