| | """ |
| | Process demographics data |
| | -------- |
| | Process DOB, sex, marital status and SIMD data |
| | """ |
import json
import os

from utils.common import read_data, correct_column_names
| |
|
| |
|
def initialize_demo_data(demo_file):
    """
    Read the demographics extract and remove repeated rows
    --------
    :param demo_file: demographics data file name
    :return: de-duplicated demographics dataframe as produced by read_data
    """
    print('Loading demographic data')

    # Identifier and personal-detail columns with their requested types
    base_cols = ['SafeHavenID', 'OBF_DOB', 'SEX', 'MARITAL_STATUS']
    base_types = ['int', 'object', 'str', 'str']

    # One quintile/decile/vigintile triple per SIMD release, all floats
    simd_cols = ['SIMD_{}_{}'.format(release, band)
                 for release in (2009, 2012, 2016)
                 for band in ('QUINTILE', 'DECILE', 'VIGINTILE')]
    simd_types = ['float'] * len(simd_cols)

    loaded = read_data(demo_file, base_cols + simd_cols,
                       base_types + simd_types)

    # Drop rows that are exact copies of an earlier row
    return loaded.drop_duplicates()
| |
|
| |
|
def process_sex(df):
    """
    Add a binary indicator column derived from SEX
    --------
    :param df: dataframe with a SEX column; it is modified in place
    :return: the same dataframe with a new 'sex_bin' column
             (1 where SEX equals 'F', 0 for every other value)
    """
    print('One-hot encoding sex')

    # Boolean mask of rows recorded as 'F', then cast True/False to 1/0
    is_female = df.SEX.eq('F')
    df['sex_bin'] = is_female.astype(int)

    return df
| |
|
| |
|
def main():
    """
    Run the demographics processing pipeline end to end
    --------
    Reads config.json, loads the demographics extract, adds the binary
    sex column, drops the raw SEX column, renames the remaining columns
    and pickles the result to the model data directory.
    """
    # Load in config items (path is relative to the working directory)
    with open('../../../config.json') as json_config_file:
        config = json.load(json_config_file)

    # Load in data. os.path.join gives the same path as plain
    # concatenation when the configured directory ends with a separator,
    # and still yields a valid path when it does not.
    demo_file = os.path.join(config['extract_data_path'],
                             'Demographics_Cohort3R.csv')
    demo = initialize_demo_data(demo_file)

    # Encode sex, then drop the raw column it was derived from
    demo = process_sex(demo)
    demo = demo.drop('SEX', axis=1)

    # Rename every column except the leading SafeHavenID identifier.
    # list() keeps the concatenation valid even if the helper returns a
    # non-list iterable of names.
    new_cols = correct_column_names(demo.columns[1:], 'demo')
    demo.columns = ['SafeHavenID'] + list(new_cols)

    # Save data
    demo.to_pickle(os.path.join(config['model_data_path'], 'demo_proc.pkl'))
| |
|
| |
|
# Run the pipeline only when executed as a script, so importing this
# module does not read the config or write the output file.
if __name__ == '__main__':
    main()
| |
|