File size: 2,002 Bytes
"""
Process demographics data
--------
Process DOB, sex, marital status and SIMD data
"""
import json
from utils.common import read_data, correct_column_names
def initialize_demo_data(demo_file):
    """
    Read the demographics extract and remove duplicate rows
    --------
    :param demo_file: demographics data file name
    :return: de-duplicated demographics data as loaded by read_data
    """
    print('Loading demographic data')

    # Column name -> type. Insertion order is the order in which the names
    # and types are handed to read_data, so keep the two aligned here.
    demo_schema = {
        'SafeHavenID': 'int',
        'OBF_DOB': 'object',
        'SEX': 'str',
        'MARITAL_STATUS': 'str',
        'SIMD_2009_QUINTILE': 'float',
        'SIMD_2009_DECILE': 'float',
        'SIMD_2009_VIGINTILE': 'float',
        'SIMD_2012_QUINTILE': 'float',
        'SIMD_2012_DECILE': 'float',
        'SIMD_2012_VIGINTILE': 'float',
        'SIMD_2016_QUINTILE': 'float',
        'SIMD_2016_DECILE': 'float',
        'SIMD_2016_VIGINTILE': 'float',
    }
    column_names = list(demo_schema)
    column_types = list(demo_schema.values())

    loaded = read_data(demo_file, column_names, column_types)

    # Missing values are handled later in the process; only exact duplicate
    # rows are removed at this stage
    return loaded.drop_duplicates()
def process_sex(df):
    """
    Add a binary indicator column derived from SEX
    --------
    :param df: dataframe to update; must contain a SEX column and is
        modified in place
    :return: the same dataframe with an added integer column sex_bin that is
        1 where SEX equals 'F' and 0 for every other value
    """
    print('One-hot encoding sex')

    # Anything that is not exactly 'F' (including missing entries) becomes 0
    is_female = df['SEX'].eq('F')
    df['sex_bin'] = is_female.astype(int)
    return df
def main():
    """
    Run the demographics processing steps end to end
    --------
    Reads config.json, loads the demographics extract, replaces SEX with the
    binary sex_bin column, renames the columns and pickles the result.
    """
    # Configuration is looked up relative to the current working directory
    with open('../../../config.json') as config_handle:
        settings = json.load(config_handle)

    # Load the raw extract and derive the binary sex column
    extract_path = settings['extract_data_path'] + 'Demographics_Cohort3R.csv'
    demo_df = process_sex(initialize_demo_data(extract_path))

    # SEX is superseded by sex_bin, so remove the original column
    demo_df = demo_df.drop(columns=['SEX'])

    # Rename everything except the first column, which is relabelled
    # SafeHavenID unchanged.
    # NOTE(review): assumes correct_column_names returns a list and that the
    # first column is the identifier - confirm against utils.common
    renamed = correct_column_names(demo_df.columns[1:], 'demo')
    demo_df.columns = ['SafeHavenID'] + renamed

    # Persist the processed table for the modelling stage
    output_path = settings['model_data_path'] + 'demo_proc.pkl'
    demo_df.to_pickle(output_path)
# Run the pipeline. NOTE(review): there is no `if __name__ == '__main__'`
# guard, so this also executes whenever the module is imported.
main()
|