"""
Derive features from demographics for 2 models:
    Parallel model 1: uses both hospital and community exacerbation events
    Parallel model 2: uses only hospital exacerbation events

Inputs (selected through ./training/config.yaml):
    model_settings.model_type       - used in the log, input and output file names
    model_settings.data_to_process  - "forward_val" selects the forward-validation
                                      pickles; any other value selects the
                                      per-model-type pickles
    outputs.processed_data_dir      - directory the result is written to

Derived columns: Age, AgeBinned, Sex_F (DateOfBirth and Sex are dropped).
While the script runs, stdout is redirected to a per-model-type log file.
"""
import os
import sys

import numpy as np
import pandas as pd
import yaml

import model_h

with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]

# Setup log file: everything printed below goes to the log instead of the console
log = open("./training/logging/process_demographics_" + model_type + ".log", "w")
original_stdout = sys.stdout
sys.stdout = log

try:
    # Dataset to process - set through config file
    data_to_process = config["model_settings"]["data_to_process"]

    # Load cohort data
    if data_to_process == "forward_val":
        # NOTE(review): the forward-validation labels file is always the
        # "hosp_comm" one, independent of model_type - confirm this is intended.
        data = pd.read_pickle("./data/patient_labels_forward_val_hosp_comm.pkl")
        patient_details = pd.read_pickle("./data/patient_details_forward_val.pkl")
    else:
        data = pd.read_pickle("./data/patient_labels_" + model_type + ".pkl")
        patient_details = pd.read_pickle("./data/patient_details.pkl")

    # NOTE(review): only StudyId is taken from patient_details, so this left
    # merge adds no new columns. DateOfBirth and Sex (used below) must already
    # be present in the labels data - verify whether they were meant to be
    # pulled in here.
    data = data.merge(
        patient_details[["StudyId"]],
        on="StudyId",
        how="left",
    )

    # Calculate age in whole years at IndexDate.
    # A year is approximated as 365 days (leap days are ignored); kept as-is so
    # the feature values stay identical to those produced previously.
    data["DateOfBirth"] = pd.to_datetime(data["DateOfBirth"], utc=True)
    data["Age"] = (data["IndexDate"] - data["DateOfBirth"]).dt.days
    data["Age"] = np.floor(data["Age"] / 365)
    data = data.drop(columns="DateOfBirth")

    # Bin patient age (binning itself is delegated to model_h.bin_numeric_column)
    age_bins = [0, 50, 60, 70, 80, np.inf]
    age_labels = ["<50", "50-59", "60-69", "70-79", "80+"]
    data["AgeBinned"] = model_h.bin_numeric_column(
        col=data["Age"], bins=age_bins, labels=age_labels
    )

    # Smoking status: TODO

    # Map the M and F sex column to binary (1=F).
    # Values other than "F"/"M" are not in the mapping and become NaN.
    sex_mapping = {"F": 1, "M": 0}
    data["Sex_F"] = data.Sex.map(sex_mapping)
    data = data.drop(columns=["Sex"])

    # Save data: the two datasets differ only in the output file name
    if data_to_process == "forward_val":
        output_name = "demographics_forward_val_" + model_type + ".pkl"
    else:
        output_name = "demographics_" + model_type + ".pkl"

    processed_data_dir = config["outputs"]["processed_data_dir"]
    os.makedirs(processed_data_dir, exist_ok=True)
    data.to_pickle(os.path.join(processed_data_dir, output_name))
finally:
    # Always restore stdout and close the log, even if processing failed,
    # so the file handle is not leaked and buffered output is flushed.
    sys.stdout = original_stdout
    log.close()