copd-model-h / training /encode_and_impute.py
IamGrooooot's picture
Initial Upload
000de75
"""Script that performs encoding of categorical features and imputation.
Performs encoding of categorical features, and imputation of missing values. After encoding
and imputation are performed, features are dropped. Two versions of the data is saved:
imputed and not imputed dataframes.
"""
import pandas as pd
import numpy as np
import os
import sys
import yaml
import json
import joblib
import encoding
import imputation
with open("./training/config.yaml", "r") as config:
config = yaml.safe_load(config)
# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]
# Setup log file
log = open("./training/logging/encode_and_impute_" + model_type + ".log", "w")
sys.stdout = log
# Dataset to process - set through config file
data_to_process = config["model_settings"]["data_to_process"]
# Load data
data = pd.read_pickle(
os.path.join(
config["outputs"]["processed_data_dir"],
"{}_combined_{}.pkl".format(data_to_process, model_type),
)
)
############################################################################
# Target encode categorical data
############################################################################
categorical_cols = [
    "LatestSymptomDiaryQ8",
    "LatestSymptomDiaryQ9",
    "LatestSymptomDiaryQ10",
    "DaysSinceLastExac",
    "AgeBinned",
    "Comorbidities",
    "FEV1PercentPredicted",
]
# Multiple types of nans present in data ('nan' and np.NaN). Convert all of
# these to the string 'nan' for categorical columns so the encoder treats
# missing values as a single category.
for categorical_col in categorical_cols:
    data[categorical_col] = data[categorical_col].replace(np.nan, "nan")

if data_to_process == "train":
    # Get target encodings for the entire train set.
    target_encodings = encoding.get_target_encodings(
        train_data=data,
        cols_to_encode=categorical_cols,
        target_col="ExacWithin3Months",
        smooth="auto",
    )
    train_encoded = encoding.apply_target_encodings(
        data=data,
        cols_to_encode=categorical_cols,
        encodings=target_encodings,
        drop_categorical_cols=False,
    )
    # Persist the encodings so test/forward-val runs can reuse them.
    # Fix: context manager closes the file handle the original leaked.
    with open(
        "./data/artifacts/target_encodings_" + model_type + ".json", "w"
    ) as encodings_file:
        json.dump(target_encodings, encodings_file)
    # K-fold target encode.
    # Get info on which patients belong to which fold.
    fold_patients = np.load(
        os.path.join(
            config["outputs"]["cohort_info_dir"],
            "fold_patients_{}.npy".format(model_type),
        ),
        allow_pickle=True,
    )
    train_encoded_cv, target_encodings = encoding.kfold_target_encode(
        df=data,
        fold_ids=fold_patients,
        cols_to_encode=categorical_cols,
        id_col="StudyId",
        target="ExacWithin3Months",
        smooth="auto",
        drop_categorical_cols=False,
    )
    # Drop categorical cols except AgeBinned, which is needed in the
    # imputation step (used as a group-by key).
    categorical_cols.remove("AgeBinned")
    train_encoded = train_encoded.drop(columns=categorical_cols)
    train_encoded_cv = train_encoded_cv.drop(columns=categorical_cols)
if data_to_process in ("test", "forward_val"):
    # Encode test set/forward val set based on the entire train set.
    # Fix: context manager closes the file handle the original leaked.
    with open(
        "./data/artifacts/target_encodings_" + model_type + ".json"
    ) as encodings_file:
        target_encodings = json.load(encodings_file)
    test_encoded = encoding.apply_target_encodings(
        data=data,
        cols_to_encode=categorical_cols,
        encodings=target_encodings,
        drop_categorical_cols=False,
    )
    # Drop categorical cols except AgeBinned, which is needed in the
    # imputation step (used as a group-by key).
    categorical_cols.remove("AgeBinned")
    test_encoded = test_encoded.drop(columns=categorical_cols)
############################################################################
# Impute missing data
############################################################################
# Identifier, target, and group-by columns that must never be imputed.
cols_to_ignore = [
    "StudyId",
    "PatientId",
    "IndexDate",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
    "Age",
    "Sex_F",
    "SafeHavenID",
    "AgeBinned",
]
if data_to_process == "train":
    # Impute the entire train set; keep an un-imputed copy as well.
    not_imputed_train = train_encoded.copy()
    cols_to_impute = train_encoded.drop(columns=cols_to_ignore).columns
    imputer = imputation.get_imputer(
        train_data=train_encoded,
        cols_to_impute=cols_to_impute,
        average_type="median",
        cols_to_groupby=["AgeBinned", "Sex_F"],
    )
    imputed_train = imputation.apply_imputer(
        data=train_encoded,
        cols_to_impute=cols_to_impute,
        imputer=imputer,
        cols_to_groupby=["AgeBinned", "Sex_F"],
    )
    # Persist the fitted imputer so test/forward-val runs can reuse it.
    joblib.dump(imputer, "./data/artifacts/imputer_" + model_type + ".pkl")
    # K-fold impute.
    not_imputed_train_cv = train_encoded_cv.copy()
    # NOTE(review): kfold_impute is fed train_encoded rather than
    # train_encoded_cv, even though the un-imputed CV copy above is taken from
    # train_encoded_cv — confirm this is intentional (original behavior is
    # preserved here).
    imputed_train_cv = imputation.kfold_impute(
        df=train_encoded,
        fold_ids=fold_patients,
        cols_to_impute=cols_to_impute,
        average_type="median",
        cols_to_groupby=["AgeBinned", "Sex_F"],
        id_col="StudyId",
    )
    df_columns = imputed_train.columns.tolist()
if data_to_process in ("test", "forward_val"):
    not_imputed_test = test_encoded.copy()
    cols_to_impute = test_encoded.drop(columns=cols_to_ignore).columns
    # Impute test set/forward val set with the imputer fitted on the train set.
    imputer = joblib.load("./data/artifacts/imputer_" + model_type + ".pkl")
    imputed_test = imputation.apply_imputer(
        data=test_encoded,
        cols_to_impute=cols_to_impute,
        imputer=imputer,
        cols_to_groupby=["AgeBinned", "Sex_F"],
    )
    df_columns = imputed_test.columns.tolist()
############################################################################
# Reduce feature space
############################################################################
# Any column whose name begins with one of these prefixes is dropped.
cols_to_drop_startswith = (
    "DiffLatest",
    "Var",
    "LatestEQ5D",
    "TotalEngagement",
    "Age",
    "NumHosp",
    "Required",
    "LungFunction",
    "EngagementCAT",
    "LatestSymptomDiary",
    "LatestAlbumin",
    "LatestEosinophils",
    "LatestNeutrophils",
    "LatestWhite Blood Count",
)
# Exact column names to drop in addition to the prefix matches.
additional_cols_to_drop = [
    "PatientId",
    "SafeHavenID",
    "Sex_F",
    "NumCommExacPrior6mo",
    "AsthmaOverlap",
    "TimeSinceLungFunc",
    "LatestNeutLymphRatio",
    "EngagementEQ5DTW1",
    "EngagementMRCTW1",
    "LatestMRCQ1",
    "WeekAvgCATQ1",
    "WeekAvgCATQ3",
    "WeekAvgCATQ4",
    "WeekAvgCATQ5",
    "WeekAvgCATQ6",
    "WeekAvgCATQ7",
    "WeekAvgCATQ8",
    "WeekAvgSymptomDiaryQ1",
    "WeekAvgSymptomDiaryQ3",
    "WeekAvgSymptomDiaryScore",
    "EngagementSymptomDiaryTW1",
    "ScaledSumSymptomDiaryQ3TW1",
    # "Comorbidities_te",
]
# str.startswith accepts a tuple of prefixes, so a single pass suffices.
cols_to_drop = [col for col in df_columns if col.startswith(cols_to_drop_startswith)]
cols_to_drop.extend(additional_cols_to_drop)
if data_to_process == "train":
    imputed_train = imputed_train.drop(columns=cols_to_drop)
    not_imputed_train = not_imputed_train.drop(columns=cols_to_drop)
    imputed_train_cv = imputed_train_cv.drop(columns=cols_to_drop)
    not_imputed_train_cv = not_imputed_train_cv.drop(columns=cols_to_drop)
if data_to_process in ("test", "forward_val"):
    imputed_test = imputed_test.drop(columns=cols_to_drop)
    not_imputed_test = not_imputed_test.drop(columns=cols_to_drop)
############################################################################
# Save data
############################################################################
os.makedirs(config["outputs"]["model_input_data_dir"], exist_ok=True)


def _output_path(variant):
    """Return the output pickle path for the given variant (e.g. "imputed")."""
    return os.path.join(
        config["outputs"]["model_input_data_dir"],
        "{}_{}_{}.pkl".format(data_to_process, variant, model_type),
    )


if data_to_process == "train":
    imputed_train.to_pickle(_output_path("imputed"))
    not_imputed_train.to_pickle(_output_path("not_imputed"))
    imputed_train_cv.to_pickle(_output_path("imputed_cv"))
    not_imputed_train_cv.to_pickle(_output_path("not_imputed_cv"))
if data_to_process in ("test", "forward_val"):
    imputed_test.to_pickle(_output_path("imputed"))
    not_imputed_test.to_pickle(_output_path("not_imputed"))