# copd-model-h / training / process_pros.py
# (removed non-Python upload metadata: "IamGrooooot's picture / Inital Upload / 000de75")
"""
Derive features from PRO responses for 2 models:
Parallel model 1: uses both hospital and community exacerbation events
Parallel model 2: uses only hospital exacerbation events
"""
import numpy as np
import pandas as pd
import sys
import os
import re
from collections import defaultdict
import yaml
def calc_total_pro_engagement(pro_df, pro_name):
    """
    Calculates PRO engagement per patient across their entire time within the service.

    Engagement is the number of submitted responses divided by the time in service,
    where the time unit matches the PRO's expected submission cadence
    (monthly EQ5D, weekly MRC, daily CAT/SymptomDiary).

    Args:
        pro_df (pd.DataFrame): dataframe containing the onboarding date and the latest
            prediction date.
        pro_name (str): name of the PRO. One of 'EQ5D', 'MRC', 'CAT', 'SymptomDiary'.

    Returns:
        pd.DataFrame: the input dataframe with an additional column stating the total
            engagement for each patient across the service.

    Raises:
        ValueError: if pro_name is not a recognised PRO.
    """
    # Map each PRO to the numpy timedelta unit of its submission cadence.
    # The original chained ifs left date_unit undefined for an unknown PRO,
    # raising an obscure NameError; fail fast with a clear error instead.
    # NOTE(review): month units ('M') in np.timedelta64 arithmetic against
    # pandas timedeltas are deprecated/removed in newer versions — confirm the
    # pinned pandas/numpy versions support the EQ5D path.
    date_units = {"EQ5D": "M", "MRC": "W", "CAT": "D", "SymptomDiary": "D"}
    try:
        date_unit = date_units[pro_name]
    except KeyError:
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None
    pro_df["TimeInService"] = np.floor(
        (pro_df.LatestPredictionDate - pro_df.FirstSubmissionDate)
        / np.timedelta64(1, date_unit)
    )
    # One row per submission, so counting rows per patient gives the response count
    pro_response_count = pro_df.groupby("StudyId").count()[["PatientId"]].reset_index()
    pro_response_count = pro_response_count.rename(
        columns={"PatientId": "Response" + pro_name}
    )
    pro_df = pro_df.merge(pro_response_count, on="StudyId", how="left")
    # Engagement = responses / time in service, rounded to 2 d.p.
    pro_df["TotalEngagement" + pro_name] = round(
        pro_df["Response" + pro_name] / pro_df["TimeInService"], 2
    )
    return pro_df
def calc_pro_engagement_in_time_window(pro_df, pro_name, time_window, data):
    """
    Calculates PRO engagement per patient across a specified time window. The time
    window is in format 'months', and consists of the specified time period prior to
    IndexDate.

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO response
            submission dates.
        pro_name (str): name of the PRO. One of 'EQ5D', 'MRC', 'CAT', 'SymptomDiary'.
        time_window (int): number of months in which to calculate PRO engagement.
        data (pd.DataFrame): main dataframe.

    Returns:
        pd.DataFrame: a dataframe containing the calculated PRO engagement.

    Raises:
        ValueError: if pro_name is not a recognised PRO.
    """
    # Expected submissions per month for each PRO (monthly EQ5D = 1, weekly
    # MRC = 4, daily CAT/SymptomDiary = 30). The original chained ifs left
    # unit_val undefined for an unknown PRO; fail fast instead.
    responses_per_month = {"EQ5D": 1, "MRC": 4, "CAT": 30, "SymptomDiary": 30}
    try:
        unit_val = responses_per_month[pro_name]
    except KeyError:
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None
    pro_df["SubmissionTime"] = pd.to_datetime(pro_df["SubmissionTime"], utc=True)
    # Local was previously named 'pro_engagement_6mo', which was misleading:
    # the window length is set by time_window, not fixed at 6 months.
    pro_engagement = pro_df.copy()
    pro_engagement["TimeSinceSubmission"] = (
        pro_engagement["IndexDate"] - pro_engagement["SubmissionTime"]
    ).dt.days
    # Only include PRO responses within the specified time window
    # (months approximated as 30 days)
    pro_engagement = pro_engagement[
        pro_engagement["TimeSinceSubmission"].between(
            0, (time_window * 30), inclusive="both"
        )
    ]
    # Calculate number of PRO responses within specified time window
    pro_engagement = (
        pro_engagement.groupby(["StudyId", "IndexDate"])
        .count()[["PatientId"]]
        .reset_index()
    )
    pro_engagement = pro_engagement.rename(
        columns={"PatientId": "ResponseCountTW" + str(time_window)}
    )
    # Engagement = observed responses / expected responses in the window
    pro_engagement["Engagement" + pro_name + "TW" + str(time_window)] = round(
        pro_engagement["ResponseCountTW" + str(time_window)]
        / (time_window * unit_val),
        2,
    )
    pro_engagement = data[["StudyId", "IndexDate"]].merge(
        pro_engagement, on=["StudyId", "IndexDate"], how="left"
    )
    # Fill N/As with 0 as no engagement was observed for those patients
    pro_engagement = pro_engagement.fillna(0)
    return pro_engagement
def calc_pro_engagement_at_specific_month(pro_df, pro_name, month_num, data):
    """
    Calculates PRO engagement per patient for one 30-day "month" before the
    index date (e.g. month_num=2 covers days 30-60 before IndexDate).

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO response
            submission dates.
        pro_name (str): name of the PRO. One of 'EQ5D', 'MRC', 'CAT', 'SymptomDiary'.
        month_num (int): which month before the index date to compute
            engagement for (1 = the most recent month).
        data (pd.DataFrame): main dataframe.

    Returns:
        pd.DataFrame: a dataframe containing the calculated PRO engagement for
            that month.

    Raises:
        ValueError: if pro_name is not a recognised PRO.
    """
    # Expected submissions per month for each PRO. The original chained ifs
    # left unit_val undefined for an unknown PRO; fail fast instead.
    responses_per_month = {"EQ5D": 1, "MRC": 4, "CAT": 30, "SymptomDiary": 30}
    try:
        unit_val = responses_per_month[pro_name]
    except KeyError:
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None
    pro_df["SubmissionTime"] = pd.to_datetime(pro_df["SubmissionTime"], utc=True)
    pro_engagement = pro_df.copy()
    pro_engagement["TimeSinceSubmission"] = (
        pro_engagement["IndexDate"] - pro_engagement["SubmissionTime"]
    ).dt.days
    # Only include PRO responses for the month specified.
    # Calculate the number of months between index date and specified month
    months_between_index_and_specified = month_num - 1
    pro_engagement = pro_engagement[
        pro_engagement["TimeSinceSubmission"].between(
            (months_between_index_and_specified * 30),
            (month_num * 30),
            inclusive="both",
        )
    ]
    # Calculate number of PRO responses within specified time window
    pro_engagement = (
        pro_engagement.groupby(["StudyId", "IndexDate"])
        .count()[["PatientId"]]
        .reset_index()
    )
    pro_engagement = pro_engagement.rename(
        columns={"PatientId": "ResponseCountMonth" + str(month_num)}
    )
    # Engagement = observed responses / expected responses in one month
    pro_engagement["Engagement" + pro_name + "Month" + str(month_num)] = round(
        pro_engagement["ResponseCountMonth" + str(month_num)] / (1 * unit_val),
        2,
    )
    pro_engagement = data[["StudyId", "IndexDate"]].merge(
        pro_engagement, on=["StudyId", "IndexDate"], how="left"
    )
    # Fill N/As with 0 as no engagement was observed for those patients
    pro_engagement = pro_engagement.fillna(0)
    return pro_engagement
def calc_last_pro_score(pro_df, pro_name):
    """
    Calculates the most recent PRO response before the index date.

    Only responses submitted strictly before the index date are considered, and
    the latest response must fall within 365 days of the index date.
    NOTE(review): an earlier comment claimed a 2-month cutoff while the code
    enforces 365 days — confirm which is intended.

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO response
            submission dates.
        pro_name (str): name of the PRO.

    Returns:
        pd.DataFrame: the input dataframe (restricted to pre-index submissions)
            with additional columns stating the latest PRO score for each PRO
            question.
    """
    # Days between submission and index date; positive = submitted before index
    pro_df["TimeSinceSubmission"] = (
        pro_df["IndexDate"] - pro_df["SubmissionTime"]
    ).dt.days
    pro_df = pro_df[pro_df["TimeSinceSubmission"] > 0]
    # Sort so the most recent pre-index submission comes first per patient/date
    pro_df = pro_df.sort_values(
        by=["StudyId", "IndexDate", "TimeSinceSubmission"], ascending=True
    )
    latest_pro = pro_df.drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")
    # Ensure that the latest PRO score is within 365 days of the index date
    latest_pro = latest_pro[latest_pro["TimeSinceSubmission"] <= 365]
    # Keep the question columns plus identifying/score columns
    question_cols = latest_pro.columns[
        latest_pro.columns.str.startswith(pro_name)
    ].tolist()
    question_cols.extend(
        ["StudyId", "IndexDate", "Score", "SubmissionTime", "TimeSinceSubmission"]
    )
    latest_pro = latest_pro[question_cols]
    # Add 'Latest' prefix to question columns
    cols_to_rename = latest_pro.columns[
        ~latest_pro.columns.isin(["StudyId", "IndexDate", "Score", "SubmissionTime"])
    ]
    latest_pro = latest_pro.rename(
        columns=dict(zip(cols_to_rename, "Latest" + cols_to_rename))
    )
    # Rename columns where prefix not added
    latest_pro = latest_pro.rename(
        columns={
            "Score": "Latest" + pro_name + "Score",
            "SubmissionTime": "LatestPRODate",
        }
    )
    pro_df = pro_df.merge(latest_pro, on=["StudyId", "IndexDate"], how="left")
    return pro_df
def calc_pro_score_prior_to_latest(pro_df, pro_name, time_prior_to_latest=60):
    """
    Finds the PRO score submitted just before the latest PRO score.

    Only submissions strictly earlier than LatestPRODate are considered, and
    the previous score must fall within ``time_prior_to_latest`` days of the
    latest one.

    Args:
        pro_df (pd.DataFrame): dataframe containing the latest PRO score and PRO
            response submission dates.
        pro_name (str): name of the PRO.
        time_prior_to_latest (int, optional): maximum gap (in days) allowed
            between the previous and the latest score. Defaults to 60 days
            (two months).

    Returns:
        pd.DataFrame: the input dataframe with additional 'Prev'-prefixed
            columns holding the previous score for each PRO question.
    """
    # Submissions strictly before the latest score, most recent first
    earlier = pro_df[pro_df["SubmissionTime"] < pro_df["LatestPRODate"]].copy()
    earlier = earlier.sort_values(
        by=["StudyId", "IndexDate", "SubmissionTime"], ascending=[True, True, False]
    ).drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")
    # Enforce the recency window relative to the latest score
    earlier["TimeSinceLatestPRODate"] = (
        earlier["LatestPRODate"] - earlier["SubmissionTime"]
    ).dt.days
    earlier = earlier[earlier["TimeSinceLatestPRODate"] <= time_prior_to_latest]
    # Prefix the question columns with 'Prev'
    rename_map = {c: "Prev" + c for c in earlier.columns if c.startswith(pro_name)}
    earlier = earlier.rename(columns=rename_map)
    earlier = earlier[["StudyId", "IndexDate", "Score"]].join(
        earlier.filter(regex="^Prev")
    )
    earlier = earlier.rename(columns={"Score": "Prev" + pro_name + "Score"})
    return pro_df.merge(earlier, on=["StudyId", "IndexDate"], how="left")
def define_mapping_for_calcs(pro_name, questions, prefixes):
    """
    Defines the mapping for calculations between PRO responses.

    Args:
        pro_name (str): name of the PRO. One of 'EQ5D', 'MRC', 'CAT',
            'SymptomDiary'.
        questions (list): question names of PRO.
        prefixes (list): prefixes to identify which columns to use in calculations.
            The possible prefixes are: 'Avg', 'Prev', 'LongerAvg', 'WeekPrevAvg'.

    Returns:
        dict: mapping from a 'Latest'/'WeekAvg' column name to the list of
            prefixed column names to compare it against.

    Raises:
        ValueError: if pro_name is not a recognised PRO.
    """
    # The monthly/weekly PROs compare against the latest single response; the
    # daily PROs compare against the weekly average. The original chained ifs
    # left map_key undefined for an unknown PRO (NameError); fail fast instead.
    key_prefixes = {
        "EQ5D": "Latest",
        "MRC": "Latest",
        "CAT": "WeekAvg",
        "SymptomDiary": "WeekAvg",
    }
    try:
        key_prefix = key_prefixes[pro_name]
    except KeyError:
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None
    # Create empty dictionary to append questions
    mapping = defaultdict(list)
    # Iterate through questions and create mapping for calculations
    for question in questions:
        map_key = key_prefix + pro_name + question
        for prefix in prefixes:
            mapping[map_key].append(prefix + pro_name + question)
    return mapping
def calc_pro_average(pro_df, pro_name, time_window=None, avg_period=None):
    """
    Calculate the PRO average before the latest PRO score and within a specified time
    window.

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO submission
            dates.
        pro_name (str): name of the PRO.
        time_window (int, optional): time window (in months) used for calculating the
            average of PRO responses. Required when avg_period is None or
            'LongerAvg'. Defaults to None.
        avg_period (str, optional): one of None, 'WeeklyAvg', 'WeekPrevAvg',
            'LongerAvg'; selects the averaging window and output-column prefix.
            Defaults to None.

    Returns:
        pd.DataFrame: the input dataframe with additional columns with the calculated
            averages.

    Raises:
        ValueError: if avg_period is not one of the recognised values.
    """
    # Validate up front: an unrecognised avg_period previously fell through the
    # if/elif chain and crashed later with UnboundLocalError on prefix/avg_pro.
    if avg_period not in (None, "WeeklyAvg", "WeekPrevAvg", "LongerAvg"):
        raise ValueError(f"Unknown avg_period: {avg_period!r}")
    # Drop any previously calculated averages so reruns don't duplicate columns
    pro_df = pro_df.loc[
        :,
        ~(
            pro_df.columns.str.startswith("Avg")
            | pro_df.columns.str.startswith("Longer")
        ),
    ]
    if avg_period is None:
        prefix = "Avg"
        # Average over time_window months before IndexDate, excluding the
        # latest submission itself
        pro_df["AvgStartDate"] = pro_df["IndexDate"] - pd.DateOffset(months=time_window)
        avg_pro = pro_df[
            (pro_df["SubmissionTime"] >= pro_df["AvgStartDate"])
            & (pro_df["SubmissionTime"] < pro_df["LatestPRODate"])
        ]
    else:
        pro_df["WeekStartDate"] = pro_df["IndexDate"] - pd.DateOffset(weeks=1)
        pro_df["WeekPrevStartDate"] = pro_df["WeekStartDate"] - pd.DateOffset(weeks=1)
        # When looking at daily PROs, three averages are calculated:
        if avg_period == "WeeklyAvg":
            # The weekly average: PRO scores in the week prior to IndexDate
            prefix = "WeekAvg"
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["WeekStartDate"])
                & (pro_df["SubmissionTime"] <= pro_df["IndexDate"])
            ]
        elif avg_period == "WeekPrevAvg":
            # The weekly previous average: the week before the WeeklyAvg week.
            # Needed to calculate the week-on-week difference of scores.
            prefix = "WeekPrevAvg"
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["WeekPrevStartDate"])
                & (pro_df["SubmissionTime"] < pro_df["WeekStartDate"])
            ]
        else:  # avg_period == "LongerAvg"
            # Longer average: time_window months prior to the WeekStartDate
            prefix = "LongerAvg"
            pro_df["AvgStartDate"] = pro_df["IndexDate"] - pd.DateOffset(
                months=time_window
            )
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["AvgStartDate"])
                & (pro_df["SubmissionTime"] < pro_df["WeekStartDate"])
            ]
    # Keep the question columns plus identifying/score columns
    cols_required = avg_pro.columns[avg_pro.columns.str.startswith(pro_name)].tolist()
    cols_required.extend(["StudyId", "IndexDate", "Score"])
    avg_pro = avg_pro[cols_required]
    # Calculate average pro scores per patient per index date
    avg_pro = avg_pro.groupby(["StudyId", "IndexDate"]).mean().reset_index()
    # Add prefix to question columns
    cols_to_rename = avg_pro.columns[
        ~avg_pro.columns.isin(["StudyId", "IndexDate", "Score"])
    ]
    avg_pro = avg_pro.rename(columns=dict(zip(cols_to_rename, prefix + cols_to_rename)))
    # Rename columns where prefix not added
    avg_pro = avg_pro.rename(columns={"Score": prefix + pro_name + "Score"})
    # Merge average PRO with rest of the df
    pro_df = pro_df.merge(avg_pro, on=["StudyId", "IndexDate"], how="left")
    return pro_df
def calc_diff_pro_scores(pro_df, pro_name, latest_pro, other_pro, time_window=None):
    """
    Calculate the difference between PRO scores, adding the result as a new
    column on the input dataframe (in place).

    Args:
        pro_df (pd.DataFrame): dataframe containing columns required for calculations.
        pro_name (str): name of the PRO.
        latest_pro (str): column name containing the latest PRO response for PROs EQ5D
            and MRC, and the latest week average for PROs CAT and SymptomDiary.
        other_pro (str): column name containing the other variable for calculating
            difference.
        time_window (int, optional): time window (in months) used to name the
            output column. Defaults to None.

    Returns:
        pd.DataFrame: the input dataframe with an additional column with the
            calculated difference.
    """
    # Split the column name on capital letters, e.g.
    # 'LatestEQ5DScore' -> ['Latest', 'E', 'Q5', 'D', 'Score']
    name_parts = re.findall(r"[A-Z][^A-Z]*", latest_pro)
    # Drop the leading prefix: one element ('Latest') for monthly/weekly PROs,
    # two elements ('Week' + 'Avg') for the daily PROs
    drop_count = 2 if pro_name in ["CAT", "SymptomDiary"] else 1
    base_name = "".join(name_parts[drop_count:])
    # Name the output column according to whether a time window applies
    if time_window is None:
        diff_col = "DiffLatestPrev" + base_name
    else:
        diff_col = "DiffLatestAvg" + base_name + "TW" + str(time_window)
    pro_df[diff_col] = pro_df[latest_pro] - pro_df[other_pro]
    return pro_df
def calc_variation(pro_df, pro_name):
    """
    Calculate the variation (standard deviation) of PRO responses over the
    month (30 days) before the index date.

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO submission
            dates.
        pro_name (str): name of the PRO.

    Returns:
        pd.DataFrame: the input dataframe with additional 'Var'-prefixed
            columns holding the calculated standard deviations.
    """
    # Compute days since submission unless an earlier step already has
    if "TimeSinceSubmission" not in pro_df:
        pro_df["TimeSinceSubmission"] = (
            pro_df["IndexDate"] - pro_df["SubmissionTime"]
        ).dt.days
    # Restrict to submissions within the month before the index date
    in_month = (pro_df["TimeSinceSubmission"] > 0) & (
        pro_df["TimeSinceSubmission"] <= 30
    )
    pro_var = pro_df[in_month]
    # Keep the question columns plus identifiers and the overall score
    keep_cols = [c for c in pro_var.columns if c.startswith(pro_name)]
    keep_cols += ["StudyId", "IndexDate", "Score"]
    pro_var = pro_var[keep_cols]
    # Sample standard deviation per patient per index date
    pro_var = pro_var.groupby(["StudyId", "IndexDate"]).std().reset_index()
    # Prefix the question columns (and the overall score) with 'Var'
    id_cols = {"StudyId", "IndexDate", "Score"}
    rename_map = {c: "Var" + c for c in pro_var.columns if c not in id_cols}
    rename_map["Score"] = "Var" + pro_name + "Score"
    pro_var = pro_var.rename(columns=rename_map)
    # Merge back to the main dataframe
    return pro_df.merge(pro_var, on=["StudyId", "IndexDate"], how="left")
def calc_sum_binary_vals(pro_df, binary_cols, time_window=1):
    """
    For SymptomDiary questions that contain binary values, calculate the sum of the
    binary values over a specified time window before the index date.

    Side effect: adds a 'TimeWindowStartDate' column to the input dataframe
    in place (downstream code drops it later).

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO submission
            dates.
        binary_cols (list): column names for which the sum of binary values is
            calculated.
        time_window (int, optional): time window (in months) over which the sum of the
            binary values is calculated. Defaults to 1.

    Returns:
        pd.DataFrame: one row per StudyId/IndexDate with the summed binary values.
    """
    # Window runs from time_window months before the index date up to it
    pro_df["TimeWindowStartDate"] = pro_df["IndexDate"] - pd.DateOffset(
        months=time_window
    )
    in_window = (pro_df["SubmissionTime"] >= pro_df["TimeWindowStartDate"]) & (
        pro_df["SubmissionTime"] <= pro_df["IndexDate"]
    )
    windowed = pro_df[in_window]
    # Sum the binary answers per patient per index date
    sums = windowed.groupby(["StudyId", "IndexDate"])[binary_cols].sum()
    # Tag columns with what was computed and the window length
    sums = sums.add_prefix("Sum").add_suffix("TW" + str(time_window))
    return sums.reset_index()
def scale_sum_to_response_rate(pro_df, sum, engagement_rate):
    """
    Scale the sum calculated by calc_sum_binary_vals() by the response rate so
    the feature is comparable between patients with different engagement.

    Note: the parameter named ``sum`` shadows the builtin of the same name
    within this function; kept as-is for backward compatibility with callers.
    The new column is added to the input dataframe in place.

    Args:
        pro_df (pd.DataFrame): dataframe containing the columns for scaling the sum by
            the engagement rate.
        sum (str): column name that contains the data for the sum of the binary values.
        engagement_rate (str): column name that contains the data for the response rate.

    Returns:
        pd.DataFrame: the input dataframe with an additional 'Scaled' column.
    """
    scaled_col = "Scaled" + sum
    pro_df[scaled_col] = pro_df[sum].div(pro_df[engagement_rate])
    return pro_df
# Load run configuration (model type, data paths, lookback settings)
with open("./training/config.yaml", "r") as config:
    config = yaml.safe_load(config)
# Specify which model to generate features for
model_type = config["model_settings"]["model_type"]
# Setup log file
# NOTE(review): this redirects all stdout for the rest of the script to the
# log file; the handle is intentionally left open until the process exits.
log = open("./training/logging/process_pros_" + model_type + ".log", "w")
sys.stdout = log
# Dataset to process - set through config file
data_to_process = config["model_settings"]["data_to_process"]
# Load cohort data (forward-validation cohorts use separate pickles)
if data_to_process == "forward_val":
    data = pd.read_pickle("./data/patient_labels_forward_val_hosp_comm.pkl")
    patient_details = pd.read_pickle("./data/patient_details_forward_val.pkl")
else:
    data = pd.read_pickle("./data/patient_labels_" + model_type + ".pkl")
    patient_details = pd.read_pickle("./data/patient_details.pkl")
data = data[["StudyId", "IndexDate"]]
# One row per StudyId/IndexDate with service onboarding and latest prediction dates
patient_details = data.merge(
    patient_details[["StudyId", "FirstSubmissionDate", "LatestPredictionDate"]],
    on="StudyId",
    how="left",
)
# Calculate the lookback start date. Will need this to aggregate data for model
# features
data["LookbackStartDate"] = data["IndexDate"] - pd.DateOffset(
    days=config["model_settings"]["lookback_period"]
)
############################################
# Monthly PROs - EQ5D
############################################
pro_eq5d = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_eq5d"], delimiter="|")
# Keep only patients present in the cohort
pro_eq5d = pro_eq5d.merge(
    patient_details,
    on="StudyId",
    how="inner",
)
# EQ5DQ6 is a much less structured question compared to the other questions in EQ5D.
# A new score will be calculated using only EQ5DQ1-Q5 to prevent Q6 affecting the score
pro_eq5d["EQ5DScoreWithoutQ6"] = pro_eq5d.loc[:, "EQ5DQ1":"EQ5DQ5"].sum(axis=1)
# Calculate engagement over service
pro_eq5d = calc_total_pro_engagement(pro_eq5d, "EQ5D")
# Calculate engagement for a time window of 1 month (time window chosen based on signal
# output observed from results of feature_eng_multiple_testing)
pro_eq5d_engagement = calc_pro_engagement_in_time_window(
    pro_eq5d, "EQ5D", time_window=1, data=data
)
pro_eq5d = pro_eq5d.merge(pro_eq5d_engagement, on=["StudyId", "IndexDate"], how="left")
# Calculate last PRO score
pro_eq5d = calc_last_pro_score(pro_eq5d, "EQ5D")
# Mapping to calculate the difference between the latest PRO scores and the average
# PRO score
question_names_eq5d = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Score", "ScoreWithoutQ6"]
mapping_eq5d = define_mapping_for_calcs("EQ5D", question_names_eq5d, prefixes=["Avg"])
# Calculate average PRO score for a time window of 1 month prior to IndexDate,
# ignoring the latest PRO score
pro_eq5d = calc_pro_average(pro_eq5d, "EQ5D", time_window=1)
# calc_diff_pro_scores adds columns to pro_eq5d in place; no reassignment needed
for key in mapping_eq5d:
    calc_diff_pro_scores(pro_eq5d, "EQ5D", key, mapping_eq5d[key][0], time_window=1)
# Calculate variation of scores across 1 month
pro_eq5d = calc_variation(pro_eq5d, "EQ5D")
# Remove unwanted columns and duplicates: raw question columns, intermediate
# averages and raw response counts are not model features
pro_eq5d = pro_eq5d.loc[
    :,
    ~(
        pro_eq5d.columns.str.startswith("Avg")
        | pro_eq5d.columns.str.startswith("EQ5D")
        | pro_eq5d.columns.str.startswith("Response")
    ),
]
pro_eq5d = pro_eq5d.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "FirstSubmissionDate",
        "TimeInService",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "LatestPRODate",
    ]
)
pro_eq5d = pro_eq5d.drop_duplicates()
############################################
# Weekly PROs - MRC
############################################
pro_mrc = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_mrc"], delimiter="|")
# Keep only patients present in the cohort
pro_mrc = pro_mrc.merge(
    patient_details,
    on="StudyId",
    how="inner",
)
# Calculate engagement over service
pro_mrc = calc_total_pro_engagement(pro_mrc, "MRC")
# Calculate engagement for a time window of 1 month
pro_mrc_engagement = calc_pro_engagement_in_time_window(
    pro_mrc, "MRC", time_window=1, data=data
)
pro_mrc = pro_mrc.merge(pro_mrc_engagement, on=["StudyId", "IndexDate"], how="left")
# Calculate last PRO score
pro_mrc = calc_last_pro_score(pro_mrc, "MRC")
# Mapping to calculate the difference between the latest PRO scores and the average
# PRO score (MRC has a single question)
question_names_mrc = ["Q1"]
mapping_mrc = define_mapping_for_calcs("MRC", question_names_mrc, prefixes=["Avg"])
# Calculate average PRO score for a time window of 1 month prior to IndexDate,
# ignoring the latest PRO score
pro_mrc = calc_pro_average(pro_mrc, "MRC", time_window=1)
# calc_diff_pro_scores adds columns to pro_mrc in place; no reassignment needed
for key in mapping_mrc:
    calc_diff_pro_scores(pro_mrc, "MRC", key, mapping_mrc[key][0], time_window=1)
# Calculate variation of scores across 1 month
pro_mrc = calc_variation(pro_mrc, "MRC")
# Remove unwanted columns and duplicates
pro_mrc = pro_mrc.loc[
    :,
    ~(
        pro_mrc.columns.str.startswith("Avg")
        | pro_mrc.columns.str.startswith("MRC")
        | pro_mrc.columns.str.startswith("Response")
    ),
]
pro_mrc = pro_mrc.drop(
    columns=[
        "SubmissionTime",
        "Score",
        "FirstSubmissionDate",
        "TimeInService",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "LatestPRODate",
    ]
)
pro_mrc = pro_mrc.drop_duplicates()
############################################
# Daily PROs - CAT
############################################
# Keep the un-merged full CAT data too; CATQ5 is reused for the SymptomDiary
# categorical questions further down
pro_cat_full = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_cat"], delimiter="|")
pro_cat = pro_cat_full.merge(
    patient_details,
    on="StudyId",
    how="inner",
)
# Calculate engagement over service
pro_cat = calc_total_pro_engagement(pro_cat, "CAT")
# Calculate engagement for a time window of 1 month
pro_cat_engagement = calc_pro_engagement_in_time_window(
    pro_cat, "CAT", time_window=1, data=data
)
pro_cat = pro_cat.merge(pro_cat_engagement, on=["StudyId", "IndexDate"], how="left")
# Calculate engagement for each of the three months prior to the index date
pro_cat_month1 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=1, data=data
)
pro_cat_month2 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=2, data=data
)
pro_cat_month3 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=3, data=data
)
pro_cat = pro_cat.merge(pro_cat_month1, on=["StudyId", "IndexDate"], how="left")
pro_cat = pro_cat.merge(pro_cat_month2, on=["StudyId", "IndexDate"], how="left")
pro_cat = pro_cat.merge(pro_cat_month3, on=["StudyId", "IndexDate"], how="left")
# Month-on-month engagement trend features
pro_cat["EngagementDiffMonth1and2"] = (
    pro_cat["EngagementCATMonth1"] - pro_cat["EngagementCATMonth2"]
)
pro_cat["EngagementDiffMonth1and3"] = (
    pro_cat["EngagementCATMonth1"] - pro_cat["EngagementCATMonth3"]
)
# Calculate PRO average for the week before the index date
pro_cat = calc_pro_average(pro_cat, "CAT", avg_period="WeeklyAvg")
# Calculate variation of scores across 1 month
pro_cat = calc_variation(pro_cat, "CAT")
# Remove unwanted columns and duplicates
pro_cat = pro_cat.loc[
    :,
    ~(
        pro_cat.columns.str.startswith("CAT")
        | pro_cat.columns.str.startswith("Response")
    ),
]
pro_cat = pro_cat.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "FirstSubmissionDate",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "TimeInService",
        "WeekStartDate",
        "WeekPrevStartDate",
    ]
)
pro_cat = pro_cat.drop_duplicates()
############################################
# Daily PROs - Symptom Diary
############################################
# Symptom diary have some questions that are numeric and some that are categorical
pro_sd_full = pd.read_csv(
    config["inputs"]["raw_data_paths"]["pro_symptom_diary"], delimiter="|"
)
pro_sd = pro_sd_full.merge(
    patient_details,
    on="StudyId",
    how="inner",
)
# Calculate engagement over service
pro_sd = calc_total_pro_engagement(pro_sd, "SymptomDiary")
pro_sd_engagement = pro_sd[
    ["StudyId", "PatientId", "IndexDate", "TotalEngagementSymptomDiary"]
]
# Calculate engagement for 1 month prior to IndexDate
pro_sd_engagement_tw = calc_pro_engagement_in_time_window(
    pro_sd, "SymptomDiary", time_window=1, data=data
)
pro_sd_engagement = pro_sd_engagement.merge(
    pro_sd_engagement_tw, on=["StudyId", "IndexDate"], how="left"
)
# One engagement row per patient per index date
pro_sd_engagement = pro_sd_engagement.drop_duplicates()
###############################
# Categorical questions
# (Q8, Q9, Q10)
###############################
pro_cat_q5 = pro_cat_full[["StudyId", "SubmissionTime", "CATQ5"]]
pro_sd_categ = pro_sd_full[
    [
        "StudyId",
        "SubmissionTime",
        "SymptomDiaryQ8",
        "SymptomDiaryQ9",
        "SymptomDiaryQ10",
        "Score",
    ]
]
# Split timestamp column into separate date and time columns as same day entries in CAT
# and SymptomDiary have different timestamps
# NOTE(review): pro_cat_q5/pro_sd_categ are slices of the full dataframes, so
# this assignment may emit a SettingWithCopyWarning — consider adding .copy()
# to the slices above.
for df in [pro_cat_q5, pro_sd_categ]:
    df["Date"] = (pd.to_datetime(df["SubmissionTime"], utc=True)).dt.date
pro_sd_cat = pro_sd_categ.merge(pro_cat_q5, on=["StudyId", "Date"], how="outer")
# If CATQ5 is a 0, then Symptom Diary questions 8, 9 and 10 don't get asked. Add this as
# an option to the columns. There are some cases where patients have a 0 in CATQ5 but
# have also answered Symptom Diary questions 8, 9, and 10 - keep these answers as is.
for col in ["SymptomDiaryQ8", "SymptomDiaryQ9", "SymptomDiaryQ10"]:
    pro_sd_cat[col] = np.where(
        (pro_sd_cat["CATQ5"] == 0) & (pro_sd_cat[col].isna()),
        "Question Not Asked",
        pro_sd_cat[col],
    )
# Calculate the most recent score for SymptomDiary categorical questions
pro_sd_cat = pro_sd_cat.merge(data[["StudyId", "IndexDate"]], on="StudyId", how="inner")
# The outer merge above suffixed the duplicated SubmissionTime columns (_x/_y)
pro_sd_cat = pro_sd_cat.rename(columns={"SubmissionTime_x": "SubmissionTime"})
pro_sd_cat["SubmissionTime"] = pd.to_datetime(pro_sd_cat["SubmissionTime"], utc=True)
pro_sd_cat = calc_last_pro_score(pro_sd_cat, "SymptomDiary")
# Keep only the 'Latest' categorical answers; drop raw and intermediate columns
pro_sd_cat = pro_sd_cat.drop(
    columns=[
        "SubmissionTime",
        "SubmissionTime_y",
        "CATQ5",
        "SymptomDiaryQ8",
        "SymptomDiaryQ9",
        "Date",
        "SymptomDiaryQ10",
        "Score",
        "LatestSymptomDiaryScore",
        "LatestPRODate",
        "TimeSinceSubmission",
    ]
)
pro_sd_cat = pro_sd_cat.drop_duplicates()
###############################
# Numeric questions
# (Q1, Q2)
# Q3 included for comparison
###############################
# Calculate PRO average for the week before the index date
pro_sd_numeric = pro_sd[
    [
        "StudyId",
        "PatientId",
        "IndexDate",
        "SubmissionTime",
        "Score",
        "SymptomDiaryQ1",
        "SymptomDiaryQ2",
        "SymptomDiaryQ3",
    ]
]
pro_sd_numeric = calc_pro_average(
    pro_sd_numeric, "SymptomDiary", avg_period="WeeklyAvg"
)
# Calculate variation of scores across 1 month
pro_sd_numeric = calc_variation(pro_sd_numeric, "SymptomDiary")
###############################
# Binary questions
# (Q3)
###############################
# Calculate sum of binary values for a time window of 1 months
sd_sum_all = pro_sd_numeric[["StudyId", "IndexDate"]]
sd_sum_all = sd_sum_all.drop_duplicates()
# NOTE(review): this call also adds a TimeWindowStartDate column to
# pro_sd_numeric in place; that column is dropped later from pro_sd_all.
sd_sum = calc_sum_binary_vals(
    pro_sd_numeric, binary_cols=["SymptomDiaryQ3"], time_window=1
)
sd_sum_all = sd_sum_all.merge(sd_sum, on=["StudyId", "IndexDate"], how="left")
# Scale sums by how often patients responded
sd_sum_all = sd_sum_all.merge(
    pro_sd_engagement, on=["StudyId", "IndexDate"], how="left"
)
mapping_scaling = {"SumSymptomDiaryQ3TW1": "EngagementSymptomDiaryTW1"}
# scale_sum_to_response_rate adds the scaled column to sd_sum_all in place
for key in mapping_scaling:
    scale_sum_to_response_rate(sd_sum_all, key, mapping_scaling[key])
# Combine numeric, categorical and binary dfs
pro_sd_all = pro_sd_numeric.merge(
    sd_sum_all, on=["StudyId", "PatientId", "IndexDate"], how="left"
)
pro_sd_all = pro_sd_all.merge(pro_sd_cat, on=["StudyId", "IndexDate"], how="left")
# Remove unwanted columns from numeric df (raw question columns, unscaled sums
# and raw response counts are not model features)
pro_sd_all = pro_sd_all.loc[
    :,
    ~(
        pro_sd_all.columns.str.startswith("Symptom")
        | pro_sd_all.columns.str.startswith("Sum")
        | pro_sd_all.columns.str.startswith("Response")
    ),
]
pro_sd_all = pro_sd_all.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "TimeWindowStartDate",
        "WeekStartDate",
        "WeekPrevStartDate",
        "TimeSinceSubmission",
    ]
)
pro_sd_all = pro_sd_all.drop_duplicates()
# Combine pros into one df
pro_df = pro_eq5d.merge(pro_mrc, on=["StudyId", "PatientId", "IndexDate"], how="left")
pro_df = pro_df.merge(pro_cat, on=["StudyId", "PatientId", "IndexDate"], how="left")
pro_df = pro_df.merge(pro_sd_all, on=["StudyId", "PatientId", "IndexDate"], how="left")
###############################
# Map some categorical features
###############################
# Map numeric-coded categorical SymptomDiary answers to human-readable labels.
# regex=False makes the replacement literal: with the (pandas-version-dependent)
# regex default, the '.' in a pattern like '1.0' would act as a wildcard.
# The three previously duplicated replacement loops are driven by one table.
categorical_label_maps = {
    # Phlegm difficulty (SDQ8)
    "LatestSymptomDiaryQ8": {
        "1.0": "Not difficult",
        "2.0": "A little difficult",
        "3.0": "Quite difficult",
        "4.0": "Very difficult",
    },
    # Phlegm consistency (SDQ9)
    "LatestSymptomDiaryQ9": {
        "1.0": "Watery",
        "2.0": "Sticky liquid",
        "3.0": "Semi-solid",
        "4.0": "Solid",
    },
    # Phlegm colour (SDQ10)
    "LatestSymptomDiaryQ10": {
        "1.0": "White",
        "2.0": "Yellow",
        "3.0": "Green",
        "4.0": "Dark green",
    },
}
for col_name, label_map in categorical_label_maps.items():
    for code, label in label_map.items():
        pro_df[col_name] = pro_df[col_name].str.replace(code, label, regex=False)
# Drop identifiers and intermediate columns not needed as model features
pro_df = pro_df.drop(
    columns=[
        "PatientId",
        "LatestTimeSinceSubmission",
        "LatestTimeSinceSubmission_x",
        "LatestTimeSinceSubmission_y",
    ]
)
# Save data (ensure the output directory exists first)
os.makedirs(config["outputs"]["processed_data_dir"], exist_ok=True)
if data_to_process == "forward_val":
    pro_df.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "pros_forward_val_" + model_type + ".pkl",
        )
    )
else:
    pro_df.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "pros_" + model_type + ".pkl",
        )
    )