| | """ |
Derive PRO (patient-reported outcome) response features for 2 models:
| | Parallel model 1: uses both hospital and community exacerbation events |
| | Parallel model 2: uses only hospital exacerbation events |
| | """ |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import sys |
| | import os |
| | import re |
| | from collections import defaultdict |
| | import yaml |
| |
|
| |
|
def calc_total_pro_engagement(pro_df, pro_name):
    """
    Calculates PRO engagement per patient across their entire time within the service.

    Args:
        pro_df (pd.DataFrame): dataframe containing the onboarding date and the latest
            prediction date, one row per submission.
        pro_name (str): name of the PRO. One of 'EQ5D', 'MRC', 'CAT', 'SymptomDiary'.

    Returns:
        pd.DataFrame: the input dataframe with additional columns stating the total
            engagement for each patient across the service.

    Raises:
        ValueError: if ``pro_name`` is not a recognised PRO.
    """
    # Time unit matches each PRO's expected submission cadence:
    # EQ5D monthly, MRC weekly, CAT / SymptomDiary daily.
    date_units = {"EQ5D": "M", "MRC": "W", "CAT": "D", "SymptomDiary": "D"}
    try:
        date_unit = date_units[pro_name]
    except KeyError:
        # Previously an unknown pro_name fell through to an opaque NameError.
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None

    pro_df["TimeInService"] = np.floor(
        (pro_df.LatestPredictionDate - pro_df.FirstSubmissionDate)
        / np.timedelta64(1, date_unit)
    )

    # One row per submission, so a per-patient row count is the total number
    # of responses across the service.
    pro_response_count = pro_df.groupby("StudyId").count()[["PatientId"]].reset_index()
    pro_response_count = pro_response_count.rename(
        columns={"PatientId": "Response" + pro_name}
    )
    pro_df = pro_df.merge(pro_response_count, on="StudyId", how="left")

    # Engagement = responses per expected submission period in service.
    pro_df["TotalEngagement" + pro_name] = round(
        pro_df["Response" + pro_name] / pro_df["TimeInService"], 2
    )
    return pro_df
| |
|
| |
|
def calc_pro_engagement_in_time_window(pro_df, pro_name, time_window, data):
    """
    Calculates PRO engagement per patient across a specified time window. The time
    window is in months, and consists of the specified time period prior to
    IndexDate.

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO response
            submission dates.
        pro_name (str): name of the PRO. One of 'EQ5D', 'MRC', 'CAT', 'SymptomDiary'.
        time_window (int): number of months in which to calculate PRO engagement.
        data (pd.DataFrame): main dataframe.

    Returns:
        pd.DataFrame: a dataframe containing the calculated PRO engagement, with one
            row per (StudyId, IndexDate) pair in ``data``; missing engagement is 0.

    Raises:
        ValueError: if ``pro_name`` is not a recognised PRO.
    """
    # Expected number of submissions per month for each PRO
    # (EQ5D monthly, MRC weekly, CAT / SymptomDiary daily).
    units_per_month = {"EQ5D": 1, "MRC": 4, "CAT": 30, "SymptomDiary": 30}
    try:
        unit_val = units_per_month[pro_name]
    except KeyError:
        # Previously an unknown pro_name fell through to an opaque NameError.
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None

    pro_df["SubmissionTime"] = pd.to_datetime(pro_df["SubmissionTime"], utc=True)
    engagement = pro_df.copy()
    engagement["TimeSinceSubmission"] = (
        engagement["IndexDate"] - engagement["SubmissionTime"]
    ).dt.days

    # Keep only submissions made within the lookback window (a month is
    # approximated as 30 days), on or before the index date.
    engagement = engagement[
        engagement["TimeSinceSubmission"].between(
            0, (time_window * 30), inclusive="both"
        )
    ]

    # Count responses per patient per index date.
    engagement = (
        engagement.groupby(["StudyId", "IndexDate"])
        .count()[["PatientId"]]
        .reset_index()
    )
    engagement = engagement.rename(
        columns={"PatientId": "ResponseCountTW" + str(time_window)}
    )

    # Engagement = observed responses / expected responses for the window.
    engagement["Engagement" + pro_name + "TW" + str(time_window)] = round(
        engagement["ResponseCountTW" + str(time_window)] / (time_window * unit_val),
        2,
    )
    engagement = data[["StudyId", "IndexDate"]].merge(
        engagement, on=["StudyId", "IndexDate"], how="left"
    )

    # Patients with no submissions in the window get zero engagement.
    engagement = engagement.fillna(0)
    return engagement
| |
|
| |
|
def calc_pro_engagement_at_specific_month(pro_df, pro_name, month_num, data):
    """
    Calculates PRO engagement per patient for one specific month before the
    index date (e.g. month_num=2 covers 30-60 days before IndexDate, with a
    month approximated as 30 days).

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO response
            submission dates.
        pro_name (str): name of the PRO. One of 'EQ5D', 'MRC', 'CAT', 'SymptomDiary'.
        month_num (int): 1-based month before IndexDate to calculate engagement for.
        data (pd.DataFrame): main dataframe.

    Returns:
        pd.DataFrame: a dataframe containing the calculated PRO engagement for that
            month, one row per (StudyId, IndexDate) pair in ``data``; missing
            engagement is 0.

    Raises:
        ValueError: if ``pro_name`` is not a recognised PRO.
    """
    # Expected number of submissions per month for each PRO
    # (EQ5D monthly, MRC weekly, CAT / SymptomDiary daily).
    units_per_month = {"EQ5D": 1, "MRC": 4, "CAT": 30, "SymptomDiary": 30}
    try:
        unit_val = units_per_month[pro_name]
    except KeyError:
        # Previously an unknown pro_name fell through to an opaque NameError.
        raise ValueError(f"Unknown PRO name: {pro_name!r}") from None

    pro_df["SubmissionTime"] = pd.to_datetime(pro_df["SubmissionTime"], utc=True)
    pro_engagement = pro_df.copy()
    pro_engagement["TimeSinceSubmission"] = (
        pro_engagement["IndexDate"] - pro_engagement["SubmissionTime"]
    ).dt.days

    # Keep only submissions in the 30-day band for the requested month,
    # e.g. month 1 is 0-30 days, month 2 is 30-60 days before IndexDate.
    months_between_index_and_specified = month_num - 1
    pro_engagement = pro_engagement[
        pro_engagement["TimeSinceSubmission"].between(
            (months_between_index_and_specified * 30),
            (month_num * 30),
            inclusive="both",
        )
    ]

    # Count responses per patient per index date.
    pro_engagement = (
        pro_engagement.groupby(["StudyId", "IndexDate"])
        .count()[["PatientId"]]
        .reset_index()
    )
    pro_engagement = pro_engagement.rename(
        columns={"PatientId": "ResponseCountMonth" + str(month_num)}
    )

    # Engagement = observed responses / expected responses in one month.
    pro_engagement["Engagement" + pro_name + "Month" + str(month_num)] = round(
        pro_engagement["ResponseCountMonth" + str(month_num)] / (1 * unit_val),
        2,
    )
    pro_engagement = data[["StudyId", "IndexDate"]].merge(
        pro_engagement, on=["StudyId", "IndexDate"], how="left"
    )

    # Patients with no submissions in the month get zero engagement.
    pro_engagement = pro_engagement.fillna(0)
    return pro_engagement
| |
|
| |
|
def calc_last_pro_score(pro_df, pro_name):
    """
    Finds the most recent PRO response before each index date.

    Only submissions strictly before the index date are considered, and the
    latest one must fall within 365 days of the index date — stale scores are
    left missing rather than carried forward.

    Args:
        pro_df (pd.DataFrame): dataframe containing the index dates and PRO response
            submission dates.
        pro_name (str): name of the PRO.

    Returns:
        pd.DataFrame: the input dataframe (restricted to pre-index submissions)
            with additional 'Latest'-prefixed columns holding the latest score
            for each PRO question.
    """
    pro_df["TimeSinceSubmission"] = (
        pro_df["IndexDate"] - pro_df["SubmissionTime"]
    ).dt.days

    # Candidates are submissions strictly before the index date, ordered so
    # the smallest positive gap comes first per (patient, index date).
    ordered = pro_df[pro_df["TimeSinceSubmission"] > 0].sort_values(
        by=["StudyId", "IndexDate", "TimeSinceSubmission"], ascending=True
    )
    latest_pro = ordered.drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")

    # Cap recency at one year.
    latest_pro = latest_pro[latest_pro["TimeSinceSubmission"] <= 365]

    # Keep the per-question columns plus the identifying / score columns.
    keep_cols = latest_pro.columns[
        latest_pro.columns.str.startswith(pro_name)
    ].tolist()
    keep_cols += ["StudyId", "IndexDate", "Score", "SubmissionTime", "TimeSinceSubmission"]
    latest_pro = latest_pro[keep_cols]

    # 'Latest'-prefix everything except the merge keys and the two columns
    # that get bespoke names below.
    generic_cols = latest_pro.columns[
        ~latest_pro.columns.isin(["StudyId", "IndexDate", "Score", "SubmissionTime"])
    ]
    renaming = dict(zip(generic_cols, "Latest" + generic_cols))
    renaming["Score"] = "Latest" + pro_name + "Score"
    renaming["SubmissionTime"] = "LatestPRODate"
    latest_pro = latest_pro.rename(columns=renaming)

    return ordered.merge(latest_pro, on=["StudyId", "IndexDate"], how="left")
| |
|
| |
|
def calc_pro_score_prior_to_latest(pro_df, pro_name, time_prior_to_latest=60):
    """
    Finds the PRO score submitted immediately before the latest PRO score.

    Args:
        pro_df (pd.DataFrame): dataframe containing the latest PRO score and PRO
            response submission dates.
        pro_name (str): name of the PRO.
        time_prior_to_latest (int, optional): maximum allowed gap (in days) between
            the previous and the latest submission. Defaults to 60 (two months).

    Returns:
        pd.DataFrame: the input dataframe with additional 'Prev'-prefixed columns
            holding the previous score closest to the latest PRO score.
    """
    candidates = pro_df.copy()
    candidates = candidates[candidates["SubmissionTime"] < candidates["LatestPRODate"]]

    # Sorting submission times descending per (patient, index date) puts the
    # submission immediately before the latest one first.
    candidates = candidates.sort_values(
        by=["StudyId", "IndexDate", "SubmissionTime"], ascending=[True, True, False]
    ).drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")

    # Discard previous scores that are too far from the latest score.
    candidates["TimeSinceLatestPRODate"] = (
        candidates["LatestPRODate"] - candidates["SubmissionTime"]
    ).dt.days
    candidates = candidates[
        candidates["TimeSinceLatestPRODate"] <= time_prior_to_latest
    ]

    # 'Prev'-prefix the per-question columns, then keep only the keys, the
    # score and the prefixed columns.
    question_cols = [col for col in candidates if col.startswith(pro_name)]
    candidates = candidates.rename(
        columns={col: "Prev" + col for col in question_cols}
    )
    candidates = candidates[["StudyId", "IndexDate", "Score"]].join(
        candidates.filter(regex="^Prev")
    )
    candidates = candidates.rename(columns={"Score": "Prev" + pro_name + "Score"})

    return pro_df.merge(candidates, on=["StudyId", "IndexDate"], how="left")
| |
|
| |
|
def define_mapping_for_calcs(pro_name, questions, prefixes):
    """
    Defines the mapping for calculations between PRO responses.

    Args:
        pro_name (str): name of the PRO. One of 'EQ5D', 'MRC', 'CAT', 'SymptomDiary'.
        questions (list): question names of PRO.
        prefixes (list): prefixes to identify which columns to use in calculations.
            The possible prefixes are: 'Avg', 'Prev', 'LongerAvg', 'WeekPrevAvg'.

    Returns:
        dict: mapping that maps columns for performing calculations.

    Raises:
        ValueError: if ``pro_name`` is not a recognised PRO.
    """
    # Questionnaire-cadence PROs use the latest response directly as baseline;
    # daily PROs (CAT, SymptomDiary) use the weekly average instead.
    if pro_name in ("EQ5D", "MRC"):
        base_prefix = "Latest"
    elif pro_name in ("CAT", "SymptomDiary"):
        base_prefix = "WeekAvg"
    else:
        # Previously an unknown pro_name fell through to an opaque NameError.
        raise ValueError(f"Unknown PRO name: {pro_name!r}")

    mapping = defaultdict(list)
    for question in questions:
        map_key = base_prefix + pro_name + question
        for prefix in prefixes:
            mapping[map_key].append(prefix + pro_name + question)
    return mapping
| |
|
| |
|
def calc_pro_average(pro_df, pro_name, time_window=None, avg_period=None):
    """
    Calculate the PRO average before the latest PRO score and within a specified time
    window.

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO submission
            dates.
        pro_name (str): name of the PRO.
        time_window (int, optional): time window (in months) used for calculating the
            average of PRO responses. Required when avg_period is None or
            'LongerAvg'. Defaults to None.
        avg_period (str, optional): identifies which prefix to add to output columns.
            One of None, 'WeeklyAvg', 'WeekPrevAvg', 'LongerAvg'. Defaults to None.

    Returns:
        pd.DataFrame: the input dataframe with additional columns with the calculated
            averages.

    Raises:
        ValueError: if ``avg_period`` is not one of the recognised values.
    """
    # Drop averages left over from a previous call so the merge below does not
    # create duplicated/suffixed columns.
    pro_df = pro_df.loc[
        :,
        ~(
            pro_df.columns.str.startswith("Avg")
            | pro_df.columns.str.startswith("Longer")
        ),
    ]

    if avg_period is None:
        # Average over the `time_window` months before the index date, up to
        # (but excluding) the latest submission.
        prefix = "Avg"
        pro_df["AvgStartDate"] = pro_df["IndexDate"] - pd.DateOffset(months=time_window)
        avg_pro = pro_df[
            (pro_df["SubmissionTime"] >= pro_df["AvgStartDate"])
            & (pro_df["SubmissionTime"] < pro_df["LatestPRODate"])
        ]
    else:
        pro_df["WeekStartDate"] = pro_df["IndexDate"] - pd.DateOffset(weeks=1)
        pro_df["WeekPrevStartDate"] = pro_df["WeekStartDate"] - pd.DateOffset(weeks=1)

        if avg_period == "WeeklyAvg":
            # Average over the week ending at the index date (inclusive).
            prefix = "WeekAvg"
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["WeekStartDate"])
                & (pro_df["SubmissionTime"] <= pro_df["IndexDate"])
            ]
        elif avg_period == "WeekPrevAvg":
            # Average over the week before that.
            prefix = "WeekPrevAvg"
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["WeekPrevStartDate"])
                & (pro_df["SubmissionTime"] < pro_df["WeekStartDate"])
            ]
        elif avg_period == "LongerAvg":
            # Average over a longer window, excluding the final week.
            prefix = "LongerAvg"
            pro_df["AvgStartDate"] = pro_df["IndexDate"] - pd.DateOffset(
                months=time_window
            )
            avg_pro = pro_df[
                (pro_df["SubmissionTime"] >= pro_df["AvgStartDate"])
                & (pro_df["SubmissionTime"] < pro_df["WeekStartDate"])
            ]
        else:
            # Previously an unrecognised avg_period fell through to an opaque
            # NameError on `prefix` / `avg_pro`.
            raise ValueError(f"Unknown avg_period: {avg_period!r}")

    # Keep the per-question columns plus the identifying / score columns.
    cols_required = avg_pro.columns[avg_pro.columns.str.startswith(pro_name)].tolist()
    cols_required.extend(["StudyId", "IndexDate", "Score"])
    avg_pro = avg_pro[cols_required]

    # Mean per (patient, index date) over the selected window.
    avg_pro = avg_pro.groupby(["StudyId", "IndexDate"]).mean().reset_index()

    # Prefix the averaged columns; 'Score' gets a PRO-specific name.
    cols_to_rename = avg_pro.columns[
        ~avg_pro.columns.isin(["StudyId", "IndexDate", "Score"])
    ]
    avg_pro = avg_pro.rename(columns=dict(zip(cols_to_rename, prefix + cols_to_rename)))
    avg_pro = avg_pro.rename(columns={"Score": prefix + pro_name + "Score"})

    pro_df = pro_df.merge(avg_pro, on=["StudyId", "IndexDate"], how="left")
    return pro_df
| |
|
| |
|
def calc_diff_pro_scores(pro_df, pro_name, latest_pro, other_pro, time_window=None):
    """
    Calculate the difference between PRO scores, adding the result in place.

    Args:
        pro_df (pd.DataFrame): dataframe containing columns required for calculations.
        pro_name (str): name of the PRO.
        latest_pro (str): column name containing the latest PRO response for PROs EQ5D
            and MRC, and the latest week average for PROs CAT and SymptomDiary.
        other_pro (str): column name containing the other variable for calculating
            difference.
        time_window (int, optional): time window (in months) used to name the output
            column when differencing against an average.

    Returns:
        pd.DataFrame: the input dataframe (also mutated in place) with an additional
            column holding the calculated difference.
    """
    # Split the column name on capital letters, e.g.
    # 'LatestEQ5DQ1' -> ['Latest', 'E', 'Q5', 'D', 'Q1'].
    name_parts = re.findall(r"[A-Z][^A-Z]*", latest_pro)

    # Strip the 'Latest' prefix (two parts, 'Week' + 'Avg', for the daily
    # PROs) so only the PRO + question name remains.
    drop_count = 2 if pro_name in ["CAT", "SymptomDiary"] else 1
    feat_name = "".join(name_parts[drop_count:])

    if time_window is None:
        out_col = "DiffLatestPrev" + feat_name
    else:
        out_col = "DiffLatestAvg" + feat_name + "TW" + str(time_window)
    pro_df[out_col] = pro_df[latest_pro] - pro_df[other_pro]
    return pro_df
| |
|
| |
|
def calc_variation(pro_df, pro_name):
    """
    Calculate the variation (standard deviation) of PRO responses over the
    30 days before each index date.

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO submission
            dates.
        pro_name (str): name of the PRO.

    Returns:
        pd.DataFrame: the input dataframe with additional 'Var'-prefixed columns
            holding the calculated standard deviations.
    """
    if "TimeSinceSubmission" not in pro_df:
        pro_df["TimeSinceSubmission"] = (
            pro_df["IndexDate"] - pro_df["SubmissionTime"]
        ).dt.days

    # Submissions in the month before (and excluding) the index date.
    window = pro_df[
        (pro_df["TimeSinceSubmission"] > 0) & (pro_df["TimeSinceSubmission"] <= 30)
    ]

    # Keep the per-question columns plus the identifying / score columns.
    keep_cols = window.columns[window.columns.str.startswith(pro_name)].tolist()
    keep_cols.extend(["StudyId", "IndexDate", "Score"])
    window = window[keep_cols]

    # Sample standard deviation per (patient, index date).
    variation = window.groupby(["StudyId", "IndexDate"]).std().reset_index()

    # Prefix the computed columns; 'Score' gets a PRO-specific name.
    renaming = {
        col: "Var" + col
        for col in variation.columns
        if col not in ("StudyId", "IndexDate", "Score")
    }
    renaming["Score"] = "Var" + pro_name + "Score"
    variation = variation.rename(columns=renaming)

    return pro_df.merge(variation, on=["StudyId", "IndexDate"], how="left")
| |
|
| |
|
def calc_sum_binary_vals(pro_df, binary_cols, time_window=1):
    """
    For SymptomDiary questions that contain binary values, calculate the sum of the
    binary values for a specified time window ending at the index date.

    Args:
        pro_df (pd.DataFrame): dataframe containing index dates and PRO submission
            dates. A 'TimeWindowStartDate' column is added to it in place.
        binary_cols (list): column names for which the sum of binary values is to be
            calculated.
        time_window (int, optional): time window (in months) over which the sum of the
            binary values is calculated. Defaults to 1.

    Returns:
        pd.DataFrame: a dataframe containing the sum of the binary values per
            (StudyId, IndexDate).
    """
    # NOTE: this column assignment deliberately mutates the caller's frame —
    # downstream code drops 'TimeWindowStartDate' by name.
    pro_df["TimeWindowStartDate"] = pro_df["IndexDate"] - pd.DateOffset(
        months=time_window
    )

    # Restrict to submissions inside the window ending at the index date.
    in_window = pro_df[
        (pro_df["SubmissionTime"] >= pro_df["TimeWindowStartDate"])
        & (pro_df["SubmissionTime"] <= pro_df["IndexDate"])
    ]

    # Sum the binary answers per (patient, index date), then name the output
    # columns Sum<col>TW<time_window>.
    sums = in_window.groupby(["StudyId", "IndexDate"])[binary_cols].sum()
    sums = sums.add_prefix("Sum").add_suffix("TW" + str(time_window))
    return sums.reset_index()
| |
|
| |
|
def scale_sum_to_response_rate(pro_df, sum, engagement_rate):
    """
    Scale the sum calculated by calc_sum_binary_vals() to the response rate,
    making the feature comparable between patients with different engagement.

    Args:
        pro_df (pd.DataFrame): dataframe containing the columns for scaling the sum
            by the engagement rate.
        sum (str): column name that contains the sum of the binary values.
            (NOTE(review): shadows the builtin `sum`; kept for backward
            compatibility with existing callers.)
        engagement_rate (str): column name that contains the response rate.

    Returns:
        pd.DataFrame: the input dataframe (also mutated in place) with an additional
            'Scaled'-prefixed column holding the scaled sum.
    """
    scaled_col = "Scaled" + sum
    pro_df[scaled_col] = pro_df[sum] / pro_df[engagement_rate]
    return pro_df
| |
|
| |
|
| | with open("./training/config.yaml", "r") as config: |
| | config = yaml.safe_load(config) |
| |
|
| | |
| | model_type = config["model_settings"]["model_type"] |
| |
|
| | |
| | log = open("./training/logging/process_pros_" + model_type + ".log", "w") |
| | sys.stdout = log |
| |
|
| | |
| | data_to_process = config["model_settings"]["data_to_process"] |
| |
|
| | |
| | if data_to_process == "forward_val": |
| | data = pd.read_pickle("./data/patient_labels_forward_val_hosp_comm.pkl") |
| | patient_details = pd.read_pickle("./data/patient_details_forward_val.pkl") |
| | else: |
| | data = pd.read_pickle("./data/patient_labels_" + model_type + ".pkl") |
| | patient_details = pd.read_pickle("./data/patient_details.pkl") |
| | data = data[["StudyId", "IndexDate"]] |
| | patient_details = data.merge( |
| | patient_details[["StudyId", "FirstSubmissionDate", "LatestPredictionDate"]], |
| | on="StudyId", |
| | how="left", |
| | ) |
| |
|
| | |
| | |
| | data["LookbackStartDate"] = data["IndexDate"] - pd.DateOffset( |
| | days=config["model_settings"]["lookback_period"] |
| | ) |
| |
|
| | |
| | |
| | |
# --- EQ5D feature pipeline ---

# Load EQ5D responses and restrict to patients present in patient_details.
pro_eq5d = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_eq5d"], delimiter="|")
pro_eq5d = pro_eq5d.merge(
    patient_details,
    on="StudyId",
    how="inner",
)

# Score over questions 1-5 only (excludes Q6).
pro_eq5d["EQ5DScoreWithoutQ6"] = pro_eq5d.loc[:, "EQ5DQ1":"EQ5DQ5"].sum(axis=1)

# Engagement across the whole time in service.
pro_eq5d = calc_total_pro_engagement(pro_eq5d, "EQ5D")

# Engagement over the month before each index date.
pro_eq5d_engagement = calc_pro_engagement_in_time_window(
    pro_eq5d, "EQ5D", time_window=1, data=data
)
pro_eq5d = pro_eq5d.merge(pro_eq5d_engagement, on=["StudyId", "IndexDate"], how="left")

# Most recent EQ5D response before each index date.
pro_eq5d = calc_last_pro_score(pro_eq5d, "EQ5D")

# Columns to difference: latest score vs 1-month average, per question.
question_names_eq5d = ["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", "Score", "ScoreWithoutQ6"]
mapping_eq5d = define_mapping_for_calcs("EQ5D", question_names_eq5d, prefixes=["Avg"])

# 1-month averages, then latest-vs-average differences (added in place by
# calc_diff_pro_scores).
pro_eq5d = calc_pro_average(pro_eq5d, "EQ5D", time_window=1)
for key in mapping_eq5d:
    calc_diff_pro_scores(pro_eq5d, "EQ5D", key, mapping_eq5d[key][0], time_window=1)

# Standard deviation over the 30 days before each index date.
pro_eq5d = calc_variation(pro_eq5d, "EQ5D")

# Drop intermediate columns, keeping only the derived features.
pro_eq5d = pro_eq5d.loc[
    :,
    ~(
        pro_eq5d.columns.str.startswith("Avg")
        | pro_eq5d.columns.str.startswith("EQ5D")
        | pro_eq5d.columns.str.startswith("Response")
    ),
]
pro_eq5d = pro_eq5d.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "FirstSubmissionDate",
        "TimeInService",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "LatestPRODate",
    ]
)
pro_eq5d = pro_eq5d.drop_duplicates()
| |
|
| | |
| | |
| | |
# --- MRC feature pipeline (same shape as the EQ5D pipeline) ---

# Load MRC responses and restrict to patients present in patient_details.
pro_mrc = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_mrc"], delimiter="|")
pro_mrc = pro_mrc.merge(
    patient_details,
    on="StudyId",
    how="inner",
)

# Engagement across the whole time in service.
pro_mrc = calc_total_pro_engagement(pro_mrc, "MRC")

# Engagement over the month before each index date.
pro_mrc_engagement = calc_pro_engagement_in_time_window(
    pro_mrc, "MRC", time_window=1, data=data
)
pro_mrc = pro_mrc.merge(pro_mrc_engagement, on=["StudyId", "IndexDate"], how="left")

# Most recent MRC response before each index date.
pro_mrc = calc_last_pro_score(pro_mrc, "MRC")

# MRC has a single question; difference latest score vs 1-month average.
question_names_mrc = ["Q1"]
mapping_mrc = define_mapping_for_calcs("MRC", question_names_mrc, prefixes=["Avg"])

# 1-month averages, then latest-vs-average differences (added in place).
pro_mrc = calc_pro_average(pro_mrc, "MRC", time_window=1)
for key in mapping_mrc:
    calc_diff_pro_scores(pro_mrc, "MRC", key, mapping_mrc[key][0], time_window=1)

# Standard deviation over the 30 days before each index date.
pro_mrc = calc_variation(pro_mrc, "MRC")

# Drop intermediate columns, keeping only the derived features.
pro_mrc = pro_mrc.loc[
    :,
    ~(
        pro_mrc.columns.str.startswith("Avg")
        | pro_mrc.columns.str.startswith("MRC")
        | pro_mrc.columns.str.startswith("Response")
    ),
]
pro_mrc = pro_mrc.drop(
    columns=[
        "SubmissionTime",
        "Score",
        "FirstSubmissionDate",
        "TimeInService",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "LatestPRODate",
    ]
)
pro_mrc = pro_mrc.drop_duplicates()
| |
|
| | |
| | |
| | |
# --- CAT feature pipeline ---

# The unfiltered frame is kept as pro_cat_full: the SymptomDiary section
# below reuses CATQ5 from it.
pro_cat_full = pd.read_csv(config["inputs"]["raw_data_paths"]["pro_cat"], delimiter="|")
pro_cat = pro_cat_full.merge(
    patient_details,
    on="StudyId",
    how="inner",
)

# Engagement across the whole time in service.
pro_cat = calc_total_pro_engagement(pro_cat, "CAT")

# Engagement over the month before each index date.
pro_cat_engagement = calc_pro_engagement_in_time_window(
    pro_cat, "CAT", time_window=1, data=data
)
pro_cat = pro_cat.merge(pro_cat_engagement, on=["StudyId", "IndexDate"], how="left")

# Per-month engagement for months 1-3 before the index date, plus trends
# (month 1 vs 2 and month 1 vs 3).
pro_cat_month1 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=1, data=data
)
pro_cat_month2 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=2, data=data
)
pro_cat_month3 = calc_pro_engagement_at_specific_month(
    pro_cat, "CAT", month_num=3, data=data
)
pro_cat = pro_cat.merge(pro_cat_month1, on=["StudyId", "IndexDate"], how="left")
pro_cat = pro_cat.merge(pro_cat_month2, on=["StudyId", "IndexDate"], how="left")
pro_cat = pro_cat.merge(pro_cat_month3, on=["StudyId", "IndexDate"], how="left")
pro_cat["EngagementDiffMonth1and2"] = (
    pro_cat["EngagementCATMonth1"] - pro_cat["EngagementCATMonth2"]
)
pro_cat["EngagementDiffMonth1and3"] = (
    pro_cat["EngagementCATMonth1"] - pro_cat["EngagementCATMonth3"]
)

# CAT is a daily PRO, so its baseline is a weekly average rather than a
# single latest score.
pro_cat = calc_pro_average(pro_cat, "CAT", avg_period="WeeklyAvg")

# Standard deviation over the 30 days before each index date.
pro_cat = calc_variation(pro_cat, "CAT")

# Drop intermediate columns, keeping only the derived features.
pro_cat = pro_cat.loc[
    :,
    ~(
        pro_cat.columns.str.startswith("CAT")
        | pro_cat.columns.str.startswith("Response")
    ),
]
pro_cat = pro_cat.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "FirstSubmissionDate",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "TimeInService",
        "WeekStartDate",
        "WeekPrevStartDate",
    ]
)
pro_cat = pro_cat.drop_duplicates()
| |
|
| | |
| | |
| | |
| |
|
| | |
# --- SymptomDiary feature pipeline ---

# The unfiltered frame is kept as pro_sd_full for the categorical-question
# handling below.
pro_sd_full = pd.read_csv(
    config["inputs"]["raw_data_paths"]["pro_symptom_diary"], delimiter="|"
)
pro_sd = pro_sd_full.merge(
    patient_details,
    on="StudyId",
    how="inner",
)

# Engagement across the whole time in service.
pro_sd = calc_total_pro_engagement(pro_sd, "SymptomDiary")
pro_sd_engagement = pro_sd[
    ["StudyId", "PatientId", "IndexDate", "TotalEngagementSymptomDiary"]
]

# Engagement over the month before each index date.
pro_sd_engagement_tw = calc_pro_engagement_in_time_window(
    pro_sd, "SymptomDiary", time_window=1, data=data
)
pro_sd_engagement = pro_sd_engagement.merge(
    pro_sd_engagement_tw, on=["StudyId", "IndexDate"], how="left"
)
pro_sd_engagement = pro_sd_engagement.drop_duplicates()

# Q8-Q10 are conditional on CATQ5 (sputum), so pair SymptomDiary submissions
# with same-day CAT submissions to distinguish "not asked" from missing.
pro_cat_q5 = pro_cat_full[["StudyId", "SubmissionTime", "CATQ5"]]
pro_sd_categ = pro_sd_full[
    [
        "StudyId",
        "SubmissionTime",
        "SymptomDiaryQ8",
        "SymptomDiaryQ9",
        "SymptomDiaryQ10",
        "Score",
    ]
]

# Calendar date used to align the two PROs.
# NOTE(review): pro_cat_q5 / pro_sd_categ are column-selections of the full
# frames; assigning 'Date' here can raise SettingWithCopyWarning — confirm
# this behaves as intended on the installed pandas version.
for df in [pro_cat_q5, pro_sd_categ]:
    df["Date"] = (pd.to_datetime(df["SubmissionTime"], utc=True)).dt.date
pro_sd_cat = pro_sd_categ.merge(pro_cat_q5, on=["StudyId", "Date"], how="outer")

# When CATQ5 is 0 on the same day and the answer is missing, the question
# was not shown to the patient — mark it explicitly.
for col in ["SymptomDiaryQ8", "SymptomDiaryQ9", "SymptomDiaryQ10"]:
    pro_sd_cat[col] = np.where(
        (pro_sd_cat["CATQ5"] == 0) & (pro_sd_cat[col].isna()),
        "Question Not Asked",
        pro_sd_cat[col],
    )

# Attach index dates; '_x' is the SymptomDiary-side submission time from the
# outer merge above.
pro_sd_cat = pro_sd_cat.merge(data[["StudyId", "IndexDate"]], on="StudyId", how="inner")
pro_sd_cat = pro_sd_cat.rename(columns={"SubmissionTime_x": "SubmissionTime"})
pro_sd_cat["SubmissionTime"] = pd.to_datetime(pro_sd_cat["SubmissionTime"], utc=True)
pro_sd_cat = calc_last_pro_score(pro_sd_cat, "SymptomDiary")

# Keep only the Latest* categorical answers; the numeric score features come
# from pro_sd_numeric below.
pro_sd_cat = pro_sd_cat.drop(
    columns=[
        "SubmissionTime",
        "SubmissionTime_y",
        "CATQ5",
        "SymptomDiaryQ8",
        "SymptomDiaryQ9",
        "Date",
        "SymptomDiaryQ10",
        "Score",
        "LatestSymptomDiaryScore",
        "LatestPRODate",
        "TimeSinceSubmission",
    ]
)
pro_sd_cat = pro_sd_cat.drop_duplicates()

# Numeric SymptomDiary questions: weekly average and 30-day variation.
pro_sd_numeric = pro_sd[
    [
        "StudyId",
        "PatientId",
        "IndexDate",
        "SubmissionTime",
        "Score",
        "SymptomDiaryQ1",
        "SymptomDiaryQ2",
        "SymptomDiaryQ3",
    ]
]
pro_sd_numeric = calc_pro_average(
    pro_sd_numeric, "SymptomDiary", avg_period="WeeklyAvg"
)

pro_sd_numeric = calc_variation(pro_sd_numeric, "SymptomDiary")

# Q3 is binary: sum the positive answers over the month before the index
# date, then scale by engagement so patients are comparable.
sd_sum_all = pro_sd_numeric[["StudyId", "IndexDate"]]
sd_sum_all = sd_sum_all.drop_duplicates()
sd_sum = calc_sum_binary_vals(
    pro_sd_numeric, binary_cols=["SymptomDiaryQ3"], time_window=1
)
sd_sum_all = sd_sum_all.merge(sd_sum, on=["StudyId", "IndexDate"], how="left")

sd_sum_all = sd_sum_all.merge(
    pro_sd_engagement, on=["StudyId", "IndexDate"], how="left"
)
mapping_scaling = {"SumSymptomDiaryQ3TW1": "EngagementSymptomDiaryTW1"}
for key in mapping_scaling:
    scale_sum_to_response_rate(sd_sum_all, key, mapping_scaling[key])

# Combine numeric, sum/engagement and categorical SymptomDiary features.
pro_sd_all = pro_sd_numeric.merge(
    sd_sum_all, on=["StudyId", "PatientId", "IndexDate"], how="left"
)
pro_sd_all = pro_sd_all.merge(pro_sd_cat, on=["StudyId", "IndexDate"], how="left")

# Drop intermediate columns, keeping only the derived features.
pro_sd_all = pro_sd_all.loc[
    :,
    ~(
        pro_sd_all.columns.str.startswith("Symptom")
        | pro_sd_all.columns.str.startswith("Sum")
        | pro_sd_all.columns.str.startswith("Response")
    ),
]
pro_sd_all = pro_sd_all.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "TimeWindowStartDate",
        "WeekStartDate",
        "WeekPrevStartDate",
        "TimeSinceSubmission",
    ]
)
pro_sd_all = pro_sd_all.drop_duplicates()
| |
|
| | |
# --- Combine all PRO features and recode categorical answers ---

pro_df = pro_eq5d.merge(pro_mrc, on=["StudyId", "PatientId", "IndexDate"], how="left")
pro_df = pro_df.merge(pro_cat, on=["StudyId", "PatientId", "IndexDate"], how="left")
pro_df = pro_df.merge(pro_sd_all, on=["StudyId", "PatientId", "IndexDate"], how="left")

# Map Q8 (breathing difficulty) numeric codes to labels.
# NOTE(review): Series.str.replace treats the pattern as a regex on older
# pandas defaults, where '.' matches any character — confirm the installed
# version uses literal replacement (regex=False default from pandas 2.0).
q8_dict = {
    "1.0": "Not difficult",
    "2.0": "A little difficult",
    "3.0": "Quite difficult",
    "4.0": "Very difficult",
}
for key in q8_dict:
    pro_df["LatestSymptomDiaryQ8"] = pro_df["LatestSymptomDiaryQ8"].str.replace(
        key, q8_dict[key]
    )

# Map Q9 (sputum consistency) numeric codes to labels.
q9_dict = {
    "1.0": "Watery",
    "2.0": "Sticky liquid",
    "3.0": "Semi-solid",
    "4.0": "Solid",
}
for key in q9_dict:
    pro_df["LatestSymptomDiaryQ9"] = pro_df["LatestSymptomDiaryQ9"].str.replace(
        key, q9_dict[key]
    )

# Map Q10 (sputum colour) numeric codes to labels.
q10_dict = {
    "1.0": "White",
    "2.0": "Yellow",
    "3.0": "Green",
    "4.0": "Dark green",
}
for key in q10_dict:
    pro_df["LatestSymptomDiaryQ10"] = pro_df["LatestSymptomDiaryQ10"].str.replace(
        key, q10_dict[key]
    )

# Drop identifiers and the merge-suffixed recency columns produced by
# calc_last_pro_score across the three merges above.
pro_df = pro_df.drop(
    columns=[
        "PatientId",
        "LatestTimeSinceSubmission",
        "LatestTimeSinceSubmission_x",
        "LatestTimeSinceSubmission_y",
    ]
)
| |
|
| | |
# --- Persist the derived PRO features for the downstream training step ---

os.makedirs(config["outputs"]["processed_data_dir"], exist_ok=True)
if data_to_process == "forward_val":
    pro_df.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "pros_forward_val_" + model_type + ".pkl",
        )
    )
else:
    pro_df.to_pickle(
        os.path.join(
            config["outputs"]["processed_data_dir"],
            "pros_" + model_type + ".pkl",
        )
    )
| |
|