"""
Derive features from PRO responses for multiple time windows and select the time
window that gives the best signal.
"""
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import model_h |
| | import matplotlib.pyplot as plt |
| | from collections import defaultdict |
| |
|
| |
|
def create_cols_for_plotting(
    pro_name, question_col_names=None, var_engagement=False, num_time_windows=6
):
    """
    Create a mapping for the PRO questions specified that allows plotting of the results
    from the same question with different time windows on the same grid. The key of the
    dictionary is the PRO question (e.g. 'EQ5DQ1') and the values are a list containing
    column names to be plotted (e.g. ['DiffLatestAvgEQ5DQ1TW1', 'LatestEQ5DQ1']).

    Args:
        pro_name (str): name of PRO.
        question_col_names (list, optional): a list of question names required for
            plotting. Ignored when var_engagement is truthy. Defaults to None, which
            is treated as "no questions" and yields an empty mapping.
        var_engagement (bool, optional): whether the variable to be plotted is
            engagement. Defaults to False.
        num_time_windows (int, optional): number of time windows; columns are
            generated for TW1..TW<num_time_windows>. Defaults to 6.

    Returns:
        dict of (str:list): dictionary (a defaultdict(list)) containing mapping where
            each key maps to a list of column names.
    """
    cols_for_plotting = defaultdict(list)
    time_window_nums = range(1, num_time_windows + 1)

    if var_engagement:
        # Engagement has one column per time window, all grouped under the PRO name.
        for time_window_num in time_window_nums:
            cols_for_plotting[pro_name].append(
                "Engagement" + pro_name + "TW" + str(time_window_num)
            )
        return cols_for_plotting

    # `or []` guards the None default so the call returns an empty mapping instead of
    # raising TypeError when no questions are supplied.
    for question in question_col_names or []:
        question_key = pro_name + question
        is_sd_q3 = pro_name == "SymptomDiary" and question == "Q3"

        for time_window_num in time_window_nums:
            suffix = question_key + "TW" + str(time_window_num)
            cols_for_plotting[question_key].append("DiffLatestAvg" + suffix)
            if is_sd_q3:
                # Symptom diary Q3 additionally gets its scaled-sum columns, kept
                # under a separate key so they are plotted on their own grid.
                cols_for_plotting["ScaledSum" + question_key].append(
                    "ScaledSum" + suffix
                )

        # Columns that do not depend on the time window are added once per question.
        cols_for_plotting[question_key].append("DiffLatestPrev" + question_key)
        if pro_name in ("EQ5D", "MRC"):
            cols_for_plotting[question_key].append("Latest" + question_key)
        if pro_name in ("CAT", "SymptomDiary"):
            cols_for_plotting[question_key].append("WeekAvg" + question_key)

    return cols_for_plotting
| |
|
| |
|
def plot_feature_signal(
    data, nrows, ncols, figsize, cols_to_plot, fig_name, outcome="ExacWithin3Months"
):
    """
    Plot one boxplot per column, grouped by the outcome, onto a shared grid of
    subplots and save the figure to './plots/boxplots/<fig_name>.png'.

    Args:
        data (pd.DataFrame): dataframe containing all data to plot and outcome column.
        nrows (int): number of rows for the subplot grid.
        ncols (int): number of columns for the subplot grid.
        figsize (tuple): (width, height) in inches.
        cols_to_plot (list): column names to plot. Must not contain more entries than
            the grid has cells (nrows * ncols).
        fig_name (str): name of figure required to save figure.
        outcome (str, optional): name of column to group values by for plotting the
            data. Defaults to 'ExacWithin3Months'.

    Returns:
        None.

    Raises:
        ValueError: if there are more columns to plot than subplots in the grid.
    """
    # squeeze=False always returns a 2-D array of axes, so a 1x1 grid is handled by
    # the same code path as larger grids and every boxplot is drawn on the figure
    # created here (honouring figsize) instead of on a new implicit figure.
    fig, ax = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    try:
        axes = ax.flatten()
        if len(cols_to_plot) > len(axes):
            raise ValueError(
                "Cannot plot "
                + str(len(cols_to_plot))
                + " columns on a "
                + str(nrows)
                + "x"
                + str(ncols)
                + " grid"
            )
        for axis, col in zip(axes, cols_to_plot):
            data.boxplot(
                col,
                outcome,
                ax=axis,
                flierprops={"markersize": 2},
                medianprops={"color": "black"},
            )
        fig.tight_layout()
        fig.savefig("./plots/boxplots/" + fig_name + ".png")
    finally:
        # Close this specific figure even when plotting or saving fails, so figures
        # do not accumulate across the many calls made by the script.
        plt.close(fig)
| |
|
| |
|
# Load the patient label table and the patient details table.
# NOTE(review): read_pickle can execute arbitrary code while unpickling; only load
# files from a trusted source.
data = pd.read_pickle("./data/patient_labels_hosp_comm.pkl")
patient_details = pd.read_pickle("./data/patient_details.pkl")

# Attach the submission/prediction date columns to every row of `data` by StudyId
# (left join, so rows of `data` without patient details are kept).
submission_dates = patient_details[
    ["StudyId", "FirstSubmissionDate", "LatestPredictionDate"]
]
data = pd.merge(data, submission_dates, on="StudyId", how="left")

# Start of the lookback period: 180 days before each index date.
lookback_length = pd.DateOffset(days=180)
data["LookbackStartDate"] = data["IndexDate"] - lookback_length
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
# ---------------------------------------------------------------------------
# EQ5D features
# ---------------------------------------------------------------------------
# Read the pipe-delimited EQ5D responses and inner-join them to the index dates
# and submission date columns held in `data`.
pro_eq5d = pd.read_csv("<YOUR_DATA_PATH>/copd-dataset/CopdDatasetProEQ5D.txt", delimiter="|")
eq5d_index_dates = data[
    ["StudyId", "IndexDate", "FirstSubmissionDate", "LatestPredictionDate"]
]
pro_eq5d = pd.merge(pro_eq5d, eq5d_index_dates, on="StudyId", how="inner")

# Row-wise sum over the label slice EQ5DQ1..EQ5DQ5. Being a label slice, it covers
# every column positioned between those two in the file.
eq5d_q1_to_q5 = pro_eq5d.loc[:, "EQ5DQ1":"EQ5DQ5"]
pro_eq5d["EQ5DScoreWithoutQ6"] = eq5d_q1_to_q5.sum(axis=1)

# Total engagement, then engagement within each of the six time windows. Each
# window's result is left-joined back on (StudyId, IndexDate).
pro_eq5d = model_h.calc_total_pro_engagement(pro_eq5d, "EQ5D")
for window in range(1, 7):
    pro_eq5d_engagement = model_h.calc_pro_engagement_in_time_window(
        pro_eq5d, "EQ5D", time_window=window, data=data
    )
    pro_eq5d = pd.merge(
        pro_eq5d, pro_eq5d_engagement, on=["StudyId", "IndexDate"], how="left"
    )

# Latest score, followed by the score prior to the latest one.
pro_eq5d = model_h.calc_last_pro_score(pro_eq5d, "EQ5D")
pro_eq5d = model_h.calc_pro_score_prior_to_latest(pro_eq5d, "EQ5D")

# Mapping used for the difference calculations. For each key, element 0 of the
# mapped value is used with the per-window average and element 1 with the previous
# score (presumably the "Avg"/"Prev" prefixed column names -- TODO confirm against
# model_h.define_mapping_for_calcs).
question_names_eq5d = ["Q" + str(num) for num in range(1, 7)] + [
    "Score",
    "ScoreWithoutQ6",
]
mapping_eq5d = model_h.define_mapping_for_calcs(
    "EQ5D", question_names_eq5d, prefixes=["Avg", "Prev"]
)

# For every time window: compute the average, then the difference against it.
# calc_diff_pro_scores is called for its effect on pro_eq5d; its return value is
# not used (presumably it adds columns in place -- TODO confirm).
for window in range(1, 7):
    pro_eq5d = model_h.calc_pro_average(pro_eq5d, "EQ5D", time_window=window)
    for latest_col, compare_cols in mapping_eq5d.items():
        model_h.calc_diff_pro_scores(
            pro_eq5d, "EQ5D", latest_col, compare_cols[0], time_window=window
        )

# Difference against the previous score (no time window involved).
for latest_col, compare_cols in mapping_eq5d.items():
    model_h.calc_diff_pro_scores(pro_eq5d, "EQ5D", latest_col, compare_cols[1])

# Keep only the derived features: remove every column whose name starts with one
# of the helper prefixes, drop the remaining bookkeeping columns, and collapse
# identical rows.
eq5d_helper_prefixes = ("Avg", "EQ5D", "Prev", "Response")
eq5d_keep_mask = [
    not col_name.startswith(eq5d_helper_prefixes) for col_name in pro_eq5d.columns
]
pro_eq5d = pro_eq5d.loc[:, eq5d_keep_mask]
pro_eq5d = pro_eq5d.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "FirstSubmissionDate",
        "TimeInService",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "LatestPRODate",
    ]
).drop_duplicates()
| |
|
| | |
| | |
| | |
# ---------------------------------------------------------------------------
# MRC features
# ---------------------------------------------------------------------------
# Read the pipe-delimited MRC responses and inner-join them to the index dates
# and submission date columns held in `data`.
pro_mrc = pd.read_csv("<YOUR_DATA_PATH>/copd-dataset/CopdDatasetProMrc.txt", delimiter="|")
mrc_index_dates = data[
    ["StudyId", "IndexDate", "FirstSubmissionDate", "LatestPredictionDate"]
]
pro_mrc = pd.merge(pro_mrc, mrc_index_dates, on="StudyId", how="inner")

# Total engagement, then engagement within each of the six time windows. Each
# window's result is left-joined back on (StudyId, IndexDate).
pro_mrc = model_h.calc_total_pro_engagement(pro_mrc, "MRC")
for window in range(1, 7):
    pro_mrc_engagement = model_h.calc_pro_engagement_in_time_window(
        pro_mrc, "MRC", time_window=window, data=data
    )
    pro_mrc = pd.merge(
        pro_mrc, pro_mrc_engagement, on=["StudyId", "IndexDate"], how="left"
    )

# Latest score, followed by the score prior to the latest one.
pro_mrc = model_h.calc_last_pro_score(pro_mrc, "MRC")
pro_mrc = model_h.calc_pro_score_prior_to_latest(pro_mrc, "MRC")

# Mapping used for the difference calculations: element 0 of each mapped value is
# used with the per-window average, element 1 with the previous score.
question_names_mrc = ["Q1"]
mapping_mrc = model_h.define_mapping_for_calcs(
    "MRC", question_names_mrc, prefixes=["Avg", "Prev"]
)

# For every time window: compute the average, then the difference against it.
# calc_diff_pro_scores is called for its effect on pro_mrc; its return value is
# not used (presumably it adds columns in place -- TODO confirm).
for window in range(1, 7):
    pro_mrc = model_h.calc_pro_average(pro_mrc, "MRC", time_window=window)
    for latest_col, compare_cols in mapping_mrc.items():
        model_h.calc_diff_pro_scores(
            pro_mrc, "MRC", latest_col, compare_cols[0], time_window=window
        )

# Difference against the previous score (no time window involved).
for latest_col, compare_cols in mapping_mrc.items():
    model_h.calc_diff_pro_scores(pro_mrc, "MRC", latest_col, compare_cols[1])

# Keep only the derived features: remove every column whose name starts with one
# of the helper prefixes, drop the remaining bookkeeping columns, and collapse
# identical rows.
mrc_helper_prefixes = ("Avg", "MRC", "Prev", "Response")
mrc_keep_mask = [
    not col_name.startswith(mrc_helper_prefixes) for col_name in pro_mrc.columns
]
pro_mrc = pro_mrc.loc[:, mrc_keep_mask]
pro_mrc = pro_mrc.drop(
    columns=[
        "SubmissionTime",
        "Score",
        "FirstSubmissionDate",
        "TimeInService",
        "TimeSinceSubmission",
        "LatestPredictionDate",
        "LatestPRODate",
    ]
).drop_duplicates()
| |
|
| | |
| | |
| | |
# ---------------------------------------------------------------------------
# CAT features
# ---------------------------------------------------------------------------
# Read the pipe-delimited CAT responses and inner-join them to the index dates
# and submission date columns held in `data`.
pro_cat = pd.read_csv("<YOUR_DATA_PATH>/copd-dataset/CopdDatasetProCat.txt", delimiter="|")
cat_index_dates = data[
    ["StudyId", "IndexDate", "FirstSubmissionDate", "LatestPredictionDate"]
]
pro_cat = pd.merge(pro_cat, cat_index_dates, on="StudyId", how="inner")

# Total engagement, then engagement within each of the six time windows. Each
# window's result is left-joined back on (StudyId, IndexDate).
pro_cat = model_h.calc_total_pro_engagement(pro_cat, "CAT")
for window in range(1, 7):
    pro_cat_engagement = model_h.calc_pro_engagement_in_time_window(
        pro_cat, "CAT", time_window=window, data=data
    )
    pro_cat = pd.merge(
        pro_cat, pro_cat_engagement, on=["StudyId", "IndexDate"], how="left"
    )

# Two averages that do not depend on the time window: "WeeklyAvg" first, then
# "WeekPrevAvg" (same order as the calls were originally made).
for period in ("WeeklyAvg", "WeekPrevAvg"):
    pro_cat = model_h.calc_pro_average(pro_cat, "CAT", avg_period=period)

# Mapping used for the difference calculations: element 0 of each mapped value is
# used with the per-window "LongerAvg", element 1 with the "WeekPrevAvg".
question_names_cat = ["Q" + str(num) for num in range(1, 9)] + ["Score"]
mapping_cat = model_h.define_mapping_for_calcs(
    "CAT", question_names_cat, prefixes=["LongerAvg", "WeekPrevAvg"]
)

# For every time window: compute the longer average, then the difference against
# it. calc_diff_pro_scores is called for its effect on pro_cat; its return value
# is not used (presumably it adds columns in place -- TODO confirm).
for window in range(1, 7):
    pro_cat = model_h.calc_pro_average(
        pro_cat, "CAT", time_window=window, avg_period="LongerAvg"
    )
    for latest_col, compare_cols in mapping_cat.items():
        model_h.calc_diff_pro_scores(
            pro_cat, "CAT", latest_col, compare_cols[0], time_window=window
        )

# Difference against the previous week's average (no time window involved).
for latest_col, compare_cols in mapping_cat.items():
    model_h.calc_diff_pro_scores(pro_cat, "CAT", latest_col, compare_cols[1])

# Keep only the derived features: remove every column whose name starts with one
# of the helper prefixes, drop the remaining bookkeeping columns, and collapse
# identical rows.
cat_helper_prefixes = ("WeekPrev", "Longer", "CAT", "Response")
cat_keep_mask = [
    not col_name.startswith(cat_helper_prefixes) for col_name in pro_cat.columns
]
pro_cat = pro_cat.loc[:, cat_keep_mask]
pro_cat = pro_cat.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "FirstSubmissionDate",
        "LatestPredictionDate",
        "TimeInService",
        "AvgStartDate",
        "WeekStartDate",
    ]
).drop_duplicates()
| |
|
| | |
| | |
| | |
| | |
# ---------------------------------------------------------------------------
# Symptom diary features
# ---------------------------------------------------------------------------
# Read the pipe-delimited symptom diary responses and inner-join them to the
# index dates and submission date columns held in `data`.
pro_sd = pd.read_csv(
    "<YOUR_DATA_PATH>/copd-dataset/CopdDatasetProSymptomDiary.txt", delimiter="|"
)
sd_index_dates = data[
    ["StudyId", "IndexDate", "FirstSubmissionDate", "LatestPredictionDate"]
]
pro_sd = pd.merge(pro_sd, sd_index_dates, on="StudyId", how="inner")

# --- Engagement ------------------------------------------------------------
# Engagement is tracked in its own table (identifier columns plus the total), to
# which the per-window engagement is left-joined on (StudyId, IndexDate).
pro_sd = model_h.calc_total_pro_engagement(pro_sd, "SymptomDiary")
pro_sd_engagement = pro_sd[
    ["StudyId", "PatientId", "IndexDate", "TotalEngagementSymptomDiary"]
]
for window in range(1, 7):
    pro_sd_engagement_tw = model_h.calc_pro_engagement_in_time_window(
        pro_sd, "SymptomDiary", time_window=window, data=data
    )
    pro_sd_engagement = pd.merge(
        pro_sd_engagement,
        pro_sd_engagement_tw,
        on=["StudyId", "IndexDate"],
        how="left",
    )
# Collapse identical rows of the engagement table.
pro_sd_engagement = pro_sd_engagement.drop_duplicates()

# --- Averages and differences on the question columns -------------------------
# Work on the identifier columns, Score and the three question columns only.
sd_numeric_cols = [
    "StudyId",
    "PatientId",
    "IndexDate",
    "SubmissionTime",
    "Score",
    "SymptomDiaryQ1",
    "SymptomDiaryQ2",
    "SymptomDiaryQ3",
]
pro_sd_numeric = pro_sd[sd_numeric_cols]

# Two averages that do not depend on the time window: "WeeklyAvg" first, then
# "WeekPrevAvg" (same order as the calls were originally made).
for period in ("WeeklyAvg", "WeekPrevAvg"):
    pro_sd_numeric = model_h.calc_pro_average(
        pro_sd_numeric, "SymptomDiary", avg_period=period
    )

# Mapping used for the difference calculations: element 0 of each mapped value is
# used with the per-window "LongerAvg", element 1 with the "WeekPrevAvg".
question_names_sd = ["Q" + str(num) for num in range(1, 4)]
mapping_sd = model_h.define_mapping_for_calcs(
    "SymptomDiary", question_names_sd, prefixes=["LongerAvg", "WeekPrevAvg"]
)

# For every time window: compute the longer average, then the difference against
# it. calc_diff_pro_scores is called for its effect on pro_sd_numeric; its return
# value is not used (presumably it adds columns in place -- TODO confirm).
for window in range(1, 7):
    pro_sd_numeric = model_h.calc_pro_average(
        pro_sd_numeric, "SymptomDiary", time_window=window, avg_period="LongerAvg"
    )
    for latest_col, compare_cols in mapping_sd.items():
        model_h.calc_diff_pro_scores(
            pro_sd_numeric,
            "SymptomDiary",
            latest_col,
            compare_cols[0],
            time_window=window,
        )

# Difference against the previous week's average (no time window involved).
for latest_col, compare_cols in mapping_sd.items():
    model_h.calc_diff_pro_scores(
        pro_sd_numeric, "SymptomDiary", latest_col, compare_cols[1]
    )

# --- Sum of SymptomDiaryQ3 per window, scaled by engagement -------------------
# Start from the unique (StudyId, IndexDate) pairs and left-join one sum per
# time window onto them.
sd_sum_all = pro_sd_numeric[["StudyId", "IndexDate"]].drop_duplicates()
for window in range(1, 7):
    sd_sum = model_h.calc_sum_binary_vals(
        pro_sd_numeric, binary_cols=["SymptomDiaryQ3"], time_window=window
    )
    sd_sum_all = pd.merge(sd_sum_all, sd_sum, on=["StudyId", "IndexDate"], how="left")

# Bring in the engagement columns, then pair each window's sum column with the
# same window's engagement column and scale it. scale_sum_to_response_rate is
# called for its effect on sd_sum_all; its return value is not used.
sd_sum_all = pd.merge(
    sd_sum_all, pro_sd_engagement, on=["StudyId", "IndexDate"], how="left"
)
mapping_scaling = {
    "SumSymptomDiaryQ3TW" + str(window): "EngagementSymptomDiaryTW" + str(window)
    for window in range(1, 7)
}
for sum_col, engagement_col in mapping_scaling.items():
    model_h.scale_sum_to_response_rate(sd_sum_all, sum_col, engagement_col)

# --- Combine and tidy ---------------------------------------------------------
pro_sd_full = pd.merge(
    pro_sd_numeric, sd_sum_all, on=["StudyId", "PatientId", "IndexDate"], how="left"
)

# Keep only the derived features: remove every column whose name starts with one
# of the helper prefixes, drop the remaining bookkeeping columns, and collapse
# identical rows.
sd_helper_prefixes = ("WeekPrev", "Longer", "Symptom", "Sum", "Response")
sd_keep_mask = [
    not col_name.startswith(sd_helper_prefixes) for col_name in pro_sd_full.columns
]
pro_sd_full = pro_sd_full.loc[:, sd_keep_mask]
pro_sd_full = pro_sd_full.drop(
    columns=[
        "Score",
        "SubmissionTime",
        "AvgStartDate",
        "TimeWindowStartDate",
        "WeekStartDate",
    ]
).drop_duplicates()
| |
|
| | |
| | |
| | |
# Left-join the feature table of every PRO onto the patient rows, in the order
# EQ5D, MRC, CAT, symptom diary.
merge_keys = ["StudyId", "PatientId", "IndexDate"]
for pro_features in (pro_eq5d, pro_mrc, pro_cat, pro_sd_full):
    data = data.merge(pro_features, on=merge_keys, how="left")

# Mean of every derived feature per outcome class, transposed so that features are
# rows and outcome values are columns. The feature columns are taken as the label
# slice from the first EQ5D feature to the last symptom diary feature.
# .copy() makes feat_to_explore an independent frame, so adding the outcome column
# below does not write into a slice of `data` (avoids SettingWithCopyWarning).
feat_to_explore = data.loc[
    :, "TotalEngagementEQ5D":"ScaledSumSymptomDiaryQ3TW6"
].copy()
feat_to_explore["ExacWithin3Months"] = data["ExacWithin3Months"]
grouped_data_by_outcome = feat_to_explore.groupby("ExacWithin3Months").mean()
grouped_data_by_outcome = grouped_data_by_outcome.T
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
# ---------------------------------------------------------------------------
# Boxplots of every feature against the outcome
# ---------------------------------------------------------------------------
# For each PRO, in the order EQ5D, MRC, CAT, SymptomDiary:
#   1. one 3x3 figure per question key, saved as "<key>_boxplot";
#   2. one 2x3 figure of the engagement columns, saved as "<key>_engagement".
# NOTE(review): the helpers are taken from model_h although functions with the
# same names are defined in this file -- confirm which implementation is intended.
questions_by_pro = (
    ("EQ5D", question_names_eq5d),
    ("MRC", question_names_mrc),
    ("CAT", question_names_cat),
    ("SymptomDiary", question_names_sd),
)
for pro, pro_questions in questions_by_pro:
    # Question-level features across the time windows.
    cols_for_plotting = model_h.create_cols_for_plotting(
        pro, question_col_names=pro_questions
    )
    for key in cols_for_plotting:
        model_h.plot_feature_signal(
            data,
            nrows=3,
            ncols=3,
            figsize=(12, 12),
            cols_to_plot=cols_for_plotting[key],
            fig_name=key + "_boxplot",
        )

    # Engagement across the time windows.
    cols_for_plotting = model_h.create_cols_for_plotting(pro, var_engagement=True)
    for key in cols_for_plotting:
        model_h.plot_feature_signal(
            data,
            nrows=2,
            ncols=3,
            figsize=(12, 12),
            cols_to_plot=cols_for_plotting[key],
            fig_name=key + "_engagement",
        )
| |
|