| | """ |
| | Script uses both hospital and community exacerbation events. |
| | |
| | Collate all hospital, patient reported events and apply PRO LOGIC to determine the number |
| | of exacerbation events. Use exacerbation events to determine the number of rows required per |
| | patient in the data and generate random index dates and setup labels. Data starts at July |
| | 2022 and runs until Dec 2023 and will be used for forward validation of the model. |
| | """ |
| | import model_h |
| | import numpy as np |
| | import os |
| | import sys |
| | import pandas as pd |
| | import matplotlib.pyplot as plt |
| | from datetime import timedelta |
| | import random |
| | import yaml |
| |
|
# Load the pipeline configuration (input paths, output locations, model settings).
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Redirect everything printed from here on into a log file inside the
# configured logging directory.
log_path = os.path.join(
    config["outputs"]["logging_dir"], "setup_labels_hosp_comm.log"
)
log = open(log_path, "w")
sys.stdout = log
| |
|
| | |
| | |
| | |
| |
|
| | |
# Read only the patient-level columns this script uses from the
# pipe-delimited patient details extract.
patient_detail_columns = [
    "PatientId",
    "FirstSubmissionDate",
    "MostRecentSubmissionDate",
    "DateOfBirth",
    "Sex",
    "StudyId",
]
patient_details = pd.read_csv(
    config["inputs"]["raw_data_paths"]["patient_details"],
    sep="|",
    usecols=patient_detail_columns,
)
| |
|
| | |
| | |
# Study IDs RC01-RC84, leaving out RC34.
receiver_patients = [
    f"RC{number:02d}" for number in range(1, 85) if number != 34
]

# Study IDs SU01-SU218.
scaleup_patients = [f"SU{number:02d}" for number in range(1, 219)]

# Full list of study IDs eligible for the cohort.
valid_patients = receiver_patients + scaleup_patients

# Keep patients whose study ID (ignoring embedded spaces) is in the eligible
# list and who have both a first and a most recent submission date.
in_valid_cohort = (
    patient_details["StudyId"].str.replace(" ", "").isin(valid_patients)
)
has_submission_dates = (
    patient_details["FirstSubmissionDate"].notna()
    & patient_details["MostRecentSubmissionDate"].notna()
)
patient_details = patient_details[in_valid_cohort & has_submission_dates]
| |
|
| | |
# Every patient shares the same forward-validation bounds, taken from config.
patient_details["EarliestIndexDate"] = config["model_settings"][
    "forward_validation_earliest_date"
]
patient_details["LatestPredictionDate"] = config["model_settings"][
    "forward_validation_latest_date"
]

# Parse all date columns to UTC and truncate them to midnight.
# The conversion runs once per column instead of once per row: each value is
# still parsed individually (format="mixed"), so the result is the same, but
# this avoids one Python-level call per patient and yields a proper datetime
# dtype for each column.
date_cols = [
    "FirstSubmissionDate",
    "MostRecentSubmissionDate",
    "LatestPredictionDate",
    "EarliestIndexDate",
]
patient_details[date_cols] = patient_details[date_cols].apply(
    lambda column: pd.to_datetime(column, utc=True, format="mixed").dt.normalize()
)
| |
|
| | |
| | |
# The prediction period ends at whichever comes first: the patient's most
# recent submission or the configured forward-validation end date.
prediction_end_candidates = patient_details[
    ["MostRecentSubmissionDate", "LatestPredictionDate"]
]
patient_details["LatestPredictionDate"] = prediction_end_candidates.min(axis=1)

# The last usable index date leaves a full prediction window before the end
# of the prediction period.
prediction_window_offset = pd.DateOffset(
    days=config["model_settings"]["prediction_window"]
)
patient_details["LatestIndexDate"] = (
    patient_details["LatestPredictionDate"] - prediction_window_offset
)

# Data is needed from one lookback period before the earliest index date.
lookback_offset = pd.DateOffset(days=config["model_settings"]["lookback_period"])
patient_details["EarliestDataDate"] = (
    patient_details["EarliestIndexDate"] - lookback_offset
)
| |
|
| | |
| | |
# A patient is usable only if at least one index date fits strictly between
# the earliest and latest index dates.
has_index_window = (
    patient_details["EarliestIndexDate"] < patient_details["LatestIndexDate"]
)

print("Number of total patients", len(patient_details))
# Count exactly the patients that the filter below removes. The previous count
# used a strict ">" comparison, so patients whose earliest and latest index
# dates were equal were dropped without appearing in the log.
print(
    "Number of patients with too short of a window of data:",
    int((~has_index_window).sum()),
)
patient_details = patient_details[has_index_window]
patient_details.to_pickle("./data/patient_details_forward_val.pkl")
| |
|
| | |
# Unique identifiers of the patients that make up the model cohort.
model_patients = list(patient_details["PatientId"].unique())
model_study_ids = list(patient_details["StudyId"].unique())

# Log the cohort size, split by study ID prefix.
receiver_rows = patient_details[patient_details["StudyId"].str.startswith("RC")]
scaleup_rows = patient_details[patient_details["StudyId"].str.startswith("SU")]
print(
    "Model cohort: {} patients. {} RECEIVER and {} SU".format(
        len(model_patients), len(receiver_rows), len(scaleup_rows)
    )
)
| |
|
# Start the working frame from the per-patient details needed downstream.
daily_frame_columns = [
    "PatientId",
    "DateOfBirth",
    "Sex",
    "StudyId",
    "EarliestDataDate",
    "EarliestIndexDate",
    "LatestIndexDate",
    "LatestPredictionDate",
]
df = patient_details[daily_frame_columns].copy()

# Give each patient one row per calendar day from their earliest data date to
# their latest prediction date (inclusive).
df["DateOfEvent"] = df.apply(
    lambda patient: pd.date_range(
        patient.EarliestDataDate, patient.LatestPredictionDate, freq="D"
    ),
    axis=1,
)
df = df.explode("DateOfEvent")
df = df.reset_index(drop=True)
| |
|
| | |
| | |
| | |
| |
|
| | |
# Load the service event records (pipe-delimited) for the model cohort.
patient_events = pd.read_csv(
    config["inputs"]["raw_data_paths"]["patient_events"],
    sep="|",
    usecols=["PatientId", "DateOfEvent", "EventType"],
)
in_model_cohort = patient_events["PatientId"].isin(model_patients)
patient_events = patient_events[in_model_cohort]

# Flag exacerbation events from the event type, with community events
# excluded (include_community=False).
patient_events["IsHospExac"] = model_h.define_service_exac_event(
    events=patient_events["EventType"], include_community=False
)

# Flag hospital admissions from the event type.
patient_events["IsHospAdmission"] = model_h.define_hospital_admission(
    patient_events["EventType"]
)
| |
|
def _flagged_daily_events(flag_col):
    """Return the de-duplicated (patient, day) rows where ``flag_col`` equals 1."""
    flagged = patient_events[patient_events[flag_col] == 1][
        ["PatientId", "DateOfEvent", flag_col]
    ]
    # Parse event timestamps to UTC and truncate to midnight so that several
    # events on the same day collapse into one row.
    event_days = pd.to_datetime(flagged["DateOfEvent"], utc=True).dt.normalize()
    return flagged.assign(DateOfEvent=event_days).drop_duplicates()


admissions = _flagged_daily_events("IsHospAdmission")
hosp_exacs = _flagged_daily_events("IsHospExac")

# Persist both event tables.
hosp_exacs.to_pickle("./data/hospital_exacerbations.pkl")
admissions.to_pickle("./data/hospital_admissions.pkl")
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
# Load the symptom diary responses (pipe-delimited), keeping only the
# columns listed here.
symptom_diary_columns = [
    "PatientId",
    "StudyId",
    "Score",
    "SubmissionTime",
    "SymptomDiaryQ5",
    "SymptomDiaryQ11a",
    "SymptomDiaryQ11b",
]
symptom_diary = pd.read_csv(
    config["inputs"]["raw_data_paths"]["pro_symptom_diary"],
    sep="|",
    usecols=symptom_diary_columns,
)

# Filter the diary for the model cohort using the configured Q5 change date
# as the cutoff (exact filtering rules live in model_h.filter_symptom_diary).
Q5ChangeDate = pd.to_datetime(
    config["model_settings"]["pro_q5_change_date"], utc=True
)
symptom_diary = model_h.filter_symptom_diary(
    df=symptom_diary, date_cutoff=Q5ChangeDate, patients=model_patients
)

# Derive the weekly rescue-medication responses and their exacerbation dates,
# then keep only the columns merged into the daily frame.
weekly_pros = model_h.set_pro_exac_dates(
    model_h.get_rescue_med_pro_responses(symptom_diary)
)
weekly_pro_columns = [
    "PatientId",
    "Q5Answered",
    "NegativeQ5",
    "IsCommExac",
    "DateOfEvent",
    "ExacDateUnknown",
]
weekly_pros = weekly_pros[weekly_pro_columns]
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
merge_keys = ["PatientId", "DateOfEvent"]

# Attach the patient-reported (PRO) events to the daily frame.
df = df.merge(weekly_pros, on=merge_keys, how="left")

# Attach hospital exacerbations, then hospital admissions, filling StudyId
# per patient after each merge.
for service_events in (hosp_exacs, admissions):
    df = df.merge(service_events, on=merge_keys, how="left")
    df = model_h.fill_column_by_patient(df=df, id_col="PatientId", col="StudyId")

# A day counts as an exacerbation if either source flagged one.
community_or_hospital = (df["IsCommExac"] == 1) | (df["IsHospExac"] == 1)
df["IsExac"] = np.where(community_or_hospital, 1, 0)
| |
|
| | |
# Resample each patient's rows to a daily frequency so that every day in the
# patient's range has a row.
daily_by_patient = (
    df.set_index("DateOfEvent").groupby("StudyId").resample("D").asfreq()
)
df = daily_by_patient.drop(columns="StudyId").reset_index()

# Days without a recorded event get 0 for every event flag.
event_flag_cols = [
    "Q5Answered",
    "NegativeQ5",
    "IsHospExac",
    "IsCommExac",
    "ExacDateUnknown",
    "IsExac",
    "IsHospAdmission",
]
df[event_flag_cols] = df[event_flag_cols].fillna(0)

# Fill the per-patient constants on every row of that patient.
for constant_col in ("LatestPredictionDate", "PatientId"):
    df = model_h.fill_column_by_patient(df=df, id_col="StudyId", col=constant_col)

# Drop any rows that fall after the patient's latest prediction date.
within_prediction_period = df["DateOfEvent"] <= df["LatestPredictionDate"]
df = df[within_prediction_period]
| |
|
# Masks reused by the log lines below.
is_exac = df["IsExac"] == 1
is_hosp_exac = df["IsHospExac"] == 1
is_comm_exac = df["IsCommExac"] == 1
in_service = df["DateOfEvent"] >= df["EarliestDataDate"]
hosp_in_service = df[is_hosp_exac & in_service]

print("Starting number of exacerbations: {}".format(df.IsExac.sum()))
print(
    "Number of exacerbations during COPD service: {}".format(
        len(df[is_exac & in_service])
    )
)
print(
    "Number of unique exacerbation patients: {}".format(
        len(df[is_exac].PatientId.unique())
    )
)
print(
    "Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping".format(
        df.IsHospExac.sum(),
        df.IsCommExac.sum(),
        len(df.loc[is_comm_exac & is_hosp_exac]),
    )
)
print(
    "Number of hospital exacerbations during COPD service: {} ({} unique patients)".format(
        len(hosp_in_service), len(hosp_in_service.StudyId.unique())
    )
)
print(
    "Community exacerbations from weekly PROs: {} ({} unique patients)".format(
        len(df[is_comm_exac]), len(df[is_comm_exac].StudyId.unique())
    )
)
# The message is built from adjacent literals with an explicit space; the
# earlier backslash continuation joined the words into "overlappingwith".
print(
    (
        "Number of patient reported exacerbations with unknown dates: {} "
        "({} overlapping with hospital events)"
    ).format(
        df.ExacDateUnknown.sum(),
        len(df[is_hosp_exac & (df.ExacDateUnknown == 1)]),
    )
)

# Where a patient-reported event coincides with a hospital event, the date is
# no longer treated as unknown.
df.loc[is_comm_exac & is_hosp_exac, "ExacDateUnknown"] = 0
print("Remaining exacerbations with unknown dates: {}".format(df.ExacDateUnknown.sum()))
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
# PRO LOGIC settings: the configured minimum and maximum number of days after
# an exacerbation.
logic_min_days = config["model_settings"]["pro_logic_min_days_after_exac"]
logic_max_days = config["model_settings"]["pro_logic_max_days_after_exac"]


def _add_days_since_last_exac(patient_rows):
    """Add the DaysSinceLastExac column to one patient's rows."""
    return model_h.calculate_days_since_last_event(
        df=patient_rows, event_col="IsExac", output_col="DaysSinceLastExac"
    )


# Compute the days since the previous exacerbation separately for each patient.
df = df.groupby("StudyId").apply(_add_days_since_last_exac)
df = df.reset_index(drop=True)

# Mark rows for removal using the minimum-days rule.
df["RemoveRow"] = model_h.minimum_period_between_exacerbations(
    df, minimum_days=logic_min_days
)

# Hospital exacerbations are never marked for removal by this rule.
is_hosp_exac_day = df["IsHospExac"] == 1
df["RemoveRow"] = np.where(is_hosp_exac_day, 0, df["RemoveRow"])

excluded_by_min_days = df[(df.IsExac == 1) & (df.RemoveRow == 1)]
print(
    "Number of community exacerbations excluded by PRO LOGIC {} day criterion: {}".format(
        logic_min_days, len(excluded_by_min_days)
    )
)
| |
|
| | |
| | |
# Apply the PRO LOGIC Q5 response criterion, using the configured number of
# consecutive negative Q5 replies.
consecutive_replies = config["model_settings"]["neg_consecutive_q5_replies"]
df = model_h.apply_logic_response_criterion(
    df,
    minimum_period=logic_min_days,
    maximum_period=logic_max_days,
    N=consecutive_replies,
)

# Hospital exacerbations are never excluded by this criterion.
is_hosp_exac_day = df["IsHospExac"] == 1
df["RemoveExac"] = np.where(is_hosp_exac_day, 0, df["RemoveExac"])

print(
    "Weekly rescue med (Q5) criterion applied to events occurring between {} and {} \
days after a previous event. {} consecutive negative replies required for the event to \
count as a new event".format(
        logic_min_days, logic_max_days, consecutive_replies
    )
)
print(
    "Number of exacerbations excluded by PRO LOGIC Q5 response criterion: {}".format(
        df.RemoveExac.sum()
    )
)

excluded_dates = df[df.RemoveExac == 1].DateOfEvent
print(
    "Earliest and latest exacerbations excluded: {}, {}".format(
        excluded_dates.min(), excluded_dates.max()
    )
)

# Rows not removed by either PRO LOGIC rule.
kept_by_logic = (df.RemoveRow != 1) & (df.RemoveExac != 1)
print(
    "Remaining number of exacerbations: {}".format(
        len(df[(df.IsExac == 1) & kept_by_logic])
    )
)
print(
    "Remaining exacerbations with unknown dates: {}".format(
        len(df[(df.ExacDateUnknown == 1) & kept_by_logic])
    )
)
| |
|
| | |
# Apply the two model_h clean-up steps in order, then drop every row that was
# marked for removal.
df = model_h.remove_data_between_exacerbations(df)
df = model_h.remove_unknown_date_exacerbations(df)
df = df[df["RemoveRow"] != 1]

# Log the final exacerbation counts and persist the daily frame.
print("---Final exacerbation counts---")
print("Final number of exacerbations: {}".format(df.IsExac.sum()))

exac_patients = pd.Series(df[df.IsExac == 1].StudyId.unique())
receiver_exac_patients = exac_patients.str.startswith("RC").sum()
scaleup_exac_patients = exac_patients.str.startswith("SU").sum()
print(
    "Number of unique exacerbation patients: {} ({} RC and {} SU)".format(
        len(exac_patients), receiver_exac_patients, scaleup_exac_patients
    )
)

overlapping_exacs = df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1)]
print(
    "Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping".format(
        df.IsHospExac.sum(), df.IsCommExac.sum(), len(overlapping_exacs)
    )
)
df.to_pickle("./data/hosp_comm_exacs.pkl")
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
# One row per patient with the span of days between their earliest data date
# and latest prediction date.
service_time = df[
    ["StudyId", "LatestPredictionDate", "EarliestDataDate"]
].drop_duplicates(subset="StudyId", keep="first")
service_span = (
    service_time["LatestPredictionDate"] - service_time["EarliestDataDate"]
)
service_time["ServiceTime"] = service_span.dt.days

# Average time in service, in days and in (30-day) months.
avg_service_time = service_time["ServiceTime"].sum() / len(service_time)
avg_service_time_months = round(avg_service_time / 30)
print("Average time in service (days):", avg_service_time)
print("Average time in service (months):", avg_service_time_months)

# Average number of exacerbations among patients with at least one.
exac_rows = df[df["IsExac"] == 1]
num_exac_patients = exac_rows["StudyId"].nunique()
avg_exac_per_patient = round(len(exac_rows) / num_exac_patients, 2)
print(
    "Number of exac/patient/months: {} exacerbations/patient in {} months".format(
        avg_exac_per_patient, avg_service_time_months
    )
)

months_per_exac = round(avg_service_time_months / avg_exac_per_patient, 2)
print(
    "On average, 1 exacerbation occurs in a patient every: {} months".format(
        months_per_exac
    )
)
| |
|
| | |
| | |
| | |
| |
|
| | |
# Rows per patient: one for every configured number of days in service,
# rounded to the nearest whole number.
days_per_row = config["model_settings"]["one_row_per_days_in_service"]
service_time["NumRows"] = (
    (service_time["ServiceTime"] / days_per_row).round().astype("int")
)
patient_details = patient_details.merge(
    service_time[["StudyId", "NumRows"]], on="StudyId", how="left"
)

# Number of distinct days available as index dates for each patient.
index_window = (
    patient_details["LatestIndexDate"] - patient_details["EarliestIndexDate"]
)
patient_details["NumDaysPossibleIndex"] = index_window.dt.days
patient_details.to_csv("./data/pat_details_to_calc_index_dt.csv", index=False)

# A patient cannot have more rows than days available as index dates.
too_many_rows = patient_details["NumRows"] > patient_details["NumDaysPossibleIndex"]
patient_details["NumRows"] = np.where(
    too_many_rows,
    patient_details["NumDaysPossibleIndex"],
    patient_details["NumRows"],
)
| |
|
| | |
| | |
| | |
# Seed Python's RNG with the configured master seed so the per-patient seeds
# drawn below are reproducible.
random_seed_general = config["model_settings"]["index_date_generation_master_seed"]
random.seed(random_seed_general)

# Draw one distinct 32-bit seed per patient.
patient_details["RandomSeed"] = random.sample(
    range(2**32), patient_details.shape[0]
)

# For each patient, sample NumRows distinct day offsets without replacement
# and convert them to index dates counted from the earliest index date.
# NOTE(review): offsets are drawn from 0..NumDaysPossibleIndex-1, so
# LatestIndexDate itself is never selected - confirm this is intended.
rand_days_dict = {}
rand_date_dict = {}
for _, patient in patient_details.iterrows():
    study_id = patient["StudyId"]
    np.random.seed(patient["RandomSeed"])
    day_offsets = np.random.choice(
        patient["NumDaysPossibleIndex"], size=patient["NumRows"], replace=False
    )
    rand_days_dict[study_id] = day_offsets
    rand_date_dict[study_id] = [
        patient["EarliestIndexDate"] + timedelta(days=int(offset))
        for offset in day_offsets
    ]
| |
|
| | |
# Turn the {StudyId: [index dates]} mapping into a long table with one row per
# (StudyId, IndexDate). Patients with fewer dates than the widest row are
# padded with missing values in the wide form; those are dropped after melting.
index_date_df = (
    pd.DataFrame.from_dict(rand_date_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "StudyId"})
    .melt(id_vars=["StudyId"], value_name="IndexDate")
    .drop(columns=["variable"])
    .sort_values(by=["StudyId", "IndexDate"])
    .dropna()
    .reset_index(drop=True)
)

# Pair every index date with all of that patient's daily rows.
exac_events = index_date_df.merge(df, on="StudyId", how="left")
exac_events["IndexDate"] = pd.to_datetime(exac_events["IndexDate"], utc=True)
| |
|
| | |
| | |
# Days from each index date to each of the patient's daily rows.
exac_events["TimeToEvent"] = (
    exac_events["DateOfEvent"] - exac_events["IndexDate"]
).dt.days

# A row falls inside the prediction window when it is between 1 and
# `prediction_window` days (inclusive) after the index date.
prediction_window = config["model_settings"]["prediction_window"]
in_prediction_window = exac_events["TimeToEvent"].between(
    1, prediction_window, inclusive="both"
)

# Row-level labels: 1 when that day is inside the window and carries the
# corresponding exacerbation flag.
label_sources = {
    "ExacWithin3Months": "IsExac",
    "HospExacWithin3Months": "IsHospExac",
    "CommExacWithin3Months": "IsCommExac",
}
for label_col, flag_col in label_sources.items():
    exac_events[label_col] = np.where(
        in_prediction_window & (exac_events[flag_col] == 1), 1, 0
    )

# Collapse to one value per (StudyId, IndexDate) BEFORE de-duplicating.
# Previously only the overall label drove which row was kept, so the hospital
# and community labels were read from that single row and could be 0 even
# though a matching event existed elsewhere in the window.
label_cols = list(label_sources)
exac_events[label_cols] = exac_events.groupby(
    ["StudyId", "IndexDate"], dropna=False
)[label_cols].transform("max")

# Keep a single row per patient and index date, ordered by patient and date.
exac_events = exac_events.sort_values(
    by=["StudyId", "IndexDate", "ExacWithin3Months"], ascending=[True, True, False]
)
exac_events = exac_events.drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")
exac_events = exac_events[
    [
        "StudyId",
        "PatientId",
        "IndexDate",
        "DateOfBirth",
        "Sex",
        "ExacWithin3Months",
        "HospExacWithin3Months",
        "CommExacWithin3Months",
    ]
]

# Persist the labelled forward-validation rows.
exac_events.to_pickle("./data/patient_labels_forward_val_hosp_comm.pkl")
| |
|
| | |
# Count labelled rows per class and save the distribution as a bar chart whose
# file name includes the master seed.
class_distribution = (
    exac_events.groupby("ExacWithin3Months")[["StudyId"]].count().reset_index()
)
class_distribution.plot.bar(x="ExacWithin3Months", y="StudyId")

class_distribution_plot_path = (
    f"./plots/class_distributions/final_seed_{random_seed_general}"
    "_class_distribution_hosp_comm.png"
)
plt.savefig(class_distribution_plot_path, bbox_inches="tight")
| |
|
def _percent_of_rows(label_col, value):
    """Return the percentage of labelled rows where ``label_col`` equals ``value``."""
    matching_rows = exac_events[exac_events[label_col] == value]
    return round((len(matching_rows) / len(exac_events)) * 100, 2)


# Log a summary of the labelled dataset.
print("---Summary info after setting up labels---")
print("Number of unique patients:", exac_events["StudyId"].nunique())
print("Number of rows:", len(exac_events))
print(
    "Number of exacerbations within 3 months of index date:",
    len(exac_events[exac_events["ExacWithin3Months"] == 1]),
)
print(
    "Percentage positive class (num exac/total rows): {} %".format(
        _percent_of_rows("ExacWithin3Months", 1)
    )
)
print(
    "Percentage negative class: {} %".format(
        _percent_of_rows("ExacWithin3Months", 0)
    )
)
print(
    "Percentage hospital exacs: {} %".format(
        _percent_of_rows("HospExacWithin3Months", 1)
    )
)
print(
    "Percentage community exacs: {} %".format(
        _percent_of_rows("CommExacWithin3Months", 1)
    )
)
print("Class balance:")
print(class_distribution)