| | """ |
| | Script uses both hospital and community exacerbation events. |
| | |
| | Collate all hospital, clincian verified and patient reported events and apply |
| | PRO LOGIC to determine the number of exacerbation events. Use exacerbation events to |
| | determine the number of rows required per patient in the data and generate random |
| | index dates and setup labels. |
| | """ |
| |
|
| | import model_h |
| | import numpy as np |
| | import os |
| | import sys |
| | import pandas as pd |
| | import matplotlib.pyplot as plt |
| | from datetime import timedelta |
| | import random |
| | import yaml |
| |
|
| | |
| | with open("./training/config.yaml", "r") as config: |
| | config = yaml.safe_load(config) |
| |
|
| | |
| | log = open( |
| | os.path.join( |
| | config["outputs"]["logging_dir"], |
| | "setup_labels" + config["model_settings"]["model_type"] + "2023.log", |
| | ), |
| | "w", |
| | ) |
| | sys.stdout = log |
| |
|
| | |
| | |
| | |
| |
|
| | |
# Patient demographics and submission-window metadata (pipe-delimited export).
_PATIENT_DETAIL_COLS = [
    "PatientId",
    "FirstSubmissionDate",
    "MostRecentSubmissionDate",
    "DateOfBirth",
    "Sex",
    "StudyId",
]
patient_details = pd.read_csv(
    config["inputs"]["raw_data_paths"]["patient_details"],
    usecols=_PATIENT_DETAIL_COLS,
    delimiter="|",
)
| |
|
| | |
| | |
| | receiver_patients = ["RC{:02d}".format(i) for i in range(1, 85)] |
| | |
| | receiver_patients.remove("RC34") |
| |
|
| | |
| | scaleup_patients = ["SU{:02d}".format(i) for i in range(1, 219)] |
| |
|
| | |
| | valid_patients = receiver_patients + scaleup_patients |
| |
|
| | |
# Restrict to patients in either study arm. StudyId values can contain stray
# spaces, so strip them before matching against the id lists.
cleaned_ids = patient_details.StudyId.str.replace(" ", "")
patient_details = patient_details[cleaned_ids.isin(valid_patients)]

# Keep only patients that actually submitted data (both window dates present).
has_window = (
    patient_details.FirstSubmissionDate.notna()
    & patient_details.MostRecentSubmissionDate.notna()
)
patient_details = patient_details[has_window]

# Extract death events and attach a DeathDate per patient; the left join
# keeps patients with no recorded death (DeathDate becomes NaN).
patient_deaths = pd.read_csv(
    config["inputs"]["raw_data_paths"]["patient_events"],
    usecols=["PatientId", "DateOfEvent", "EventType"],
    delimiter="|",
)
patient_deaths = (
    patient_deaths.loc[patient_deaths["EventType"] == "Death"]
    .rename(columns={"DateOfEvent": "DeathDate"})
    .drop(columns=["EventType"])
)
patient_details = patient_details.merge(patient_deaths, on="PatientId", how="left")
| |
|
| | |
| | |
| | |
| |
|
| | |
| | patient_details["LatestPredictionDate"] = config["model_settings"][ |
| | "latest_date_before_bug_break" |
| | ] |
| |
|
| | |
| | patient_details["AfterGapStartDate"] = config["model_settings"][ |
| | "after_bug_fixed_start_date" |
| | ] |
| | patient_details["DataEndDate"] = config["model_settings"]["training_data_end_date"] |
| |
|
| | date_cols = [ |
| | "FirstSubmissionDate", |
| | "MostRecentSubmissionDate", |
| | "LatestPredictionDate", |
| | "AfterGapStartDate", |
| | "DataEndDate", |
| | "DeathDate", |
| | ] |
| | patient_details[date_cols] = patient_details[date_cols].apply( |
| | lambda x: pd.to_datetime(x, utc=True, format="mixed").dt.normalize(), axis=1 |
| | ) |
| |
|
| | |
| | |
| | patient_details["LatestPredictionDate"] = patient_details[ |
| | ["MostRecentSubmissionDate", "LatestPredictionDate", "DeathDate"] |
| | ].min(axis=1) |
| |
|
| | patient_details["DataEndDate"] = patient_details[ |
| | ["MostRecentSubmissionDate", "DataEndDate", "DeathDate"] |
| | ].min(axis=1) |
| |
|
| | |
| | patient_details["LatestIndexDate"] = patient_details[ |
| | "LatestPredictionDate" |
| | ] - pd.DateOffset(days=config["model_settings"]["prediction_window"]) |
| |
|
| | patient_details["LatestIndexAfterGap"] = patient_details["DataEndDate"] - pd.DateOffset( |
| | days=config["model_settings"]["prediction_window"] |
| | ) |
| |
|
| | |
| | patient_details["EarliestIndexDate"] = patient_details[ |
| | "FirstSubmissionDate" |
| | ] + pd.DateOffset(days=config["model_settings"]["lookback_period"]) |
| |
|
| | patient_details["EarliestIndexAfterGap"] = patient_details[ |
| | "AfterGapStartDate" |
| | ] + pd.DateOffset(days=config["model_settings"]["lookback_period"]) |
| |
|
| | |
| | |
| | print("Number of total patients", len(patient_details)) |
| | print( |
| | "Number of patients with too short of a window of data:", |
| | len( |
| | patient_details[ |
| | patient_details["EarliestIndexDate"] > patient_details["LatestIndexDate"] |
| | ] |
| | ), |
| | ) |
| | patient_details = patient_details[ |
| | patient_details["EarliestIndexDate"] < patient_details["LatestIndexDate"] |
| | ] |
| |
|
| | patient_details["DatesAfterGap"] = np.where( |
| | patient_details["EarliestIndexAfterGap"] > patient_details["LatestIndexAfterGap"], |
| | False, |
| | True, |
| | ) |
| |
|
| | |
| | patient_details["FirstLength"] = ( |
| | patient_details["LatestIndexDate"] - patient_details["EarliestIndexDate"] |
| | ).dt.days |
| | patient_details["SecondLength"] = ( |
| | patient_details["LatestIndexAfterGap"] - patient_details["EarliestIndexAfterGap"] |
| | ).dt.days |
| | patient_details["LengthInService"] = np.where( |
| | patient_details["DatesAfterGap"] == True, |
| | patient_details["FirstLength"] + patient_details["SecondLength"], |
| | patient_details["FirstLength"], |
| | ) |
| | patient_details["TotalLength1"] = ( |
| | patient_details["LatestPredictionDate"] - patient_details["FirstSubmissionDate"] |
| | ).dt.days |
| | patient_details["TotalLength2"] = ( |
| | patient_details["DataEndDate"] - patient_details["AfterGapStartDate"] |
| | ).dt.days |
| | patient_details["TotalLengthInService"] = np.where( |
| | patient_details["DatesAfterGap"] == True, |
| | patient_details["TotalLength1"] + patient_details["TotalLength2"], |
| | patient_details["TotalLength1"], |
| | ) |
| |
|
| | |
# Persist the cleaned cohort and keep the id lists used downstream.
patient_details.to_pickle(
    os.path.join(config["outputs"]["output_data_dir"], "patient_details.pkl")
)

model_patients = list(patient_details.PatientId.unique())
model_study_ids = list(patient_details.StudyId.unique())

n_receiver = len(patient_details[patient_details["StudyId"].str.startswith("RC")])
n_scaleup = len(patient_details[patient_details["StudyId"].str.startswith("SU")])
print(
    "Model cohort: {} patients. {} RECEIVER and {} SU".format(
        len(model_patients), n_receiver, n_scaleup
    )
)
| |
|
# Build one row per StudyId per calendar day, covering both data windows.
keep_cols = [
    "PatientId",
    "DateOfBirth",
    "Sex",
    "StudyId",
    "FirstSubmissionDate",
    "EarliestIndexDate",
    "LatestIndexDate",
    "LatestPredictionDate",
    "AfterGapStartDate",
    "EarliestIndexAfterGap",
    "LatestIndexAfterGap",
    "DataEndDate",
]
df1 = patient_details[keep_cols].copy()
df2 = df1.copy()

# First window: first submission through the last prediction date.
df1["DateOfEvent"] = df1.apply(
    lambda row: pd.date_range(
        row.FirstSubmissionDate, row.LatestPredictionDate, freq="D"
    ),
    axis=1,
)
# Second window: after the gap through the end of the training data.
df2["DateOfEvent"] = df2.apply(
    lambda row: pd.date_range(row.AfterGapStartDate, row.DataEndDate, freq="D"),
    axis=1,
)

# Explode each date range into one row per day; second-window ranges can be
# empty, in which case explode leaves a NaN row that must be dropped.
df1 = df1.explode("DateOfEvent").reset_index(drop=True)
df2 = df2.explode("DateOfEvent").reset_index(drop=True)
df2 = df2.dropna(subset=["DateOfEvent"])
df = pd.concat([df1, df2]).sort_values(by=["StudyId", "DateOfEvent"])
| |
|
| | |
| | |
| | |
| |
|
| | |
# Hospital-side events: load, restrict to the model cohort, and derive the
# hospital-exacerbation / admission indicators via model_h helpers.
patient_events = pd.read_csv(
    config["inputs"]["raw_data_paths"]["patient_events"],
    delimiter="|",
    usecols=["PatientId", "DateOfEvent", "EventType"],
)
patient_events = patient_events[patient_events.PatientId.isin(model_patients)]

patient_events["IsHospExac"] = model_h.define_service_exac_event(
    events=patient_events.EventType, include_community=False
)
patient_events["IsHospAdmission"] = model_h.define_hospital_admission(
    patient_events.EventType
)

# Keep only the positive rows, normalise timestamps to midnight UTC, and
# dedupe so each patient contributes at most one event per day.
admissions = patient_events.loc[
    patient_events.IsHospAdmission == 1,
    ["PatientId", "DateOfEvent", "IsHospAdmission"],
]
hosp_exacs = patient_events.loc[
    patient_events.IsHospExac == 1,
    ["PatientId", "DateOfEvent", "IsHospExac"],
]
admissions["DateOfEvent"] = pd.to_datetime(
    admissions.DateOfEvent, utc=True
).dt.normalize()
hosp_exacs["DateOfEvent"] = pd.to_datetime(
    hosp_exacs.DateOfEvent, utc=True
).dt.normalize()
hosp_exacs = hosp_exacs.drop_duplicates()
admissions = admissions.drop_duplicates()

# Persist both event tables for downstream use.
hosp_exacs.to_pickle(
    os.path.join(config["outputs"]["output_data_dir"], "hospital_exacerbations.pkl")
)
admissions.to_pickle(
    os.path.join(config["outputs"]["output_data_dir"], "hospital_admissions.pkl")
)
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
# Patient-reported outcomes: weekly symptom diary (incl. rescue-med Q5).
symptom_diary = pd.read_csv(
    config["inputs"]["raw_data_paths"]["pro_symptom_diary"],
    usecols=[
        "PatientId",
        "StudyId",
        "Score",
        "SubmissionTime",
        "SymptomDiaryQ5",
        "SymptomDiaryQ11a",
        "SymptomDiaryQ11b",
    ],
    delimiter="|",
)

# The Q5 question changed on this date; model_h filters the diary
# accordingly and restricts it to the model cohort.
q5_change_date = pd.to_datetime(
    config["model_settings"]["pro_q5_change_date"], utc=True
)
symptom_diary = model_h.filter_symptom_diary(
    df=symptom_diary, date_cutoff=q5_change_date, patients=model_patients
)

# Derive patient-reported (community) exacerbation events and their dates.
weekly_pros = model_h.get_rescue_med_pro_responses(symptom_diary)
weekly_pros = model_h.set_pro_exac_dates(weekly_pros)
weekly_pros = weekly_pros[
    [
        "PatientId",
        "Q5Answered",
        "NegativeQ5",
        "IsCommExac",
        "DateOfEvent",
        "ExacDateUnknown",
    ]
]
| |
|
| | |
| | |
| | |
| |
|
| | |
# Clinician-verified community exacerbations, recorded per study arm in
# Excel sheets with slightly different column headings.
receiver = pd.read_excel(
    config["inputs"]["raw_data_paths"]["receiver_community_verified_events"]
).rename(
    columns={"Study number": "StudyId", "Exacerbation recorded": "DateRecorded"}
)
receiver_exacs = model_h.extract_clinician_verified_exacerbations(receiver)

scaleup = pd.read_excel(
    config["inputs"]["raw_data_paths"]["scale_up_community_verified_events"]
).rename(
    columns={"Study Number": "StudyId", "Date Exacerbation recorded": "DateRecorded"}
)
# StudyId appears only on a patient's first row in the scale-up sheet, so
# forward-fill it down to the rest of that patient's rows.
scaleup["StudyId"] = scaleup["StudyId"].ffill()
scaleup_exacs = model_h.extract_clinician_verified_exacerbations(scaleup)

# Combine both arms and keep only patients in the model cohort.
verified_exacs = pd.concat([receiver_exacs, scaleup_exacs])
verified_exacs = verified_exacs[verified_exacs.StudyId.isin(model_study_ids)]
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | df = df.merge(verified_exacs, on=["StudyId", "DateOfEvent"], how="left") |
| |
|
| | |
| | df = df.merge(weekly_pros, on=["PatientId", "DateOfEvent"], how="left") |
| |
|
| | |
| | df = df.merge(hosp_exacs, on=["PatientId", "DateOfEvent"], how="left") |
| | df = model_h.fill_column_by_patient(df=df, id_col="PatientId", col="StudyId") |
| |
|
| | |
| | df = df.merge(admissions, on=["PatientId", "DateOfEvent"], how="left") |
| | df = model_h.fill_column_by_patient(df=df, id_col="PatientId", col="StudyId") |
| |
|
| | |
| | df["ExacDateUnknown"] = np.where( |
| | (df.ExacDateUnknown_x == 1) | (df.ExacDateUnknown_y == 1), 1, 0 |
| | ) |
| | df["IsCommExac"] = np.where((df.IsCommExac_x == 1) | (df.IsCommExac_y == 1), 1, 0) |
| |
|
| | |
| | |
| | df["IsExac"] = np.where((df.IsCommExac == 1) | (df.IsHospExac == 1), 1, 0) |
| |
|
| | |
# Expand each StudyId's rows into a complete daily series (merged event
# dates may fall outside the windows built earlier), then densify the flags.
df = (
    df.set_index("DateOfEvent")
    .groupby("StudyId")
    .resample("D")
    .asfreq()
    .drop("StudyId", axis=1)
    .reset_index()
)

# Missing flag values on resample-created rows mean "no event": zero-fill.
flag_cols = [
    "Q5Answered",
    "NegativeQ5",
    "IsHospExac",
    "IsCommExac",
    "ExacDateUnknown",
    "IsExac",
    "IsHospAdmission",
]
df[flag_cols] = df[flag_cols].fillna(0)

# Resample-created rows also lack per-patient metadata; fill it per study id.
for meta_col in ("FirstSubmissionDate", "LatestPredictionDate", "PatientId"):
    df = model_h.fill_column_by_patient(df=df, id_col="StudyId", col=meta_col)
| |
|
| | print("Starting number of exacerbations: {}".format(df.IsExac.sum())) |
| | print( |
| | "Number of exacerbations during COPD service: {}".format( |
| | len(df[(df.IsExac == 1) & (df.DateOfEvent >= df.FirstSubmissionDate)]) |
| | ) |
| | ) |
| | print( |
| | "Number of unique exacerbation patients: {}".format( |
| | len(df[df.IsExac == 1].PatientId.unique()) |
| | ) |
| | ) |
| | print( |
| | "Exacerbation breakdown: {} hospital, {} patient reported and {} overlapping".format( |
| | df.IsHospExac.sum(), |
| | df.IsCommExac.sum(), |
| | len(df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1)]), |
| | ) |
| | ) |
| | print( |
| | "Number of hospital exacerbations during COPD service: {} ({} unique patients)".format( |
| | len(df[(df.IsHospExac == 1) & (df.DateOfEvent >= df.FirstSubmissionDate)]), |
| | len( |
| | df[ |
| | (df.IsHospExac == 1) & (df.DateOfEvent >= df.FirstSubmissionDate) |
| | ].StudyId.unique() |
| | ), |
| | ) |
| | ) |
| | print( |
| | "Clinician verified community exacerbations during COPD service: {} ({} unique patients)".format( |
| | len(df[df.IsCommExac_x == 1]), len(df[df.IsCommExac_x == 1].StudyId.unique()) |
| | ) |
| | ) |
| | print( |
| | "Community exacerbations from weekly PROs: {} ({} unique patients)".format( |
| | len(df[df.IsCommExac_y == 1]), len(df[df.IsCommExac_y == 1].StudyId.unique()) |
| | ) |
| | ) |
| | print( |
| | "Number of patient reported exacerbations with unknown dates: {} ({} overlapping\ |
| | with hospital events)".format( |
| | df.ExacDateUnknown.sum(), |
| | len(df[(df.IsHospExac == 1) & (df.ExacDateUnknown == 1)]), |
| | ) |
| | ) |
| |
|
| | |
| | |
| | df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1), "ExacDateUnknown"] = 0 |
| | print("Remaining exacerbations with unknown dates: {}".format(df.ExacDateUnknown.sum())) |
| |
|
| | df = df.drop( |
| | columns=["IsCommExac_x", "IsCommExac_y", "ExacDateUnknown_x", "ExacDateUnknown_y"] |
| | ) |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | logic_min_days = config["model_settings"]["pro_logic_min_days_after_exac"] |
| | logic_max_days = config["model_settings"]["pro_logic_max_days_after_exac"] |
| |
|
| | |
| | df = ( |
| | df.groupby("StudyId") |
| | .apply( |
| | lambda x: model_h.calculate_days_since_last_event( |
| | df=x, event_col="IsExac", output_col="DaysSinceLastExac" |
| | ) |
| | ) |
| | .reset_index(drop=True) |
| | ) |
| |
|
| | |
| | df["RemoveRow"] = model_h.minimum_period_between_exacerbations( |
| | df, minimum_days=logic_min_days |
| | ) |
| |
|
| | |
| | df["RemoveRow"] = np.where(df["IsHospExac"] == 1, 0, df["RemoveRow"]) |
| |
|
| | print( |
| | "Number of community exacerbations excluded by PRO LOGIC {} day criterion: {}".format( |
| | logic_min_days, len(df[(df.IsExac == 1) & (df.RemoveRow == 1)]) |
| | ) |
| | ) |
| |
|
| | |
| | |
| | consecutive_replies = config["model_settings"]["neg_consecutive_q5_replies"] |
| | df = model_h.apply_logic_response_criterion( |
| | df, |
| | minimum_period=logic_min_days, |
| | maximum_period=logic_max_days, |
| | N=consecutive_replies, |
| | ) |
| |
|
| | |
| | df["RemoveExac"] = np.where(df["IsHospExac"] == 1, 0, df["RemoveExac"]) |
| |
|
| | print( |
| | "Weekly rescue med (Q5) criterion applied to events occurring between {} and {} \ |
| | days after a previous event. {} consecutive negative replies required for the event to \ |
| | count as a new event".format( |
| | logic_min_days, logic_max_days, consecutive_replies |
| | ) |
| | ) |
| | print( |
| | "Number of exacerbations excluded by PRO LOGIC Q5 response criterion: {}".format( |
| | df.RemoveExac.sum() |
| | ) |
| | ) |
| | print( |
| | "Earliest and latest exacerbations excluded: {}, {}".format( |
| | df[df.RemoveExac == 1].DateOfEvent.min(), |
| | df[df.RemoveExac == 1].DateOfEvent.max(), |
| | ) |
| | ) |
| | print( |
| | "Remaining number of exacerbations: {}".format( |
| | len(df[(df.IsExac == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1)]) |
| | ) |
| | ) |
| | print( |
| | "Remaining exacerbations with unknown dates: {}".format( |
| | len(df[(df.ExacDateUnknown == 1) & (df.RemoveRow != 1) & (df.RemoveExac != 1)]) |
| | ) |
| | ) |
| |
|
| | |
# Apply the PRO LOGIC outcomes: clear data between merged events, drop
# events with unknown dates, then remove the rows flagged for removal.
df = model_h.remove_data_between_exacerbations(df)
df = model_h.remove_unknown_date_exacerbations(df)
df = df[df["RemoveRow"] != 1]

# Final event counts after all exclusions.
print("---Final exacerbation counts---")
print(f"Final number of exacerbations: {df.IsExac.sum()}")
exac_patients = pd.Series(df[df.IsExac == 1].StudyId.unique())
n_rc = exac_patients.str.startswith("RC").sum()
n_su = exac_patients.str.startswith("SU").sum()
print(
    f"Number of unique exacerbation patients: {len(exac_patients)} "
    f"({n_rc} RC and {n_su} SU)"
)
n_overlap = len(df.loc[(df.IsCommExac == 1) & (df.IsHospExac == 1)])
print(
    f"Exacerbation breakdown: {df.IsHospExac.sum()} hospital, "
    f"{df.IsCommExac.sum()} patient reported and {n_overlap} overlapping"
)
df.to_pickle(os.path.join(config["outputs"]["output_data_dir"], "hosp_comm_exacs.pkl"))
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | service_time = patient_details[["StudyId", "TotalLengthInService"]] |
| | service_time = service_time.drop_duplicates(subset="StudyId", keep="first") |
| | print(service_time) |
| |
|
| | avg_service_time = sum(service_time["TotalLengthInService"]) / len( |
| | service_time["TotalLengthInService"] |
| | ) |
| | avg_service_time_months = round(avg_service_time / 30) |
| | print("Average time in service (days):", avg_service_time) |
| | print("Average time in service (months):", avg_service_time_months) |
| |
|
| | |
| | avg_exac_per_patient = round( |
| | len(df[(df["IsExac"] == 1)]) / df[(df["IsExac"] == 1) | (df["IsExac"] == 0)][["StudyId"]].nunique().item(), 2 |
| | ) |
| | print( |
| | "Number of exac/patient/months: {} exacerbations/patient in {} months".format( |
| | avg_exac_per_patient, avg_service_time_months |
| | ) |
| | ) |
| | print( |
| | "On average, 1 exacerbation occurs in a patient every: {} months".format( |
| | round(avg_service_time_months / avg_exac_per_patient, 2) |
| | ) |
| | ) |
| |
|
| | |
| | |
| | |
| |
|
| | |
| | service_time["NumRows"] = round( |
| | service_time["TotalLengthInService"] |
| | / config["model_settings"]["one_row_per_days_in_service"] |
| | ).astype("int") |
| | |
| | service_time["NumRows"] = np.where( |
| | service_time["NumRows"] < 1, 1, service_time["NumRows"] |
| | ) |
| | patient_details = pd.merge( |
| | patient_details, service_time[["StudyId", "NumRows"]], on="StudyId", how="left" |
| | ) |
| |
|
| | |
| | patient_details["NumDaysPossibleIndex"] = ( |
| | patient_details["LatestIndexDate"] - patient_details["EarliestIndexDate"] |
| | ).dt.days |
| |
|
| | patient_details["NumDaysPossibleIndex2"] = ( |
| | patient_details["LatestIndexAfterGap"] - patient_details["EarliestIndexAfterGap"] |
| | ).dt.days |
| |
|
| | patient_details.to_csv( |
| | os.path.join( |
| | config["outputs"]["output_data_dir"], "pat_details_to_calc_index_dt.csv" |
| | ), |
| | index=False, |
| | ) |
| |
|
| | |
| | |
| | |
| | random_seed_general = config["model_settings"]["index_date_generation_master_seed"] |
| | random.seed(random_seed_general) |
| |
|
| | |
| | patient_details["RandomSeed"] = random.sample(range(0, 2**32), patient_details.shape[0]) |
| |
|
| | |
| | rand_days_dict = {} |
| | rand_date_dict = {} |
| | for index, row in patient_details.iterrows(): |
| | np.random.seed(row["RandomSeed"]) |
| | rand_days_dict[row["StudyId"]] = np.random.choice( |
| | row["LengthInService"], size=row["NumRows"], replace=False |
| | ) |
| | rand_date_dict[row["StudyId"]] = [ |
| | ( |
| | row["EarliestIndexDate"] + timedelta(days=int(day)) |
| | if day <= row["NumDaysPossibleIndex"] |
| | else row["EarliestIndexAfterGap"] |
| | + timedelta(days=int(day - row["NumDaysPossibleIndex"])) |
| | ) |
| | for day in rand_days_dict[row["StudyId"]] |
| | ] |
| |
|
| | |
| | index_date_df = pd.DataFrame.from_dict(rand_date_dict, orient="index").reset_index() |
| | index_date_df = index_date_df.rename(columns={"index": "StudyId"}) |
| |
|
| | |
| | index_date_df = ( |
| | pd.melt(index_date_df, id_vars=["StudyId"], value_name="IndexDate") |
| | .drop(["variable"], axis=1) |
| | .sort_values(by=["StudyId", "IndexDate"]) |
| | ) |
| | index_date_df = index_date_df.dropna() |
| | index_date_df = index_date_df.reset_index(drop=True) |
| |
|
| | |
| | exac_events = pd.merge(index_date_df, df, on="StudyId", how="left") |
| | exac_events["IndexDate"] = pd.to_datetime(exac_events["IndexDate"], utc=True) |
| |
|
| | |
| | |
| | exac_events["TimeToEvent"] = ( |
| | exac_events["DateOfEvent"] - exac_events["IndexDate"] |
| | ).dt.days |
| | exac_events["ExacWithin3Months"] = np.where( |
| | ( |
| | exac_events["TimeToEvent"].between( |
| | 1, config["model_settings"]["prediction_window"], inclusive="both" |
| | ) |
| | ) |
| | & (exac_events["IsExac"] == 1), |
| | 1, |
| | 0, |
| | ) |
| | exac_events["HospExacWithin3Months"] = np.where( |
| | ( |
| | exac_events["TimeToEvent"].between( |
| | 1, config["model_settings"]["prediction_window"], inclusive="both" |
| | ) |
| | ) |
| | & (exac_events["IsHospExac"] == 1), |
| | 1, |
| | 0, |
| | ) |
| | exac_events["CommExacWithin3Months"] = np.where( |
| | ( |
| | exac_events["TimeToEvent"].between( |
| | 1, config["model_settings"]["prediction_window"], inclusive="both" |
| | ) |
| | ) |
| | & (exac_events["IsCommExac"] == 1), |
| | 1, |
| | 0, |
| | ) |
| |
|
# Collapse to one row per (StudyId, IndexDate). Sorting ExacWithin3Months
# descending before keeping the first row means the retained row is positive
# whenever ANY day in the window carried an exacerbation.
exac_events = exac_events.sort_values(
    by=["StudyId", "IndexDate", "ExacWithin3Months"], ascending=[True, True, False]
).drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")

label_cols = [
    "StudyId",
    "PatientId",
    "IndexDate",
    "DateOfBirth",
    "Sex",
    "ExacWithin3Months",
    "HospExacWithin3Months",
    "CommExacWithin3Months",
]
exac_events = exac_events[label_cols]

# Persist the final label table.
exac_events.to_pickle(
    os.path.join(config["outputs"]["output_data_dir"], "patient_labels_hosp_comm.pkl")
)
| |
|
| | |
# Plot and save the label class balance for this master seed.
class_distribution = (
    exac_events.groupby("ExacWithin3Months").count()[["StudyId"]].reset_index()
)
class_distribution.plot.bar(x="ExacWithin3Months", y="StudyId")
plot_path = (
    f"./plots/class_distributions/final_seed_{random_seed_general}"
    "_class_distribution_hosp_comm.png"
)
plt.savefig(plot_path, bbox_inches="tight")
| |
|
| | print("---Summary info after setting up labels---") |
| | print("Number of unique patients:", exac_events["StudyId"].nunique()) |
| | print("Number of rows:", len(exac_events)) |
| | print( |
| | "Number of exacerbations within 3 months of index date:", |
| | len(exac_events[exac_events["ExacWithin3Months"] == 1]), |
| | ) |
| | print( |
| | "Percentage positive class (num exac/total rows): {} %".format( |
| | round( |
| | (len(exac_events[exac_events["ExacWithin3Months"] == 1]) / len(exac_events)) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage negative class: {} %".format( |
| | round( |
| | (len(exac_events[exac_events["ExacWithin3Months"] == 0]) / len(exac_events)) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage hospital exacs: {} %".format( |
| | round( |
| | ( |
| | len(exac_events[exac_events["HospExacWithin3Months"] == 1]) |
| | / len(exac_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage community exacs: {} %".format( |
| | round( |
| | ( |
| | len(exac_events[exac_events["CommExacWithin3Months"] == 1]) |
| | / len(exac_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print("Class balance:") |
| | print(class_distribution) |
| |
|
| | print("---Events based on dates---") |
| | verified_events = exac_events[ |
| | exac_events["IndexDate"] <= pd.to_datetime("2021-11-30", utc=True) |
| | ] |
| | unverified_events = exac_events[ |
| | exac_events["IndexDate"] > pd.to_datetime("2021-11-30", utc=True) |
| | ] |
| | print("---Verified events---") |
| | print( |
| | "Percentage positive class (num exac/total rows): {} %".format( |
| | round( |
| | ( |
| | len(verified_events[verified_events["ExacWithin3Months"] == 1]) |
| | / len(verified_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage negative class: {} %".format( |
| | round( |
| | ( |
| | len(verified_events[verified_events["ExacWithin3Months"] == 0]) |
| | / len(verified_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage hospital exacs: {} %".format( |
| | round( |
| | ( |
| | len(verified_events[verified_events["HospExacWithin3Months"] == 1]) |
| | / len(verified_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage community exacs: {} %".format( |
| | round( |
| | ( |
| | len(verified_events[verified_events["CommExacWithin3Months"] == 1]) |
| | / len(verified_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print("---Unverified events---") |
| | print( |
| | "Percentage positive class (num exac/total rows): {} %".format( |
| | round( |
| | ( |
| | len(unverified_events[unverified_events["ExacWithin3Months"] == 1]) |
| | / len(unverified_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage negative class: {} %".format( |
| | round( |
| | ( |
| | len(unverified_events[unverified_events["ExacWithin3Months"] == 0]) |
| | / len(unverified_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage hospital exacs: {} %".format( |
| | round( |
| | ( |
| | len(unverified_events[unverified_events["HospExacWithin3Months"] == 1]) |
| | / len(unverified_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Percentage community exacs: {} %".format( |
| | round( |
| | ( |
| | len(unverified_events[unverified_events["CommExacWithin3Months"] == 1]) |
| | / len(unverified_events) |
| | ) |
| | * 100, |
| | 2, |
| | ) |
| | ) |
| | ) |
| | print( |
| | "Train date range", exac_events["IndexDate"].min(), exac_events["IndexDate"].max() |
| | ) |
| |
|