| | """ |
| | Script uses only hospital exacerbation events. |
| | |
| | Collate all hospital events to determine the number of exacerbation events. Use exacerbation |
| | events to determine the number of rows required per patient in the data and generate |
| | random index dates and setup labels. |
| | """ |
| | import model_h |
| | import numpy as np |
| | import os |
| | import pandas as pd |
| | import sys |
| | import matplotlib.pyplot as plt |
| | from datetime import timedelta |
| | import random |
| |
|
# Locations of the service export and of the model's working data.
data_dir_service = "<YOUR_DATA_PATH>/copd-dataset"
data_dir_model = "./data"

# Send everything printed from here on to this script's log file. The handle
# is kept in `log` and stays open for the remainder of the run.
sys.stdout = log = open("./training/logging/setup_labels_only_hosp.log", "w")

# Window length in days: subtracted from the latest prediction date to get the
# latest index date, and used as the upper bound of the label window.
model_time_window = 90
| |
|
| | |
| | |
| | |
| |
|
| | |
# Read only the patient-detail columns this script uses from the
# pipe-delimited service export.
patient_detail_columns = [
    "PatientId",
    "FirstSubmissionDate",
    "MostRecentSubmissionDate",
    "DateOfBirth",
    "Sex",
    "StudyId",
]
patient_details_path = os.path.join(
    data_dir_service, "CopdDatasetPatientDetails.txt"
)
patient_details = pd.read_csv(
    patient_details_path,
    usecols=patient_detail_columns,
    delimiter="|",
)
| |
|
| | |
| | |
| | |
# Study ids RC01..RC84, leaving out RC34.
receiver_patients = ["RC{:02d}".format(n) for n in range(1, 85) if n != 34]

# Study ids SU01..SU218.
scaleup_patients = ["SU{:02d}".format(n) for n in range(1, 219)]

# Every study id accepted into the cohort, RC ids first.
valid_patients = [*receiver_patients, *scaleup_patients]
| |
|
| | |
# Keep patients whose study id (spaces removed) is in the accepted list and
# who have both a first and a most recent submission date.
in_valid_study = patient_details.StudyId.str.replace(" ", "").isin(valid_patients)
has_submission_dates = (
    patient_details.FirstSubmissionDate.notna()
    & patient_details.MostRecentSubmissionDate.notna()
)
patient_details = patient_details[in_valid_study & has_submission_dates]
| |
|
| | |
# Fixed upper bound for the prediction date, applied to every patient.
patient_details["LatestPredictionDate"] = "2022-02-28"

# Parse the date columns as UTC timestamps truncated to midnight. Each column
# is converted in one vectorised call rather than once per patient row;
# format="mixed" infers the format per element, so the parsed values are the
# same either way.
date_cols = ["FirstSubmissionDate", "MostRecentSubmissionDate", "LatestPredictionDate"]
for date_col in date_cols:
    patient_details[date_col] = pd.to_datetime(
        patient_details[date_col], utc=True, format="mixed"
    ).dt.normalize()

# The latest prediction date is the earlier of the fixed bound and the
# patient's most recent submission.
patient_details["LatestPredictionDate"] = patient_details[
    ["MostRecentSubmissionDate", "LatestPredictionDate"]
].min(axis=1)

# Latest index date: one model time window before the latest prediction date.
patient_details["LatestIndexDate"] = patient_details[
    "LatestPredictionDate"
] - pd.DateOffset(days=model_time_window)

# Earliest index date: 180 days after the first submission.
patient_details["EarliestIndexDate"] = patient_details[
    "FirstSubmissionDate"
] + pd.DateOffset(days=180)
| |
|
| | |
| | |
# A patient is usable only when the earliest index date falls strictly before
# the latest index date. The same mask drives both the logged count and the
# filter, so the count matches the rows actually removed (including patients
# whose two dates are equal).
has_index_window = (
    patient_details["EarliestIndexDate"] < patient_details["LatestIndexDate"]
)
print("Number of total patients", len(patient_details))
print(
    "Number of patients with too short of a window of data:",
    int((~has_index_window).sum()),
)
patient_details = patient_details[has_index_window]
| |
|
| | |
# Unique patient and study identifiers of the final cohort.
model_patients = [*patient_details.PatientId.unique()]
model_study_ids = [*patient_details.StudyId.unique()]

# Log the cohort size, split by study id prefix.
receiver_rows = patient_details[patient_details["StudyId"].str.startswith("RC")]
scaleup_rows = patient_details[patient_details["StudyId"].str.startswith("SU")]
print(
    "Model cohort: {} patients. {} RECEIVER and {} SU".format(
        len(model_patients), len(receiver_rows), len(scaleup_rows)
    )
)

# Working frame: an independent copy of the columns used downstream.
cohort_columns = [
    "PatientId",
    "DateOfBirth",
    "Sex",
    "StudyId",
    "FirstSubmissionDate",
    "EarliestIndexDate",
    "LatestIndexDate",
    "LatestPredictionDate",
]
df = patient_details.loc[:, cohort_columns].copy()
| |
|
| | |
| | |
| | |
| |
|
| | |
# Load the hospital exacerbation and hospital admission events.
# NOTE(review): pickle files execute code on load; these are assumed to be
# produced by this project's own pipeline - confirm they are trusted.
hosp_exacs = pd.read_pickle(os.path.join(data_dir_model, "hospital_exacerbations.pkl"))
admissions = pd.read_pickle(os.path.join(data_dir_model, "hospital_admissions.pkl"))

# Combine both event sources, keeping events that appear in either one.
hosp_exacs = hosp_exacs.merge(admissions, on=["PatientId", "DateOfEvent"], how="outer")

# Attach the study id for each event via the cohort's PatientId -> StudyId
# lookup; events for patients outside the cohort get a missing StudyId.
# (Assigned directly from the mapping: the previous np.NaN placeholder column
# was always null, and the np.NaN alias no longer exists in NumPy 2.0.)
patient_id_lookup = patient_details[["PatientId", "StudyId"]]
hosp_exacs["StudyId"] = hosp_exacs.PatientId.map(
    patient_id_lookup.set_index("PatientId").StudyId
)

# Sort the flag columns in descending order so that, when several rows share
# the same StudyId and DateOfEvent, the row kept by keep="first" is the one
# with the highest flag values.
hosp_exacs = hosp_exacs.sort_values(
    by=["StudyId", "DateOfEvent", "IsHospExac", "IsHospAdmission"],
    ascending=[True, True, False, False],
)
exac_data = hosp_exacs.drop_duplicates(subset=["StudyId", "DateOfEvent"], keep="first")
exac_data.to_pickle(os.path.join(data_dir_model, "only_hosp_exacs.pkl"))
| |
|
| | |
# Bring each patient's first submission date and latest prediction date onto
# the event rows.
patient_date_bounds = df[
    ["StudyId", "PatientId", "FirstSubmissionDate", "LatestPredictionDate"]
]
exac_data = exac_data.merge(
    patient_date_bounds, on=["StudyId", "PatientId"], how="left"
)

# Keep events strictly after the first submission and no later than the
# latest prediction date, then drop the helper date columns again.
after_first_submission = exac_data["DateOfEvent"] > exac_data["FirstSubmissionDate"]
within_prediction_range = exac_data.DateOfEvent <= exac_data.LatestPredictionDate
exac_data = exac_data[after_first_submission & within_prediction_range]
exac_data = exac_data.drop(columns=["FirstSubmissionDate", "LatestPredictionDate"])

# One row per (patient, event); patients without events keep a single row.
df = df.merge(exac_data, on=["StudyId", "PatientId"], how="left").rename(
    columns={"IsHospExac": "IsExac"}
)

# Log the exacerbation counts.
exacerbation_rows = df[df.IsExac == 1]
print("Starting number of exacerbations: {}".format(df.IsExac.sum()))
print(
    "Number of unique exacerbation patients: {}".format(
        len(exacerbation_rows.PatientId.unique())
    )
)
print(
    "Hospital exacerbations: {} ({} unique patients)".format(
        len(exacerbation_rows), len(exacerbation_rows.StudyId.unique())
    )
)
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
# One row per study id with the number of days between the first submission
# and the latest prediction date.
service_time = df[
    ["StudyId", "LatestPredictionDate", "FirstSubmissionDate"]
].drop_duplicates(subset="StudyId", keep="first")
time_in_service = (
    service_time["LatestPredictionDate"] - service_time["FirstSubmissionDate"]
)
service_time["ServiceTime"] = time_in_service.dt.days

# Mean time in service, in days and in 30-day months.
service_days = service_time["ServiceTime"]
avg_service_time = sum(service_days) / len(service_days)
avg_service_time_months = round(avg_service_time / 30)
print("Average time in service (days):", avg_service_time)
print("Average time in service (months):", avg_service_time_months)

# Mean number of exacerbations per patient, over patients with at least one.
rows_with_exac = df[df["IsExac"] == 1]
avg_exac_per_patient = round(
    len(rows_with_exac) / rows_with_exac["StudyId"].nunique(), 2
)
print(
    "Number of exac/patient/months: {} exacerbations/patient in {} months".format(
        avg_exac_per_patient, avg_service_time_months
    )
)
months_per_exac = round(avg_service_time_months / avg_exac_per_patient, 2)
print(
    "On average, 1 exacerbation occurs in a patient every: {} months".format(
        months_per_exac
    )
)
| |
|
| | |
| | |
| | |
| |
|
| | |
# Number of rows per patient: one per 180 days of service time, rounded.
service_time["NumRows"] = (service_time["ServiceTime"] / 180).round().astype("int")
patient_details = patient_details.merge(
    service_time[["StudyId", "NumRows"]], on="StudyId", how="left"
)

# Number of days between the earliest and latest index date for each patient.
index_date_span = (
    patient_details["LatestIndexDate"] - patient_details["EarliestIndexDate"]
)
patient_details["NumDaysPossibleIndex"] = index_date_span.dt.days

# Save the inputs used to calculate the index dates.
patient_details.to_csv("./data/pat_details_to_calc_index_dt.csv", index=False)
| |
|
| | |
| | |
| | |
# Master seed: fixes the per-patient seeds drawn below.
random_seed_general = 2188398760
random.seed(random_seed_general)

# One distinct 32-bit seed per patient.
patient_details["RandomSeed"] = random.sample(
    range(0, 2**32), patient_details.shape[0]
)

# For every patient, draw NumRows distinct day offsets from
# [0, NumDaysPossibleIndex) and turn them into index dates counted from the
# earliest index date.
rand_days_dict = {}
rand_date_dict = {}
for _, row in patient_details.iterrows():
    np.random.seed(row["RandomSeed"])
    num_candidate_days = int(row["NumDaysPossibleIndex"])
    # Sampling without replacement cannot return more values than there are
    # candidates (e.g. 271 days in service gives 2 rows but 1 candidate day),
    # so cap the request instead of letting np.random.choice raise.
    num_index_dates = min(int(row["NumRows"]), num_candidate_days)
    rand_days_dict[row["StudyId"]] = np.random.choice(
        num_candidate_days, size=num_index_dates, replace=False
    )
    rand_date_dict[row["StudyId"]] = [
        row["EarliestIndexDate"] + timedelta(days=int(day))
        for day in rand_days_dict[row["StudyId"]]
    ]
| |
|
| | |
# Reshape the {StudyId: [index dates]} mapping into a long table with one row
# per (StudyId, IndexDate). Patients with fewer dates than the widest row are
# padded with missing values, which are dropped after melting.
index_date_df = (
    pd.DataFrame.from_dict(rand_date_dict, orient="index")
    .reset_index()
    .rename(columns={"index": "StudyId"})
    .melt(id_vars=["StudyId"], value_name="IndexDate")
    .drop(columns="variable")
    .sort_values(by=["StudyId", "IndexDate"])
    .dropna()
    .reset_index(drop=True)
)
| |
|
| | |
# Pair every index date with all of that patient's rows in df.
exac_events = index_date_df.merge(df, on="StudyId", how="left")
exac_events["IndexDate"] = pd.to_datetime(exac_events["IndexDate"], utc=True)

# Whole days from the index date to the event.
days_to_event = exac_events["DateOfEvent"] - exac_events["IndexDate"]
exac_events["TimeToEvent"] = days_to_event.dt.days

# Label is 1 when an exacerbation falls 1..model_time_window days (inclusive)
# after the index date, otherwise 0.
event_in_window = exac_events["TimeToEvent"].between(
    1, model_time_window, inclusive="both"
)
event_is_exac = exac_events["IsExac"] == 1
exac_events["ExacWithin3Months"] = np.where(event_in_window & event_is_exac, 1, 0)

# Collapse to one row per (StudyId, IndexDate); sorting the label descending
# means a positive row is the one kept when any exists.
exac_events = exac_events.sort_values(
    by=["StudyId", "IndexDate", "ExacWithin3Months"], ascending=[True, True, False]
).drop_duplicates(subset=["StudyId", "IndexDate"], keep="first")

label_columns = [
    "StudyId",
    "PatientId",
    "IndexDate",
    "DateOfBirth",
    "Sex",
    "ExacWithin3Months",
]
exac_events = exac_events[label_columns]

# Save the labels.
exac_events.to_pickle(os.path.join(data_dir_model, "patient_labels_only_hosp.pkl"))
| |
|
| | |
# Row count per label value.
class_distribution = (
    exac_events.groupby("ExacWithin3Months")[["StudyId"]].count().reset_index()
)

# Bar chart of the class distribution; the file name includes the master seed.
class_distribution.plot.bar(x="ExacWithin3Months", y="StudyId")
plt.title("Class distribution of hospital exacerbations occuring within 3 months")
class_plot_path = (
    "./plots/class_distributions/final_seed_"
    + str(random_seed_general)
    + "_class_distribution_only_hosp.png"
)
plt.savefig(class_plot_path, bbox_inches="tight")

# Summary of the final label table.
total_rows = len(exac_events)
positive_rows = len(exac_events[exac_events["ExacWithin3Months"] == 1])
negative_rows = len(exac_events[exac_events["ExacWithin3Months"] == 0])

print("---Summary info after setting up labels---")
print("Number of unique patients:", exac_events["StudyId"].nunique())
print("Number of rows:", total_rows)
print("Number of exacerbations within 3 months of index date:", positive_rows)
print(
    "Percentage positive class (num exac/total rows): {} %".format(
        round((positive_rows / total_rows) * 100, 2)
    )
)
print(
    "Percentage negative class: {} %".format(
        round((negative_rows / total_rows) * 100, 2)
    )
)
print("Class balance:")
print(class_distribution)
| |
|