| | """Prepare final train set for cross-validation (K fold encoded, scaled and imputed).""" |
| | import copd |
| | |
| | from lenusml import crossvalidation |
| | import os |
| | import pandas as pd |
| | import numpy as np |
| | import seaborn as sns |
| | from sklearn.preprocessing import MinMaxScaler |
| | from sklearn.impute import SimpleImputer |
| |
|
# Global seaborn appearance for any figures produced from this script.
sns.set(context='talk', style='darkgrid')
sns.set_palette('dark')

# Named colour palettes kept at module level for plotting code.
muted, dark = (sns.palettes.color_palette(palette=palette_name)
               for palette_name in ('muted', 'dark'))
| |
|
# Input / output locations used throughout the script.
data_dir = '<YOUR_DATA_PATH>/train_data/'
cohort_info_dir = '../data/cohort_info/'
output_data_dir = '../data/models/model1'

# Patient-to-fold assignment used by the K-fold helpers further down.
# NOTE: allow_pickle=True / read_pickle execute pickled content; only point
# these at files you trust.
fold_patients_path = os.path.join(cohort_info_dir, 'fold_patients.npy')
fold_patients = np.load(fold_patients_path, allow_pickle=True)

# Train set as saved by an earlier step (not shown in this file).
train_data_path = os.path.join(data_dir, 'train_data.pkl')
data = pd.read_pickle(train_data_path)

# Rows flagged IsExac == 1 and the distinct StudyIds that have such a row.
exacs = data.loc[data['IsExac'] == 1]
exac_patients = exacs['StudyId'].unique()
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
# Recode boolean flags as 1/0 (bool_mapping is reused later for comorbidities).
bool_mapping = {True: 1, False: 0}
for flag_column in ('RequiredAcuteNIV', 'RequiredICUAdmission'):
    data[flag_column] = data[flag_column].replace(bool_mapping)

# Replace the Sex column with a single indicator: 1 for 'F', 0 for 'M'.
# Values outside the mapping become NaN because Series.map is used.
sex_mapping = {'F': 1, 'M': 0}
data = data.assign(Sex_F=data['Sex'].map(sex_mapping)).drop(columns=['Sex'])
| |
|
| | |
| | |
| | |
# Load the two PRO exports (CAT and symptom diary); both are pipe-delimited.
pro_dataset_dir = '<YOUR_DATA_PATH>/copd-dataset/'
pro_keys = ['PatientId', 'SubmissionTime']

cat = pd.read_csv(os.path.join(pro_dataset_dir, 'CopdDatasetProCat.txt'), sep="|")
symptom_diary = pd.read_csv(
    os.path.join(pro_dataset_dir, 'CopdDatasetProSymptomDiary.txt'),
    sep="|",
    usecols=pro_keys + ['SymptomDiaryQ1', 'SymptomDiaryQ2', 'SymptomDiaryQ3',
                        'SymptomDiaryQ8', 'SymptomDiaryQ9', 'SymptomDiaryQ10'])

# Parse submission timestamps as UTC and truncate them to midnight so that
# the two sources can be joined on calendar day.
for pro_frame in (cat, symptom_diary):
    pro_frame['SubmissionTime'] = pd.to_datetime(
        pro_frame['SubmissionTime'], utc=True).dt.normalize()

# Keep only submissions from patients present in the train set.
train_patient_ids = data['PatientId']
cat = cat[cat['PatientId'].isin(train_patient_ids)]
symptom_diary = symptom_diary[symptom_diary['PatientId'].isin(train_patient_ids)]

# One row per patient per day (first submission wins), restricted to days
# on which both a CAT and a symptom diary entry exist.
daily_cat = cat.drop_duplicates(subset=pro_keys)
daily_diary = symptom_diary.drop_duplicates(subset=pro_keys)
daily_pros = daily_cat.merge(daily_diary, on=pro_keys, how='inner')
| |
|
| | |
# Numeric PRO columns that get a "difference from recent average" feature:
# CATQ1..CATQ8, the first two symptom diary questions and the Score column.
numeric_pros = (['CATQ{}'.format(question) for question in range(1, 9)]
                + ['SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score'])

# Per-patient rolling mean over the previous period (window=3); the exact
# windowing semantics live in copd.rolling_mean_previous_period.
mean_pros = copd.rolling_mean_previous_period(
    df=daily_pros,
    cols=numeric_pros,
    date_col='SubmissionTime',
    id_col='StudyId',
    window=3,
)

# Attach the rolling means to each daily row, then let the helper derive the
# difference of each value from its rolling mean.
pro_day_keys = ['StudyId', 'SubmissionTime']
daily_pros = daily_pros.merge(mean_pros, on=pro_day_keys, how='left')
daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)

# Drop every column whose name ends in '_ave' (presumably the intermediate
# rolling-mean columns added above - confirm against the copd helpers).
is_average_column = daily_pros.columns.str.endswith('_ave')
daily_pros = daily_pros.loc[:, ~is_average_column]
| |
|
| | |
# merge_asof needs both sides sorted on their time key. StudyId is removed
# from the PRO side because the left frame already carries it.
patient_days_sorted = data.sort_values(by='DateOfEvent')
pros_sorted = daily_pros.drop(columns=['StudyId']).sort_values(by='SubmissionTime')

# For every patient-day take the most recent PRO submission made on or before
# DateOfEvent by the same patient; days with no earlier submission get
# missing values in the PRO columns.
train_data = pd.merge_asof(
    patient_days_sorted,
    pros_sorted,
    left_on='DateOfEvent',
    right_on='SubmissionTime',
    by='PatientId',
    direction='backward',
)
| |
|
| | |
| | |
| | |
# Comorbidity table: after dropping the bookkeeping columns, every column
# other than PatientId is treated as a comorbidity flag.
comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
                            delimiter='|')
comorbidities = comorbidities.drop(columns=['Id', 'Created'])
comorbidity_list = list(comorbidities.columns)
comorbidity_list.remove('PatientId')

# Restrict to train patients. The explicit copy makes the filtered frame
# independent of the full table, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)].copy()
# Count distinct patients (not rows) so the numerator is comparable with the
# unique-patient denominator.
print('Train patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
    comorbidities.PatientId.nunique(), len(data.PatientId.unique())))
# Flags become 1/0; a missing flag is treated as "condition absent".
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping).fillna(0)
print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())

# Attach the flags to every patient-day.
# NOTE(review): this assumes at most one comorbidity row per PatientId; more
# than one would duplicate patient-days in train_data - confirm.
train_data = train_data.merge(comorbidities, on='PatientId', how='left')
print('Comorbidity counts after merging with patient days:', '\n',
      train_data[comorbidity_list].sum())
# Patients without a comorbidity record are treated as having none.
train_data[comorbidity_list] = train_data[comorbidity_list].fillna(0)

# Total number of comorbidities per row (AsthmaOverlap included), and the
# per-patient maximum of that total.
train_data['Comorbidities'] = train_data[comorbidity_list].sum(axis=1)
comorb_counts = train_data.groupby('StudyId')['Comorbidities'].max().reset_index()

# Mark patients that have at least one exacerbation row in the train data.
comorb_counts.loc[comorb_counts.StudyId.isin(exac_patients), 'IsExacPatient'] = 1
comorb_counts['IsExacPatient'] = comorb_counts['IsExacPatient'].fillna(0)

# Keep AsthmaOverlap as an individual feature; all other per-condition
# columns are dropped now that they are summarised in 'Comorbidities'.
comorbidity_list.remove('AsthmaOverlap')
train_data = train_data.drop(columns=comorbidity_list)
| |
|
| | |
| | |
| | |
| | |
# Usual-therapy export: only the patient id and inhaler type are needed.
usual_therapies = pd.read_csv(
    '<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
    delimiter='|',
    usecols=['StudyId', 'InhalerType'],
)

# Keep train patients only, then let the copd helper derive the triple
# therapy indicator (see copd.triple_inhaler_therapy_service for the rules
# applied when include_mitt=True).
train_therapies = usual_therapies[usual_therapies['StudyId'].isin(data['StudyId'])]
inhaler_type = copd.triple_inhaler_therapy_service(
    df=train_therapies,
    id_col='StudyId',
    inhaler_col='InhalerType',
    include_mitt=True,
)

print('Patients taking triple inhaler therapy: ', '\n',
      inhaler_type['TripleTherapy'].value_counts())

# Attach the therapy information to every patient-day.
train_data = pd.merge(train_data, inhaler_type, on='StudyId', how='left')
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
# Replace numeric codes with readable category labels. For the symptom diary
# questions a missing answer becomes the explicit category 'None'.
category_labels = {
    'SymptomDiaryQ8': {1: 'Not difficult', 2: 'A little difficult',
                       3: 'Quite difficult', 4: 'Very difficult', np.nan: 'None'},
    'SymptomDiaryQ9': {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid',
                       4: 'Solid', np.nan: 'None'},
    'SymptomDiaryQ10': {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green',
                        np.nan: 'None'},
    'SmokingStatus': {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'},
}
for coded_column, code_to_label in category_labels.items():
    train_data[coded_column] = train_data[coded_column].replace(code_to_label)

# Boolean copy of the IsExac target (0 -> False, 1 -> True).
train_data['InExacWindow'] = train_data['IsExac'].replace({0: False, 1: True})
| |
|
| | |
| | |
| | |
| |
|
# Whole days between the patient-day and the PRO submission matched to it.
# SubmissionTime is NaT for patient-days that had no earlier submission in the
# backward as-of merge, which makes the day count NaN. Casting straight to int
# would raise on those rows, so the value is kept as-is until after filtering.
train_data['DaysSinceCAT'] = (train_data.DateOfEvent -
                              train_data.SubmissionTime).dt.days

# Keep only patient-days whose matched submission is recent enough. NaN fails
# the comparison, so rows without any submission are dropped here as well.
DaysSinceCAT_cutoff = 14
# The explicit copy makes the filtered frame independent, so later column
# assignments on train_data do not trigger SettingWithCopyWarning.
train_data = train_data[train_data.DaysSinceCAT <= DaysSinceCAT_cutoff].copy()

# Every remaining value is finite, so the integer cast is now safe.
train_data['DaysSinceCAT'] = train_data['DaysSinceCAT'].astype('int')
| | |
| | |
| | |
| |
|
| | |
# Bin edges and labels for the numeric columns that are turned into
# categories. How the edges are closed is decided by copd.bin_numeric_column.
exac_bins = [-1, 0, 21, 90, 180, np.inf]
exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']

age_bins = [0, 50, 60, 70, 80, np.inf]
age_labels = ['<50', '50-59', '60-69', '70-79', '80+']

comorb_bins = [0, 1, 3, np.inf]
comorb_labels = ['None', '1-2', '3+']

spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

# Columns of train_data that are binned in place.
in_place_binning = (
    ('DaysSinceLastExac', exac_bins, exac_labels),
    ('Age', age_bins, age_labels),
    ('Comorbidities', comorb_bins, comorb_labels),
)
for binned_column, column_bins, column_labels in in_place_binning:
    train_data[binned_column] = copd.bin_numeric_column(
        col=train_data[binned_column], bins=column_bins, labels=column_labels)

# The per-patient summary keeps the raw count and gets a separate binned column.
comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
    col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)

# FEV1 percent predicted is binned into a new column and the raw one removed.
raw_fev1_column = 'LungFunction_FEV1PercentPredicted'
train_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=train_data[raw_fev1_column], bins=spirometry_bins, labels=spirometry_labels)
train_data = train_data.drop(columns=[raw_fev1_column])

# Rows whose binned value is the string 'nan' are assigned to 'Mild'.
fev1_is_missing = train_data['FEV1PercentPredicted'] == 'nan'
train_data.loc[fev1_is_missing, 'FEV1PercentPredicted'] = 'Mild'
# NOTE(review): the result of this expression is discarded when the file is
# run as a script.
train_data['FEV1PercentPredicted'].value_counts()
| |
|
| | |
| | |
| | |
# Binary flag: 1 when the highest eosinophil count is at least 0.3, else 0
# (a missing count fails the comparison and therefore yields 0).
eosinophil_at_least_0_3 = train_data['LabsHighestEosinophilCount'] >= 0.3
train_data['HighestEosinophilCount_0_3'] = np.where(eosinophil_at_least_0_3, 1, 0)

# The raw count is not kept as a feature.
train_data = train_data.drop(columns=['LabsHighestEosinophilCount'])
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
# Categorical features to be encoded against the IsExac target.
categorical_columns = [
    'SmokingStatus',
    'SymptomDiaryQ8',
    'SymptomDiaryQ9',
    'SymptomDiaryQ10',
    'DaysSinceLastExac',
    'Age',
    'Comorbidities',
    'FEV1PercentPredicted',
]
# Cast to string before handing the columns to the encoder.
train_data[categorical_columns] = train_data[categorical_columns].astype("str")

# Encoding is done by the copd helper using the patient folds; see
# copd.kfold_encode_train_data for the names of the columns it adds.
data_encoded = copd.kfold_encode_train_data(
    df=train_data,
    fold_patients=fold_patients,
    cols_to_encode=categorical_columns,
    target='IsExac',
    id_col='StudyId',
)

# Remove the raw categorical columns together with identifier / date columns
# that must not be used as model features.
non_feature_columns = ['PatientId', 'InExacWindow', 'DateOfEvent', 'SubmissionTime',
                       'FirstSubmissionDate', 'LatestPredictionDate']
data_encoded = data_encoded.drop(columns=categorical_columns + non_feature_columns)
| |
|
# Arguments shared by both fold-wise processing steps.
kfold_args = dict(fold_patients=fold_patients, id_col='StudyId', target='IsExac')

# Step 1: min-max scaling, applied through the project's K-fold helper.
scaler = MinMaxScaler()
train_data_scaled = crossvalidation.kfold_process_train_data(
    df=data_encoded, processor=scaler, **kfold_args)

# Step 2: median imputation of the remaining NaNs, applied the same way on
# the scaled data.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
train_data_imputed = crossvalidation.kfold_process_train_data(
    df=train_data_scaled, processor=imputer, **kfold_args)
| | |
| | |
| | |
| |
|
| | |
# Persist the final cross-validation train set, creating the output folder
# if it does not exist yet.
os.makedirs(output_data_dir, exist_ok=True)
cv_output_path = os.path.join(output_data_dir, 'train_data_cv.pkl')
train_data_imputed.to_pickle(cv_output_path)
print('Final train data saved (CV)')
| |
|