| | """Prepare final test set for modelling (K fold encoded, scaled and imputed).""" |
| | import copd |
| | import json |
| | import joblib |
| | from lenusml import encoding |
| | import os |
| | import pandas as pd |
| | import numpy as np |
| |
|
# Input and output locations used throughout the script.
data_dir = '<YOUR_DATA_PATH>/test_data/'
cohort_info_dir = '../data/cohort_info/'
output_data_dir = '../data/models/model1'
# The target encodings, scaler and imputer are read from this subfolder.
artifact_dir = os.path.join(output_data_dir, 'artifacts')

# Load the pickled test set that every later step builds on.
test_pickle_path = os.path.join(data_dir, 'test_data.pkl')
data = pd.read_pickle(test_pickle_path)
| |
|
| | |
| | |
| | |
# Recode the True/False NIV and ICU flags as 1/0.
bool_mapping = {True: 1, False: 0}
for flag_col in ('RequiredAcuteNIV', 'RequiredICUAdmission'):
    data[flag_col] = data[flag_col].replace(bool_mapping)

# Represent sex as one binary indicator (F -> 1, M -> 0). Any other value
# becomes NaN because Series.map leaves unmapped keys missing.
sex_mapping = {'F': 1, 'M': 0}
data['Sex_F'] = data['Sex'].map(sex_mapping)
data = data.drop('Sex', axis=1)
| |
|
| | |
| | |
| | |
# Load the two patient-reported outcome (PRO) exports; both are pipe-delimited.
copd_dataset_dir = '<YOUR_DATA_PATH>/copd-dataset/'
symptom_diary_cols = ['PatientId', 'SubmissionTime', 'SymptomDiaryQ1', 'SymptomDiaryQ2',
                      'SymptomDiaryQ3', 'SymptomDiaryQ8', 'SymptomDiaryQ9',
                      'SymptomDiaryQ10']

cat = pd.read_csv(os.path.join(copd_dataset_dir, 'CopdDatasetProCat.txt'), sep="|")
symptom_diary = pd.read_csv(
    os.path.join(copd_dataset_dir, 'CopdDatasetProSymptomDiary.txt'),
    usecols=symptom_diary_cols, sep="|")

# Parse submission times as UTC and truncate them to midnight so that both
# sources can be matched on calendar day.
for pro_frame in (cat, symptom_diary):
    pro_frame['SubmissionTime'] = pd.to_datetime(
        pro_frame['SubmissionTime'], utc=True).dt.normalize()
| |
|
| |
|
| | |
# Keep only PRO submissions belonging to patients in the test set.
cat = cat.loc[cat['PatientId'].isin(data['PatientId'])]
symptom_diary = symptom_diary.loc[symptom_diary['PatientId'].isin(data['PatientId'])]

# One row per patient per day in each source (the first occurrence is kept),
# then keep only the days on which both questionnaires were submitted.
pro_keys = ['PatientId', 'SubmissionTime']
daily_cat = cat.drop_duplicates(subset=pro_keys)
daily_symptom_diary = symptom_diary.drop_duplicates(subset=pro_keys)
daily_pros = daily_cat.merge(daily_symptom_diary, on=pro_keys, how='inner')
| |
|
| | |
# Numeric PRO responses that are compared against their recent rolling mean.
numeric_pros = ['CATQ1', 'CATQ2', 'CATQ3', 'CATQ4', 'CATQ5', 'CATQ6', 'CATQ7',
                'CATQ8', 'SymptomDiaryQ1', 'SymptomDiaryQ2', 'Score']

# Per-StudyId rolling mean with window=3.
# NOTE(review): the exact window semantics live in the copd helper - confirm there.
mean_pros = copd.rolling_mean_previous_period(
    df=daily_pros, cols=numeric_pros, date_col='SubmissionTime',
    id_col='StudyId', window=3)

# Attach the rolling means to each daily submission and derive the difference
# features from them.
daily_pros = pd.merge(daily_pros, mean_pros, on=['StudyId', 'SubmissionTime'],
                      how='left')
daily_pros = copd.calculate_diff_from_rolling_mean(df=daily_pros, cols=numeric_pros)

# Columns ending in '_ave' are not kept as model inputs.
# Presumably these are the rolling means themselves - verify against the helper.
is_average_col = daily_pros.columns.str.endswith('_ave')
daily_pros = daily_pros.loc[:, ~is_average_col]
| |
|
| | |
# For every patient-day, attach the most recent PRO submission made on or
# before that day (direction='backward'). merge_asof needs both sides sorted
# on their date key. StudyId is dropped from the PRO side because the test
# set already carries it.
events_by_date = data.sort_values(by='DateOfEvent')
pros_by_date = daily_pros.drop(columns=['StudyId']).sort_values(by='SubmissionTime')
test_data = pd.merge_asof(
    events_by_date, pros_by_date,
    left_on='DateOfEvent', right_on='SubmissionTime',
    by='PatientId', direction='backward')
| |
|
| | |
| | |
| | |
# Load the comorbidity flags (pipe-delimited, one column per comorbidity).
comorbidities = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetCoMorbidityDetails.txt',
                            delimiter='|')
comorbidities = comorbidities.drop(columns=['Id', 'Created'])
# Every remaining column except the patient key is treated as a comorbidity flag.
comorbidity_list = list(comorbidities.columns)
comorbidity_list.remove('PatientId')

# Restrict to test-set patients. The explicit .copy() makes the column
# assignment below write to an independent frame rather than to a filtered
# slice of the original (avoids pandas' SettingWithCopyWarning).
comorbidities = comorbidities[comorbidities.PatientId.isin(data.PatientId)].copy()
# Count unique patients, not rows, so the figure is comparable with the
# unique-patient total it is reported against.
print('Test patients with entries in CopdDatasetCoMorbidityDetails: {} out of {}'.format(
    comorbidities.PatientId.nunique(), len(data.PatientId.unique())))
# Recode True/False as 1/0 and treat a missing flag as "not present".
comorbidities[comorbidity_list] = comorbidities[comorbidity_list].replace(
    bool_mapping).fillna(0)
print('Comorbidity counts:', '\n', comorbidities[comorbidity_list].sum())

# Attach the flags to every patient-day. Patients without a comorbidity
# record get NaN from the left join and are infilled with 0.
test_data = test_data.merge(comorbidities, on='PatientId', how='left')
print('Comorbidity counts after merging with patient days:', '\n',
      test_data[comorbidity_list].sum())
test_data[comorbidity_list] = test_data[comorbidity_list].fillna(0)

# Total number of comorbidities per row, plus a per-StudyId summary.
test_data['Comorbidities'] = test_data[comorbidity_list].sum(axis=1)
comorb_counts = test_data.groupby('StudyId')['Comorbidities'].max().reset_index()
# NOTE(review): value_counts() on the whole frame counts distinct
# (StudyId, Comorbidities) rows; if a distribution of counts was intended,
# use comorb_counts['Comorbidities'].value_counts() instead - confirm.
print('Patient comorbidity counts after infilling missing values: \n',
      comorb_counts.value_counts())

# Keep AsthmaOverlap as its own feature and drop the other individual flags
# now that they are summarised in 'Comorbidities'.
comorbidity_list.remove('AsthmaOverlap')
test_data = test_data.drop(columns=comorbidity_list)
| |
|
| | |
| | |
| | |
| | |
# Load each patient's usual inhaler therapies (pipe-delimited).
inhaler_type = pd.read_csv('<YOUR_DATA_PATH>/copd-dataset/CopdDatasetUsualTherapies.txt',
                           sep='|', usecols=['StudyId', 'InhalerType'])

# Only test-set patients are passed to the triple-therapy helper.
# NOTE(review): the meaning of include_mitt is defined in the copd helper - confirm there.
in_test_set = inhaler_type['StudyId'].isin(data['StudyId'])
inhaler_type = copd.triple_inhaler_therapy_service(
    df=inhaler_type[in_test_set], id_col='StudyId', inhaler_col='InhalerType',
    include_mitt=True)

print('Patients taking triple inhaler therapy: ', '\n',
      inhaler_type.TripleTherapy.value_counts())
# Left join so patient-days without a therapy record are kept.
test_data = pd.merge(test_data, inhaler_type, on='StudyId', how='left')
| |
|
| | |
| | |
| | |
| |
|
| | |
| | |
# Turn the coded symptom-diary answers into text labels. A missing answer is
# given its own 'None' category rather than being left as NaN.
symptom_diary_labels = {
    'SymptomDiaryQ8': {1: 'Not difficult', 2: 'A little difficult', 3: 'Quite difficult',
                       4: 'Very difficult', np.nan: 'None'},
    'SymptomDiaryQ9': {1: 'Watery', 2: 'Sticky liquid', 3: 'Semi-solid', 4: 'Solid',
                       np.nan: 'None'},
    'SymptomDiaryQ10': {1: 'White', 2: 'Yellow', 3: 'Green', 4: 'Dark green',
                        np.nan: 'None'},
}
for question, answer_labels in symptom_diary_labels.items():
    test_data[question] = test_data[question].replace(answer_labels)

# Smoking status codes to labels. Missing values are left untouched here.
smoking_labels = {1: 'Smoker', 2: 'Ex-smoker', 3: 'Non-smoker'}
test_data['SmokingStatus'] = test_data['SmokingStatus'].replace(smoking_labels)

# Boolean copy of the target column.
test_data['InExacWindow'] = test_data['IsExac'].replace({0: False, 1: True})
| |
|
| | |
| | |
| | |
| |
|
# Maximum age (in days) of the most recent PRO submission for a patient-day
# to be kept.
DaysSinceCAT_cutoff = 14

# Whole days between the event date and the matched PRO submission. Rows for
# which the backward merge_asof found no earlier submission have a NaT
# SubmissionTime and therefore NaN here.
days_since_cat = (test_data.DateOfEvent - test_data.SubmissionTime).dt.days

# NaN compares False, so patient-days without any prior submission are
# excluded together with those whose submission is older than the cutoff.
# Filtering before the integer cast prevents astype('int') from raising on
# NaN. The .copy() gives later column assignments an independent frame
# instead of a filtered slice.
within_cutoff = days_since_cat <= DaysSinceCAT_cutoff
test_data = test_data[within_cutoff].copy()
test_data['DaysSinceCAT'] = days_since_cat[within_cutoff].astype('int')
| |
|
| | |
| | |
| | |
| |
|
| | |
# Bin edges and labels for the numeric features that are modelled as categories.
# NOTE(review): whether edges are left- or right-inclusive is decided inside
# copd.bin_numeric_column - confirm there.
exac_bins = [-1, 0, 21, 90, 180, np.inf]
exac_labels = ['None', '<21 days', '21 - 89 days', '90 - 179 days', '>= 180 days']

age_bins = [0, 50, 60, 70, 80, np.inf]
age_labels = ['<50', '50-59', '60-69', '70-79', '80+']

comorb_bins = [0, 1, 3, np.inf]
comorb_labels = ['None', '1-2', '3+']

# Replace each numeric column with its binned version, in place.
binning_specs = {
    'DaysSinceLastExac': (exac_bins, exac_labels),
    'Age': (age_bins, age_labels),
    'Comorbidities': (comorb_bins, comorb_labels),
}
for binned_col, (bin_edges, bin_labels) in binning_specs.items():
    test_data[binned_col] = copd.bin_numeric_column(
        col=test_data[binned_col], bins=bin_edges, labels=bin_labels)

# Apply the same comorbidity binning to the per-patient summary.
comorb_counts['Comorbidities_binned'] = copd.bin_numeric_column(
    col=comorb_counts['Comorbidities'], bins=comorb_bins, labels=comorb_labels)
| |
|
| | |
# Bin FEV1 % predicted into four severity categories.
spirometry_bins = [0, 30, 50, 80, np.inf]
spirometry_labels = ['Very severe', 'Severe', 'Moderate', 'Mild']

test_data['FEV1PercentPredicted'] = copd.bin_numeric_column(
    col=test_data['LungFunction_FEV1PercentPredicted'], bins=spirometry_bins,
    labels=spirometry_labels)

# The raw measurement is replaced by its binned version.
test_data = test_data.drop(columns=['LungFunction_FEV1PercentPredicted'])

# Patients with no spirometry value are assigned the 'Mild' category.
# NOTE(review): this relies on copd.bin_numeric_column returning the string
# 'nan' for missing inputs - confirm against the helper.
test_data.loc[
    test_data['FEV1PercentPredicted'] == 'nan', 'FEV1PercentPredicted'] = 'Mild'
# Previously a bare expression whose result was discarded when run as a
# script; print it so the category distribution is actually reported.
print('FEV1PercentPredicted category counts:', '\n',
      test_data['FEV1PercentPredicted'].value_counts())
| |
|
| | |
| | |
| | |
# Binary flag: 1 when the highest recorded eosinophil count is at least 0.3,
# otherwise 0. Missing counts compare False and so are flagged 0.
eosinophil_at_threshold = test_data['LabsHighestEosinophilCount'].ge(0.3)
test_data['HighestEosinophilCount_0_3'] = eosinophil_at_threshold.astype(int)
# The raw count is not kept as a feature.
test_data = test_data.drop(columns=['LabsHighestEosinophilCount'])
| |
|
| | |
# Columns that are target encoded using the stored encodings.
categorical_columns = ['SmokingStatus', 'SymptomDiaryQ8', 'SymptomDiaryQ9',
                       'SymptomDiaryQ10', 'DaysSinceLastExac', 'Age', 'Comorbidities',
                       'FEV1PercentPredicted']

# Cast to plain strings so category values can be matched against the keys of
# the JSON encodings. Any remaining NaN becomes the string 'nan'.
test_data[categorical_columns] = test_data[categorical_columns].astype("str")

# Load the stored target encodings from the artifact directory. The context
# manager closes the file handle deterministically.
with open(os.path.join(artifact_dir, "target_encodings.json")) as encodings_file:
    target_encodings = json.load(encodings_file)

# Apply the stored encodings to the test set.
data_encoded = encoding.apply_target_encodings(data=test_data, encodings=target_encodings,
                                               cols_to_encode=categorical_columns)
| |
|
| | |
| | |
| | |
# Identifier, date and duplicate-target columns that are not model inputs.
# StudyId and IsExac are kept because the scaling and imputation steps carry
# them through.
non_feature_columns = ['PatientId', 'InExacWindow', 'DateOfEvent', 'SubmissionTime',
                       'FirstSubmissionDate', 'LatestPredictionDate']
data_encoded = data_encoded.drop(columns=non_feature_columns)
| |
|
| |
|
# Load the stored scaler and apply it to the feature columns only; the
# identifier and target are excluded from scaling.
scaler = joblib.load(os.path.join(artifact_dir, 'scaler.pkl'))
features_to_scale = data_encoded.drop(columns=['StudyId', 'IsExac'])

# transform() output is wrapped back into a DataFrame with the original
# feature names, then StudyId is re-attached as the first column and IsExac
# as the last. .values is used so the columns are attached by position and
# not by index label.
test_data_scaled = pd.DataFrame(scaler.transform(features_to_scale),
                                columns=features_to_scale.columns)
test_data_scaled.insert(0, 'StudyId', data_encoded['StudyId'].values)
test_data_scaled['IsExac'] = data_encoded['IsExac'].values
print('Test data scaled')
| |
|
| | |
| | |
| | |
# Load the stored imputer and apply it to the scaled feature columns only.
# The bare `imputer` expression that used to follow the load did nothing when
# run as a script and has been removed.
imputer = joblib.load(os.path.join(artifact_dir, 'imputer.pkl'))
features_to_impute = test_data_scaled.drop(columns=['StudyId', 'IsExac'])

# Rebuild a DataFrame with the original feature names, then re-attach StudyId
# as the first column and IsExac as the last, by position.
test_data_imputed = pd.DataFrame(imputer.transform(features_to_impute),
                                 columns=features_to_impute.columns)
test_data_imputed.insert(0, 'StudyId', test_data_scaled.StudyId.values)
test_data_imputed['IsExac'] = test_data_scaled.IsExac.values
print('Test data imputed')
| |
|
| | |
| | |
| | |
| |
|
| | |
# Persist the fully prepared test set in the model output directory.
final_test_data_path = os.path.join(output_data_dir, 'test_data.pkl')
test_data_imputed.to_pickle(final_test_data_path)
print('Final test data saved')
| |
|