| | """ |
| | Script for preprocessing labs data |
| | -------- |
| | Track median values for labs tests over the previous 2 years for patients |
| | with resulting dataset containing 1 row of information per patient per year |
| | """ |
| | import json |
| | import pandas as pd |
| | import numpy as np |
| | from datetime import date |
| | from dateutil.relativedelta import relativedelta |
| | from utils.common import (read_data, correct_column_names, |
| | first_patient_appearance) |
| | from utils.labs_processing import add_total_labs |
| |
|
| |
|
| | def initialize_labs_data(labs_file): |
| | """ |
| | Load in labs dataset to correct format |
| | -------- |
| | :param labs_file: labs data file name |
| | :return: labs dataframe with correct column names and types |
| | """ |
| | print('Loading labs data') |
| |
|
| | |
| | old_cols = ['SafeHavenID', 'SAMPLEDATE', 'CLINICALCODEDESCRIPTION', |
| | 'QUANTITYVALUE', 'RANGEHIGHVALUE', 'RANGELOWVALUE'] |
| | labs_types = ['int', 'object', 'str', 'float', 'float', 'float'] |
| | df = read_data(labs_file, old_cols, labs_types) |
| |
|
| | |
| | new_cols = ['SafeHavenID', 'SampleDate', 'ClinicalCodeDescription', |
| | 'QuantityValue', 'RangeHighValue', 'RangeLowValue'] |
| | mapping = dict(zip(old_cols, new_cols)) |
| | df = df.rename(columns=mapping) |
| |
|
| | |
| | df = df.dropna().drop_duplicates() |
| |
|
| | |
| | num_cols = ['QuantityValue', 'RangeHighValue', 'RangeLowValue'] |
| | df = df[(df[num_cols] > -1).all(axis=1)] |
| |
|
| | |
| | final_cols = ['SafeHavenID', 'SampleDate', 'ClinicalCodeDescription', |
| | 'QuantityValue'] |
| | df = df[final_cols] |
| |
|
| | |
| | df['SampleDate'] = pd.to_datetime(df.SampleDate) |
| |
|
| | return df |
| |
|
| |
|
| | def clean_labs(df): |
| | """ |
| | Clean descriptions and select relevant tests |
| | -------- |
| | :param df: pandas dataframe |
| | :return: cleaned dataframe |
| | """ |
| | print('Cleaning labs data') |
| |
|
| | lab_tests = ['ALT', 'AST', 'Albumin', 'Alkaline Phosphatase', 'Basophils', |
| | 'C Reactive Protein', 'Chloride', 'Creatinine', 'Eosinophils', |
| | 'Estimated GFR', 'Haematocrit', 'Haemoglobin', 'Lymphocytes', |
| | 'MCH', 'Mean Cell Volume', 'Monocytes', 'Neutrophils', |
| | 'PCO2 (temp corrected', 'Platelets', 'Potassium', |
| | 'Red Blood Count', 'Serum vitamin B12', 'Sodium', |
| | 'Total Bilirubin', 'Urea', 'White Blood Count'] |
| |
|
| | |
| | str_col = 'ClinicalCodeDescription' |
| | df[str_col] = df[str_col].str.strip() |
| | |
| | |
| | with open('mappings/test_mapping.json') as json_file: |
| | test_mapping = json.load(json_file) |
| |
|
| | |
| | for k, v in test_mapping.items(): |
| | df[str_col] = df[str_col].replace(v, k) |
| |
|
| | |
| | df = df[[desc in lab_tests for desc in df[str_col]]] |
| |
|
| | return df |
| |
|
| |
|
| | def add_neut_lypmh(df): |
| | """ |
| | Pivot dataframe and calculate neut_lypmh feature |
| | -------- |
| | :param df: pandas dataframe |
| | :return: pivoted dataframe |
| | """ |
| | print('Calculating neut_lypmh data') |
| |
|
| | |
| | df = pd.pivot_table( |
| | df, index=['SafeHavenID', 'SampleDate'], |
| | columns=['ClinicalCodeDescription'], values='QuantityValue', |
| | dropna=True).reset_index() |
| |
|
| | |
| | df['neut_lymph'] = df.Neutrophils / df.Lymphocytes |
| |
|
| | |
| | df['neut_lymph'] = df.neut_lymph.replace([np.inf, -np.inf], np.nan) |
| |
|
| | return df |
| |
|
| |
|
| | def add_eoy_column(df, dt_col, eoy_date): |
| | """ |
| | Add EOY relative to user-specified end date |
| | -------- |
| | :param df: dataframe |
| | :param dt_col: date column in dataframe |
| | :param eoy_date: EOY date from config |
| | :return: updated df with EOY column added |
| | """ |
| | |
| | df = df.reset_index(drop=True) |
| |
|
| | |
| | end_date = pd.to_datetime(eoy_date) |
| | end_month = end_date.month |
| | end_day = end_date.day |
| |
|
| | |
| | df['eoy'] = [date(y, end_month, end_day) for y in df[dt_col].dt.year] |
| |
|
| | |
| | eoy_index = df.columns[df.columns == 'eoy'] |
| | adm_vs_eoy = df[dt_col] > df.eoy |
| | row_index = df.index[adm_vs_eoy] |
| | df.loc[row_index, eoy_index] = df[adm_vs_eoy].eoy + relativedelta(years=1) |
| | df['eoy'] = pd.to_datetime(df.eoy) |
| |
|
| | return df |
| |
|
| |
|
| | def reduce_labs_data(df, dt_col): |
| | """ |
| | Reduce dataset to 1 row per ID per year looking back at the median values |
| | over the previous 2 years |
| | -------- |
| | :param df: pandas dataframe |
| | :param dt_col: date column |
| | :return: reduced labs dataframe |
| | """ |
| | print('Reducing labs to 1 row per patient per year') |
| |
|
| | group_cols = ['SafeHavenID', 'eoy'] |
| | med_cols = ['ALT', 'AST', 'Albumin', 'Alkaline Phosphatase', 'Basophils', |
| | 'C Reactive Protein', 'Chloride', 'Creatinine', 'Eosinophils', |
| | 'Estimated GFR', 'Haematocrit', 'Haemoglobin', 'Lymphocytes', |
| | 'MCH', 'Mean Cell Volume', 'Monocytes', 'Neutrophils', |
| | 'Platelets', 'Potassium', 'Red Blood Count', 'Sodium', |
| | 'Total Bilirubin', 'Urea', 'White Blood Count', 'neut_lymph'] |
| |
|
| | |
| | df['labs'] = 1 |
| |
|
| | |
| | df = df.sort_values(dt_col) |
| |
|
| | |
| | shifted = df[['eoy']] + pd.DateOffset(years=1) |
| | new_tab = df[['SafeHavenID', dt_col] + med_cols].join(shifted) |
| | combined_cols = ['SafeHavenID', 'eoy', dt_col] + med_cols |
| | combined = pd.concat([df[combined_cols], new_tab]) |
| | combined = combined.sort_values(dt_col) |
| |
|
| | |
| | df_med = combined.groupby(group_cols).median() |
| |
|
| | |
| | new_med_cols = [col + '_med_2yr' for col in df_med.columns] |
| | df_med.columns = new_med_cols |
| |
|
| | |
| | test = [] |
| | for k, v in df.groupby('SafeHavenID')['eoy'].unique().to_dict().items(): |
| | test.append(df_med.loc[(k, v), ]) |
| | df_med = pd.concat(test) |
| |
|
| | |
| | df_last = df[group_cols + ['labs_to_date']] |
| | df_last = df_last.groupby(group_cols).last() |
| |
|
| | |
| | df_sum = df[group_cols + ['labs']] |
| | df_sum = df.groupby(group_cols)['labs'].sum() |
| |
|
| | |
| | df_sum = df_sum.to_frame() |
| | df_sum.columns = ['labs_per_year'] |
| |
|
| | |
| | df_annual = df_med.join(df_last).join(df_sum) |
| | |
| | return df_annual |
| |
|
| |
|
| | def main(): |
| |
|
| | |
| | with open('../../../config.json') as json_config_file: |
| | config = json.load(json_config_file) |
| |
|
| | |
| | labs_file = config['extract_data_path'] + 'SCI_Store_Cohort3R.csv' |
| | labs = initialize_labs_data(labs_file) |
| |
|
| | |
| | labs = clean_labs(labs) |
| |
|
| | |
| | data_path = config['model_data_path'] |
| | first_patient_appearance(labs, 'SampleDate', 'labs', data_path) |
| |
|
| | |
| | labs = add_neut_lypmh(labs) |
| |
|
| | |
| | labs = add_eoy_column(labs, 'SampleDate', config['date']) |
| | labs = labs.sort_values('SampleDate') |
| |
|
| | |
| | labs['labs_to_date'] = 1 |
| | labs = labs.groupby('SafeHavenID').apply(add_total_labs) |
| | labs = labs.reset_index(drop=True) |
| |
|
| | |
| | labs_yearly = reduce_labs_data(labs, 'SampleDate') |
| |
|
| | |
| | labs_yearly.columns = correct_column_names(labs_yearly.columns, 'labs') |
| |
|
| | |
| | labs_yearly.to_pickle(data_path + 'labs_proc.pkl') |
| |
|
| |
|
| | main() |
| |
|