""" Script for preprocessing labs data -------- Track median values for labs tests over the previous 2 years for patients with resulting dataset containing 1 row of information per patient per year """ import json import pandas as pd import numpy as np from datetime import date from dateutil.relativedelta import relativedelta from utils.common import (read_data, correct_column_names, first_patient_appearance) from utils.labs_processing import add_total_labs def initialize_labs_data(labs_file): """ Load in labs dataset to correct format -------- :param labs_file: labs data file name :return: labs dataframe with correct column names and types """ print('Loading labs data') # Read in data old_cols = ['SafeHavenID', 'SAMPLEDATE', 'CLINICALCODEDESCRIPTION', 'QUANTITYVALUE', 'RANGEHIGHVALUE', 'RANGELOWVALUE'] labs_types = ['int', 'object', 'str', 'float', 'float', 'float'] df = read_data(labs_file, old_cols, labs_types) # Rename columns to CamelCase new_cols = ['SafeHavenID', 'SampleDate', 'ClinicalCodeDescription', 'QuantityValue', 'RangeHighValue', 'RangeLowValue'] mapping = dict(zip(old_cols, new_cols)) df = df.rename(columns=mapping) # Drop any nulls, duplicates or negative (broken) test values df = df.dropna().drop_duplicates() # Check tests are valid (values > -1) num_cols = ['QuantityValue', 'RangeHighValue', 'RangeLowValue'] df = df[(df[num_cols] > -1).all(axis=1)] # Select final columns final_cols = ['SafeHavenID', 'SampleDate', 'ClinicalCodeDescription', 'QuantityValue'] df = df[final_cols] # Convert date df['SampleDate'] = pd.to_datetime(df.SampleDate) return df def clean_labs(df): """ Clean descriptions and select relevant tests -------- :param df: pandas dataframe :return: cleaned dataframe """ print('Cleaning labs data') lab_tests = ['ALT', 'AST', 'Albumin', 'Alkaline Phosphatase', 'Basophils', 'C Reactive Protein', 'Chloride', 'Creatinine', 'Eosinophils', 'Estimated GFR', 'Haematocrit', 'Haemoglobin', 'Lymphocytes', 'MCH', 'Mean Cell Volume', 'Monocytes', 'Neutrophils', 'PCO2 (temp corrected', 'Platelets', 'Potassium', 'Red Blood Count', 'Serum vitamin B12', 'Sodium', 'Total Bilirubin', 'Urea', 'White Blood Count'] # Strip any whitespaces str_col = 'ClinicalCodeDescription' df[str_col] = df[str_col].str.strip() # Read in test mapping with open('mappings/test_mapping.json') as json_file: test_mapping = json.load(json_file) # Correct names for relevant tests for k, v in test_mapping.items(): df[str_col] = df[str_col].replace(v, k) # Select relevant tests df = df[[desc in lab_tests for desc in df[str_col]]] return df def add_neut_lypmh(df): """ Pivot dataframe and calculate neut_lypmh feature -------- :param df: pandas dataframe :return: pivoted dataframe """ print('Calculating neut_lypmh data') # Pivot table with CCDesc as headers and QuantityValue as values df = pd.pivot_table( df, index=['SafeHavenID', 'SampleDate'], columns=['ClinicalCodeDescription'], values='QuantityValue', dropna=True).reset_index() # Add neut_lymph feature df['neut_lymph'] = df.Neutrophils / df.Lymphocytes # Replace any infinite values df['neut_lymph'] = df.neut_lymph.replace([np.inf, -np.inf], np.nan) return df def add_eoy_column(df, dt_col, eoy_date): """ Add EOY relative to user-specified end date -------- :param df: dataframe :param dt_col: date column in dataframe :param eoy_date: EOY date from config :return: updated df with EOY column added """ # Needed to stop error with creating a new column df = df.reset_index(drop=True) # Add column with user-specified end of year date end_date = pd.to_datetime(eoy_date) end_month = end_date.month end_day = end_date.day # Add for every year df['eoy'] = [date(y, end_month, end_day) for y in df[dt_col].dt.year] # Check that EOY date is after dt_col for each entry eoy_index = df.columns[df.columns == 'eoy'] adm_vs_eoy = df[dt_col] > df.eoy row_index = df.index[adm_vs_eoy] df.loc[row_index, eoy_index] = df[adm_vs_eoy].eoy + relativedelta(years=1) df['eoy'] = pd.to_datetime(df.eoy) return df def reduce_labs_data(df, dt_col): """ Reduce dataset to 1 row per ID per year looking back at the median values over the previous 2 years -------- :param df: pandas dataframe :param dt_col: date column :return: reduced labs dataframe """ print('Reducing labs to 1 row per patient per year') group_cols = ['SafeHavenID', 'eoy'] med_cols = ['ALT', 'AST', 'Albumin', 'Alkaline Phosphatase', 'Basophils', 'C Reactive Protein', 'Chloride', 'Creatinine', 'Eosinophils', 'Estimated GFR', 'Haematocrit', 'Haemoglobin', 'Lymphocytes', 'MCH', 'Mean Cell Volume', 'Monocytes', 'Neutrophils', 'Platelets', 'Potassium', 'Red Blood Count', 'Sodium', 'Total Bilirubin', 'Urea', 'White Blood Count', 'neut_lymph'] # Add column to track labs per year df['labs'] = 1 # Sort by date and extract year df = df.sort_values(dt_col) # Include data from previous year shifted = df[['eoy']] + pd.DateOffset(years=1) new_tab = df[['SafeHavenID', dt_col] + med_cols].join(shifted) combined_cols = ['SafeHavenID', 'eoy', dt_col] + med_cols combined = pd.concat([df[combined_cols], new_tab]) combined = combined.sort_values(dt_col) # Extract median data for last 2 years df_med = combined.groupby(group_cols).median() # Rename median columns new_med_cols = [col + '_med_2yr' for col in df_med.columns] df_med.columns = new_med_cols # Only carry forward year data that appeared in df test = [] for k, v in df.groupby('SafeHavenID')['eoy'].unique().to_dict().items(): test.append(df_med.loc[(k, v), ]) df_med = pd.concat(test) # Extract features to find last value of df_last = df[group_cols + ['labs_to_date']] df_last = df_last.groupby(group_cols).last() # Extract features to calculate sum of df_sum = df[group_cols + ['labs']] df_sum = df.groupby(group_cols)['labs'].sum() # Rename sum columns df_sum = df_sum.to_frame() df_sum.columns = ['labs_per_year'] # Merge datasets df_annual = df_med.join(df_last).join(df_sum) return df_annual def main(): # Load in config items with open('../../../config.json') as json_config_file: config = json.load(json_config_file) # Load in data labs_file = config['extract_data_path'] + 'SCI_Store_Cohort3R.csv' labs = initialize_labs_data(labs_file) # Clean data labs = clean_labs(labs) # Save first date in dataset data_path = config['model_data_path'] first_patient_appearance(labs, 'SampleDate', 'labs', data_path) # Pivot and add neut_lypmh labs = add_neut_lypmh(labs) # Add EOY column relative to user specified date labs = add_eoy_column(labs, 'SampleDate', config['date']) labs = labs.sort_values('SampleDate') # Track each lab event labs['labs_to_date'] = 1 labs = labs.groupby('SafeHavenID').apply(add_total_labs) labs = labs.reset_index(drop=True) # Reduce labs to 1 row per ID per year labs_yearly = reduce_labs_data(labs, 'SampleDate') # Correct column names labs_yearly.columns = correct_column_names(labs_yearly.columns, 'labs') # Save data labs_yearly.to_pickle(data_path + 'labs_proc.pkl') main()