"""
Map GOLD standard COPD groupings from REC/SUP IDs to SafeHavenIDs.
--------
NB: Data contained within 'RC_SU1_spirometry_data.csv' has been created
using data within the teams space.
"""
import pandas as pd

# Set file paths
file_path = '/copd.model-e/'
input_file_path = file_path + 'training/src/data/'
output_file_path = '/Model_E_Extracts/rec_sup_spirometry_data.pkl'


def read_data(file):
    """
    Read in data source
    --------
    :param file: string filename (path to a CSV file)
    :return: dataframe holding the parsed CSV contents
    """
    return pd.read_csv(file)


def calc_gold_grade(data):
    """
    Calculate GOLD grade for COPD classification using FEV1%
    --------
    :param data: a single row with a 'FEV1%' entry (this is what
        DataFrame.apply(..., axis=1) passes in); any mapping with that key
        works
    :return: 'GOLD 1' (FEV1% >= 80), 'GOLD 2' (50 <= FEV1% < 80),
        'GOLD 3' (30 <= FEV1% < 50), 'GOLD 4' (FEV1% < 30), or '' when
        none of the comparisons hold (e.g. FEV1% is NaN)
    """
    fev1 = data['FEV1%']

    # The thresholds are tested from highest to lowest, so each branch only
    # needs its lower bound: the upper bound is implied by the earlier
    # branches having failed.
    if fev1 >= 80:
        return 'GOLD 1'
    if fev1 >= 50:
        return 'GOLD 2'
    if fev1 >= 30:
        return 'GOLD 3'
    if fev1 < 30:
        return 'GOLD 4'

    # Only reachable when every comparison is False, i.e. the value is NaN.
    return ''


def add_SH_mappings_for_RC_and_SU1(RC_IDs, SU1_IDs, spirometry_data):
    """
    Join the SH ID mappings to the spirometry data for RC and SU1
    --------
    :param RC_IDs: dataframe containing RECEIVER - SH ID mappings, keyed by
        an 'RNo' column
    :param SU1_IDs: dataframe containing SU1 - SH ID mappings, keyed by a
        'Study_Number' column
    :param spirometry_data: spirometry data for RC and SU1, with 'StudyId'
        and 'FEV1%' columns
    :return: RC and SU1 spirometry data with SH ID mapping columns; rows
        containing any missing value are removed and the 'FEV1%' and
        'SafeHavenID' columns are cast to int32
    """
    # Give both lookups the same key column name so they can be stacked into
    # a single StudyId -> SafeHavenID table.
    receiver_IDs = RC_IDs.rename(columns={'RNo': 'StudyId'})
    scaleup_IDs = SU1_IDs.rename(columns={'Study_Number': 'StudyId'})
    all_service_IDs = pd.concat(
        [receiver_IDs, scaleup_IDs], ignore_index=True)

    # The left join keeps every spirometry row; rows without a mapping end up
    # with NaN in the mapping columns and are removed by dropna().
    # NOTE(review): dropna() also removes rows with NaN in any other column.
    # If the two lookup files do not share the same columns, concat fills the
    # gaps with NaN and those rows would be dropped too - confirm the lookup
    # schemas match.
    spirometry_mappings = pd.merge(
        spirometry_data, all_service_IDs, on="StudyId", how="left").dropna()

    # With NaNs gone the columns can be stored as integers (a float FEV1%
    # is truncated by the cast).
    type_map = {'FEV1%': 'int32', 'SafeHavenID': 'int32'}
    spirometry_mappings = spirometry_mappings.astype(type_map)

    return spirometry_mappings


def main():
    """
    Build the GOLD-graded, SafeHaven-mapped spirometry extract
    --------
    Reads the RC/SU1 spirometry CSV, adds a 'GOLD grade' column, joins the
    SafeHaven ID lookups and pickles the result to output_file_path.
    """
    # Read spirometry data
    rec_sup_spiro_file = input_file_path + "RC_SU1_spirometry_data.csv"
    rec_sup_spiro_data = read_data(rec_sup_spiro_file).dropna()

    # Create new columns showing the GOLD group of each study participant
    rec_sup_spiro_data['GOLD grade'] = rec_sup_spiro_data.apply(
        calc_gold_grade, axis=1)

    # Read RC and SU1 SafeHaven ID mapping files
    rec_id_file = "/EXAMPLE_STUDY_DATA/Cohort3Rand.csv"
    sup_id_file = "/SU_IDs/Scale_Up_lookup.csv"
    rec_id_map_data = read_data(rec_id_file)
    sup_id_map_data = read_data(sup_id_file)

    # Join spirometry data to SH mappings
    mapped_data = add_SH_mappings_for_RC_and_SU1(
        rec_id_map_data, sup_id_map_data, rec_sup_spiro_data)

    # Save data
    mapped_data.to_pickle(output_file_path)


# Only run the pipeline when executed as a script, so that importing this
# module (e.g. to reuse calc_gold_grade) does not trigger file I/O.
if __name__ == "__main__":
    main()