# copd-model-e/validation/spirometry_scripts/spirometry_RC_SU_mapping.py
# IamGrooooot
# Model E: Unsupervised PCA + clustering risk stratification
# 53a6def
"""
Map GOLD standard COPD groupings from REC/SUP IDs to SafeHavenIDs.
--------
NB: The data contained within 'RC_SU1_spirometry_data.csv' has been created
from data within the teams space.
"""
import pandas as pd
# Locations of the project root, the input data folder and the output pickle
file_path = '<YOUR_DATA_PATH>/copd.model-e/'
input_file_path = f'{file_path}training/src/data/'
output_file_path = (
    '<YOUR_DATA_PATH>/Model_E_Extracts/rec_sup_spirometry_data.pkl'
)
def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename, passed straight to pandas.read_csv
    :return: dataframe holding the parsed contents
    """
    return pd.read_csv(file)
def calc_gold_grade(data):
    """
    Assign a GOLD grade label for COPD classification from an FEV1% value
    --------
    :param data: record that supports lookup by the 'FEV1%' key
        (e.g. a dataframe row)
    :return: 'GOLD 1' (FEV1% >= 80), 'GOLD 2' (50 to < 80),
        'GOLD 3' (30 to < 50), 'GOLD 4' (< 30), or '' when none of the
        comparisons holds (e.g. the value is NaN)
    """
    fev1_pct = data['FEV1%']
    # Bands are checked from the highest threshold downwards, so each lower
    # bound on its own is enough to identify the band.
    for lower_bound, grade in ((80, 'GOLD 1'), (50, 'GOLD 2'), (30, 'GOLD 3')):
        if fev1_pct >= lower_bound:
            return grade
    if fev1_pct < 30:
        return 'GOLD 4'
    # Only reached when every comparison above was false (e.g. NaN)
    return ''
def add_SH_mappings_for_RC_and_SU1(RC_IDs, SU1_IDs, spirometry_data):
    """
    Attach SH ID mappings to the RC and SU1 spirometry data
    --------
    :param RC_IDs: dataframe of RECEIVER - SH ID mappings, keyed by 'RNo'
    :param SU1_IDs: dataframe of SU1 - SH ID mappings, keyed by 'Study_Number'
    :param spirometry_data: spirometry data for RC and SU1 with a 'StudyId'
        column
    :return: spirometry rows joined to their SH ID mapping columns, with any
        row containing a missing value removed and 'FEV1%' / 'SafeHavenID'
        cast to int32
    """
    # Give both lookup tables the same 'StudyId' key, then stack them
    id_lookup = pd.concat(
        [
            RC_IDs.rename(columns={'RNo': 'StudyId'}),
            SU1_IDs.rename(columns={'Study_Number': 'StudyId'}),
        ],
        ignore_index=True,
    )

    # Left join keeps every spirometry row; rows left with a missing value in
    # any column (including those with no ID match) are then dropped
    joined = spirometry_data.merge(id_lookup, on="StudyId", how="left")
    complete_rows = joined.dropna()

    return complete_rows.astype({'FEV1%': 'int32', 'SafeHavenID': 'int32'})
def main():
    """
    Build the RC/SU1 spirometry extract: grade each record, attach the
    SafeHaven ID mappings and pickle the result to output_file_path
    """
    # Load the spirometry records and discard rows with any missing value
    spiro_df = read_data(input_file_path + "RC_SU1_spirometry_data.csv")
    spiro_df = spiro_df.dropna()

    # Label every study participant with their GOLD group
    gold_grades = spiro_df.apply(calc_gold_grade, axis=1)
    spiro_df['GOLD grade'] = gold_grades

    # Load the RC and SU1 SafeHaven ID lookup tables
    receiver_lookup = read_data(
        "<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/Cohort3Rand.csv")
    scaleup_lookup = read_data(
        "<YOUR_DATA_PATH>/SU_IDs/Scale_Up_lookup.csv")

    # Attach the SH mappings to the spirometry data and persist the result
    spiro_with_ids = add_SH_mappings_for_RC_and_SU1(
        receiver_lookup, scaleup_lookup, spiro_df)
    spiro_with_ids.to_pickle(output_file_path)
# Run the pipeline only when this file is executed as a script, so that
# importing the module (e.g. to reuse calc_gold_grade) does not trigger the
# file reads and the pickle write.
if __name__ == "__main__":
    main()