File size: 2,839 Bytes
53a6def
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Map GOLD standard COPD groupings from REC/SUP IDs to SafeHavenIDs.
--------
NB: Data contained within 'RC_SU1_spirometry_data.csv' has been created
from data within the teams space.
"""
import pandas as pd


# Locations of the project checkout, its input data folder and the output
# extract written by this script
file_path = '<YOUR_DATA_PATH>/copd.model-e/'
input_file_path = f'{file_path}training/src/data/'
output_file_path = (
    '<YOUR_DATA_PATH>/Model_E_Extracts/rec_sup_spirometry_data.pkl'
)


def read_data(file):
    """
    Load a CSV data source into a dataframe
    --------
    :param file: string filename of the CSV to load
    :return: dataframe holding the parsed file contents
    """
    return pd.read_csv(file)


def calc_gold_grade(data):
    """
    Derive the GOLD grade for COPD classification from an FEV1% value
    --------
    :param data: row (or other mapping) exposing a scalar 'FEV1%' entry
    :return: 'GOLD 1' (>= 80), 'GOLD 2' (>= 50 and < 80), 'GOLD 3'
        (>= 30 and < 50), 'GOLD 4' (< 30), or '' when the value satisfies
        none of the comparisons (e.g. NaN)
    """
    percent_fev1 = data['FEV1%']

    # Thresholds are checked from highest to lowest, so the first lower
    # bound that is met decides the grade
    lower_bounds = ((80, 'GOLD 1'), (50, 'GOLD 2'), (30, 'GOLD 3'))
    for lower_bound, grade in lower_bounds:
        if percent_fev1 >= lower_bound:
            return grade

    # Anything left is either below 30 or not comparable at all
    return 'GOLD 4' if percent_fev1 < 30 else ''


def add_SH_mappings_for_RC_and_SU1(RC_IDs, SU1_IDs, spirometry_data):
    """
    Attach SH ID mappings to the RC and SU1 spirometry data
    --------
    :param RC_IDs: dataframe of RECEIVER - SH ID mappings, keyed by 'RNo'
    :param SU1_IDs: dataframe of SU1 - SH ID mappings, keyed by
        'Study_Number'
    :param spirometry_data: spirometry data for RC and SU1, keyed by
        'StudyId'
    :return: spirometry rows that have no missing values after the join,
        with 'FEV1%' and 'SafeHavenID' cast to int32
    """
    # Stack both lookups under one shared 'StudyId' key column
    id_lookup = pd.concat(
        [
            RC_IDs.rename(columns={'RNo': 'StudyId'}),
            SU1_IDs.rename(columns={'Study_Number': 'StudyId'}),
        ],
        ignore_index=True,
    )

    # Left join keeps every spirometry row; unmatched rows get NaN IDs
    joined = spirometry_data.merge(id_lookup, on="StudyId", how="left")

    # NOTE(review): dropna() discards a row if ANY column is missing, not
    # only the SafeHavenID - confirm the lookups share the same columns
    complete_rows = joined.dropna()

    return complete_rows.astype({'FEV1%': 'int32', 'SafeHavenID': 'int32'})


def main():
    """
    Build and save the RC/SU1 spirometry extract keyed by SafeHavenID
    --------
    Reads the spirometry CSV, adds a 'GOLD grade' column, joins the RC and
    SU1 SafeHaven ID lookups and pickles the result to output_file_path.
    :return: None
    """
    # Load spirometry readings, discarding rows with any missing value
    spirometry = read_data(
        input_file_path + "RC_SU1_spirometry_data.csv").dropna()

    # Grade every study participant row by row
    spirometry['GOLD grade'] = spirometry.apply(calc_gold_grade, axis=1)

    # Load the RC and SU1 SafeHaven ID mapping files
    receiver_lookup = read_data(
        "<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/Cohort3Rand.csv")
    scaleup_lookup = read_data("<YOUR_DATA_PATH>/SU_IDs/Scale_Up_lookup.csv")

    # Attach the SH mappings and write the result out
    with_safehaven_ids = add_SH_mappings_for_RC_and_SU1(
        receiver_lookup, scaleup_lookup, spirometry)
    with_safehaven_ids.to_pickle(output_file_path)


# Script entry point: runs the full pipeline as soon as this module executes.
# NOTE(review): the call is not wrapped in an `if __name__ == "__main__":`
# guard, so importing this module also triggers the run - confirm intended.
main()