"""
Process GPLES data
--------
Extract the number of COPD GP events per patient per year
"""
import pandas as pd
from utils.common import read_data, first_patient_appearance
def initialize_gples_data(file):
    """
    Load in and convert GPLES dataset to correct format
    --------
    :param file: filename to read from
    :return: gples dataframe with columns ['SafeHavenID', 'ADMDATE',
             'gp_copd_event'], restricted to COPD events, with ADMDATE
             as datetime64
    """
    print('Loading GPLES data')
    # Read in data
    gp_cols = ['SafeHavenID', 'EventDate', 'ShortName']
    gp_types = ['int', 'object', 'str']
    df = read_data(file, gp_cols, gp_types)
    # Drop nulls and duplicates
    df = df.dropna().drop_duplicates()
    # Rename by label rather than positional `df.columns = [...]`, which
    # would silently mislabel columns if the read order ever changed
    df = df.rename(columns={'EventDate': 'ADMDATE'})
    # Convert date column to correct type
    df['ADMDATE'] = pd.to_datetime(df['ADMDATE'])
    # Only track COPD events; .loc + .copy() avoids SettingWithCopyWarning
    # on the column assignment below
    df = df.loc[df.ShortName == 'COPD', ['SafeHavenID', 'ADMDATE']].copy()
    # Flag each remaining row as one GP COPD event for later aggregation
    df['gp_copd_event'] = 1
    return df
def extract_yearly_data(df):
    """
    Extract data per year from GPLES dataset
    --------
    :param df: gples dataframe with 'SafeHavenID', 'ADMDATE' (datetime64)
               and 'gp_copd_event' columns
    :return: dataframe indexed by (SafeHavenID, year) holding the yearly
             sum of 'gp_copd_event' per patient
    """
    print('Reducing GPLES data')
    # Work on a copy so the temporary 'year' column does not mutate the
    # caller's dataframe as a side effect
    df = df.copy()
    # Extract year column for historical features
    df['year'] = df.ADMDATE.dt.year
    # Extract yearly data
    group_cols = ['SafeHavenID', 'year']
    gples_events = df.groupby(group_cols)[['gp_copd_event']].sum()
    return gples_events
def main():
    """Run the GPLES pipeline: load, record first appearance, reduce, save."""
    # Load data
    gp_file = "<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/GPLES_Cohort3R.csv"
    gples = initialize_gples_data(gp_file)
    # Save first date in dataset
    first_patient_appearance(gples, 'ADMDATE', 'gples')
    # Reduce GPLES to 1 row per year per ID
    gples_yearly = extract_yearly_data(gples)
    # Save data
    gples_yearly.to_pickle('<YOUR_DATA_PATH>/Model_E_Extracts/gples_proc.pkl')


if __name__ == '__main__':
    # Guard the entry point so importing this module (e.g. to reuse the
    # extraction functions) does not trigger the full pipeline
    main()