""" Process GPLES data -------- Extract the number of COPD GP events per patient per year """ import pandas as pd from utils.common import read_data, first_patient_appearance def initialize_gples_data(file): """ Load in and convert GPLES dataset to correct format -------- :param file: filename to read from :return: gples dataframe with correct column names and types """ print('Loading GPLES data') # Read in data gp_cols = ['SafeHavenID', 'EventDate', 'ShortName'] gp_types = ['int', 'object', 'str'] df = read_data(file, gp_cols, gp_types) # Drop nulls and duplicates df = df.dropna().drop_duplicates() # Convert date columns to correct type df.columns = ['SafeHavenID', 'ADMDATE', 'ShortName'] df['ADMDATE'] = pd.to_datetime(df['ADMDATE']) # Only track COPD events df = df[df.ShortName == 'COPD'][['SafeHavenID', 'ADMDATE']] df['gp_copd_event'] = 1 return df def extract_yearly_data(df): """ Extract data per year from GPLES dataset -------- :param df: gples dataframe to be processed :return: reduced gples dataset """ print('Reducing GPLES data') # Extract year column for historical features df['year'] = df.ADMDATE.dt.year # Extract yearly data group_cols = ['SafeHavenID', 'year'] gples_events = df.groupby(group_cols)[['gp_copd_event']].sum() return gples_events def main(): # Load data gp_file = "/EXAMPLE_STUDY_DATA/GPLES_Cohort3R.csv" gples = initialize_gples_data(gp_file) # Save first date in dataset first_patient_appearance(gples, 'ADMDATE', 'gples') # Reduce GPLES to 1 row per year per ID gples_yearly = extract_yearly_data(gples) # Save data gples_yearly.to_pickle('/Model_E_Extracts/gples_proc.pkl') main()