| | """ |
| | Process GPLES data |
| | -------- |
| | Extract the number of COPD GP events per patient per year |
| | """ |
| | import pandas as pd |
| | from utils.common import read_data, first_patient_appearance |
| |
|
| |
|
def initialize_gples_data(file):
    """
    Read the GPLES extract and reduce it to COPD GP events
    --------
    :param file: filename to read from
    :return: dataframe restricted to rows whose ShortName is 'COPD', with
        columns SafeHavenID, ADMDATE (parsed to datetime) and
        gp_copd_event (set to 1 on every row)
    """
    print('Loading GPLES data')

    # Column names and matching types handed to the project reader
    wanted_columns = ['SafeHavenID', 'EventDate', 'ShortName']
    wanted_types = ['int', 'object', 'str']

    # Discard rows with any missing value, then exact duplicate rows
    frame = (
        read_data(file, wanted_columns, wanted_types)
        .dropna()
        .drop_duplicates()
    )

    # Positional relabel: the second column is renamed to ADMDATE
    # (assumes read_data returns the columns in the order requested)
    frame.columns = ['SafeHavenID', 'ADMDATE', 'ShortName']
    frame['ADMDATE'] = pd.to_datetime(frame['ADMDATE'])

    # Keep COPD rows only and mark each remaining row as one event
    is_copd = frame.ShortName == 'COPD'
    return frame.loc[is_copd, ['SafeHavenID', 'ADMDATE']].assign(
        gp_copd_event=1
    )
| |
|
| |
|
def extract_yearly_data(df):
    """
    Extract data per year from GPLES dataset
    --------
    :param df: gples dataframe to be processed; must contain the columns
        SafeHavenID, ADMDATE (datetime) and gp_copd_event. It is not
        modified by this function.
    :return: dataframe indexed by (SafeHavenID, year) with a single
        gp_copd_event column holding the per-patient, per-year sum
    """
    print('Reducing GPLES data')

    # Derive the calendar year on a copy so the caller's dataframe does not
    # silently gain a 'year' column as a side effect
    with_year = df.assign(year=df.ADMDATE.dt.year)

    # Sum the event flags for every patient/year combination
    group_cols = ['SafeHavenID', 'year']
    gples_events = with_year.groupby(group_cols)[['gp_copd_event']].sum()

    return gples_events
| |
|
| |
|
def main():
    """
    Run the GPLES processing steps in order
    --------
    Loads the GPLES cohort file, passes the COPD events to
    first_patient_appearance, aggregates the events per patient per year
    and pickles the aggregated frame.
    """
    # Input extract and output location
    source_csv = "<YOUR_DATA_PATH>/EXAMPLE_STUDY_DATA/GPLES_Cohort3R.csv"
    output_pickle = '<YOUR_DATA_PATH>/Model_E_Extracts/gples_proc.pkl'

    # Load and filter the raw data down to COPD events
    copd_events = initialize_gples_data(source_csv)

    # Return value is not used here; presumably the helper stores the first
    # ADMDATE per patient itself - confirm in utils.common
    first_patient_appearance(copd_events, 'ADMDATE', 'gples')

    # Aggregate per patient per year and save the result
    extract_yearly_data(copd_events).to_pickle(output_pickle)
| |
|
| |
|
# Run the pipeline. The call is unconditional, so it also executes whenever
# this module is imported.
# NOTE(review): consider an `if __name__ == "__main__":` guard if nothing
# relies on import-time execution - confirm against how the script is invoked.
main()
| |
|