import copd
import os

import pandas as pd
from scipy.stats import ks_2samp, cramervonmises_2samp
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling.
sns.set(style='darkgrid', context='talk')
sns.set_palette('dark')
muted = sns.palettes.color_palette(palette='muted')
dark = sns.palettes.color_palette(palette='dark')

data_dir = '/lenus-samples-dataset'

# Load platform data.
# DataServerDatasetSample.txt contains columns: 'Id', 'CategoryValue', 'ClientAssignedId',
# 'ClientId', 'CreationDate', 'CreatorSubject', 'DiscriminatedTypeIdentifier', 'EndDate',
# 'QuantityId', 'SampleId', 'SampleTypeDiscriminator', 'StartDate', 'Subject',
# 'TypeIdentifier'
# 'QuantityId' is a unique identifier to link one platform measurement (steps, HR etc)
# 'CreatorSubject' refers to the patient. Links to 'Id' in DataServerDatasetQuantity
lenus_sample = pd.read_csv(
    os.path.join(data_dir, "DataServerDatasetSample.txt"),
    delimiter="|",
    usecols=['StartDate', 'EndDate', 'CreatorSubject', 'QuantityId',
             'TypeIdentifier', 'CreationDate'])

# Convert datetime columns (strings) to timezone-aware (UTC) datetimes,
# normalized to midnight. Not using a pandas apply to all columns here
# because it's very slow.
date_cols = ['StartDate', 'EndDate', 'CreationDate']
for col in date_cols:
    lenus_sample[col] = pd.to_datetime(lenus_sample[col], utc=True).dt.normalize()

# DataServerDatasetQuantity.txt contains columns: 'Id', 'Unit', 'value'
# 'Id' links to 'QuantityId' in DataServerDatasetSample
lenus_quantity = pd.read_csv(
    os.path.join(data_dir, "DataServerDatasetQuantity.txt"), delimiter="|")

# Merge platform data on measurement id.
platform_data = lenus_sample.merge(
    lenus_quantity, left_on='QuantityId', right_on='Id').drop(columns=['Id'])

# Apply lookups to units and measurement types.
platform_data['Units'] = copd.unit_lookup(platform_data['Unit'])
type_lookup = pd.read_csv('./lookups/type_lookup.txt')
# NOTE(review): this joins 'TypeIdentifier' against the lookup table's
# positional index — assumes type identifiers are row positions in
# type_lookup.txt; confirm against the lookup file.
platform_data = platform_data.merge(
    type_lookup, left_on='TypeIdentifier', right_on=type_lookup.index)

# Drop unwanted columns.
platform_data = platform_data.drop(columns=['TypeIdentifier', 'Unit'])

# Pivot the platform data to obtain one column per measurement type.
platform_data = pd.pivot_table(
    platform_data, values='Value',
    index=['StartDate', 'EndDate', 'CreationDate', 'CreatorSubject'],
    columns=['Description']).reset_index()

# Exacerbation study dataset: per usage below it carries 'LenusId',
# 'StudyId', 'PatientId', 'DateOfEvent' and 'IsExac' columns.
data = pd.read_pickle(os.path.join('/copd-dataset', 'exac_data.pkl'))
patients = data.LenusId.unique()


def filter_on_date_and_id(df, min_date, patients):
    """Keep rows created on/after `min_date` whose CreatorSubject is in `patients`."""
    return df[(df.CreationDate >= min_date) & (df.CreatorSubject.isin(patients))]


def resample_and_merge_median(df, fitbit):
    """Resample fitbit data to one median value per patient per day, then
    inner-join onto the study data on (LenusId, DateOfEvent)."""
    # Resample for one value per day.
    fitbit = fitbit.set_index('CreationDate').groupby('CreatorSubject').resample(
        '1d').median().dropna().reset_index()
    data = df.merge(fitbit, left_on=['LenusId', 'DateOfEvent'],
                    right_on=['CreatorSubject', 'CreationDate'], how='inner')
    return data


def resample_and_merge_last(df, fitbit):
    """Resample fitbit data to the last value per patient per day, then
    inner-join onto the study data on (LenusId, DateOfEvent).

    NOTE(review): mutates `fitbit` in place (adds a 'DateOfEvent' column).
    """
    fitbit['DateOfEvent'] = fitbit['CreationDate']
    # Resample for one value per day.
    # NOTE(review): reset_index(drop=True) discards the
    # (CreatorSubject, CreationDate) index; the merge below relies on
    # 'CreatorSubject' surviving as an ordinary column after .last(),
    # which is pandas-version dependent — confirm with the pinned version.
    fitbit = fitbit.set_index('CreationDate').groupby('CreatorSubject').resample(
        '1d').last().dropna().reset_index(drop=True)
    data = df.merge(fitbit, left_on=['LenusId', 'DateOfEvent'],
                    right_on=['CreatorSubject', 'DateOfEvent'], how='inner')
    return data


def print_numbers(df, measurement):
    """Print cohort summary: patient-days, unique patients and exacerbation
    counts, split by StudyId prefix ('RC' vs 'SU')."""
    fitbit_patients = pd.Series(df.StudyId.unique())
    print('{} patient days with {} data across {} unique patients ({} RC and {} SU)'.
          format(len(df), measurement, len(df.PatientId.unique()),
                 fitbit_patients.str.startswith('RC').sum(),
                 fitbit_patients.str.startswith('SU').sum()))
    exac_patients = pd.Series(df[df.IsExac == 1].StudyId.unique())
    print('{} exacerbations across {} patients ({} RC and {} SU)'.format(
        df.IsExac.sum(), len(df[df.IsExac == 1].PatientId.unique()),
        exac_patients.str.startswith('RC').sum(),
        exac_patients.str.startswith('SU').sum()))


# Select heart rate data from all platform data.
heart_rate = platform_data[platform_data['heart rate'].notna()][
    ['CreationDate', 'CreatorSubject', 'heart rate']]
# Filter for patients and dates of interest.
heart_rate = filter_on_date_and_id(heart_rate, min_date='2010-01-01',
                                   patients=patients)
hr_data = resample_and_merge_last(df=data, fitbit=heart_rate)
print_numbers(hr_data, 'HR')

# Select step-count data from all platform data.
steps = platform_data[platform_data['number of steps taken;'].notna()][[
    'CreationDate', 'CreatorSubject', 'number of steps taken;']]
# Filter for patients and dates of interest.
steps = filter_on_date_and_id(steps, min_date='2010-01-01', patients=patients)
steps_data = resample_and_merge_median(df=data, fitbit=steps)
print_numbers(steps_data, 'steps')

# Restrict the HR cohort to patients with at least one exacerbation, then
# compare the exacerbation vs non-exacerbation HR distributions.
hr_exac_patients = hr_data[hr_data.IsExac == 1]['PatientId'].unique()
hr_data = hr_data[hr_data.PatientId.isin(hr_exac_patients)]
hr_exac = hr_data[hr_data.IsExac == 1]['heart rate']
hr_no_exac = hr_data[hr_data.IsExac == 0]['heart rate']
# Fix: the test results were computed but discarded (notebook residue);
# print them so the script reports them.
print(ks_2samp(hr_exac, hr_no_exac))
print(cramervonmises_2samp(hr_exac, hr_no_exac))

# Same distribution comparison for step counts.
steps_exac_patients = steps_data[steps_data.IsExac == 1]['PatientId'].unique()
steps_data = steps_data[steps_data.PatientId.isin(steps_exac_patients)]
steps_exac = steps_data[steps_data.IsExac == 1]['number of steps taken;']
steps_no_exac = steps_data[steps_data.IsExac == 0]['number of steps taken;']
print(ks_2samp(steps_exac, steps_no_exac))
print(cramervonmises_2samp(steps_exac, steps_no_exac))

# Figure for the side-by-side HR histograms plotted below.
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True,
                         constrained_layout=True, figsize=(8, 6))
# Side-by-side density histograms of daily heart rate on non-exacerbation
# (left, 'a') vs exacerbation (right, 'b') days.
sns.histplot(hr_data[hr_data.IsExac == 0], x="heart rate", binwidth=5,
             binrange=[50, 100], alpha=.6, stat="density", legend=True,
             ax=axes[0], color=dark[0])
axes[0].set_xlabel(None)
# Fix: plt.legend() acts on the *current* axes, which after plt.subplots()
# is the last subplot (axes[1]) — so the 'a' legend landed on the wrong
# panel. Attach each legend to its own axes explicitly.
axes[0].legend(['a'])
sns.histplot(hr_data[hr_data.IsExac == 1], x="heart rate", binwidth=5,
             binrange=[50, 100], alpha=.6, stat="density", legend=True,
             ax=axes[1], color=dark[1])
axes[1].set_xlabel(None)
# One shared x-axis label for the whole figure.
fig.supxlabel('heart rate')
axes[1].legend(['b'])