# copd-model-c / training / fitbit_exploration.py
# Author: IamGrooooot
# Initial release: 72-hour COPD exacerbation prediction model (commit e69d4e4)
import copd
import os
import pandas as pd
from scipy.stats import ks_2samp, cramervonmises_2samp
import seaborn as sns
import matplotlib.pyplot as plt
# Global plotting theme: dark grid, "talk"-sized fonts, dark default palette.
sns.set(style='darkgrid', context='talk')
sns.set_palette('dark')
# Named palettes kept around to pick explicit colors for the histograms below.
muted = sns.palettes.color_palette(palette='muted')
dark = sns.palettes.color_palette(palette='dark')
# Root directory of the exported Lenus platform dataset (pipe-delimited .txt dumps).
data_dir = '<YOUR_DATA_PATH>/lenus-samples-dataset'
# Load platform data.
# DataServerDatasetSample.txt contains columns: 'Id', 'CategoryValue', 'ClientAssignedId',
# 'ClientId', 'CreationDate', 'CreatorSubject', 'DiscriminatedTypeIdentifier', 'EndDate',
# 'QuantityId', 'SampleId', 'SampleTypeDiscriminator', 'StartDate', 'Subject',
# 'TypeIdentifier'
# 'QuantityId' uniquely identifies one platform measurement (steps, HR, etc.).
# 'CreatorSubject' refers to the patient. Links to 'Id' in DataServerDatasetQuantity.
lenus_sample = pd.read_csv(
    os.path.join(data_dir, "DataServerDatasetSample.txt"),
    delimiter="|",
    usecols=['StartDate', 'EndDate', 'CreatorSubject', 'QuantityId',
             'TypeIdentifier', 'CreationDate'])
# Parse the date columns to tz-aware (UTC) datetimes truncated to midnight.
# Looping per column on purpose: a DataFrame-wide apply is very slow here.
date_cols = ['StartDate', 'EndDate', 'CreationDate']
for date_col in date_cols:
    lenus_sample[date_col] = pd.to_datetime(lenus_sample[date_col],
                                            utc=True).dt.normalize()
# DataServerDatasetQuantity.txt contains columns: 'Id', 'Unit', 'value'.
# 'Id' links to 'QuantityId' in DataServerDatasetSample.
lenus_quantity = pd.read_csv(
    os.path.join(data_dir, "DataServerDatasetQuantity.txt"), delimiter="|")
# Join samples to their measured quantities; the redundant right-hand key
# column 'Id' duplicates 'QuantityId' and is dropped after the merge.
platform_data = (lenus_sample
                 .merge(lenus_quantity, left_on='QuantityId', right_on='Id')
                 .drop(columns=['Id']))
# Apply lookups to units and measurement types.
platform_data['Units'] = copd.unit_lookup(platform_data['Unit'])
type_lookup = pd.read_csv('./lookups/type_lookup.txt')
# NOTE(review): this joins against type_lookup's row index passed as an array;
# right_index=True would express the same join more directly — confirm before
# changing. The raw code columns are dropped once the lookups are applied.
platform_data = (platform_data
                 .merge(type_lookup, left_on='TypeIdentifier',
                        right_on=type_lookup.index)
                 .drop(columns=['TypeIdentifier', 'Unit']))
# Pivot so each measurement type ('Description') becomes its own column,
# one row per (StartDate, EndDate, CreationDate, CreatorSubject).
platform_data = pd.pivot_table(
    platform_data, values='Value',
    index=['StartDate', 'EndDate', 'CreationDate', 'CreatorSubject'],
    columns=['Description']).reset_index()
# Ground-truth exacerbation dataset; 'LenusId' links patients to platform data.
exac_path = os.path.join('<YOUR_DATA_PATH>/copd-dataset', 'exac_data.pkl')
data = pd.read_pickle(exac_path)
patients = data.LenusId.unique()
def filter_on_date_and_id(df, min_date, patients):
    """Return rows of df created on/after min_date whose CreatorSubject is in patients."""
    recent_enough = df.CreationDate >= min_date
    is_patient_of_interest = df.CreatorSubject.isin(patients)
    return df[recent_enough & is_patient_of_interest]
def resample_and_merge_median(df, fitbit):
    """Collapse fitbit data to one daily median per patient and inner-join onto df.

    df must carry 'LenusId' and 'DateOfEvent'; fitbit must carry
    'CreationDate' and 'CreatorSubject' plus measurement columns.
    """
    # One row per (patient, day): daily median of each measurement column.
    daily = (fitbit.set_index('CreationDate')
             .groupby('CreatorSubject')
             .resample('1d')
             .median()
             .dropna()
             .reset_index())
    # Keep only patient-days present on both sides.
    return df.merge(daily, left_on=['LenusId', 'DateOfEvent'],
                    right_on=['CreatorSubject', 'CreationDate'], how='inner')
def resample_and_merge_last(df, fitbit):
fitbit['DateOfEvent'] = fitbit['CreationDate']
# Resample for one value per day
fitbit = fitbit.set_index('CreationDate').groupby('CreatorSubject').resample(
'1d').last().dropna().reset_index(drop=True)
data = df.merge(fitbit, left_on=['LenusId', 'DateOfEvent'],
right_on=['CreatorSubject', 'DateOfEvent'], how='inner')
return data
def print_numbers(df, measurement):
    """Print patient-day and exacerbation counts, split by RC and SU study sites."""
    study_ids = pd.Series(df.StudyId.unique())
    n_rc = study_ids.str.startswith('RC').sum()
    n_su = study_ids.str.startswith('SU').sum()
    print('{} patient days with {} data across {} unique patients ({} RC and {} SU)'
          .format(len(df), measurement, len(df.PatientId.unique()), n_rc, n_su))
    exac_rows = df[df.IsExac == 1]
    exac_ids = pd.Series(exac_rows.StudyId.unique())
    print('{} exacerbations across {} patients ({} RC and {} SU)'
          .format(df.IsExac.sum(), len(exac_rows.PatientId.unique()),
                  exac_ids.str.startswith('RC').sum(),
                  exac_ids.str.startswith('SU').sum()))
# Select heart rate data from all platform data.
heart_rate = platform_data[platform_data['heart rate'].notna()][
    ['CreationDate', 'CreatorSubject', 'heart rate']]
# Filter for patients and dates of interest.
heart_rate = filter_on_date_and_id(heart_rate, min_date='2010-01-01', patients=patients)
# (Removed a bare `heart_rate.columns` expression here — a no-op leftover from
# notebook conversion that displayed the columns interactively.)
hr_data = resample_and_merge_last(df=data, fitbit=heart_rate)
print_numbers(hr_data, 'HR')
# Select step-count data from all platform data.
# NOTE: the trailing ';' in the column name comes from the type lookup table.
steps = platform_data.loc[platform_data['number of steps taken;'].notna(),
                          ['CreationDate', 'CreatorSubject',
                           'number of steps taken;']]
# Filter for patients and dates of interest.
steps = filter_on_date_and_id(steps, min_date='2010-01-01', patients=patients)
steps_data = resample_and_merge_median(df=data, fitbit=steps)
print_numbers(steps_data, 'steps')
# Restrict HR data to patients who had at least one exacerbation day.
hr_exac_patients = hr_data.loc[hr_data.IsExac == 1, 'PatientId'].unique()
hr_data = hr_data[hr_data.PatientId.isin(hr_exac_patients)]
hr_exac = hr_data.loc[hr_data.IsExac == 1, 'heart rate']
hr_no_exac = hr_data.loc[hr_data.IsExac == 0, 'heart rate']
# Two-sample tests: do HR distributions differ on exacerbation vs normal days?
# NOTE(review): results are displayed interactively (notebook residue), not stored.
ks_2samp(hr_exac, hr_no_exac)
cramervonmises_2samp(hr_exac, hr_no_exac)
# Restrict step data to patients who had at least one exacerbation day.
steps_exac_patients = steps_data.loc[steps_data.IsExac == 1, 'PatientId'].unique()
steps_data = steps_data[steps_data.PatientId.isin(steps_exac_patients)]
steps_exac = steps_data.loc[steps_data.IsExac == 1, 'number of steps taken;']
steps_no_exac = steps_data.loc[steps_data.IsExac == 0, 'number of steps taken;']
# Two-sample tests: do step distributions differ on exacerbation vs normal days?
# NOTE(review): results are displayed interactively (notebook residue), not stored.
ks_2samp(steps_exac, steps_no_exac)
cramervonmises_2samp(steps_exac, steps_no_exac)
# Side-by-side HR density histograms: non-exacerbation (left) vs exacerbation (right).
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True,
constrained_layout=True, figsize=(8, 6))
sns.histplot(hr_data[hr_data.IsExac == 0], x="heart rate", binwidth=5, binrange=[50, 100],
alpha=.6, stat="density", legend=True, ax=axes[0], color=dark[0])
axes[0].set_xlabel(None)
# NOTE(review): plt.legend acts on the *current* axes, which after plt.subplots
# is the last one created (axes[1]), not axes[0] — confirm this legend lands on
# the intended panel; axes[0].legend(['a']) may be what was meant.
plt.legend(['a'])
sns.histplot(hr_data[hr_data.IsExac == 1], x="heart rate", binwidth=5, binrange=[50, 100],
alpha=.6, stat="density", legend=True, ax=axes[1], color=dark[1])
axes[1].set_xlabel(None)
# Single shared x-axis label for both panels.
fig.supxlabel('heart rate')
plt.legend(['b'])