File size: 1,861 Bytes
e69d4e4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | import os
import pandas as pd
# Root directory containing the lookup tables and data extracts (placeholder
# value; set it for the local environment).
data_dir = '<YOUR_DATA_PATH>/'

# Read lookups for RECEIVER.
# Alternative RECEIVER lookup source, currently disabled:
# receiver = pd.read_csv(os.path.join(data_dir, 'Receiver_IDs', 'COHORT_CONSENTED_2.csv'))
# receiver = receiver.rename(columns={'Study number': 'StudyId'})
# receiver = receiver[['SafeHavenID', 'StudyId']]
receiver = pd.read_csv(os.path.join(data_dir, 'Cohort3Rand.csv'))
receiver = receiver.rename(columns={'RNo': 'StudyId'})

# Read lookups for scale up.
scaleup = pd.read_csv(os.path.join(data_dir, 'SU_IDs', 'Scale_Up_lookup.csv'))
scaleup = scaleup.rename(columns={'Study_Number': 'StudyId'})

# Concatenate tables and drop missing SH IDs (some study patients not in data
# extract). The drop is limited to the two ID columns: pd.concat fills any
# column that exists in only one of the lookups with NaN, so an unrestricted
# dropna() would discard patients whose IDs are perfectly valid.
all_patients = pd.concat([receiver, scaleup]).dropna(
    subset=['SafeHavenID', 'StudyId'])

# Save final mapping between StudyId and SafeHavenID.
all_patients.to_pickle(os.path.join(data_dir, 'sh_to_studyid_mapping.pkl'))
# Check for matching date of birth and sex between SafeHaven and Lenus data
# (mapping sanity check).
lenus_demographics = pd.read_csv(
    os.path.join(data_dir, 'copd-dataset', 'CopdDatasetPatientDetails.txt'),
    usecols=['StudyId', 'DateOfBirth', 'Sex'], sep='|')
# Parse DateOfBirth exactly like OBF_DOB below. Left as raw text, it would be
# compared against timestamps and every row would be reported as a mismatch.
lenus_demographics['DateOfBirth'] = pd.to_datetime(
    lenus_demographics['DateOfBirth'], utc=True).dt.normalize()

sh_demographics = pd.read_csv(
    os.path.join(data_dir, 'EXAMPLE_STUDY_DATA', 'Demographics_Cohort4.csv'),
    usecols=['SafeHavenID', 'SEX', 'OBF_DOB'])
# Convert to UTC timestamps truncated to midnight so only the date is compared.
sh_demographics['OBF_DOB'] = pd.to_datetime(
    sh_demographics['OBF_DOB'], utc=True).dt.normalize()

# Attach both sets of demographics to each mapped patient. Inner joins keep
# only patients present in the mapping and in both demographics tables.
mapping = all_patients.merge(sh_demographics, on='SafeHavenID', how='inner')
mapping = mapping.merge(lenus_demographics, on='StudyId', how='inner')

# The check results are bound to names and their sizes printed: a bare
# expression is silently discarded when this file is run as a script.

# Check patient sex matches.
sex_mismatches = mapping[mapping.SEX != mapping.Sex]
print('Rows with mismatched sex:', len(sex_mismatches))

# Original note: there is one mismatch.
# List SafeHavenIDs that occur more than once in the mapping (every occurrence
# after the first is returned).
duplicated_safehaven_ids = all_patients[
    all_patients.duplicated(subset='SafeHavenID')]
print('Rows with a repeated SafeHavenID:', len(duplicated_safehaven_ids))

# Check patient DOB matches. Rows where a date is missing also appear here,
# because a missing timestamp never compares equal to anything.
dob_mismatches = mapping[mapping.OBF_DOB != mapping.DateOfBirth]
print('Rows with mismatched DOB:', len(dob_mismatches))
|