"""Build and sanity-check the StudyId <-> SafeHavenID mapping.

Reads the RECEIVER and scale-up lookup tables, concatenates them, saves the
combined mapping as a pickle, and then compares sex and date of birth between
the SafeHaven and Lenus demographic extracts to check the mapping.
"""
import os

import pandas as pd

data_dir = '/'

# Read lookups for RECEIVER
# receiver = pd.read_csv(os.path.join(data_dir, 'Receiver_IDs', 'COHORT_CONSENTED_2.csv'))
# receiver = receiver.rename(columns={'Study number': 'StudyId'})
# receiver = receiver[['SafeHavenID', 'StudyId']]
receiver = pd.read_csv(os.path.join(data_dir, 'Cohort3Rand.csv'))
receiver = receiver.rename(columns={'RNo': 'StudyId'})

# Read lookups for scale up
scaleup = pd.read_csv(os.path.join(data_dir, 'SU_IDs', 'Scale_Up_lookup.csv'))
scaleup = scaleup.rename(columns={'Study_Number': 'StudyId'})

# Concatenate tables and drop missing SH IDs (some study patients not in data
# extract).
# NOTE(review): dropna() without `subset` drops a row when ANY column is NaN.
# If the two lookups do not share exactly the same columns, the concat fills
# the gaps with NaN and rows are dropped for reasons other than a missing
# SafeHavenID - confirm the column sets match, or restrict with `subset=`.
all_patients = pd.concat([receiver, scaleup]).dropna()

# Save final mapping between StudyId and SafeHavenID
all_patients.to_pickle(os.path.join(data_dir, 'sh_to_studyid_mapping.pkl'))

# Check for matching age and sex between SafeHaven and Lenus data (mapping
# sanity check)
lenus_demographics = pd.read_csv(
    os.path.join(data_dir, 'copd-dataset', 'CopdDatasetPatientDetails.txt'),
    usecols=['StudyId', 'DateOfBirth', 'Sex'],
    sep='|',
)
# Parse the Lenus DOB the same way as the SafeHaven DOB below. Without this
# the column stays as strings and the DOB comparison pits Timestamps against
# str objects, which can never compare equal.
lenus_demographics['DateOfBirth'] = pd.to_datetime(
    lenus_demographics['DateOfBirth'], utc=True).dt.normalize()

sh_demographics = pd.read_csv(
    os.path.join(data_dir, 'EXAMPLE_STUDY_DATA', 'Demographics_Cohort4.csv'),
    usecols=['SafeHavenID', 'SEX', 'OBF_DOB'],
)
sh_demographics['OBF_DOB'] = pd.to_datetime(
    sh_demographics['OBF_DOB'], utc=True).dt.normalize()

# Inner joins: only patients present in the mapping and in both demographic
# extracts are compared.
mapping = all_patients.merge(sh_demographics, on='SafeHavenID', how='inner')
mapping = mapping.merge(lenus_demographics, on='StudyId', how='inner')

# Check patient sex matches.
# The results are bound to names and printed so the checks are visible when
# the file is run as a script (a bare expression is only echoed in an
# interactive session).
sex_mismatches = mapping[mapping.SEX != mapping.Sex]
print('Sex mismatches:')
print(sex_mismatches)

# There is one mismatch
duplicated_sh_ids = all_patients[all_patients.duplicated(subset='SafeHavenID')]
print('Duplicated SafeHavenIDs:')
print(duplicated_sh_ids)

# Check patient DOB matches
dob_mismatches = mapping[mapping.OBF_DOB != mapping.DateOfBirth]
print('DOB mismatches:')
print(dob_mismatches)