| import os |
| import pandas as pd |
| import yaml |
|
|
# Load the script configuration from YAML. Later statements read the input CSV
# paths from config["inputs"]["raw_data_paths"] and the output directory from
# config["outputs"]["output_data_dir"].
#
# The file handle has its own name so that `config` only ever refers to the
# parsed mapping, and the encoding is explicit so the file is decoded the same
# way regardless of the platform's default locale encoding.
with open("./training/config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)
|
|
| |
# Locations of the raw input files, as listed in the configuration.
cohort_paths = config["inputs"]["raw_data_paths"]

# Receiver cohort: its "RNo" column is exposed under the shared name "StudyId".
receiver = pd.read_csv(cohort_paths["receiver_cohort"]).rename(
    columns={"RNo": "StudyId"}
)

# Scale-up cohort: its "Study_Number" column is exposed under the same name.
scaleup = pd.read_csv(cohort_paths["scale_up_cohort"]).rename(
    columns={"Study_Number": "StudyId"}
)

# Stack the two cohorts, then discard every row that has a missing value in
# any column.
# NOTE(review): this presumes both files share the same columns after the
# rename; a column present in only one file would be all-NaN for the other
# cohort and dropna() would remove those rows - confirm against the data.
all_patients = pd.concat([receiver, scaleup])
all_patients = all_patients.dropna()

# Persist the combined cohort table next to the other pipeline outputs.
mapping_pickle_path = os.path.join(
    config["outputs"]["output_data_dir"], "sh_to_studyid_mapping.pkl"
)
all_patients.to_pickle(mapping_pickle_path)
|
|
| |
# Demographics keyed by StudyId (pipe-delimited file).
lenus_demographics = pd.read_csv(
    config["inputs"]["raw_data_paths"]["patient_details"],
    usecols=["StudyId", "DateOfBirth", "Sex"],
    sep="|",
)
# Demographics keyed by SafeHavenID.
sh_demographics = pd.read_csv(
    config["inputs"]["raw_data_paths"]["sh_demographics"],
    usecols=["SafeHavenID", "SEX", "OBF_DOB"],
)

# Convert BOTH date-of-birth columns to timezone-aware (UTC) timestamps
# truncated to midnight. Previously only OBF_DOB was converted while
# DateOfBirth stayed as the raw text read from the CSV, so comparing the two
# columns after the merge depended on pandas' implicit string handling rather
# than on the dates themselves.
sh_demographics["OBF_DOB"] = pd.to_datetime(
    sh_demographics["OBF_DOB"], utc=True
).dt.normalize()
lenus_demographics["DateOfBirth"] = pd.to_datetime(
    lenus_demographics["DateOfBirth"], utc=True
).dt.normalize()

# Attach each source's demographics to the cohort table. Inner joins keep only
# the patients that are present in all three tables.
mapping = all_patients.merge(sh_demographics, on="SafeHavenID", how="inner")
mapping = mapping.merge(lenus_demographics, on="StudyId", how="inner")
|
|
| |
# Rows where the two sex columns ("SEX" and "Sex") disagree.
sex_mismatch = mapping["SEX"] != mapping["Sex"]
print(mapping.loc[sex_mismatch])

# Every cohort row whose SafeHavenID appears more than once (all occurrences
# are shown, not just the repeats).
repeated_ids = all_patients.duplicated(subset="SafeHavenID", keep=False)
print(all_patients.loc[repeated_ids])

# Rows where the two date-of-birth columns ("OBF_DOB" and "DateOfBirth")
# disagree.
# NOTE(review): the result is only meaningful if both columns hold comparable
# values (same dtype and normalisation) at this point - confirm.
dob_mismatch = mapping["OBF_DOB"] != mapping["DateOfBirth"]
print(mapping.loc[dob_mismatch])

# Show the merged record(s) for one hard-coded StudyId.
print(mapping.loc[mapping["StudyId"] == "SU126"])
|
|