# copd-model-h / training / create_sh_lookup_table.py
# (Initial upload, commit 000de75)
import os
import pandas as pd
import yaml
# Load pipeline configuration. Use a distinct name for the file handle so
# it does not shadow the resulting config dict.
with open("./training/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Read lookups for RECEIVER cohort; normalise the study-ID column name.
receiver = pd.read_csv(config['inputs']['raw_data_paths']['receiver_cohort'])
receiver = receiver.rename(columns={"RNo": "StudyId"})

# Read lookups for Scale Up cohort; normalise the study-ID column name.
scaleup = pd.read_csv(config['inputs']['raw_data_paths']['scale_up_cohort'])
scaleup = scaleup.rename(columns={"Study_Number": "StudyId"})

# Concatenate cohorts and drop rows with a missing SafeHavenID (some study
# patients are not in the data extract). Restricting dropna to the
# SafeHavenID column avoids silently discarding rows that are merely
# missing some other, unrelated field.
all_patients = pd.concat([receiver, scaleup]).dropna(subset=["SafeHavenID"])

# Save the final mapping between StudyId and SafeHavenID.
all_patients.to_pickle(os.path.join(config['outputs']['output_data_dir'], "sh_to_studyid_mapping.pkl"))
# Sanity-check the mapping: compare the sex and date of birth recorded in
# the SafeHaven extract against the Lenus patient details for every mapped
# patient.
lenus_demographics = pd.read_csv(
    config['inputs']['raw_data_paths']['patient_details'],
    usecols=["StudyId", "DateOfBirth", "Sex"],
    sep="|",
)
sh_demographics = pd.read_csv(
    config['inputs']['raw_data_paths']['sh_demographics'],
    usecols=["SafeHavenID", "SEX", "OBF_DOB"],
)
# Normalise the SafeHaven DOB to midnight UTC so it is directly comparable
# with the Lenus dates.
dob_utc = pd.to_datetime(sh_demographics["OBF_DOB"], utc=True)
sh_demographics["OBF_DOB"] = dob_utc.dt.normalize()

# Join both demographic sources onto the StudyId <-> SafeHavenID mapping.
mapping = (
    all_patients
    .merge(sh_demographics, on="SafeHavenID", how="inner")
    .merge(lenus_demographics, on="StudyId", how="inner")
)

# Rows whose recorded sex differs between the two sources.
print(mapping[mapping.SEX != mapping.Sex])
# There is one mismatch — show study patients mapped to a duplicated
# SafeHavenID, which explains it.
print(all_patients[all_patients.duplicated(subset="SafeHavenID", keep=False)])
# Rows whose date of birth differs between the two sources.
print(mapping[mapping.OBF_DOB != mapping.DateOfBirth])
# Inspect the known problem record.
print(mapping[mapping["StudyId"] == "SU126"])