"""
prep_data.py — Run once to generate the parquet data files for the Patient Portal app.
Saves 3 parquets (plus sampled_stay_ids.json and state_cols.json) to the
streamlit/ root (parent of this script's directory).

    cd streamlit/app
    python prep_data.py
"""
| import json |
|
|
| import numpy as np |
| import pandas as pd |
| from datasets import load_dataset |
| from pathlib import Path |
|
|
# Output directory: the parent of the directory that contains this script.
# resolve() makes the path absolute first. When __file__ is the relative path
# typed on the command line (e.g. `python prep_data.py` on interpreters older
# than 3.9), Path(__file__).parent.parent would otherwise collapse to "." and
# every output would land in the current working directory instead.
OUT_DIR = Path(__file__).resolve().parent.parent

# Hugging Face dataset repositories this script reads from.
PATIENT_REPO = "ADS599-Capstone/interim_data"
DATA_REPO = "ADS599-Capstone/sbs_predictions"
|
|
def _load_frame(repo, config, split):
    """Load one config/split of a dataset repo and return it as a pandas DataFrame."""
    return load_dataset(repo, name=config, split=split).to_pandas()


print("Loading HuggingFace datasets — this may take a few minutes...")

# Cohort, weight, height and triage-vitals tables come from PATIENT_REPO;
# the per-step prediction table comes from DATA_REPO.
patient_df = _load_frame(PATIENT_REPO, 'cohort_full', 'cohort_base')
weight_df = _load_frame(PATIENT_REPO, 'weight', 'weight_full')
height_df = _load_frame(PATIENT_REPO, 'height', 'height_full')
tv_df = _load_frame(PATIENT_REPO, 'triage_vitals', 'triage_vitals_full')
data_df = _load_frame(DATA_REPO, 'sbs_preds', 'sbs_preds_full')

# Quick size summary so a run can be sanity-checked from the console.
print(f" cohort: {len(patient_df):,} rows")
print(f" data_df: {data_df.shape}")
|
|
| |
# --- Sample a fixed-size subset of stays, split by terminal event -----------
SAMPLE_SEED = 10            # fixed seed so repeated runs draw the same stays
MAX_ICU_STAYS = 1000        # upper bound on 'transfer_icu' stays kept
MAX_DISCHARGE_STAYS = 4000  # upper bound on 'discharge' stays kept

rng = np.random.default_rng(SAMPLE_SEED)

# One row per ed_stay_id (the first occurrence), with its terminal_event.
stay_meta = data_df.drop_duplicates('ed_stay_id')[['ed_stay_id', 'terminal_event']]
is_icu = stay_meta['terminal_event'] == 'transfer_icu'
is_discharge = stay_meta['terminal_event'] == 'discharge'
icu_stays = stay_meta.loc[is_icu, 'ed_stay_id'].tolist()
dis_stays = stay_meta.loc[is_discharge, 'ed_stay_id'].tolist()

# Never ask for more stays than exist in a group.
n_icu = min(MAX_ICU_STAYS, len(icu_stays))
n_dis = min(MAX_DISCHARGE_STAYS, len(dis_stays))

# The ICU draw must happen before the discharge draw: both consume the same
# generator, so the call order determines which ids are selected.
icu_sample = rng.choice(icu_stays, n_icu, replace=False).tolist()
dis_sample = rng.choice(dis_stays, n_dis, replace=False).tolist()
sampled_ids = set(icu_sample + dis_sample)

# Keep only the rows that belong to a sampled stay.
data_df = data_df[data_df['ed_stay_id'].isin(sampled_ids)].copy()
print(f" Sampled {len(sampled_ids):,} stays ({n_icu:,} ICU, {n_dis:,} discharge)")

# Persist the sampled ids (sorted) next to the parquet outputs.
with open(OUT_DIR / 'sampled_stay_ids.json', 'w') as f:
    json.dump(sorted(sampled_ids), f)
|
|
| |
# --- Build the ordered list of feature ("state") columns --------------------
all_columns = list(data_df.columns)

# dispensed_meds is every column positioned from 'ACE Inhibitor' through
# 'Other' (inclusive). The label slice needs BOTH endpoints to exist, so both
# are checked; if either is absent the group is treated as empty instead of
# raising KeyError on the missing end label.
MED_FIRST, MED_LAST = 'ACE Inhibitor', 'Other'
if MED_FIRST in data_df.columns and MED_LAST in data_df.columns:
    dispensed_meds = list(data_df.loc[:, MED_FIRST:MED_LAST].columns)
else:
    dispensed_meds = []

# Remaining groups are selected purely by column-name prefix/suffix.
recon = [c for c in all_columns if c.startswith('recon_')]
vitals = [c for c in all_columns if c.startswith('current_')]
vital_change = [
    c for c in all_columns
    if c.endswith(('_rolling1h', '_delta', '_rate_per_min'))
]
# A '-' in the name separates lab_ohe columns from micro_ohe ones, which share
# the '_Pending' suffix.
lab_ohe = [
    c for c in all_columns
    if c.endswith(('_Normal', '_Pending', '_Abnormal')) and '-' in c
]
micro_ohe = [
    c for c in all_columns
    if c.endswith(('_Pending', '_Positive', '_Negative', '_Other'))
    and '-' not in c
    and not c.startswith(('ecg_status', 'rad_status'))
]
ecg_ohe = [c for c in all_columns if c.startswith('ecg_status')]
rad_ohe = [c for c in all_columns if c.startswith('rad_status')]
arrival = [c for c in all_columns if c.startswith('arrival_')]
missing = [c for c in all_columns if c.endswith('_missing')]

# NOTE(review): the groups are concatenated without de-duplication, so a
# column matching more than one pattern (e.g. starts with 'current_' and ends
# with '_delta' or '_missing') appears more than once in state_cols — confirm
# that whatever consumes state_cols.json expects this.
state_cols = (
    ['gender', 'anchor_age', 'acuity', 'height', 'weight', 'time_since_last_min']
    + dispensed_meds + recon + vitals + vital_change
    + lab_ohe + micro_ohe + ecg_ohe + rad_ohe + arrival + missing
)
# Drop any name that is not actually a column of data_df.
state_cols = [c for c in state_cols if c in data_df.columns]

with open(OUT_DIR / 'state_cols.json', 'w') as f:
    json.dump(state_cols, f)
print(f" state_cols.json: {len(state_cols)} features")
|
|
# --- patient_stats.parquet: per-stay attributes for the sampled stays -------
eds = set(data_df['ed_stay_id'].unique())


def _mean_result_by_subject(measure_df, out_name):
    """Average `result_value` per subject_id; returns columns [subject_id, out_name]."""
    per_subject = measure_df.groupby('subject_id')['result_value'].mean()
    return per_subject.rename(out_name).reset_index()


avg_weight = _mean_result_by_subject(weight_df, 'weight')
avg_height = _mean_result_by_subject(height_df, 'height')

# Columns taken from the cohort table.
cohort_cols = [
    'ed_stay_id', 'subject_id', 'ed_intime',
    'arrival_transport', 'gender', 'anchor_age', 'language',
]

# Every 'current*' column of the triage-vitals table except
# 'current_mean_arterial_pressure'.
triage_vital_cols = []
for col in tv_df.columns:
    if col.startswith('current') and col != 'current_mean_arterial_pressure':
        triage_vital_cols.append(col)
triage_cols = triage_vital_cols + ['acuity', 'chiefcomplaint', 'ed_stay_id']

# Only rows whose source is 'triage' are used as the baseline.
# NOTE(review): if tv_df holds more than one 'triage' row for an ed_stay_id,
# the left merge below repeats that stay in patient_stats — confirm uniqueness.
triage_baseline = tv_df.loc[tv_df['source'] == 'triage', triage_cols]

# Left-merge the per-subject averages and the triage baseline onto the cohort,
# then keep only the stays present in data_df.
patient_stats = patient_df[cohort_cols]
for right_df, join_key in (
    (avg_weight, 'subject_id'),
    (avg_height, 'subject_id'),
    (triage_baseline, 'ed_stay_id'),
):
    patient_stats = patient_stats.merge(right_df, on=join_key, how='left')

in_sample = patient_stats['ed_stay_id'].isin(eds)
patient_stats = patient_stats[in_sample].copy()
patient_stats['ed_intime'] = pd.to_datetime(patient_stats['ed_intime'])

patient_stats.to_parquet(OUT_DIR / 'patient_stats.parquet', index=False)
print(f" patient_stats.parquet: {len(patient_stats):,} rows, {len(patient_stats.columns)} cols")
|
|
| |
| |
# --- patient_probability.parquet: per-step p_icu plus stay/step identifiers -
prob_cols = [
    'ed_stay_id',
    'in_ed',
    'in_ward',
    'terminal_event',
    'terminal_code',
    'step_idx',
    'time',
    'p_icu',
]

# All of these columns are required: selecting a missing one raises KeyError.
patient_probability = data_df.loc[:, prob_cols].copy()
probability_path = OUT_DIR / 'patient_probability.parquet'
patient_probability.to_parquet(probability_path, index=False)
print(f" patient_probability.parquet: {len(patient_probability):,} rows")
|
|
| |
| |
# --- step_features.parquet: selected per-step columns -----------------------
# The requested columns are listed group by group; the concatenation order is
# the column order requested from data_df.
_step_key_cols = [
    'ed_stay_id', 'step_idx', 'time', 'in_ed', 'in_ward', 'terminal_code',
]
_step_vital_cols = [
    'current_temperature', 'current_heartrate', 'current_resprate',
    'current_o2sat', 'current_sbp', 'current_dbp', 'current_pain', 'current_map',
]
# Lab-panel and blood-culture status columns.
_step_lab_cols = [
    'Chemistry-Blood_Pending', 'Chemistry-Blood_Normal', 'Chemistry-Blood_Abnormal',
    'Hematology-Blood_Pending', 'Hematology-Blood_Normal', 'Hematology-Blood_Abnormal',
    'Blood Gas-Blood_Pending', 'Blood Gas-Blood_Normal', 'Blood Gas-Blood_Abnormal',
    'BLOOD CULTURE_Pending', 'BLOOD CULTURE_Negative', 'BLOOD CULTURE_Positive',
]
# ECG and radiology status columns.
_step_status_cols = [
    'ecg_status_Normal', 'ecg_status_Moderate', 'ecg_status_Acute',
    'rad_status_Normal', 'rad_status_Moderate', 'rad_status_Acute',
]
# Medication-class columns (presumably dispensing flags — confirm against the
# dataset schema).
_step_med_cols = [
    'Antibiotic', 'IV Fluid', 'Analgesic - Opioid/NSAID', 'Analgesic - Acetaminophen',
    'Antiemetic', 'Anticoagulant', 'Corticosteroid',
    'Benzodiazepine - Sedative/Anxiolytic', 'Beta Blocker', 'Diuretic', 'Bronchodilator',
]

step_feature_cols = (
    _step_key_cols
    + _step_vital_cols
    + _step_lab_cols
    + _step_status_cols
    + _step_med_cols
    + ['p_icu']
)

# Split the requested names into those present in data_df and those absent.
# Absent ones are reported and skipped rather than treated as an error.
available_cols = []
missing_cols = []
for col in step_feature_cols:
    if col in data_df.columns:
        available_cols.append(col)
    else:
        missing_cols.append(col)

if missing_cols:
    print(f" WARNING: columns not found in data_df: {missing_cols}")

step_features = data_df[available_cols].copy()
step_features.to_parquet(OUT_DIR / 'step_features.parquet', index=False)
print(f" step_features.parquet: {len(step_features):,} rows, {len(available_cols)} cols")

print("\nDone. Parquets saved to:", OUT_DIR)