import io
import json
import os
import posixpath
import zipfile

import pandas as pd
import streamlit as st
from huggingface_hub import hf_hub_download, list_repo_files
from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode

REPO_ID = "PortPy-Project/PortPy_Dataset"

# Optional Hugging Face token (e.g., to avoid rate limits); the dataset is public, so None also works.
token = os.getenv("HF_TOKEN")


@st.cache_data
def get_patient_ids():
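    """Return a DataFrame of patient IDs with an inferred disease-site column.

    Reads data_info.jsonl from the dataset repo; the disease site is taken as
    the patient_id prefix before the first underscore (e.g., "Lung_Patient_1"
    yields "Lung").
    """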
    file = hf_hub_download(REPO_ID, repo_type="dataset", filename="data_info.jsonl", local_dir="./temp", token=token)
    with open(file) as f:
        data_info = [json.loads(line) for line in f]
    patient_ids = [pat['patient_id'] for pat in data_info]
    df = pd.DataFrame(patient_ids, columns=["patient_id"])
    df["disease_site"] = df["patient_id"].str.extract(r"^(.*?)_")
    return df


@st.cache_data
def load_all_metadata(disease_site):
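    """Collect structure, beam, and planner-beam metadata for every patient at a site.

    Downloads several small JSON files per patient, so the first call for a
    site can be slow; Streamlit caches the result afterwards.
    """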
    patient_df = get_patient_ids()
    filtered_patients = patient_df[patient_df["disease_site"] == disease_site]

    metadata = {}
    for patient_id in filtered_patients["patient_id"]:
        structs = load_structure_metadata(patient_id)
        beams = load_beam_metadata(patient_id)
        planner_file = hf_hub_download(REPO_ID, repo_type="dataset", filename=f"data/{patient_id}/PlannerBeams.json", local_dir="./temp", token=token)
        with open(planner_file) as f:
            planner_data = json.load(f)
        planner_beam_ids = planner_data.get("IDs", [])
        metadata[patient_id] = {
            "structures": structs,
            "beams": beams,
            "planner_beam_ids": planner_beam_ids,
        }

    return metadata


@st.cache_data
def load_structure_metadata(patient_id):
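    """Download and parse a patient's StructureSet_MetaData.json."""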
    file = hf_hub_download(REPO_ID, repo_type="dataset", filename=f"data/{patient_id}/StructureSet_MetaData.json", local_dir="./temp", token=token)
    with open(file) as f:
        return json.load(f)


@st.cache_data
def load_beam_metadata(patient_id):
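    """Download and parse every Beam_*_MetaData.json under the patient's Beams/ folder."""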
    files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
    beam_meta_paths = [
        f for f in files
        if f.startswith(f"data/{patient_id}/Beams/Beam_") and f.endswith("_MetaData.json")
    ]

    beam_meta = []
    for path in beam_meta_paths:
        file = hf_hub_download(REPO_ID, repo_type="dataset", filename=path, local_dir="./temp", token=token)
        with open(file) as f:
            beam_meta.append(json.load(f))
    return beam_meta


def get_patient_summary_from_cached_data(patient_id, all_metadata):
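    """Summarize one patient from cached metadata: PTV volume and beam list.

    Uses the first structure whose name contains "PTV"; if a patient has
    several PTVs, whichever appears first in the structure set wins.
    """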
    structs = all_metadata[patient_id]["structures"]
    beams = all_metadata[patient_id]["beams"]

    ptv_vol = None
    for s in structs:
        if "PTV" in s["name"].upper():
            ptv_vol = s.get("volume_cc")
            break

    return {
        "ptv_volume": ptv_vol,
        "num_beams": len(beams),
        "beams": beams,
    }


def filter_matched_data(filtered_patients, query_ptv_vol, beam_gantry_filter,
                        beam_collimator_filter, beam_energy_filter, beam_couch_filter,
                        only_planner, all_metadata):
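    """Apply the PTV-volume and per-beam filters; return one row per matching patient.

    Angle filters are parsed as comma-separated integers and matched by exact
    membership, so the metadata values are assumed to be stored as integers.
    """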
    matched = []
    gantry_angles = set(map(int, beam_gantry_filter.split(","))) if beam_gantry_filter else None
    collimator_angles = set(map(int, beam_collimator_filter.split(","))) if beam_collimator_filter else None
    couch_angles = set(map(int, beam_couch_filter.split(","))) if beam_couch_filter else None
    energies = set(beam_energy_filter.replace(" ", "").split(",")) if beam_energy_filter else None

    for pid in filtered_patients["patient_id"]:
        summary = get_patient_summary_from_cached_data(pid, all_metadata)
        if summary["ptv_volume"] is None or summary["ptv_volume"] < query_ptv_vol:
            continue

        selected_beams = summary["beams"]
        if gantry_angles:
            selected_beams = [b for b in selected_beams if b["gantry_angle"] in gantry_angles]
        if collimator_angles:
            selected_beams = [b for b in selected_beams if b["collimator_angle"] in collimator_angles]
        if couch_angles:
            selected_beams = [b for b in selected_beams if b["couch_angle"] in couch_angles]
        if energies:
            selected_beams = [b for b in selected_beams if b["energy_MV"] in energies]

        selected_beam_ids = [b["ID"] for b in selected_beams]
        if not selected_beam_ids:
            continue

        if only_planner:
            planner_beam_ids = set(all_metadata[pid]["planner_beam_ids"])
            selected_beam_ids = list(planner_beam_ids.intersection(selected_beam_ids))
            if not selected_beam_ids:
                continue

        matched.append({
            "patient_id": pid,
            "num_beams": len(selected_beam_ids),
            "ptv_volume": summary["ptv_volume"],
            "selected_beam_ids": selected_beam_ids,
        })

    return pd.DataFrame(matched)


def download_data(repo_id, patient_ids, beam_ids=None, use_planner_beams=True, max_retries=2, local_dir='./'):
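    """Download one patient's data files from the Hub, retrying each file.

    If use_planner_beams is True, the beam IDs listed in the patient's
    PlannerBeams.json override any beam_ids passed in; otherwise the given
    beam_ids (or none) are fetched.
    """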
    downloaded_files = []
    for patient_id in patient_ids:
        static_files = [
            "CT_Data.h5", "CT_MetaData.json",
            "StructureSet_Data.h5", "StructureSet_MetaData.json",
            "OptimizationVoxels_Data.h5", "OptimizationVoxels_MetaData.json",
            "PlannerBeams.json"
        ]
        for filename in static_files:
            hf_path = posixpath.join("data", patient_id, filename)
            for attempt in range(max_retries):
                try:
                    local_path = hf_hub_download(
                        repo_id=repo_id,
                        repo_type="dataset",
                        filename=hf_path,
                        local_dir=local_dir,
                        token=token
                    )
                    downloaded_files.append(local_path)
                    break
                except Exception as e:
                    if attempt == max_retries - 1:
                        st.error(f"Failed to download {hf_path}: {e}")

        # DICOM files are discovered by listing the repo, since their names vary per patient.
        try:
            all_files = list_repo_files(repo_id, repo_type="dataset")
            dicom_prefix = f"data/{patient_id}/DicomFiles/"
            dicom_files = [f for f in all_files if f.startswith(dicom_prefix)]

            for hf_path in dicom_files:
                for attempt in range(max_retries):
                    try:
                        local_path = hf_hub_download(
                            repo_id=repo_id,
                            repo_type="dataset",
                            filename=hf_path,
                            local_dir=local_dir,
                            token=token
                        )
                        downloaded_files.append(local_path)
                        break
                    except Exception as e:
                        if attempt == max_retries - 1:
                            st.error(f"Failed to download {hf_path}: {e}")
        except Exception as e:
            st.error(f"Error listing DICOM files for {patient_id}: {e}")

        if use_planner_beams:
            planner_file = os.path.join(local_dir, 'data', patient_id, "PlannerBeams.json")
            try:
                with open(planner_file, "r") as f:
                    planner_data = json.load(f)
                beam_ids = planner_data.get("IDs", [])
            except Exception as e:
                st.error(f"Error reading PlannerBeams.json: {e}")
                beam_ids = []

        if beam_ids is not None:
            for bid in beam_ids:
                beam_data_file = f"Beams/Beam_{bid}_Data.h5"
                beam_meta_file = f"Beams/Beam_{bid}_MetaData.json"
                for beam_file in [beam_data_file, beam_meta_file]:
                    hf_path = posixpath.join("data", patient_id, beam_file)
                    for attempt in range(max_retries):
                        try:
                            local_path = hf_hub_download(
                                repo_id=repo_id,
                                repo_type="dataset",
                                filename=hf_path,
                                local_dir=local_dir,
                                token=token
                            )
                            downloaded_files.append(local_path)
                            break
                        except Exception as e:
                            if attempt == max_retries - 1:
                                st.error(f"Failed to download {hf_path}: {e}")
    return downloaded_files


def show_aggrid_table(df):
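    """Render df in an AgGrid table with checkbox multi-row selection enabled."""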
    gb = GridOptionsBuilder.from_dataframe(df)
    gb.configure_default_column(groupable=True, value=True, enableRowGroup=True, aggFunc='sum', editable=False)
    gb.configure_grid_options(domLayout='normal')

    gb.configure_selection('multiple', use_checkbox=True)
    gb.configure_column("patient_id", checkboxSelection=True)

    grid_options = gb.build()

    grid_response = AgGrid(
        df,
        gridOptions=grid_options,
        enable_enterprise_modules=False,
        allow_unsafe_jscode=True,
        fit_columns_on_grid_load=True,
        theme='balham',
        update_mode=GridUpdateMode.SELECTION_CHANGED
    )

    return grid_response


def main():
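    """Streamlit entry point: sidebar filters, results grid, and zip download."""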
    st.set_page_config(page_title="PortPy Metadata Explorer", layout="wide")
    st.title("📊 PortPy Metadata Explorer & Downloader")

    patient_df = get_patient_ids()
    disease_site = st.sidebar.selectbox("Select Disease Site", patient_df["disease_site"].unique())
    all_metadata = load_all_metadata(disease_site)

    filtered_patients = pd.DataFrame(list(all_metadata.keys()), columns=["patient_id"])
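
    # Beam-level filters; leaving a box blank skips that constraint.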
    beam_gantry_filter = st.sidebar.text_input("Gantry Angles (comma-separated)", "")
    beam_collimator_filter = st.sidebar.text_input("Collimator Angles (comma-separated)", "")
    beam_energy_filter = st.sidebar.text_input("Beam Energies (comma-separated)", "")
    beam_couch_filter = st.sidebar.text_input("Couch Angles (comma-separated)", "")
    query_ptv_vol = st.sidebar.number_input("Minimum PTV volume (cc):", value=0)

    only_planner = st.sidebar.checkbox(
        "Show only planner beams (if selected, only planner beams are downloaded)",
        value=True,
    )

    results_df = filter_matched_data(
        filtered_patients, query_ptv_vol, beam_gantry_filter,
        beam_collimator_filter, beam_energy_filter, beam_couch_filter,
        only_planner, all_metadata
    )

    grid_response = show_aggrid_table(results_df)

    selected_rows = grid_response.get("selected_rows", pd.DataFrame())

    if isinstance(selected_rows, pd.DataFrame) and not selected_rows.empty:
        for _, row in selected_rows.iterrows():
            pid = row["patient_id"]
            st.markdown(f"### Patient: {pid}")
            st.markdown("#### Structures")
            st.dataframe(pd.DataFrame(all_metadata[pid]["structures"]))
            st.markdown("#### Beams")
            st.dataframe(pd.DataFrame(all_metadata[pid]["beams"]))

    if "open_download_expander" not in st.session_state:
        st.session_state["open_download_expander"] = False
    with st.expander("Download matched patients", expanded=st.session_state["open_download_expander"]):
        to_download = st.sidebar.multiselect(
            "Select Patients to Download",
            results_df["patient_id"].tolist() if not results_df.empty else [],
        )

    if st.sidebar.button("Download Selected Patients"):
        st.session_state["open_download_expander"] = True
        if not to_download:
            st.warning("No patients selected.")
        else:
            progress = st.progress(0)
            status = st.empty()

            local_dir = "./downloaded"
            os.makedirs(local_dir, exist_ok=True)

            patient_to_beams = {
                row["patient_id"]: row["selected_beam_ids"]
                for _, row in results_df.iterrows()
                if row["patient_id"] in to_download
            }

            total = len(patient_to_beams)
            for i, (pid, beam_ids) in enumerate(patient_to_beams.items(), start=1):
                status.write(f"Downloading {pid} ({i}/{total})…")

                download_data(REPO_ID, [pid], beam_ids=beam_ids,
                              use_planner_beams=only_planner,
                              local_dir=local_dir)

                progress.progress(i / total)

            status.success("All downloads complete. Preparing zip…")
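
            # Bundle everything that was just downloaded into a single in-memory zip.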
            buf = io.BytesIO()
            with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
                for root, _, files in os.walk(local_dir):
                    for f in files:
                        full_path = os.path.join(root, f)
                        rel_path = os.path.relpath(full_path, local_dir)
                        zf.write(full_path, rel_path)
            buf.seek(0)

            st.download_button(
                label="Your download is ready! Click to save.",
                data=buf,
                file_name="portpy_patients.zip",
                mime="application/zip",
            )


if __name__ == "__main__":
    main()