raylim committed on
Commit 6bd4e42 · unverified · 0 Parent(s)

initial commit

.gitignore ADDED
@@ -0,0 +1,14 @@
+ **/.DS_Store
+ __pycache__/
+ *.log
+ .venv*/
+ venv*/
+ *.pyc
+ outputs/
+ *.env
+ tmp*
+ *~
+ *.swp
+ .idea/
+ .vscode/
+ data/
.python-version ADDED
@@ -0,0 +1 @@
+ 3.11
README.md ADDED
@@ -0,0 +1,91 @@
+ # Mosaic: H&E Whole Slide Image Cancer Subtype and Biomarker Inference
+
+ Mosaic is a deep learning model for predicting cancer subtypes and biomarkers from Hematoxylin and Eosin (H&E) stained whole slide images (WSIs). This repository provides the code, pre-trained models, and instructions for using Mosaic on your own datasets.
+
+ ## Table of Contents
+
+ - [Installation](#installation)
+ - [Usage](#usage)
+
+ ### System requirements
+
+ Supported systems:
+
+ - Linux (x86) with GPU (NVIDIA CUDA)
+
+ ### Pre-requisites
+
+ - [python3.11+](https://www.python.org/)
+ - [uv](https://docs.astral.sh/uv/)
+
+ ```bash
+ curl -LsSf https://astral.sh/uv/install.sh | sh
+ ```
+
+ ## Installation
+
+ ```bash
+ uv pip install git+ssh://git@github.com/pathology-data-mining/paladin_webapp.git@dev
+ ```
+
+ ## Usage
+
+ ### Initial Setup
+
+ <b>NOTE</b>: To run this app, you must be added to the [PDM Group](https://huggingface.co/PDM-Group) and set the following environment variable. The token can be obtained by clicking the user icon at the top right of the HuggingFace website and selecting "Access Tokens". When creating the token, select all read options for your private space and the PDM-Group space.
+
+ ```bash
+ export HF_TOKEN="TOKEN-FROM-HUGGINGFACE"
+ ```
+
+ Additionally, set the HuggingFace home location, where models and other data from HuggingFace will be downloaded.
+
+ ```bash
+ export HF_HOME="PATH-TO-HUGGINGFACE-HOME"
+ ```
+
+ ### Web Application
+
+ Run the web application with:
+
+ ```bash
+ mosaic_app
+ ```
+
+ It starts a web server on port 7860 by default. You can access the web interface by navigating to `http://localhost:7860` in your web browser.
+
+ ### Command Line Interface
+
+ To process a single WSI, use the following command:
+
+ ```bash
+ mosaic_app --slide-path /path/to/your/wsi.svs --output-dir /path/to/output/directory
+ ```
+
+ To process a batch of WSIs, use:
+
+ ```bash
+ mosaic_app --slide-csv /path/to/your/wsi_list.csv --output-dir /path/to/output/directory
+ ```
+
+ The CSV file must contain at least the columns `Slide` and `Site Type`.
+ Optionally, it can also contain `Cancer Subtype`, `Segmentation Config`, and `IHC Subtype`.
+
+ - `Slide` should contain the full path to the WSI file.
+ - `Site Type` should be one of `Primary` or `Metastatic`.
+ - `Cancer Subtype` should be the OncoTree code for the cancer subtype.
+ - `Segmentation Config` should be one of `Biopsy`, `Resection`, or `TCGA`.
+ - `IHC Subtype` should be one of `HR+/HER2+`, `HR+/HER2-`, `HR-/HER2+`, or `HR-/HER2-`.
+
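As a concrete illustration of the settings CSV described above, the snippet below builds a minimal two-slide `wsi_list.csv` with pandas. The slide paths and the `IDC` OncoTree code are hypothetical placeholders, not values taken from this repository:

```python
import pandas as pd

# Two example rows following the column schema from the README.
# Paths and the "IDC" subtype code are illustrative placeholders.
rows = [
    {
        "Slide": "/data/slides/case_001.svs",
        "Site Type": "Primary",
        "Cancer Subtype": "Unknown",   # let Aeon infer the subtype
        "Segmentation Config": "Biopsy",
        "IHC Subtype": "",
    },
    {
        "Slide": "/data/slides/case_002.svs",
        "Site Type": "Metastatic",
        "Cancer Subtype": "IDC",       # OncoTree code supplied by the user
        "Segmentation Config": "Resection",
        "IHC Subtype": "HR+/HER2-",
    },
]
pd.DataFrame(rows).to_csv("wsi_list.csv", index=False)
```

Only `Slide` and `Site Type` are required; the optional columns fall back to their defaults when omitted.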
+ See additional options with the help command (it may take a few seconds to run):
+
+ ```bash
+ mosaic_app --help
+ ```
+
+ If running in server mode on a specific port, you can check whether a port is free with `ss -tuln | grep :PORT`, where PORT is the port number to check; no output indicates the port is likely available. If the port is available, set the environment variable `export GRADIO_SERVER_PORT="PORT"`.
+
+ ### Notes
+
+ - The first time you run the application, it will download the necessary models from HuggingFace. This may take some time depending on your internet connection.
+ - The models are downloaded to a subdirectory named `data`, relative to where you run the application.
pyproject.toml ADDED
@@ -0,0 +1,36 @@
+ [build-system]
+ requires = ["uv_build>=0.8.1,<0.9.0"]
+ build-backend = "uv_build"
+
+ [project]
+ name = "mosaic"
+ version = "0.1.0"
+ description = "Mussel-Aeon-Paladin workflow for digital pathology"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "gradio>=5.49.0",
+     "loguru>=0.7.3",
+     "memory-profiler>=0.61.0",
+     "mussel[torch-gpu]",
+     "paladin",
+ ]
+
+ [project.scripts]
+ aeon_inference = "mosaic.inference.aeon:main"
+ paladin_inference = "mosaic.inference.paladin:main"
+ mosaic = "mosaic.gradio_app:main"
+
+ [dependency-groups]
+ dev = ["black>=25.1.0", "pylint>=3.3.6"]
+
+ [tool.pylint."messages control"]
+ disable = [
+     "logging-fstring-interpolation",
+     "broad-exception-caught",
+     "unspecified-encoding",
+ ]
+
+ [tool.uv.sources]
+ paladin = { git = "ssh://git@github.com/pathology-data-mining/paladin.git", rev = "dev" }
+ mussel = { git = "https://github.com/pathology-data-mining/Mussel.git", rev = "ray-dev" }
src/mosaic/__init__.py ADDED
File without changes
src/mosaic/favicon.svg ADDED
src/mosaic/gradio_app.py ADDED
@@ -0,0 +1,843 @@
+ from argparse import ArgumentParser
+ import gradio as gr
+ import pandas as pd
+ import pickle
+ from mussel.models import ModelType
+ from mussel.utils import get_features, segment_tissue, filter_features
+ from mussel.utils.segment import draw_slide_mask
+ from mussel.cli.tessellate import BiopsySegConfig, ResectionSegConfig, TcgaSegConfig
+ import torch
+ from pathlib import Path
+ from huggingface_hub import snapshot_download
+ import tempfile
+ import requests
+
+ from mosaic.inference import run_aeon, run_paladin
+ from loguru import logger
+
+ current_dir = Path(__file__).parent
+
+ # This path should be outside your project directory if running locally
+ TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()) / "mosaic_user_data"
+
+ IHC_SUBTYPES = ["", "HR+/HER2+", "HR+/HER2-", "HR-/HER2+", "HR-/HER2-"]
+
+ SETTINGS_COLUMNS = [
+     "Slide",
+     "Site Type",
+     "Cancer Subtype",
+     "IHC Subtype",
+     "Segmentation Config",
+ ]
+
+ oncotree_code_map = {}
+
+
+ def get_oncotree_code_name(code):
+     global oncotree_code_map
+     if code in oncotree_code_map:
+         return oncotree_code_map[code]
+
+     url = f"https://oncotree.mskcc.org/api/tumorTypes/search/code/{code}?exactMatch=true&version=oncotree_2025_04_08"
+     response = requests.get(url)
+     code_name = "Unknown"
+     if response.status_code == 200:
+         data = response.json()
+         if data:
+             code_name = data[0]["name"]
+
+     oncotree_code_map[code] = code_name
+     return code_name
+
+
+ def download_and_process_models():
+     global cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes
+     snapshot_download(repo_id="PDM-Group/paladin-aeon-models", local_dir="data")
+
+     model_map = pd.read_csv(
+         "data/paladin_model_map.csv",
+     )
+     cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
+     cancer_subtype_name_map = {
+         f"{get_oncotree_code_name(code)} ({code})": code for code in cancer_subtypes
+     }
+     cancer_subtype_name_map["Unknown"] = "UNK"
+     reversed_cancer_subtype_name_map = {
+         value: key for key, value in cancer_subtype_name_map.items()
+     }
+
+
+ def create_user_directory(state, request: gr.Request):
+     """Create a unique directory for each user session."""
+     session_hash = request.session_hash
+     if session_hash is None:
+         return None
+
+     user_dir = TEMP_USER_DATA_DIR / session_hash
+     user_dir.mkdir(parents=True, exist_ok=True)
+     return user_dir
+
+
+ def load_settings(slide_csv_path):
+     """Load settings from CSV file and validate columns."""
+     settings_df = pd.read_csv(slide_csv_path, na_filter=False)
+     if "Segmentation Config" not in settings_df.columns:
+         settings_df["Segmentation Config"] = "Biopsy"
+     if "Cancer Subtype" not in settings_df.columns:
+         settings_df["Cancer Subtype"] = "Unknown"
+     if "IHC Subtype" not in settings_df.columns:
+         settings_df["IHC Subtype"] = ""
+     if not set(SETTINGS_COLUMNS).issubset(settings_df.columns):
+         raise ValueError("Missing required column in CSV file")
+     settings_df = settings_df[SETTINGS_COLUMNS]
+     return settings_df
+
+
+ def validate_settings(settings_df):
+     """Validate settings DataFrame and provide warnings for invalid entries."""
+     settings_df.columns = SETTINGS_COLUMNS
+     warnings = []
+     for idx, row in settings_df.iterrows():
+         slide_name = row["Slide"]
+         subtype = row["Cancer Subtype"]
+         if subtype in cancer_subtypes:
+             settings_df.at[idx, "Cancer Subtype"] = reversed_cancer_subtype_name_map[
+                 subtype
+             ]
+         if settings_df.at[idx, "Cancer Subtype"] not in cancer_subtype_name_map:
+             warnings.append(
+                 f"Slide {slide_name}: Unknown cancer subtype. Valid subtypes are: {', '.join(cancer_subtype_name_map.keys())}. "
+             )
+             settings_df.at[idx, "Cancer Subtype"] = "Unknown"
+         if row["Site Type"] not in ["Metastatic", "Primary"]:
+             warnings.append(
+                 f"Slide {slide_name}: Unknown site type. Valid types are: Metastatic, Primary. "
+             )
+             settings_df.at[idx, "Site Type"] = "Primary"
+         if (
+             "Breast" not in settings_df.at[idx, "Cancer Subtype"]
+             and row["IHC Subtype"] != ""
+         ):
+             warnings.append(
+                 f"Slide {slide_name}: IHC subtype should be empty for non-breast cancer subtypes. "
+             )
+             settings_df.at[idx, "IHC Subtype"] = ""
+         if row["IHC Subtype"] not in IHC_SUBTYPES:
+             warnings.append(
+                 f"Slide {slide_name}: Unknown IHC subtype. Valid subtypes are: {', '.join(IHC_SUBTYPES)}. "
+             )
+             settings_df.at[idx, "IHC Subtype"] = ""
+         if row["Segmentation Config"] not in ["Biopsy", "Resection", "TCGA"]:
+             warnings.append(
+                 f"Slide {slide_name}: Unknown segmentation config. Valid configs are: Biopsy, Resection, TCGA. "
+             )
+             settings_df.at[idx, "Segmentation Config"] = "Biopsy"
+
+     if warnings:
+         gr.Warning("\n".join(warnings))
+
+     return settings_df
+
+
+ def export_to_csv(df):
+     if df is None or df.empty:
+         raise gr.Error("No data to export.")
+     csv_path = "paladin_results.csv"
+     df.to_csv(csv_path, index=False)
+     return csv_path
+
+
+ def analyze_slides(
+     slides,
+     settings_input,
+     user_dir,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     if slides is None or len(slides) == 0:
+         raise gr.Error("Please upload at least one slide.")
+     if user_dir is None:
+         user_dir = create_user_directory(None, gr.Request())
+     settings_input = validate_settings(settings_input)
+     if len(slides) != len(settings_input):
+         raise gr.Error("Missing settings for uploaded slides")
+
+     all_slide_masks = []
+     all_aeon_results = []
+     all_paladin_results = []
+
+     progress(0.0, desc="Starting analysis")
+     for idx, row in settings_input.iterrows():
+         slide_name = row["Slide"]
+         progress(
+             idx / len(settings_input),
+             desc=f"Analyzing {slide_name}, slide {idx + 1} of {len(settings_input)}",
+         )
+         for x in slides:
+             s = x.split("/")[-1]
+             if s == slide_name:
+                 slide_mask = x
+
+         (
+             slide_mask,
+             aeon_results,
+             paladin_results,
+         ) = analyze_slide(
+             slides[idx],
+             row["Segmentation Config"],
+             row["Site Type"],
+             row["Cancer Subtype"],
+             row["IHC Subtype"],
+             progress=progress,
+         )
+         if aeon_results is not None:
+             if len(slides) > 1:
+                 aeon_results.columns = [f"{slide_name}"]
+             if row["Cancer Subtype"] == "Unknown":
+                 all_aeon_results.append(aeon_results)
+         if paladin_results is not None:
+             paladin_results.insert(
+                 0, "Slide", pd.Series([slide_name] * len(paladin_results))
+             )
+             all_paladin_results.append(paladin_results)
+         if slide_mask is not None:
+             all_slide_masks.append((slide_mask, slide_name))
+         # yield slide_mask, None, None, None  # Yield intermediate results
+     progress(0.99, desc="Analysis complete, wrapping up results")
+
+     timestamp = pd.Timestamp.now().strftime("%Y%m%d-%H%M%S")
+     combined_paladin_results = (
+         pd.concat(all_paladin_results, ignore_index=True)
+         if all_paladin_results
+         else pd.DataFrame()
+     )
+     combined_aeon_results = gr.DataFrame(visible=False)
+     aeon_output = gr.DownloadButton(visible=False)
+     if all_aeon_results:
+         combined_aeon_results = pd.concat(all_aeon_results, axis=1)
+         combined_aeon_results.reset_index(inplace=True)
+
+         combined_aeon_results = combined_aeon_results.round(3)
+         cancer_subtype_names = [
+             f"{get_oncotree_code_name(code)} ({code})"
+             for code in combined_aeon_results["Cancer Subtype"]
+         ]
+         combined_aeon_results["Cancer Subtype"] = cancer_subtype_names
+
+         aeon_output_path = user_dir / f"aeon_results-{timestamp}.csv"
+         combined_aeon_results.to_csv(aeon_output_path)
+
+         combined_aeon_results = gr.DataFrame(
+             combined_aeon_results,
+             visible=True,
+             column_widths=["4px"] + ["2px"] * (combined_aeon_results.shape[1] - 1),
+         )
+         aeon_output = gr.DownloadButton(value=aeon_output_path, visible=True)
+
+     # Convert Oncotree codes to names for display (guarded so an empty
+     # DataFrame without a "Cancer Subtype" column does not raise a KeyError)
+     if len(combined_paladin_results) > 0:
+         cancer_subtype_names = [
+             f"{get_oncotree_code_name(code)} ({code})"
+             for code in combined_paladin_results["Cancer Subtype"]
+         ]
+         combined_paladin_results["Cancer Subtype"] = cancer_subtype_names
+         combined_paladin_results["Score"] = combined_paladin_results["Score"].round(3)
+
+     paladin_output = gr.DownloadButton(visible=False)
+     if len(combined_paladin_results) > 0:
+         paladin_output_path = user_dir / f"paladin_results-{timestamp}.csv"
+         combined_paladin_results.to_csv(paladin_output_path, index=False)
+         paladin_output = gr.DownloadButton(value=paladin_output_path, visible=True)
+
+     progress(1.0, desc="All done!")
+
+     return (
+         all_slide_masks,
+         combined_aeon_results,
+         aeon_output,
+         combined_paladin_results if len(combined_paladin_results) > 0 else None,
+         paladin_output,
+         user_dir,
+     )
+
+
+ def analyze_slide(
+     slide_path,
+     seg_config,
+     site_type,
+     cancer_subtype,
+     ihc_subtype="",
+     num_workers=4,
+     progress=gr.Progress(track_tqdm=True),
+ ):
+     if slide_path is None:
+         raise gr.Error("Please upload a slide.")
+     # Step 1: Segment tissue
+     start_time = pd.Timestamp.now()
+
+     if seg_config == "Biopsy":
+         seg_config = BiopsySegConfig()
+     elif seg_config == "Resection":
+         seg_config = ResectionSegConfig()
+     elif seg_config == "TCGA":
+         seg_config = TcgaSegConfig()
+     else:
+         raise ValueError(f"Unknown segmentation configuration: {seg_config}")
+
+     progress(0.0, desc="Segmenting tissue")
+     logger.info(f"Segmenting tissue for slide: {slide_path}")
+     if values := segment_tissue(
+         slide_path=slide_path,
+         patch_size=224,
+         mpp=0.5,
+         seg_level=-1,
+         segment_threshold=seg_config.segment_threshold,
+         median_blur_ksize=seg_config.median_blur_ksize,
+         morphology_ex_kernel=seg_config.morphology_ex_kernel,
+         tissue_area_threshold=seg_config.tissue_area_threshold,
+         hole_area_threshold=seg_config.hole_area_threshold,
+         max_num_holes=seg_config.max_num_holes,
+     ):
+         polygon, _, coords, attrs = values
+     else:
+         gr.Warning(f"No tissue detected in slide: {slide_path}")
+         return None, None, None
+     end_time = pd.Timestamp.now()
+     logger.info(f"Tissue segmentation took {end_time - start_time}")
+     logger.info(f"Found {len(coords)} tissue tiles")
+     progress(0.2, desc="Tissue segmented")
+
+     # Draw slide mask for visualization
+     logger.info("Drawing slide mask")
+     progress(0.25, desc="Drawing slide mask")
+     slide_mask = draw_slide_mask(
+         slide_path, polygon, outline="black", fill=(255, 0, 0, 80), vis_level=-1
+     )
+     logger.info("Slide mask drawn")
+
+     # Step 2: Extract features with CTransPath
+     start_time = pd.Timestamp.now()
+     progress(0.3, desc="Extracting CTransPath features")
+     logger.info("Extracting CTransPath features")
+     ctranspath_features, _ = get_features(
+         coords,
+         slide_path,
+         attrs,
+         model_type=ModelType.CTRANSPATH,
+         model_path="data/ctranspath.pth",
+         num_workers=num_workers,
+         batch_size=64,
+         use_gpu=True,
+     )
+     end_time = pd.Timestamp.now()
+     max_gpu_memory = (
+         torch.cuda.max_memory_allocated() / (1024**3)
+         if torch.cuda.is_available()
+         else 0
+     )
+     logger.info(
+         f"CTransPath Feature extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
+     )
+
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+
+     # Step 3: Filter features using marker classifier
+     start_time = pd.Timestamp.now()
+     with open("data/marker_classifier.pkl", "rb") as f:
+         marker_classifier = pickle.load(f)
+     progress(0.35, desc="Filtering features with marker classifier")
+     logger.info("Filtering features with marker classifier")
+     _, filtered_coords = filter_features(
+         ctranspath_features,
+         coords,
+         marker_classifier,
+         threshold=0.25,
+     )
+     end_time = pd.Timestamp.now()
+     logger.info(f"Feature filtering took {end_time - start_time}")
+     logger.info(
+         f"Filtered from {len(coords)} to {len(filtered_coords)} tiles using marker classifier"
+     )
+
+     # Step 4: Extract features with Optimus on filtered coords
+     start_time = pd.Timestamp.now()
+     progress(0.4, desc="Extracting Optimus features")
+     logger.info("Extracting Optimus features")
+     features, _ = get_features(
+         filtered_coords,
+         slide_path,
+         attrs,
+         model_type=ModelType.OPTIMUS,
+         model_path="data/optimus.pkl",
+         num_workers=num_workers,
+         batch_size=64,
+         use_gpu=True,
+     )
+     end_time = pd.Timestamp.now()
+     max_gpu_memory = (
+         torch.cuda.max_memory_allocated() / (1024**3)
+         if torch.cuda.is_available()
+         else 0
+     )
+     logger.info(
+         f"Optimus Feature extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
+     )
+
+     if torch.cuda.is_available():
+         torch.cuda.reset_peak_memory_stats()
+
+     # Step 5: Run Aeon to predict histology if not supplied
+     if cancer_subtype == "Unknown":
+         start_time = pd.Timestamp.now()
+         progress(0.9, desc="Running Aeon for cancer subtype inference")
+         logger.info("Running Aeon for cancer subtype inference")
+         aeon_results, _ = run_aeon(
+             features=features,
+             model_path="data/aeon_model.pkl",
+             metastatic=(site_type == "Metastatic"),
+             batch_size=8,
+             num_workers=num_workers,
+             use_cpu=False,
+         )
+         end_time = pd.Timestamp.now()
+         max_gpu_memory = (
+             torch.cuda.max_memory_allocated() / (1024**3)
+             if torch.cuda.is_available()
+             else 0
+         )
+         logger.info(
+             f"Aeon inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
+         )
+         if torch.cuda.is_available():
+             torch.cuda.reset_peak_memory_stats()
+     else:
+         cancer_subtype_code = cancer_subtype_name_map.get(cancer_subtype)
+         aeon_results = pd.DataFrame(
+             {
+                 "Cancer Subtype": [cancer_subtype_code],
+                 "Confidence": [1.0],
+             }
+         )
+         logger.info(f"Using user-supplied cancer subtype: {cancer_subtype}")
+
+     # Step 6: Run Paladin to predict biomarkers
+     if len(aeon_results) == 0:
+         logger.warning("No Aeon results, skipping Paladin inference")
+         return slide_mask, None, None
+     start_time = pd.Timestamp.now()
+     progress(0.95, desc="Running Paladin for biomarker inference")
+     logger.info("Running Paladin for biomarker inference")
+     paladin_results = run_paladin(
+         features=features,
+         model_map_path="data/paladin_model_map.csv",
+         aeon_results=aeon_results,
+         metastatic=(site_type == "Metastatic"),
+         batch_size=8,
+         num_workers=num_workers,
+         use_cpu=False,
+     )
+     end_time = pd.Timestamp.now()
+     max_gpu_memory = (
+         torch.cuda.max_memory_allocated() / (1024**3)
+         if torch.cuda.is_available()
+         else 0
+     )
+     logger.info(
+         f"Paladin inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
+     )
+
+     aeon_results.set_index("Cancer Subtype", inplace=True)
+
+     return slide_mask, aeon_results, paladin_results
+
+
+ def launch_gradio(server_name, server_port, share):
+     with gr.Blocks(title="Mosaic") as demo:
+         user_dir_state = gr.State(None)
+         gr.Markdown(
+             "# Mosaic: H&E Whole Slide Image Cancer Subtype and Biomarker Inference"
+         )
+         gr.Markdown(
+             "Upload an H&E whole slide image in SVS or TIFF format. The slide will be processed to infer cancer subtype and relevant biomarkers."
+         )
+         with gr.Row():
+             with gr.Column():
+                 input_slides = gr.File(
+                     label="Upload H&E Whole Slide Image",
+                     file_types=[".svs", ".tiff", ".tif"],
+                     file_count="multiple",
+                 )
+                 site_dropdown = gr.Dropdown(
+                     choices=["Primary", "Metastatic"],
+                     label="Site Type",
+                     value="Primary",
+                 )
+                 cancer_subtype_dropdown = gr.Dropdown(
+                     choices=[name for name in cancer_subtype_name_map.keys()],
+                     label="Cancer Subtype",
+                     value="Unknown",
+                 )
+                 ihc_subtype_dropdown = gr.Dropdown(
+                     choices=IHC_SUBTYPES,
+                     label="IHC Subtype (if applicable)",
+                     value="",
+                     visible=False,
+                 )
+                 seg_config_dropdown = gr.Dropdown(
+                     choices=["Biopsy", "Resection", "TCGA"],
+                     label="Segmentation Config",
+                     value="Biopsy",
+                 )
+                 with gr.Row():
+                     settings_input = gr.Dataframe(
+                         headers=[
+                             "Slide",
+                             "Site Type",
+                             "Cancer Subtype",
+                             "IHC Subtype",
+                             "Segmentation Config",
+                         ],
+                         label="Current Settings",
+                         datatype=["str", "str", "str", "str", "str"],
+                         visible=False,
+                         interactive=True,
+                         static_columns="Slide",
+                     )
+
+                 with gr.Row():
+                     settings_csv = gr.File(
+                         file_types=[".csv"], label="Upload Settings CSV", visible=False
+                     )
+
+                 with gr.Row():
+                     clear_button = gr.Button("Clear")
+                     analyze_button = gr.Button("Analyze", variant="primary")
+             with gr.Column():
+                 slide_masks = gr.Gallery(
+                     label="Slide Masks",
+                     columns=3,
+                     object_fit="contain",
+                     height="auto",
+                 )
+                 aeon_output_table = gr.Dataframe(
+                     headers=["Cancer Subtype", "Slide Name"],
+                     label="Cancer Subtype Inference Confidence",
+                     datatype=["str", "number"],
+                     visible=False,
+                 )
+                 aeon_download_button = gr.DownloadButton(
+                     "Download Aeon Results as CSV",
+                     label="Download Results",
+                     visible=False,
+                 )
+                 paladin_output_table = gr.Dataframe(
+                     headers=["Slide", "Cancer Subtype", "Biomarker", "Score"],
+                     label="Biomarker Inference",
+                     datatype=["str", "str", "str", "number"],
+                 )
+                 paladin_download_button = gr.DownloadButton(
+                     "Download Paladin Results as CSV",
+                     label="Download Results",
+                     visible=False,
+                 )
+
+         @clear_button.click(
+             outputs=[
+                 input_slides,
+                 slide_masks,
+                 paladin_output_table,
+                 paladin_download_button,
+                 aeon_output_table,
+                 aeon_download_button,
+                 settings_input,
+                 settings_csv,
+             ],
+         )
+         def clear_fn():
+             return (
+                 None,
+                 None,
+                 None,
+                 None,
+                 gr.Dataframe(visible=False),
+                 gr.DownloadButton(visible=False),
+                 gr.Dataframe(visible=False),
+                 gr.File(visible=False),
+             )
+
+         def get_settings(files, site_type, cancer_subtype, ihc_subtype, seg_config):
+             if files is None:
+                 return pd.DataFrame()
+             settings = []
+             for file in files:
+                 filename = file.name if hasattr(file, "name") else file
+                 slide_name = filename.split("/")[-1]
+                 settings.append(
+                     [slide_name, site_type, cancer_subtype, ihc_subtype, seg_config]
+                 )
+             df = pd.DataFrame(settings, columns=SETTINGS_COLUMNS)
+             return df
+
+         # Only display settings table and upload button if multiple slides are uploaded
+         @gr.on(
+             [
+                 input_slides.change,
+                 site_dropdown.change,
+                 cancer_subtype_dropdown.change,
+                 ihc_subtype_dropdown.change,
+                 seg_config_dropdown.change,
+             ],
+             inputs=[
+                 input_slides,
+                 site_dropdown,
+                 cancer_subtype_dropdown,
+                 ihc_subtype_dropdown,
+                 seg_config_dropdown,
+             ],
+             outputs=[settings_input, settings_csv, ihc_subtype_dropdown],
+         )
+         def update_settings(files, site_type, cancer_subtype, ihc_subtype, seg_config):
+             has_ihc = "Breast" in cancer_subtype
+             if not files:
+                 return None, None, gr.Dropdown(visible=has_ihc)
+             settings_df = get_settings(
+                 files, site_type, cancer_subtype, ihc_subtype, seg_config
+             )
+             if settings_df is not None:
+                 has_ihc = any("Breast" in cs for cs in settings_df["Cancer Subtype"])
+             visible = len(files) > 1
+             return (
+                 gr.Dataframe(settings_df, visible=visible),
+                 gr.File(visible=visible),
+                 gr.Dropdown(visible=has_ihc),
+             )
+
+         @settings_csv.upload(
+             inputs=[settings_csv],
+             outputs=[settings_input],
+         )
+         def read_settings(file):
+             if file is None:
+                 return None
+             df = load_settings(file.name if hasattr(file, "name") else file)
+             return gr.Dataframe(df, visible=True)
+
+         analyze_button.click(
+             analyze_slides,
+             inputs=[
+                 input_slides,
+                 settings_input,
+                 user_dir_state,
+             ],
+             outputs=[
+                 slide_masks,
+                 aeon_output_table,
+                 aeon_download_button,
+                 paladin_output_table,
+                 paladin_download_button,
+                 user_dir_state,
+             ],
+             queue=True,
+             show_progress_on=paladin_output_table,
+         )
+         settings_input.change(
+             validate_settings, inputs=[settings_input], outputs=[settings_input]
+         )
+         demo.load(
+             create_user_directory,
+             inputs=[user_dir_state],
+             outputs=[user_dir_state],
+         )
+
+     demo.queue(max_size=10, default_concurrency_limit=8)
+     demo.launch(
+         server_name=server_name,
+         share=share,
+         server_port=server_port,
+         show_error=True,
+         favicon_path=current_dir / "favicon.svg",
+     )
+
+
658
+ def main():
659
+ parser = ArgumentParser()
660
+ parser.add_argument("--debug", action="store_true", help="Enable debug logging")
661
+ parser.add_argument(
662
+ "--server-name", type=str, default="0.0.0.0", help="Server name for Gradio app"
663
+ )
664
+ parser.add_argument(
665
+ "--server-port", type=int, default=None, help="Server port for Gradio app"
666
+ )
667
+ parser.add_argument(
668
+ "--share", action="store_true", help="Share Gradio app publicly"
669
+ )
670
+ parser.add_argument(
671
+ "--slide-csv",
672
+ type=str,
673
+ help="CSV file with slide settings (for batch processing), see README for format",
674
+ )
675
+ parser.add_argument(
676
+ "--slide-path",
677
+ type=str,
678
+ help="Path to a single slide (for single slide processing), not used if --slide-csv is provided",
679
+ )
680
+ parser.add_argument(
681
+ "--site-type",
682
+ type=str,
683
+ choices=["Primary", "Metastatic"],
684
+ default="Primary",
685
+ help="Site type of the slide (for single slide processing)",
686
+ )
687
+ parser.add_argument(
688
+ "--cancer-subtype",
689
+ type=str,
690
+ default="Unknown",
691
+ help="Cancer subtype of the slide (for single slide processing), use 'Unknown' to infer with Aeon",
692
+ )
693
+ parser.add_argument(
694
+ "--ihc-subtype",
695
+ type=str,
696
+ choices=IHC_SUBTYPES,
697
+ default="",
698
+ help="IHC subtype if cancer subtype is breast (for single slide processing)",
699
+ )
700
+ parser.add_argument(
701
+ "--segmentation-config",
702
+ type=str,
703
+ choices=["Biopsy", "Resection", "TCGA"],
704
+ default="Biopsy",
705
+ help="Segmentation configuration (for single slide processing)",
706
+ )
707
+ parser.add_argument(
708
+ "--output-dir", type=str, help="Directory to save output results"
709
+ )
710
+ parser.add_argument(
711
+ "--num-workers",
712
+ type=int,
713
+ default=4,
714
+ help="Number of workers for feature extraction",
715
+ )
716
+ args = parser.parse_args()
717
+ if args.debug:
718
+ logger.add("debug.log", level="DEBUG")
719
+ logger.debug("Debug logging enabled")
720
+
721
+ download_and_process_models()
722
+
723
+ if args.slide_path and not args.slide_csv:
724
+ # Single slide processing mode
725
+ if not args.output_dir:
726
+ raise ValueError("Please provide --output-dir to save results")
727
+ settings_df = pd.DataFrame(
728
+ [
729
+ [
730
+ args.slide_path,
731
+ args.site_type,
732
+ args.cancer_subtype,
733
+ args.ihc_subtype,
734
+ args.segmentation_config,
735
+ ]
736
+ ],
737
+ columns=SETTINGS_COLUMNS,
738
+ )
739
+ settings_df = validate_settings(settings_df)
740
+ slide_mask, aeon_results, paladin_results = analyze_slide(
741
+ args.slide_path,
742
+ args.segmentation_config,
743
+ args.site_type,
744
+ args.cancer_subtype,
745
+ args.ihc_subtype,
746
+ num_workers=args.num_workers,
747
+ )
748
+ output_dir = Path(args.output_dir)
749
+ output_dir.mkdir(parents=True, exist_ok=True)
750
+ slide_name = Path(args.slide_path).stem
751
+ if slide_mask is not None:
752
+ mask_path = output_dir / f"{slide_name}_mask.png"
753
+ slide_mask.save(mask_path)
754
+ logger.info(f"Saved slide mask to {mask_path}")
755
+ if aeon_results is not None:
756
+ aeon_output_path = output_dir / f"{slide_name}_aeon_results.csv"
757
+ aeon_results.reset_index().to_csv(aeon_output_path, index=False)
758
+ logger.info(f"Saved Aeon results to {aeon_output_path}")
759
+ if paladin_results is not None and len(paladin_results) > 0:
760
+ paladin_output_path = output_dir / f"{slide_name}_paladin_results.csv"
761
+ paladin_results.to_csv(paladin_output_path, index=False)
762
+ logger.info(f"Saved Paladin results to {paladin_output_path}")
763
+ elif args.slide_csv:
764
+ if not args.output_dir:
765
+ raise ValueError("Please provide --output-dir to save results")
766
+ # Batch processing mode
767
+
768
+ output_dir = Path(args.output_dir)
769
+ output_dir.mkdir(parents=True, exist_ok=True)
770
+ all_paladin_results = []
771
+ all_aeon_results = []
772
+ settings_df = load_settings(args.slide_csv)
773
+ settings_df = validate_settings(settings_df)
774
+ for idx, row in settings_df.iterrows():
775
+ slide_path = row["Slide"]
776
+ seg_config = row["Segmentation Config"]
777
+ site_type = row["Site Type"]
778
+ cancer_subtype = row["Cancer Subtype"]
779
+ ihc_subtype = row.get("IHC Subtype", "")
780
+ logger.info(
781
+ f"Processing slide {slide_path} ({idx + 1} of {len(settings_df)})"
782
+ )
783
+ slide_mask, aeon_results, paladin_results = analyze_slide(
784
+ slide_path,
785
+ seg_config,
786
+ site_type,
787
+ cancer_subtype,
788
+ ihc_subtype,
789
+ num_workers=args.num_workers,
790
+ )
791
+ slide_name = Path(slide_path).stem
792
+             if slide_mask is not None:
+                 mask_path = output_dir / f"{slide_name}_mask.png"
+                 slide_mask.save(mask_path)
+                 logger.info(f"Saved slide mask to {mask_path}")
795
+ if aeon_results is not None:
796
+ aeon_output_path = output_dir / f"{slide_name}_aeon_results.csv"
797
+ aeon_results.reset_index().to_csv(aeon_output_path, index=False)
798
+ logger.info(f"Saved Aeon results to {aeon_output_path}")
799
+ if paladin_results is not None and len(paladin_results) > 0:
800
+ paladin_output_path = output_dir / f"{slide_name}_paladin_results.csv"
801
+ paladin_results.to_csv(paladin_output_path, index=False)
802
+ logger.info(f"Saved Paladin results to {paladin_output_path}")
803
+ if aeon_results is not None:
804
+                 aeon_results.columns = [slide_name]
805
+ all_aeon_results.append(aeon_results)
806
+ if paladin_results is not None and len(paladin_results) > 0:
807
+ paladin_results.insert(
808
+ 0, "Slide", pd.Series([slide_name] * len(paladin_results))
809
+ )
810
+ all_paladin_results.append(paladin_results)
811
+ if all_aeon_results:
812
+ combined_aeon_results = pd.concat(all_aeon_results, axis=1)
813
+ combined_aeon_results.reset_index(inplace=True)
814
+ cancer_subtype_names = [
815
+ f"{get_oncotree_code_name(code)} ({code})"
816
+ for code in combined_aeon_results["Cancer Subtype"]
817
+ ]
818
+ combined_aeon_results["Cancer Subtype"] = cancer_subtype_names
819
+ combined_aeon_output_path = output_dir / "combined_aeon_results.csv"
820
+ combined_aeon_results.to_csv(combined_aeon_output_path, index=False)
821
+ logger.info(f"Saved combined Aeon results to {combined_aeon_output_path}")
822
+ if all_paladin_results:
823
+ combined_paladin_results = pd.concat(all_paladin_results, ignore_index=True)
824
+ cancer_subtype_names = [
825
+ f"{get_oncotree_code_name(code)} ({code})"
826
+ for code in combined_paladin_results["Cancer Subtype"]
827
+ ]
828
+ combined_paladin_results["Cancer Subtype"] = cancer_subtype_names
829
+ combined_paladin_output_path = output_dir / "combined_paladin_results.csv"
830
+ combined_paladin_results.to_csv(combined_paladin_output_path, index=False)
831
+ logger.info(
832
+ f"Saved combined Paladin results to {combined_paladin_output_path}"
833
+ )
834
+ else:
835
+ launch_gradio(
836
+ server_name=args.server_name,
837
+ server_port=args.server_port,
838
+ share=args.share,
839
+ )
840
+
841
+
842
+ if __name__ == "__main__":
843
+ main()
src/mosaic/inference/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+ from .aeon import run as run_aeon
2
+ from .paladin import run as run_paladin
src/mosaic/inference/aeon.py ADDED
@@ -0,0 +1,173 @@
1
+ import pickle # nosec
+ from argparse import ArgumentParser
5
+
6
+ import pandas as pd
7
+ import torch
8
+ from torch.utils.data import DataLoader
9
+
10
+ from mosaic.inference.data import (
11
+ SiteType,
12
+ TileFeatureTensorDataset,
13
+ INT_TO_CANCER_TYPE_MAP,
14
+ CANCER_TYPE_TO_INT_MAP,
15
+ )
16
+
17
+ from loguru import logger
18
+
19
+ cancer_types_to_drop = [
20
+ "UDMN",
21
+ "ADNOS",
22
+ "CUP",
23
+ "CUPNOS",
24
+ "BRCNOS",
25
+ "GNOS",
26
+ "SCCNOS",
27
+ "PDC",
28
+ "NSCLC",
29
+ "BRCA",
30
+ "SARCNOS",
31
+ "NETNOS",
32
+ "MEL",
33
+ "RCC",
34
+ "BRCANOS",
35
+ "COADREAD",
36
+ "MUP",
37
+ "NECNOS",
38
+ "UCEC",
39
+ "NOT",
40
+ ]
41
+ col_indices_to_drop = [CANCER_TYPE_TO_INT_MAP[x] for x in cancer_types_to_drop]
42
+
43
+
44
+ BATCH_SIZE = 8
45
+ NUM_WORKERS = 8
46
+
47
+
48
+ def run(
49
+ features, model_path, metastatic=False, batch_size=8, num_workers=8, use_cpu=False
50
+ ):
51
+ device = torch.device(
52
+ "cuda" if not use_cpu and torch.cuda.is_available() else "cpu"
53
+ )
54
+
55
+ with open(model_path, "rb") as f:
56
+ model = pickle.load(f) # nosec
57
+ model.to(device)
58
+ model.eval()
59
+
60
+ site_type = SiteType.METASTASIS if metastatic else SiteType.PRIMARY
61
+
62
+     # For UI, the dataset holds just a single slide, so sample id is not relevant.
63
+ dataset = TileFeatureTensorDataset(
64
+ site_type=site_type,
65
+ tile_features=features,
66
+ n_max_tiles=20000,
67
+ )
68
+ dataloader = DataLoader(
69
+ dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
70
+ )
71
+
72
+ results = []
73
+ batch = next(iter(dataloader))
74
+ with torch.no_grad():
75
+ batch["tile_tensor"] = batch["tile_tensor"].to(device)
76
+ y = model(batch)
77
+ y["logits"][:, col_indices_to_drop] = -1e6
78
+
79
+ batch_size = y["logits"].shape[0]
80
+ assert batch_size == 1
81
+
82
+ softmax = torch.nn.functional.softmax(y["logits"][0], dim=0)
83
+ argmax = torch.argmax(softmax, dim=0)
84
+ class_assignment = INT_TO_CANCER_TYPE_MAP[argmax.item()]
85
+ max_confidence = softmax[argmax].item()
86
+ mean_confidence = torch.mean(softmax).item()
87
+
88
+ logger.info(
89
+ f"class {class_assignment} : confidence {max_confidence:8.5f} "
90
+ f"(mean {mean_confidence:8.5f})"
91
+ )
92
+
93
+ part_embedding = y["whole_part_representation"][0].cpu()
94
+
95
+ for cancer_subtype, j in sorted(CANCER_TYPE_TO_INT_MAP.items()):
96
+ confidence = softmax[j].item()
97
+ results.append((cancer_subtype, confidence))
98
+ results.sort(key=lambda row: row[1], reverse=True)
99
+
100
+ results_df = pd.DataFrame(results, columns=["Cancer Subtype", "Confidence"])
101
+
102
+ return results_df, part_embedding
103
+
104
+
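The masking-then-softmax step above (setting dropped classes to `-1e6` before normalizing) can be sketched in isolation. This is a minimal plain-Python sketch with made-up logits; the real code operates on a torch tensor of 183 class logits:

```python
import math

def masked_softmax(logits, drop_indices):
    # Assign a very negative logit to dropped classes, then softmax the rest;
    # the dropped classes receive effectively zero probability mass.
    masked = [(-1e6 if i in drop_indices else v) for i, v in enumerate(logits)]
    m = max(masked)
    exps = [math.exp(v - m) for v in masked]
    total = sum(exps)
    return [e / total for e in exps]

probs = masked_softmax([2.0, 5.0, 1.0], drop_indices={1})
# class 1 gets effectively zero probability; classes 0 and 2 renormalize
```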
105
+ def parse_args():
106
+ parser = ArgumentParser(
107
+ description="Run Aeon inference on a specified set of slides"
108
+ )
109
+ parser.add_argument(
110
+ "-i",
111
+ "--features-path",
112
+ required=True,
113
+ help="Pathname to a .pt file with optimus tile features for this slide",
114
+ )
115
+ parser.add_argument(
116
+ "-o",
117
+ "--output-prediction-path",
118
+ help="The filename for the Aeon predictions file (CSV)",
119
+ required=True,
120
+ )
121
+ parser.add_argument(
122
+ "--output-embedding-path",
123
+ help="The filename for the whole-part representation of the slide (.pt)",
124
+ )
125
+ parser.add_argument(
126
+ "--model-path",
127
+ type=str,
128
+ help="Pathname to the pickle file for an Aeon model",
129
+ required=True,
130
+ )
131
+ parser.add_argument(
132
+ "--metastatic", action="store_true", help="Tissue is from a metastatic site"
133
+ )
134
+ parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Batch size")
135
+ parser.add_argument(
136
+ "--num-workers", type=int, default=NUM_WORKERS, help="Number of workers"
137
+ )
138
+ parser.add_argument("--use-cpu", action="store_true", help="Use CPU")
139
+ opt = parser.parse_args()
140
+
141
+ return opt
142
+
143
+
144
+ def main():
145
+ opt = parse_args()
146
+
147
+ output_path = opt.output_prediction_path
148
+ logger.info(f"output_path: '{output_path}'")
149
+
150
+ embedding_path = opt.output_embedding_path
151
+ logger.info(f"part_embedding_path: '{embedding_path}'")
152
+
153
+ features = torch.load(opt.features_path)
154
+
155
+ results_df, part_embedding = run(
156
+ features=features,
157
+ model_path=opt.model_path,
158
+ metastatic=opt.metastatic,
159
+ batch_size=opt.batch_size,
160
+ num_workers=opt.num_workers,
161
+ use_cpu=opt.use_cpu,
162
+ )
163
+
164
+ results_df.to_csv(output_path, index=False)
165
+ logger.info(f"Wrote {output_path}")
166
+
167
+ if embedding_path:
168
+ torch.save(part_embedding, embedding_path)
169
+ logger.info(f"Wrote {embedding_path}")
170
+
171
+
172
+ if __name__ == "__main__":
173
+ main()
src/mosaic/inference/data.py ADDED
@@ -0,0 +1,262 @@
1
+ from enum import Enum
3
+
4
+ import torch
5
+ from torch.utils.data import Dataset
6
+ import numpy as np
7
+
8
+ CANCER_TYPE_TO_INT_MAP = {
9
+ "AASTR": 0,
10
+ "ACC": 1,
11
+ "ACRM": 2,
12
+ "ACYC": 3,
13
+ "ADNOS": 4,
14
+ "ALUCA": 5,
15
+ "AMPCA": 6,
16
+ "ANGS": 7,
17
+ "ANSC": 8,
18
+ "AODG": 9,
19
+ "APAD": 10,
20
+ "ARMM": 11,
21
+ "ARMS": 12,
22
+ "ASTR": 13,
23
+ "ATM": 14,
24
+ "BA": 15,
25
+ "BCC": 16,
26
+ "BLAD": 17,
27
+ "BLCA": 18,
28
+ "BMGCT": 19,
29
+ "BRCA": 20,
30
+ "BRCANOS": 21,
31
+ "BRCNOS": 22,
32
+ "CCOV": 23,
33
+ "CCRCC": 24,
34
+ "CESC": 25,
35
+ "CHDM": 26,
36
+ "CHOL": 27,
37
+ "CHRCC": 28,
38
+ "CHS": 29,
39
+ "COAD": 30,
40
+ "COADREAD": 31,
41
+ "CSCC": 32,
42
+ "CSCLC": 33,
43
+ "CUP": 34,
44
+ "CUPNOS": 35,
45
+ "DA": 36,
46
+ "DASTR": 37,
47
+ "DDLS": 38,
48
+ "DES": 39,
49
+ "DIFG": 40,
50
+ "DSRCT": 41,
51
+ "DSTAD": 42,
52
+ "ECAD": 43,
53
+ "EGC": 44,
54
+ "EHAE": 45,
55
+ "EHCH": 46,
56
+ "EMPD": 47,
57
+ "EOV": 48,
58
+ "EPDCA": 49,
59
+ "EPIS": 50,
60
+ "EPM": 51,
61
+ "ERMS": 52,
62
+ "ES": 53,
63
+ "ESCA": 54,
64
+ "ESCC": 55,
65
+ "GB": 56,
66
+ "GBAD": 57,
67
+ "GBC": 58,
68
+ "GBM": 59,
69
+ "GCCAP": 60,
70
+ "GEJ": 61,
71
+ "GINET": 62,
72
+ "GIST": 63,
73
+ "GNOS": 64,
74
+ "GRCT": 65,
75
+ "HCC": 66,
76
+ "HGGNOS": 67,
77
+ "HGNEC": 68,
78
+ "HGSFT": 69,
79
+ "HGSOC": 70,
80
+ "HNMUCM": 71,
81
+ "HNSC": 72,
82
+ "IDC": 73,
83
+ "IHCH": 74,
84
+ "ILC": 75,
85
+ "LGGNOS": 76,
86
+ "LGSOC": 77,
87
+ "LMS": 78,
88
+ "LNET": 79,
89
+ "LUAD": 80,
90
+ "LUAS": 81,
91
+ "LUCA": 82,
92
+ "LUNE": 83,
93
+ "LUPC": 84,
94
+ "LUSC": 85,
95
+ "LXSC": 86,
96
+ "MAAP": 87,
97
+ "MACR": 88,
98
+ "MBC": 89,
99
+ "MCC": 90,
100
+ "MDLC": 91,
101
+ "MEL": 92,
102
+ "MFH": 93,
103
+ "MFS": 94,
104
+ "MGCT": 95,
105
+ "MNG": 96,
106
+ "MOV": 97,
107
+ "MPNST": 98,
108
+ "MRLS": 99,
109
+ "MUP": 100,
110
+ "MXOV": 101,
111
+ "NBL": 102,
112
+ "NECNOS": 103,
113
+ "NETNOS": 104,
114
+ "NOT": 105,
115
+ "NPC": 106,
116
+ "NSCLC": 107,
117
+ "NSCLCPD": 108,
118
+ "NSGCT": 109,
119
+ "OCS": 110,
120
+ "OCSC": 111,
121
+ "ODG": 112,
122
+ "OOVC": 113,
123
+ "OPHSC": 114,
124
+ "OS": 115,
125
+ "PAAC": 116,
126
+ "PAAD": 117,
127
+ "PAASC": 118,
128
+ "PAMPCA": 119,
129
+ "PANET": 120,
130
+ "PAST": 121,
131
+ "PDC": 122,
132
+ "PECOMA": 123,
133
+ "PEMESO": 124,
134
+ "PHC": 125,
135
+ "PLBMESO": 126,
136
+ "PLEMESO": 127,
137
+ "PLMESO": 128,
138
+ "PRAD": 129,
139
+ "PRCC": 130,
140
+ "PSEC": 131,
141
+ "PTAD": 132,
142
+ "RBL": 133,
143
+ "RCC": 134,
144
+ "RCSNOS": 135,
145
+ "READ": 136,
146
+ "RMS": 137,
147
+ "SARCNOS": 138,
148
+ "SBC": 139,
149
+ "SBOV": 140,
150
+ "SBWDNET": 141,
151
+ "SCBC": 142,
152
+ "SCCNOS": 143,
153
+ "SCHW": 144,
154
+ "SCLC": 145,
155
+ "SCUP": 146,
156
+ "SDCA": 147,
157
+ "SEM": 148,
158
+ "SFT": 149,
159
+ "SKCM": 150,
160
+ "SOC": 151,
161
+ "SPDAC": 152,
162
+ "SSRCC": 153,
163
+ "STAD": 154,
164
+ "SYNS": 155,
165
+ "TAC": 156,
166
+ "THAP": 157,
167
+ "THHC": 158,
168
+ "THME": 159,
169
+ "THPA": 160,
170
+ "THPD": 161,
171
+ "THYC": 162,
172
+ "THYM": 163,
173
+ "TYST": 164,
174
+ "UCCC": 165,
175
+ "UCEC": 166,
176
+ "UCP": 167,
177
+ "UCS": 168,
178
+ "UCU": 169,
179
+ "UDMN": 170,
180
+ "UEC": 171,
181
+ "ULMS": 172,
182
+ "UM": 173,
183
+ "UMEC": 174,
184
+ "URCC": 175,
185
+ "USARC": 176,
186
+ "USC": 177,
187
+ "UTUC": 178,
188
+ "VMM": 179,
189
+ "VSC": 180,
190
+ "WDLS": 181,
191
+ "WT": 182,
192
+ }
193
+ INT_TO_CANCER_TYPE_MAP = {v: k for k, v in CANCER_TYPE_TO_INT_MAP.items()}
194
+
195
+
196
+ class SiteType(Enum):
197
+ PRIMARY = "Primary"
198
+ METASTASIS = "Metastasis"
199
+
200
+
201
+ class TileFeatureTensorDataset(Dataset):
202
+ def __init__(
203
+ self,
204
+ site_type: SiteType,
205
+ tile_features: np.ndarray,
206
+ n_max_tiles: int = 20000,
207
+ ) -> None:
208
+ """Initialize the dataset.
209
+
210
+ Args:
211
+             site_type: the site type, as SiteType.PRIMARY or SiteType.METASTASIS
212
+ tile_features: the tile feature array
213
+ n_max_tiles: the maximum number of tiles to use as int
214
+
215
+ Returns:
216
+ None
217
+ """
218
+ self.site_type = site_type
219
+ self.n_max_tiles = n_max_tiles
220
+ self.features = self._get_features(tile_features)
221
+
222
+ def __len__(self) -> int:
223
+ """Return the length of the dataset.
224
+
225
+ Returns:
226
+ int: the length of the dataset
227
+ """
228
+ return 1
229
+
230
+ def _get_features(self, features) -> torch.Tensor:
231
+ """Get the tile features
232
+
233
+ Args:
234
+ features: the tile features as a numpy array
235
+
236
+ Returns:
237
+ torch.Tensor: the tile tensor
238
+ """
239
+ features = torch.tensor(features, dtype=torch.float32)
240
+ if features.shape[0] > self.n_max_tiles:
241
+ indices = torch.randperm(features.shape[0])[: self.n_max_tiles]
242
+ features = features[indices]
243
+ if features.shape[0] < self.n_max_tiles:
244
+ padding = torch.zeros(
245
+ self.n_max_tiles - features.shape[0], features.shape[1]
246
+ )
247
+ features = torch.cat([features, padding], dim=0)
248
+ return features
249
+
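The truncate-or-pad logic of `_get_features` can be illustrated without torch. A plain-Python sketch; note the real code samples a random permutation rather than taking a prefix, and pads with zero rows:

```python
def fit_to_length(rows, n_max):
    # Truncate to n_max rows (a prefix for determinism here; the dataset
    # samples randomly), or zero-pad with rows of the same width.
    width = len(rows[0])
    if len(rows) > n_max:
        rows = rows[:n_max]
    while len(rows) < n_max:
        rows.append([0.0] * width)
    return rows

tiles = fit_to_length([[1.0, 2.0]], n_max=3)
# 3 rows: the original plus two zero rows
```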
250
+ def __getitem__(self, idx: int) -> dict:
251
+ """Return an item from the dataset.
252
+
253
+ Args:
254
+ idx: the index of the item to return
255
+
256
+ Returns:
257
+ dict: the item
258
+ """
259
+ return {
260
+ "site": self.site_type.value,
261
+ "tile_tensor": self.features
262
+ }
src/mosaic/inference/paladin.py ADDED
@@ -0,0 +1,310 @@
1
+ import csv
2
+ import pickle # nosec
4
+ from argparse import ArgumentParser
5
+ from collections import defaultdict
6
+ from pathlib import Path
7
+ from typing import Any, List, Optional
8
+
10
+ import pandas as pd
11
+ import numpy as np
12
+ import torch
13
+ from loguru import logger
14
+ from torch.utils.data import DataLoader
15
+
16
+ from mosaic.inference.data import SiteType, TileFeatureTensorDataset
17
+
18
+ # Constants
19
+ BATCH_SIZE = 8
20
+ NUM_WORKERS = 16
21
+
22
+
23
+ class UsageError(Exception):
24
+ """A UsageError is raised when there's a problem with the command-line arguments."""
25
+
26
+ pass
27
+
28
+
29
+ def load_model_map(model_map_path: str) -> dict[Any, Any]:
30
+ """Load the table mapping cancer_subtypes and targets to the paladin
31
+ model (a pickle file) that predicts that target for that cancer subtype.
32
+
33
+ A dict is returned, mapping each cancer_subtype to a table mapping a
34
+ target to the pathname for the model that predicts it.
35
+ """
36
+ models = defaultdict(dict)
37
+ with Path(model_map_path).open() as fp:
38
+ rdr = csv.DictReader(fp)
39
+ for row in rdr:
40
+ cancer_subtype = row["cancer_subtype"]
41
+ target = row["target_name"]
42
+ model = row["model_path"]
43
+ models[cancer_subtype][target] = model
44
+ return models
45
+
46
+
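The model-map CSV consumed by `load_model_map` carries columns `cancer_subtype`, `target_name`, and `model_path`. A small sketch of the parsing, using hypothetical paths in place of real model files:

```python
import csv
import io
from collections import defaultdict

# Hypothetical rows; a real file points at .pkl Paladin models on disk
csv_text = """cancer_subtype,target_name,model_path
LUAD,EGFR,models/luad_egfr.pkl
LUAD,MSI_TYPE,models/luad_msi.pkl
IDC,HER2,models/idc_her2.pkl
"""
models = defaultdict(dict)
for row in csv.DictReader(io.StringIO(csv_text)):
    # Nested dict: subtype -> target -> model path
    models[row["cancer_subtype"]][row["target_name"]] = row["model_path"]
# models["LUAD"] maps two targets to their model paths
```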
47
+ def load_aeon_scores(df: pd.DataFrame) -> dict[str, float]:
48
+ """Load the output table from a single-slide Aeon run, listing Oncotree
49
+ cancer subtypes and their confidence values.
50
+
51
+     A dict is returned, mapping each cancer subtype to its confidence score.
52
+ """
53
+ score = {}
54
+ for _, row in df.iterrows():
55
+ subtype = row["Cancer Subtype"]
56
+ confidence = row["Confidence"]
57
+ score[subtype] = confidence
58
+ return score
59
+
60
+
61
+ def select_cancer_subtypes(aeon_scores: dict[str, float], k=1) -> list[str]:
62
+ """Return the three top-scoring cancer_subtypes, based on the given Aeon scores."""
63
+ sorted_cancer_subtypes = list(
64
+ sorted([(v, k) for k, v in aeon_scores.items()], reverse=True)
65
+ )
66
+ return [cancer_subtype for score, cancer_subtype in sorted_cancer_subtypes[:k]]
67
+
68
+
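The score-to-tuple sort in `select_cancer_subtypes` can be shown with made-up confidences (hypothetical values, not real model output):

```python
aeon_scores = {"LUAD": 0.62, "LUSC": 0.21, "IDC": 0.05}  # hypothetical confidences
# Sort (score, name) pairs descending, then keep the top-k names
ranked = sorted(((v, k) for k, v in aeon_scores.items()), reverse=True)
top_two = [name for score, name in ranked[:2]]
# ["LUAD", "LUSC"]
```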
69
+ def select_models(cancer_subtypes: list[str], model_map: dict[Any, Any]) -> list[Any]:
70
+ """ """
71
+ models = []
72
+ for cancer_subtype, target, model in model_map.items():
73
+ if cancer_subtype in cancer_subtypes:
74
+ models.append((cancer_subtype, target, model))
75
+ return models
76
+
77
+
78
+ def run_model(device, dataset, model_path: str, num_workers, batch_size) -> float:
79
+ """Run inference for the given embeddings and model.
80
+ The point estimate is returned.
81
+ """
82
+
83
+ logger.debug(f"[loading model {model_path}]")
84
+ with Path(model_path).open("rb") as f:
85
+ model = pickle.load(f) # nosec
87
+ model.to(device)
88
+ model.eval()
89
+
90
+ dataloader = DataLoader(
91
+ dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
92
+ )
93
+
95
+ batch = next(iter(dataloader))
96
+ with torch.no_grad():
97
+ batch["tile_tensor"] = batch["tile_tensor"].to(device)
98
+ outputs = model(batch)
99
+
100
+ logits = outputs["logits"]
101
+ # Apply softplus to ensure positive values for beta-binomial parameters
102
+ logits = torch.nn.functional.softplus(logits) + 1.0 # enforce concavity
103
+ point_estimates = logits_to_point_estimates(logits)
104
+
105
+ # sample_id = batch['sample_id'][0]
106
+ class_assignment = point_estimates[0].item()
107
+ return class_assignment
108
+
109
+
110
+ def logits_to_point_estimates(logits):
111
+ # logits is a tensor of shape (batch_size, 2 * (n_clf_tasks + n_reg_tasks))
112
+ # need to convert it to a tensor of shape (batch_size, n_clf_tasks + n_reg_tasks)
113
+ return logits[:, ::2] / (logits[:, ::2] + logits[:, 1::2])
114
+
115
+
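`logits_to_point_estimates` treats consecutive logit pairs as beta-binomial (alpha, beta) parameters and returns the mean alpha/(alpha+beta). The same arithmetic without torch:

```python
def point_estimates(interleaved):
    # interleaved = [a1, b1, a2, b2, ...]; returns [a_i / (a_i + b_i), ...],
    # the mean of a Beta(a_i, b_i) distribution for each task.
    return [interleaved[i] / (interleaved[i] + interleaved[i + 1])
            for i in range(0, len(interleaved), 2)]

estimates = point_estimates([3.0, 1.0, 2.0, 6.0])
# [0.75, 0.25]
```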
116
+ def run(
117
+ features: np.ndarray,
118
+ aeon_results: Optional[pd.DataFrame] = None,
119
+     cancer_subtype_codes: Optional[List[str]] = None,
+     model_map_path: Optional[str] = None,
+     model_path: Optional[str] = None,
122
+ metastatic: bool = False,
123
+ batch_size: int = BATCH_SIZE,
124
+ num_workers: int = NUM_WORKERS,
125
+ use_cpu: bool = False,
126
+ ):
127
+ """Run Paladin inference on a single slide, using the given embeddings
128
+ and either a single model or a table mapping cancer_subtypes and targets to models.
129
+ If cancer_subtype_codes is given, it is a list of OncoTree codes for the slide.
130
+     If aeon_results is given, it is a DataFrame from an Aeon run on the
+     slide, and the top-scoring subtype is selected from it; otherwise
+     cancer_subtype_codes is used directly.
+     A DataFrame with columns "Cancer Subtype", "Biomarker", and "Score" is returned.
134
+ """
135
+
136
+ if aeon_results is not None:
137
+ aeon_scores = load_aeon_scores(aeon_results)
138
+ target_cancer_subtypes = select_cancer_subtypes(aeon_scores)
139
+ else:
140
+ target_cancer_subtypes = cancer_subtype_codes
141
+
142
+ # Build a dataset to feed to the model
143
+ site = SiteType.METASTASIS if metastatic else SiteType.PRIMARY
144
+
145
+ dataset = TileFeatureTensorDataset(
146
+ tile_features=features,
147
+ site_type=site,
148
+ n_max_tiles=20000,
149
+ )
150
+
151
+ device = torch.device(
152
+ "cuda" if not use_cpu and torch.cuda.is_available() else "cpu"
153
+ )
154
+
155
+ results = []
156
+ if model_path:
157
+ cancer_subtype, target = "None", "None"
158
+ try:
159
+ score = run_model(device, dataset, model_path, num_workers, batch_size)
160
+ results.append((cancer_subtype, target, score))
161
+ logger.info(
162
+ f"cancer_subtype: {cancer_subtype} target: {target} score: {score}"
163
+ )
164
+ except Exception as exc:
165
+ logger.error(
166
+ f"Unable to run model for {cancer_subtype} target {target}\n{exc}"
167
+ )
168
+
169
+ elif model_map_path:
170
+ model_map = load_model_map(model_map_path)
171
+ for cancer_subtype in target_cancer_subtypes:
172
+ if cancer_subtype not in model_map:
173
+ logger.warning(f"Warning: no models found for {cancer_subtype}")
174
+ continue
175
+
176
+ if "MSI_TYPE" in model_map[cancer_subtype]:
177
+ # Run MSI_TYPE model first, to determine if we should run other/MSS models
178
+ logger.info(f"Running MSI_TYPE model for {cancer_subtype} first")
179
+ try:
180
+ msi_score = run_model(
181
+ device,
182
+ dataset,
183
+ model_map[cancer_subtype]["MSI_TYPE"],
184
+ num_workers,
185
+ batch_size,
186
+ )
187
+ results.append((cancer_subtype, "MSI_TYPE", msi_score))
188
+ logger.info(
189
+ f"cancer_subtype: {cancer_subtype} target: MSI score: {msi_score}"
190
+ )
191
+ # If MSI score is high, skip MSS models
192
+ if msi_score >= 0.5:
193
+ logger.info(
194
+ f"Skipping MSS models for {cancer_subtype} due to high MSI score"
195
+ )
196
+ continue
197
+ else:
198
+ logger.info(
199
+ f"Running MSS models for {cancer_subtype} due to low MSI score"
200
+ )
201
+ except Exception as exc:
202
+ logger.error(
203
+ f"Unable to run model for {cancer_subtype} target MSI_TYPE\n{exc}"
204
+ )
205
+
206
+ for target, model in sorted(model_map[cancer_subtype].items()):
207
+ # Skip MSI_TYPE model, already run above
208
+ if target == "MSI_TYPE":
209
+ continue
210
+ try:
211
+ score = run_model(device, dataset, model, num_workers, batch_size)
212
+ results.append((cancer_subtype, target, score))
213
+ logger.info(
214
+ f"cancer_subtype: {cancer_subtype} target: {target} score: {score}"
215
+ )
216
+ except Exception as exc:
217
+ logger.error(
218
+ f"Unable to run model for {cancer_subtype} target {target}\n{exc}"
219
+ )
220
+ df = pd.DataFrame(results, columns=["Cancer Subtype", "Biomarker", "Score"])
221
+
222
+ return df
223
+
224
+
225
+ def parse_args():
226
+ parser = ArgumentParser(description="Run Paladin inference on a single slide")
227
+ parser.add_argument(
228
+ "-i",
229
+ "--features-path",
230
+ required=True,
231
+ help="Pathname to a .pt file with optimus embeddings for this slide",
232
+ )
233
+ parser.add_argument(
234
+ "-o",
235
+ "--output-path",
236
+ help="The filename for the Paladin predictions file (CSV)",
237
+ required=True,
238
+ )
239
+ parser.add_argument(
240
+ "-c",
241
+ "--cancer-subtype-codes",
242
+ help="One or more cancer_subtypes (OncoTree codes, comma-separated)",
243
+ )
244
+ parser.add_argument(
245
+ "-a",
246
+ "--aeon-predictions-path",
247
+ help="Pathname to an aeon-predictions file (CSV) for this slide",
248
+ )
249
+ parser.add_argument(
250
+ "-mm",
251
+ "--model-map-path",
252
+ help="A CSV file mapping cancer subtypes and targets to Paladin models (.pkl files). Contains columns 'cancer_subtype', 'target_name', and 'model_path'.",
253
+ )
254
+ parser.add_argument(
255
+ "-m",
256
+ "--model-path",
257
+ help="The filename for a Paladin model to run inference with",
258
+ )
259
+ parser.add_argument(
260
+ "--metastatic", action="store_true", help="Tissue is from a metastatic site"
261
+ )
262
+ parser.add_argument("--batch-size", type=int, default=BATCH_SIZE, help="Batch size")
263
+ parser.add_argument(
264
+ "--num-workers",
265
+ type=int,
266
+ default=NUM_WORKERS,
267
+ help="Number of workers for data loading",
268
+ )
269
+ parser.add_argument("--use-cpu", action="store_true", help="Use CPU")
270
+ opt = parser.parse_args()
271
+
272
+ if opt.cancer_subtype_codes and opt.aeon_predictions_path:
273
+ raise UsageError(
274
+ "You may specify either --codes or --aeon-predictions-path, but not both."
275
+ )
276
+
277
+ if opt.cancer_subtype_codes:
278
+ opt.cancer_subtype_codes = opt.cancer_subtype_codes.split(",")
279
+
280
+ if opt.model_path is None and opt.model_map_path is None:
281
+ raise UsageError("You must specify either --model-path or --model-map-path")
282
+
283
+ return opt
284
+
285
+
286
+ def main():
287
+ opt = parse_args()
288
+ features = torch.load(opt.features_path)
289
+ logger.info(f"Loaded features from {opt.features_path}")
290
+ aeon_results = None
291
+ if opt.aeon_predictions_path:
292
+ aeon_results = pd.read_csv(opt.aeon_predictions_path)
293
+ logger.info(f"Loaded Aeon results from {opt.aeon_predictions_path}")
294
+ df = run(
295
+ features=features,
296
+ aeon_results=aeon_results,
297
+ cancer_subtype_codes=opt.cancer_subtype_codes,
298
+ model_map_path=opt.model_map_path,
299
+ model_path=opt.model_path,
300
+ metastatic=opt.metastatic,
301
+ batch_size=opt.batch_size,
302
+ num_workers=opt.num_workers,
303
+ use_cpu=opt.use_cpu,
304
+ )
305
+ df.to_csv(opt.output_path, index=False)
306
+ logger.info(f"Wrote {opt.output_path}")
307
+
308
+
309
+ if __name__ == "__main__":
310
+ main()
uv.lock ADDED
The diff for this file is too large to render. See raw diff