Spaces:

raylim
/

mosaic-zero

Sleeping

App Files Files Community

raylim Claude Sonnet 4.5 committed on Jan 8

Commit

751062d

unverified ·

1 Parent(s): c5e7bb2

Fix model file location to use HuggingFace cache directory

Browse files

Changed all model file references from the hardcoded local data/ directory to
the HuggingFace cache directory. This fixes the issue where files couldn't
be found after downloading from HuggingFace Hub.

Changes:
- Add get_data_directory() function to gradio_app.py that returns the cached
model directory if one has been set, otherwise checks the MOSAIC_DATA_DIR
environment variable, then falls back to local data/ for development
- Update gradio_app.py to download to HF cache (removed local_dir parameter)
and set MOSAIC_DATA_DIR environment variable
- Update all model file references in aeon.py, data.py, utils.py, and
analysis.py to use get_data_directory() instead of hardcoded paths

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (5) hide show

src/mosaic/analysis.py +13 -7
src/mosaic/gradio_app.py +52 -5
src/mosaic/inference/aeon.py +3 -1
src/mosaic/inference/data.py +6 -4
src/mosaic/ui/utils.py +4 -2

src/mosaic/analysis.py CHANGED Viewed

@@ -61,6 +61,7 @@ from mussel.utils.segment import draw_slide_mask
 from mussel.cli.tessellate import BiopsySegConfig, ResectionSegConfig, TcgaSegConfig
 from loguru import logger
 from mosaic.inference import run_aeon, run_paladin
 # Log hardware detection at module load
 logger.info(f"Hardware: {GPU_TYPE} | batch_size={DEFAULT_BATCH_SIZE}, num_workers={DEFAULT_NUM_WORKERS}")
@@ -92,13 +93,14 @@ def _extract_ctranspath_features(coords, slide_path, attrs, num_workers):
         logger.info(f"Running CTransPath with {num_workers} workers")
     start_time = pd.Timestamp.now()
     ctranspath_features, _ = get_features(
         coords,
         slide_path,
         attrs,
         model_type=ModelType.CTRANSPATH,
-        model_path="data/ctranspath.pth",
         num_workers=num_workers,
         batch_size=batch_size,
         use_gpu=True,
@@ -136,13 +138,14 @@ def _extract_optimus_features(filtered_coords, slide_path, attrs, num_workers):
         logger.info(f"Running Optimus with {num_workers} workers")
     start_time = pd.Timestamp.now()
     features, _ = get_features(
         filtered_coords,
         slide_path,
         attrs,
         model_type=ModelType.OPTIMUS,
-        model_path="data/optimus.pkl",
         num_workers=num_workers,
         batch_size=batch_size,
         use_gpu=True,
@@ -179,9 +182,10 @@ def _run_aeon_inference(features, site_type, num_workers, sex=None, tissue_site_
     start_time = pd.Timestamp.now()
     logger.info("Running Aeon for cancer subtype inference")
     aeon_results, _ = run_aeon(
         features=features,
-        model_path="data/aeon_model.pkl",
         metastatic=(site_type == "Metastatic"),
         batch_size=8,
         num_workers=num_workers,
@@ -231,9 +235,10 @@ def _run_paladin_inference(features, aeon_results, site_type, num_workers):
     start_time = pd.Timestamp.now()
     logger.info("Running Paladin for biomarker inference")
     paladin_results = run_paladin(
         features=features,
-        model_map_path="data/paladin_model_map.csv",
         aeon_results=aeon_results,
         metastatic=(site_type == "Metastatic"),
         batch_size=8,
@@ -339,7 +344,8 @@ def _run_inference_pipeline_impl(
     # Step 3: Filter features using marker classifier (CPU operation)
     start_time = pd.Timestamp.now()
-    marker_classifier = pickle.load(open("data/marker_classifier.pkl", "rb"))
     progress(0.35, desc="Filtering features with marker classifier")
     logger.info("Filtering features with marker classifier")
     _, filtered_coords = filter_features(

 from mussel.cli.tessellate import BiopsySegConfig, ResectionSegConfig, TcgaSegConfig
 from loguru import logger
 from mosaic.inference import run_aeon, run_paladin
+from mosaic.gradio_app import get_data_directory
 # Log hardware detection at module load
 logger.info(f"Hardware: {GPU_TYPE} | batch_size={DEFAULT_BATCH_SIZE}, num_workers={DEFAULT_NUM_WORKERS}")
         logger.info(f"Running CTransPath with {num_workers} workers")
     start_time = pd.Timestamp.now()
+    data_dir = get_data_directory()
     ctranspath_features, _ = get_features(
         coords,
         slide_path,
         attrs,
         model_type=ModelType.CTRANSPATH,
+        model_path=str(data_dir / "ctranspath.pth"),
         num_workers=num_workers,
         batch_size=batch_size,
         use_gpu=True,
         logger.info(f"Running Optimus with {num_workers} workers")
     start_time = pd.Timestamp.now()
+    data_dir = get_data_directory()
     features, _ = get_features(
         filtered_coords,
         slide_path,
         attrs,
         model_type=ModelType.OPTIMUS,
+        model_path=str(data_dir / "optimus.pkl"),
         num_workers=num_workers,
         batch_size=batch_size,
         use_gpu=True,
     start_time = pd.Timestamp.now()
     logger.info("Running Aeon for cancer subtype inference")
+    data_dir = get_data_directory()
     aeon_results, _ = run_aeon(
         features=features,
+        model_path=str(data_dir / "aeon_model.pkl"),
         metastatic=(site_type == "Metastatic"),
         batch_size=8,
         num_workers=num_workers,
     start_time = pd.Timestamp.now()
     logger.info("Running Paladin for biomarker inference")
+    data_dir = get_data_directory()
     paladin_results = run_paladin(
         features=features,
+        model_map_path=str(data_dir / "paladin_model_map.csv"),
         aeon_results=aeon_results,
         metastatic=(site_type == "Metastatic"),
         batch_size=8,
     # Step 3: Filter features using marker classifier (CPU operation)
     start_time = pd.Timestamp.now()
+    data_dir = get_data_directory()
+    marker_classifier = pickle.load(open(data_dir / "marker_classifier.pkl", "rb"))
     progress(0.35, desc="Filtering features with marker classifier")
     logger.info("Filtering features with marker classifier")
     _, filtered_coords = filter_features(

src/mosaic/gradio_app.py CHANGED Viewed

@@ -12,6 +12,7 @@ import pandas as pd
 from pathlib import Path
 from huggingface_hub import snapshot_download
 from loguru import logger
 from mosaic.ui import launch_gradio
 from mosaic.ui.app import set_cancer_subtype_maps
@@ -25,23 +26,69 @@ from mosaic.ui.utils import (
 )
 from mosaic.analysis import analyze_slide
 def download_and_process_models():
     """Download models from HuggingFace and initialize cancer subtype mappings.
     Downloads the Paladin and Aeon models from the PDM-Group HuggingFace repository
-    and creates mappings between cancer subtype names and OncoTree codes.
     Returns:
         tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
             - cancer_subtype_name_map: Dict mapping display names to OncoTree codes
             - reversed_cancer_subtype_name_map: Dict mapping OncoTree codes to display names
             - cancer_subtypes: List of all supported cancer subtype codes
     """
-    snapshot_download(repo_id="PDM-Group/paladin-aeon-models", local_dir="data")
     model_map = pd.read_csv(
-        "data/paladin_model_map.csv",
     )
     cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
     cancer_subtype_name_map = {"Unknown": "UNK"}

 from pathlib import Path
 from huggingface_hub import snapshot_download
 from loguru import logger
+import os
 from mosaic.ui import launch_gradio
 from mosaic.ui.app import set_cancer_subtype_maps
 )
 from mosaic.analysis import analyze_slide
+# Global variable to store the model data directory path
+_MODEL_DATA_DIR = None
+def get_data_directory():
+    """Get the directory containing model data files.
+    Lookup order:
+        1. The module-level ``_MODEL_DATA_DIR`` value, if it has already been
+           set (``download_and_process_models()`` sets it to the HuggingFace
+           snapshot path).
+        2. The ``MOSAIC_DATA_DIR`` environment variable.
+        3. The relative path ``data/``.
+    Whichever path is chosen is stored in ``_MODEL_DATA_DIR`` so that later
+    calls return it without repeating the lookup.
+    Returns:
+        Path: Path to the model data directory
+    """
+    global _MODEL_DATA_DIR
+    # Fast path: a previous call (or the download step) already chose a directory
+    if _MODEL_DATA_DIR is not None:
+        return _MODEL_DATA_DIR
+    # Check if environment variable is set
+    if "MOSAIC_DATA_DIR" in os.environ:
+        _MODEL_DATA_DIR = Path(os.environ["MOSAIC_DATA_DIR"])
+        return _MODEL_DATA_DIR
+    # Check if local data/ directory exists (for development/backward compat)
+    local_data = Path("data")
+    if local_data.exists() and (local_data / "paladin_model_map.csv").exists():
+        _MODEL_DATA_DIR = local_data
+        return _MODEL_DATA_DIR
+    # Otherwise still fall back to the relative data/ path, even though the
+    # model map was not found there.
+    # NOTE(review): this assigns the same value as the branch above, so the
+    # existence check currently has no effect on the result.
+    _MODEL_DATA_DIR = local_data
+    return _MODEL_DATA_DIR
 def download_and_process_models():
     """Download models from HuggingFace and initialize cancer subtype mappings.
     Downloads the Paladin and Aeon models from the PDM-Group HuggingFace repository
+    to the HuggingFace cache directory and creates mappings between cancer subtype
+    names and OncoTree codes.
     Returns:
         tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
             - cancer_subtype_name_map: Dict mapping display names to OncoTree codes
             - reversed_cancer_subtype_name_map: Dict mapping OncoTree codes to display names
             - cancer_subtypes: List of all supported cancer subtype codes
     """
+    global _MODEL_DATA_DIR
+    # Download to HF cache directory (not local_dir)
+    # This returns the path to the cached snapshot
+    logger.info("Downloading models from HuggingFace Hub to cache directory...")
+    cache_dir = snapshot_download(
+        repo_id="PDM-Group/paladin-aeon-models",
+        # No local_dir - use HF cache
+    )
+    _MODEL_DATA_DIR = Path(cache_dir)
+    logger.info(f"Models downloaded to: {_MODEL_DATA_DIR}")
+    # Also set environment variable for other modules to use
+    os.environ["MOSAIC_DATA_DIR"] = str(_MODEL_DATA_DIR)
     model_map = pd.read_csv(
+        _MODEL_DATA_DIR / "paladin_model_map.csv",
     )
     cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
     cancer_subtype_name_map = {"Unknown": "UNK"}

src/mosaic/inference/aeon.py CHANGED Viewed

@@ -22,6 +22,7 @@ from mosaic.inference.data import (
 )
 from loguru import logger
 # Cancer types excluded from prediction (too broad or ambiguous)
 # These are used to mask out predictions for overly general cancer types
@@ -69,7 +70,8 @@ def run(
     model.eval()
     # Load the correct mapping from metadata for this model
-    metadata_path = Path(__file__).parent.parent.parent.parent / "data" / "metadata" / "target_dict.tsv"
     with open(metadata_path) as f:
         target_dict_str = f.read().strip().replace("'", '"')
         target_dict = json.loads(target_dict_str)

 )
 from loguru import logger
+from mosaic.gradio_app import get_data_directory
 # Cancer types excluded from prediction (too broad or ambiguous)
 # These are used to mask out predictions for overly general cancer types
     model.eval()
     # Load the correct mapping from metadata for this model
+    data_dir = get_data_directory()
+    metadata_path = data_dir / "metadata" / "target_dict.tsv"
     with open(metadata_path) as f:
         target_dict_str = f.read().strip().replace("'", '"')
         target_dict = json.loads(target_dict_str)

src/mosaic/inference/data.py CHANGED Viewed

@@ -13,6 +13,8 @@ import torch
 from torch.utils.data import Dataset
 import numpy as np
 CANCER_TYPE_TO_INT_MAP = {
     "AASTR": 0,
     "ACC": 1,
@@ -219,10 +221,10 @@ def get_tissue_site_map():
     """
     global _TISSUE_SITE_MAP
     if _TISSUE_SITE_MAP is None:
-        from pathlib import Path
         import pandas as pd
-        csv_path = Path(__file__).parent.parent.parent.parent / "data" / "tissue_site_original_to_idx.csv"
         try:
             df = pd.read_csv(csv_path)
         except FileNotFoundError as e:
@@ -262,10 +264,10 @@ def get_sex_map():
     """
     global _SEX_MAP
     if _SEX_MAP is None:
-        from pathlib import Path
         import pandas as pd
-        csv_path = Path(__file__).parent.parent.parent.parent / "data" / "sex_original_to_idx.csv"
         try:
             df = pd.read_csv(csv_path)
         except FileNotFoundError as e:

 from torch.utils.data import Dataset
 import numpy as np
+from mosaic.gradio_app import get_data_directory
 CANCER_TYPE_TO_INT_MAP = {
     "AASTR": 0,
     "ACC": 1,
     """
     global _TISSUE_SITE_MAP
     if _TISSUE_SITE_MAP is None:
         import pandas as pd
+        data_dir = get_data_directory()
+        csv_path = data_dir / "tissue_site_original_to_idx.csv"
         try:
             df = pd.read_csv(csv_path)
         except FileNotFoundError as e:
     """
     global _SEX_MAP
     if _SEX_MAP is None:
         import pandas as pd
+        data_dir = get_data_directory()
+        csv_path = data_dir / "sex_original_to_idx.csv"
         try:
             df = pd.read_csv(csv_path)
         except FileNotFoundError as e:

src/mosaic/ui/utils.py CHANGED Viewed

@@ -13,6 +13,8 @@ import pandas as pd
 import gradio as gr
 import requests
 # This path should be outside your project directory if running locally
 TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()) / "mosaic_user_data"
@@ -42,8 +44,8 @@ def get_tissue_sites():
     global tissue_site_list
     if tissue_site_list is None:
         try:
-            current_dir = Path(__file__).parent.parent.parent.parent
-            tissue_site_map_path = current_dir / "data" / "tissue_site_original_to_idx.csv"
             df = pd.read_csv(tissue_site_map_path)
             # Get unique tissue sites and sort them
             tissue_site_list = ["Unknown"] + sorted(df["TISSUE_SITE"].unique().tolist())

 import gradio as gr
 import requests
+from mosaic.gradio_app import get_data_directory
 # This path should be outside your project directory if running locally
 TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()) / "mosaic_user_data"
     global tissue_site_list
     if tissue_site_list is None:
         try:
+            data_dir = get_data_directory()
+            tissue_site_map_path = data_dir / "tissue_site_original_to_idx.csv"
             df = pd.read_csv(tissue_site_map_path)
             # Get unique tissue sites and sort them
             tissue_site_list = ["Unknown"] + sorted(df["TISSUE_SITE"].unique().tolist())