Spaces:
Sleeping
Sleeping
Fix model file location to use HuggingFace cache directory
Changed all model file references from hardcoded local data/ directory to
use HuggingFace cache directory. This fixes the issue where files couldn't
be found after downloading from HuggingFace Hub.
Changes:
- Add get_data_directory() function to gradio_app.py that checks
MOSAIC_DATA_DIR environment variable first, then falls back to local
data/ for development
- Update gradio_app.py to download to HF cache (removed local_dir parameter)
and set MOSAIC_DATA_DIR environment variable
- Update all model file references in aeon.py, data.py, utils.py, and
analysis.py to use get_data_directory() instead of hardcoded paths
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- src/mosaic/analysis.py +13 -7
- src/mosaic/gradio_app.py +52 -5
- src/mosaic/inference/aeon.py +3 -1
- src/mosaic/inference/data.py +6 -4
- src/mosaic/ui/utils.py +4 -2
src/mosaic/analysis.py
CHANGED
|
@@ -61,6 +61,7 @@ from mussel.utils.segment import draw_slide_mask
|
|
| 61 |
from mussel.cli.tessellate import BiopsySegConfig, ResectionSegConfig, TcgaSegConfig
|
| 62 |
from loguru import logger
|
| 63 |
from mosaic.inference import run_aeon, run_paladin
|
|
|
|
| 64 |
|
| 65 |
# Log hardware detection at module load
|
| 66 |
logger.info(f"Hardware: {GPU_TYPE} | batch_size={DEFAULT_BATCH_SIZE}, num_workers={DEFAULT_NUM_WORKERS}")
|
|
@@ -92,13 +93,14 @@ def _extract_ctranspath_features(coords, slide_path, attrs, num_workers):
|
|
| 92 |
logger.info(f"Running CTransPath with {num_workers} workers")
|
| 93 |
|
| 94 |
start_time = pd.Timestamp.now()
|
| 95 |
-
|
|
|
|
| 96 |
ctranspath_features, _ = get_features(
|
| 97 |
coords,
|
| 98 |
slide_path,
|
| 99 |
attrs,
|
| 100 |
model_type=ModelType.CTRANSPATH,
|
| 101 |
-
model_path="
|
| 102 |
num_workers=num_workers,
|
| 103 |
batch_size=batch_size,
|
| 104 |
use_gpu=True,
|
|
@@ -136,13 +138,14 @@ def _extract_optimus_features(filtered_coords, slide_path, attrs, num_workers):
|
|
| 136 |
logger.info(f"Running Optimus with {num_workers} workers")
|
| 137 |
|
| 138 |
start_time = pd.Timestamp.now()
|
| 139 |
-
|
|
|
|
| 140 |
features, _ = get_features(
|
| 141 |
filtered_coords,
|
| 142 |
slide_path,
|
| 143 |
attrs,
|
| 144 |
model_type=ModelType.OPTIMUS,
|
| 145 |
-
model_path="
|
| 146 |
num_workers=num_workers,
|
| 147 |
batch_size=batch_size,
|
| 148 |
use_gpu=True,
|
|
@@ -179,9 +182,10 @@ def _run_aeon_inference(features, site_type, num_workers, sex=None, tissue_site_
|
|
| 179 |
|
| 180 |
start_time = pd.Timestamp.now()
|
| 181 |
logger.info("Running Aeon for cancer subtype inference")
|
|
|
|
| 182 |
aeon_results, _ = run_aeon(
|
| 183 |
features=features,
|
| 184 |
-
model_path="
|
| 185 |
metastatic=(site_type == "Metastatic"),
|
| 186 |
batch_size=8,
|
| 187 |
num_workers=num_workers,
|
|
@@ -231,9 +235,10 @@ def _run_paladin_inference(features, aeon_results, site_type, num_workers):
|
|
| 231 |
|
| 232 |
start_time = pd.Timestamp.now()
|
| 233 |
logger.info("Running Paladin for biomarker inference")
|
|
|
|
| 234 |
paladin_results = run_paladin(
|
| 235 |
features=features,
|
| 236 |
-
model_map_path="
|
| 237 |
aeon_results=aeon_results,
|
| 238 |
metastatic=(site_type == "Metastatic"),
|
| 239 |
batch_size=8,
|
|
@@ -339,7 +344,8 @@ def _run_inference_pipeline_impl(
|
|
| 339 |
|
| 340 |
# Step 3: Filter features using marker classifier (CPU operation)
|
| 341 |
start_time = pd.Timestamp.now()
|
| 342 |
-
|
|
|
|
| 343 |
progress(0.35, desc="Filtering features with marker classifier")
|
| 344 |
logger.info("Filtering features with marker classifier")
|
| 345 |
_, filtered_coords = filter_features(
|
|
|
|
| 61 |
from mussel.cli.tessellate import BiopsySegConfig, ResectionSegConfig, TcgaSegConfig
|
| 62 |
from loguru import logger
|
| 63 |
from mosaic.inference import run_aeon, run_paladin
|
| 64 |
+
from mosaic.gradio_app import get_data_directory
|
| 65 |
|
| 66 |
# Log hardware detection at module load
|
| 67 |
logger.info(f"Hardware: {GPU_TYPE} | batch_size={DEFAULT_BATCH_SIZE}, num_workers={DEFAULT_NUM_WORKERS}")
|
|
|
|
| 93 |
logger.info(f"Running CTransPath with {num_workers} workers")
|
| 94 |
|
| 95 |
start_time = pd.Timestamp.now()
|
| 96 |
+
|
| 97 |
+
data_dir = get_data_directory()
|
| 98 |
ctranspath_features, _ = get_features(
|
| 99 |
coords,
|
| 100 |
slide_path,
|
| 101 |
attrs,
|
| 102 |
model_type=ModelType.CTRANSPATH,
|
| 103 |
+
model_path=str(data_dir / "ctranspath.pth"),
|
| 104 |
num_workers=num_workers,
|
| 105 |
batch_size=batch_size,
|
| 106 |
use_gpu=True,
|
|
|
|
| 138 |
logger.info(f"Running Optimus with {num_workers} workers")
|
| 139 |
|
| 140 |
start_time = pd.Timestamp.now()
|
| 141 |
+
|
| 142 |
+
data_dir = get_data_directory()
|
| 143 |
features, _ = get_features(
|
| 144 |
filtered_coords,
|
| 145 |
slide_path,
|
| 146 |
attrs,
|
| 147 |
model_type=ModelType.OPTIMUS,
|
| 148 |
+
model_path=str(data_dir / "optimus.pkl"),
|
| 149 |
num_workers=num_workers,
|
| 150 |
batch_size=batch_size,
|
| 151 |
use_gpu=True,
|
|
|
|
| 182 |
|
| 183 |
start_time = pd.Timestamp.now()
|
| 184 |
logger.info("Running Aeon for cancer subtype inference")
|
| 185 |
+
data_dir = get_data_directory()
|
| 186 |
aeon_results, _ = run_aeon(
|
| 187 |
features=features,
|
| 188 |
+
model_path=str(data_dir / "aeon_model.pkl"),
|
| 189 |
metastatic=(site_type == "Metastatic"),
|
| 190 |
batch_size=8,
|
| 191 |
num_workers=num_workers,
|
|
|
|
| 235 |
|
| 236 |
start_time = pd.Timestamp.now()
|
| 237 |
logger.info("Running Paladin for biomarker inference")
|
| 238 |
+
data_dir = get_data_directory()
|
| 239 |
paladin_results = run_paladin(
|
| 240 |
features=features,
|
| 241 |
+
model_map_path=str(data_dir / "paladin_model_map.csv"),
|
| 242 |
aeon_results=aeon_results,
|
| 243 |
metastatic=(site_type == "Metastatic"),
|
| 244 |
batch_size=8,
|
|
|
|
| 344 |
|
| 345 |
# Step 3: Filter features using marker classifier (CPU operation)
|
| 346 |
start_time = pd.Timestamp.now()
|
| 347 |
+
data_dir = get_data_directory()
|
| 348 |
+
marker_classifier = pickle.load(open(data_dir / "marker_classifier.pkl", "rb"))
|
| 349 |
progress(0.35, desc="Filtering features with marker classifier")
|
| 350 |
logger.info("Filtering features with marker classifier")
|
| 351 |
_, filtered_coords = filter_features(
|
src/mosaic/gradio_app.py
CHANGED
|
@@ -12,6 +12,7 @@ import pandas as pd
|
|
| 12 |
from pathlib import Path
|
| 13 |
from huggingface_hub import snapshot_download
|
| 14 |
from loguru import logger
|
|
|
|
| 15 |
|
| 16 |
from mosaic.ui import launch_gradio
|
| 17 |
from mosaic.ui.app import set_cancer_subtype_maps
|
|
@@ -25,23 +26,69 @@ from mosaic.ui.utils import (
|
|
| 25 |
)
|
| 26 |
from mosaic.analysis import analyze_slide
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
def download_and_process_models():
|
| 30 |
"""Download models from HuggingFace and initialize cancer subtype mappings.
|
| 31 |
-
|
| 32 |
Downloads the Paladin and Aeon models from the PDM-Group HuggingFace repository
|
| 33 |
-
and creates mappings between cancer subtype
|
| 34 |
-
|
|
|
|
| 35 |
Returns:
|
| 36 |
tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
|
| 37 |
- cancer_subtype_name_map: Dict mapping display names to OncoTree codes
|
| 38 |
- reversed_cancer_subtype_name_map: Dict mapping OncoTree codes to display names
|
| 39 |
- cancer_subtypes: List of all supported cancer subtype codes
|
| 40 |
"""
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
model_map = pd.read_csv(
|
| 44 |
-
"
|
| 45 |
)
|
| 46 |
cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
|
| 47 |
cancer_subtype_name_map = {"Unknown": "UNK"}
|
|
|
|
| 12 |
from pathlib import Path
|
| 13 |
from huggingface_hub import snapshot_download
|
| 14 |
from loguru import logger
|
| 15 |
+
import os
|
| 16 |
|
| 17 |
from mosaic.ui import launch_gradio
|
| 18 |
from mosaic.ui.app import set_cancer_subtype_maps
|
|
|
|
| 26 |
)
|
| 27 |
from mosaic.analysis import analyze_slide
|
| 28 |
|
| 29 |
+
# Global variable to store the model data directory path
|
| 30 |
+
_MODEL_DATA_DIR = None
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_data_directory():
|
| 34 |
+
"""Get the directory containing model data files.
|
| 35 |
+
|
| 36 |
+
Returns the HuggingFace cache directory path for the model repository,
|
| 37 |
+
or falls back to local 'data/' directory if not yet downloaded.
|
| 38 |
+
|
| 39 |
+
Returns:
|
| 40 |
+
Path: Path to the model data directory
|
| 41 |
+
"""
|
| 42 |
+
global _MODEL_DATA_DIR
|
| 43 |
+
if _MODEL_DATA_DIR is not None:
|
| 44 |
+
return _MODEL_DATA_DIR
|
| 45 |
+
|
| 46 |
+
# Check if environment variable is set
|
| 47 |
+
if "MOSAIC_DATA_DIR" in os.environ:
|
| 48 |
+
_MODEL_DATA_DIR = Path(os.environ["MOSAIC_DATA_DIR"])
|
| 49 |
+
return _MODEL_DATA_DIR
|
| 50 |
+
|
| 51 |
+
# Check if local data/ directory exists (for development/backward compat)
|
| 52 |
+
local_data = Path("data")
|
| 53 |
+
if local_data.exists() and (local_data / "paladin_model_map.csv").exists():
|
| 54 |
+
_MODEL_DATA_DIR = local_data
|
| 55 |
+
return _MODEL_DATA_DIR
|
| 56 |
+
|
| 57 |
+
# Fall back to repo root data/ directory
|
| 58 |
+
_MODEL_DATA_DIR = local_data
|
| 59 |
+
return _MODEL_DATA_DIR
|
| 60 |
+
|
| 61 |
|
| 62 |
def download_and_process_models():
|
| 63 |
"""Download models from HuggingFace and initialize cancer subtype mappings.
|
| 64 |
+
|
| 65 |
Downloads the Paladin and Aeon models from the PDM-Group HuggingFace repository
|
| 66 |
+
to the HuggingFace cache directory and creates mappings between cancer subtype
|
| 67 |
+
names and OncoTree codes.
|
| 68 |
+
|
| 69 |
Returns:
|
| 70 |
tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
|
| 71 |
- cancer_subtype_name_map: Dict mapping display names to OncoTree codes
|
| 72 |
- reversed_cancer_subtype_name_map: Dict mapping OncoTree codes to display names
|
| 73 |
- cancer_subtypes: List of all supported cancer subtype codes
|
| 74 |
"""
|
| 75 |
+
global _MODEL_DATA_DIR
|
| 76 |
+
|
| 77 |
+
# Download to HF cache directory (not local_dir)
|
| 78 |
+
# This returns the path to the cached snapshot
|
| 79 |
+
logger.info("Downloading models from HuggingFace Hub to cache directory...")
|
| 80 |
+
cache_dir = snapshot_download(
|
| 81 |
+
repo_id="PDM-Group/paladin-aeon-models",
|
| 82 |
+
# No local_dir - use HF cache
|
| 83 |
+
)
|
| 84 |
+
_MODEL_DATA_DIR = Path(cache_dir)
|
| 85 |
+
logger.info(f"Models downloaded to: {_MODEL_DATA_DIR}")
|
| 86 |
+
|
| 87 |
+
# Also set environment variable for other modules to use
|
| 88 |
+
os.environ["MOSAIC_DATA_DIR"] = str(_MODEL_DATA_DIR)
|
| 89 |
|
| 90 |
model_map = pd.read_csv(
|
| 91 |
+
_MODEL_DATA_DIR / "paladin_model_map.csv",
|
| 92 |
)
|
| 93 |
cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
|
| 94 |
cancer_subtype_name_map = {"Unknown": "UNK"}
|
src/mosaic/inference/aeon.py
CHANGED
|
@@ -22,6 +22,7 @@ from mosaic.inference.data import (
|
|
| 22 |
)
|
| 23 |
|
| 24 |
from loguru import logger
|
|
|
|
| 25 |
|
| 26 |
# Cancer types excluded from prediction (too broad or ambiguous)
|
| 27 |
# These are used to mask out predictions for overly general cancer types
|
|
@@ -69,7 +70,8 @@ def run(
|
|
| 69 |
model.eval()
|
| 70 |
|
| 71 |
# Load the correct mapping from metadata for this model
|
| 72 |
-
|
|
|
|
| 73 |
with open(metadata_path) as f:
|
| 74 |
target_dict_str = f.read().strip().replace("'", '"')
|
| 75 |
target_dict = json.loads(target_dict_str)
|
|
|
|
| 22 |
)
|
| 23 |
|
| 24 |
from loguru import logger
|
| 25 |
+
from mosaic.gradio_app import get_data_directory
|
| 26 |
|
| 27 |
# Cancer types excluded from prediction (too broad or ambiguous)
|
| 28 |
# These are used to mask out predictions for overly general cancer types
|
|
|
|
| 70 |
model.eval()
|
| 71 |
|
| 72 |
# Load the correct mapping from metadata for this model
|
| 73 |
+
data_dir = get_data_directory()
|
| 74 |
+
metadata_path = data_dir / "metadata" / "target_dict.tsv"
|
| 75 |
with open(metadata_path) as f:
|
| 76 |
target_dict_str = f.read().strip().replace("'", '"')
|
| 77 |
target_dict = json.loads(target_dict_str)
|
src/mosaic/inference/data.py
CHANGED
|
@@ -13,6 +13,8 @@ import torch
|
|
| 13 |
from torch.utils.data import Dataset
|
| 14 |
import numpy as np
|
| 15 |
|
|
|
|
|
|
|
| 16 |
CANCER_TYPE_TO_INT_MAP = {
|
| 17 |
"AASTR": 0,
|
| 18 |
"ACC": 1,
|
|
@@ -219,10 +221,10 @@ def get_tissue_site_map():
|
|
| 219 |
"""
|
| 220 |
global _TISSUE_SITE_MAP
|
| 221 |
if _TISSUE_SITE_MAP is None:
|
| 222 |
-
from pathlib import Path
|
| 223 |
import pandas as pd
|
| 224 |
|
| 225 |
-
|
|
|
|
| 226 |
try:
|
| 227 |
df = pd.read_csv(csv_path)
|
| 228 |
except FileNotFoundError as e:
|
|
@@ -262,10 +264,10 @@ def get_sex_map():
|
|
| 262 |
"""
|
| 263 |
global _SEX_MAP
|
| 264 |
if _SEX_MAP is None:
|
| 265 |
-
from pathlib import Path
|
| 266 |
import pandas as pd
|
| 267 |
|
| 268 |
-
|
|
|
|
| 269 |
try:
|
| 270 |
df = pd.read_csv(csv_path)
|
| 271 |
except FileNotFoundError as e:
|
|
|
|
| 13 |
from torch.utils.data import Dataset
|
| 14 |
import numpy as np
|
| 15 |
|
| 16 |
+
from mosaic.gradio_app import get_data_directory
|
| 17 |
+
|
| 18 |
CANCER_TYPE_TO_INT_MAP = {
|
| 19 |
"AASTR": 0,
|
| 20 |
"ACC": 1,
|
|
|
|
| 221 |
"""
|
| 222 |
global _TISSUE_SITE_MAP
|
| 223 |
if _TISSUE_SITE_MAP is None:
|
|
|
|
| 224 |
import pandas as pd
|
| 225 |
|
| 226 |
+
data_dir = get_data_directory()
|
| 227 |
+
csv_path = data_dir / "tissue_site_original_to_idx.csv"
|
| 228 |
try:
|
| 229 |
df = pd.read_csv(csv_path)
|
| 230 |
except FileNotFoundError as e:
|
|
|
|
| 264 |
"""
|
| 265 |
global _SEX_MAP
|
| 266 |
if _SEX_MAP is None:
|
|
|
|
| 267 |
import pandas as pd
|
| 268 |
|
| 269 |
+
data_dir = get_data_directory()
|
| 270 |
+
csv_path = data_dir / "sex_original_to_idx.csv"
|
| 271 |
try:
|
| 272 |
df = pd.read_csv(csv_path)
|
| 273 |
except FileNotFoundError as e:
|
src/mosaic/ui/utils.py
CHANGED
|
@@ -13,6 +13,8 @@ import pandas as pd
|
|
| 13 |
import gradio as gr
|
| 14 |
import requests
|
| 15 |
|
|
|
|
|
|
|
| 16 |
# This path should be outside your project directory if running locally
|
| 17 |
TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()) / "mosaic_user_data"
|
| 18 |
|
|
@@ -42,8 +44,8 @@ def get_tissue_sites():
|
|
| 42 |
global tissue_site_list
|
| 43 |
if tissue_site_list is None:
|
| 44 |
try:
|
| 45 |
-
|
| 46 |
-
tissue_site_map_path =
|
| 47 |
df = pd.read_csv(tissue_site_map_path)
|
| 48 |
# Get unique tissue sites and sort them
|
| 49 |
tissue_site_list = ["Unknown"] + sorted(df["TISSUE_SITE"].unique().tolist())
|
|
|
|
| 13 |
import gradio as gr
|
| 14 |
import requests
|
| 15 |
|
| 16 |
+
from mosaic.gradio_app import get_data_directory
|
| 17 |
+
|
| 18 |
# This path should be outside your project directory if running locally
|
| 19 |
TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()) / "mosaic_user_data"
|
| 20 |
|
|
|
|
| 44 |
global tissue_site_list
|
| 45 |
if tissue_site_list is None:
|
| 46 |
try:
|
| 47 |
+
data_dir = get_data_directory()
|
| 48 |
+
tissue_site_map_path = data_dir / "tissue_site_original_to_idx.csv"
|
| 49 |
df = pd.read_csv(tissue_site_map_path)
|
| 50 |
# Get unique tissue sites and sort them
|
| 51 |
tissue_site_list = ["Unknown"] + sorted(df["TISSUE_SITE"].unique().tolist())
|