raylim Claude Sonnet 4.5 committed on
Commit
751062d
·
unverified ·
1 Parent(s): c5e7bb2

Fix model file location to use HuggingFace cache directory

Browse files

Changed all model file references from hardcoded local data/ directory to
use HuggingFace cache directory. This fixes the issue where files couldn't
be found after downloading from HuggingFace Hub.

Changes:
- Add get_data_directory() function to gradio_app.py that returns the cached
path if one is set, then checks the MOSAIC_DATA_DIR environment variable, and
finally falls back to local data/ for development
- Update gradio_app.py to download to HF cache (removed local_dir parameter)
and set MOSAIC_DATA_DIR environment variable
- Update all model file references in aeon.py, data.py, utils.py, and
analysis.py to use get_data_directory() instead of hardcoded paths

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

src/mosaic/analysis.py CHANGED
@@ -61,6 +61,7 @@ from mussel.utils.segment import draw_slide_mask
61
  from mussel.cli.tessellate import BiopsySegConfig, ResectionSegConfig, TcgaSegConfig
62
  from loguru import logger
63
  from mosaic.inference import run_aeon, run_paladin
 
64
 
65
  # Log hardware detection at module load
66
  logger.info(f"Hardware: {GPU_TYPE} | batch_size={DEFAULT_BATCH_SIZE}, num_workers={DEFAULT_NUM_WORKERS}")
@@ -92,13 +93,14 @@ def _extract_ctranspath_features(coords, slide_path, attrs, num_workers):
92
  logger.info(f"Running CTransPath with {num_workers} workers")
93
 
94
  start_time = pd.Timestamp.now()
95
-
 
96
  ctranspath_features, _ = get_features(
97
  coords,
98
  slide_path,
99
  attrs,
100
  model_type=ModelType.CTRANSPATH,
101
- model_path="data/ctranspath.pth",
102
  num_workers=num_workers,
103
  batch_size=batch_size,
104
  use_gpu=True,
@@ -136,13 +138,14 @@ def _extract_optimus_features(filtered_coords, slide_path, attrs, num_workers):
136
  logger.info(f"Running Optimus with {num_workers} workers")
137
 
138
  start_time = pd.Timestamp.now()
139
-
 
140
  features, _ = get_features(
141
  filtered_coords,
142
  slide_path,
143
  attrs,
144
  model_type=ModelType.OPTIMUS,
145
- model_path="data/optimus.pkl",
146
  num_workers=num_workers,
147
  batch_size=batch_size,
148
  use_gpu=True,
@@ -179,9 +182,10 @@ def _run_aeon_inference(features, site_type, num_workers, sex=None, tissue_site_
179
 
180
  start_time = pd.Timestamp.now()
181
  logger.info("Running Aeon for cancer subtype inference")
 
182
  aeon_results, _ = run_aeon(
183
  features=features,
184
- model_path="data/aeon_model.pkl",
185
  metastatic=(site_type == "Metastatic"),
186
  batch_size=8,
187
  num_workers=num_workers,
@@ -231,9 +235,10 @@ def _run_paladin_inference(features, aeon_results, site_type, num_workers):
231
 
232
  start_time = pd.Timestamp.now()
233
  logger.info("Running Paladin for biomarker inference")
 
234
  paladin_results = run_paladin(
235
  features=features,
236
- model_map_path="data/paladin_model_map.csv",
237
  aeon_results=aeon_results,
238
  metastatic=(site_type == "Metastatic"),
239
  batch_size=8,
@@ -339,7 +344,8 @@ def _run_inference_pipeline_impl(
339
 
340
  # Step 3: Filter features using marker classifier (CPU operation)
341
  start_time = pd.Timestamp.now()
342
- marker_classifier = pickle.load(open("data/marker_classifier.pkl", "rb"))
 
343
  progress(0.35, desc="Filtering features with marker classifier")
344
  logger.info("Filtering features with marker classifier")
345
  _, filtered_coords = filter_features(
 
61
  from mussel.cli.tessellate import BiopsySegConfig, ResectionSegConfig, TcgaSegConfig
62
  from loguru import logger
63
  from mosaic.inference import run_aeon, run_paladin
64
+ from mosaic.gradio_app import get_data_directory
65
 
66
  # Log hardware detection at module load
67
  logger.info(f"Hardware: {GPU_TYPE} | batch_size={DEFAULT_BATCH_SIZE}, num_workers={DEFAULT_NUM_WORKERS}")
 
93
  logger.info(f"Running CTransPath with {num_workers} workers")
94
 
95
  start_time = pd.Timestamp.now()
96
+
97
+ data_dir = get_data_directory()
98
  ctranspath_features, _ = get_features(
99
  coords,
100
  slide_path,
101
  attrs,
102
  model_type=ModelType.CTRANSPATH,
103
+ model_path=str(data_dir / "ctranspath.pth"),
104
  num_workers=num_workers,
105
  batch_size=batch_size,
106
  use_gpu=True,
 
138
  logger.info(f"Running Optimus with {num_workers} workers")
139
 
140
  start_time = pd.Timestamp.now()
141
+
142
+ data_dir = get_data_directory()
143
  features, _ = get_features(
144
  filtered_coords,
145
  slide_path,
146
  attrs,
147
  model_type=ModelType.OPTIMUS,
148
+ model_path=str(data_dir / "optimus.pkl"),
149
  num_workers=num_workers,
150
  batch_size=batch_size,
151
  use_gpu=True,
 
182
 
183
  start_time = pd.Timestamp.now()
184
  logger.info("Running Aeon for cancer subtype inference")
185
+ data_dir = get_data_directory()
186
  aeon_results, _ = run_aeon(
187
  features=features,
188
+ model_path=str(data_dir / "aeon_model.pkl"),
189
  metastatic=(site_type == "Metastatic"),
190
  batch_size=8,
191
  num_workers=num_workers,
 
235
 
236
  start_time = pd.Timestamp.now()
237
  logger.info("Running Paladin for biomarker inference")
238
+ data_dir = get_data_directory()
239
  paladin_results = run_paladin(
240
  features=features,
241
+ model_map_path=str(data_dir / "paladin_model_map.csv"),
242
  aeon_results=aeon_results,
243
  metastatic=(site_type == "Metastatic"),
244
  batch_size=8,
 
344
 
345
  # Step 3: Filter features using marker classifier (CPU operation)
346
  start_time = pd.Timestamp.now()
347
+ data_dir = get_data_directory()
348
+ marker_classifier = pickle.load(open(data_dir / "marker_classifier.pkl", "rb"))
349
  progress(0.35, desc="Filtering features with marker classifier")
350
  logger.info("Filtering features with marker classifier")
351
  _, filtered_coords = filter_features(
src/mosaic/gradio_app.py CHANGED
@@ -12,6 +12,7 @@ import pandas as pd
12
  from pathlib import Path
13
  from huggingface_hub import snapshot_download
14
  from loguru import logger
 
15
 
16
  from mosaic.ui import launch_gradio
17
  from mosaic.ui.app import set_cancer_subtype_maps
@@ -25,23 +26,69 @@ from mosaic.ui.utils import (
25
  )
26
  from mosaic.analysis import analyze_slide
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
  def download_and_process_models():
30
  """Download models from HuggingFace and initialize cancer subtype mappings.
31
-
32
  Downloads the Paladin and Aeon models from the PDM-Group HuggingFace repository
33
- and creates mappings between cancer subtype names and OncoTree codes.
34
-
 
35
  Returns:
36
  tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
37
  - cancer_subtype_name_map: Dict mapping display names to OncoTree codes
38
  - reversed_cancer_subtype_name_map: Dict mapping OncoTree codes to display names
39
  - cancer_subtypes: List of all supported cancer subtype codes
40
  """
41
- snapshot_download(repo_id="PDM-Group/paladin-aeon-models", local_dir="data")
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  model_map = pd.read_csv(
44
- "data/paladin_model_map.csv",
45
  )
46
  cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
47
  cancer_subtype_name_map = {"Unknown": "UNK"}
 
12
  from pathlib import Path
13
  from huggingface_hub import snapshot_download
14
  from loguru import logger
15
+ import os
16
 
17
  from mosaic.ui import launch_gradio
18
  from mosaic.ui.app import set_cancer_subtype_maps
 
26
  )
27
  from mosaic.analysis import analyze_slide
28
 
29
# Cached path of the model data directory. It is filled in lazily by
# get_data_directory() and assigned directly by download_and_process_models()
# once the HuggingFace snapshot has been fetched.
_MODEL_DATA_DIR = None


def get_data_directory():
    """Return the directory containing the model data files.

    Resolution order:
      1. The path already cached in ``_MODEL_DATA_DIR``.
      2. The ``MOSAIC_DATA_DIR`` environment variable, when it is set.
      3. The relative ``data/`` directory (development / backward compat).

    The resolved path is stored in ``_MODEL_DATA_DIR``, so later changes to
    the environment variable do not affect subsequent calls.

    Returns:
        Path: Path to the model data directory. The path is not checked for
        existence; callers get ``data/`` even when it has not been created.
    """
    global _MODEL_DATA_DIR
    if _MODEL_DATA_DIR is None:
        env_dir = os.environ.get("MOSAIC_DATA_DIR")
        # The previous version probed data/ for paladin_model_map.csv, but
        # both outcomes of that probe returned Path("data"), so the check
        # was dropped without changing the result.
        _MODEL_DATA_DIR = Path(env_dir) if env_dir is not None else Path("data")
    return _MODEL_DATA_DIR
60
+
61
 
62
  def download_and_process_models():
63
  """Download models from HuggingFace and initialize cancer subtype mappings.
64
+
65
  Downloads the Paladin and Aeon models from the PDM-Group HuggingFace repository
66
+ to the HuggingFace cache directory and creates mappings between cancer subtype
67
+ names and OncoTree codes.
68
+
69
  Returns:
70
  tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
71
  - cancer_subtype_name_map: Dict mapping display names to OncoTree codes
72
  - reversed_cancer_subtype_name_map: Dict mapping OncoTree codes to display names
73
  - cancer_subtypes: List of all supported cancer subtype codes
74
  """
75
+ global _MODEL_DATA_DIR
76
+
77
+ # Download to HF cache directory (not local_dir)
78
+ # This returns the path to the cached snapshot
79
+ logger.info("Downloading models from HuggingFace Hub to cache directory...")
80
+ cache_dir = snapshot_download(
81
+ repo_id="PDM-Group/paladin-aeon-models",
82
+ # No local_dir - use HF cache
83
+ )
84
+ _MODEL_DATA_DIR = Path(cache_dir)
85
+ logger.info(f"Models downloaded to: {_MODEL_DATA_DIR}")
86
+
87
+ # Also set environment variable for other modules to use
88
+ os.environ["MOSAIC_DATA_DIR"] = str(_MODEL_DATA_DIR)
89
 
90
  model_map = pd.read_csv(
91
+ _MODEL_DATA_DIR / "paladin_model_map.csv",
92
  )
93
  cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
94
  cancer_subtype_name_map = {"Unknown": "UNK"}
src/mosaic/inference/aeon.py CHANGED
@@ -22,6 +22,7 @@ from mosaic.inference.data import (
22
  )
23
 
24
  from loguru import logger
 
25
 
26
  # Cancer types excluded from prediction (too broad or ambiguous)
27
  # These are used to mask out predictions for overly general cancer types
@@ -69,7 +70,8 @@ def run(
69
  model.eval()
70
 
71
  # Load the correct mapping from metadata for this model
72
- metadata_path = Path(__file__).parent.parent.parent.parent / "data" / "metadata" / "target_dict.tsv"
 
73
  with open(metadata_path) as f:
74
  target_dict_str = f.read().strip().replace("'", '"')
75
  target_dict = json.loads(target_dict_str)
 
22
  )
23
 
24
  from loguru import logger
25
+ from mosaic.gradio_app import get_data_directory
26
 
27
  # Cancer types excluded from prediction (too broad or ambiguous)
28
  # These are used to mask out predictions for overly general cancer types
 
70
  model.eval()
71
 
72
  # Load the correct mapping from metadata for this model
73
+ data_dir = get_data_directory()
74
+ metadata_path = data_dir / "metadata" / "target_dict.tsv"
75
  with open(metadata_path) as f:
76
  target_dict_str = f.read().strip().replace("'", '"')
77
  target_dict = json.loads(target_dict_str)
src/mosaic/inference/data.py CHANGED
@@ -13,6 +13,8 @@ import torch
13
  from torch.utils.data import Dataset
14
  import numpy as np
15
 
 
 
16
  CANCER_TYPE_TO_INT_MAP = {
17
  "AASTR": 0,
18
  "ACC": 1,
@@ -219,10 +221,10 @@ def get_tissue_site_map():
219
  """
220
  global _TISSUE_SITE_MAP
221
  if _TISSUE_SITE_MAP is None:
222
- from pathlib import Path
223
  import pandas as pd
224
 
225
- csv_path = Path(__file__).parent.parent.parent.parent / "data" / "tissue_site_original_to_idx.csv"
 
226
  try:
227
  df = pd.read_csv(csv_path)
228
  except FileNotFoundError as e:
@@ -262,10 +264,10 @@ def get_sex_map():
262
  """
263
  global _SEX_MAP
264
  if _SEX_MAP is None:
265
- from pathlib import Path
266
  import pandas as pd
267
 
268
- csv_path = Path(__file__).parent.parent.parent.parent / "data" / "sex_original_to_idx.csv"
 
269
  try:
270
  df = pd.read_csv(csv_path)
271
  except FileNotFoundError as e:
 
13
  from torch.utils.data import Dataset
14
  import numpy as np
15
 
16
+ from mosaic.gradio_app import get_data_directory
17
+
18
  CANCER_TYPE_TO_INT_MAP = {
19
  "AASTR": 0,
20
  "ACC": 1,
 
221
  """
222
  global _TISSUE_SITE_MAP
223
  if _TISSUE_SITE_MAP is None:
 
224
  import pandas as pd
225
 
226
+ data_dir = get_data_directory()
227
+ csv_path = data_dir / "tissue_site_original_to_idx.csv"
228
  try:
229
  df = pd.read_csv(csv_path)
230
  except FileNotFoundError as e:
 
264
  """
265
  global _SEX_MAP
266
  if _SEX_MAP is None:
 
267
  import pandas as pd
268
 
269
+ data_dir = get_data_directory()
270
+ csv_path = data_dir / "sex_original_to_idx.csv"
271
  try:
272
  df = pd.read_csv(csv_path)
273
  except FileNotFoundError as e:
src/mosaic/ui/utils.py CHANGED
@@ -13,6 +13,8 @@ import pandas as pd
13
  import gradio as gr
14
  import requests
15
 
 
 
16
  # This path should be outside your project directory if running locally
17
  TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()) / "mosaic_user_data"
18
 
@@ -42,8 +44,8 @@ def get_tissue_sites():
42
  global tissue_site_list
43
  if tissue_site_list is None:
44
  try:
45
- current_dir = Path(__file__).parent.parent.parent.parent
46
- tissue_site_map_path = current_dir / "data" / "tissue_site_original_to_idx.csv"
47
  df = pd.read_csv(tissue_site_map_path)
48
  # Get unique tissue sites and sort them
49
  tissue_site_list = ["Unknown"] + sorted(df["TISSUE_SITE"].unique().tolist())
 
13
  import gradio as gr
14
  import requests
15
 
16
+ from mosaic.gradio_app import get_data_directory
17
+
18
  # This path should be outside your project directory if running locally
19
  TEMP_USER_DATA_DIR = Path(tempfile.gettempdir()) / "mosaic_user_data"
20
 
 
44
  global tissue_site_list
45
  if tissue_site_list is None:
46
  try:
47
+ data_dir = get_data_directory()
48
+ tissue_site_map_path = data_dir / "tissue_site_original_to_idx.csv"
49
  df = pd.read_csv(tissue_site_map_path)
50
  # Get unique tissue sites and sort them
51
  tissue_site_list = ["Unknown"] + sorted(df["TISSUE_SITE"].unique().tolist())