Spaces:
Sleeping
Sleeping
Make all GPU memory stats collection optional with try-except
Browse files
- Wrap all `torch.cuda.max_memory_allocated` and `torch.cuda.reset_peak_memory_stats` calls in try-except
- Prevents errors raised by the CUDA memory-stats calls from breaking execution
- Logs timing info even if memory stats fail
- Applied to CTransPath, Optimus, Aeon, and Paladin functions
- Ensures robustness on ZeroGPU and other GPU environments
- src/mosaic/analysis.py +48 -36
src/mosaic/analysis.py
CHANGED
|
@@ -117,16 +117,19 @@ def _extract_ctranspath_features(coords, slide_path, attrs, num_workers):
|
|
| 117 |
ctranspath_features = np.concatenate(all_features, axis=0)
|
| 118 |
|
| 119 |
end_time = pd.Timestamp.now()
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
if torch.cuda.is_available()
|
| 123 |
-
else 0
|
| 124 |
-
)
|
| 125 |
-
logger.info(
|
| 126 |
-
f"CTransPath extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
|
| 127 |
-
)
|
| 128 |
if torch.cuda.is_available():
|
| 129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
|
| 131 |
return ctranspath_features, coords
|
| 132 |
|
|
@@ -209,16 +212,19 @@ def _extract_optimus_features(filtered_coords, slide_path, attrs, num_workers):
|
|
| 209 |
features = np.concatenate(all_features, axis=0)
|
| 210 |
|
| 211 |
end_time = pd.Timestamp.now()
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
if torch.cuda.is_available()
|
| 215 |
-
else 0
|
| 216 |
-
)
|
| 217 |
-
logger.info(
|
| 218 |
-
f"Optimus extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
|
| 219 |
-
)
|
| 220 |
if torch.cuda.is_available():
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
return features
|
| 224 |
|
|
@@ -253,16 +259,19 @@ def _run_aeon_inference(features, site_type, num_workers):
|
|
| 253 |
use_cpu=False,
|
| 254 |
)
|
| 255 |
end_time = pd.Timestamp.now()
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
if torch.cuda.is_available()
|
| 259 |
-
else 0
|
| 260 |
-
)
|
| 261 |
-
logger.info(
|
| 262 |
-
f"Aeon inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
|
| 263 |
-
)
|
| 264 |
if torch.cuda.is_available():
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
| 267 |
return aeon_results
|
| 268 |
|
|
@@ -299,16 +308,19 @@ def _run_paladin_inference(features, aeon_results, site_type, num_workers):
|
|
| 299 |
use_cpu=False,
|
| 300 |
)
|
| 301 |
end_time = pd.Timestamp.now()
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
if torch.cuda.is_available()
|
| 305 |
-
else 0
|
| 306 |
-
)
|
| 307 |
-
logger.info(
|
| 308 |
-
f"Paladin inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
|
| 309 |
-
)
|
| 310 |
if torch.cuda.is_available():
|
| 311 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 312 |
|
| 313 |
return paladin_results
|
| 314 |
|
|
|
|
| 117 |
ctranspath_features = np.concatenate(all_features, axis=0)
|
| 118 |
|
| 119 |
end_time = pd.Timestamp.now()
|
| 120 |
+
|
| 121 |
+
# Log memory stats if CUDA is available
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
if torch.cuda.is_available():
|
| 123 |
+
try:
|
| 124 |
+
max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
|
| 125 |
+
logger.info(
|
| 126 |
+
f"CTransPath extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
|
| 127 |
+
)
|
| 128 |
+
torch.cuda.reset_peak_memory_stats()
|
| 129 |
+
except Exception:
|
| 130 |
+
logger.info(f"CTransPath extraction took {end_time - start_time}")
|
| 131 |
+
else:
|
| 132 |
+
logger.info(f"CTransPath extraction took {end_time - start_time}")
|
| 133 |
|
| 134 |
return ctranspath_features, coords
|
| 135 |
|
|
|
|
| 212 |
features = np.concatenate(all_features, axis=0)
|
| 213 |
|
| 214 |
end_time = pd.Timestamp.now()
|
| 215 |
+
|
| 216 |
+
# Log memory stats if CUDA is available
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
if torch.cuda.is_available():
|
| 218 |
+
try:
|
| 219 |
+
max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
|
| 220 |
+
logger.info(
|
| 221 |
+
f"Optimus extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
|
| 222 |
+
)
|
| 223 |
+
torch.cuda.reset_peak_memory_stats()
|
| 224 |
+
except Exception:
|
| 225 |
+
logger.info(f"Optimus extraction took {end_time - start_time}")
|
| 226 |
+
else:
|
| 227 |
+
logger.info(f"Optimus extraction took {end_time - start_time}")
|
| 228 |
|
| 229 |
return features
|
| 230 |
|
|
|
|
| 259 |
use_cpu=False,
|
| 260 |
)
|
| 261 |
end_time = pd.Timestamp.now()
|
| 262 |
+
|
| 263 |
+
# Log memory stats if CUDA is available
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
if torch.cuda.is_available():
|
| 265 |
+
try:
|
| 266 |
+
max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
|
| 267 |
+
logger.info(
|
| 268 |
+
f"Aeon inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
|
| 269 |
+
)
|
| 270 |
+
torch.cuda.reset_peak_memory_stats()
|
| 271 |
+
except Exception:
|
| 272 |
+
logger.info(f"Aeon inference took {end_time - start_time}")
|
| 273 |
+
else:
|
| 274 |
+
logger.info(f"Aeon inference took {end_time - start_time}")
|
| 275 |
|
| 276 |
return aeon_results
|
| 277 |
|
|
|
|
| 308 |
use_cpu=False,
|
| 309 |
)
|
| 310 |
end_time = pd.Timestamp.now()
|
| 311 |
+
|
| 312 |
+
# Log memory stats if CUDA is available
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
if torch.cuda.is_available():
|
| 314 |
+
try:
|
| 315 |
+
max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
|
| 316 |
+
logger.info(
|
| 317 |
+
f"Paladin inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
|
| 318 |
+
)
|
| 319 |
+
torch.cuda.reset_peak_memory_stats()
|
| 320 |
+
except Exception:
|
| 321 |
+
logger.info(f"Paladin inference took {end_time - start_time}")
|
| 322 |
+
else:
|
| 323 |
+
logger.info(f"Paladin inference took {end_time - start_time}")
|
| 324 |
|
| 325 |
return paladin_results
|
| 326 |
|