raylim committed on
Commit
c6bd865
·
unverified ·
1 Parent(s): aafc601

Make all GPU memory stats collection optional with try-except

Browse files

- Wrap all max_memory_allocated and reset_peak_memory_stats calls in try-except
- Prevents any CUDA-related errors from breaking execution
- Logs timing info even if memory stats fail
- Applied to CTransPath, Optimus, Aeon, and Paladin functions
- Ensures robustness on ZeroGPU and other GPU environments

Files changed (1) hide show
  1. src/mosaic/analysis.py +48 -36
src/mosaic/analysis.py CHANGED
@@ -117,16 +117,19 @@ def _extract_ctranspath_features(coords, slide_path, attrs, num_workers):
117
  ctranspath_features = np.concatenate(all_features, axis=0)
118
 
119
  end_time = pd.Timestamp.now()
120
- max_gpu_memory = (
121
- torch.cuda.max_memory_allocated() / (1024**3)
122
- if torch.cuda.is_available()
123
- else 0
124
- )
125
- logger.info(
126
- f"CTransPath extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
127
- )
128
  if torch.cuda.is_available():
129
- torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
130
 
131
  return ctranspath_features, coords
132
 
@@ -209,16 +212,19 @@ def _extract_optimus_features(filtered_coords, slide_path, attrs, num_workers):
209
  features = np.concatenate(all_features, axis=0)
210
 
211
  end_time = pd.Timestamp.now()
212
- max_gpu_memory = (
213
- torch.cuda.max_memory_allocated() / (1024**3)
214
- if torch.cuda.is_available()
215
- else 0
216
- )
217
- logger.info(
218
- f"Optimus extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
219
- )
220
  if torch.cuda.is_available():
221
- torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
222
 
223
  return features
224
 
@@ -253,16 +259,19 @@ def _run_aeon_inference(features, site_type, num_workers):
253
  use_cpu=False,
254
  )
255
  end_time = pd.Timestamp.now()
256
- max_gpu_memory = (
257
- torch.cuda.max_memory_allocated() / (1024**3)
258
- if torch.cuda.is_available()
259
- else 0
260
- )
261
- logger.info(
262
- f"Aeon inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
263
- )
264
  if torch.cuda.is_available():
265
- torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
266
 
267
  return aeon_results
268
 
@@ -299,16 +308,19 @@ def _run_paladin_inference(features, aeon_results, site_type, num_workers):
299
  use_cpu=False,
300
  )
301
  end_time = pd.Timestamp.now()
302
- max_gpu_memory = (
303
- torch.cuda.max_memory_allocated() / (1024**3)
304
- if torch.cuda.is_available()
305
- else 0
306
- )
307
- logger.info(
308
- f"Paladin inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
309
- )
310
  if torch.cuda.is_available():
311
- torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
312
 
313
  return paladin_results
314
 
 
117
  ctranspath_features = np.concatenate(all_features, axis=0)
118
 
119
  end_time = pd.Timestamp.now()
120
+
121
+ # Log memory stats if CUDA is available
 
 
 
 
 
 
122
  if torch.cuda.is_available():
123
+ try:
124
+ max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
125
+ logger.info(
126
+ f"CTransPath extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
127
+ )
128
+ torch.cuda.reset_peak_memory_stats()
129
+ except Exception:
130
+ logger.info(f"CTransPath extraction took {end_time - start_time}")
131
+ else:
132
+ logger.info(f"CTransPath extraction took {end_time - start_time}")
133
 
134
  return ctranspath_features, coords
135
 
 
212
  features = np.concatenate(all_features, axis=0)
213
 
214
  end_time = pd.Timestamp.now()
215
+
216
+ # Log memory stats if CUDA is available
 
 
 
 
 
 
217
  if torch.cuda.is_available():
218
+ try:
219
+ max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
220
+ logger.info(
221
+ f"Optimus extraction took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
222
+ )
223
+ torch.cuda.reset_peak_memory_stats()
224
+ except Exception:
225
+ logger.info(f"Optimus extraction took {end_time - start_time}")
226
+ else:
227
+ logger.info(f"Optimus extraction took {end_time - start_time}")
228
 
229
  return features
230
 
 
259
  use_cpu=False,
260
  )
261
  end_time = pd.Timestamp.now()
262
+
263
+ # Log memory stats if CUDA is available
 
 
 
 
 
 
264
  if torch.cuda.is_available():
265
+ try:
266
+ max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
267
+ logger.info(
268
+ f"Aeon inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
269
+ )
270
+ torch.cuda.reset_peak_memory_stats()
271
+ except Exception:
272
+ logger.info(f"Aeon inference took {end_time - start_time}")
273
+ else:
274
+ logger.info(f"Aeon inference took {end_time - start_time}")
275
 
276
  return aeon_results
277
 
 
308
  use_cpu=False,
309
  )
310
  end_time = pd.Timestamp.now()
311
+
312
+ # Log memory stats if CUDA is available
 
 
 
 
 
 
313
  if torch.cuda.is_available():
314
+ try:
315
+ max_gpu_memory = torch.cuda.max_memory_allocated() / (1024**3)
316
+ logger.info(
317
+ f"Paladin inference took {end_time - start_time} and used {max_gpu_memory:.2f} GB GPU memory"
318
+ )
319
+ torch.cuda.reset_peak_memory_stats()
320
+ except Exception:
321
+ logger.info(f"Paladin inference took {end_time - start_time}")
322
+ else:
323
+ logger.info(f"Paladin inference took {end_time - start_time}")
324
 
325
  return paladin_results
326