Spaces:
Sleeping
Sleeping
Christopher Tan committed on
Commit ·
d2fd76d
1
Parent(s): 8fbc522
clearing cache before switching models
Browse files- __pycache__/app.cpython-313.pyc +0 -0
- __pycache__/inference_openvla.cpython-313.pyc +0 -0
- app.py +34 -0
- inference_openvla.py +33 -2
__pycache__/app.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/app.cpython-313.pyc and b/__pycache__/app.cpython-313.pyc differ
|
|
|
__pycache__/inference_openvla.cpython-313.pyc
CHANGED
|
Binary files a/__pycache__/inference_openvla.cpython-313.pyc and b/__pycache__/inference_openvla.cpython-313.pyc differ
|
|
|
app.py
CHANGED
|
@@ -598,6 +598,34 @@ def cleanup_workers():
|
|
| 598 |
atexit.register(cleanup_workers)
|
| 599 |
|
| 600 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
@dataclasses.dataclass
|
| 602 |
class InferenceRequest:
|
| 603 |
"""Normalized payload for invoking model backends from the UI."""
|
|
@@ -632,6 +660,9 @@ def run_pi0_inference(request: InferenceRequest) -> Tuple[Optional[str], str]:
|
|
| 632 |
"""Dispatch OpenPI inference to subprocess"""
|
| 633 |
model_key = "openpi" # Define model_key for this function
|
| 634 |
try:
|
|
|
|
|
|
|
|
|
|
| 635 |
request.progress(0, desc="Starting OpenPI worker...")
|
| 636 |
worker = get_inference_worker(model_key)
|
| 637 |
|
|
@@ -774,6 +805,9 @@ def run_openvla_inference(request: InferenceRequest) -> Tuple[Optional[str], str
|
|
| 774 |
"""Dispatch OpenVLA inference to subprocess"""
|
| 775 |
model_key = "openvla" # Define model_key for this function
|
| 776 |
try:
|
|
|
|
|
|
|
|
|
|
| 777 |
request.progress(0, desc="Starting OpenVLA worker...")
|
| 778 |
worker = get_inference_worker(model_key)
|
| 779 |
|
|
|
|
| 598 |
atexit.register(cleanup_workers)
|
| 599 |
|
| 600 |
|
def terminate_other_worker(current_model_key: str):
    """Terminate the other model's worker to free GPU memory when switching models."""
    global _INFERENCE_WORKERS, _WORKER_STDERR

    # Only two backends exist, so the "other" one is whichever we are not.
    other_model_key = "openvla" if current_model_key == "openpi" else "openpi"

    # poll() is None while the subprocess is still alive; nothing to do otherwise.
    other_worker = _INFERENCE_WORKERS.get(other_model_key)
    if other_worker is None or other_worker.poll() is not None:
        return

    print(f"Terminating {other_model_key} worker to free GPU memory for {current_model_key}...", flush=True)
    try:
        other_worker.terminate()
        try:
            other_worker.wait(timeout=5)
            print(f"✓ {other_model_key} worker terminated successfully", flush=True)
        except subprocess.TimeoutExpired:
            # Graceful terminate timed out — escalate to SIGKILL and reap.
            print(f"⚠️ {other_model_key} worker didn't terminate gracefully, killing...", flush=True)
            other_worker.kill()
            other_worker.wait()
    except Exception as e:
        print(f"⚠️ Error terminating {other_model_key} worker: {e}", flush=True)
    finally:
        # Mark as terminated
        _INFERENCE_WORKERS[other_model_key] = None
        _WORKER_STDERR[other_model_key] = []  # Clear stderr buffer
| 629 |
@dataclasses.dataclass
|
| 630 |
class InferenceRequest:
|
| 631 |
"""Normalized payload for invoking model backends from the UI."""
|
|
|
|
| 660 |
"""Dispatch OpenPI inference to subprocess"""
|
| 661 |
model_key = "openpi" # Define model_key for this function
|
| 662 |
try:
|
| 663 |
+
# Terminate OpenVLA worker if running to free GPU memory
|
| 664 |
+
terminate_other_worker(model_key)
|
| 665 |
+
|
| 666 |
request.progress(0, desc="Starting OpenPI worker...")
|
| 667 |
worker = get_inference_worker(model_key)
|
| 668 |
|
|
|
|
| 805 |
"""Dispatch OpenVLA inference to subprocess"""
|
| 806 |
model_key = "openvla" # Define model_key for this function
|
| 807 |
try:
|
| 808 |
+
# Terminate OpenPI worker if running to free GPU memory
|
| 809 |
+
terminate_other_worker(model_key)
|
| 810 |
+
|
| 811 |
request.progress(0, desc="Starting OpenVLA worker...")
|
| 812 |
worker = get_inference_worker(model_key)
|
| 813 |
|
inference_openvla.py
CHANGED
|
@@ -29,6 +29,8 @@ try:
|
|
| 29 |
os.environ["DISPLAY"] = ":99"
|
| 30 |
os.environ["LIBGL_ALWAYS_SOFTWARE"] = "1"
|
| 31 |
os.environ["GALLIUM_DRIVER"] = "llvmpipe"
|
|
|
|
|
|
|
| 32 |
# Debug: verify environment variables are set
|
| 33 |
print(f"DEBUG: MUJOCO_GL={os.environ.get('MUJOCO_GL')}, PYOPENGL_PLATFORM={os.environ.get('PYOPENGL_PLATFORM')}", file=sys.stderr, flush=True)
|
| 34 |
except Exception as e:
|
|
@@ -203,6 +205,30 @@ DEFAULT_DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
| 203 |
DEFAULT_DOWNSAMPLE_RATE = 25
|
| 204 |
CAMERA_RESOLUTION = (256, 256)
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
# Environment registry
|
| 207 |
_ENV_CLASSES = {
|
| 208 |
"CubeHandover": (CubeHandover, "handover the rod from one hand to the other hand"),
|
|
@@ -240,8 +266,7 @@ _ENV_CLASSES = {
|
|
| 240 |
"StackTwoBlocksPositionOrientation": (StackTwoBlocksPositionAndOrientation, "stack the two cubes")
|
| 241 |
}
|
| 242 |
|
| 243 |
-
#
|
| 244 |
-
_MODEL_CACHE = {}
|
| 245 |
|
| 246 |
|
| 247 |
def get_checkpoint_path(task_name: str, ckpt_path: Optional[str] = None) -> str:
|
|
@@ -330,6 +355,12 @@ def load_vla_model(ckpt_path: str, device: str = DEFAULT_DEVICE) -> Tuple[AutoPr
|
|
| 330 |
if ckpt_path in _MODEL_CACHE:
|
| 331 |
return _MODEL_CACHE[ckpt_path]
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
if not os.path.exists(ckpt_path):
|
| 334 |
raise FileNotFoundError(f"Checkpoint path does not exist: {ckpt_path}")
|
| 335 |
|
|
|
|
| 29 |
os.environ["DISPLAY"] = ":99"
|
| 30 |
os.environ["LIBGL_ALWAYS_SOFTWARE"] = "1"
|
| 31 |
os.environ["GALLIUM_DRIVER"] = "llvmpipe"
|
| 32 |
+
# PyTorch CUDA memory allocator settings to reduce fragmentation
|
| 33 |
+
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
|
| 34 |
# Debug: verify environment variables are set
|
| 35 |
print(f"DEBUG: MUJOCO_GL={os.environ.get('MUJOCO_GL')}, PYOPENGL_PLATFORM={os.environ.get('PYOPENGL_PLATFORM')}", file=sys.stderr, flush=True)
|
| 36 |
except Exception as e:
|
|
|
|
| 205 |
DEFAULT_DOWNSAMPLE_RATE = 25
|
| 206 |
CAMERA_RESOLUTION = (256, 256)
|
| 207 |
|
| 208 |
+
# Model cache
|
| 209 |
+
_MODEL_CACHE: Dict[str, Tuple[AutoProcessor, AutoModelForVision2Seq]] = {}
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
def clear_gpu_memory():
|
| 213 |
+
"""Clear PyTorch GPU memory and model cache."""
|
| 214 |
+
global _MODEL_CACHE
|
| 215 |
+
|
| 216 |
+
# Clear the model cache
|
| 217 |
+
if _MODEL_CACHE:
|
| 218 |
+
print(f"Clearing {len(_MODEL_CACHE)} cached model(s) to free GPU memory...", file=sys.stderr, flush=True)
|
| 219 |
+
_MODEL_CACHE.clear()
|
| 220 |
+
|
| 221 |
+
# Clear PyTorch CUDA cache
|
| 222 |
+
try:
|
| 223 |
+
import gc
|
| 224 |
+
if torch.cuda.is_available():
|
| 225 |
+
torch.cuda.empty_cache()
|
| 226 |
+
torch.cuda.synchronize()
|
| 227 |
+
gc.collect()
|
| 228 |
+
print("GPU memory cleared successfully", file=sys.stderr, flush=True)
|
| 229 |
+
except Exception as e:
|
| 230 |
+
print(f"Warning: Could not fully clear GPU memory: {e}", file=sys.stderr, flush=True)
|
| 231 |
+
|
| 232 |
# Environment registry
|
| 233 |
_ENV_CLASSES = {
|
| 234 |
"CubeHandover": (CubeHandover, "handover the rod from one hand to the other hand"),
|
|
|
|
| 266 |
"StackTwoBlocksPositionOrientation": (StackTwoBlocksPositionAndOrientation, "stack the two cubes")
|
| 267 |
}
|
| 268 |
|
| 269 |
+
# Model cache is defined above (line 209)
|
|
|
|
| 270 |
|
| 271 |
|
| 272 |
def get_checkpoint_path(task_name: str, ckpt_path: Optional[str] = None) -> str:
|
|
|
|
| 355 |
if ckpt_path in _MODEL_CACHE:
|
| 356 |
return _MODEL_CACHE[ckpt_path]
|
| 357 |
|
| 358 |
+
# Clear GPU memory before loading new model if cache is not empty
|
| 359 |
+
# This helps when switching from OpenPI to OpenVLA
|
| 360 |
+
if _MODEL_CACHE:
|
| 361 |
+
print("Clearing GPU memory before loading new model...", file=sys.stderr, flush=True)
|
| 362 |
+
clear_gpu_memory()
|
| 363 |
+
|
| 364 |
if not os.path.exists(ckpt_path):
|
| 365 |
raise FileNotFoundError(f"Checkpoint path does not exist: {ckpt_path}")
|
| 366 |
|