Spaces:

usingcolor
/

MambaEye

Sleeping

App Files Files Community

usingcolor committed on Mar 30

Commit

0f440c7

1 Parent(s): f33c68f

chore: initialize git repository and project structure

Browse files

Files changed (1) hide show

app.py +39 -19

app.py CHANGED Viewed

@@ -53,9 +53,6 @@ from mambaeye.positional_encoding import sinusoidal_position_encoding_2d
 from mamba_ssm.utils.generation import InferenceParams
 # Global Configuration
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Using device: {DEVICE}")
 TARGET_CANVAS_SIZE = 512
 PATCH_SIZE = 16
 CATEGORIES = ResNet50_Weights.IMAGENET1K_V1.meta["categories"]
@@ -79,22 +76,41 @@ _GLOBAL_MODEL = None
 def get_model():
     global _GLOBAL_MODEL
     if _GLOBAL_MODEL is None:
         print(f"Downloading {MODEL_FILENAME} from {MODEL_REPO}...")
         try:
             checkpoint_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
             model = MambaEye(**MODEL_CONFIG)
-            # On zero_gpu, downloading weights might happen on CPU first
-            map_loc = torch.device('cpu')
-            model.load_state_dict(torch.load(checkpoint_path, map_location=map_loc))
             model.eval()
             _GLOBAL_MODEL = model
             print("Model loaded successfully.")
         except Exception as e:
             print(f"Failed to load model: {e}")
             raise
-    return _GLOBAL_MODEL
 def _compute_move_embedding(patch_location: torch.Tensor, cur_location: torch.Tensor = None) -> torch.Tensor:
     if cur_location is None:
@@ -184,8 +200,7 @@ def run_auto_scan(image, scan_pattern, sequence_length):
     if image is None:
         return None, {"Upload Image": 1.0}, None, "Upload Image"
-    model = get_model()
-    model.to(DEVICE)
     state = init_state_for_image(image)
@@ -203,18 +218,17 @@ def run_auto_scan(image, scan_pattern, sequence_length):
     )
     inference_params = InferenceParams(max_seqlen=4000, max_batch_size=1)
-    state['inference_params'] = inference_params
     patches_list = []
     moves_list = []
     cur_location = None
     for px, py in positions:
-        loc_tensor = torch.tensor([[px, py]], dtype=torch.long, device=DEVICE)
         move_emb = _compute_move_embedding(loc_tensor, cur_location)
         cur_location = loc_tensor
-        patch = extract_patch(state['canvas_tensor'], px, py).to(DEVICE)
         patches_list.append(patch)
         moves_list.append(move_emb.squeeze(0))
@@ -229,8 +243,10 @@ def run_auto_scan(image, scan_pattern, sequence_length):
     state['cur_location'] = cur_location.cpu()
     state['drawn_positions'] = positions
     state['sequence_length'] = sequence_length
-    # On ZeroGPU spaces safely store Tensors back to CPU State
     state['canvas_tensor'] = state['canvas_tensor'].cpu()
     img_display, _ = draw_patches_on_image(
         state['original_image'], state['drawn_positions'],
@@ -244,14 +260,15 @@ def on_click(evt: gr.SelectData, original_image, state):
     if original_image is None:
         return None, {"Upload Image": 1.0}, state, "Upload Image"
-    model = get_model()
-    model.to(DEVICE)
     if state is None or state.get('inference_params') is None:
-        # Initialize state to begin a new purely user-guided sequence
         state = init_state_for_image(original_image)
         state['inference_params'] = InferenceParams(max_seqlen=4000, max_batch_size=1)
     x_orig, y_orig = evt.index
     orig_h, orig_w = state['original_image'].shape[:2]
     ratio = min(TARGET_CANVAS_SIZE / orig_w, TARGET_CANVAS_SIZE / orig_h)
@@ -262,11 +279,11 @@ def on_click(evt: gr.SelectData, original_image, state):
     px = (canvas_x // PATCH_SIZE) * PATCH_SIZE
     py = (canvas_y // PATCH_SIZE) * PATCH_SIZE
-    cur_loc = state['cur_location'].to(DEVICE) if state['cur_location'] is not None else None
-    loc_tensor = torch.tensor([[px, py]], dtype=torch.long, device=DEVICE)
     move_emb = _compute_move_embedding(loc_tensor, cur_loc)
-    patch = extract_patch(state['canvas_tensor'], px, py).to(DEVICE)
     img_seq = patch.unsqueeze(0).unsqueeze(0) # (1, 1, 768)
     move_seq = move_emb.unsqueeze(0) # (1, 1, 512)
@@ -280,6 +297,9 @@ def on_click(evt: gr.SelectData, original_image, state):
     state['drawn_positions'].append((px, py))
     state['sequence_length'] += 1
     img_display, _ = draw_patches_on_image(
         state['original_image'], state['drawn_positions'],
         state['x_offset'], state['y_offset'], state['h'], state['w']

 from mamba_ssm.utils.generation import InferenceParams
 # Global Configuration
 TARGET_CANVAS_SIZE = 512
 PATCH_SIZE = 16
 CATEGORIES = ResNet50_Weights.IMAGENET1K_V1.meta["categories"]
 def get_model():
     """Return the shared MambaEye model together with the device to run on.

     The model is built on the first call and cached in the module-level
     ``_GLOBAL_MODEL``; later calls reuse that instance. The device is
     evaluated on every call: CUDA when ``torch.cuda.is_available()`` reports
     it, otherwise CPU.

     Returns:
         A ``(model, device)`` tuple.

     Raises:
         Exception: whatever the download, model construction or weight
             loading raised; the error is printed and re-raised unchanged.
     """
     global _GLOBAL_MODEL
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

     # Fast path: the model has already been built by an earlier call.
     # NOTE(review): the cached model is not moved if ``device`` differs from
     # the one it was loaded on -- confirm the device cannot change between calls.
     if _GLOBAL_MODEL is not None:
         return _GLOBAL_MODEL, device

     print(f"Downloading {MODEL_FILENAME} from {MODEL_REPO}...")
     try:
         checkpoint_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
         network = MambaEye(**MODEL_CONFIG)
         # Intended to run inside the ZeroGPU worker, so the weights are
         # mapped straight onto the target device while loading.
         weights = torch.load(checkpoint_path, map_location=device)
         network.load_state_dict(weights)
         network.to(device)
         network.eval()
         _GLOBAL_MODEL = network
         print("Model loaded successfully.")
     except Exception as e:
         print(f"Failed to load model: {e}")
         raise
     return _GLOBAL_MODEL, device
def _move_cache_value(value, device, duck_typed=False):
    """Return ``value`` with every tensor found inside it moved to ``device``.

    Tuples and lists are rebuilt (a tuple always comes back as a plain
    ``tuple``); dicts are updated in place and returned as the same object.

    Args:
        value: A tensor, a tuple/list/dict of such values (nested to any
            depth), or any other object.
        device: Target passed to ``.to(...)``.
        duck_typed: When true, any object exposing a ``to`` attribute is
            moved, not only ``torch.Tensor`` instances. This is switched on
            for everything stored inside a dict.

    Returns:
        The moved value; objects that are not movable are returned unchanged.
    """
    if isinstance(value, torch.Tensor) or (duck_typed and hasattr(value, "to")):
        return value.to(device)
    if isinstance(value, tuple):
        return tuple(_move_cache_value(item, device, duck_typed) for item in value)
    if isinstance(value, list):
        return [_move_cache_value(item, device, duck_typed) for item in value]
    if isinstance(value, dict):
        # Snapshot the items so reassigning entries never disturbs iteration.
        for key, item in list(value.items()):
            value[key] = _move_cache_value(item, device, duck_typed=True)
        return value
    return value


def transfer_inference_params(params, device):
    """Move the cached state held by ``params`` to ``device``.

    Every entry of ``params.key_value_memory_dict`` is walked recursively, so
    tensors nested inside tuples, lists and dicts at any depth are moved.
    The dict itself is updated in place.

    Args:
        params: Object carrying a ``key_value_memory_dict`` attribute (for
            example an ``InferenceParams`` instance), or ``None``.
        device: Target device for the cached tensors.

    Returns:
        The same ``params`` object. It is returned untouched when it is
        ``None`` or when its ``key_value_memory_dict`` is missing or ``None``.
    """
    cache = getattr(params, "key_value_memory_dict", None)
    if cache is None:
        return params
    for key, value in list(cache.items()):
        cache[key] = _move_cache_value(value, device)
    return params
 def _compute_move_embedding(patch_location: torch.Tensor, cur_location: torch.Tensor = None) -> torch.Tensor:
     if cur_location is None:
     if image is None:
         return None, {"Upload Image": 1.0}, None, "Upload Image"
+    model, device = get_model()
     state = init_state_for_image(image)
     )
     inference_params = InferenceParams(max_seqlen=4000, max_batch_size=1)
     patches_list = []
     moves_list = []
     cur_location = None
     for px, py in positions:
+        loc_tensor = torch.tensor([[px, py]], dtype=torch.long, device=device)
         move_emb = _compute_move_embedding(loc_tensor, cur_location)
         cur_location = loc_tensor
+        patch = extract_patch(state['canvas_tensor'], px, py).to(device)
         patches_list.append(patch)
         moves_list.append(move_emb.squeeze(0))
     state['cur_location'] = cur_location.cpu()
     state['drawn_positions'] = positions
     state['sequence_length'] = sequence_length
+    # On ZeroGPU spaces securely move Tensors back to CPU State
     state['canvas_tensor'] = state['canvas_tensor'].cpu()
+    state['inference_params'] = transfer_inference_params(inference_params, torch.device('cpu'))
     img_display, _ = draw_patches_on_image(
         state['original_image'], state['drawn_positions'],
     if original_image is None:
         return None, {"Upload Image": 1.0}, state, "Upload Image"
+    model, device = get_model()
     if state is None or state.get('inference_params') is None:
         state = init_state_for_image(original_image)
         state['inference_params'] = InferenceParams(max_seqlen=4000, max_batch_size=1)
+    # Move InferenceParams back to the functional device correctly!
+    state['inference_params'] = transfer_inference_params(state['inference_params'], device)
     x_orig, y_orig = evt.index
     orig_h, orig_w = state['original_image'].shape[:2]
     ratio = min(TARGET_CANVAS_SIZE / orig_w, TARGET_CANVAS_SIZE / orig_h)
     px = (canvas_x // PATCH_SIZE) * PATCH_SIZE
     py = (canvas_y // PATCH_SIZE) * PATCH_SIZE
+    cur_loc = state['cur_location'].to(device) if state['cur_location'] is not None else None
+    loc_tensor = torch.tensor([[px, py]], dtype=torch.long, device=device)
     move_emb = _compute_move_embedding(loc_tensor, cur_loc)
+    patch = extract_patch(state['canvas_tensor'], px, py).to(device)
     img_seq = patch.unsqueeze(0).unsqueeze(0) # (1, 1, 768)
     move_seq = move_emb.unsqueeze(0) # (1, 1, 512)
     state['drawn_positions'].append((px, py))
     state['sequence_length'] += 1
+    # Strip back to CPU for Gradio Session Memory
+    state['inference_params'] = transfer_inference_params(state['inference_params'], torch.device('cpu'))
     img_display, _ = draw_patches_on_image(
         state['original_image'], state['drawn_positions'],
         state['x_offset'], state['y_offset'], state['h'], state['w']