Update moondream.py
Browse files
fix: scaled_dot_product_attention bombs out when expanding a 3-D mask into (B, n_heads, q_len, kv_len).
- moondream.py +38 -27
moondream.py
CHANGED
|
@@ -26,6 +26,8 @@ from .region import decode_coordinate, encode_coordinate, decode_size, encode_si
|
|
| 26 |
from .text import text_encoder, lm_head
|
| 27 |
from typing import Optional, List, Union
|
| 28 |
from .lora import variant_state_dict
|
|
|
|
|
|
|
| 29 |
|
| 30 |
ImageEncodingSettings = TypedDict(
|
| 31 |
"ImageEncodingSettings",
|
|
@@ -911,24 +913,27 @@ class MoondreamModel(nn.Module):
|
|
| 911 |
|
| 912 |
def _generate_points_batched(
|
| 913 |
self,
|
| 914 |
-
hidden, # (B,1,C)
|
| 915 |
-
next_token, # (B,1)
|
| 916 |
-
pos: int, # shared scalar next position
|
| 917 |
include_size: bool = True,
|
| 918 |
max_objects: int = 50,
|
| 919 |
lora=None,
|
| 920 |
):
|
| 921 |
"""
|
| 922 |
Vectorized version of _generate_points() that decodes x -> y -> size -> next-token
|
| 923 |
-
for all rows in the batch simultaneously. Returns list-of-lists of dicts
|
|
|
|
| 924 |
"""
|
|
|
|
|
|
|
| 925 |
B = hidden.size(0)
|
| 926 |
device = self.device
|
| 927 |
out = [[] for _ in range(B)]
|
| 928 |
eos_id = self.config.tokenizer.eos_id
|
| 929 |
max_ctx = self.config.text.max_context
|
| 930 |
|
| 931 |
-
# 4-D mask: (B, 1, q_len=1, kv_len)
|
| 932 |
mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
|
| 933 |
if pos > 0:
|
| 934 |
mask[:, :, :, :pos] = True
|
|
@@ -939,29 +944,29 @@ class MoondreamModel(nn.Module):
|
|
| 939 |
|
| 940 |
with torch.inference_mode():
|
| 941 |
while alive.any() and (counts < max_objects).any():
|
| 942 |
-
# --- x coordinate ---
|
| 943 |
-
x_logits = decode_coordinate(hidden, self.region)
|
| 944 |
if x_logits.dim() == 3:
|
| 945 |
-
x_logits = x_logits.squeeze(1)
|
| 946 |
-
x_bin
|
| 947 |
-
x_center = x_bin / float(x_logits.size(-1))
|
| 948 |
-
x_in
|
| 949 |
-
x_emb
|
| 950 |
|
| 951 |
-
# advance
|
| 952 |
mask[:, :, :, pos] = True
|
| 953 |
logits, hidden = self._decode_one_tok(x_emb, mask, pos_id, lora)
|
| 954 |
pos += 1
|
| 955 |
pos_id[0] = pos
|
| 956 |
|
| 957 |
-
# --- y coordinate ---
|
| 958 |
-
y_logits = decode_coordinate(hidden, self.region)
|
| 959 |
if y_logits.dim() == 3:
|
| 960 |
y_logits = y_logits.squeeze(1)
|
| 961 |
-
y_bin
|
| 962 |
-
y_center = y_bin / float(y_logits.size(-1))
|
| 963 |
-
y_in
|
| 964 |
-
y_emb
|
| 965 |
|
| 966 |
mask[:, :, :, pos] = True
|
| 967 |
logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
|
|
@@ -969,18 +974,23 @@ class MoondreamModel(nn.Module):
|
|
| 969 |
pos_id[0] = pos
|
| 970 |
|
| 971 |
if include_size:
|
| 972 |
-
#
|
| 973 |
-
|
| 974 |
-
|
|
|
|
|
|
|
|
|
|
| 975 |
w_bin = w_logits.argmax(dim=-1).to(torch.float32)
|
| 976 |
h_bin = h_logits.argmax(dim=-1).to(torch.float32)
|
| 977 |
-
|
|
|
|
| 978 |
w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
|
| 979 |
h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
|
| 980 |
-
|
|
|
|
| 981 |
size_emb = encode_size(size_in, self.region).unsqueeze(1) # (B,1,C)
|
| 982 |
|
| 983 |
-
#
|
| 984 |
for i in range(B):
|
| 985 |
if alive[i]:
|
| 986 |
out[i].append({
|
|
@@ -990,21 +1000,22 @@ class MoondreamModel(nn.Module):
|
|
| 990 |
"y_max": (y_center[i] + h[i] / 2).item(),
|
| 991 |
})
|
| 992 |
|
|
|
|
| 993 |
mask[:, :, :, pos] = True
|
| 994 |
logits, hidden = self._decode_one_tok(size_emb, mask, pos_id, lora)
|
| 995 |
pos += 1
|
| 996 |
pos_id[0] = pos
|
| 997 |
next_tok = logits.argmax(dim=-1).squeeze(-1) # (B,)
|
| 998 |
else:
|
|
|
|
| 999 |
for i in range(B):
|
| 1000 |
if alive[i]:
|
| 1001 |
out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
|
| 1002 |
-
|
| 1003 |
mask[:, :, :, pos] = True
|
| 1004 |
logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
|
| 1005 |
pos += 1
|
| 1006 |
pos_id[0] = pos
|
| 1007 |
-
next_tok = logits.argmax(dim=-1).squeeze(-1)
|
| 1008 |
|
| 1009 |
finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
|
| 1010 |
counts = counts + (~finished_now & alive).to(counts.dtype)
|
|
|
|
| 26 |
from .text import text_encoder, lm_head
|
| 27 |
from typing import Optional, List, Union
|
| 28 |
from .lora import variant_state_dict
|
| 29 |
+
from .layers import mlp
|
| 30 |
+
|
| 31 |
|
| 32 |
ImageEncodingSettings = TypedDict(
|
| 33 |
"ImageEncodingSettings",
|
|
|
|
| 913 |
|
| 914 |
def _generate_points_batched(
|
| 915 |
self,
|
| 916 |
+
hidden, # (B,1,C) last hidden after prefill (per label row)
|
| 917 |
+
next_token, # (B,1) (kept for parity; not used when temperature=0)
|
| 918 |
+
pos: int, # shared scalar next position for all rows
|
| 919 |
include_size: bool = True,
|
| 920 |
max_objects: int = 50,
|
| 921 |
lora=None,
|
| 922 |
):
|
| 923 |
"""
|
| 924 |
Vectorized version of _generate_points() that decodes x -> y -> size -> next-token
|
| 925 |
+
for all rows in the batch simultaneously. Returns list-of-lists of dicts (len B).
|
| 926 |
+
Batch-safe: uses 4-D masks and avoids region.decode_size() (which flattens batch).
|
| 927 |
"""
|
| 928 |
+
import torch
|
| 929 |
+
|
| 930 |
B = hidden.size(0)
|
| 931 |
device = self.device
|
| 932 |
out = [[] for _ in range(B)]
|
| 933 |
eos_id = self.config.tokenizer.eos_id
|
| 934 |
max_ctx = self.config.text.max_context
|
| 935 |
|
| 936 |
+
# 4-D mask: (B, 1, q_len=1, kv_len), True means "visible" to match model's convention
|
| 937 |
mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
|
| 938 |
if pos > 0:
|
| 939 |
mask[:, :, :, :pos] = True
|
|
|
|
| 944 |
|
| 945 |
with torch.inference_mode():
|
| 946 |
while alive.any() and (counts < max_objects).any():
|
| 947 |
+
# --- x coordinate (batched) ---
|
| 948 |
+
x_logits = decode_coordinate(hidden, self.region) # (B,1,1024) or (B,1024)
|
| 949 |
if x_logits.dim() == 3:
|
| 950 |
+
x_logits = x_logits.squeeze(1) # (B,1024)
|
| 951 |
+
x_bin = x_logits.argmax(dim=-1).to(torch.float32)
|
| 952 |
+
x_center = x_bin / float(x_logits.size(-1)) # (B,)
|
| 953 |
+
x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1) # (B,1)
|
| 954 |
+
x_emb = encode_coordinate(x_in, self.region).unsqueeze(1) # (B,1,C)
|
| 955 |
|
| 956 |
+
# advance one token
|
| 957 |
mask[:, :, :, pos] = True
|
| 958 |
logits, hidden = self._decode_one_tok(x_emb, mask, pos_id, lora)
|
| 959 |
pos += 1
|
| 960 |
pos_id[0] = pos
|
| 961 |
|
| 962 |
+
# --- y coordinate (batched) ---
|
| 963 |
+
y_logits = decode_coordinate(hidden, self.region) # (B,1,1024) or (B,1024)
|
| 964 |
if y_logits.dim() == 3:
|
| 965 |
y_logits = y_logits.squeeze(1)
|
| 966 |
+
y_bin = y_logits.argmax(dim=-1).to(torch.float32)
|
| 967 |
+
y_center = y_bin / float(y_logits.size(-1))
|
| 968 |
+
y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1) # (B,1)
|
| 969 |
+
y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
|
| 970 |
|
| 971 |
mask[:, :, :, pos] = True
|
| 972 |
logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
|
|
|
|
| 974 |
pos_id[0] = pos
|
| 975 |
|
| 976 |
if include_size:
|
| 977 |
+
# ---- size (batched, *without* region.decode_size which flattens batch) ----
|
| 978 |
+
# size_out_dim is 2*1024 (W then H). mlp() preserves (B,1,·).
|
| 979 |
+
size_logits = mlp(hidden, self.region["size_decoder"]).squeeze(1) # (B, 2048)
|
| 980 |
+
half = size_logits.size(-1) // 2
|
| 981 |
+
w_logits, h_logits = size_logits[:, :half], size_logits[:, half:] # (B,1024),(B,1024)
|
| 982 |
+
|
| 983 |
w_bin = w_logits.argmax(dim=-1).to(torch.float32)
|
| 984 |
h_bin = h_logits.argmax(dim=-1).to(torch.float32)
|
| 985 |
+
|
| 986 |
+
# inverse log-scale mapping used by the repo
|
| 987 |
w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
|
| 988 |
h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
|
| 989 |
+
|
| 990 |
+
size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype) # (B,2)
|
| 991 |
size_emb = encode_size(size_in, self.region).unsqueeze(1) # (B,1,C)
|
| 992 |
|
| 993 |
+
# commit boxes
|
| 994 |
for i in range(B):
|
| 995 |
if alive[i]:
|
| 996 |
out[i].append({
|
|
|
|
| 1000 |
"y_max": (y_center[i] + h[i] / 2).item(),
|
| 1001 |
})
|
| 1002 |
|
| 1003 |
+
# decide continuation
|
| 1004 |
mask[:, :, :, pos] = True
|
| 1005 |
logits, hidden = self._decode_one_tok(size_emb, mask, pos_id, lora)
|
| 1006 |
pos += 1
|
| 1007 |
pos_id[0] = pos
|
| 1008 |
next_tok = logits.argmax(dim=-1).squeeze(-1) # (B,)
|
| 1009 |
else:
|
| 1010 |
+
# points mode
|
| 1011 |
for i in range(B):
|
| 1012 |
if alive[i]:
|
| 1013 |
out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
|
|
|
|
| 1014 |
mask[:, :, :, pos] = True
|
| 1015 |
logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
|
| 1016 |
pos += 1
|
| 1017 |
pos_id[0] = pos
|
| 1018 |
+
next_tok = logits.argmax(dim=-1).squeeze(-1)
|
| 1019 |
|
| 1020 |
finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
|
| 1021 |
counts = counts + (~finished_now & alive).to(counts.dtype)
|