Update moondream.py

moondream.py: +90 -70

@@ -895,81 +895,96 @@ class MoondreamModel(nn.Module):
         probs = self._apply_top_p(probs, top_p)
         next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)

+        pos_vec = torch.full((B,), pos + T, device=self.device, dtype=torch.long)

         return last_hidden, next_token, pos_vec  # (B,1,C), (B,1), (B,)

+    def _generate_points_batched(
+        self,
+        hidden: torch.Tensor,      # (B, 1, C) last hidden per row from prefill
+        next_token: torch.Tensor,  # (B, 1) not used directly (kept for parity)
+        pos_vec: torch.Tensor,     # (B,) next write pos per row after prefill
+        include_size: bool = True,
+        max_objects: int = 50,
+        lora=None,
+    ):
         """
+        Batched decode loop for multi-label detection.
+        - Uses a *shared* scalar position id per step (q_len = 1), as expected by RoPE.
+        - Maintains a per-row attention mask and 'alive' flags.
+        - Feeds coord encoders with (B,1) tensors; size encoder with (B,2).
         Returns: list-of-lists of dicts, length B.
         """
         B = hidden.size(0)
         device = self.device
         out = [[] for _ in range(B)]
         eos_id = self.config.tokenizer.eos_id
+
+        # --- Shared write position (scalar) consistent with RoPE q_len=1 ---
+        # We align rows by padding; using the maximum ensures all KV rows can decode in lockstep.
+        pos = int(pos_vec.max().item())
+
+        # Per-row attention mask (1 = visible). Mark everything up to 'pos' as visible.
         max_ctx = self.config.text.max_context
         mask = torch.zeros(B, 1, max_ctx, device=device, dtype=torch.bool)
+        mask[:, :, :pos] = 1
+
+        alive = torch.ones(B, dtype=torch.bool, device=device)
         counts = torch.zeros(B, dtype=torch.int32, device=device)
+
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
+                # --- x coordinate ---
+                x_logits = decode_coordinate(hidden, self.region)  # (B,1,1024) or (B,1024)
                 if x_logits.dim() == 3:
+                    x_logits = x_logits.squeeze(1)  # (B,1024)
+                x_bin = x_logits.argmax(dim=-1).to(torch.float32)  # (B,)
                 x_center = x_bin / float(x_logits.size(-1))  # (B,)
+                x_input = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)  # (B,1)
                 x_emb = encode_coordinate(x_input, self.region).unsqueeze(1)  # (B,1,C)
+
+                # step: decode hidden for y (advance shared pos)
+                mask[:, :, pos] = 1
+                logits, hidden = self._decode_one_tok(
+                    x_emb,
+                    mask,
+                    torch.tensor([pos], device=device, dtype=torch.long),  # length-1 (q_len=1)
+                    lora,
+                )
+                pos += 1
+
                 # --- y coordinate ---
                 y_logits = decode_coordinate(hidden, self.region)
                 if y_logits.dim() == 3:
+                    y_logits = y_logits.squeeze(1)
+                y_bin = y_logits.argmax(dim=-1).to(torch.float32)
+                y_center = y_bin / float(y_logits.size(-1))  # (B,)
+                y_input = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)  # (B,1)
+                y_emb = encode_coordinate(y_input, self.region).unsqueeze(1)  # (B,1,C)
+
+                # step: decode hidden for size / eos (advance shared pos)
+                mask[:, :, pos] = 1
+                logits, hidden = self._decode_one_tok(
+                    y_emb,
+                    mask,
+                    torch.tensor([pos], device=device, dtype=torch.long),
+                    lora,
+                )
+                pos += 1
+
                 if include_size:
+                    # --- size (batched) ---
+                    size_logits = decode_size(hidden, self.region)  # ([B,1,1024],[B,1,1024])
+                    w_logits, h_logits = size_logits[0].squeeze(1), size_logits[1].squeeze(1)  # (B,1024)
                     w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                     h_bin = h_logits.argmax(dim=-1).to(torch.float32)
+                    # Convert log-scale bins -> sizes in [0,1]
+                    w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)  # (B,)
+                    h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)  # (B,)
+                    size_input = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B,2)
+                    size_emb = encode_size(size_input, self.region).unsqueeze(1)  # (B,1,C)
+
+                    # Record boxes for alive rows
                     for i in range(B):
                         if not alive[i]:
                             continue
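Note on the size decoding in the hunk above: each of width and height is read off as an argmax over 1024 bins, and the bin index is mapped through 2^((bin/1023)*10 - 10), i.e. a log-scale grid covering roughly 2^-10 up to 1 in normalized image coordinates. A minimal, standalone sketch of that mapping and its inverse (the inverse is derived here for illustration only and is not part of the diff):

import torch

# Forward map used in the diff: bin in [0, 1023] -> size in (2**-10, 1].
bins = torch.tensor([0.0, 511.0, 1023.0])
sizes = torch.pow(2.0, (bins / 1023.0) * 10.0 - 10.0)
print(sizes)  # ~[0.00098, 0.031, 1.0]

# Inverse (derived from the forward formula): bin = (log2(size) + 10) / 10 * 1023
recovered = (torch.log2(sizes) + 10.0) / 10.0 * 1023.0
print(recovered)  # recovers [0, 511, 1023] up to float rounding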
@@ -979,35 +994,40 @@ class MoondreamModel(nn.Module):
                                 "x_max": (x_center[i] + w[i] / 2).item(),
                                 "y_max": (y_center[i] + h[i] / 2).item(),
                             })
+
                     # step: decode "next token" to decide continuation
+                    mask[:, :, pos] = 1
+                    logits, hidden = self._decode_one_tok(
+                        size_emb,
+                        mask,
+                        torch.tensor([pos], device=device, dtype=torch.long),
+                        lora,
+                    )
+                    pos += 1
                     next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
                 else:
                     # Points mode (no size)
                     for i in range(B):
                         if alive[i]:
+                            out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
+                    mask[:, :, pos] = 1
+                    logits, hidden = self._decode_one_tok(
+                        y_emb,
+                        mask,
+                        torch.tensor([pos], device=device, dtype=torch.long),
+                        lora,
+                    )
+                    pos += 1
+                    next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
+
+                # Update finished/alive bookkeeping
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
                 counts = counts + (~finished_now & alive).to(counts.dtype)
                 alive &= ~finished_now
+
         return out

+
     def detect_multi(self, image, objects, settings=None):
         """
         Parallel multi-label detection.
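For context, a usage sketch of the detect_multi entry point shown above. Everything beyond the signature, the top-level "objects" dict, and the "label"/"x_max"/"y_max" keys visible in the diff is an assumption (in particular the label strings, the image object, and the presence of "x_min"/"y_min"):

# Assumes `model` is a loaded MoondreamModel instance, `image` is a PIL.Image
# (or an already-encoded image), and `objects` is a list of label strings.
result = model.detect_multi(image, ["person", "dog"])
for label, dets in result["objects"].items():
    for d in dets:
        # With include_size=True each detection carries a box; "label" is filled in by detect_multi.
        print(label, d.get("x_min"), d.get("y_min"), d.get("x_max"), d.get("y_max"))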
@@ -1053,7 +1073,7 @@ class MoondreamModel(nn.Module):
                 d["label"] = lab
             res[lab] = lst
         return {"objects": res}
+
     def _detect_gaze(
         self,
         image: EncodedImage,
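To make the position/mask bookkeeping described in the _generate_points_batched docstring concrete, here is a toy, model-free sketch of the same update pattern: every row decodes in lockstep at one shared position while a per-row boolean mask controls what is visible. All shapes and values below are invented for illustration; only the update pattern mirrors the diff.

import torch

B, max_ctx = 3, 16
pos_vec = torch.tensor([5, 7, 6])   # per-row prefill lengths (example values)
pos = int(pos_vec.max().item())     # shared write position (7 here)

mask = torch.zeros(B, 1, max_ctx, dtype=torch.bool)
mask[:, :, :pos] = 1                # everything up to pos is visible
alive = torch.ones(B, dtype=torch.bool)

for step in range(3):               # stand-in for the x / y / size decode steps
    mask[:, :, pos] = 1             # expose the slot being written this step
    pos += 1                        # advance the shared position
print(pos, mask[0, 0, :12])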