Update moondream.py
fix: tensor mismatch between KV cache and batched generation
- moondream.py +79 -61
moondream.py
CHANGED
|
@@ -77,50 +77,54 @@ class KVCache(nn.Module):
|
|
| 77 |
"v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
|
| 78 |
)
|
| 79 |
|
| 80 |
-
# --- replace the whole method in KVCache ---
|
| 81 |
def update(self, pos_ids, k, v):
|
| 82 |
"""
|
| 83 |
-
Supports
|
| 84 |
• Prefill: k,v = (B, n_kv_heads, q_len, d), pos_ids = (q_len,)
|
| 85 |
-
• 1-step: k,v = (B, n_kv_heads, 1, d), pos_ids = (B,)
|
| 86 |
-
|
|
|
|
| 87 |
"""
|
| 88 |
kout, vout = self.k_cache, self.v_cache
|
| 89 |
|
| 90 |
if not torch.is_tensor(pos_ids):
|
| 91 |
-
# Scalar
|
| 92 |
kout[:, :, pos_ids, :] = k
|
| 93 |
vout[:, :, pos_ids, :] = v
|
| 94 |
return kout, vout
|
| 95 |
|
| 96 |
-
# Normalize dtype
|
| 97 |
pos_ids = pos_ids.to(dtype=torch.long, device=k.device)
|
| 98 |
|
| 99 |
-
# Shapes
|
| 100 |
if k.dim() != 4 or v.dim() != 4:
|
| 101 |
raise RuntimeError(f"KV update expects k,v 4D. Got k={tuple(k.shape)} v={tuple(v.shape)}")
|
| 102 |
B, Hkv, q_len, D = k.shape
|
| 103 |
|
| 104 |
-
#
|
| 105 |
if kout.size(0) != B:
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
# Case A: PREFILL —
|
| 109 |
if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
|
| 110 |
for i in range(B):
|
| 111 |
-
kout[i, :, pos_ids, :] = k[i]
|
| 112 |
vout[i, :, pos_ids, :] = v[i]
|
| 113 |
return kout, vout
|
| 114 |
|
| 115 |
-
# Case B: STEP
|
| 116 |
-
if
|
|
|
|
| 117 |
for i in range(B):
|
| 118 |
-
pi = int(
|
| 119 |
kout[i, :, pi, :] = k[i, :, 0, :]
|
| 120 |
vout[i, :, pi, :] = v[i, :, 0, :]
|
| 121 |
return kout, vout
|
| 122 |
|
| 123 |
-
#
|
| 124 |
if pos_ids.dim() == 0 and q_len == 1:
|
| 125 |
pi = int(pos_ids.item())
|
| 126 |
kout[:, :, pi, :] = k[:, :, 0, :]
|
|
@@ -135,6 +139,7 @@ class KVCache(nn.Module):
|
|
| 135 |
|
| 136 |
|
| 137 |
|
|
|
|
| 138 |
class MoondreamModel(nn.Module):
|
| 139 |
|
| 140 |
def __init__(
|
|
@@ -968,93 +973,105 @@ class MoondreamModel(nn.Module):
|
|
| 968 |
|
| 969 |
return last_hidden, next_token, pos_vec
|
| 970 |
|
| 971 |
-
|
| 972 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 973 |
B = hidden.size(0)
|
| 974 |
device = self.device
|
| 975 |
out = [[] for _ in range(B)]
|
| 976 |
eos_id = self.config.tokenizer.eos_id
|
| 977 |
max_ctx = self.config.text.max_context
|
| 978 |
|
| 979 |
-
# 4-D mask: (B,1,1,kv_len)
|
| 980 |
mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
|
|
|
| 984 |
|
| 985 |
-
|
| 986 |
-
|
| 987 |
-
counts = torch.zeros(B, dtype=torch.int32, device=device)
|
| 988 |
|
| 989 |
with torch.inference_mode():
|
| 990 |
while alive.any() and (counts < max_objects).any():
|
| 991 |
-
# --- x ---
|
| 992 |
x_logits = decode_coordinate(hidden, self.region) # (B,1,1024) or (B,1024)
|
| 993 |
-
if x_logits.dim() == 3:
|
| 994 |
-
|
| 995 |
-
x_center =
|
| 996 |
x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1) # (B,1)
|
| 997 |
x_emb = encode_coordinate(x_in, self.region).unsqueeze(1) # (B,1,C)
|
| 998 |
|
| 999 |
-
# advance one
|
| 1000 |
-
|
| 1001 |
-
if alive[i]: mask[i, 0, 0, int(pos_ids[i].item())] = True
|
| 1002 |
logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
|
| 1003 |
-
|
|
|
|
| 1004 |
|
| 1005 |
-
# --- y ---
|
| 1006 |
y_logits = decode_coordinate(hidden, self.region)
|
| 1007 |
-
if y_logits.dim() == 3:
|
| 1008 |
-
|
| 1009 |
-
y_center =
|
| 1010 |
-
y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)
|
| 1011 |
y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
|
| 1012 |
|
| 1013 |
-
|
| 1014 |
-
if alive[i]: mask[i, 0, 0, int(pos_ids[i].item())] = True
|
| 1015 |
logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
|
| 1016 |
-
|
|
|
|
| 1017 |
|
| 1018 |
if include_size:
|
| 1019 |
-
|
| 1020 |
-
|
| 1021 |
-
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
w_bin = w_logits.argmax(dim=-1).to(torch.float32)
|
| 1025 |
h_bin = h_logits.argmax(dim=-1).to(torch.float32)
|
| 1026 |
w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
|
| 1027 |
h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
|
| 1028 |
-
size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)
|
| 1029 |
size_emb = encode_size(size_in, self.region).unsqueeze(1) # (B,1,C)
|
| 1030 |
|
|
|
|
| 1031 |
for i in range(B):
|
| 1032 |
-
if
|
| 1033 |
-
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
|
| 1038 |
-
|
| 1039 |
|
| 1040 |
-
|
| 1041 |
-
if alive[i]: mask[i, 0, 0, int(pos_ids[i].item())] = True
|
| 1042 |
logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
|
| 1043 |
-
|
|
|
|
| 1044 |
next_tok = logits.argmax(dim=-1).squeeze(-1) # (B,)
|
| 1045 |
else:
|
| 1046 |
for i in range(B):
|
| 1047 |
if alive[i]:
|
| 1048 |
out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
|
| 1049 |
-
|
| 1050 |
-
for i in range(B):
|
| 1051 |
-
if alive[i]: mask[i, 0, 0, int(pos_ids[i].item())] = True
|
| 1052 |
logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
|
| 1053 |
-
|
|
|
|
| 1054 |
next_tok = logits.argmax(dim=-1).squeeze(-1)
|
| 1055 |
|
| 1056 |
finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
|
| 1057 |
-
counts = counts + (
|
| 1058 |
alive &= ~finished_now
|
| 1059 |
|
| 1060 |
return out
|
|
@@ -1062,6 +1079,7 @@ class MoondreamModel(nn.Module):
|
|
| 1062 |
|
| 1063 |
|
| 1064 |
|
|
|
|
| 1065 |
def detect_multi(self, image, objects, settings=None):
|
| 1066 |
"""
|
| 1067 |
Parallel multi-label detection.
|
|
|
|
| 77 |
"v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
|
| 78 |
)
|
| 79 |
|
|
|
|
| 80 |
def update(self, pos_ids, k, v):
    """Write new key/value states into the cache and return the full caches.

    Supported call shapes:
      * Prefill: k, v = (B, n_kv_heads, q_len, d), pos_ids = (q_len,)
      * 1-step:  k, v = (B, n_kv_heads, 1, d),     pos_ids = (B,) or (B, 1)
      * Legacy:  pos_ids is a plain Python int (scalar index for all rows)

    Writes into self.k_cache / self.v_cache, each shaped
    (B, n_kv_heads, T_max, d), and returns (k_cache, v_cache).

    Raises:
        RuntimeError: if k/v are not 4-D, or the cache batch size neither
            matches B nor is 1 (expandable).
    """
    kout, vout = self.k_cache, self.v_cache

    if not torch.is_tensor(pos_ids):
        # Scalar legacy path: same position for every batch row.
        kout[:, :, pos_ids, :] = k
        vout[:, :, pos_ids, :] = v
        return kout, vout

    # Normalize index dtype/device so advanced indexing below is valid.
    pos_ids = pos_ids.to(dtype=torch.long, device=k.device)

    if k.dim() != 4 or v.dim() != 4:
        raise RuntimeError(f"KV update expects k,v 4D. Got k={tuple(k.shape)} v={tuple(v.shape)}")
    B, Hkv, q_len, D = k.shape

    # Make sure cache batch matches B (expand-from-1 is ok, otherwise error).
    # .clone() materializes the expansion so per-row writes don't alias.
    if kout.size(0) != B:
        if kout.size(0) == 1:
            self.k_cache = kout.expand(B, -1, -1, -1).clone()
            self.v_cache = vout.expand(B, -1, -1, -1).clone()
            kout, vout = self.k_cache, self.v_cache
        else:
            raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")

    # Case A: PREFILL — pos_ids indexes one position per query token,
    # shared across rows.
    if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
        for i in range(B):
            kout[i, :, pos_ids, :] = k[i]  # (Hkv, q_len, D)
            vout[i, :, pos_ids, :] = v[i]
        return kout, vout

    # Case B: STEP — q_len == 1 and one (possibly different) position per row.
    if q_len == 1 and pos_ids.numel() == B:
        pos_ids_flat = pos_ids.view(-1)  # handle (B,1) or (B,)
        for i in range(B):
            pi = int(pos_ids_flat[i].item())
            kout[i, :, pi, :] = k[i, :, 0, :]
            vout[i, :, pi, :] = v[i, :, 0, :]
        return kout, vout

    # Case C: 0-dim tensor — one scalar position for every row.
    if pos_ids.dim() == 0 and q_len == 1:
        pi = int(pos_ids.item())
        kout[:, :, pi, :] = k[:, :, 0, :]
        # NOTE(review): the pasted diff is truncated here; the matching v-write
        # and return are restored by symmetry with the k-write above — confirm
        # against the upstream file.
        vout[:, :, pi, :] = v[:, :, 0, :]
        return kout, vout
|
|
|
|
| 139 |
|
| 140 |
|
| 141 |
|
| 142 |
+
|
| 143 |
class MoondreamModel(nn.Module):
|
| 144 |
|
| 145 |
def __init__(
|
|
|
|
| 973 |
|
| 974 |
return last_hidden, next_token, pos_vec
|
| 975 |
|
| 976 |
+
def _generate_points_batched(
    self,
    hidden,                     # (B, 1, C) last hidden state per row
    next_token,                 # (B, 1); kept for interface parity, unused here
    pos: int,                   # shared scalar next KV position
    include_size: bool = True,  # True -> emit boxes; False -> emit center points
    max_objects: int = 50,      # per-row cap on decoded objects
    lora=None,
):
    """Vectorized object decoding: x -> y -> (size) -> next-token for all rows.

    Greedy (argmax) decoding for every batch row simultaneously, advancing a
    shared scalar position `pos` through the KV cache each step.

    Returns:
        list of length B; element i is a list of dicts — boxes
        {"x_min","y_min","x_max","y_max"} when include_size, otherwise
        points {"x","y"}.
    """
    B = hidden.size(0)
    device = self.device
    out = [[] for _ in range(B)]
    eos_id = self.config.tokenizer.eos_id
    max_ctx = self.config.text.max_context

    # 4-D attention mask: (B, 1, q_len=1, kv_len); all prior positions visible.
    mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
    if pos > 0:
        mask[:, :, :, :pos] = True
    # position_ids must be (B, 1) for rotary; KVCache.update accepts (B,1) too.
    pos_ids = torch.full((B, 1), pos, device=device, dtype=torch.long)

    alive = torch.ones(B, dtype=torch.bool, device=device)
    counts = torch.zeros(B, dtype=torch.int32, device=device)

    with torch.inference_mode():
        while alive.any() and (counts < max_objects).any():
            # --- x coordinate ---
            x_logits = decode_coordinate(hidden, self.region)  # (B,1,1024) or (B,1024)
            if x_logits.dim() == 3:
                x_logits = x_logits.squeeze(1)
            # Bin index -> normalized [0,1) coordinate.
            x_center = x_logits.argmax(dim=-1).to(torch.float32) / float(x_logits.size(-1))  # (B,)
            x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)        # (B,1)
            x_emb = encode_coordinate(x_in, self.region).unsqueeze(1)     # (B,1,C)

            # Advance attention one step (shared scalar position).
            mask[:, :, :, pos] = True
            logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
            pos += 1
            pos_ids[:, 0] = pos

            # --- y coordinate ---
            y_logits = decode_coordinate(hidden, self.region)
            if y_logits.dim() == 3:
                y_logits = y_logits.squeeze(1)
            y_center = y_logits.argmax(dim=-1).to(torch.float32) / float(y_logits.size(-1))
            y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)        # (B,1)
            y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)

            mask[:, :, :, pos] = True
            logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
            pos += 1
            pos_ids[:, 0] = pos

            if include_size:
                # --- size ---
                size_logits = decode_size(hidden, self.region)  # [w_logits, h_logits]
                # Support both (B,1,1024) and (B,1024).
                w_logits = size_logits[0].squeeze(1)
                h_logits = size_logits[1].squeeze(1)
                w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                h_bin = h_logits.argmax(dim=-1).to(torch.float32)
                # Log-scale bins: 1023 bins span 2**-10 .. 2**0.
                w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
                h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
                size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B,2)
                size_emb = encode_size(size_in, self.region).unsqueeze(1)      # (B,1,C)

                # Record boxes for rows still decoding.
                for i in range(B):
                    if alive[i]:
                        out[i].append({
                            "x_min": (x_center[i] - w[i] / 2).item(),
                            "y_min": (y_center[i] - h[i] / 2).item(),
                            "x_max": (x_center[i] + w[i] / 2).item(),
                            "y_max": (y_center[i] + h[i] / 2).item(),
                        })

                mask[:, :, :, pos] = True
                logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
                pos += 1
                pos_ids[:, 0] = pos
                next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
            else:
                for i in range(B):
                    if alive[i]:
                        out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
                # NOTE(review): the y embedding is fed a second time here to
                # produce the continue/EOS token — matches the pasted diff;
                # confirm this is intended rather than a dedicated embedding.
                mask[:, :, :, pos] = True
                logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
                pos += 1
                pos_ids[:, 0] = pos
                next_tok = logits.argmax(dim=-1).squeeze(-1)

            # A row finishes on EOS or when it has hit the object cap.
            finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
            counts = counts + (~finished_now & alive).to(counts.dtype)
            alive &= ~finished_now

    return out
|
|
|
|
| 1079 |
|
| 1080 |
|
| 1081 |
|
| 1082 |
+
|
| 1083 |
def detect_multi(self, image, objects, settings=None):
|
| 1084 |
"""
|
| 1085 |
Parallel multi-label detection.
|