HV-Khurdula committed on
Commit
0949437
·
verified ·
1 Parent(s): 18ff09d

Update moondream.py

Browse files
Files changed (1) hide show
  1. moondream.py +44 -61
moondream.py CHANGED
@@ -77,18 +77,16 @@ class KVCache(nn.Module):
77
  "v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
78
  )
79
 
80
- # In class KVCache, REPLACE the whole update() with this:
81
  def update(self, pos_ids, k, v):
82
  """
83
  Supports:
84
  • Prefill: k,v = (B, n_kv_heads, q_len, d), pos_ids = (q_len,)
85
  • 1-step: k,v = (B, n_kv_heads, 1, d), pos_ids = (B,1) or (B,)
86
- Legacy: k,v = (1, n_kv_heads, q_len, d), pos_ids = scalar
87
  Writes into self.k_cache/self.v_cache shaped (B, n_kv_heads, T_max, d).
88
  """
89
  kout, vout = self.k_cache, self.v_cache
90
 
91
- # Normalize pos_ids
92
  if not torch.is_tensor(pos_ids):
93
  pos_ids = torch.tensor(pos_ids, device=k.device, dtype=torch.long)
94
  else:
@@ -98,7 +96,7 @@ class KVCache(nn.Module):
98
  raise RuntimeError(f"KV update expects k,v 4D. Got k={tuple(k.shape)} v={tuple(v.shape)}")
99
  B, Hkv, q_len, D = k.shape
100
 
101
- # Ensure cache batch matches B (expand-from-1 allowed)
102
  if kout.size(0) != B:
103
  if kout.size(0) == 1:
104
  self.k_cache = kout.expand(B, -1, -1, -1).clone()
@@ -107,14 +105,14 @@ class KVCache(nn.Module):
107
  else:
108
  raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")
109
 
110
- # Case A: PREFILL vector of length q_len (same for all B rows)
111
  if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
112
  for i in range(B):
113
- kout[i, :, pos_ids, :] = k[i] # (Hkv, q_len, D)
114
  vout[i, :, pos_ids, :] = v[i]
115
  return kout, vout
116
 
117
- # Case B: 1-STEP q_len == 1 with (B,) or (B,1) per-row positions
118
  if q_len == 1 and pos_ids.numel() == B:
119
  pos_ids = pos_ids.view(B)
120
  for i in range(B):
@@ -123,16 +121,15 @@ class KVCache(nn.Module):
123
  vout[i, :, pi, :] = v[i, :, 0, :]
124
  return kout, vout
125
 
126
- # Case C: scalar for everyone & q_len == 1
127
  if pos_ids.dim() == 0 and q_len == 1:
128
  pi = int(pos_ids.item())
129
  kout[:, :, pi, :] = k[:, :, 0, :]
130
  vout[:, :, pi, :] = v[:, :, 0, :]
131
  return kout, vout
132
 
133
- raise RuntimeError(
134
- f"Unsupported KV update combo: k={tuple(k.shape)}, pos_ids={tuple(pos_ids.shape)}"
135
- )
136
 
137
 
138
 
@@ -213,10 +210,6 @@ class MoondreamModel(nn.Module):
213
 
214
 
215
  def _reset_kv_caches(self, batch_size: int = 1):
216
- """
217
- Recreate KV caches with the requested batch size so subsequent calls
218
- (e.g., encode_image) start from a consistent shape.
219
- """
220
  c = self.config.text
221
  head_dim = c.dim // c.n_heads
222
  for blk in self.text.blocks:
@@ -225,6 +218,7 @@ class MoondreamModel(nn.Module):
225
  shape = (batch_size, c.n_kv_heads, c.max_context, head_dim)
226
  blk.kv_cache.k_cache = torch.zeros(shape, device=device, dtype=dtype)
227
  blk.kv_cache.v_cache = torch.zeros(shape, device=device, dtype=dtype)
 
228
 
229
 
230
  def _setup_caches(self):
@@ -589,12 +583,13 @@ class MoondreamModel(nn.Module):
589
  elif not isinstance(image, Image.Image):
590
  raise ValueError("image must be a PIL Image or EncodedImage")
591
 
592
- # At the VERY TOP of encode_image(), right after the type checks:
593
  for blk in self.text.blocks:
594
  if blk.kv_cache.k_cache.size(0) != 1:
595
  blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
596
  blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
597
 
 
598
 
599
  lora = (
600
  variant_state_dict(settings["variant"], device=self.device)
@@ -933,6 +928,7 @@ class MoondreamModel(nn.Module):
933
  b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
934
  b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
935
 
 
936
  def _prefill_prompt_batched(self, labels, pos: int, lora=None,
937
  temperature: float = 0.0, top_p: float = 0.0):
938
  tpl = self.config.tokenizer.templates["detect"]
@@ -949,12 +945,11 @@ class MoondreamModel(nn.Module):
949
 
950
  prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
951
  for i, ids in enumerate(rows):
952
- prompt_ids[i, : ids.numel()] = ids
953
 
954
  prompt_emb = text_encoder(prompt_ids, self.text) # (B,T,C)
955
  torch._dynamo.mark_dynamic(prompt_emb, 1)
956
 
957
- # 4-D mask: (B,1,T,kv_len) for SDPA
958
  base = self.attn_mask[:, :, pos:pos+T, :] # (1,1,T,kv_len)
959
  mask = base.expand(B, -1, -1, -1).contiguous() # (B,1,T,kv_len)
960
 
@@ -967,29 +962,26 @@ class MoondreamModel(nn.Module):
967
  last_logits = logits_BTV[torch.arange(B, device=self.device), idx] # (B,V)
968
 
969
  if temperature == 0.0:
970
- next_token = last_logits.argmax(dim=-1, keepdim=True) # (B,1)
971
  else:
972
  probs = torch.softmax(last_logits / temperature, dim=-1)
973
  probs = self._apply_top_p(probs, top_p)
974
- next_token = torch.multinomial(probs, num_samples=1) # (B,1)
975
 
976
- # CRITICAL: per-row next position
977
- pos_vec = torch.tensor(lens, device=self.device, dtype=torch.long) + pos # (B,)
978
-
979
- # At the end of _prefill_prompt_batched(), return a Python int:
980
- pos_end = int((pos + T))
981
  return last_hidden, next_token, pos_end
982
 
983
 
 
984
  def _generate_points_batched(
985
  self,
986
  hidden, # (B,1,C)
987
- next_token, # (B,1) (unused in greedy, but OK)
988
  pos, # int or Tensor; normalized below
989
  include_size: bool = True,
990
  max_objects: int = 50,
991
  lora=None,
992
- use_soft_argmax: bool = True, # NEW: reduces jitter/hallucinations
993
  ):
994
  B = hidden.size(0)
995
  device = self.device
@@ -997,55 +989,48 @@ class MoondreamModel(nn.Module):
997
  eos_id = self.config.tokenizer.eos_id
998
  max_ctx = self.config.text.max_context
999
 
1000
- # Normalize pos to a scalar int (supports int, (1,), (B,), (B,1))
1001
  if torch.is_tensor(pos):
1002
- pos = int(pos.max().item()) # safe upper bound; we manage per-row with pos_ids/alive
1003
 
1004
- # 4-D mask: (B, 1, q_len=1, kv_len)
1005
  mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
1006
  if pos > 0:
1007
  mask[:, :, :, :pos] = True
1008
-
1009
- # position_ids must be (B,1)
1010
  pos_ids = torch.full((B, 1), pos, device=device, dtype=torch.long)
1011
 
1012
  alive = torch.ones(B, dtype=torch.bool, device=device)
1013
  counts = torch.zeros(B, dtype=torch.int32, device=device)
1014
 
1015
- # helpers ---------------------------------------------------------
1016
  def _argmax01(logits):
1017
- # logits: (B, bins)
1018
  if use_soft_argmax:
1019
  probs = torch.softmax(logits, dim=-1)
1020
  bins = torch.arange(probs.size(-1), device=logits.device, dtype=torch.float32)
1021
  idx = (probs * bins).sum(dim=-1) / (probs.size(-1) - 1)
1022
- return idx # 0..1
1023
  else:
1024
  idx = logits.argmax(dim=-1).to(torch.float32)
1025
  return idx / float(logits.size(-1) - 1)
1026
 
1027
  with torch.inference_mode():
1028
  while alive.any() and (counts < max_objects).any():
1029
- # --- x ---------------------------------------------------
1030
- x_logits = decode_coordinate(hidden, self.region) # (B,1,1024) or (B,1024)
1031
- if x_logits.dim() == 3:
1032
- x_logits = x_logits.squeeze(1)
1033
- x_center = _argmax01(x_logits) # (B,) in [0,1]
1034
- x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1) # (B,1)
1035
- x_emb = encode_coordinate(x_in, self.region).unsqueeze(1) # (B,1,C)
1036
 
1037
- # advance attention one step FOR ALIVE ROWS ONLY
1038
  mask[alive, :, :, pos] = True
1039
  logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
1040
  pos_ids[alive, 0] += 1
1041
- pos += 1 # scalar next free slot
1042
 
1043
- # --- y ---------------------------------------------------
1044
  y_logits = decode_coordinate(hidden, self.region)
1045
- if y_logits.dim() == 3:
1046
- y_logits = y_logits.squeeze(1)
1047
- y_center = _argmax01(y_logits) # (B,)
1048
- y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)
1049
  y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
1050
 
1051
  mask[alive, :, :, pos] = True
@@ -1054,32 +1039,31 @@ class MoondreamModel(nn.Module):
1054
  pos += 1
1055
 
1056
  if include_size:
1057
- # --- size --------------------------------------------
1058
  size_logits = decode_size(hidden, self.region)
1059
  w_logits = size_logits[0].squeeze(1)
1060
  h_logits = size_logits[1].squeeze(1)
1061
  if use_soft_argmax:
1062
- # convert expected-bin -> size (same mapping as paper/code)
1063
- w_bin = (torch.softmax(w_logits, dim=-1) * torch.arange(w_logits.size(-1), device=device)).sum(dim=-1)
1064
- h_bin = (torch.softmax(h_logits, dim=-1) * torch.arange(h_logits.size(-1), device=device)).sum(dim=-1)
 
1065
  else:
1066
  w_bin = w_logits.argmax(dim=-1).to(torch.float32)
1067
  h_bin = h_logits.argmax(dim=-1).to(torch.float32)
 
1068
  w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
1069
  h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
1070
 
1071
- size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype) # (B,2)
1072
- size_emb = encode_size(size_in, self.region).unsqueeze(1) # (B,1,C)
1073
 
1074
- # record boxes only for alive rows
1075
  for i in range(B):
1076
- if not alive[i]:
1077
- continue
1078
  xl = (x_center[i] - w[i] / 2).item()
1079
  xr = (x_center[i] + w[i] / 2).item()
1080
  yt = (y_center[i] - h[i] / 2).item()
1081
  yb = (y_center[i] + h[i] / 2).item()
1082
- # clamp for safety
1083
  out[i].append({
1084
  "x_min": max(0.0, min(1.0, xl)),
1085
  "y_min": max(0.0, min(1.0, yt)),
@@ -1091,7 +1075,7 @@ class MoondreamModel(nn.Module):
1091
  logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
1092
  pos_ids[alive, 0] += 1
1093
  pos += 1
1094
- next_tok = logits.argmax(dim=-1).squeeze(-1) # (B,)
1095
  else:
1096
  for i in range(B):
1097
  if alive[i]:
@@ -1102,7 +1086,6 @@ class MoondreamModel(nn.Module):
1102
  pos += 1
1103
  next_tok = logits.argmax(dim=-1).squeeze(-1)
1104
 
1105
- # stop only rows that hit eos (or reached max objects)
1106
  finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
1107
  counts = counts + ((~finished_now) & alive).to(counts.dtype)
1108
  alive &= ~finished_now
 
77
  "v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
78
  )
79
 
 
80
  def update(self, pos_ids, k, v):
81
  """
82
  Supports:
83
  • Prefill: k,v = (B, n_kv_heads, q_len, d), pos_ids = (q_len,)
84
  • 1-step: k,v = (B, n_kv_heads, 1, d), pos_ids = (B,1) or (B,)
85
+ Scalar: k,v = (B, n_kv_heads, 1, d), pos_ids = ()
86
  Writes into self.k_cache/self.v_cache shaped (B, n_kv_heads, T_max, d).
87
  """
88
  kout, vout = self.k_cache, self.v_cache
89
 
 
90
  if not torch.is_tensor(pos_ids):
91
  pos_ids = torch.tensor(pos_ids, device=k.device, dtype=torch.long)
92
  else:
 
96
  raise RuntimeError(f"KV update expects k,v 4D. Got k={tuple(k.shape)} v={tuple(v.shape)}")
97
  B, Hkv, q_len, D = k.shape
98
 
99
+ # Expand cache to batch B if needed (expand-from-1 allowed)
100
  if kout.size(0) != B:
101
  if kout.size(0) == 1:
102
  self.k_cache = kout.expand(B, -1, -1, -1).clone()
 
105
  else:
106
  raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")
107
 
108
+ # A) Prefill: pos_ids = (q_len,)
109
  if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
110
  for i in range(B):
111
+ kout[i, :, pos_ids, :] = k[i]
112
  vout[i, :, pos_ids, :] = v[i]
113
  return kout, vout
114
 
115
+ # B) One-step: q_len == 1 and pos_ids per row: (B,) or (B,1)
116
  if q_len == 1 and pos_ids.numel() == B:
117
  pos_ids = pos_ids.view(B)
118
  for i in range(B):
 
121
  vout[i, :, pi, :] = v[i, :, 0, :]
122
  return kout, vout
123
 
124
+ # C) Scalar position for everyone & q_len == 1
125
  if pos_ids.dim() == 0 and q_len == 1:
126
  pi = int(pos_ids.item())
127
  kout[:, :, pi, :] = k[:, :, 0, :]
128
  vout[:, :, pi, :] = v[:, :, 0, :]
129
  return kout, vout
130
 
131
+ raise RuntimeError(f"Unsupported KV update combo: k={tuple(k.shape)}, pos_ids={tuple(pos_ids.shape)}")
132
+
 
133
 
134
 
135
 
 
210
 
211
 
212
  def _reset_kv_caches(self, batch_size: int = 1):
 
 
 
 
213
  c = self.config.text
214
  head_dim = c.dim // c.n_heads
215
  for blk in self.text.blocks:
 
218
  shape = (batch_size, c.n_kv_heads, c.max_context, head_dim)
219
  blk.kv_cache.k_cache = torch.zeros(shape, device=device, dtype=dtype)
220
  blk.kv_cache.v_cache = torch.zeros(shape, device=device, dtype=dtype)
221
+
222
 
223
 
224
  def _setup_caches(self):
 
583
  elif not isinstance(image, Image.Image):
584
  raise ValueError("image must be a PIL Image or EncodedImage")
585
 
586
+ # Always start from single-row caches to avoid leftovers from batched runs
587
  for blk in self.text.blocks:
588
  if blk.kv_cache.k_cache.size(0) != 1:
589
  blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
590
  blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
591
 
592
+
593
 
594
  lora = (
595
  variant_state_dict(settings["variant"], device=self.device)
 
928
  b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
929
  b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
930
 
931
+
932
  def _prefill_prompt_batched(self, labels, pos: int, lora=None,
933
  temperature: float = 0.0, top_p: float = 0.0):
934
  tpl = self.config.tokenizer.templates["detect"]
 
945
 
946
  prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
947
  for i, ids in enumerate(rows):
948
+ prompt_ids[i, :ids.numel()] = ids
949
 
950
  prompt_emb = text_encoder(prompt_ids, self.text) # (B,T,C)
951
  torch._dynamo.mark_dynamic(prompt_emb, 1)
952
 
 
953
  base = self.attn_mask[:, :, pos:pos+T, :] # (1,1,T,kv_len)
954
  mask = base.expand(B, -1, -1, -1).contiguous() # (B,1,T,kv_len)
955
 
 
962
  last_logits = logits_BTV[torch.arange(B, device=self.device), idx] # (B,V)
963
 
964
  if temperature == 0.0:
965
+ next_token = last_logits.argmax(dim=-1, keepdim=True) # (B,1)
966
  else:
967
  probs = torch.softmax(last_logits / temperature, dim=-1)
968
  probs = self._apply_top_p(probs, top_p)
969
+ next_token = torch.multinomial(probs, num_samples=1) # (B,1)
970
 
971
+ pos_end = int(pos + T) # shared scalar end position
 
 
 
 
972
  return last_hidden, next_token, pos_end
973
 
974
 
975
+
976
  def _generate_points_batched(
977
  self,
978
  hidden, # (B,1,C)
979
+ next_token, # (B,1)
980
  pos, # int or Tensor; normalized below
981
  include_size: bool = True,
982
  max_objects: int = 50,
983
  lora=None,
984
+ use_soft_argmax: bool = True, # reduces bbox jitter
985
  ):
986
  B = hidden.size(0)
987
  device = self.device
 
989
  eos_id = self.config.tokenizer.eos_id
990
  max_ctx = self.config.text.max_context
991
 
 
992
  if torch.is_tensor(pos):
993
+ pos = int(pos.max().item())
994
 
995
+ # SDPA mask and position ids
996
  mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
997
  if pos > 0:
998
  mask[:, :, :, :pos] = True
 
 
999
  pos_ids = torch.full((B, 1), pos, device=device, dtype=torch.long)
1000
 
1001
  alive = torch.ones(B, dtype=torch.bool, device=device)
1002
  counts = torch.zeros(B, dtype=torch.int32, device=device)
1003
 
 
1004
  def _argmax01(logits):
1005
+ # logits: (B, bins) -> normalized index in [0,1]
1006
  if use_soft_argmax:
1007
  probs = torch.softmax(logits, dim=-1)
1008
  bins = torch.arange(probs.size(-1), device=logits.device, dtype=torch.float32)
1009
  idx = (probs * bins).sum(dim=-1) / (probs.size(-1) - 1)
1010
+ return idx
1011
  else:
1012
  idx = logits.argmax(dim=-1).to(torch.float32)
1013
  return idx / float(logits.size(-1) - 1)
1014
 
1015
  with torch.inference_mode():
1016
  while alive.any() and (counts < max_objects).any():
1017
+ # x
1018
+ x_logits = decode_coordinate(hidden, self.region)
1019
+ if x_logits.dim() == 3: x_logits = x_logits.squeeze(1)
1020
+ x_center = _argmax01(x_logits)
1021
+ x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)
1022
+ x_emb = encode_coordinate(x_in, self.region).unsqueeze(1)
 
1023
 
 
1024
  mask[alive, :, :, pos] = True
1025
  logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
1026
  pos_ids[alive, 0] += 1
1027
+ pos += 1
1028
 
1029
+ # y
1030
  y_logits = decode_coordinate(hidden, self.region)
1031
+ if y_logits.dim() == 3: y_logits = y_logits.squeeze(1)
1032
+ y_center = _argmax01(y_logits)
1033
+ y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)
 
1034
  y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
1035
 
1036
  mask[alive, :, :, pos] = True
 
1039
  pos += 1
1040
 
1041
  if include_size:
1042
+ # size
1043
  size_logits = decode_size(hidden, self.region)
1044
  w_logits = size_logits[0].squeeze(1)
1045
  h_logits = size_logits[1].squeeze(1)
1046
  if use_soft_argmax:
1047
+ w_bin = (torch.softmax(w_logits, dim=-1) *
1048
+ torch.arange(w_logits.size(-1), device=device)).sum(dim=-1)
1049
+ h_bin = (torch.softmax(h_logits, dim=-1) *
1050
+ torch.arange(h_logits.size(-1), device=device)).sum(dim=-1)
1051
  else:
1052
  w_bin = w_logits.argmax(dim=-1).to(torch.float32)
1053
  h_bin = h_logits.argmax(dim=-1).to(torch.float32)
1054
+
1055
  w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
1056
  h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
1057
 
1058
+ size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)
1059
+ size_emb = encode_size(size_in, self.region).unsqueeze(1)
1060
 
 
1061
  for i in range(B):
1062
+ if not alive[i]: continue
 
1063
  xl = (x_center[i] - w[i] / 2).item()
1064
  xr = (x_center[i] + w[i] / 2).item()
1065
  yt = (y_center[i] - h[i] / 2).item()
1066
  yb = (y_center[i] + h[i] / 2).item()
 
1067
  out[i].append({
1068
  "x_min": max(0.0, min(1.0, xl)),
1069
  "y_min": max(0.0, min(1.0, yt)),
 
1075
  logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
1076
  pos_ids[alive, 0] += 1
1077
  pos += 1
1078
+ next_tok = logits.argmax(dim=-1).squeeze(-1)
1079
  else:
1080
  for i in range(B):
1081
  if alive[i]:
 
1086
  pos += 1
1087
  next_tok = logits.argmax(dim=-1).squeeze(-1)
1088
 
 
1089
  finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
1090
  counts = counts + ((~finished_now) & alive).to(counts.dtype)
1091
  alive &= ~finished_now