JigsawStack
/

moondream2-batched

@@ -79,11 +79,23 @@ class KVCache(nn.Module):
     def update(self, pos_ids, k, v):
         kout, vout = self.k_cache, self.v_cache
-        kout[:, :, pos_ids, :] = k
-        vout[:, :, pos_ids, :] = v
         return kout, vout
 class MoondreamModel(nn.Module):
     def __init__(
@@ -532,6 +544,11 @@ class MoondreamModel(nn.Module):
             return image
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
         lora = (
             variant_state_dict(settings["variant"], device=self.device)
@@ -867,14 +884,8 @@ class MoondreamModel(nn.Module):
             b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
             b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
-    def _prefill_prompt_batched(
-        self,
-        labels,
-        pos: int,
-        lora=None,
-        temperature: float = 0.0,
-        top_p: float = 0.0,
-    ):
         tpl = self.config.tokenizer.templates["detect"]
         if tpl is None:
             raise NotImplementedError("Model does not support object detection (no detect template).")
@@ -882,28 +893,27 @@ class MoondreamModel(nn.Module):
         rows, lens = [], []
         for lab in labels:
             ids = tpl["prefix"] + self.tokenizer.encode(" " + lab).ids + tpl["suffix"]
-            rows.append(torch.tensor(ids, device=self.device, dtype=torch.long))
-            lens.append(len(ids))
-        B = len(rows)
-        T = max(lens)
         eos = self.config.tokenizer.eos_id
         prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
         for i, ids in enumerate(rows):
             prompt_ids[i, : ids.numel()] = ids
-        prompt_emb = text_encoder(prompt_ids, self.text)   # (B, T, C)
         torch._dynamo.mark_dynamic(prompt_emb, 1)
-        # 4-D mask is broadcastable to (B, n_heads, T, K)
-        attn = self.attn_mask
-        mask = attn[:, :, pos : pos + T, :].expand(B, -1, -1, -1).contiguous()
-        pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)
-        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)  # (B, T, C)
-        logits_BTV = lm_head(hidden_BTC, self.text)                  # (B, T, V)
-        idx = (torch.tensor(lens, device=self.device, dtype=torch.long) - 1).clamp_min(0)
         last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :]  # (B,1,C)
         last_logits = logits_BTV[torch.arange(B, device=self.device), idx]              # (B,V)
@@ -914,117 +924,105 @@ class MoondreamModel(nn.Module):
             probs = self._apply_top_p(probs, top_p)
             next_token = torch.multinomial(probs, num_samples=1)   # (B,1)
-        pos_end = pos + T
-        return last_hidden, next_token, pos_end  # (B,1,C), (B,1), int
-    def _generate_points_batched(
-        self,
-        hidden,              # (B,1,C)
-        next_token,          # (B,1)
-        pos: int,            # shared scalar next position
-        include_size: bool = True,
-        max_objects: int = 50,
-        lora=None,
-    ):
         B = hidden.size(0)
         device = self.device
         out = [[] for _ in range(B)]
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context
-        # 4-D mask: (B, 1, q_len=1, kv_len)
         mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
-        if pos > 0:
-            mask[:, :, :, :pos] = True
-        pos_id = torch.tensor([pos], device=device, dtype=torch.long)  # (1,)
-        alive = torch.ones(B, dtype=torch.bool, device=device)
-        counts = torch.zeros(B, dtype=torch.int32, device=device)
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
-                # --- x coordinate ---
-                x_logits = decode_coordinate(hidden, self.region)  # (B,1,1024) or (B,1024)
-                if x_logits.dim() == 3:
-                    x_logits = x_logits.squeeze(1)
                 x_bin = x_logits.argmax(dim=-1).to(torch.float32)
-                x_center = x_bin / float(x_logits.size(-1))        # (B,)
                 x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)  # (B,1)
-                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1)  # (B,1,C)
-                mask[:, :, :, pos_id[0].item()] = True
-                logits, hidden = self._decode_one_tok(x_emb, mask, pos_id, lora)
-                pos_id += 1
-                # --- y coordinate ---
                 y_logits = decode_coordinate(hidden, self.region)
-                if y_logits.dim() == 3:
-                    y_logits = y_logits.squeeze(1)
                 y_bin = y_logits.argmax(dim=-1).to(torch.float32)
-                y_center = y_bin / float(y_logits.size(-1))        # (B,)
                 y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)
                 y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
-                mask[:, :, :, pos_id[0].item()] = True
-                logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
-                pos_id += 1
                 if include_size:
-                    size_logits = decode_size(hidden, self.region)
-                    # Support both tuple-of-tensors and flattened (2, -1) forms
-                    if isinstance(size_logits, (tuple, list)):
-                        w_logits = size_logits[0]
-                        h_logits = size_logits[1]
-                        if w_logits.dim() == 3:  # (B,1,1024)
-                            w_logits = w_logits.squeeze(1)
-                            h_logits = h_logits.squeeze(1)
-                    else:
-                        # size_logits shape: (2, B * size_bins) — reshape it back.
-                        size_logits = size_logits.reshape(2, B, -1)
-                        w_logits, h_logits = size_logits[0], size_logits[1]  # (B, size_bins)
                     w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                     h_bin = h_logits.argmax(dim=-1).to(torch.float32)
-                    # inverse of log-scale mapping used by Moondream
                     w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
                     h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
-                    size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B,2)
                     size_emb = encode_size(size_in, self.region).unsqueeze(1)       # (B,1,C)
                     for i in range(B):
-                        if alive[i]:
-                            out[i].append({
-                                "x_min": (x_center[i] - w[i] / 2).item(),
-                                "y_min": (y_center[i] - h[i] / 2).item(),
-                                "x_max": (x_center[i] + w[i] / 2).item(),
-                                "y_max": (y_center[i] + h[i] / 2).item(),
-                            })
-                    mask[:, :, :, pos_id[0].item()] = True
-                    logits, hidden = self._decode_one_tok(size_emb, mask, pos_id, lora)
-                    pos_id += 1
                     next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
                 else:
                     for i in range(B):
                         if alive[i]:
                             out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
-                    mask[:, :, :, pos_id[0].item()] = True
-                    logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
-                    pos_id += 1
                     next_tok = logits.argmax(dim=-1).squeeze(-1)
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
-                counts = counts + (~finished_now & alive).to(counts.dtype)
                 alive &= ~finished_now
         return out
     def detect_multi(self, image, objects, settings=None):
         """
         Parallel multi-label detection.

     def update(self, pos_ids, k, v):
         """Write new key/value states into the cache and return the cache.

         ``pos_ids`` is interpreted in one of two ways:

         * Shared positions: a Python int, a 0-D tensor, or a 1-D tensor of
           sequence positions (for example a ``(T,)`` range of prompt
           positions). Every batch row is written at the same position(s).
         * Per-row positions: a 1-D tensor with one position per batch row.
           This is only selected for a single-step write
           (``k.size(2) == 1``) with more than one row, so a ``(T,)`` range
           of sequence positions is never mistaken for per-row offsets.

         Assumes ``k``/``v`` and the caches are laid out as
         (batch, heads, seq, head_dim), matching the indexing below.

         Returns:
             The ``(k_cache, v_cache)`` tensors after the in-place write.
         """
         kout, vout = self.k_cache, self.v_cache
         per_row = (
             torch.is_tensor(pos_ids)
             and pos_ids.ndim == 1
             and k.size(2) == 1
             and k.size(0) > 1
             and pos_ids.numel() == k.size(0)
         )
         if per_row:
             # Row i writes its single new step at pos_ids[i]. Pairing a row
             # index with a position index does this in one indexed write
             # instead of a Python loop with one host sync per row.
             rows = torch.arange(k.size(0), device=pos_ids.device)
             kout[rows, :, pos_ids, :] = k[:, :, 0, :]
             vout[rows, :, pos_ids, :] = v[:, :, 0, :]
         else:
             # Shared position(s): identical slot(s) for every batch row.
             kout[:, :, pos_ids, :] = k
             vout[:, :, pos_ids, :] = v
         return kout, vout
 class MoondreamModel(nn.Module):
     def __init__(
             return image
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
+        for blk in self.text.blocks:
+            if blk.kv_cache.k_cache.size(0) != 1:
+                blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
+                blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
         lora = (
             variant_state_dict(settings["variant"], device=self.device)
             b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
             b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
+    def _prefill_prompt_batched(self, labels, pos: int, lora=None,
+                            temperature: float = 0.0, top_p: float = 0.0):
         tpl = self.config.tokenizer.templates["detect"]
         if tpl is None:
             raise NotImplementedError("Model does not support object detection (no detect template).")
         rows, lens = [], []
         for lab in labels:
             ids = tpl["prefix"] + self.tokenizer.encode(" " + lab).ids + tpl["suffix"]
+            t = torch.tensor(ids, device=self.device, dtype=torch.long)
+            rows.append(t); lens.append(t.numel())
+        B = len(rows); T = max(lens)
         eos = self.config.tokenizer.eos_id
         prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
         for i, ids in enumerate(rows):
             prompt_ids[i, : ids.numel()] = ids
+        prompt_emb = text_encoder(prompt_ids, self.text)  # (B,T,C)
         torch._dynamo.mark_dynamic(prompt_emb, 1)
+        # 4-D mask: (B,1,T,kv_len) for SDPA
+        base = self.attn_mask[:, :, pos:pos+T, :]         # (1,1,T,kv_len)
+        mask = base.expand(B, -1, -1, -1).contiguous()    # (B,1,T,kv_len)
+        pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)  # (T,)
+        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)                  # (B,T,C)
+        logits_BTV = lm_head(hidden_BTC, self.text)
+        idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0)              # (B,)
         last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :]  # (B,1,C)
         last_logits = logits_BTV[torch.arange(B, device=self.device), idx]              # (B,V)
             probs = self._apply_top_p(probs, top_p)
             next_token = torch.multinomial(probs, num_samples=1)   # (B,1)
+        # CRITICAL: per-row next position
+        pos_vec = torch.tensor(lens, device=self.device, dtype=torch.long) + pos      # (B,)
+        return last_hidden, next_token, pos_vec
+    def _generate_points_batched(self, hidden, next_token, pos_vec,
+                             include_size: bool = True, max_objects: int = 50, lora=None):
         B = hidden.size(0)
         device = self.device
         out = [[] for _ in range(B)]
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context
+        # 4-D mask: (B,1,1,kv_len) and fill historical prefix per row
         mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
+        for i in range(B):
+            p = int(pos_vec[i].item())
+            if p > 0: mask[i, 0, 0, :p] = True
+        pos_ids = pos_vec.clone()  # (B,)
+        alive   = torch.ones(B, dtype=torch.bool, device=device)
+        counts  = torch.zeros(B, dtype=torch.int32, device=device)
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
+                # --- x ---
+                x_logits = decode_coordinate(hidden, self.region)       # (B,1,1024) or (B,1024)
+                if x_logits.dim() == 3: x_logits = x_logits.squeeze(1)
                 x_bin = x_logits.argmax(dim=-1).to(torch.float32)
+                x_center = x_bin / float(x_logits.size(-1))             # (B,)
                 x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)  # (B,1)
+                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1)   # (B,1,C)
+                # advance one position per *alive* row
+                for i in range(B):
+                    if alive[i]: mask[i, 0, 0, int(pos_ids[i].item())] = True
+                logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
+                pos_ids = pos_ids + alive.to(torch.long)
+                # --- y ---
                 y_logits = decode_coordinate(hidden, self.region)
+                if y_logits.dim() == 3: y_logits = y_logits.squeeze(1)
                 y_bin = y_logits.argmax(dim=-1).to(torch.float32)
+                y_center = y_bin / float(y_logits.size(-1))
                 y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)
                 y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
+                for i in range(B):
+                    if alive[i]: mask[i, 0, 0, int(pos_ids[i].item())] = True
+                logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
+                pos_ids = pos_ids + alive.to(torch.long)
                 if include_size:
+                    size_logits = decode_size(hidden, self.region)  # Expect [(B,1,1024),(B,1,1024)] or (tuple)
+                    # be robust to either rank
+                    w_logits = size_logits[0].squeeze(1) if size_logits[0].dim() == 3 else size_logits[0]
+                    h_logits = size_logits[1].squeeze(1) if size_logits[1].dim() == 3 else size_logits[1]
                     w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                     h_bin = h_logits.argmax(dim=-1).to(torch.float32)
                     w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
                     h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
+                    size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)   # (B,2)
                     size_emb = encode_size(size_in, self.region).unsqueeze(1)       # (B,1,C)
                     for i in range(B):
+                        if not alive[i]: continue
+                        out[i].append({
+                            "x_min": (x_center[i] - w[i] / 2).item(),
+                            "y_min": (y_center[i] - h[i] / 2).item(),
+                            "x_max": (x_center[i] + w[i] / 2).item(),
+                            "y_max": (y_center[i] + h[i] / 2).item(),
+                        })
+                    for i in range(B):
+                        if alive[i]: mask[i, 0, 0, int(pos_ids[i].item())] = True
+                    logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
+                    pos_ids = pos_ids + alive.to(torch.long)
                     next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
                 else:
                     for i in range(B):
                         if alive[i]:
                             out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
+                    for i in range(B):
+                        if alive[i]: mask[i, 0, 0, int(pos_ids[i].item())] = True
+                    logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
+                    pos_ids = pos_ids + alive.to(torch.long)
                     next_tok = logits.argmax(dim=-1).squeeze(-1)
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
+                counts = counts + ((~finished_now) & alive).to(counts.dtype)
                 alive &= ~finished_now
         return out
     def detect_multi(self, image, objects, settings=None):
         """
         Parallel multi-label detection.