JigsawStack
/

moondream2-batched

@@ -64,39 +64,35 @@ class EncodedImage:
     pos: int
     caches: List[Tuple[torch.Tensor, torch.Tensor]]
 class KVCache(nn.Module):
     def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
         super().__init__()
         cache_shape = (1, n_kv_heads, max_context, dim // n_heads)
-        self.register_buffer(
-            "k_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
-        )
-        self.register_buffer(
-            "v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype)
-        )
     def update(self, pos_ids, k, v):
         """
         Supports:
           • Prefill:  k,v  = (B, n_kv_heads, q_len, d),  pos_ids = (q_len,)
           • 1-step:   k,v  = (B, n_kv_heads, 1, d),      pos_ids = (B,1) or (B,)
-          • Scalar:   k,v  = (B, n_kv_heads, 1, d),      pos_ids = ()
         Writes into self.k_cache/self.v_cache shaped (B, n_kv_heads, T_max, d).
         """
         kout, vout = self.k_cache, self.v_cache
         if not torch.is_tensor(pos_ids):
             pos_ids = torch.tensor(pos_ids, device=k.device, dtype=torch.long)
         else:
             pos_ids = pos_ids.to(device=k.device, dtype=torch.long)
         if k.dim() != 4 or v.dim() != 4:
-            raise RuntimeError(f"KV update expects k,v 4D. Got k={tuple(k.shape)} v={tuple(v.shape)}")
         B, Hkv, q_len, D = k.shape
-        # Expand cache to batch B if needed (expand-from-1 allowed)
         if kout.size(0) != B:
             if kout.size(0) == 1:
                 self.k_cache = kout.expand(B, -1, -1, -1).clone()
@@ -104,34 +100,31 @@ class KVCache(nn.Module):
                 kout, vout = self.k_cache, self.v_cache
             else:
                 raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")
-        # A) Prefill: pos_ids = (q_len,)
         if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
             for i in range(B):
                 kout[i, :, pos_ids, :] = k[i]
                 vout[i, :, pos_ids, :] = v[i]
             return kout, vout
-        # B) One-step: q_len == 1 and pos_ids per row: (B,) or (B,1)
-        if q_len == 1 and pos_ids.numel() == B:
             pos_ids = pos_ids.view(B)
             for i in range(B):
                 pi = int(pos_ids[i].item())
                 kout[i, :, pi, :] = k[i, :, 0, :]
                 vout[i, :, pi, :] = v[i, :, 0, :]
             return kout, vout
-        # C) Scalar position for everyone & q_len == 1
         if pos_ids.dim() == 0 and q_len == 1:
             pi = int(pos_ids.item())
             kout[:, :, pi, :] = k[:, :, 0, :]
             vout[:, :, pi, :] = v[:, :, 0, :]
             return kout, vout
-        raise RuntimeError(f"Unsupported KV update combo: k={tuple(k.shape)}, pos_ids={tuple(pos_ids.shape)}")
@@ -214,11 +207,12 @@ class MoondreamModel(nn.Module):
         head_dim = c.dim // c.n_heads
         for blk in self.text.blocks:
             device = blk.kv_cache.k_cache.device
-            dtype = blk.kv_cache.k_cache.dtype
-            shape = (batch_size, c.n_kv_heads, c.max_context, head_dim)
             blk.kv_cache.k_cache = torch.zeros(shape, device=device, dtype=dtype)
             blk.kv_cache.v_cache = torch.zeros(shape, device=device, dtype=dtype)
     def _setup_caches(self):
@@ -575,52 +569,41 @@ class MoondreamModel(nn.Module):
         image: Union[Image.Image, EncodedImage],
         settings: Optional[ImageEncodingSettings] = None,
     ) -> EncodedImage:
-        # Always start from single-row caches; avoids leftovers from batched runs. DO NOT TOUCH THIS!!!!!!!!!
         self._setup_caches()
-        if isinstance(image, EncodedImage):
-            return image
-        elif not isinstance(image, Image.Image):
-            raise ValueError("image must be a PIL Image or EncodedImage")
-        # Always start from single-row caches to avoid leftovers from batched runs
         for blk in self.text.blocks:
             if blk.kv_cache.k_cache.size(0) != 1:
                 blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
                 blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
-        lora = (
-            variant_state_dict(settings["variant"], device=self.device)
-            if settings is not None and "variant" in settings
-            else None
-        )
         with torch.inference_mode():
             img_emb = self._run_vision_encoder(image)
-            bos_emb = text_encoder(
-                torch.tensor([[self.config.tokenizer.bos_id]], device=self.device), self.text
-            )
             inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
-            attn = self.attn_mask          # (1,1,Tmax,Tmax)
-            mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
-            pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long)
             self._prefill(inputs_embeds, mask, pos_ids, lora)
         return EncodedImage(
             pos=inputs_embeds.size(1),
             caches=[
                 (
-                    b.kv_cache.k_cache[:, :, : inputs_embeds.size(1), :].clone(),
-                    b.kv_cache.v_cache[:, :, : inputs_embeds.size(1), :].clone(),
                 )
                 for b in self.text.blocks
             ],
         )
     def query(
         self,
         image: Optional[Union[Image.Image, EncodedImage]] = None,
@@ -913,22 +896,18 @@ class MoondreamModel(nn.Module):
     def _load_encoded_image_batched(self, encoded_image, batch_size: int):
-        """
-        Clone single-image KV caches into a batch-B cache so we can decode B labels in parallel.
-        """
         for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
             T = k.size(2)
-            # Allocate new [B, n_kv_heads, T_max, head_dim] caches if needed
             if b.kv_cache.k_cache.size(0) != batch_size:
                 new_k = b.kv_cache.k_cache.new_zeros((batch_size,) + b.kv_cache.k_cache.shape[1:])
                 new_v = b.kv_cache.v_cache.new_zeros((batch_size,) + b.kv_cache.v_cache.shape[1:])
                 b.kv_cache.k_cache = new_k
                 b.kv_cache.v_cache = new_v
-            # Copy current prefix from the encoded image into all B rows
             b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
             b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
     def _prefill_prompt_batched(self, labels, pos: int, lora=None,
                             temperature: float = 0.0, top_p: float = 0.0):
         tpl = self.config.tokenizer.templates["detect"]
@@ -945,34 +924,35 @@ class MoondreamModel(nn.Module):
         prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
         for i, ids in enumerate(rows):
-            prompt_ids[i, :ids.numel()] = ids
-        prompt_emb = text_encoder(prompt_ids, self.text)  # (B,T,C)
         torch._dynamo.mark_dynamic(prompt_emb, 1)
-        base = self.attn_mask[:, :, pos:pos+T, :]         # (1,1,T,kv_len)
-        mask = base.expand(B, -1, -1, -1).contiguous()    # (B,1,T,kv_len)
         pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)  # (T,)
-        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)                  # (B,T,C)
-        logits_BTV = lm_head(hidden_BTC, self.text)
-        idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0)              # (B,)
         last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :]  # (B,1,C)
         last_logits = logits_BTV[torch.arange(B, device=self.device), idx]              # (B,V)
         if temperature == 0.0:
-            next_token = last_logits.argmax(dim=-1, keepdim=True)   # (B,1)
         else:
             probs = torch.softmax(last_logits / temperature, dim=-1)
             probs = self._apply_top_p(probs, top_p)
-            next_token = torch.multinomial(probs, num_samples=1)    # (B,1)
-        pos_end = int(pos + T)  # shared scalar end position
         return last_hidden, next_token, pos_end
     def _generate_points_batched(
         self,
         hidden,              # (B,1,C)
@@ -989,11 +969,11 @@ class MoondreamModel(nn.Module):
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context
-        # Normalize pos to a scalar int (supports int, (1,), (B,), (B,1))
         if torch.is_tensor(pos):
             pos = int(pos.max().item())
-        # 4-D mask: (B, 1, q_len=1, kv_len) + per-row position ids (B,1)
         mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
         if pos > 0:
             mask[:, :, :, :pos] = True
@@ -1004,32 +984,48 @@ class MoondreamModel(nn.Module):
         def _argmax01(logits: torch.Tensor) -> torch.Tensor:
             """
-            logits: (..., bins) -> normalized index in [0,1] per row
-            Accepts (B,1,bins), (B,bins), or (bins,).
             """
-            # Canonicalize to (B, bins)
-            if logits.dim() == 3:  # (B,1,bins)
                 logits = logits.squeeze(1)
-            elif logits.dim() == 1:  # (bins,) -> (1,bins)
                 logits = logits.unsqueeze(0)
             if use_soft_argmax:
                 probs = torch.softmax(logits, dim=-1)
-                bins_idx = torch.arange(probs.size(-1), device=probs.device, dtype=torch.float32)
-                # expected-bin (0..bins-1) -> normalize by (bins-1) to [0,1]
-                expbin = (probs * bins_idx).sum(dim=-1)
                 return expbin / float(probs.size(-1) - 1)
             else:
                 idx = logits.argmax(dim=-1).to(torch.float32)
                 return idx / float(logits.size(-1) - 1)
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
                 # ---- x ------------------------------------------------------
-                x_logits = decode_coordinate(hidden, self.region)         # (B,1,b) or (B,b)
-                x_center = _argmax01(x_logits)                            # (B,)
-                x_in  = x_center.to(dtype=x_logits.dtype if torch.is_tensor(x_logits) else hidden.dtype).unsqueeze(-1)
-                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1) # (B,1,C)
                 mask[alive, :, :, pos] = True
                 logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
@@ -1037,9 +1033,10 @@ class MoondreamModel(nn.Module):
                 pos += 1
                 # ---- y ------------------------------------------------------
-                y_logits = decode_coordinate(hidden, self.region)         # (B,1,b) or (B,b)
-                y_center = _argmax01(y_logits)                            # (B,)
-                y_in  = y_center.to(dtype=y_logits.dtype if torch.is_tensor(y_logits) else hidden.dtype).unsqueeze(-1)
                 y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
                 mask[alive, :, :, pos] = True
@@ -1049,22 +1046,24 @@ class MoondreamModel(nn.Module):
                 if include_size:
                     # ---- size ----------------------------------------------
-                    size_logits = decode_size(hidden, self.region)         # tuple of (w_logits, h_logits)
                     w_logits, h_logits = size_logits
-                    # Canonicalize to (B, bins) for both
                     if w_logits.dim() == 3: w_logits = w_logits.squeeze(1)
                     if h_logits.dim() == 3: h_logits = h_logits.squeeze(1)
                     if w_logits.dim() == 1: w_logits = w_logits.unsqueeze(0)
                     if h_logits.dim() == 1: h_logits = h_logits.unsqueeze(0)
                     if use_soft_argmax:
                         w_probs = torch.softmax(w_logits, dim=-1)
                         h_probs = torch.softmax(h_logits, dim=-1)
-                        w_bins_idx = torch.arange(w_probs.size(-1), device=device, dtype=torch.float32)
-                        h_bins_idx = torch.arange(h_probs.size(-1), device=device, dtype=torch.float32)
-                        w_bin = (w_probs * w_bins_idx).sum(dim=-1)         # (B,)
-                        h_bin = (h_probs * h_bins_idx).sum(dim=-1)         # (B,)
                     else:
                         w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                         h_bin = h_logits.argmax(dim=-1).to(torch.float32)
@@ -1075,8 +1074,11 @@ class MoondreamModel(nn.Module):
                     w = torch.pow(2.0, (w_bin / w_den) * 10.0 - 10.0)
                     h = torch.pow(2.0, (h_bin / h_den) * 10.0 - 10.0)
-                    size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B,2)
-                    size_emb = encode_size(size_in, self.region).unsqueeze(1)      # (B,1,C)
                     # record boxes only for alive rows
                     for i in range(B):
@@ -1097,11 +1099,11 @@ class MoondreamModel(nn.Module):
                     logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
                     pos_ids[alive, 0] += 1
                     pos += 1
-                    next_tok = logits.argmax(dim=-1).squeeze(-1)          # (B,)
                 else:
                     for i in range(B):
                         if alive[i]:
-                            out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
                     mask[alive, :, :, pos] = True
                     logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
                     pos_ids[alive, 0] += 1
@@ -1120,16 +1122,8 @@ class MoondreamModel(nn.Module):
     def detect_multi(self, image, objects, settings=None):
-        """
-        Parallel multi-label detection.
-        Args:
-            image: PIL.Image or EncodedImage
-            objects: list[str], e.g. ["person", "car"]
-            settings: Optional[ObjectSamplingSettings], honors "max_objects" and "variant"
-        Returns:
-            {"objects": {label: [box_dict, ...]}}
-        """
         if self.config.tokenizer.templates["detect"] is None:
             raise NotImplementedError("Model does not support object detection.")
         settings = settings or {}
@@ -1160,9 +1154,8 @@ class MoondreamModel(nn.Module):
                 d["label"] = lab
             res[lab] = lst
-        # IMPORTANT: restore caches to B=1 so future calls (e.g., encode_image) are safe.
         self._reset_kv_caches(1)
         return {"objects": res}

     pos: int
     caches: List[Tuple[torch.Tensor, torch.Tensor]]
 class KVCache(nn.Module):
     def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
         super().__init__()
         cache_shape = (1, n_kv_heads, max_context, dim // n_heads)
+        self.register_buffer("k_cache", torch.zeros(*cache_shape, device=device, dtype=dtype))
+        self.register_buffer("v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype))
     def update(self, pos_ids, k, v):
         """
         Supports:
           • Prefill:  k,v  = (B, n_kv_heads, q_len, d),  pos_ids = (q_len,)
           • 1-step:   k,v  = (B, n_kv_heads, 1, d),      pos_ids = (B,1) or (B,)
+          • Scalar:   k,v  = (B, n_kv_heads, 1, d),      pos_ids = () (one position shared by all rows)
         Writes into self.k_cache/self.v_cache shaped (B, n_kv_heads, T_max, d).
         """
         kout, vout = self.k_cache, self.v_cache
+        # Normalize pos_ids
         if not torch.is_tensor(pos_ids):
             pos_ids = torch.tensor(pos_ids, device=k.device, dtype=torch.long)
         else:
             pos_ids = pos_ids.to(device=k.device, dtype=torch.long)
         if k.dim() != 4 or v.dim() != 4:
+            raise RuntimeError(f"KV update expects 4D k,v. Got k={tuple(k.shape)} v={tuple(v.shape)}")
         B, Hkv, q_len, D = k.shape
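+        # Ensure cache batch matches B (expand-from-1 allowed).
+        # NOTE(review): only the k_cache expansion is visible in this excerpt; v_cache needs the same expand/clone — confirm that line was not dropped.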
         if kout.size(0) != B:
             if kout.size(0) == 1:
                 self.k_cache = kout.expand(B, -1, -1, -1).clone()
                 kout, vout = self.k_cache, self.v_cache
             else:
                 raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")
+        # Case A: PREFILL — vector of length q_len (same for all B rows)
         if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
             for i in range(B):
                 kout[i, :, pos_ids, :] = k[i]
                 vout[i, :, pos_ids, :] = v[i]
             return kout, vout
+        # Case B: 1-STEP — q_len == 1 with (B,) or (B,1) per-row positions
+        if q_len == 1 and (pos_ids.numel() == B):
             pos_ids = pos_ids.view(B)
             for i in range(B):
                 pi = int(pos_ids[i].item())
                 kout[i, :, pi, :] = k[i, :, 0, :]
                 vout[i, :, pi, :] = v[i, :, 0, :]
             return kout, vout
+        # Case C: scalar position shared by every row, with q_len == 1
         if pos_ids.dim() == 0 and q_len == 1:
             pi = int(pos_ids.item())
             kout[:, :, pi, :] = k[:, :, 0, :]
             vout[:, :, pi, :] = v[:, :, 0, :]
             return kout, vout
+        raise RuntimeError(f"Unsupported KV update combo: k={tuple(k.shape)}, pos_ids={tuple(pos_ids.shape)}")
         head_dim = c.dim // c.n_heads
         for blk in self.text.blocks:
             device = blk.kv_cache.k_cache.device
+            dtype  = blk.kv_cache.k_cache.dtype
+            shape  = (batch_size, c.n_kv_heads, c.max_context, head_dim)
             blk.kv_cache.k_cache = torch.zeros(shape, device=device, dtype=dtype)
             blk.kv_cache.v_cache = torch.zeros(shape, device=device, dtype=dtype)
     def _setup_caches(self):
         image: Union[Image.Image, EncodedImage],
         settings: Optional[ImageEncodingSettings] = None,
     ) -> EncodedImage:
+        # Always start from single-row caches; avoids leftovers from batched runs
         self._setup_caches()
         for blk in self.text.blocks:
             if blk.kv_cache.k_cache.size(0) != 1:
                 blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
                 blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
+        if isinstance(image, EncodedImage):
+            return image
+        if not isinstance(image, Image.Image):
+            raise ValueError("image must be a PIL Image or EncodedImage")
+        lora = (variant_state_dict(settings["variant"], device=self.device)
+                if settings is not None and "variant" in settings else None)
         with torch.inference_mode():
             img_emb = self._run_vision_encoder(image)
+            bos_emb = text_encoder(torch.tensor([[self.config.tokenizer.bos_id]], device=self.device), self.text)
             inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
+            mask = self.attn_mask[:, :, :inputs_embeds.size(1), :]
+            pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long, device=self.device)
             self._prefill(inputs_embeds, mask, pos_ids, lora)
         return EncodedImage(
             pos=inputs_embeds.size(1),
             caches=[
                 (
+                    b.kv_cache.k_cache[:, :, :inputs_embeds.size(1), :].clone(),
+                    b.kv_cache.v_cache[:, :, :inputs_embeds.size(1), :].clone(),
                 )
                 for b in self.text.blocks
             ],
         )
     def query(
         self,
         image: Optional[Union[Image.Image, EncodedImage]] = None,
     def _load_encoded_image_batched(self, encoded_image, batch_size: int):
         for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
             T = k.size(2)
             if b.kv_cache.k_cache.size(0) != batch_size:
                 new_k = b.kv_cache.k_cache.new_zeros((batch_size,) + b.kv_cache.k_cache.shape[1:])
                 new_v = b.kv_cache.v_cache.new_zeros((batch_size,) + b.kv_cache.v_cache.shape[1:])
                 b.kv_cache.k_cache = new_k
                 b.kv_cache.v_cache = new_v
             b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
             b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
     def _prefill_prompt_batched(self, labels, pos: int, lora=None,
                             temperature: float = 0.0, top_p: float = 0.0):
         tpl = self.config.tokenizer.templates["detect"]
         prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
         for i, ids in enumerate(rows):
+            prompt_ids[i, : ids.numel()] = ids
+        prompt_emb = text_encoder(prompt_ids, self.text)                 # (B,T,C)
         torch._dynamo.mark_dynamic(prompt_emb, 1)
+        base = self.attn_mask[:, :, pos:pos+T, :]                        # (1,1,T,K)
+        mask = base.expand(B, -1, -1, -1).contiguous()                   # (B,1,T,K)
         pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)  # (T,)
+        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)      # (B,T,C)
+        logits_BTV = lm_head(hidden_BTC, self.text)                      # (B,T,V)
+        idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0)  # (B,)
         last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :]  # (B,1,C)
         last_logits = logits_BTV[torch.arange(B, device=self.device), idx]              # (B,V)
         if temperature == 0.0:
+            next_token = last_logits.argmax(dim=-1, keepdim=True)        # (B,1)
         else:
             probs = torch.softmax(last_logits / temperature, dim=-1)
             probs = self._apply_top_p(probs, top_p)
+            next_token = torch.multinomial(probs, num_samples=1)         # (B,1)
+        pos_end = int(pos + T)                                           # first position after the padded prompt, shared by all rows
         return last_hidden, next_token, pos_end
     def _generate_points_batched(
         self,
         hidden,              # (B,1,C)
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context
+        # Normalize pos to a scalar int; a tensor pos collapses to its maximum element
         if torch.is_tensor(pos):
             pos = int(pos.max().item())
+        # 4-D mask: (B,1,1,max_ctx) and per-row pos ids (B,1)
         mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
         if pos > 0:
             mask[:, :, :, :pos] = True
         def _argmax01(logits: torch.Tensor) -> torch.Tensor:
             """
+            logits: (..., bins) -> (B,) in [0,1]
+            Accepts (B,1,bins), (B,bins), or (bins,)
             """
+            # Canonicalize to (B,bins)
+            if logits.dim() == 3:       # (B,1,bins)
                 logits = logits.squeeze(1)
+            elif logits.dim() == 1:     # (bins,)
                 logits = logits.unsqueeze(0)
+            # If batch accidentally collapsed to 1, expand to B so downstream indexing is safe.
+            if logits.size(0) == 1 and B > 1:
+                logits = logits.expand(B, -1)
             if use_soft_argmax:
                 probs = torch.softmax(logits, dim=-1)
+                bins = torch.arange(probs.size(-1), device=probs.device, dtype=torch.float32)
+                expbin = (probs * bins).sum(dim=-1)
                 return expbin / float(probs.size(-1) - 1)
             else:
                 idx = logits.argmax(dim=-1).to(torch.float32)
                 return idx / float(logits.size(-1) - 1)
+        def _ensure_b(vec: torch.Tensor) -> torch.Tensor:
+            """
+            Make sure 1D tensors are length-B for safe indexing.
+            Accepts scalar/(), (1,), (B,), returns (B,)
+            """
+            if vec.dim() == 0:
+                return vec.repeat(B)
+            if vec.dim() == 1 and vec.numel() == 1 and B > 1:
+                return vec.repeat(B)
+            if vec.dim() == 1 and vec.numel() == B:
+                return vec
+            raise RuntimeError(f"Expected (B,) vec, got shape {tuple(vec.shape)} for B={B}")
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
                 # ---- x ------------------------------------------------------
+                x_logits = decode_coordinate(hidden, self.region)     # (B,1,b) or (B,b) or (b,)
+                x_center = _argmax01(x_logits)                        # (B,)
+                x_center = _ensure_b(x_center)                        # force len B
+                x_in  = x_center.to(dtype=hidden.dtype).unsqueeze(-1) # (B,1)
+                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1)  # (B,1,C)
                 mask[alive, :, :, pos] = True
                 logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
                 pos += 1
                 # ---- y ------------------------------------------------------
+                y_logits = decode_coordinate(hidden, self.region)
+                y_center = _argmax01(y_logits)                        # (B,)
+                y_center = _ensure_b(y_center)
+                y_in  = y_center.to(dtype=hidden.dtype).unsqueeze(-1)
                 y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
                 mask[alive, :, :, pos] = True
                 if include_size:
                     # ---- size ----------------------------------------------
+                    size_logits = decode_size(hidden, self.region)     # tuple: (w_logits, h_logits)
                     w_logits, h_logits = size_logits
+                    # Canonicalize to (B,bins); expand if batch collapsed
                     if w_logits.dim() == 3: w_logits = w_logits.squeeze(1)
                     if h_logits.dim() == 3: h_logits = h_logits.squeeze(1)
                     if w_logits.dim() == 1: w_logits = w_logits.unsqueeze(0)
                     if h_logits.dim() == 1: h_logits = h_logits.unsqueeze(0)
+                    if w_logits.size(0) == 1 and B > 1: w_logits = w_logits.expand(B, -1)
+                    if h_logits.size(0) == 1 and B > 1: h_logits = h_logits.expand(B, -1)
                     if use_soft_argmax:
                         w_probs = torch.softmax(w_logits, dim=-1)
                         h_probs = torch.softmax(h_logits, dim=-1)
+                        bins_w = torch.arange(w_probs.size(-1), device=device, dtype=torch.float32)
+                        bins_h = torch.arange(h_probs.size(-1), device=device, dtype=torch.float32)
+                        w_bin = (w_probs * bins_w).sum(dim=-1)        # (B,)
+                        h_bin = (h_probs * bins_h).sum(dim=-1)        # (B,)
                     else:
                         w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                         h_bin = h_logits.argmax(dim=-1).to(torch.float32)
                     w = torch.pow(2.0, (w_bin / w_den) * 10.0 - 10.0)
                     h = torch.pow(2.0, (h_bin / h_den) * 10.0 - 10.0)
+                    # enforce (B,)
+                    w = _ensure_b(w); h = _ensure_b(h)
+                    size_in = torch.stack([w, h], dim=1).to(dtype=hidden.dtype)  # (B,2)
+                    size_emb = encode_size(size_in, self.region).unsqueeze(1)    # (B,1,C)
                     # record boxes only for alive rows
                     for i in range(B):
                     logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
                     pos_ids[alive, 0] += 1
                     pos += 1
+                    next_tok = logits.argmax(dim=-1).squeeze(-1)       # (B,)
                 else:
                     for i in range(B):
                         if alive[i]:
+                            out[i].append({"x": float(x_center[i]), "y": float(y_center[i])})
                     mask[alive, :, :, pos] = True
                     logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
                     pos_ids[alive, 0] += 1
     def detect_multi(self, image, objects, settings=None):
         if self.config.tokenizer.templates["detect"] is None:
             raise NotImplementedError("Model does not support object detection.")
         settings = settings or {}
                 d["label"] = lab
             res[lab] = lst
+        # IMPORTANT: restore caches to B=1 so future calls are safe
         self._reset_kv_caches(1)
         return {"objects": res}