HV-Khurdula committed on
Commit
15e66ee
·
verified ·
1 Parent(s): 17e2272

Update moondream.py

Browse files

feat: make prefill granular.

Files changed (1) hide show
  1. moondream.py +28 -22
moondream.py CHANGED
@@ -943,13 +943,19 @@ class MoondreamModel(nn.Module):
943
  b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
944
 
945
 
946
- def _prefill_prompt_batched(self, labels, pos: int, lora=None,
947
- temperature: float = 0.0, top_p: float = 0.0):
 
 
 
 
 
 
948
  tpl = self.config.tokenizer.templates["detect"]
949
  if tpl is None:
950
  raise NotImplementedError("Model does not support object detection.")
951
 
952
- # Build each row's token ids (variable length)
953
  rows_ids, lens = [], []
954
  for lab in labels:
955
  ids = tpl["prefix"] + self.tokenizer.encode(" " + lab).ids + tpl["suffix"]
@@ -957,44 +963,44 @@ class MoondreamModel(nn.Module):
957
  rows_ids.append(t)
958
  lens.append(t.numel())
959
 
960
- B, T = len(rows_ids), max(lens)
 
961
 
962
- # Embed each row, then LEFT-pad using its own first token embedding (neutral),
963
- # mirroring upstream moondream2 batching strategy.
964
- embs = [text_encoder(t.unsqueeze(0), self.text)[0] for t in rows_ids] # list of (Li, C)
965
  padded = []
966
  for e, L in zip(embs, lens):
967
  pad = T - L
968
  if pad > 0:
969
- e = torch.cat([e[:1].repeat(pad, 1), e], dim=0) # (T, C)
970
  padded.append(e)
971
- prompt_emb = torch.stack(padded, dim=0) # (B, T, C)
972
  torch._dynamo.mark_dynamic(prompt_emb, 1)
973
 
974
- # Standard prefill over the shared image prefix [pos : pos+T]
975
- base = self.attn_mask[:, :, pos:pos+T, :] # (1,1,T,K)
976
- mask = base.expand(B, -1, -1, -1).contiguous() # (B,1,T,K)
977
  pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long) # (T,)
 
 
978
 
979
- hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora) # (B,T,C)
980
- logits_BTV = lm_head(hidden_BTC, self.text) # (B,T,V)
981
-
982
- # Take the last real token of each row
983
- idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0) # (B,)
984
- last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :] # (B,1,C)
985
- last_logits = logits_BTV[torch.arange(B, device=self.device), idx] # (B,V)
986
 
987
  if temperature == 0.0:
988
- next_token = last_logits.argmax(dim=-1, keepdim=True) # (B,1)
989
  else:
990
  probs = torch.softmax(last_logits / temperature, dim=-1)
991
  probs = self._apply_top_p(probs, top_p)
992
- next_token = torch.multinomial(probs, num_samples=1) # (B,1)
993
 
994
- pos_end = int(pos + T)
995
  return last_hidden, next_token, pos_end
996
 
997
 
 
998
  def _generate_points_batched(
999
  self,
1000
  hidden, # (B,1,C) - last token hidden state per row
 
943
  b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
944
 
945
 
946
+ def _prefill_prompt_batched(
947
+ self,
948
+ labels: List[str],
949
+ pos: int,
950
+ lora=None,
951
+ temperature: float = 0.0,
952
+ top_p: float = 0.0,
953
+ ):
954
  tpl = self.config.tokenizer.templates["detect"]
955
  if tpl is None:
956
  raise NotImplementedError("Model does not support object detection.")
957
 
958
+ # 1) Build token ids for each label (variable length)
959
  rows_ids, lens = [], []
960
  for lab in labels:
961
  ids = tpl["prefix"] + self.tokenizer.encode(" " + lab).ids + tpl["suffix"]
 
963
  rows_ids.append(t)
964
  lens.append(t.numel())
965
 
966
+ B = len(rows_ids)
967
+ T = max(lens)
968
 
969
+ # 2) Embed then LEFT-pad each row to length T using the row’s first token embedding
970
+ embs = [text_encoder(t.unsqueeze(0), self.text)[0] for t in rows_ids] # list[(Li, C)]
 
971
  padded = []
972
  for e, L in zip(embs, lens):
973
  pad = T - L
974
  if pad > 0:
975
+ e = torch.cat([e[:1].repeat(pad, 1), e], dim=0) # (T, C)
976
  padded.append(e)
977
+ prompt_emb = torch.stack(padded, dim=0) # (B, T, C)
978
  torch._dynamo.mark_dynamic(prompt_emb, 1)
979
 
980
+ # 3) Prefill over the shared image prefix [pos : pos + T)
981
+ base = self.attn_mask[:, :, pos:pos + T, :] # (1, 1, T, K)
982
+ mask = base.expand(B, -1, -1, -1).contiguous() # (B, 1, T, K)
983
  pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long) # (T,)
984
+ hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora) # (B, T, C)
985
+ logits_BTV = lm_head(hidden_BTC, self.text) # (B, T, V)
986
 
987
+ # **FIX**: After left-padding, the last real token sits at T-1 for every row.
988
+ last_idx = torch.full((B,), T - 1, device=self.device, dtype=torch.long) # (B,)
989
+ last_hidden = hidden_BTC[torch.arange(B, device=self.device), last_idx][:, None, :] # (B, 1, C)
990
+ last_logits = logits_BTV[torch.arange(B, device=self.device), last_idx] # (B, V)
 
 
 
991
 
992
  if temperature == 0.0:
993
+ next_token = last_logits.argmax(dim=-1, keepdim=True) # (B, 1)
994
  else:
995
  probs = torch.softmax(last_logits / temperature, dim=-1)
996
  probs = self._apply_top_p(probs, top_p)
997
+ next_token = torch.multinomial(probs, num_samples=1) # (B, 1)
998
 
999
+ pos_end = int(pos + T)
1000
  return last_hidden, next_token, pos_end
1001
 
1002
 
1003
+
1004
  def _generate_points_batched(
1005
  self,
1006
  hidden, # (B,1,C) - last token hidden state per row