JigsawStack
/

moondream2-batched

Image-Text-to-Text

Model card · Files and versions

HV-Khurdula committed on Sep 24, 2025

Commit

b5aefdb

·

verified ·

1 Parent(s): 01b09b7

Update moondream.py

Files changed (1)

moondream.py +5 -2

moondream.py CHANGED Viewed

@@ -608,9 +608,12 @@ class MoondreamModel(nn.Module):
                 torch.tensor([[self.config.tokenizer.bos_id]], device=self.device), self.text
             )
             inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
-            mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
-            pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long)
             self._prefill(inputs_embeds, mask, pos_ids, lora)
         return EncodedImage(
             pos=inputs_embeds.size(1),

                 torch.tensor([[self.config.tokenizer.bos_id]], device=self.device), self.text
             )
             inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
+            attn = self.attn_mask          # (1,1,Tmax,Tmax)
+            mask = attn[:, :, pos:pos+T, :].expand(B, -1, -1, -1).contiguous()  # (B,1,T,K)
+            pos_ids = torch.arange(pos, pos+T, device=self.device, dtype=torch.long)
             self._prefill(inputs_embeds, mask, pos_ids, lora)
         return EncodedImage(
             pos=inputs_embeds.size(1),