HV-Khurdula committed on
Commit
b00c890
·
verified ·
1 Parent(s): cd2accb

Update moondream.py

Browse files

fix: detection corruption

Files changed (1) hide show
  1. moondream.py +53 -173
moondream.py CHANGED
@@ -76,23 +76,21 @@ class KVCache(nn.Module):
76
  Supports:
77
  • Prefill: k,v = (B, n_kv_heads, q_len, d), pos_ids = (q_len,)
78
  • 1-step: k,v = (B, n_kv_heads, 1, d), pos_ids = (B,1) or (B,)
79
- • Legacy: k,v = (1, n_kv_heads, q_len, d), pos_ids = scalar
80
  Writes into self.k_cache/self.v_cache shaped (B, n_kv_heads, T_max, d).
81
  """
82
- kout, vout = self.k_cache, self.v_cache
83
-
84
- # Normalize pos_ids
85
  if not torch.is_tensor(pos_ids):
86
  pos_ids = torch.tensor(pos_ids, device=k.device, dtype=torch.long)
87
  else:
88
  pos_ids = pos_ids.to(device=k.device, dtype=torch.long)
89
-
90
  if k.dim() != 4 or v.dim() != 4:
91
- raise RuntimeError(f"KV update expects 4D k,v. Got k={tuple(k.shape)} v={tuple(v.shape)}")
92
-
93
  B, Hkv, q_len, D = k.shape
94
-
95
- # Ensure cache batch matches B (expand-from-1 allowed)
 
96
  if kout.size(0) != B:
97
  if kout.size(0) == 1:
98
  self.k_cache = kout.expand(B, -1, -1, -1).clone()
@@ -100,36 +98,37 @@ class KVCache(nn.Module):
100
  kout, vout = self.k_cache, self.v_cache
101
  else:
102
  raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")
103
-
104
- # Case A: PREFILL — vector of length q_len (same for all B rows)
105
  if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
106
  for i in range(B):
107
  kout[i, :, pos_ids, :] = k[i]
108
  vout[i, :, pos_ids, :] = v[i]
109
  return kout, vout
110
-
111
- # Case B: 1-STEP q_len == 1 with (B,) or (B,1) per-row positions
112
- if q_len == 1 and (pos_ids.numel() == B):
113
  pos_ids = pos_ids.view(B)
114
  for i in range(B):
115
  pi = int(pos_ids[i].item())
116
  kout[i, :, pi, :] = k[i, :, 0, :]
117
  vout[i, :, pi, :] = v[i, :, 0, :]
118
  return kout, vout
119
-
120
- # Case C: scalar + 1-step
121
- if pos_ids.dim() == 0 and q_len == 1:
122
  pi = int(pos_ids.item())
123
  kout[:, :, pi, :] = k[:, :, 0, :]
124
  vout[:, :, pi, :] = v[:, :, 0, :]
125
  return kout, vout
126
-
127
  raise RuntimeError(f"Unsupported KV update combo: k={tuple(k.shape)}, pos_ids={tuple(pos_ids.shape)}")
128
 
129
 
130
 
131
 
132
 
 
133
  class MoondreamModel(nn.Module):
134
 
135
  def __init__(
@@ -201,7 +200,6 @@ class MoondreamModel(nn.Module):
201
  if setup_caches:
202
  self._setup_caches()
203
 
204
-
205
  def _reset_kv_caches(self, batch_size: int = 1):
206
  c = self.config.text
207
  head_dim = c.dim // c.n_heads
@@ -569,13 +567,13 @@ class MoondreamModel(nn.Module):
569
  image: Union[Image.Image, EncodedImage],
570
  settings: Optional[ImageEncodingSettings] = None,
571
  ) -> EncodedImage:
572
- # Always start from single-row caches; avoids leftovers from batched runs
573
- self._setup_caches()
574
- for blk in self.text.blocks:
575
  if blk.kv_cache.k_cache.size(0) != 1:
576
  blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
577
  blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
578
-
579
  if isinstance(image, EncodedImage):
580
  return image
581
  if not isinstance(image, Image.Image):
@@ -908,6 +906,7 @@ class MoondreamModel(nn.Module):
908
 
909
 
910
 
 
911
  def _prefill_prompt_batched(self, labels, pos: int, lora=None,
912
  temperature: float = 0.0, top_p: float = 0.0):
913
  tpl = self.config.tokenizer.templates["detect"]
@@ -926,42 +925,39 @@ class MoondreamModel(nn.Module):
926
  for i, ids in enumerate(rows):
927
  prompt_ids[i, : ids.numel()] = ids
928
 
929
- prompt_emb = text_encoder(prompt_ids, self.text) # (B,T,C)
930
  torch._dynamo.mark_dynamic(prompt_emb, 1)
931
 
932
  base = self.attn_mask[:, :, pos:pos+T, :] # (1,1,T,K)
933
- mask = base.expand(B, -1, -1, -1).contiguous() # (B,1,T,K)
934
  pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long) # (T,)
935
 
936
- hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora) # (B,T,C)
937
- logits_BTV = lm_head(hidden_BTC, self.text) # (B,T,V)
938
 
939
- idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0) # (B,)
940
  last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :] # (B,1,C)
941
  last_logits = logits_BTV[torch.arange(B, device=self.device), idx] # (B,V)
942
 
943
  if temperature == 0.0:
944
- next_token = last_logits.argmax(dim=-1, keepdim=True) # (B,1)
945
  else:
946
  probs = torch.softmax(last_logits / temperature, dim=-1)
947
  probs = self._apply_top_p(probs, top_p)
948
- next_token = torch.multinomial(probs, num_samples=1) # (B,1)
949
 
950
- pos_end = int(pos + T) # shared next-slot
951
- return last_hidden, next_token, pos_end
952
-
953
-
954
 
955
 
956
  def _generate_points_batched(
957
  self,
958
  hidden, # (B,1,C)
959
- next_token, # (B,1) (kept for API compatibility)
960
- pos, # int or Tensor; normalized below
961
  include_size: bool = True,
962
  max_objects: int = 50,
963
  lora=None,
964
- use_soft_argmax: bool = True, # reduces bbox jitter
965
  ):
966
  B = hidden.size(0)
967
  device = self.device
@@ -969,156 +965,40 @@ class MoondreamModel(nn.Module):
969
  eos_id = self.config.tokenizer.eos_id
970
  max_ctx = self.config.text.max_context
971
 
972
- # Normalize pos to a scalar int
973
- if torch.is_tensor(pos):
974
- pos = int(pos.max().item())
975
-
976
- # 4-D mask: (B,1,1,K) and per-row pos ids (B,1)
977
  mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
978
  if pos > 0:
979
  mask[:, :, :, :pos] = True
980
  pos_ids = torch.full((B, 1), pos, device=device, dtype=torch.long)
981
 
982
- alive = torch.ones(B, dtype=torch.bool, device=device)
983
  counts = torch.zeros(B, dtype=torch.int32, device=device)
984
 
985
- def _argmax01(logits: torch.Tensor) -> torch.Tensor:
986
- """
987
- logits: (..., bins) -> (B,) in [0,1]
988
- Accepts (B,1,bins), (B,bins), or (bins,)
989
- """
990
- # Canonicalize to (B,bins)
991
- if logits.dim() == 3: # (B,1,bins)
992
- logits = logits.squeeze(1)
993
- elif logits.dim() == 1: # (bins,)
994
- logits = logits.unsqueeze(0)
995
- # If batch accidentally collapsed to 1, expand to B so downstream indexing is safe.
996
- if logits.size(0) == 1 and B > 1:
997
- logits = logits.expand(B, -1)
998
-
999
  if use_soft_argmax:
1000
- probs = torch.softmax(logits, dim=-1)
1001
- bins = torch.arange(probs.size(-1), device=probs.device, dtype=torch.float32)
1002
- expbin = (probs * bins).sum(dim=-1)
1003
- return expbin / float(probs.size(-1) - 1)
1004
- else:
1005
- idx = logits.argmax(dim=-1).to(torch.float32)
1006
- return idx / float(logits.size(-1) - 1)
1007
-
1008
- def _ensure_b(vec: torch.Tensor) -> torch.Tensor:
1009
- """
1010
- Make sure 1D tensors are length-B for safe indexing.
1011
- Accepts scalar/(), (1,), (B,), returns (B,)
1012
- """
1013
- if vec.dim() == 0:
1014
- return vec.repeat(B)
1015
- if vec.dim() == 1 and vec.numel() == 1 and B > 1:
1016
- return vec.repeat(B)
1017
- if vec.dim() == 1 and vec.numel() == B:
1018
- return vec
1019
- raise RuntimeError(f"Expected (B,) vec, got shape {tuple(vec.shape)} for B={B}")
1020
 
1021
  with torch.inference_mode():
1022
  while alive.any() and (counts < max_objects).any():
1023
- # ---- x ------------------------------------------------------
1024
- x_logits = decode_coordinate(hidden, self.region) # (B,1,b) or (B,b) or (b,)
1025
- x_center = _argmax01(x_logits) # (B,)
1026
- x_center = _ensure_b(x_center) # force len B
1027
- x_in = x_center.to(dtype=hidden.dtype).unsqueeze(-1) # (B,1)
1028
- x_emb = encode_coordinate(x_in, self.region).unsqueeze(1) # (B,1,C)
1029
-
1030
  mask[alive, :, :, pos] = True
1031
- logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
1032
- pos_ids[alive, 0] += 1
1033
- pos += 1
1034
 
1035
- # ---- y ------------------------------------------------------
1036
  y_logits = decode_coordinate(hidden, self.region)
1037
- y_center = _argmax01(y_logits) # (B,)
1038
- y_center = _ensure_b(y_center)
1039
- y_in = y_center.to(dtype=hidden.dtype).unsqueeze(-1)
1040
- y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
1041
-
1042
- mask[alive, :, :, pos] = True
1043
- logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
1044
- pos_ids[alive, 0] += 1
1045
- pos += 1
1046
-
1047
- if include_size:
1048
- # ---- size ----------------------------------------------
1049
- size_logits = decode_size(hidden, self.region) # tuple: (w_logits, h_logits)
1050
- w_logits, h_logits = size_logits
1051
-
1052
- # Canonicalize to (B,bins); expand if batch collapsed
1053
- if w_logits.dim() == 3: w_logits = w_logits.squeeze(1)
1054
- if h_logits.dim() == 3: h_logits = h_logits.squeeze(1)
1055
- if w_logits.dim() == 1: w_logits = w_logits.unsqueeze(0)
1056
- if h_logits.dim() == 1: h_logits = h_logits.unsqueeze(0)
1057
- if w_logits.size(0) == 1 and B > 1: w_logits = w_logits.expand(B, -1)
1058
- if h_logits.size(0) == 1 and B > 1: h_logits = h_logits.expand(B, -1)
1059
-
1060
- if use_soft_argmax:
1061
- w_probs = torch.softmax(w_logits, dim=-1)
1062
- h_probs = torch.softmax(h_logits, dim=-1)
1063
- bins_w = torch.arange(w_probs.size(-1), device=device, dtype=torch.float32)
1064
- bins_h = torch.arange(h_probs.size(-1), device=device, dtype=torch.float32)
1065
- w_bin = (w_probs * bins_w).sum(dim=-1) # (B,)
1066
- h_bin = (h_probs * bins_h).sum(dim=-1) # (B,)
1067
- else:
1068
- w_bin = w_logits.argmax(dim=-1).to(torch.float32)
1069
- h_bin = h_logits.argmax(dim=-1).to(torch.float32)
1070
-
1071
- # bins -> size (inverse log scale), robust to bins != 1024
1072
- w_den = float(w_logits.size(-1) - 1)
1073
- h_den = float(h_logits.size(-1) - 1)
1074
- w = torch.pow(2.0, (w_bin / w_den) * 10.0 - 10.0)
1075
- h = torch.pow(2.0, (h_bin / h_den) * 10.0 - 10.0)
1076
-
1077
- # enforce (B,)
1078
- w = _ensure_b(w); h = _ensure_b(h)
1079
-
1080
- size_in = torch.stack([w, h], dim=1).to(dtype=hidden.dtype) # (B,2)
1081
- size_emb = encode_size(size_in, self.region).unsqueeze(1) # (B,1,C)
1082
-
1083
- # record boxes only for alive rows
1084
- for i in range(B):
1085
- if not alive[i]:
1086
- continue
1087
- xl = (x_center[i] - w[i] / 2).item()
1088
- xr = (x_center[i] + w[i] / 2).item()
1089
- yt = (y_center[i] - h[i] / 2).item()
1090
- yb = (y_center[i] + h[i] / 2).item()
1091
- out[i].append({
1092
- "x_min": max(0.0, min(1.0, xl)),
1093
- "y_min": max(0.0, min(1.0, yt)),
1094
- "x_max": max(0.0, min(1.0, xr)),
1095
- "y_max": max(0.0, min(1.0, yb)),
1096
- })
1097
-
1098
- mask[alive, :, :, pos] = True
1099
- logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
1100
- pos_ids[alive, 0] += 1
1101
- pos += 1
1102
- next_tok = logits.argmax(dim=-1).squeeze(-1) # (B,)
1103
- else:
1104
- for i in range(B):
1105
- if alive[i]:
1106
- out[i].append({"x": float(x_center[i]), "y": float(y_center[i])})
1107
- mask[alive, :, :, pos] = True
1108
- logits, hidden = self._decode_one_tok(y_emb, mask, pos_ids, lora)
1109
- pos_ids[alive, 0] += 1
1110
- pos += 1
1111
- next_tok = logits.argmax(dim=-1).squeeze(-1)
1112
-
1113
- # stop only rows that hit eos (or reached max objects)
1114
- finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
1115
- counts = counts + ((~finished_now) & alive).to(counts.dtype)
1116
- alive &= ~finished_now
1117
-
1118
- return out
1119
-
1120
-
1121
-
1122
 
1123
 
1124
 
 
76
  Supports:
77
  • Prefill: k,v = (B, n_kv_heads, q_len, d), pos_ids = (q_len,)
78
  • 1-step: k,v = (B, n_kv_heads, 1, d), pos_ids = (B,1) or (B,)
79
+ • Legacy: k,v = (B, n_kv_heads, 1, d), pos_ids = scalar
80
  Writes into self.k_cache/self.v_cache shaped (B, n_kv_heads, T_max, d).
81
  """
 
 
 
82
  if not torch.is_tensor(pos_ids):
83
  pos_ids = torch.tensor(pos_ids, device=k.device, dtype=torch.long)
84
  else:
85
  pos_ids = pos_ids.to(device=k.device, dtype=torch.long)
86
+
87
  if k.dim() != 4 or v.dim() != 4:
88
+ raise RuntimeError(f"KV update expects k,v 4D. Got k={tuple(k.shape)} v={tuple(v.shape)}")
89
+
90
  B, Hkv, q_len, D = k.shape
91
+ kout, vout = self.k_cache, self.v_cache
92
+
93
+ # Expand caches from B=1 lazily if needed
94
  if kout.size(0) != B:
95
  if kout.size(0) == 1:
96
  self.k_cache = kout.expand(B, -1, -1, -1).clone()
 
98
  kout, vout = self.k_cache, self.v_cache
99
  else:
100
  raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")
101
+
102
+ # Case A: prefill (same positions for every row)
103
  if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
104
  for i in range(B):
105
  kout[i, :, pos_ids, :] = k[i]
106
  vout[i, :, pos_ids, :] = v[i]
107
  return kout, vout
108
+
109
+ # Case B: single step with per-row position (B,) or (B,1)
110
+ if q_len == 1 and pos_ids.numel() == B:
111
  pos_ids = pos_ids.view(B)
112
  for i in range(B):
113
  pi = int(pos_ids[i].item())
114
  kout[i, :, pi, :] = k[i, :, 0, :]
115
  vout[i, :, pi, :] = v[i, :, 0, :]
116
  return kout, vout
117
+
118
+ # Case C: scalar position for everyone
119
+ if q_len == 1 and pos_ids.dim() == 0:
120
  pi = int(pos_ids.item())
121
  kout[:, :, pi, :] = k[:, :, 0, :]
122
  vout[:, :, pi, :] = v[:, :, 0, :]
123
  return kout, vout
124
+
125
  raise RuntimeError(f"Unsupported KV update combo: k={tuple(k.shape)}, pos_ids={tuple(pos_ids.shape)}")
126
 
127
 
128
 
129
 
130
 
131
+
132
  class MoondreamModel(nn.Module):
133
 
134
  def __init__(
 
200
  if setup_caches:
201
  self._setup_caches()
202
 
 
203
  def _reset_kv_caches(self, batch_size: int = 1):
204
  c = self.config.text
205
  head_dim = c.dim // c.n_heads
 
567
  image: Union[Image.Image, EncodedImage],
568
  settings: Optional[ImageEncodingSettings] = None,
569
  ) -> EncodedImage:
570
+ # Top of encode_image(), just after type checks:
571
+ self._setup_caches() # re-create caches
572
+ for blk in self.text.blocks: # force B=1 for encode
573
  if blk.kv_cache.k_cache.size(0) != 1:
574
  blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
575
  blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
576
+
577
  if isinstance(image, EncodedImage):
578
  return image
579
  if not isinstance(image, Image.Image):
 
906
 
907
 
908
 
909
+
910
  def _prefill_prompt_batched(self, labels, pos: int, lora=None,
911
  temperature: float = 0.0, top_p: float = 0.0):
912
  tpl = self.config.tokenizer.templates["detect"]
 
925
  for i, ids in enumerate(rows):
926
  prompt_ids[i, : ids.numel()] = ids
927
 
928
+ prompt_emb = text_encoder(prompt_ids, self.text) # (B,T,C)
929
  torch._dynamo.mark_dynamic(prompt_emb, 1)
930
 
931
  base = self.attn_mask[:, :, pos:pos+T, :] # (1,1,T,K)
932
+ mask = base.expand(B, -1, -1, -1).contiguous() # (B,1,T,K)
933
  pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long) # (T,)
934
 
935
+ hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora) # (B,T,C)
936
+ logits_BTV = lm_head(hidden_BTC, self.text)
937
 
938
+ idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0) # (B,)
939
  last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :] # (B,1,C)
940
  last_logits = logits_BTV[torch.arange(B, device=self.device), idx] # (B,V)
941
 
942
  if temperature == 0.0:
943
+ next_token = last_logits.argmax(dim=-1, keepdim=True) # (B,1)
944
  else:
945
  probs = torch.softmax(last_logits / temperature, dim=-1)
946
  probs = self._apply_top_p(probs, top_p)
947
+ next_token = torch.multinomial(probs, num_samples=1) # (B,1)
948
 
949
+ return last_hidden, next_token, int(pos + T)
 
 
 
950
 
951
 
952
  def _generate_points_batched(
953
  self,
954
  hidden, # (B,1,C)
955
+ next_token, # (B,1) (not used with greedy coords; kept for API)
956
+ pos, # int, next free KV slot
957
  include_size: bool = True,
958
  max_objects: int = 50,
959
  lora=None,
960
+ use_soft_argmax: bool = True,
961
  ):
962
  B = hidden.size(0)
963
  device = self.device
 
965
  eos_id = self.config.tokenizer.eos_id
966
  max_ctx = self.config.text.max_context
967
 
968
+ # mask & position ids
 
 
 
 
969
  mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
970
  if pos > 0:
971
  mask[:, :, :, :pos] = True
972
  pos_ids = torch.full((B, 1), pos, device=device, dtype=torch.long)
973
 
974
+ alive = torch.ones(B, dtype=torch.bool, device=device)
975
  counts = torch.zeros(B, dtype=torch.int32, device=device)
976
 
977
+ def _argmax01(logits_2d):
978
+ # logits_2d: (B, bins)
 
 
 
 
 
 
 
 
 
 
 
 
979
  if use_soft_argmax:
980
+ probs = torch.softmax(logits_2d, dim=-1)
981
+ bins = torch.arange(probs.size(-1), device=logits_2d.device, dtype=torch.float32)
982
+ val = (probs * bins).sum(dim=-1) / (probs.size(-1) - 1)
983
+ return val # in [0,1]
984
+ idx = logits_2d.argmax(dim=-1).to(torch.float32)
985
+ return idx / float(logits_2d.size(-1) - 1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
986
 
987
  with torch.inference_mode():
988
  while alive.any() and (counts < max_objects).any():
989
+ # x
990
+ x_logits = decode_coordinate(hidden, self.region) # (B,1,1024) or (B,1024)
991
+ if x_logits.dim() == 3: x_logits = x_logits.squeeze(1)
992
+ x_center = _argmax01(x_logits) # (B,)
993
+ x_emb = encode_coordinate(x_center.to(dtype=x_logits.dtype).unsqueeze(-1),
994
+ self.region).unsqueeze(1) # (B,1,C)
 
995
  mask[alive, :, :, pos] = True
996
+ _, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
997
+ pos_ids[alive, 0] += 1; pos += 1
 
998
 
999
+ # y
1000
  y_logits = decode_coordinate(hidden, self.region)
1001
+ if y_logits.dim
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1002
 
1003
 
1004