JigsawStack
/

moondream2-batched

@@ -976,7 +976,7 @@ class MoondreamModel(nn.Module):
     def _generate_points_batched(
         self,
         hidden,              # (B,1,C)
-        next_token,          # (B,1)
         pos,                 # int or Tensor; normalized below
         include_size: bool = True,
         max_objects: int = 50,
@@ -989,10 +989,11 @@ class MoondreamModel(nn.Module):
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context
         if torch.is_tensor(pos):
             pos = int(pos.max().item())
-        # SDPA mask and position ids
         mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
         if pos > 0:
             mask[:, :, :, :pos] = True
@@ -1001,36 +1002,44 @@ class MoondreamModel(nn.Module):
         alive = torch.ones(B, dtype=torch.bool, device=device)
         counts = torch.zeros(B, dtype=torch.int32, device=device)
-        def _argmax01(logits):
-            # logits: (B, bins) -> normalized index in [0,1]
             if use_soft_argmax:
                 probs = torch.softmax(logits, dim=-1)
-                bins = torch.arange(probs.size(-1), device=logits.device, dtype=torch.float32)
-                idx = (probs * bins).sum(dim=-1) / (probs.size(-1) - 1)
-                return idx
             else:
                 idx = logits.argmax(dim=-1).to(torch.float32)
                 return idx / float(logits.size(-1) - 1)
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
-                # x
-                x_logits = decode_coordinate(hidden, self.region)
-                if x_logits.dim() == 3: x_logits = x_logits.squeeze(1)
-                x_center = _argmax01(x_logits)
-                x_in  = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)
-                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1)
                 mask[alive, :, :, pos] = True
                 logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
                 pos_ids[alive, 0] += 1
                 pos += 1
-                # y
-                y_logits = decode_coordinate(hidden, self.region)
-                if y_logits.dim() == 3: y_logits = y_logits.squeeze(1)
-                y_center = _argmax01(y_logits)
-                y_in  = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)
                 y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
                 mask[alive, :, :, pos] = True
@@ -1039,27 +1048,40 @@ class MoondreamModel(nn.Module):
                 pos += 1
                 if include_size:
-                    # size
-                    size_logits = decode_size(hidden, self.region)
-                    w_logits = size_logits[0].squeeze(1)
-                    h_logits = size_logits[1].squeeze(1)
                     if use_soft_argmax:
-                        w_bin = (torch.softmax(w_logits, dim=-1) *
-                                 torch.arange(w_logits.size(-1), device=device)).sum(dim=-1)
-                        h_bin = (torch.softmax(h_logits, dim=-1) *
-                                 torch.arange(h_logits.size(-1), device=device)).sum(dim=-1)
                     else:
                         w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                         h_bin = h_logits.argmax(dim=-1).to(torch.float32)
-                    w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
-                    h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
-                    size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)
-                    size_emb = encode_size(size_in, self.region).unsqueeze(1)
                     for i in range(B):
-                        if not alive[i]: continue
                         xl = (x_center[i] - w[i] / 2).item()
                         xr = (x_center[i] + w[i] / 2).item()
                         yt = (y_center[i] - h[i] / 2).item()
@@ -1075,7 +1097,7 @@ class MoondreamModel(nn.Module):
                     logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
                     pos_ids[alive, 0] += 1
                     pos += 1
-                    next_tok = logits.argmax(dim=-1).squeeze(-1)
                 else:
                     for i in range(B):
                         if alive[i]:
@@ -1086,6 +1108,7 @@ class MoondreamModel(nn.Module):
                     pos += 1
                     next_tok = logits.argmax(dim=-1).squeeze(-1)
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
                 counts = counts + ((~finished_now) & alive).to(counts.dtype)
                 alive &= ~finished_now

     def _generate_points_batched(
         self,
         hidden,              # (B,1,C)
+        next_token,          # (B,1)  (kept for API compatibility)
         pos,                 # int or Tensor; normalized below
         include_size: bool = True,
         max_objects: int = 50,
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context
+        # Normalize pos to a scalar int (supports int, (1,), (B,), (B,1))
         if torch.is_tensor(pos):
             pos = int(pos.max().item())
+        # 4-D mask: (B, 1, q_len=1, kv_len) + per-row position ids (B,1)
         mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
         if pos > 0:
             mask[:, :, :, :pos] = True
         alive = torch.ones(B, dtype=torch.bool, device=device)
         counts = torch.zeros(B, dtype=torch.int32, device=device)
+        def _argmax01(logits: torch.Tensor) -> torch.Tensor:
+            """
+            logits: (..., bins) -> normalized index in [0,1] per row
+            Accepts (B,1,bins), (B,bins), or (bins,).
+            """
+            # Canonicalize to (B, bins)
+            if logits.dim() == 3:  # (B,1,bins)
+                logits = logits.squeeze(1)
+            elif logits.dim() == 1:  # (bins,) -> (1,bins)
+                logits = logits.unsqueeze(0)
             if use_soft_argmax:
                 probs = torch.softmax(logits, dim=-1)
+                bins_idx = torch.arange(probs.size(-1), device=probs.device, dtype=torch.float32)
+                # expected-bin (0..bins-1) -> normalize by (bins-1) to [0,1]
+                expbin = (probs * bins_idx).sum(dim=-1)
+                return expbin / float(probs.size(-1) - 1)
             else:
                 idx = logits.argmax(dim=-1).to(torch.float32)
                 return idx / float(logits.size(-1) - 1)
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
+                # ---- x ------------------------------------------------------
+                x_logits = decode_coordinate(hidden, self.region)         # (B,1,b) or (B,b)
+                x_center = _argmax01(x_logits)                            # (B,)
+                x_in  = x_center.to(dtype=x_logits.dtype if torch.is_tensor(x_logits) else hidden.dtype).unsqueeze(-1)
+                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1) # (B,1,C)
                 mask[alive, :, :, pos] = True
                 logits, hidden = self._decode_one_tok(x_emb, mask, pos_ids, lora)
                 pos_ids[alive, 0] += 1
                 pos += 1
+                # ---- y ------------------------------------------------------
+                y_logits = decode_coordinate(hidden, self.region)         # (B,1,b) or (B,b)
+                y_center = _argmax01(y_logits)                            # (B,)
+                y_in  = y_center.to(dtype=y_logits.dtype if torch.is_tensor(y_logits) else hidden.dtype).unsqueeze(-1)
                 y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
                 mask[alive, :, :, pos] = True
                 pos += 1
                 if include_size:
+                    # ---- size ----------------------------------------------
+                    size_logits = decode_size(hidden, self.region)         # tuple of (w_logits, h_logits)
+                    w_logits, h_logits = size_logits
+                    # Canonicalize to (B, bins) for both
+                    if w_logits.dim() == 3: w_logits = w_logits.squeeze(1)
+                    if h_logits.dim() == 3: h_logits = h_logits.squeeze(1)
+                    if w_logits.dim() == 1: w_logits = w_logits.unsqueeze(0)
+                    if h_logits.dim() == 1: h_logits = h_logits.unsqueeze(0)
                     if use_soft_argmax:
+                        w_probs = torch.softmax(w_logits, dim=-1)
+                        h_probs = torch.softmax(h_logits, dim=-1)
+                        w_bins_idx = torch.arange(w_probs.size(-1), device=device, dtype=torch.float32)
+                        h_bins_idx = torch.arange(h_probs.size(-1), device=device, dtype=torch.float32)
+                        w_bin = (w_probs * w_bins_idx).sum(dim=-1)         # (B,)
+                        h_bin = (h_probs * h_bins_idx).sum(dim=-1)         # (B,)
                     else:
                         w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                         h_bin = h_logits.argmax(dim=-1).to(torch.float32)
+                    # bins -> size (inverse log scale), robust to bins != 1024
+                    w_den = float(w_logits.size(-1) - 1)
+                    h_den = float(h_logits.size(-1) - 1)
+                    w = torch.pow(2.0, (w_bin / w_den) * 10.0 - 10.0)
+                    h = torch.pow(2.0, (h_bin / h_den) * 10.0 - 10.0)
+                    size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B,2)
+                    size_emb = encode_size(size_in, self.region).unsqueeze(1)      # (B,1,C)
+                    # record boxes only for alive rows
                     for i in range(B):
+                        if not alive[i]:
+                            continue
                         xl = (x_center[i] - w[i] / 2).item()
                         xr = (x_center[i] + w[i] / 2).item()
                         yt = (y_center[i] - h[i] / 2).item()
                     logits, hidden = self._decode_one_tok(size_emb, mask, pos_ids, lora)
                     pos_ids[alive, 0] += 1
                     pos += 1
+                    next_tok = logits.argmax(dim=-1).squeeze(-1)          # (B,)
                 else:
                     for i in range(B):
                         if alive[i]:
                     pos += 1
                     next_tok = logits.argmax(dim=-1).squeeze(-1)
+                # stop only rows that hit eos (or reached max objects)
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
                 counts = counts + ((~finished_now) & alive).to(counts.dtype)
                 alive &= ~finished_now