Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

convert_molmo_point_to_hf.py +7 -4
model-00007-of-00008.safetensors +2 -2
model.safetensors.index.json +13 -13
modeling_molmo_point.py +199 -134
processing_molmo2.py +0 -1
video_processing_molmo2.py +2 -1

convert_molmo_point_to_hf.py CHANGED Viewed

@@ -207,7 +207,14 @@ def convert_molmo2(
             new_key = "lm_head.new_output_embeddings"
         elif key == "transformer.ff_out.weight":
             new_key = "lm_head.output_embeddings"
         else:
             new_key = f"{base_model_prefix}.{key}"
         new_state_dict[new_key] = val
@@ -215,10 +222,6 @@ def convert_molmo2(
     qkv_bias = config.qkv_bias if isinstance(config, Molmo2TextConfig) else config.text_config.qkv_bias
     use_qk_norm = config.use_qk_norm if isinstance(config, Molmo2TextConfig) else config.text_config.use_qk_norm
-    for param in list(new_state_dict.keys()):
-        if param.startswith(f"model.connectors.0"):
-            new_state_dict[param.replace("model.connectors.0", "model.connector")] = new_state_dict.pop(param)
     for layer_i in range(config.num_hidden_layers):
         prefix = f"{model_prefix}.blocks.{layer_i}"

             new_key = "lm_head.new_output_embeddings"
         elif key == "transformer.ff_out.weight":
             new_key = "lm_head.output_embeddings"
+        elif key.split(".")[0] in [
+            "subpatch_k", "subpatch_q", "patch_k", "patch_q", "add_no_point_class_embed",
+            "subpatch_loc_k", "x_norm"
+        ]:
+            new_key = f"{base_model_prefix}.point_predictor.{key}"
         else:
+            if key.startswith(f"connectors.0"):
+                key = key.replace("connectors.0", "connector")
             new_key = f"{base_model_prefix}.{key}"
         new_state_dict[new_key] = val
     qkv_bias = config.qkv_bias if isinstance(config, Molmo2TextConfig) else config.text_config.qkv_bias
     use_qk_norm = config.use_qk_norm if isinstance(config, Molmo2TextConfig) else config.text_config.use_qk_norm
     for layer_i in range(config.num_hidden_layers):
         prefix = f"{model_prefix}.blocks.{layer_i}"

model-00007-of-00008.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:30485f5c86cefbfccb5eef21be09adf1149e4ec6f9cc75072c162900e6972226
-size 4091924852

 version https://git-lfs.github.com/spec/v1
+oid sha256:b0af4940b1b827e995397de33114310c838b8edece415819de119d4f403d420f
+size 4091925052

model.safetensors.index.json CHANGED Viewed

@@ -6,7 +6,6 @@
   "weight_map": {
     "lm_head.new_output_embeddings": "model-00008-of-00008.safetensors",
     "lm_head.output_embeddings": "model-00008-of-00008.safetensors",
-    "model.add_no_point_class_embed.vector": "model-00007-of-00008.safetensors",
     "model.build_vit_embedding.bias": "model-00007-of-00008.safetensors",
     "model.build_vit_embedding.weight": "model-00007-of-00008.safetensors",
     "model.connector.image_pooling_2d.wk.bias": "model-00007-of-00008.safetensors",
@@ -18,16 +17,18 @@
     "model.connector.image_projector.w1.weight": "model-00007-of-00008.safetensors",
     "model.connector.image_projector.w2.weight": "model-00007-of-00008.safetensors",
     "model.connector.image_projector.w3.weight": "model-00007-of-00008.safetensors",
-    "model.patch_k.bias": "model-00007-of-00008.safetensors",
-    "model.patch_k.weight": "model-00007-of-00008.safetensors",
-    "model.patch_q.bias": "model-00007-of-00008.safetensors",
-    "model.patch_q.weight": "model-00007-of-00008.safetensors",
-    "model.subpatch_k.bias": "model-00007-of-00008.safetensors",
-    "model.subpatch_k.weight": "model-00007-of-00008.safetensors",
-    "model.subpatch_loc_k.bias": "model-00007-of-00008.safetensors",
-    "model.subpatch_loc_k.weight": "model-00007-of-00008.safetensors",
-    "model.subpatch_q.bias": "model-00007-of-00008.safetensors",
-    "model.subpatch_q.weight": "model-00007-of-00008.safetensors",
     "model.transformer.blocks.0.attn_norm.weight": "model-00001-of-00008.safetensors",
     "model.transformer.blocks.0.ff_norm.weight": "model-00001-of-00008.safetensors",
     "model.transformer.blocks.0.mlp.ff_out.weight": "model-00001-of-00008.safetensors",
@@ -721,7 +722,6 @@
     "model.vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
     "model.vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
     "model.vit.transformer.resblocks.9.ffn_norm.bias": "model-00007-of-00008.safetensors",
-    "model.vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00008.safetensors",
-    "model.x_norm.weight": "model-00007-of-00008.safetensors"
   }
 }

   "weight_map": {
     "lm_head.new_output_embeddings": "model-00008-of-00008.safetensors",
     "lm_head.output_embeddings": "model-00008-of-00008.safetensors",
     "model.build_vit_embedding.bias": "model-00007-of-00008.safetensors",
     "model.build_vit_embedding.weight": "model-00007-of-00008.safetensors",
     "model.connector.image_pooling_2d.wk.bias": "model-00007-of-00008.safetensors",
     "model.connector.image_projector.w1.weight": "model-00007-of-00008.safetensors",
     "model.connector.image_projector.w2.weight": "model-00007-of-00008.safetensors",
     "model.connector.image_projector.w3.weight": "model-00007-of-00008.safetensors",
+    "model.point_predictor.add_no_point_class_embed.vector": "model-00007-of-00008.safetensors",
+    "model.point_predictor.patch_k.bias": "model-00007-of-00008.safetensors",
+    "model.point_predictor.patch_k.weight": "model-00007-of-00008.safetensors",
+    "model.point_predictor.patch_q.bias": "model-00007-of-00008.safetensors",
+    "model.point_predictor.patch_q.weight": "model-00007-of-00008.safetensors",
+    "model.point_predictor.subpatch_k.bias": "model-00007-of-00008.safetensors",
+    "model.point_predictor.subpatch_k.weight": "model-00007-of-00008.safetensors",
+    "model.point_predictor.subpatch_loc_k.bias": "model-00007-of-00008.safetensors",
+    "model.point_predictor.subpatch_loc_k.weight": "model-00007-of-00008.safetensors",
+    "model.point_predictor.subpatch_q.bias": "model-00007-of-00008.safetensors",
+    "model.point_predictor.subpatch_q.weight": "model-00007-of-00008.safetensors",
+    "model.point_predictor.x_norm.weight": "model-00007-of-00008.safetensors",
     "model.transformer.blocks.0.attn_norm.weight": "model-00001-of-00008.safetensors",
     "model.transformer.blocks.0.ff_norm.weight": "model-00001-of-00008.safetensors",
     "model.transformer.blocks.0.mlp.ff_out.weight": "model-00001-of-00008.safetensors",
     "model.vit.transformer.resblocks.9.feed_forward.w2.bias": "model-00007-of-00008.safetensors",
     "model.vit.transformer.resblocks.9.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
     "model.vit.transformer.resblocks.9.ffn_norm.bias": "model-00007-of-00008.safetensors",
+    "model.vit.transformer.resblocks.9.ffn_norm.weight": "model-00007-of-00008.safetensors"
   }
 }

modeling_molmo_point.py CHANGED Viewed

@@ -9,7 +9,7 @@ import torch
 from torch import nn
 from torch.nn import functional as F
-from transformers import LogitsProcessorList, LogitsProcessor, AutoProcessor
 from transformers.image_utils import PILImageResampling
 from transformers.models.auto import AutoModelForImageTextToText
@@ -347,6 +347,150 @@ class ViTMultiHeadDotProductAttention(nn.Module):
         return attn_output
 class MolmoPointPreTrainedModel(PreTrainedModel):
     config: MolmoPointConfig
     base_model_prefix = "model"
@@ -356,6 +500,7 @@ class MolmoPointPreTrainedModel(PreTrainedModel):
         "Molmo2PostNormDecoderLayer",
         "Molmo2VisionBlock",
         "ViTMultiHeadDotProductAttention",
     ]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn = True
@@ -412,6 +557,7 @@ class GeneratedTokenBounds:
 class MolmoPointLogitProcessor(LogitsProcessor):
     def __init__(self, bounds: GeneratedTokenBounds,
                  prevent_repeats, force_patch_sorted, force_subpatch_sorted):
@@ -868,6 +1014,9 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
     def __init__(self, config: MolmoPointConfig):
         super().__init__(config)
         self.transformer: MolmoPointTextModel = MolmoPointTextModel(config.text_config)
         vit_config = config.vit_config
         adapter_config = config.adapter_config
@@ -886,45 +1035,14 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
         else:
             self.vit = Molmo2VisionTransformer(vit_config)
-        if self.config.token_prediction_rotary == "none":
-            self.patch_rotary = None
-        else:
-            theta = self.config.token_prediction_rotary_theta or self.config.llm.rope_theta
-            if self.config.token_prediction_rotary == "one_d":
-                self.patch_rotary = MolmoPointPatchRope(theta, self.config.patch_embed_dim)
-            else:
-                raise NotImplementedError()
         self.connector = MolmoPointConnector(adapter_config, vit_config)
-        vit_dim = self.config.vit_config.hidden_size * len(self.config.adapter_config.vit_layers)
-        llm_dim = self.config.text_config.hidden_size
-        self.patch_q = nn.Linear(llm_dim, config.patch_embed_dim)
-        self.patch_k = nn.Linear(llm_dim, config.patch_embed_dim)
-        self.subpatch_q = nn.Linear(llm_dim, config.patch_embed_dim)
-        self.subpatch_k = nn.Linear(vit_dim, config.patch_embed_dim)
-        self.add_no_point_class_embed = MolmoPointPadWithLearnedVector(config.patch_embed_dim)
-        self.patch_token_id = self.config.patch_token_id
-        self.subpatch_token_id = self.config.subpatch_token_id
-        self.location_token_id = self.config.location_token_id
         if self.config.embed_selected_vit_patch == "linear":
             self.build_vit_embedding = nn.Linear(vit_dim, llm_dim, bias=True)
         else:
             raise NotImplementedError(f"Embedding {self.config.embed_selected_vit_patch} not implemented")
-        if self.config.patch_location == "3x3":
-            self.subpatch_loc_k = nn.Linear(llm_dim, 9)
-        elif self.config.patch_location is None:
-            self.subpatch_loc_k = None
-        else:
-            raise NotImplementedError(f"Patch location {self.config.patch_location} not implemented")
-        if self.config.layer_norm_x:
-            self.x_norm = Molmo2RMSNorm(llm_dim, eps=self.config.text_config.layer_norm_eps)
-        else:
-            self.x_norm = None
         # Initialize weights and apply final processing
         self.post_init()
@@ -1291,6 +1409,8 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
         input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
         if image_data is not None:
             can_point = True
             bounds = self.build_token_bounds(image_data.token_pooling)
             expanded_inputs = input_ids
@@ -1304,7 +1424,11 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
             input_ids = torch.where(is_subpatch, self.subpatch_token_id, input_ids)
             input_ids = torch.where(is_location, self.location_token_id, input_ids)
         else:
             input_patch_ids = None
             can_point = False
         device = input_ids.device
@@ -1312,7 +1436,6 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
         batch_size, _, dim = x.shape
         batch_idx = torch.arange(batch_size, device=device)
-        # TODO update embeddings for patch/subpatch tokens
         vit_features_flat: Optional[torch.FloatTensor] = None
         if images is not None:
             is_indexable_image_token = input_ids == self.config.image_patch_id
@@ -1342,27 +1465,23 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
             vit_features_flat = vit_features_flat[image_features_mask.view(-1)]
             vit_features_to_flat_mask = vit_features_mask.view(-1, token_pooling.shape[-1])[image_features_mask.view(-1)]
-            # Finally apply the connector and add to input embeddings
             image_features = self.connector(vit_features_flat, vit_features_to_flat_mask).to(device=device)
             x = x.clone()
             x.view(-1, dim)[is_image_token.view(-1)] += image_features.view(-1, dim)
-            # Build position ids for the image features, which we might need for rotary
-            # embeddings
-            image_token_indices = torch.cumsum(is_indexable_image_token, dim=-1) - 1
-            image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
-            image_pos_ids = torch.zeros([batch_size, token_pooling.shape[1]], dtype=torch.long, device=device)
-            image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat
-            max_image_pos_id = image_pos_ids_flat.max() + 1
-        elif image_data is not None:
-            token_pooling = image_data.token_pooling
-            image_pos_ids = image_data.image_pos_ids
-            vit_features = image_data.vit_features
-            vit_features_mask = token_pooling >= 0
-            image_features_mask = torch.any(vit_features_mask, -1)
-            vit_features_flat = vit_features.reshape([-1, token_pooling.shape[-1], vit_features.shape[-1]])
-            vit_features_flat = vit_features_flat[image_features_mask.view(-1)]
-            vit_features_to_flat_mask = vit_features_mask.view(-1, token_pooling.shape[-1])[image_features_mask.view(-1)]
         # Embed the points
         if can_point:
@@ -1372,6 +1491,9 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
             x.view(-1, dim)[is_patch.view(-1)] += image_data.image_features0.view(-1, dim)[input_patch_ids_flat]
             if torch.any(is_subpatch):
                 assert last_predicted_patch_id is not None, "Patch should always be generated before a subpatch"
                 for_patches = (last_predicted_patch_id.view(batch_size) + image_token_offset)[input_subpatch_ids.view(batch_size) >= 0]
                 vit_features_to_embed = vit_features_flat[for_patches, input_subpatch_ids]
@@ -1386,7 +1508,7 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
                 past_seen_tokens,
                 past_seen_tokens + inputs_embeds.shape[1],
                 device=inputs_embeds.device,
-                )
         # NOTE: this `is_prefill` logic is not flawless, it fails when we're using a cache eagerly initialized
         # (e.g. compiled prefill) AND `images` are not provided. Determining prefill in that case requires
@@ -1438,84 +1560,27 @@ class MolmoPointModel(MolmoPointPreTrainedModel):
         location_logits = None
         if images is not None or image_data is not None:
-            if self.x_norm:
-                x_norm = self.x_norm(x).to(device=device)
-            elif self.config.norm_x:
-                x_norm = x / math.sqrt(dim)
-            else:
-                x_norm = x
-            # Build the keys, or get them from the cache
-            if image_data is not None:
-                patch_k, subpatch_k = image_data.patch_k, image_data.subpatch_k
-                patch_k_mask = image_data.patch_k_mask
-            else:
-                patch_k_flat = self.patch_k(x_norm.view(-1, dim)[is_image_token.view(-1)])
-                if self.patch_rotary is not None:
-                    patch_k_flat = self.patch_rotary(patch_k_flat, image_pos_ids_flat)
-                patch_k_flat = patch_k_flat.to(device=device)
-                patch_k = torch.zeros([batch_size, image_features_mask.shape[1], patch_k_flat.shape[-1]], dtype=x.dtype, device=device)
-                patch_k.view(-1, patch_k_flat.shape[-1])[image_features_mask.flatten()] = patch_k_flat.to(dtype=x.dtype)
-                patch_k_mask = image_features_mask.clone()
-                patch_k_mask.view(-1)[image_features_mask.view(-1)] = (
-                    is_indexable_image_token.view(-1)[is_image_token.view(-1)])
-                if self.config.no_more_points_class:
-                    patch_k = self.add_no_point_class_embed(patch_k).to(device=device)
-                    patch_k_mask = F.pad(patch_k_mask, (0, 1), value=True)
-                subpatch_k = self.subpatch_k(vit_features).to(device=device)
-            # Predict patch locations
-            if can_point:
-                image_q = self.patch_q(x_norm).to(device=device)
-                if self.patch_rotary is not None and last_predicted_patch_id is not None:
-                    rotate_by = image_pos_ids[batch_idx, last_predicted_patch_id]
-                    rotate_by = torch.where(last_predicted_patch_id >= 0, rotate_by, 0)
-                    rotate_by = rotate_by.squeeze(-1)
-                    image_q = self.patch_rotary(
-                        image_q.view(-1, image_q.shape[-1]),
-                        torch.clamp(rotate_by, min=0),
-                    ).reshape(batch_size, -1, image_q.shape[-1]).to(device=device)
-                dots = torch.matmul(image_q, patch_k.transpose(1, 2))  # [batch, 1, num_images]
-                if self.config.norm_logits:
-                    dots = dots / math.sqrt(dots.shape[-1])
-                valid = patch_k_mask[:, None, :]
-                patch_logits = torch.where(valid, dots, -100000000)
-            if can_point and torch.any(is_patch):
-                if x_norm.shape[1] != 1:
-                    raise NotImplementedError()
-                subpatch_point_q = self.subpatch_q(x_norm.squeeze(1)).to(device=device)
-                subpatch_k = subpatch_k[batch_idx, input_patch_ids.squeeze(1)]
-                subpatch_logits = torch.einsum("pd,pcd->pc", subpatch_point_q, subpatch_k)
-                if self.config.norm_logits:
-                    subpatch_logits = subpatch_logits / math.sqrt(patch_k.shape[-1])
-                subpatch_mask = vit_features_mask[batch_idx, input_patch_ids.squeeze(1)]
-                subpatch_logits = torch.where(subpatch_mask, subpatch_logits, -100000)
-                subpatch_logits = subpatch_logits[:, None, :]
-            if can_point and torch.any(is_subpatch):
-                location_logits = self.subpatch_loc_k(x).to(device=device)
-            if is_prefill:
                 num_image_tokens = is_image_token.sum(-1)
                 image_token_offset = torch.cumsum(num_image_tokens[:-1], 0)
                 image_token_offset = F.pad(image_token_offset, [1, 0])
-                # Return the cache for image keys/features
-                image_data = ImageCache(
-                    patch_k=patch_k,
-                    subpatch_k=subpatch_k,
-                    vit_features=vit_features,
-                    patch_k_mask=patch_k_mask,
-                    token_pooling=token_pooling,
-                    image_pos_ids=image_pos_ids,
-                    image_features0=image_features,
-                    flat_image_tokens_to_flat_image_features=image_token_offset
-                )
         if last_predicted_patch_id is not None:
             last_predicted_patch_id = torch.where(input_patch_ids == -1, last_predicted_patch_id, input_patch_ids)
@@ -1713,9 +1778,9 @@ class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMi
         patch_token_logits = torch.clone(logits[:, :, self.config.patch_token_id])
         logits[:, :, self.config.patch_token_id] = small_val
         predicted_patch = predicted_tokens == self.config.patch_token_id
-        argmax_patch_logits = torch.full([bs, seq, n_patches], small_val, dtype=logits.dtype, device=logits.device)
         if outputs.patch_logits is not None:
-            selected_patches = torch.argmax(outputs.patch_logits, -1)
             bs, seq, n_patches = outputs.patch_logits.shape
             batch_idx = torch.arange(outputs.patch_logits.shape[0], device=device)
             seq_ix = torch.arange(outputs.patch_logits.shape[1], device=device)
@@ -1725,13 +1790,13 @@ class MolmoPointForConditionalGeneration(MolmoPointPreTrainedModel, GenerationMi
         if outputs.subpatch_logits is not None:
             subpatch_logits = outputs.subpatch_logits
         else:
-            subpatch_logits = torch.full([bs, seq, n_subpatches], small_val, dtype=logits.dtype, device=logits.device)
         logits[:, :, self.config.location_token_id] = small_val
         if outputs.location_logits is not None:
             location_logits = outputs.location_logits
         else:
-            location_logits = torch.full([bs, seq, 9], small_val, dtype=logits.dtype, device=logits.device)
         logits = torch.concatenate([
             logits,

 from torch import nn
 from torch.nn import functional as F
+from transformers import LogitsProcessorList, LogitsProcessor, AutoProcessor, ViTConfig
 from transformers.image_utils import PILImageResampling
 from transformers.models.auto import AutoModelForImageTextToText
         return attn_output
+class PointPredictor(nn.Module):
+    """Point predictor logic"""
+    # Kept as its own module so that accelerate co-locates all of these parameters on the same device.
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        llm_dim = config.text_config.hidden_size
+        patch_embed_dim = config.patch_embed_dim
+        vit_dim = self.config.vit_config.hidden_size * len(self.config.adapter_config.vit_layers)  # ViT hidden size times the number of entries in adapter_config.vit_layers
+        if self.config.layer_norm_x:
+            self.x_norm = Molmo2RMSNorm(llm_dim, eps=self.config.text_config.layer_norm_eps)
+        else:
+            self.x_norm = None  # forward() then falls back to config.norm_x scaling, or to no normalization
+        if self.config.token_prediction_rotary == "none":
+            self.patch_rotary = None
+        else:
+            theta = self.config.token_prediction_rotary_theta or self.config.llm.rope_theta  # NOTE(review): fallback reads config.llm while the rest of this class reads config.text_config - confirm config.llm exists
+            if self.config.token_prediction_rotary == "one_d":
+                self.patch_rotary = MolmoPointPatchRope(theta, self.config.patch_embed_dim)
+            else:
+                raise NotImplementedError()
+        self.patch_q = nn.Linear(llm_dim, patch_embed_dim)  # query projection for patch logits (applied to x_norm)
+        self.patch_k = nn.Linear(llm_dim, patch_embed_dim)  # key projection applied to x_norm at image-token positions
+        self.subpatch_q = nn.Linear(llm_dim, patch_embed_dim)  # query projection for subpatch logits (applied to x_norm)
+        self.subpatch_k = nn.Linear(vit_dim, patch_embed_dim)  # key projection applied to vit_features
+        self.add_no_point_class_embed = MolmoPointPadWithLearnedVector(patch_embed_dim)  # only used when config.no_more_points_class is set
+        if self.config.patch_location == "3x3":
+            self.subpatch_loc_k = nn.Linear(llm_dim, 9)  # 9 location logits for the "3x3" setting
+        elif self.config.patch_location is None:
+            self.subpatch_loc_k = None
+        else:
+            raise NotImplementedError(f"Patch location {self.config.patch_location} not implemented")
+    def forward(
+        self,
+        x,
+        token_pooling,
+        is_image_token,
+        is_patch,
+        is_subpatch,
+        is_indexable_image_token,
+        vit_features,
+        vit_features_mask,
+        image_features_mask,
+        input_patch_ids,
+        last_predicted_patch_id,
+        image_data: ImageCache
+    ):
+        dim = self.config.text_config.hidden_size
+        batch_size = x.shape[0]
+        if self.x_norm is not None:
+            x_norm = self.x_norm(x)
+        elif self.config.norm_x:
+            x_norm = x / math.sqrt(dim)
+        else:
+            x_norm = x
+        # Build the keys, or get them from the cache when image_data is provided
+        if image_data is not None:
+            patch_k, subpatch_k = image_data.patch_k, image_data.subpatch_k
+            patch_k_mask = image_data.patch_k_mask
+            token_pooling = image_data.token_pooling  # the token_pooling argument is overridden by the cached value
+            vit_features_mask = token_pooling >= 0  # the vit_features_mask argument is likewise recomputed from the cache
+            image_pos_ids = image_data.image_pos_ids
+        else:
+            # Build patch keys. This takes a bit of indexing trickery since we want the keys in
+            # shape [batch, n_image_tokens], not [batch, sequence_length]
+            n_image_tokens = token_pooling.shape[1]
+            patch_k_flat = self.patch_k(x_norm.view(-1, dim)[is_image_token.view(-1)])
+            if self.patch_rotary is not None:
+                image_token_indices = torch.cumsum(is_indexable_image_token, dim=-1) - 1  # running count of indexable image tokens along the last axis, zero-based
+                image_pos_ids_flat = image_token_indices.view(-1)[is_image_token.view(-1)]
+                patch_k_flat = self.patch_rotary(patch_k_flat, image_pos_ids_flat)
+                # Kept (and cached) so the query vectors can be rotated by the same position ids later
+                image_pos_ids = torch.zeros([batch_size, n_image_tokens], dtype=torch.long,
+                                            device=image_pos_ids_flat.device)
+                image_pos_ids.view(-1)[image_features_mask.view(-1)] = image_pos_ids_flat
+            else:
+                image_pos_ids = None
+            patch_k = torch.zeros([batch_size, n_image_tokens, patch_k_flat.shape[-1]],
+                                  dtype=x.dtype, device=x.device)
+            patch_k.view(-1, patch_k_flat.shape[-1])[image_features_mask.flatten()] = patch_k_flat.to(dtype=x.dtype)  # scatter the flat keys into the slots selected by image_features_mask
+            patch_k_mask = image_features_mask.clone()  # start from the image-feature mask, then keep only the indexable image tokens
+            patch_k_mask.view(-1)[image_features_mask.view(-1)] = (
+                is_indexable_image_token.view(-1)[is_image_token.view(-1)])
+            if self.config.no_more_points_class:
+                patch_k = self.add_no_point_class_embed(patch_k)
+                patch_k_mask = F.pad(patch_k_mask, (0, 1), value=True)  # mask gains one always-valid trailing slot (presumably matching a key appended above - confirm)
+            subpatch_k = self.subpatch_k(vit_features)
+        patch_logits, subpatch_logits, location_logits = None, None, None
+        if image_data is not None:
+            # Predict patch locations; this is only done after pre-filling (i.e. when a cache was passed in)
+            batch_idx = torch.arange(batch_size, device=x_norm.device)
+            image_q = self.patch_q(x_norm)
+            if self.patch_rotary is not None and last_predicted_patch_id is not None:
+                rotate_by = image_pos_ids[batch_idx, last_predicted_patch_id]
+                rotate_by = torch.where(last_predicted_patch_id >= 0, rotate_by, 0)  # negative ids get position 0
+                rotate_by = rotate_by.squeeze(-1)
+                image_q = self.patch_rotary(
+                    image_q.view(-1, image_q.shape[-1]),
+                    torch.clamp(rotate_by, min=0),
+                ).reshape(batch_size, -1, image_q.shape[-1])
+            dots = torch.matmul(image_q, patch_k.transpose(1, 2))  # [batch, 1, num_images]
+            if self.config.norm_logits:
+                dots = dots / math.sqrt(dots.shape[-1])  # NOTE(review): scales by the size of the key axis, whereas the subpatch branch below scales by patch_k.shape[-1] - confirm intended
+            valid = patch_k_mask[:, None, :]
+            patch_logits = torch.where(valid, dots, -100000000)  # masked-out keys get a large negative logit
+            if torch.any(is_patch):
+                if x_norm.shape[1] != 1:
+                    raise NotImplementedError()  # subpatch prediction only handles a sequence axis of length 1
+                subpatch_point_q = self.subpatch_q(x_norm.squeeze(1))
+                subpatch_k = subpatch_k[batch_idx, input_patch_ids.squeeze(1)]  # keys of the patch given in input_patch_ids, one per batch row
+                subpatch_logits = torch.einsum("pd,pcd->pc", subpatch_point_q, subpatch_k)
+                if self.config.norm_logits:
+                    subpatch_logits = subpatch_logits / math.sqrt(patch_k.shape[-1])
+                subpatch_mask = vit_features_mask[batch_idx, input_patch_ids.squeeze(1)]
+                subpatch_logits = torch.where(subpatch_mask, subpatch_logits, -100000)
+                subpatch_logits = subpatch_logits[:, None, :]  # restore the length-1 sequence axis
+            if torch.any(is_subpatch):
+                location_logits = self.subpatch_loc_k(x)  # NOTE(review): uses un-normalized x, and subpatch_loc_k is None when config.patch_location is None - confirm this branch is unreachable in that case
+        if image_data is None:
+            image_data = ImageCache(  # no cache was passed in: package the freshly built keys and masks for reuse by later calls
+                patch_k=patch_k,
+                subpatch_k=subpatch_k,
+                vit_features=vit_features,
+                patch_k_mask=patch_k_mask,
+                token_pooling=token_pooling,
+                image_pos_ids=image_pos_ids,
+            )
+        return patch_logits, subpatch_logits, location_logits, image_data
 class MolmoPointPreTrainedModel(PreTrainedModel):
     config: MolmoPointConfig
     base_model_prefix = "model"
         "Molmo2PostNormDecoderLayer",
         "Molmo2VisionBlock",
         "ViTMultiHeadDotProductAttention",
+        "PointPredictor"
     ]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn = True
 class MolmoPointLogitProcessor(LogitsProcessor):
+    """Force point-special tokens to be generated in a valid order"""
     def __init__(self, bounds: GeneratedTokenBounds,
                  prevent_repeats, force_patch_sorted, force_subpatch_sorted):
     def __init__(self, config: MolmoPointConfig):
         super().__init__(config)
         self.transformer: MolmoPointTextModel = MolmoPointTextModel(config.text_config)
+        self.patch_token_id = self.config.patch_token_id
+        self.subpatch_token_id = self.config.subpatch_token_id
+        self.location_token_id = self.config.location_token_id
         vit_config = config.vit_config
         adapter_config = config.adapter_config
         else:
             self.vit = Molmo2VisionTransformer(vit_config)
         self.connector = MolmoPointConnector(adapter_config, vit_config)
         if self.config.embed_selected_vit_patch == "linear":
+            llm_dim = config.text_config.hidden_size
+            vit_dim = self.config.vit_config.hidden_size * len(self.config.adapter_config.vit_layers)
             self.build_vit_embedding = nn.Linear(vit_dim, llm_dim, bias=True)
         else:
             raise NotImplementedError(f"Embedding {self.config.embed_selected_vit_patch} not implemented")
+        self.point_predictor = PointPredictor(config)
         # Initialize weights and apply final processing
         self.post_init()
         input_ids = input_ids * (input_ids != -1).to(input_ids.dtype)
         if image_data is not None:
+            # Figure out where the patch/subpatch/location are and their values, and then convert
+            # the input_ids back into their original special token values
             can_point = True
             bounds = self.build_token_bounds(image_data.token_pooling)
             expanded_inputs = input_ids
             input_ids = torch.where(is_subpatch, self.subpatch_token_id, input_ids)
             input_ids = torch.where(is_location, self.location_token_id, input_ids)
         else:
+            # No patch prediction during pre-filling
+            input_subpatch_ids = None
             input_patch_ids = None
+            is_patch = None
+            is_subpatch = None
             can_point = False
         device = input_ids.device
         batch_size, _, dim = x.shape
         batch_idx = torch.arange(batch_size, device=device)
         vit_features_flat: Optional[torch.FloatTensor] = None
         if images is not None:
             is_indexable_image_token = input_ids == self.config.image_patch_id
             vit_features_flat = vit_features_flat[image_features_mask.view(-1)]
             vit_features_to_flat_mask = vit_features_mask.view(-1, token_pooling.shape[-1])[image_features_mask.view(-1)]
+            # Finally, apply the connector and add to input embeddings
             image_features = self.connector(vit_features_flat, vit_features_to_flat_mask).to(device=device)
             x = x.clone()
             x.view(-1, dim)[is_image_token.view(-1)] += image_features.view(-1, dim)
+        else:
+            is_image_token = None
+            is_indexable_image_token = None
+            if image_data is not None:
+                # Get the features/masks from the cache
+                token_pooling = image_data.token_pooling.to(device=device)
+                vit_features_mask = token_pooling >= 0
+                image_features_mask = torch.any(vit_features_mask, -1)
+                vit_features = image_data.vit_features.to(device=device)
+            else:
+                vit_features = None
+                vit_features_mask = None
+                image_features_mask = None
         # Embed the points
         if can_point:
             x.view(-1, dim)[is_patch.view(-1)] += image_data.image_features0.view(-1, dim)[input_patch_ids_flat]
             if torch.any(is_subpatch):
+                vit_features_flat = vit_features.reshape([-1, token_pooling.shape[-1], vit_features.shape[-1]])
+                vit_features_flat = vit_features_flat[image_features_mask.view(-1)]
                 assert last_predicted_patch_id is not None, "Patch should always be generated before a subpatch"
                 for_patches = (last_predicted_patch_id.view(batch_size) + image_token_offset)[input_subpatch_ids.view(batch_size) >= 0]
                 vit_features_to_embed = vit_features_flat[for_patches, input_subpatch_ids]
                 past_seen_tokens,
                 past_seen_tokens + inputs_embeds.shape[1],
                 device=inputs_embeds.device,
+            )
         # NOTE: this `is_prefill` logic is not flawless, it fails when we're using a cache eagerly initialized
         # (e.g. compiled prefill) AND `images` are not provided. Determining prefill in that case requires
         location_logits = None
         if images is not None or image_data is not None:
+            patch_logits, subpatch_logits, location_logits, image_data = self.point_predictor(
+                x,
+                token_pooling,
+                is_image_token,
+                is_patch,
+                is_subpatch,
+                is_indexable_image_token,
+                vit_features,
+                vit_features_mask,
+                image_features_mask,
+                input_patch_ids,
+                last_predicted_patch_id,
+                image_data
+            )
+            if images is not None:
+                # Also cache what we need to build the patch/subpatch token embeddings
+                image_data.image_features0 = image_features
                 num_image_tokens = is_image_token.sum(-1)
                 image_token_offset = torch.cumsum(num_image_tokens[:-1], 0)
                 image_token_offset = F.pad(image_token_offset, [1, 0])
+                image_data.flat_image_tokens_to_flat_image_features = image_token_offset
         if last_predicted_patch_id is not None:
             last_predicted_patch_id = torch.where(input_patch_ids == -1, last_predicted_patch_id, input_patch_ids)
         patch_token_logits = torch.clone(logits[:, :, self.config.patch_token_id])
         logits[:, :, self.config.patch_token_id] = small_val
         predicted_patch = predicted_tokens == self.config.patch_token_id
+        argmax_patch_logits = torch.full([bs, seq, n_patches], small_val, dtype=logits.dtype, device=device)
         if outputs.patch_logits is not None:
+            selected_patches = torch.argmax(outputs.patch_logits, -1).to(device=device)
             bs, seq, n_patches = outputs.patch_logits.shape
             batch_idx = torch.arange(outputs.patch_logits.shape[0], device=device)
             seq_ix = torch.arange(outputs.patch_logits.shape[1], device=device)
         if outputs.subpatch_logits is not None:
             subpatch_logits = outputs.subpatch_logits
         else:
+            subpatch_logits = torch.full([bs, seq, n_subpatches], small_val, dtype=logits.dtype, device=device)
         logits[:, :, self.config.location_token_id] = small_val
         if outputs.location_logits is not None:
             location_logits = outputs.location_logits
         else:
+            location_logits = torch.full([bs, seq, 9], small_val, dtype=logits.dtype, device=device)
         logits = torch.concatenate([
             logits,

processing_molmo2.py CHANGED Viewed

@@ -294,7 +294,6 @@ class Molmo2Processor(ProcessorMixin):
               Returned when `videos` is not `None`.
             - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
         """
         output_kwargs = self._merge_kwargs(
             Molmo2ProcessorKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,

               Returned when `videos` is not `None`.
             - **video_grids** -- Grids of videos. Returned when `videos` is not `None`.
         """
         output_kwargs = self._merge_kwargs(
             Molmo2ProcessorKwargs,
             tokenizer_init_kwargs=self.tokenizer.init_kwargs,

video_processing_molmo2.py CHANGED Viewed

@@ -826,7 +826,8 @@ class Molmo2VideoProcessor(BaseVideoProcessor):
     ) -> BatchFeature:
         validate_kwargs(
             captured_kwargs=kwargs.keys(),
-            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) + ["return_tensors"],
         )
         # Set default kwargs from self. This ensures that if a kwarg is not provided

     ) -> BatchFeature:
         validate_kwargs(
             captured_kwargs=kwargs.keys(),
+            valid_processor_keys=list(self.valid_kwargs.__annotations__.keys()) +
+                                 ["return_tensors", "return_pointing_metadata"],
         )
         # Set default kwargs from self. This ensures that if a kwarg is not provided