Fraser
/

gemma-3-tiled-4b-it

Safetensors

gemma3_tiled

custom_code

Model card Files Files and versions

xet

Community

Fraser committed on Jan 20

Commit

c9a2bf3

verified ·

1 Parent(s): 4f71f8f

Fix grid shape ordering for multi-image inputs

Browse files

Files changed (1) hide show

processing_gemma3_tiled.py +33 -21

processing_gemma3_tiled.py CHANGED Viewed

@@ -8,6 +8,7 @@ based on the tile grid dimensions.
 import re
 from typing import Optional, Union
 import numpy as np
 from transformers.feature_extraction_utils import BatchFeature
@@ -51,7 +52,7 @@ class Gemma3TiledProcessor(ProcessorMixin):
     """
     attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "image_processing_gemma3_tiled.Gemma3TiledImageProcessor"
     tokenizer_class = "AutoTokenizer"
     def __init__(
@@ -144,8 +145,8 @@ class Gemma3TiledProcessor(ProcessorMixin):
             # Process images to get tiles
             image_inputs = self.image_processor(images_fetched, **output_kwargs["images_kwargs"])
-            # Get grid shapes for each image
-            tile_grid_shapes = image_inputs.get("tile_grid_shape", [])
             # Create empty text to be replaced with placeholders
             if not text:
@@ -158,11 +159,12 @@ class Gemma3TiledProcessor(ProcessorMixin):
             # Build flat list of grid shapes across all batches
             all_grid_shapes = []
             for imgs in batched_images:
                 for _ in imgs:
-                    if tile_grid_shapes:
-                        all_grid_shapes.append(tile_grid_shapes.pop(0))
-                    else:
                         # Fallback to 1x1 grid
                         all_grid_shapes.append((1, 1))
@@ -170,36 +172,46 @@ class Gemma3TiledProcessor(ProcessorMixin):
             grid_shape_idx = 0
             for batch_idx, (prompt, imgs) in enumerate(zip(text, batched_images)):
                 image_indexes = [m.start() for m in re.finditer(re.escape(self.boi_token), prompt)]
                 if len(imgs) != len(image_indexes):
                     raise ValueError(
                         f"Prompt contained {len(image_indexes)} image tokens but received {len(imgs)} images."
                     )
                 # Replace each BOI token with the full image sequence
-                for idx in reversed(image_indexes):
-                    grid_h, grid_w = all_grid_shapes[grid_shape_idx]
-                    grid_shape_idx += 1
                     image_sequence = self.build_image_token_sequence(grid_h, grid_w)
                     prompt = prompt[:idx] + image_sequence + prompt[idx + len(self.boi_token):]
                 text[batch_idx] = prompt
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
-        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
         # Add token type ids (1 for image tokens, 0 for text)
         if return_mm_token_type_ids:
-            array_ids = np.array(text_inputs["input_ids"])
-            mm_token_type_ids = np.zeros_like(array_ids)
-            mm_token_type_ids[array_ids == self.image_token_id] = 1
-            text_inputs["token_type_ids"] = mm_token_type_ids.tolist()
-        # Combine outputs
-        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
     @property
     def model_input_names(self):

 import re
 from typing import Optional, Union
+import torch
 import numpy as np
 from transformers.feature_extraction_utils import BatchFeature
     """
     attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"  # Use AutoImageProcessor for compatibility
     tokenizer_class = "AutoTokenizer"
     def __init__(
             # Process images to get tiles
             image_inputs = self.image_processor(images_fetched, **output_kwargs["images_kwargs"])
+            # Get grid shapes for each image (make a copy to avoid mutating)
+            tile_grid_shapes = list(image_inputs.get("tile_grid_shape", []))
             # Create empty text to be replaced with placeholders
             if not text:
             # Build flat list of grid shapes across all batches
             all_grid_shapes = []
+            grid_shape_iter = iter(tile_grid_shapes)
             for imgs in batched_images:
                 for _ in imgs:
+                    try:
+                        all_grid_shapes.append(next(grid_shape_iter))
+                    except StopIteration:
                         # Fallback to 1x1 grid
                         all_grid_shapes.append((1, 1))
             grid_shape_idx = 0
             for batch_idx, (prompt, imgs) in enumerate(zip(text, batched_images)):
                 image_indexes = [m.start() for m in re.finditer(re.escape(self.boi_token), prompt)]
                 if len(imgs) != len(image_indexes):
                     raise ValueError(
                         f"Prompt contained {len(image_indexes)} image tokens but received {len(imgs)} images."
                     )
+                # Get grid shapes for this batch's images (in order)
+                batch_grid_shapes = all_grid_shapes[grid_shape_idx:grid_shape_idx + len(imgs)]
+                grid_shape_idx += len(imgs)
                 # Replace each BOI token with the full image sequence
+                # Iterate in reverse to avoid shifting string indices, but also reverse grid shapes to match
+                for idx, (grid_h, grid_w) in zip(reversed(image_indexes), reversed(batch_grid_shapes)):
                     image_sequence = self.build_image_token_sequence(grid_h, grid_w)
                     prompt = prompt[:idx] + image_sequence + prompt[idx + len(self.boi_token):]
                 text[batch_idx] = prompt
         return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
         return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        # Get text inputs - let tokenizer handle tensor conversion for text
+        text_inputs = self.tokenizer(text=text, return_tensors=return_tensors, **output_kwargs["text_kwargs"])
         # Add token type ids (1 for image tokens, 0 for text)
         if return_mm_token_type_ids:
+            if return_tensors == "pt":
+                input_ids = text_inputs["input_ids"]
+                mm_token_type_ids = torch.zeros_like(input_ids)
+                mm_token_type_ids[input_ids == self.image_token_id] = 1
+                text_inputs["token_type_ids"] = mm_token_type_ids
+            else:
+                array_ids = np.array(text_inputs["input_ids"])
+                mm_token_type_ids = np.zeros_like(array_ids)
+                mm_token_type_ids[array_ids == self.image_token_id] = 1
+                text_inputs["token_type_ids"] = mm_token_type_ids.tolist()
+        # Combine outputs - DON'T pass tensor_type here because pixel_values
+        # has inhomogeneous shapes (different tile counts per image)
+        return BatchFeature(data={**text_inputs, **image_inputs})
     @property
     def model_input_names(self):