Update processing_gemma3_tiled.py
Browse files- processing_gemma3_tiled.py +106 -7
processing_gemma3_tiled.py
CHANGED
|
@@ -13,10 +13,52 @@ import numpy as np
|
|
| 13 |
|
| 14 |
from transformers.feature_extraction_utils import BatchFeature
|
| 15 |
from transformers.image_utils import ImageInput, make_nested_list_of_images
|
| 16 |
-
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, ImagesKwargs
|
| 17 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 18 |
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
class Gemma3TiledImagesKwargs(ImagesKwargs):
|
| 21 |
tile_size: Optional[int]
|
| 22 |
max_tiles_h: Optional[int]
|
|
@@ -54,6 +96,7 @@ class Gemma3TiledProcessor(ProcessorMixin):
|
|
| 54 |
attributes = ["image_processor", "tokenizer"]
|
| 55 |
image_processor_class = "AutoImageProcessor" # Use AutoImageProcessor for compatibility
|
| 56 |
tokenizer_class = "AutoTokenizer"
|
|
|
|
| 57 |
|
| 58 |
def __init__(
|
| 59 |
self,
|
|
@@ -99,20 +142,24 @@ class Gemma3TiledProcessor(ProcessorMixin):
|
|
| 99 |
def build_image_token_sequence(self, grid_h: int, grid_w: int) -> str:
|
| 100 |
"""
|
| 101 |
Build the image token sequence for a tiled image.
|
| 102 |
-
|
| 103 |
Returns a string like:
|
| 104 |
-
\n\n<boi><img>×(16*grid_w)<img>×(16*grid_w)...(×16*grid_h rows)...<eoi
|
| 105 |
-
|
| 106 |
Note: We use <img> tokens for BOTH actual image positions AND linebreak positions.
|
| 107 |
The model will replace them with the appropriate embeddings.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
"""
|
| 109 |
rows = grid_h * self.tokens_per_tile_side
|
| 110 |
cols = grid_w * self.tokens_per_tile_side
|
| 111 |
-
|
| 112 |
total_tokens = self.get_num_image_tokens(grid_h, grid_w)
|
| 113 |
image_tokens = self.image_token * total_tokens
|
| 114 |
-
|
| 115 |
-
return f"\n\n{self.boi_token}{image_tokens}{self.eoi_token}
|
| 116 |
|
| 117 |
def __call__(
|
| 118 |
self,
|
|
@@ -218,6 +265,58 @@ class Gemma3TiledProcessor(ProcessorMixin):
|
|
| 218 |
tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"]
|
| 219 |
image_processor_input_names = self.image_processor.model_input_names
|
| 220 |
return list(set(tokenizer_input_names + image_processor_input_names))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
|
| 222 |
|
| 223 |
__all__ = ["Gemma3TiledProcessor", "Gemma3TiledProcessorKwargs"]
|
|
|
|
| 13 |
|
| 14 |
from transformers.feature_extraction_utils import BatchFeature
|
| 15 |
from transformers.image_utils import ImageInput, make_nested_list_of_images
|
| 16 |
+
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, ImagesKwargs, MultiModalData
|
| 17 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
| 18 |
|
| 19 |
|
| 20 |
+
def calculate_tile_grid(
    image_height: int,
    image_width: int,
    tile_size: int,
    max_tiles_h: int,
    max_tiles_w: int,
    min_tiles: int = 1,
) -> tuple[int, int]:
    """Pick the tile grid (rows, cols) that best fits an image.

    Every candidate grid up to ``max_tiles_h x max_tiles_w`` (with at least
    ``min_tiles`` tiles) is scored by:

    1. Effective resolution — how many of the original pixels survive after
       scaling the image to fit the grid's canvas (capped at the original
       pixel count, since upscaling adds no information).
    2. Wasted canvas area, as a small tiebreaker penalty.

    Args:
        image_height: Original image height in pixels.
        image_width: Original image width in pixels.
        tile_size: Side length of one square tile, in pixels.
        max_tiles_h: Maximum number of tile rows to consider.
        max_tiles_w: Maximum number of tile columns to consider.
        min_tiles: Minimum total tile count a candidate grid must have.

    Returns:
        The best ``(rows, cols)`` grid; ``(1, 1)`` if no candidate satisfies
        ``min_tiles``.
    """
    source_pixels = image_height * image_width

    def _score(grid: tuple[int, int]) -> float:
        n_rows, n_cols = grid
        canvas_h = n_rows * tile_size
        canvas_w = n_cols * tile_size
        # Uniform scale that fits the whole image inside the canvas.
        fit_scale = min(canvas_w / image_width, canvas_h / image_height)
        # Pixels preserved from the source, capped at the source resolution.
        kept = min(image_height * image_width * fit_scale * fit_scale, source_pixels)
        unused = (canvas_h * canvas_w) - kept
        return kept - 0.001 * unused

    candidates = (
        (n_rows, n_cols)
        for n_rows in range(1, max_tiles_h + 1)
        for n_cols in range(1, max_tiles_w + 1)
        if n_rows * n_cols >= min_tiles
    )
    # max() returns the first maximal candidate, matching the original
    # "strictly greater" update rule's tie-breaking (row-major order).
    return max(candidates, key=_score, default=(1, 1))
|
| 60 |
+
|
| 61 |
+
|
| 62 |
class Gemma3TiledImagesKwargs(ImagesKwargs):
|
| 63 |
tile_size: Optional[int]
|
| 64 |
max_tiles_h: Optional[int]
|
|
|
|
| 96 |
attributes = ["image_processor", "tokenizer"]
|
| 97 |
image_processor_class = "AutoImageProcessor" # Use AutoImageProcessor for compatibility
|
| 98 |
tokenizer_class = "AutoTokenizer"
|
| 99 |
+
_auto_class = "AutoProcessor" # Required for auto_map in processor_config.json
|
| 100 |
|
| 101 |
def __init__(
|
| 102 |
self,
|
|
|
|
| 142 |
def build_image_token_sequence(self, grid_h: int, grid_w: int) -> str:
    """Build the placeholder token string for one tiled image.

    Args:
        grid_h: Number of tile rows in the image's tile grid.
        grid_w: Number of tile columns in the image's tile grid.

    Returns:
        Two leading newlines, then ``boi_token``, then ``image_token``
        repeated ``get_num_image_tokens(grid_h, grid_w)`` times, then
        ``eoi_token``. The same ``<img>`` token is used for BOTH actual image
        positions AND linebreak positions; the model replaces them with the
        appropriate embeddings.

    IMPORTANT: no trailing double newline is appended — when followed by text
    content that starts with a newline it would create a triple newline,
    which tokenizes differently and breaks vLLM's placeholder pattern
    matching.
    """
    # Fix over previous revision: the per-row/per-column token counts
    # (grid_* x tokens_per_tile_side) were computed here but never used;
    # the total already comes from get_num_image_tokens.
    total_tokens = self.get_num_image_tokens(grid_h, grid_w)
    image_tokens = self.image_token * total_tokens
    return f"\n\n{self.boi_token}{image_tokens}{self.eoi_token}"
|
| 163 |
|
| 164 |
def __call__(
|
| 165 |
self,
|
|
|
|
| 265 |
tokenizer_input_names = self.tokenizer.model_input_names + ["token_type_ids"]
|
| 266 |
image_processor_input_names = self.image_processor.model_input_names
|
| 267 |
return list(set(tokenizer_input_names + image_processor_input_names))
|
| 268 |
+
|
| 269 |
+
def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
    """
    Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

    This is required by vLLM for memory profiling and scheduling.

    Args:
        image_sizes (`list[list[int]]`, *optional*):
            The input sizes formatted as (height, width) per each image.
        **kwargs: Additional arguments (tile_size, max_tiles_h, max_tiles_w, min_tiles)
            that override image processor defaults.

    Returns:
        `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
        input modalities, along with other useful data.
    """
    vision_data = {}
    if image_sizes is not None:
        # Tiling parameters: explicit kwargs win, then image-processor
        # attributes, then hard-coded fallbacks.
        tiling = {
            name: kwargs.get(name, getattr(self.image_processor, name, default))
            for name, default in (
                ("tile_size", 896),
                ("max_tiles_h", 4),
                ("max_tiles_w", 4),
                ("min_tiles", 1),
            )
        }

        # Optimal tile grid per image, in input order.
        grids = [
            calculate_tile_grid(image_height=height, image_width=width, **tiling)
            for height, width in image_sizes
        ]

        vision_data.update({
            # Placeholder token count for each image's grid.
            "num_image_tokens": [
                self.get_num_image_tokens(grid_h, grid_w) for grid_h, grid_w in grids
            ],
            # Number of patches = number of tiles in the grid.
            "num_image_patches": [grid_h * grid_w for grid_h, grid_w in grids],
        })

    return MultiModalData(**vision_data)
|
| 320 |
|
| 321 |
|
| 322 |
__all__ = ["Gemma3TiledProcessor", "Gemma3TiledProcessorKwargs"]
|