| """ |
| Configuration for Gemma3 with tiled image processing. |
| |
| This extends the base Gemma3Config to support dynamic resolution images |
| by tiling them into a grid of fixed-size tiles (896x896 by default). |
| """ |
|
|
| from typing import Optional |
|
|
| from transformers import Gemma3Config |
|
|
|
|
class Gemma3TiledConfig(Gemma3Config):
    r"""
    Configuration class for the Gemma3Tiled model.

    Extends ``Gemma3Config`` with tiling-specific parameters for handling
    high-resolution images by splitting them into a grid of tiles. The
    tiling parameters are stored on the instance exactly as given (no
    validation is performed here); every other keyword argument is
    forwarded unchanged to ``Gemma3Config.__init__``.

    Args:
        tile_size: Size of each tile (default: 896, matching Gemma3's
            vision encoder).
        max_tiles_h: Maximum number of tiles in the height dimension.
        max_tiles_w: Maximum number of tiles in the width dimension.
        min_tiles: Minimum total number of tiles (to ensure some detail).
        linebreak_token_id: Token ID to use for linebreak embedding lookup.
            ``None`` is stored as-is; the intent is that the "\n" token ID
            is used in that case, but that fallback is resolved by whatever
            consumes this config, not by this class.
        **kwargs: Passed through to the base ``Gemma3Config`` constructor.
    """

    model_type = "gemma3_tiled"

    def __init__(
        self,
        tile_size: int = 896,
        max_tiles_h: int = 4,
        max_tiles_w: int = 4,
        min_tiles: int = 1,
        linebreak_token_id: Optional[int] = None,
        **kwargs,
    ):
        # Let the base config consume its own arguments first; the
        # tiling-specific attributes are attached afterwards.
        super().__init__(**kwargs)

        self.tile_size = tile_size
        self.max_tiles_h = max_tiles_h
        self.max_tiles_w = max_tiles_w
        self.min_tiles = min_tiles
        self.linebreak_token_id = linebreak_token_id

    @property
    def tokens_per_tile_side(self) -> int:
        """Number of tokens per side after projection (16 for 256 tokens per tile).

        Computed as the square root of ``self.mm_tokens_per_image``,
        truncated to an int. ``mm_tokens_per_image`` is not set in this
        class -- presumably it is provided by the base ``Gemma3Config``
        (TODO confirm). A value that is not a perfect square is silently
        rounded down.
        """
        return int(self.mm_tokens_per_image ** 0.5)
|
|
|
|
| # Public API of this module: names exported by `from <module> import *`. |
| __all__ = ["Gemma3TiledConfig"] |
|
|