File size: 16,514 Bytes
0b0ec56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# Copyright 2026 the HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Literal

from huggingface_hub.dataclasses import strict

from transformers.configuration_utils import PreTrainedConfig
from transformers.utils import auto_docstring, logging
from transformers.utils.type_validators import interval


logger = logging.get_logger(__name__)


@auto_docstring(checkpoint="google/gemma-4-e2b-it")
@strict
class Gemma4AudioConfig(PreTrainedConfig):
    r"""
    subsampling_conv_channels (`list[int]`, defaults to `[128, 32]`):
        Channel sizes for the convolutional layers in the Sub-sample Convolution Projection.
    residual_weight (`float`, defaults to `0.5`):
        Scaling applied to hidden_states prior to combining with the residual in the feedforward.
    attention_chunk_size (`int`, defaults to `12`):
        The sub-sequence size for attention processing.
    attention_context_left (`int`, defaults to `13`):
        The leftward context size for the attention chunk.
    attention_context_right (`int`, defaults to `0`):
        The rightward context size for the attention chunk.
    attention_logit_cap (`float`, defaults to `50.0`):
        Cap applied to attention weights.
    attention_invalid_logits_value (`float`, defaults to `1e-9`):
        Value to use for invalid logits in attention.
    use_clipped_linears (`bool`, defaults to `True`):
        If true, apply clipping to the Linear layers, drawing bounds from the model checkpoint.
    gradient_clipping (`float`, defaults to `1e10`):
        Clipping value used to stabilize extremely large gradient values.
    output_proj_dims (`int`, defaults to `1536`):
        Dimension of the final linear projection from `hidden_size` to the model's output.
    """

    model_type = "gemma4_audio"

    hidden_size: int = 1024
    num_hidden_layers: int = 12
    num_attention_heads: int = 8
    hidden_act: str = "silu"

    # subsampling parameters
    subsampling_conv_channels: list[int] | tuple[int, int] = (128, 32)

    # conformer parameters
    conv_kernel_size: int = 5
    residual_weight: float = 0.5
    attention_chunk_size: int = 12
    attention_context_left: int = 13
    attention_context_right: int = 0
    attention_logit_cap: float = 50.0
    attention_invalid_logits_value: float = -1.0e9

    use_clipped_linears: bool = True
    rms_norm_eps: float = 1e-6
    gradient_clipping: float = 1e10
    output_proj_dims: int = 1536
    initializer_range: float = interval(min=0.0, max=1.0)(default=0.02)

    def __post_init__(self, **kwargs):
        # JSON serialization converts tuples to lists, convert back
        if isinstance(self.subsampling_conv_channels, tuple):
            self.subsampling_conv_channels = list(self.subsampling_conv_channels)
        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="google/gemma-4-e2b-it")
@strict
class Gemma4TextConfig(PreTrainedConfig):
    r"""
    use_bidirectional_attention (`str`, *optional*):
        Controls bidirectional attention behavior. When set to `"vision"`, vision tokens
        attend bidirectionally while text tokens use causal attention. When set to `"all"`,
        all tokens use bidirectional attention.
    vocab_size_per_layer_input (`int`, defaults to 262144):
        Vocabulary size for the per-layer input embeddings. Used by models with per-layer
        residual streams where a smaller embedding is added at each decoder layer.
    hidden_size_per_layer_input (`int`, defaults to 256):
        Hidden dimension for the per-layer input embeddings. Controls the width of the
        per-layer residual embedding vectors.
    num_global_key_value_heads (`int`, *optional*):
        Number of key-value heads for global (full) attention layers. If `None`, defaults
        to `num_key_value_heads`.
    global_head_dim (`int`, defaults to 512):
        Dimension of each attention head in global (full) attention layers.
    attention_k_eq_v (`bool`, defaults to `False`):
        Whether keys and values share the same projection weights. When `True`, the key
        projection output is reused as the value projection.
    num_kv_shared_layers (`int`, defaults to 0):
        Number of consecutive decoder layers that share the same key-value projections.
        A value of 0 means no sharing (each layer has independent KV projections).
    enable_moe_block (`bool`, defaults to `False`):
        Whether to enable Mixture-of-Experts (MoE) blocks in the decoder layers. When
        `True`, eligible layers will use a sparse MoE feed-forward network.
    use_double_wide_mlp (`bool`, defaults to `False`):
        Whether to use a double-width MLP with fused gate and up projections.
    top_k_experts (`int`, *optional*):
        Number of experts activated per token in MoE layers. Only used when
        `enable_moe_block=True`.
    moe_intermediate_size (`int`, *optional*):
        Intermediate (hidden) size of each expert's feed-forward network in MoE layers.
        Only used when `enable_moe_block=True`.
    add_zero_compute_expert (`bool`, defaults to `False`):
        Whether to append a router-only expert slot that performs no expert compute. This
        keeps the original expert weights intact while allowing the router to learn to
        send tokens to a zero-compute path.
    use_zero_compute_optimization (`bool`, defaults to `False`):
        Signals higher-level orchestration to build the optimized Gemma4 text stack instead
        of the original one while keeping the base architecture definitions available.
    """

    model_type = "gemma4_text"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
        "layers.*.experts.gate_up_proj": "packed_colwise",
        "layers.*.experts.down_proj": "rowwise",
        "layers.*.experts": "moe_tp_experts",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    vocab_size: int = 262_144
    hidden_size: int = 2304
    intermediate_size: int = 9216
    num_hidden_layers: int = 30
    num_attention_heads: int = 8
    num_key_value_heads: int = 4
    head_dim: int = 256
    hidden_activation: str = "gelu_pytorch_tanh"
    max_position_embeddings: int = 131_072
    initializer_range: float = 0.02
    rms_norm_eps: float = 1e-6
    use_cache: bool = True
    pad_token_id: int | None = 0
    eos_token_id: int | list[int] | None = 1
    bos_token_id: int | None = 2
    tie_word_embeddings: bool = True
    rope_parameters: dict | None = None
    attention_bias: bool = False
    attention_dropout: int | float | None = 0.0
    sliding_window: int = 512
    layer_types: list[str] | None = None
    final_logit_softcapping: float | None = None
    use_bidirectional_attention: Literal["all", "vision"] | None = None
    vocab_size_per_layer_input: int = 262_144
    hidden_size_per_layer_input: int = 256
    num_global_key_value_heads: int | None = None
    global_head_dim: int = 512
    attention_k_eq_v: bool = False
    num_kv_shared_layers: int = 0
    enable_moe_block: bool = False
    use_double_wide_mlp: bool = False
    num_experts: int | None = None
    top_k_experts: int | None = None
    moe_intermediate_size: int | None = None
    add_zero_compute_expert: bool = False
    use_zero_compute_optimization: bool = False

    def __post_init__(self, **kwargs):
        if self.use_bidirectional_attention == "all":
            self.sliding_window = (self.sliding_window // 2) + 1  # due to fa we set exclusive bounds

        if self.layer_types is None:
            sliding_window_pattern = 6  # by default 5:1
            self.layer_types = [
                "sliding_attention" if bool((i + 1) % sliding_window_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]

        if self.layer_types and (last_layer_type := self.layer_types[-1]) != "full_attention":
            logger.warning(
                f"Last layer must use `full_attention`, but got `{last_layer_type}`. Forcing last layer to `full_attention`."
            )
            self.layer_types[-1] = "full_attention"

        default_rope_params: dict[Literal["full_attention", "sliding_attention"] : dict[str, Any]] = {
            "sliding_attention": {"rope_type": "default", "rope_theta": 10_000.0},
            "full_attention": {"rope_type": "proportional", "partial_rotary_factor": 0.25, "rope_theta": 1_000_000.0},
        }
        active_layer_types = set(self.layer_types)
        if self.rope_parameters is None:
            self.rope_parameters = {
                layer_type: dict(default_rope_params[layer_type]) for layer_type in active_layer_types
            }
        elif set(self.rope_parameters.keys()).issubset(default_rope_params):
            self.rope_parameters = {
                layer_type: dict(rope_params)
                for layer_type, rope_params in self.rope_parameters.items()
                if layer_type in active_layer_types
            }

        if self.num_experts is not None and self.top_k_experts is not None:
            total_num_experts = self.num_experts + int(self.add_zero_compute_expert)
            if self.top_k_experts > total_num_experts:
                logger.warning(
                    "top_k_experts=%s exceeds the available expert count %s. "
                    "Clamping top_k_experts to %s.",
                    self.top_k_experts,
                    total_num_experts,
                    total_num_experts,
                )
                self.top_k_experts = total_num_experts

        if self.add_zero_compute_expert:
            self.use_zero_compute_optimization = True

        super().__post_init__(**kwargs)

    def convert_rope_params_to_dict(self, **kwargs):
        # No need to handle BC for new models, because they have no old-format `rope_scaling`
        return kwargs


@auto_docstring(checkpoint="google/gemma-4-e2b-it")
@strict
class Gemma4VisionConfig(PreTrainedConfig):
    r"""
    pooling_kernel_size (`int`, *optional*):
        Spatial pooling kernel size applied after patchification.
    position_embedding_size (`int`, defaults to 10240):
        Maximum number of position embeddings for the vision encoder. Controls the size of
        the learned 2D position embedding table used by the patch embedder.
    use_clipped_linears (`bool`, defaults to `False`):
        Whether to use weight-clipped linear layers. When enabled, linear layer weights are
        clamped to a fixed range during the forward pass to improve numerical stability.
    standardize (`bool`, defaults to `False`):
        If true, applies a bias and scale to the soft tokens returned from the pooler.
    """

    model_type = "gemma4_vision"
    base_model_tp_plan = {
        "encoder.layers.*.self_attn.q_proj": "colwise",
        "encoder.layers.*.self_attn.k_proj": "colwise",
        "encoder.layers.*.self_attn.v_proj": "colwise",
        "encoder.layers.*.self_attn.q_norm": "replicated_with_grad_allreduce",
        "encoder.layers.*.self_attn.k_norm": "replicated_with_grad_allreduce",
        "encoder.layers.*.self_attn.o_proj": "rowwise",
        "encoder.layers.*.mlp.gate_proj": "colwise",
        "encoder.layers.*.mlp.up_proj": "colwise",
        "encoder.layers.*.mlp.down_proj": "rowwise",
    }
    default_theta = 100.0

    hidden_size: int = 768
    intermediate_size: int = 3072
    num_hidden_layers: int = 16
    num_attention_heads: int = 12
    num_key_value_heads: int = 12
    head_dim: int = 64
    hidden_activation: str = "gelu_pytorch_tanh"
    rms_norm_eps: float = 1e-6
    max_position_embeddings: int = 131_072
    attention_bias: bool | None = False
    attention_dropout: float | None = 0.0
    rope_parameters: dict | None = None
    pooling_kernel_size: int = 3
    patch_size: int = 16
    position_embedding_size: int = 10 * 1024
    use_clipped_linears: bool = False
    standardize: bool = False
    initializer_range: float = 0.02

    def __post_init__(self, **kwargs):
        if self.rope_parameters is None:
            self.rope_parameters = {"rope_type": "default", "rope_theta": 100.0}

        super().__post_init__(**kwargs)


@auto_docstring(checkpoint="google/gemma-4-e2b-it")
@strict
class Gemma4Config(PreTrainedConfig):
    r"""
    boi_token_id (`int`, *optional*, defaults to 255999):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_id (`int`, *optional*, defaults to 258882):
        The end-of-image token index to wrap the image prompt.
    boa_token_id (`int`, *optional*, defaults to 256000):
        The begin-of-audio token index to wrap the audio prompt.
    eoa_token_index (`int`, *optional*, defaults to 258883):
        The end-of-audio token index to wrap the audio prompt.

    Example:

    ```python
    >>> from transformers import (
    >>>     Gemma4AudioConfig,
    >>>     Gemma4Config,
    >>>     Gemma4ForConditionalGeneration,
    >>>     Gemma4TextConfig,
    >>>     Gemma4VisionConfig,
    >>> )

    >>> # Initializing a Gemma 4 Audio config.
    >>> audio_config = Gemma4AudioConfig()

    >>> # Initializing a Gemma 4 Text config.
    >>> text_config = Gemma4TextConfig()

    >>> # Initializing a Gemma 4 vision config.
    >>> vision_config = Gemma4VisionConfig()

    >>> # Initializing a Gemma 4 config similar to google/gemma-4-e2b-it
    >>> configuration = Gemma4Config(text_config, vision_config, audio_config)

    >>> # Initializing a model from the google/gemma-4-e2b-it configuration
    >>> model = Gemma4ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "gemma4"
    sub_configs = {
        "text_config": Gemma4TextConfig,
        "vision_config": Gemma4VisionConfig,
        "audio_config": Gemma4AudioConfig,
    }

    text_config: Gemma4TextConfig | dict[str, Any] | None = None
    vision_config: Gemma4VisionConfig | dict[str, Any] | None = None
    audio_config: Gemma4AudioConfig | dict[str, Any] | None = None
    boi_token_id: int | None = 255_999
    eoi_token_id: int | None = 258_882
    image_token_id: int | None = 258_880
    video_token_id: int | None = 258_884
    boa_token_id: int | None = 256_000
    eoa_token_index: int | None = 258_883
    audio_token_id: int | None = 258_881
    initializer_range: float | None = 0.02
    tie_word_embeddings: bool = True

    def __post_init__(self, **kwargs):
        if self.text_config is None:
            self.text_config = Gemma4TextConfig()
            logger.info("text_config is None. Using default Gemma4TextConfig.")
        elif isinstance(self.text_config, dict):
            self.text_config = Gemma4TextConfig(**self.text_config)

        if self.vision_config is None:
            logger.info("vision_config is None. Gemma4Model.vision_tower will not be initialized.")
        if isinstance(self.vision_config, dict):
            self.vision_config = Gemma4VisionConfig(**self.vision_config)

        if self.audio_config is None:
            logger.info("audio_config is None. Gemma4Model.audio_tower will not be initialized.")
        if isinstance(self.audio_config, dict):
            self.audio_config = Gemma4AudioConfig(**self.audio_config)

        super().__post_init__(**kwargs)


__all__ = ["Gemma4AudioConfig", "Gemma4Config", "Gemma4TextConfig", "Gemma4VisionConfig"]