ToastyPigeon committed on
Commit
437ff8f
·
verified ·
1 Parent(s): d81fb9b

Fix configuration file

Browse files
Files changed (1) hide show
  1. configuration_gemmagain.py +186 -39
configuration_gemmagain.py CHANGED
@@ -1,39 +1,186 @@
1
- {
2
- "architectures": [
3
- "Gemma3ForCausalLM"
4
- ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_gemmagain.GemmagainConfig",
7
- "AutoModelForCausalLM": "modeling_gemmagain.GemmagainForCausalLM"
8
- },
9
- "attention_bias": false,
10
- "attention_dropout": 0.0,
11
- "attn_logit_softcapping": null,
12
- "cache_implementation": "hybrid",
13
- "final_logit_softcapping": null,
14
- "head_dim": 256,
15
- "hidden_activation": "gelu_pytorch_tanh",
16
- "hidden_size": 2560,
17
- "initializer_range": 0.02,
18
- "intermediate_size": 10240,
19
- "max_position_embeddings": 131072,
20
- "model_type": "gemma3",
21
- "num_attention_heads": 8,
22
- "num_hidden_layers": 34,
23
- "num_key_value_heads": 4,
24
- "query_pre_attn_scalar": 256,
25
- "rms_norm_eps": 1e-06,
26
- "rope_local_base_freq": 10000.0,
27
- "rope_scaling": {
28
- "factor": 8.0,
29
- "rope_type": "linear"
30
- },
31
- "rope_theta": 1000000.0,
32
- "sliding_window": 1024,
33
- "sliding_window_pattern": 6,
34
- "layer_sequence": [[0, 34, 1]],
35
- "torch_dtype": "bfloat16",
36
- "use_cache": true,
37
- "vocab_size": 262208,
38
- "transformers_version": "4.51.0"
39
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2025 Google Inc. HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Gemmagain model configuration - Gemma3 with layer looping support"""
16
+
17
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
18
+ from transformers.modeling_rope_utils import rope_config_validation
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
class GemmagainConfig(PretrainedConfig):
    r"""
    Configuration class for Gemmagain - a Gemma3 text model with layer looping support.

    This extends Gemma3TextConfig to add the `layer_sequence` parameter which controls
    how layers are executed, allowing layers to be repeated multiple times.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the model.
        hidden_size (`int`, *optional*, defaults to 2560):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 10240):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 34):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            Number of key_value heads for GQA.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            Maximum sequence length.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated-normal initializer for weight matrices.
        layer_sequence (`list`, *optional*):
            Order to execute layers. Defaults to all layers once.
            Flexible format - each item can be:
            - An integer: single layer index (e.g., 5 means layer 5)
            - A 2-element list [start, end]: range of layers (e.g., [4, 20] means layers 4-19)
            - A 3-element list [start, end, repeats]: range repeated N times
            Examples:
            - [[0, 34, 1]]: all 34 layers once
            - [[0, 10], [10, 28, 2], [28, 34]]: layers 0-9, then 10-27 twice, then 28-33
        layer_types (`list`, *optional*):
            Attention pattern for each layer ("sliding_attention" or "full_attention").
        sliding_window (`int`, *optional*, defaults to 1024):
            Size of the sliding window for sliding attention layers.
        rope_theta (`float`, *optional*, defaults to 1000000.0):
            Base period for RoPE embeddings (global attention).
        rope_local_base_freq (`float`, *optional*, defaults to 10000.0):
            Base period for RoPE embeddings (local/sliding attention).
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor for attention scores.
        rms_norm_eps (`float`, *optional*, defaults to 1e-6):
            Epsilon for RMS normalization.
        attention_bias (`bool`, *optional*, defaults to False):
            Whether to use bias in attention projections.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout ratio for attention.
        final_logit_softcapping (`float`, *optional*):
            Softcapping for final logits.
        attn_logit_softcapping (`float`, *optional*):
            Softcapping for attention logits.
        rope_scaling (`dict`, *optional*):
            RoPE scaling configuration.
        use_bidirectional_attention (`bool`, *optional*, defaults to False):
            If True, use bidirectional attention instead of causal.

    Raises:
        ValueError: If `layer_sequence` contains an entry that is not an int or a
            2/3-element list/tuple, or whose layer indices fall outside
            `[0, num_hidden_layers]`.
    """

    # Kept as "gemma3" so tokenizer/processor auto-resolution and weight layouts
    # match the upstream Gemma3 text model this config extends.
    model_type = "gemma3"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Tensor-parallel sharding plan (column-wise for input projections,
    # row-wise for output projections), mirroring the Gemma3 plan.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Pipeline-parallel plan: (input tensor names, output tensor names) per stage.
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=262_208,
        hidden_size=2560,
        intermediate_size=10240,
        num_hidden_layers=34,
        num_attention_heads=8,
        num_key_value_heads=4,
        head_dim=256,
        hidden_activation="gelu_pytorch_tanh",
        max_position_embeddings=131_072,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=1_000_000.0,
        attention_bias=False,
        attention_dropout=0.0,
        query_pre_attn_scalar=256,
        sliding_window=1024,
        layer_types=None,
        layer_sequence=None,
        final_logit_softcapping=None,
        attn_logit_softcapping=None,
        rope_scaling=None,
        rope_local_base_freq=10_000.0,
        use_bidirectional_attention=False,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.query_pre_attn_scalar = query_pre_attn_scalar
        self.sliding_window = sliding_window
        self.final_logit_softcapping = final_logit_softcapping
        self.attn_logit_softcapping = attn_logit_softcapping
        self.use_bidirectional_attention = use_bidirectional_attention

        # Bidirectional attention looks both directions, so the effective window
        # is halved (plus one for the center position) to keep coverage comparable.
        if use_bidirectional_attention:
            self.sliding_window = (self.sliding_window // 2) + 1

        self.rope_local_base_freq = rope_local_base_freq
        self.rope_scaling = rope_scaling
        rope_config_validation(self)

        # Layer sequence for looping - defaults to all layers once.
        if layer_sequence is None:
            layer_sequence = [[0, num_hidden_layers, 1]]
        # Fail fast on malformed sequences instead of erroring deep inside the
        # modeling code at forward time.
        self._validate_layer_sequence(layer_sequence, num_hidden_layers)
        self.layer_sequence = layer_sequence

        # Layer types (sliding vs full attention). The pattern may arrive via
        # kwargs (e.g. from a serialized config's "sliding_window_pattern" key).
        self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 6)
        self.layer_types = layer_types
        if self.layer_types is None:
            # Every Nth layer (N = pattern) uses full attention; the rest slide.
            self.layer_types = [
                "sliding_attention" if bool((i + 1) % self._sliding_window_pattern) else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types, self.num_hidden_layers)

    @staticmethod
    def _validate_layer_sequence(layer_sequence, num_hidden_layers):
        """Check that every `layer_sequence` entry is well-formed and in range.

        Accepted entry shapes (matching the class docstring): a single int layer
        index, `[start, end]`, or `[start, end, repeats]` where `end` is
        exclusive. Raises ValueError otherwise.
        """
        for item in layer_sequence:
            if isinstance(item, int):
                if not 0 <= item < num_hidden_layers:
                    raise ValueError(
                        f"layer_sequence entry {item} is out of range for "
                        f"{num_hidden_layers} hidden layers"
                    )
            elif isinstance(item, (list, tuple)) and len(item) in (2, 3):
                start, end = item[0], item[1]
                if not (isinstance(start, int) and isinstance(end, int)):
                    raise ValueError(f"layer_sequence range bounds must be ints, got {item}")
                # `end` is exclusive, so end == num_hidden_layers is valid.
                if not 0 <= start < end <= num_hidden_layers:
                    raise ValueError(
                        f"layer_sequence range {item} is invalid for "
                        f"{num_hidden_layers} hidden layers"
                    )
            else:
                raise ValueError(
                    "layer_sequence entries must be an int, [start, end], or "
                    f"[start, end, repeats]; got {item!r}"
                )
184
# Public API of this module: only the config class is exported via `import *`.
__all__ = ["GemmagainConfig"]