rpDungeon
/

gemmagain-trained-fizzed-loopnt

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Gemmagain - Gemma3 text model with layer looping support (wrapper approach).
 This model allows running the same physical layers multiple times in sequence,
 enabling parameter-efficient deep networks. Compatible with standard Gemma3 weights.
@@ -42,9 +42,9 @@ from transformers.utils import TransformersKwargs, auto_docstring, can_return_tu
 from transformers.utils.deprecation import deprecate_kwarg
 try:
-    from .configuration_gemmagain import GemmagainConfig
 except ImportError:
-    from configuration_gemmagain import GemmagainConfig
 logger = logging.get_logger(__name__)
@@ -64,7 +64,7 @@ class Gemma3TextScaledWordEmbedding(nn.Embedding):
 class Gemma3MLP(nn.Module):
-    def __init__(self, config: GemmagainConfig):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -101,7 +101,7 @@ class Gemma3RMSNorm(nn.Module):
 class Gemma3RotaryEmbedding(nn.Module):
     inv_freq: torch.Tensor
-    def __init__(self, config: GemmagainConfig, device=None):
         super().__init__()
         if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
             self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
@@ -195,7 +195,7 @@ def eager_attention_forward(
 class Gemma3Attention(nn.Module):
     """Multi-headed attention with support for virtual layer index."""
-    def __init__(self, config: GemmagainConfig, layer_idx: int):
         super().__init__()
         self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
         self.config = config
@@ -276,7 +276,7 @@ class Gemma3Attention(nn.Module):
 class Gemma3DecoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config: GemmagainConfig, layer_idx: int):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
@@ -337,8 +337,8 @@ class Gemma3DecoderLayer(GradientCheckpointingLayer):
 @auto_docstring
-class GemmagainPreTrainedModel(PreTrainedModel):
-    config_class = GemmagainConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["Gemma3DecoderLayer"]
@@ -388,8 +388,8 @@ def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, in
 @auto_docstring
-class GemmagainModel(GemmagainPreTrainedModel):
-    def __init__(self, config: GemmagainConfig):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
@@ -514,14 +514,14 @@ class GemmagainModel(GemmagainPreTrainedModel):
 @auto_docstring
-class GemmagainForCausalLM(GemmagainPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
     _tp_plan = {"lm_head": "colwise_rep"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
-    def __init__(self, config: GemmagainConfig):
         super().__init__(config)
-        self.model = GemmagainModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
@@ -583,7 +583,7 @@ class GemmagainForCausalLM(GemmagainPreTrainedModel, GenerationMixin):
 __all__ = [
-    "GemmagainForCausalLM",
-    "GemmagainModel",
-    "GemmagainPreTrainedModel",
 ]

 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+Gemma3 - Gemma3 text model with layer looping support (wrapper approach).
 This model allows running the same physical layers multiple times in sequence,
 enabling parameter-efficient deep networks. Compatible with standard Gemma3 weights.
 from transformers.utils.deprecation import deprecate_kwarg
 try:
+    from .configuration_gemmagain import Gemma3Config
 except ImportError:
+    from configuration_gemmagain import Gemma3Config
 logger = logging.get_logger(__name__)
 class Gemma3MLP(nn.Module):
+    def __init__(self, config: Gemma3Config):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
 class Gemma3RotaryEmbedding(nn.Module):
     inv_freq: torch.Tensor
+    def __init__(self, config: Gemma3Config, device=None):
         super().__init__()
         if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
             self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
 class Gemma3Attention(nn.Module):
     """Multi-headed attention with support for virtual layer index."""
+    def __init__(self, config: Gemma3Config, layer_idx: int):
         super().__init__()
         self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
         self.config = config
 class Gemma3DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Gemma3Config, layer_idx: int):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
 @auto_docstring
+class Gemma3PreTrainedModel(PreTrainedModel):
+    config_class = Gemma3Config
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["Gemma3DecoderLayer"]
 @auto_docstring
+class Gemma3Model(Gemma3PreTrainedModel):
+    def __init__(self, config: Gemma3Config):
         super().__init__(config)
         self.padding_idx = config.pad_token_id
         self.vocab_size = config.vocab_size
 @auto_docstring
+class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
     _tp_plan = {"lm_head": "colwise_rep"}
     _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+    def __init__(self, config: Gemma3Config):
         super().__init__(config)
+        self.model = Gemma3Model(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 __all__ = [
+    "Gemma3ForCausalLM",
+    "Gemma3Model",
+    "Gemma3PreTrainedModel",
 ]