Update configuration_neollm.py
configuration_neollm.py: CHANGED (+8, -35)
@@ -5,12 +5,11 @@ from transformers.utils import logging
 
 logger = logging.get_logger(__name__)
 
-
 class NeoLLMConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a `NeoLLMModel`]. It is used to instantiate a
+    This is the configuration class to store the configuration of a [`NeoLLMModel`]. It is used to instantiate a
     NeoLLM model according to the specified arguments, defining the model architecture.
-    Configuration objects inherit from `PretrainedConfig`] and can be used to control the model outputs.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
     """
     model_type = "neollm"
     keys_to_ignore_at_inference = []
@@ -34,16 +33,10 @@ class NeoLLMConfig(PretrainedConfig):
         attention_bias=False,
         attention_dropout=0.1,
         head_dim=64,
-
-        linear_key_head_dim=32,
-        linear_value_head_dim=32,
-        linear_num_key_heads=8,
-        linear_num_value_heads=16,
-        layer_types=None,
+
         fan_ratio=0.125,
-        fan_ratio_ffn=0.0625,
+        fan_ratio_ffn=0.0625,  # NEW: Half of fan_ratio for FFN periodicity modeling
         dropout_rate=0.1,
-        pope_bias_init="zero",
         **kwargs,
     ):
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
@@ -63,39 +56,19 @@
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.head_dim = head_dim
+
         rope_config_validation(self)
 
-        self.layer_types = layer_types
-        if self.layer_types is None:
-            interval_pattern = kwargs.get("full_attention_interval", 4)
-            self.layer_types = [
-                "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
-                for i in range(self.num_hidden_layers)
-            ]
-
-        # Linear attention parameters
-        self.linear_conv_kernel_dim = linear_conv_kernel_dim
-        self.linear_key_head_dim = linear_key_head_dim
-        self.linear_value_head_dim = linear_value_head_dim
-        self.linear_num_key_heads = linear_num_key_heads
-        self.linear_num_value_heads = linear_num_value_heads
-
         # FANformer parameters
-        self.fan_ratio = fan_ratio
-        self.fan_ratio_ffn = fan_ratio_ffn
+        self.fan_ratio = fan_ratio  # Used in attention mechanisms
+        self.fan_ratio_ffn = fan_ratio_ffn  # NEW: Used in FFN for complementary periodicity
 
-        # Dropout
         self.dropout_rate = dropout_rate
 
-        # PoPE (Polar Positional Embedding) parameters
-        # rope_theta is reused as base wavelength for PoPE frequency components
-        self.pope_bias_init = pope_bias_init  # "zero" (better for length extrapolation) or "uniform" (better in-distribution)
-
         self.auto_map = {
             "AutoConfig": "configuration_neollm.NeoLLMConfig",
             "AutoModel": "modeling_neollm.NeoLLMModel",
             "AutoModelForCausalLM": "modeling_neollm.NeoLLMForCausalLM"
         }
 
-
-__all__ = ["NeoLLMConfig"]
+__all__ = ["NeoLLMConfig"]
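Note on the removed fallback: before this commit, when `layer_types` was not passed, the constructor filled it in so that every `full_attention_interval`-th layer (default 4, read from `kwargs`) used full attention and all other layers used linear attention. A standalone sketch of that removed logic, run outside the class (the layer count of 8 is an illustrative assumption, not a value from this diff):

# Re-run of the removed layer_types fallback, outside the config class.
num_hidden_layers = 8   # assumed for illustration
interval_pattern = 4    # the removed default for "full_attention_interval"
layer_types = [
    "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
    for i in range(num_hidden_layers)
]
print(layer_types)
# ['linear_attention', 'linear_attention', 'linear_attention', 'full_attention',
#  'linear_attention', 'linear_attention', 'linear_attention', 'full_attention']

After this commit the config no longer derives `layer_types` at all.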
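For completeness, a minimal usage sketch of the slimmed-down config (assuming the constructor arguments not shown in this diff keep working defaults; the file is imported directly rather than through a Hub repo):

from configuration_neollm import NeoLLMConfig

# Instantiate with the defaults from the updated signature.
config = NeoLLMConfig()
print(config.model_type)     # "neollm"
print(config.fan_ratio)      # 0.125, used in the attention blocks
print(config.fan_ratio_ffn)  # 0.0625, half of fan_ratio, used in the FFN

Because `auto_map` still registers the custom classes, the same config also loads via `AutoConfig.from_pretrained(repo_id, trust_remote_code=True)` once the files are pushed to a repo.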