KitsuVp
/

NeoLLM

@@ -1,24 +1,19 @@
 # ==================== configuration_neollm.py ====================
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
 class NeoLLMConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`NeoLLMModel`]. It is used to instantiate a
     NeoLLM model according to the specified arguments, defining the model architecture.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
     """
     model_type = "neollm"
     keys_to_ignore_at_inference = []
     def __init__(
         self,
         vocab_size=151665,
@@ -45,6 +40,7 @@ class NeoLLMConfig(PretrainedConfig):
         linear_num_value_heads=16,
         layer_types=None,
         fan_ratio=0.125,
         dropout_rate=0.1,
         **kwargs,
     ):
@@ -65,8 +61,9 @@ class NeoLLMConfig(PretrainedConfig):
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.head_dim = head_dim
         rope_config_validation(self)
         self.layer_types = layer_types
         if self.layer_types is None:
             interval_pattern = kwargs.get("full_attention_interval", 4)
@@ -74,18 +71,24 @@ class NeoLLMConfig(PretrainedConfig):
                 "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
                 for i in range(self.num_hidden_layers)
             ]
-        # linear attention part
         self.linear_conv_kernel_dim = linear_conv_kernel_dim
         self.linear_key_head_dim = linear_key_head_dim
         self.linear_value_head_dim = linear_value_head_dim
         self.linear_num_key_heads = linear_num_key_heads
         self.linear_num_value_heads = linear_num_value_heads
-        self.fan_ratio = fan_ratio
         self.dropout_rate = dropout_rate
         self.auto_map = {
             "AutoConfig": "configuration_neollm.NeoLLMConfig",
             "AutoModel": "modeling_neollm.NeoLLMModel",
             "AutoModelForCausalLM": "modeling_neollm.NeoLLMForCausalLM"
         }
-__all__ = ["NeoLLMConfig"]

 # ==================== configuration_neollm.py ====================
 from transformers.configuration_utils import PretrainedConfig
 from transformers.modeling_rope_utils import rope_config_validation
 from transformers.utils import logging
 logger = logging.get_logger(__name__)
 class NeoLLMConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`NeoLLMModel`]. It is used to instantiate a
     NeoLLM model according to the specified arguments, defining the model architecture.
     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
     """
     model_type = "neollm"
     keys_to_ignore_at_inference = []
     def __init__(
         self,
         vocab_size=151665,
         linear_num_value_heads=16,
         layer_types=None,
         fan_ratio=0.125,
+        fan_ratio_ffn=0.0625,  # FFN periodicity ratio; this default is half of the default fan_ratio (0.125), not derived from it
         dropout_rate=0.1,
         **kwargs,
     ):
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.head_dim = head_dim
         rope_config_validation(self)
         self.layer_types = layer_types
         if self.layer_types is None:
             interval_pattern = kwargs.get("full_attention_interval", 4)
                 "linear_attention" if bool((i + 1) % interval_pattern) else "full_attention"
                 for i in range(self.num_hidden_layers)
             ]
+        # Linear attention parameters
         self.linear_conv_kernel_dim = linear_conv_kernel_dim
         self.linear_key_head_dim = linear_key_head_dim
         self.linear_value_head_dim = linear_value_head_dim
         self.linear_num_key_heads = linear_num_key_heads
         self.linear_num_value_heads = linear_num_value_heads
+        # FANformer parameters
+        self.fan_ratio = fan_ratio  # Used in attention mechanisms
+        self.fan_ratio_ffn = fan_ratio_ffn  # Used in the FFN for complementary periodicity modeling
         self.dropout_rate = dropout_rate
         self.auto_map = {
             "AutoConfig": "configuration_neollm.NeoLLMConfig",
             "AutoModel": "modeling_neollm.NeoLLMModel",
             "AutoModelForCausalLM": "modeling_neollm.NeoLLMForCausalLM"
         }
+__all__ = ["NeoLLMConfig"]