KitsuVp
/

NeoLLM

@@ -5,18 +5,19 @@ from transformers.utils import logging
 logger = logging.get_logger(__name__)
 class NeoLLMConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a [`NeoLLMModel`]. It is used to instantiate a
     NeoLLM model according to the specified arguments, defining the model architecture.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
     """
     model_type = "neollm"
     keys_to_ignore_at_inference = []
     def __init__(
         self,
-        vocab_size=151665,
         hidden_size=512,
         intermediate_size=1536,
         num_hidden_layers=12,
@@ -40,8 +41,9 @@ class NeoLLMConfig(PretrainedConfig):
         linear_num_value_heads=16,
         layer_types=None,
         fan_ratio=0.125,
-        fan_ratio_ffn=0.0625,  # NEW: Half of fan_ratio for FFN periodicity modeling
         dropout_rate=0.1,
         **kwargs,
     ):
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
@@ -61,7 +63,6 @@ class NeoLLMConfig(PretrainedConfig):
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.head_dim = head_dim
         rope_config_validation(self)
         self.layer_types = layer_types
@@ -80,15 +81,21 @@ class NeoLLMConfig(PretrainedConfig):
         self.linear_num_value_heads = linear_num_value_heads
         # FANformer parameters
-        self.fan_ratio = fan_ratio  # Used in attention mechanisms
-        self.fan_ratio_ffn = fan_ratio_ffn  # NEW: Used in FFN for complementary periodicity
         self.dropout_rate = dropout_rate
         self.auto_map = {
             "AutoConfig": "configuration_neollm.NeoLLMConfig",
             "AutoModel": "modeling_neollm.NeoLLMModel",
             "AutoModelForCausalLM": "modeling_neollm.NeoLLMForCausalLM"
         }
-__all__ = ["NeoLLMConfig"]

 logger = logging.get_logger(__name__)
 class NeoLLMConfig(PretrainedConfig):
     r"""
+    This is the configuration class to store the configuration of a [`NeoLLMModel`]. It is used to instantiate a
     NeoLLM model according to the specified arguments, defining the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
     """
     model_type = "neollm"
     keys_to_ignore_at_inference = []
     def __init__(
         self,
+        vocab_size=200005,
         hidden_size=512,
         intermediate_size=1536,
         num_hidden_layers=12,
         linear_num_value_heads=16,
         layer_types=None,
         fan_ratio=0.125,
+        fan_ratio_ffn=0.0625,
         dropout_rate=0.1,
+        pope_bias_init="zero",
         **kwargs,
     ):
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
         self.head_dim = head_dim
         rope_config_validation(self)
         self.layer_types = layer_types
         self.linear_num_value_heads = linear_num_value_heads
         # FANformer parameters
+        self.fan_ratio = fan_ratio
+        self.fan_ratio_ffn = fan_ratio_ffn
+        # Dropout
         self.dropout_rate = dropout_rate
+        # PoPE (Polar Positional Embedding) parameters
+        # rope_theta is reused as base wavelength for PoPE frequency components
+        self.pope_bias_init = pope_bias_init  # "zero" (better for length extrapolation) or "uniform" (better in-distribution)
         self.auto_map = {
             "AutoConfig": "configuration_neollm.NeoLLMConfig",
             "AutoModel": "modeling_neollm.NeoLLMModel",
             "AutoModelForCausalLM": "modeling_neollm.NeoLLMForCausalLM"
         }
+__all__ = ["NeoLLMConfig"]