Upload configuration_hybridna.py with huggingface_hub
Browse files
configuration_hybridna.py
CHANGED
|
@@ -9,7 +9,8 @@ logger = logging.get_logger(__name__)
|
|
| 9 |
|
| 10 |
class HybriDNAConfig(PretrainedConfig):
|
| 11 |
r"""
|
| 12 |
-
This is the configuration class to store the configuration of a [`HybriDNA`] model.
|
|
|
|
| 13 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 14 |
documentation from [`PretrainedConfig`] for more information.
|
| 15 |
Args:
|
|
@@ -94,8 +95,6 @@ class HybriDNAConfig(PretrainedConfig):
|
|
| 94 |
Minimum clamping value of the `dt_proj.bias` layer initialization.
|
| 95 |
time_step_limit (`tuple`, *optional*, defaults to `(0.0, float("inf"))`):
|
| 96 |
Accepted range of time step values.
|
| 97 |
-
output_router_logits (`bool`, *optional*, defaults to `False`):
|
| 98 |
-
Whether to return the router logits from mixture-of-experts layers.
|
| 99 |
"""
|
| 100 |
|
| 101 |
model_type = "hybridna"
|
|
@@ -135,10 +134,8 @@ class HybriDNAConfig(PretrainedConfig):
|
|
| 135 |
time_step_max=0.1,
|
| 136 |
time_step_floor=1e-4,
|
| 137 |
time_step_limit=(0.0, float("inf")),
|
| 138 |
-
output_router_logits=False,
|
| 139 |
**kwargs,
|
| 140 |
):
|
| 141 |
-
self.output_router_logits = output_router_logits
|
| 142 |
self.vocab_size = vocab_size
|
| 143 |
self.tie_word_embeddings = tie_word_embeddings
|
| 144 |
self.hidden_size = hidden_size
|
|
@@ -176,13 +173,11 @@ class HybriDNAConfig(PretrainedConfig):
|
|
| 176 |
self.time_step_max = time_step_max
|
| 177 |
self.time_step_floor = time_step_floor
|
| 178 |
|
| 179 |
-
|
| 180 |
super().__init__(
|
| 181 |
pad_token_id=pad_token_id,
|
| 182 |
bos_token_id=bos_token_id,
|
| 183 |
eos_token_id=eos_token_id,
|
| 184 |
tie_word_embeddings=tie_word_embeddings,
|
| 185 |
-
output_router_logits=output_router_logits,
|
| 186 |
**kwargs,
|
| 187 |
)
|
| 188 |
|
|
|
|
| 9 |
|
| 10 |
class HybriDNAConfig(PretrainedConfig):
|
| 11 |
r"""
|
| 12 |
+
This is the configuration class to store the configuration of a [`HybriDNA`] model.
|
| 13 |
+
HybriDNA is a hybrid Mamba-Attention model for DNA sequence modeling.
|
| 14 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 15 |
documentation from [`PretrainedConfig`] for more information.
|
| 16 |
Args:
|
|
|
|
| 95 |
Minimum clamping value of the `dt_proj.bias` layer initialization.
|
| 96 |
time_step_limit (`tuple`, *optional*, defaults to `(0.0, float("inf"))`):
|
| 97 |
Accepted range of time step values.
|
|
|
|
|
|
|
| 98 |
"""
|
| 99 |
|
| 100 |
model_type = "hybridna"
|
|
|
|
| 134 |
time_step_max=0.1,
|
| 135 |
time_step_floor=1e-4,
|
| 136 |
time_step_limit=(0.0, float("inf")),
|
|
|
|
| 137 |
**kwargs,
|
| 138 |
):
|
|
|
|
| 139 |
self.vocab_size = vocab_size
|
| 140 |
self.tie_word_embeddings = tie_word_embeddings
|
| 141 |
self.hidden_size = hidden_size
|
|
|
|
| 173 |
self.time_step_max = time_step_max
|
| 174 |
self.time_step_floor = time_step_floor
|
| 175 |
|
|
|
|
| 176 |
super().__init__(
|
| 177 |
pad_token_id=pad_token_id,
|
| 178 |
bos_token_id=bos_token_id,
|
| 179 |
eos_token_id=eos_token_id,
|
| 180 |
tie_word_embeddings=tie_word_embeddings,
|
|
|
|
| 181 |
**kwargs,
|
| 182 |
)
|
| 183 |
|