refactor changes
modeling_cerule_gemma.py (+7 -7)
@@ -872,7 +872,7 @@ if is_torch_fx_available():

logger = logging.get_logger(__name__)

-_CONFIG_FOR_DOC = "GemmaConfig"
+_CONFIG_FOR_DOC = "CeruleGemmaConfig"


def _get_unpad_data(attention_mask):

@@ -1003,7 +1003,7 @@ class GemmaAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    # Ignore copy
-    def __init__(self, config: GemmaConfig, layer_idx: Optional[int] = None):
+    def __init__(self, config: CeruleGemmaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

@@ -1396,7 +1396,7 @@ GEMMA_ATTENTION_CLASSES = {

# Copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with LLAMA->GEMMA,Llama->Gemma
class GemmaDecoderLayer(nn.Module):
-    def __init__(self, config: GemmaConfig, layer_idx: int):
+    def __init__(self, config: CeruleGemmaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size

@@ -1480,7 +1480,7 @@ GEMMA_START_DOCSTRING = r"""
    and behavior.

    Parameters:
-        config ([`GemmaConfig`]):
+        config ([`CeruleGemmaConfig`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.

@@ -1492,7 +1492,7 @@ GEMMA_START_DOCSTRING = r"""
    GEMMA_START_DOCSTRING,
)
class GemmaPreTrainedModel(PreTrainedModel):
-    config_class = GemmaConfig
+    config_class = CeruleGemmaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _keep_in_fp32_modules = ["inv_freq", "rotary_emb", "cos_cached", "sin_cached"]

@@ -1618,7 +1618,7 @@ class GemmaModel(GemmaPreTrainedModel):
        config: GemmaConfig
    """

-    def __init__(self, config: GemmaConfig):
+    def __init__(self, config: CeruleGemmaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

@@ -2155,7 +2155,7 @@ from .configuration_gemma import CeruleGemmaConfig
class CeruleGemmaModel(CeruleMetaModel, GemmaModel):
    config_class = CeruleGemmaConfig

-    def __init__(self, config: GemmaConfig):
+    def __init__(self, config: CeruleGemmaConfig):
        super(CeruleGemmaModel, self).__init__(config)
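For context, config_class is what PreTrainedModel.from_pretrained uses to load and validate the configuration, so pointing it at CeruleGemmaConfig lets the Cerule config type flow through every constructor touched above. A minimal sketch of exercising that wiring follows; the flat import paths, the hyper-parameter values, and the assumption that CeruleGemmaConfig subclasses the upstream GemmaConfig and carries no vision-tower fields are illustrative, not taken from this commit.

# Sketch only: in the repo these modules live in a package and use relative imports.
from configuration_gemma import CeruleGemmaConfig
from modeling_cerule_gemma import CeruleGemmaModel, GemmaPreTrainedModel

# After this commit the pretrained base class resolves configs through
# CeruleGemmaConfig instead of the upstream Gemma config (hunk at line 1495).
assert GemmaPreTrainedModel.config_class is CeruleGemmaConfig

# A toy configuration; values are placeholders, assuming CeruleGemmaConfig
# accepts the usual Gemma hyper-parameters.
config = CeruleGemmaConfig(
    vocab_size=1000,
    hidden_size=256,
    intermediate_size=512,
    num_hidden_layers=2,
    num_attention_heads=4,
    num_key_value_heads=1,
)

# Assumes the text-only path: with no vision-tower fields on the config,
# CeruleGemmaModel builds just the Gemma decoder stack.
model = CeruleGemmaModel(config)
assert isinstance(model.config, CeruleGemmaConfig)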