add max_seq_len checks

Browse files

Files changed (3)

config.json +1 -1
configuration_aria.py +2 -2
modeling_aria.py +43 -15

config.json CHANGED

@@ -7,7 +7,7 @@
   "hidden_size": 1536,
   "embedding_size": 512,
   "intermediate_size": 6144,
-  "max_position_embeddings": 8192,
   "model_type": "aria",
   "num_attention_heads": 24,
   "num_hidden_layers": 16,

   "hidden_size": 1536,
   "embedding_size": 512,
   "intermediate_size": 6144,
+  "max_seq_len": 2048,
   "model_type": "aria",
   "num_attention_heads": 24,
   "num_hidden_layers": 16,

configuration_aria.py CHANGED

@@ -13,7 +13,7 @@ class AriaConfig(PretrainedConfig):
         num_hidden_layers: int = 16,
         num_attention_heads: int = 64,
         intermediate_size: int = 6144,
-        max_position_embeddings: int = 8192,
         use_cache: bool = True,
         bos_token_id: int = 0,
         eos_token_id: int = 1,
@@ -32,7 +32,7 @@ class AriaConfig(PretrainedConfig):
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.intermediate_size = intermediate_size
-        self.max_position_embeddings = max_position_embeddings
         self.use_cache = use_cache
         self.tie_word_embeddings = tie_word_embeddings
         self.output_attentions = output_attentions

         num_hidden_layers: int = 16,
         num_attention_heads: int = 64,
         intermediate_size: int = 6144,
+        max_seq_len: int = 8192,
         use_cache: bool = True,
         bos_token_id: int = 0,
         eos_token_id: int = 1,
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
         self.intermediate_size = intermediate_size
+        self.max_seq_len = max_seq_len
         self.use_cache = use_cache
         self.tie_word_embeddings = tie_word_embeddings
         self.output_attentions = output_attentions

modeling_aria.py CHANGED

@@ -66,7 +66,7 @@ class TransformerBlock(nn.Module):
         self.d_head = (
             model_config.hidden_size // model_config.num_attention_heads
         )
-        self.max_seq_len = model_config.max_position_embeddings
         self.layer_idx = layer_idx
         # Attention
@@ -257,6 +257,23 @@ class AriaModel(AriaPreTrainedModel):
             torch.tensor: Model outputs with shape (batch_size, seq_len,
                 d_model).
         """
         output_attentions = (
             output_attentions
             if output_attentions is not None
@@ -333,7 +350,7 @@ class AriaModel(AriaPreTrainedModel):
         if self.freqs_cis is None:
             self.freqs_cis = precompute_freqs_cis(
-                seq_len=self.model_config.max_position_embeddings,
                 n_elem=self.model_config.hidden_size
                 // self.model_config.num_attention_heads,
                 base=500000,
@@ -548,7 +565,7 @@ class AriaForCausalLM(AriaPreTrainedModel, GenerationMixin):
     def __init__(self, model_config: AriaConfig):
         super().__init__(model_config)
         self.model_config = model_config
-        self.max_seq_len = model_config.max_position_embeddings
         self.model = AriaModel(model_config)
         self.lm_head = nn.Linear(
             model_config.hidden_size, model_config.vocab_size, bias=False
@@ -629,13 +646,30 @@ class AriaForSequenceEmbedding(AriaPreTrainedModel):
         assert model_config.embedding_size
         self.model_config = model_config
-        self.max_seq_len = model_config.max_position_embeddings
         self.model = AriaModel(model_config)
         self.emb_head = nn.Linear(
             model_config.hidden_size, model_config.embedding_size, bias=False
         )
         self.post_init()
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -671,14 +705,6 @@ class AriaForSequenceEmbedding(AriaPreTrainedModel):
         ):
             raise ValueError("Provided args unsupported for embedding head")
-        _batch_size = input_ids.shape[0]
-        eos_mask = input_ids == self.config.eos_token_id
-        if not eos_mask.any(dim=1).all():
-            raise ValueError(
-                "Each sequence must contain at least one EOS token"
-            )
-        eos_pos = eos_mask.int().argmax(dim=1)
         outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
@@ -689,9 +715,11 @@ class AriaForSequenceEmbedding(AriaPreTrainedModel):
         )
         hidden = outputs[0]
         embedding = self.emb_head(hidden)
-        pooled_embedding = embedding[
-            torch.arange(_batch_size, device=input_ids.device), eos_pos
-        ]
         if not return_dict:
             output = (pooled_embedding,) + outputs[1:]
             return output

         self.d_head = (
             model_config.hidden_size // model_config.num_attention_heads
         )
+        self.max_seq_len = model_config.max_seq_len
         self.layer_idx = layer_idx
         # Attention
             torch.tensor: Model outputs with shape (batch_size, seq_len,
                 d_model).
         """
+        if (
+            input_ids is not None
+            and input_ids.shape[1] > self.model_config.max_seq_len
+        ):
+            raise ValueError(
+                f"Sequence length ({input_ids.shape[1]}) exceeds max_seq_len "
+                f"({self.model_config.max_seq_len})."
+            )
+        if (
+            inputs_embeds is not None
+            and inputs_embeds.shape[1] > self.model_config.max_seq_len
+        ):
+            raise ValueError(
+                f"Sequence length ({inputs_embeds.shape[1]}) exceeds max_seq_len "
+                f"({self.model_config.max_seq_len})."
+            )
         output_attentions = (
             output_attentions
             if output_attentions is not None
         if self.freqs_cis is None:
             self.freqs_cis = precompute_freqs_cis(
+                seq_len=self.model_config.max_seq_len,
                 n_elem=self.model_config.hidden_size
                 // self.model_config.num_attention_heads,
                 base=500000,
     def __init__(self, model_config: AriaConfig):
         super().__init__(model_config)
         self.model_config = model_config
+        self.max_seq_len = model_config.max_seq_len
         self.model = AriaModel(model_config)
         self.lm_head = nn.Linear(
             model_config.hidden_size, model_config.vocab_size, bias=False
         assert model_config.embedding_size
         self.model_config = model_config
+        self.max_seq_len = model_config.max_seq_len
         self.model = AriaModel(model_config)
         self.emb_head = nn.Linear(
             model_config.hidden_size, model_config.embedding_size, bias=False
         )
         self.post_init()
+    def get_pooled_embedding(
+        self, input_ids: torch.Tensor, embedding: torch.Tensor
+    ):
+        _batch_size = input_ids.shape[0]
+        eos_mask = input_ids == self.config.eos_token_id
+        if not eos_mask.any(dim=1).all():
+            raise ValueError(
+                "Each sequence must contain at least one EOS token"
+            )
+        eos_pos = eos_mask.int().argmax(dim=1)
+        pooled_embedding = embedding[
+            torch.arange(_batch_size, device=input_ids.device), eos_pos
+        ]
+        return pooled_embedding
     def forward(
         self,
         input_ids: torch.Tensor,
         ):
             raise ValueError("Provided args unsupported for embedding head")
         outputs = self.model(
             input_ids,
             attention_mask=attention_mask,
         )
         hidden = outputs[0]
         embedding = self.emb_head(hidden)
+        pooled_embedding = self.get_pooled_embedding(
+            input_ids=input_ids,
+            embedding=embedding,
+        )
         if not return_dict:
             output = (pooled_embedding,) + outputs[1:]
             return output