Youzhi Yu committed on
Commit · ea46d13
1 Parent(s): 5324bdd
Fix generate method to handle CausalLMOutput, plus other updates
model.py CHANGED
@@ -9,6 +9,7 @@ from transformers import (
     AutoModel,
     AutoModelForCausalLM
 )
+from transformers.modeling_outputs import CausalLMOutput
 
 from typing import Optional
 
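For reference, `CausalLMOutput` is the standard `transformers` output container for causal language models: a `ModelOutput` dataclass with optional `loss`, `logits`, `hidden_states`, and `attentions` fields that supports both attribute and key access. A small illustrative sketch (not part of the commit; tensor shapes are arbitrary):

import torch
from transformers.modeling_outputs import CausalLMOutput

# Build a dummy output the same way the new forward() does.
out = CausalLMOutput(loss=torch.tensor(2.31), logits=torch.randn(1, 8, 100))

print(out.loss)              # attribute access
print(out["logits"].shape)   # key access works too
print(len(out.to_tuple()))   # 2 -- fields left as None are dropped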
@@ -102,6 +103,9 @@ class MLP(nn.Module):
 class ArgonneModel(PreTrainedModel):
     config_class = ArgonneConfig
 
+    # for device_map = "auto"
+    _no_split_modules = ["Block"]
+
     def __init__(self, config, device_map=None):
         super().__init__(config)
         # Create embeddings on CPU initially
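`_no_split_modules = ["Block"]` is what makes `device_map="auto"` workable: when Accelerate shards the weights across devices it keeps each `Block` intact on a single device instead of splitting it. A hedged loading sketch; the checkpoint path is a placeholder, and `trust_remote_code=True` is assumed to be required because the architecture lives in this repo rather than in `transformers`:

from transformers import AutoModel

# "path/to/argonne-checkpoint" is a placeholder for a local dir or Hub repo id.
model = AutoModel.from_pretrained(
    "path/to/argonne-checkpoint",
    device_map="auto",        # Accelerate places whole Blocks across available devices
    trust_remote_code=True,   # needed when the model class ships with the checkpoint
)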
@@ -214,18 +218,40 @@ class ArgonneModel(PreTrainedModel):
         # For now, we'll just return self since our model structure should be compatible
         return self
 
-    def forward(
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        labels=None,
+        **kwargs
+    ):
         """
-
-
-
+        HF-friendly forward method.
+
+        Args:
+            input_ids (torch.LongTensor): Tokens to be fed to the model. [batch_size, seq_len].
+            attention_mask (torch.LongTensor, optional): Mask of shape [batch_size, seq_len],
+                with 1 for actual tokens and 0 for padding, if you want to incorporate it.
+                Currently ignored in this minimal example.
+            labels (torch.LongTensor, optional): Targets for language modeling, same shape as `input_ids`.
+            **kwargs: Catch-all for any additional arguments (e.g. past_key_values) so we don't crash.
         """
-        #
+        # 1) We'll rename the parameters from the old code
+        if input_ids is None:
+            raise ValueError("`input_ids` must be provided.")
+
+        # We used to call it 'idx'
+        idx = input_ids
+        # We used to call it 'targets'
+        targets = labels
+
+        # [Optional] If we want to handle single-dim input_ids
         if idx.dim() == 1:
-            # Add batch dimension if missing
             idx = idx.unsqueeze(0)
-
-        #
+
+        # 2) Now the rest of your old forward logic remains, just replacing references
+        #    to "idx" and "targets" with these new variables.
+
         if self.pipeline_stages is None:
             # Single-device forward pass
             device = self.token_embedding.weight.device
@@ -250,7 +276,11 @@ class ArgonneModel(PreTrainedModel):
                 targets = targets.view(-1)
                 loss = F.cross_entropy(logits, targets)
 
-            return
+            return CausalLMOutput(
+                loss=loss,
+                logits=logits,
+            )
+
         else:
             # Pipeline parallel forward
             first_device = next(self.token_embedding.parameters()).device
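Because `forward` now takes `input_ids`/`labels` and returns a `CausalLMOutput`, callers read `outputs.loss` and `outputs.logits` instead of tuple-unpacking the old return value. A minimal sketch, assuming the config exposes `vocab_size` and that `model` is an `ArgonneModel` instance:

import torch

# Toy batch of token ids, shape [batch_size, seq_len]; vocab_size is assumed to be on the config.
input_ids = torch.randint(0, model.config.vocab_size, (2, 64))

outputs = model(input_ids=input_ids, labels=input_ids)  # CausalLMOutput
loss = outputs.loss        # cross-entropy computed inside forward()
logits = outputs.logits
loss.backward()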
@@ -270,7 +300,7 @@ class ArgonneModel(PreTrainedModel):
                 hidden_states = hidden_states.to(device_stage)
                 hidden_states = stage(hidden_states)
 
-            #
+            # Move to last device before final ops
             hidden_states = hidden_states.to(last_device)
             hidden_states = self.ln_f(hidden_states)
             logits = self.head(hidden_states)
@@ -282,7 +312,11 @@ class ArgonneModel(PreTrainedModel):
                 targets = targets.view(-1)
                 loss = F.cross_entropy(logits, targets)
 
-            return
+            return CausalLMOutput(
+                loss=loss,
+                logits=logits,
+            )
+
 
 
     @torch.no_grad()
@@ -342,8 +376,9 @@ class ArgonneModel(PreTrainedModel):
                 generated = generated[:, -self.config.block_size:]
 
             # Forward pass
-
-            logits = logits
+            outputs = self.forward(generated)
+            logits = outputs.logits  # outputs is a CausalLMOutput
+            logits = logits[:, -1, :]  # get the last token's logits
 
             # Temperature
             if temperature != 1.0:
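With `generate` now reading `outputs.logits` from the returned `CausalLMOutput`, end-to-end generation works again. A hedged usage sketch, assuming a compatible `tokenizer` and that the method keeps the argument names of the previously commented-out version (`max_new_tokens`, `temperature`, `top_k`):

import torch

# `tokenizer` is assumed to be the tokenizer this checkpoint was trained with.
prompt_ids = tokenizer("Argonne is", return_tensors="pt").input_ids

with torch.no_grad():
    generated = model.generate(prompt_ids, max_new_tokens=50, temperature=0.7, top_k=50)

print(tokenizer.decode(generated[0]))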
@@ -382,91 +417,6 @@ class ArgonneModel(PreTrainedModel):
 
         return generated
 
-
-    # @torch.no_grad()
-    # def generate(self, input_ids, max_new_tokens, temperature=0.7, top_k=None, top_p=None, sample=True):
-    #     """
-    #     Generate text using the model.
-
-    #     Args:
-    #         input_ids: Input token IDs to continue from
-    #         max_new_tokens: Number of tokens to generate
-    #         temperature: Temperature for sampling (higher = more random)
-    #         top_k: If set, only sample from the top k most likely tokens
-    #         top_p: If set, sample from the smallest set of tokens whose cumulative probability exceeds p
-    #         sample: If True, sample from the distribution; if False, use greedy decoding
-
-    #     Returns:
-    #         Tensor containing the input_ids extended with max_new_tokens generated tokens
-    #     """
-    #     self.eval()
-
-    #     # Determine which device to use - explicitly use first device for consistency
-    #     if self.pipeline_stages is not None and len(self.devices) > 0:
-    #         device = self.devices[0]  # Always use first device for generation
-    #     else:
-    #         device = next(self.parameters()).device
-
-    #     # Ensure input is on the correct device
-    #     generated = input_ids.to(device)
-
-    #     for _ in range(max_new_tokens):
-    #         # Truncate if necessary to fit within the model's context window
-    #         if generated.shape[1] > self.config.block_size:
-    #             generated = generated[:, -self.config.block_size:]
-
-    #         # Forward pass
-    #         logits, _ = self.forward(generated)
-
-    #         # Make sure logits are on the same device
-    #         logits = logits.to(device)
-
-    #         # Get logits for the last token only
-    #         logits = logits[:, -1, :]
-
-    #         # Apply temperature
-    #         if temperature != 1.0:
-    #             logits = logits / temperature
-
-    #         # Greedy decoding (argmax) if sample=False
-    #         if not sample:
-    #             next_token = torch.argmax(logits, dim=-1, keepdim=True)
-    #         else:
-    #             # Sampling logic
-    #             # Apply top-k filtering
-    #             if top_k is not None:
-    #                 indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
-    #                 logits = logits.masked_fill(indices_to_remove, float('-inf'))
-
-    #             # Apply top-p (nucleus) filtering
-    #             if top_p is not None:
-    #                 sorted_logits, sorted_indices = torch.sort(logits, descending=True)
-    #                 cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
-
-    #                 # Remove tokens with cumulative probability above the threshold
-    #                 sorted_indices_to_remove = cumulative_probs > top_p
-
-    #                 # Shift the indices to the right to keep the first token above the threshold
-    #                 sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
-    #                 sorted_indices_to_remove[..., 0] = 0
-
-    #                 indices_to_remove = sorted_indices_to_remove.scatter(
-    #                     dim=1, index=sorted_indices, src=sorted_indices_to_remove
-    #                 )
-    #                 logits = logits.masked_fill(indices_to_remove, float('-inf'))
-
-    #             # Convert to probability distribution and sample
-    #             probs = F.softmax(logits, dim=-1)
-    #             next_token = torch.multinomial(probs, num_samples=1)
-
-    #         # Ensure next_token is on the same device before concatenation
-    #         next_token = next_token.to(device)
-
-    #         # Append the generated token to the sequence
-    #         generated = torch.cat((generated, next_token), dim=1)
-
-    #     return generated
-
 # Register the model with Hugging Face's Auto classes
 AutoConfig.register("argonne", ArgonneConfig)
 AutoModel.register(ArgonneConfig, ArgonneModel)