Update modeling.py
modeling.py  +7 −47  CHANGED
@@ -1,3 +1,6 @@
+"""
+Translation Transformer Model for HuggingFace Hub
+"""
 import torch
 import torch.nn as nn
 from transformers import PreTrainedModel, PretrainedConfig
@@ -142,29 +145,15 @@ class TranslationTransformerModel(PreTrainedModel):
         return_dict: Optional[bool] = None,
         **kwargs
     ) -> Union[Tuple, Seq2SeqLMOutput]:
-        """
-        Forward pass
-
-        Args:
-            input_ids: Source sequence tokens [batch_size, src_seq_len]
-            attention_mask: Source attention mask [batch_size, src_seq_len]
-            decoder_input_ids: Target sequence tokens [batch_size, tgt_seq_len]
-            decoder_attention_mask: Target attention mask [batch_size, tgt_seq_len]
-            labels: Labels for loss calculation [batch_size, tgt_seq_len]
-            output_attentions: Whether to output attentions
-            output_hidden_states: Whether to output hidden states
-            return_dict: Whether to return ModelOutput
-        """
+        """Forward pass"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         device = input_ids.device
 
         # If labels provided but no decoder_input_ids, shift labels to create decoder_input_ids
         if labels is not None and decoder_input_ids is None:
-            # Replace -100 with pad_token_id for embedding
            labels_shifted = labels.clone()
             labels_shifted[labels_shifted == -100] = self.config.pad_token_id
 
-            # Shift right: [BOS, token1, token2, ...] from [token1, token2, ..., EOS]
             decoder_input_ids = torch.cat([
                 torch.full((labels.shape[0], 1), self.config.bos_token_id, dtype=torch.long, device=device),
                 labels_shifted[:, :-1]
@@ -200,7 +189,6 @@ class TranslationTransformerModel(PreTrainedModel):
         # Calculate loss if labels provided
         loss = None
         if labels is not None:
-            # Use -100 as ignore_index (standard for HuggingFace)
             loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
             loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
 
@@ -222,7 +210,7 @@ class TranslationTransformerModel(PreTrainedModel):
         encoder_outputs=None,
         **kwargs
     ):
-        """Prepare inputs for generation
+        """Prepare inputs for generation"""
         return {
             "input_ids": kwargs.get("input_ids"),
             "decoder_input_ids": decoder_input_ids,
@@ -231,7 +219,7 @@ class TranslationTransformerModel(PreTrainedModel):
 
     @staticmethod
     def _reorder_cache(past_key_values, beam_idx):
-        """Reorder cache for beam search
+        """Reorder cache for beam search"""
         return past_key_values
 
     def generate(
@@ -246,22 +234,7 @@ class TranslationTransformerModel(PreTrainedModel):
         top_p: float = 1.0,
         **kwargs
     ) -> torch.LongTensor:
-        """
-        Generate translations
-
-        Args:
-            input_ids: Source sequence [batch_size, src_seq_len]
-            attention_mask: Source attention mask
-            max_length: Maximum generation length
-            num_beams: Number of beams for beam search
-            temperature: Sampling temperature
-            do_sample: Whether to use sampling
-            top_k: Top-k sampling parameter
-            top_p: Nucleus sampling parameter
-
-        Returns:
-            Generated sequences [batch_size, tgt_seq_len]
-        """
+        """Generate translations"""
         device = input_ids.device
         batch_size = input_ids.size(0)
 
@@ -277,7 +250,6 @@ class TranslationTransformerModel(PreTrainedModel):
 
         # Generate tokens one by one
         for _ in range(max_length - 1):
-            # Forward pass
             outputs = self.forward(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
@@ -285,16 +257,13 @@ class TranslationTransformerModel(PreTrainedModel):
                 return_dict=True
             )
 
-            # Get next token logits
             next_token_logits = outputs.logits[:, -1, :] / temperature
 
             if do_sample:
-                # Apply top-k filtering
                 if top_k > 0:
                     indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                     next_token_logits[indices_to_remove] = float('-inf')
 
-                # Apply top-p (nucleus) filtering
                 if top_p < 1.0:
                     sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                     cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
@@ -304,23 +273,15 @@ class TranslationTransformerModel(PreTrainedModel):
                     indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
                     next_token_logits[indices_to_remove] = float('-inf')
 
-                # Sample
                 probs = torch.softmax(next_token_logits, dim=-1)
                 next_token = torch.multinomial(probs, num_samples=1)
             else:
-                # Greedy selection
                 next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
 
-            # Mark finished sequences (those that generated EOS)
             finished = finished | (next_token.squeeze(-1) == self.config.eos_token_id)
-
-            # Replace tokens in finished sequences with PAD
             next_token[finished] = self.config.pad_token_id
-
-            # Append to decoder input
             decoder_input_ids = torch.cat([decoder_input_ids, next_token], dim=1)
 
-            # Stop if all sequences are finished
             if finished.all():
                 break
 
@@ -329,7 +290,6 @@ class TranslationTransformerModel(PreTrainedModel):
 
 # Register the model in the AutoModel registry
 from transformers import AutoConfig, AutoModel, AutoModelForSeq2SeqLM
-from .configuration_translation_transformer import TranslationTransformerConfig
 
 AutoConfig.register("translation_transformer", TranslationTransformerConfig)
 AutoModel.register(TranslationTransformerConfig, TranslationTransformerModel)
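The shift-right construction of decoder_input_ids from labels, which the forward pass above keeps, can be illustrated in isolation. A minimal sketch, assuming pad_token_id=0, bos_token_id=1, and EOS=2 purely for illustration (these ids are not taken from the actual config):

import torch

# Illustrative token ids (assumptions, not from the model config).
pad_token_id, bos_token_id = 0, 1

labels = torch.tensor([[5, 6, 7, 2],       # 2 = assumed EOS
                       [8, 9, 2, -100]])   # -100 marks ignored/padded positions

# Replace -100 with the pad id so the tokens can be embedded, then shift right:
# [BOS, token1, token2, ...] is built from [token1, token2, ..., EOS].
labels_shifted = labels.clone()
labels_shifted[labels_shifted == -100] = pad_token_id
decoder_input_ids = torch.cat([
    torch.full((labels.shape[0], 1), bos_token_id, dtype=torch.long),
    labels_shifted[:, :-1],
], dim=1)

print(decoder_input_ids)
# tensor([[1, 5, 6, 7],
#         [1, 8, 9, 2]])

Because the loss still uses ignore_index=-100 on the original labels, the padded positions created here do not contribute to the cross-entropy.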
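For context, the AutoConfig/AutoModel registration at the bottom of the file is what lets the custom classes be resolved by name. A hedged usage sketch follows; the repo id and token ids are placeholders, and it assumes the Hub repository is set up for trust_remote_code loading of this modeling.py:

import torch
from transformers import AutoConfig, AutoModel

# Hypothetical repo id; substitute the actual Hub repository containing this file.
repo_id = "your-username/translation-transformer"

# With trust_remote_code=True, transformers can map "translation_transformer"
# to TranslationTransformerConfig / TranslationTransformerModel defined above.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)

# Greedy decoding via the custom generate() shown in the diff (token ids assumed).
input_ids = torch.tensor([[1, 42, 43, 44, 2]])
attention_mask = torch.ones_like(input_ids)
output_ids = model.generate(input_ids=input_ids,
                            attention_mask=attention_mask,
                            max_length=32,
                            do_sample=False)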