Upload folder using huggingface_hub
- README.md +1 -1
- __init__.py +6 -0
- example_usage.py +2 -1
- modeling_seamless_translation.py +118 -0
README.md
CHANGED
@@ -72,7 +72,7 @@ import torch
 import numpy as np
 import importlib.util
 
-# Load model
+# Load model - architecture is included in the repository
 model = AutoModel.from_pretrained("videoloc/seamless-translation")
 config = AutoConfig.from_pretrained("videoloc/seamless-translation")
 
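Since the architecture now ships inside the repository, loading it through the Auto classes generally needs remote-code execution enabled. Below is a minimal sketch of the full loading call; the `trust_remote_code=True` flag is an assumption, since the README excerpt above does not show it:

```python
from transformers import AutoConfig, AutoModel

# Assumption: custom modeling code in the repo requires trust_remote_code=True.
config = AutoConfig.from_pretrained(
    "videoloc/seamless-translation", trust_remote_code=True
)
model = AutoModel.from_pretrained(
    "videoloc/seamless-translation", trust_remote_code=True
)
model.eval()  # inference mode; gradients are not needed for prediction
```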
__init__.py
ADDED
@@ -0,0 +1,6 @@
+"""
+SeamlessTranslation model for HuggingFace Transformers
+"""
+from .modeling_seamless_translation import HFSeamlessTranslation, SeamlessTranslationConfig
+
+__all__ = ["HFSeamlessTranslation", "SeamlessTranslationConfig"]
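For local (non-Hub) use, the two exports can be registered with the Auto classes. A sketch under the assumption that the package is importable as `seamless_translation` (the commit shows only the bare `__init__.py`, so the package name is hypothetical):

```python
from transformers import AutoConfig, AutoModel

# Hypothetical local registration; the package name is an assumption.
from seamless_translation import HFSeamlessTranslation, SeamlessTranslationConfig

AutoConfig.register("seamless_translation", SeamlessTranslationConfig)
AutoModel.register(SeamlessTranslationConfig, HFSeamlessTranslation)
```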
example_usage.py
CHANGED
@@ -8,6 +8,7 @@ import numpy as np
 import importlib.util
 
 def load_model_and_collator():
+    # Load model - architecture is included in the repository
     model = AutoModel.from_pretrained("videoloc/seamless-translation")
     config = AutoConfig.from_pretrained("videoloc/seamless-translation")
 
@@ -31,7 +32,7 @@ def example_inference():
     # Example data with translation awareness
     data = [{
         'raw_audio': np.random.randn(16000 * 3),  # 3 seconds at 16kHz
-        'raw_text': "Example subtitle text for
+        'raw_text': "Example subtitle text for TTE prediction",
         'is_translation': 1,  # 1 for translated content, 0 for original
     }]
 
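The collator returned by `load_model_and_collator` is not shown in this commit. Below is a hand-rolled sketch of what turning one such record into model inputs plausibly looks like, using the SeamlessM4T processor; the processor keyword names and the `src_lang` value are assumptions, and the repo's actual collator may differ:

```python
import numpy as np
import torch
from transformers import AutoProcessor

# Assumption: featurization mirrors the SeamlessM4T processor.
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")

record = {
    'raw_audio': np.random.randn(16000 * 3),  # 3 seconds at 16kHz
    'raw_text': "Example subtitle text for TTE prediction",
    'is_translation': 1,  # 1 for translated content, 0 for original
}

# Audio and text are mutually exclusive in one processor call,
# so featurize them separately.
audio_inputs = processor(audios=record['raw_audio'], sampling_rate=16000,
                         return_tensors="pt")
text_inputs = processor(text=record['raw_text'], src_lang="eng",
                        return_tensors="pt")

batch = {
    'input_features': audio_inputs.input_features,
    'input_ids': text_inputs.input_ids,
    'text_attention_mask': text_inputs.attention_mask,
    'is_translation': torch.tensor([record['is_translation']],
                                   dtype=torch.float32),
}
```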
modeling_seamless_translation.py
ADDED
@@ -0,0 +1,118 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PreTrainedModel, PretrainedConfig
+from transformers.modeling_outputs import SequenceClassifierOutput
+from transformers import SeamlessM4TModel
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class SeamlessTranslationConfig(PretrainedConfig):
+    """Configuration class for SeamlessTranslation model."""
+
+    model_type = "seamless_translation"
+
+    def __init__(
+        self,
+        seamless_model_name="facebook/hf-seamless-m4t-medium",
+        hidden_size=1024,
+        dropout_prob=0.1,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.seamless_model_name = seamless_model_name
+        self.hidden_size = hidden_size
+        self.dropout_prob = dropout_prob
+
+
+class HFSeamlessTranslation(PreTrainedModel):
+    """SeamlessM4T model with translation features for HuggingFace Hub."""
+
+    config_class = SeamlessTranslationConfig
+    supports_gradient_checkpointing = True
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        # Load the underlying SeamlessM4T model
+        self.seamless_model = SeamlessM4TModel.from_pretrained(config.seamless_model_name)
+        self.seamless_model_speech_encoder = self.seamless_model.speech_encoder
+        self.seamless_model_text_encoder = self.seamless_model.text_encoder
+
+        # Freeze the pre-trained encoders
+        for param in self.seamless_model_speech_encoder.parameters():
+            param.requires_grad = False
+        for param in self.seamless_model_text_encoder.parameters():
+            param.requires_grad = False
+
+        # Projection layers
+        self.audio_proj = nn.Linear(
+            self.seamless_model_speech_encoder.config.hidden_size,
+            config.hidden_size
+        )
+        self.text_proj = nn.Linear(
+            self.seamless_model_text_encoder.config.hidden_size,
+            config.hidden_size
+        )
+
+        # Translation feature embedding
+        self.translation_proj = nn.Linear(1, 64)
+
+        # Regression head (1024 audio + 1024 text + 64 translation = 2112)
+        self.fc = nn.Sequential(
+            nn.Linear(2112, 1024),
+            nn.ReLU(),
+            nn.Dropout(config.dropout_prob),
+            nn.Linear(1024, 512),
+            nn.ReLU(),
+            nn.Dropout(config.dropout_prob),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Dropout(config.dropout_prob),
+            nn.Linear(256, 1)
+        )
+
+    def forward(
+        self,
+        input_features,
+        input_ids,
+        text_attention_mask,
+        is_translation,
+        audio_attention_mask=None,
+        labels=None,
+        **kwargs  # Accept additional features but ignore them
+    ):
+        # Encode audio: mean-pool the frozen speech encoder's hidden states
+        audio_emb = self.seamless_model_speech_encoder(
+            input_features=input_features,
+            attention_mask=audio_attention_mask
+        ).last_hidden_state.mean(dim=1)
+        audio_emb = self.audio_proj(audio_emb)
+
+        # Encode text: mean-pool the frozen text encoder's hidden states
+        text_emb = self.seamless_model_text_encoder(
+            input_ids=input_ids,
+            attention_mask=text_attention_mask
+        ).last_hidden_state.mean(dim=1)
+        text_emb = self.text_proj(text_emb)
+
+        # Embed the binary translation flag
+        translation_emb = self.translation_proj(is_translation.unsqueeze(-1))
+
+        # Combine features
+        combined = torch.cat([audio_emb, text_emb, translation_emb], dim=1)  # (batch_size, 2112)
+
+        logits = self.fc(combined).squeeze(-1)
+
+        # Compute MSE loss if labels are provided
+        loss = F.mse_loss(logits, labels) if labels is not None else None
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=None,
+            attentions=None
+        )
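As a standalone sanity check of the fused feature width (the head expects 1024 audio + 1024 text + 64 translation = 2112 dimensions), here is a minimal sketch with dummy embeddings standing in for the encoder outputs; the simplified head is illustrative, not the model's actual `fc` stack:

```python
import torch
import torch.nn as nn

batch_size = 4
audio_emb = torch.randn(batch_size, 1024)      # stands in for audio_proj output
text_emb = torch.randn(batch_size, 1024)       # stands in for text_proj output
translation_emb = torch.randn(batch_size, 64)  # stands in for translation_proj output

combined = torch.cat([audio_emb, text_emb, translation_emb], dim=1)
assert combined.shape == (batch_size, 2112)

# Like the model's fc stack, the head maps the 2112-dim fusion to one scalar
# per example (simplified here to two layers).
head = nn.Sequential(nn.Linear(2112, 1024), nn.ReLU(), nn.Linear(1024, 1))
assert head(combined).squeeze(-1).shape == (batch_size,)
```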