diff --git "a/docs/transformers/build/lib/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py" "b/docs/transformers/build/lib/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py"
new file mode 100644
--- /dev/null
+++ "b/docs/transformers/build/lib/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py"
@@ -0,0 +1,4722 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch SeamlessM4Tv2 model."""
+
+import copy
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import Tensor, nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...generation import GenerationMixin
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ...modeling_outputs import (
+ BaseModelOutput,
+ BaseModelOutputWithPastAndCrossAttentions,
+ Seq2SeqLMOutput,
+ Seq2SeqModelOutput,
+ Wav2Vec2BaseModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+)
+from .configuration_seamless_m4t_v2 import SeamlessM4Tv2Config
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = ""
+_CONFIG_FOR_DOC = "SeamlessM4Tv2Config"
+
+
+@dataclass
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TGenerationOutput with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2GenerationOutput(ModelOutput):
+ """
+ Class defining the generated outputs from [`SeamlessM4Tv2Model`], [`SeamlessM4Tv2ForTextToText`],
+ [`SeamlessM4Tv2ForTextToSpeech`], [`SeamlessM4Tv2ForSpeechToSpeech`] and [`SeamlessM4Tv2ForSpeechToText`].
+
+ Args:
+ waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ The final audio waveform predicted by the model.
+ waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
+ The length in samples of each element in the `waveform` batch.
+ sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
+ The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
+ early due to the `eos_token_id`.
+ unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
+ The generated translated unit sequences. This is the output of the text-to-units model. The second
+ dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished
+ early due to the `t2u_eos_token_id`.
+ """
+
+ waveform: Optional[torch.FloatTensor] = None
+ waveform_lengths: Optional[torch.IntTensor] = None
+ sequences: Optional[Tuple[torch.FloatTensor]] = None
+ unit_sequences: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class SeamlessM4Tv2TextToUnitDecoderOutput(ModelOutput):
+ """
+ Class defining the outputs from [`SeamlessM4Tv2TextToUnitDecoder`].
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+ for *masked*
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
+ padding_mask: Optional[torch.Tensor] = None
+
+
+@dataclass
+class SeamlessM4Tv2TextToUnitOutput(ModelOutput):
+ """
+ Class defining the outputs from [`SeamlessM4Tv2TextToUnitForConditionalGeneration`] and
+ [`SeamlessM4Tv2TextToUnitModel`].
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+ hidden_size)` is output.
+ padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+ for *masked*
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss.
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ padding_mask: Optional[torch.Tensor] = None
+ decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+ loss: Optional[torch.FloatTensor] = None
+
+
+SEAMLESS_M4T_V2_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`~SeamlessM4Tv2Config`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+ Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+ [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+ """
+
+M4T_TEXT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ """
+
+M4T_SPEECH_INPUTS_DOCSTRING = r"""
+ Args:
+ input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+ Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+ [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+ """
+
+SEAMLESS_M4T_V2_END_INPUTS_DOCSTRING = r"""
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Indices of decoder input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+ Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+ is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+
+ For translation and summarization training, `decoder_input_ids` should be provided. If no
+ `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
+ for denoising pre-training following the paper.
+ decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+ be used by default.
+
+ If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+ information on the default strategy.
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, is a sequence of
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
+ input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
+ of `inputs_embeds`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+M4T_MODEL_INPUTS_DOCSTRING = SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING + SEAMLESS_M4T_V2_END_INPUTS_DOCSTRING
+
+M4T_TEXT_INPUTS_DOCSTRING = M4T_TEXT_INPUTS_DOCSTRING + SEAMLESS_M4T_V2_END_INPUTS_DOCSTRING
+
+M4T_SPEECH_INPUTS_DOCSTRING = M4T_SPEECH_INPUTS_DOCSTRING + SEAMLESS_M4T_V2_END_INPUTS_DOCSTRING
+
+M4T_TEXT_TO_UNITS_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
+ Character indices. The correspondence between characters and indices can be found in `char_to_id`, a
+ dictionary in the generation configuration.
+ char_count_per_id (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Number of characters per input id.
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, is a sequence of
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+ inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
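+# An illustration of the char-level inputs documented above (token/character ids chosen arbitrarily): for a
+# text sequence of 2 tokens spelled with 3 and 2 characters respectively, `char_count_per_id` would be
+# [[3, 2]] and `char_input_ids` would hold the 5 corresponding character ids, e.g. [[17, 4, 9, 22, 4]].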
+
+############ UTILS ################
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+ """
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+ are ignored. This is modified from fairseq's `utils.make_positions`.
+
+ Args:
+ input_ids (`torch.Tensor`): The input token ids.
+ padding_idx (`int`): The index of the padding token in the vocabulary.
+ past_key_values_length (`int`, *optional*): The length of previously cached key/value states.
+
+ Returns: torch.Tensor
+ """
+ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+ mask = input_ids.ne(padding_idx).int()
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+ return incremental_indices.long() + padding_idx
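+
+# A minimal illustration (input values chosen arbitrarily): with `padding_idx=1`, non-padding tokens are
+# numbered from `padding_idx + 1` onwards and padding tokens keep `padding_idx` as their position:
+#     create_position_ids_from_input_ids(torch.tensor([[5, 6, 7, 1, 1]]), padding_idx=1)
+#     # -> tensor([[2, 3, 4, 1, 1]])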
+
+
+# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+ """
+ Shift input ids one token to the right.
+ """
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+ shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+ shifted_input_ids[:, 0] = decoder_start_token_id
+
+ if pad_token_id is None:
+ raise ValueError("self.model.config.pad_token_id has to be defined.")
+ # replace possible -100 values in labels by `pad_token_id`
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+ return shifted_input_ids
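+
+# A minimal illustration (label values chosen arbitrarily): the decoder start token is prepended, the last
+# position is dropped, and any remaining -100 placeholders are replaced by the pad token:
+#     shift_tokens_right(torch.tensor([[5, 6, -100, -100]]), pad_token_id=1, decoder_start_token_id=2)
+#     # -> tensor([[2, 5, 6, 1]])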
+
+
+def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
+ """
+ Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that
+ stops at the corresponding element in `seq_lens`.
+
+ Args:
+ hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
+ The sequences to mask, where `*` is any number of sequence-specific dimensions including none.
+ seq_lens (`torch.Tensor` of shape `(batch,)`):
+ Each element represents the length of the sequence at the same index in `hidden_states`
+
+ Returns:
+ `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)`
+ """
+ batch_size, mask_seq_len = hidden_states.shape[:2]
+
+ indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
+
+ bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
+
+ mask = hidden_states.new_ones((batch_size, mask_seq_len))
+
+ mask = mask.masked_fill(bool_mask, 0)
+
+ return mask
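+
+# A minimal illustration (shapes chosen arbitrarily): positions at or beyond each sequence length are zeroed:
+#     hidden_states = torch.zeros(2, 4, 8)  # (batch, seq_len, dim)
+#     _compute_new_attention_mask(hidden_states, seq_lens=torch.tensor([2, 4]))
+#     # -> tensor([[1., 1., 0., 0.],
+#     #            [1., 1., 1., 1.]])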
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.format_speech_generation_kwargs with SeamlessM4T->SeamlessM4Tv2
+def format_speech_generation_kwargs(kwargs):
+ """
+ Format kwargs for SeamlessM4Tv2 models that generate speech, attribute kwargs to either the text generation or the
+ speech generation models.
+
+ Args:
+ kwargs (`dict`):
+ Keyword arguments are of two types:
+
+ - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+ except for `decoder_input_ids` which will only be passed through the text components.
+ - With a *text_* or *speech_* prefix, they will be passed to the `generate` method of the
+ text model or the speech model, respectively. Prefixed keywords take priority over the ones without a prefix.
+
+ This means you can, for example, specify a generation strategy for one generation but not for the
+ other.
+ """
+ # attribute kwargs to models
+ kwargs_text = {}
+ kwargs_speech = {}
+ for key, value in kwargs.items():
+ if key.startswith("text_"):
+ key = key[len("text_") :]
+ kwargs_text[key] = value
+ elif key.startswith("speech_"):
+ key = key[len("speech_") :]
+ kwargs_speech[key] = value
+ elif key == "generation_config":
+ kwargs_text[key] = value
+ else:
+ # If the key is already in a specific config, then it has been set with a
+ # submodule-specific value and we don't override it
+ if key not in kwargs_text:
+ kwargs_text[key] = value
+ if key not in kwargs_speech:
+ kwargs_speech[key] = value
+ return kwargs_text, kwargs_speech
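+
+# A minimal illustration (keyword names chosen arbitrarily): un-prefixed kwargs are duplicated for both
+# sub-models, while `text_`/`speech_`-prefixed kwargs are routed to one sub-model and stripped of their prefix:
+#     format_speech_generation_kwargs({"max_new_tokens": 10, "text_num_beams": 4, "speech_do_sample": True})
+#     # -> ({"max_new_tokens": 10, "num_beams": 4}, {"max_new_tokens": 10, "do_sample": True})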
+
+
+############ SPEECH ENCODER related code ################
+
+
+class SeamlessM4Tv2ConformerFeatureProjection(nn.Module):
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerFeatureProjection.__init__
+ def __init__(self, config):
+ super().__init__()
+ self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps)
+ self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size)
+ self.dropout = nn.Dropout(config.speech_encoder_dropout)
+
+ def forward(self, hidden_states):
+ # non-projected hidden states are needed for quantization
+ norm_hidden_states = self.layer_norm(hidden_states.to(self.layer_norm.weight.dtype))
+ hidden_states = self.projection(norm_hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerFeedForward with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerFeedForward(nn.Module):
+ def __init__(self, config, act_fn=None, dropout=None):
+ super().__init__()
+ dropout = dropout if dropout is not None else config.speech_encoder_dropout
+ act_fn = act_fn if act_fn is not None else config.speech_encoder_hidden_act
+
+ self.intermediate_dropout = nn.Dropout(dropout)
+ self.intermediate_dense = nn.Linear(config.hidden_size, config.speech_encoder_intermediate_size)
+ self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn
+
+ self.output_dense = nn.Linear(config.speech_encoder_intermediate_size, config.hidden_size)
+ self.output_dropout = nn.Dropout(dropout)
+
+ def forward(self, hidden_states):
+ hidden_states = self.intermediate_dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ hidden_states = self.intermediate_dropout(hidden_states)
+
+ hidden_states = self.output_dense(hidden_states)
+ hidden_states = self.output_dropout(hidden_states)
+ return hidden_states
+
+
+class SeamlessM4Tv2ConformerConvolutionModule(nn.Module):
+ """Convolution block used in the conformer block. Uses a causal depthwise convolution similar to that
+ described in Section 2.1 of https://doi.org/10.48550/arxiv.1609.03499."""
+
+ def __init__(self, config):
+ super().__init__()
+ if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+ raise ValueError("`config.conv_depthwise_kernel_size` should be an odd number for 'SAME' padding")
+ self.layer_norm = nn.LayerNorm(config.hidden_size)
+ self.pointwise_conv1 = nn.Conv1d(
+ config.hidden_size,
+ 2 * config.hidden_size,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=False,
+ )
+ self.glu = nn.GLU(dim=1)
+ self.depthwise_conv = nn.Conv1d(
+ config.hidden_size,
+ config.hidden_size,
+ config.conv_depthwise_kernel_size,
+ stride=1,
+ padding=0,
+ groups=config.hidden_size,
+ bias=False,
+ )
+ self.depthwise_layer_norm = nn.LayerNorm(config.hidden_size)
+ self.activation = ACT2FN[config.speech_encoder_hidden_act]
+ self.pointwise_conv2 = nn.Conv1d(
+ config.hidden_size,
+ config.hidden_size,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=False,
+ )
+ self.dropout = nn.Dropout(config.speech_encoder_dropout)
+
+ def forward(self, hidden_states, attention_mask=None):
+ hidden_states = self.layer_norm(hidden_states)
+
+ # Ensure that we do not leak padded positions in depthwise convolution.
+ # Put 0 where necessary
+ if attention_mask is not None:
+ hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+
+ # exchange the temporal dimension and the feature dimension
+ hidden_states = hidden_states.transpose(1, 2)
+
+ # GLU mechanism
+ # => (batch, 2*channel, dim)
+ hidden_states = self.pointwise_conv1(hidden_states)
+ # => (batch, channel, dim)
+ hidden_states = self.glu(hidden_states)
+
+ # Pad the sequence entirely on the left because of causal convolution.
+ hidden_states = torch.nn.functional.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0))
+
+ # 1D Depthwise Conv
+ hidden_states = self.depthwise_conv(hidden_states)
+ hidden_states = self.depthwise_layer_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+ hidden_states = self.activation(hidden_states)
+
+ hidden_states = self.pointwise_conv2(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = hidden_states.transpose(1, 2)
+ return hidden_states
+
+
+class SeamlessM4Tv2ConformerSelfAttention(nn.Module):
+ """Construct a SeamlessM4Tv2ConformerSelfAttention object.
+ Can be enhanced with relative position embeddings.
+ """
+
+ def __init__(self, config, use_position_embeddings=True):
+ super().__init__()
+
+ self.head_size = config.hidden_size // config.speech_encoder_attention_heads
+ self.num_heads = config.speech_encoder_attention_heads
+ self.position_embeddings_type = config.position_embeddings_type if use_position_embeddings else None
+
+ self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
+ self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
+ self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
+ self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
+
+ self.dropout = nn.Dropout(p=config.speech_encoder_dropout)
+
+ if self.position_embeddings_type == "relative_key":
+ self.left_max_position_embeddings = config.left_max_position_embeddings
+ self.right_max_position_embeddings = config.right_max_position_embeddings
+ num_positions = self.left_max_position_embeddings + self.right_max_position_embeddings + 1
+ self.distance_embedding = nn.Embedding(num_positions, self.head_size)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+ # self-attention mechanism
+ batch_size, sequence_length, hidden_size = hidden_states.size()
+
+ # make sure query/key states can be != value states
+ query_key_states = hidden_states
+ value_states = hidden_states
+
+ # project query_key_states and value_states
+ query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+ key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+ value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+ # => (batch, head, time1, d_k)
+ query = query.transpose(1, 2)
+ key = key.transpose(1, 2)
+ value = value.transpose(1, 2)
+
+ attn_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+ if self.position_embeddings_type == "relative_key":
+ query_length, key_length = query.shape[2], key.shape[2]
+
+ position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+ position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+ distance = position_ids_r - position_ids_l
+ distance = torch.clamp(distance, -self.left_max_position_embeddings, self.right_max_position_embeddings)
+
+ positional_embedding = self.distance_embedding(distance + self.left_max_position_embeddings)
+ positional_embedding = positional_embedding.to(dtype=query.dtype) # fp16 compatibility
+
+ relative_position_attn_weights = torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
+ attn_weights = attn_weights + (relative_position_attn_weights / math.sqrt(self.head_size))
+
+ # apply attention_mask if necessary
+ if attention_mask is not None:
+ attn_weights = attn_weights + attention_mask
+
+ # => (batch, head, time1, time2)
+ attn_weights = torch.softmax(attn_weights, dim=-1)
+ attn_weights = self.dropout(attn_weights)
+
+ # => (batch, head, time1, d_k)
+ attn_output = torch.matmul(attn_weights, value)
+
+ # => (batch, time1, hidden_size)
+ attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+ attn_output = self.linear_out(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights
+
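+# How the "relative_key" bias above indexes `distance_embedding`, as a sketch (window sizes chosen
+# arbitrarily): with left_max_position_embeddings=2 and right_max_position_embeddings=2, the clamped
+# (key_position - query_position) distances for a length-3 sequence are
+#     [[ 0,  1,  2],
+#      [-1,  0,  1],
+#      [-2, -1,  0]]
+# and adding `left_max_position_embeddings` shifts them into [0, 4], the valid range of the 5-row table.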
+
+class SeamlessM4Tv2ConformerEncoderLayer(nn.Module):
+ """Conformer block based on https://arxiv.org/abs/2005.08100."""
+
+ # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerEncoderLayer.__init__ with Wav2Vec2->SeamlessM4Tv2, attention_dropout->speech_encoder_dropout, torch.nn->nn
+ def __init__(self, config):
+ super().__init__()
+ embed_dim = config.hidden_size
+ dropout = config.speech_encoder_dropout
+
+ # Feed-forward 1
+ self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
+ self.ffn1 = SeamlessM4Tv2ConformerFeedForward(config)
+
+ # Self-Attention
+ self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+ self.self_attn_dropout = nn.Dropout(dropout)
+ self.self_attn = SeamlessM4Tv2ConformerSelfAttention(config)
+
+ # Conformer Convolution
+ self.conv_module = SeamlessM4Tv2ConformerConvolutionModule(config)
+
+ # Feed-forward 2
+ self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
+ self.ffn2 = SeamlessM4Tv2ConformerFeedForward(config)
+ self.final_layer_norm = nn.LayerNorm(embed_dim)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ conv_attention_mask: Optional[torch.Tensor] = None,
+ ):
+
+ # 1. Feed-Forward 1 layer
+ residual = hidden_states
+ hidden_states = self.ffn1_layer_norm(hidden_states)
+ hidden_states = self.ffn1(hidden_states)
+ hidden_states = hidden_states * 0.5 + residual
+ residual = hidden_states
+
+ # 2. Self-Attention layer
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = self.self_attn_dropout(hidden_states)
+ hidden_states = hidden_states + residual
+
+ # 3. Convolutional Layer
+ residual = hidden_states
+ hidden_states = self.conv_module(hidden_states, attention_mask=conv_attention_mask)
+ hidden_states = residual + hidden_states
+
+ # 4. Feed-Forward 2 Layer
+ residual = hidden_states
+ hidden_states = self.ffn2_layer_norm(hidden_states)
+ hidden_states = self.ffn2(hidden_states)
+ hidden_states = hidden_states * 0.5 + residual
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ return hidden_states, attn_weights
+
+
+class SeamlessM4Tv2ConformerEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+
+ self.dropout = nn.Dropout(config.speech_encoder_dropout)
+ self.layers = nn.ModuleList(
+ [SeamlessM4Tv2ConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)]
+ )
+
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ self.gradient_checkpointing = False
+
+ def _apply_chunk_attention(self, attention_mask, hidden_states):
+ """
+ Creates a chunk attention mask. It creates a mask to prevent attention across chunks, ensuring that each
+ position attends only to positions within its own chunk. If a left chunk overlap is specified
+ (`speech_encoder_left_chunk_num` in the configuration), the attention mask is adjusted accordingly to allow
+ each position to also attend to the `speech_encoder_left_chunk_num` previous chunks.
+ """
+ sequence_len = hidden_states.shape[1]
+
+ chunk_indices = torch.arange(sequence_len, device=hidden_states.device)
+ chunk_indices = torch.div(chunk_indices, self.config.speech_encoder_chunk_size).long()
+
+ start_indices = torch.full_like(chunk_indices, 0)
+ if self.config.speech_encoder_left_chunk_num >= 0:
+ start_indices = (chunk_indices - self.config.speech_encoder_left_chunk_num).clamp_(min=0)
+ start_indices = start_indices * self.config.speech_encoder_chunk_size
+ start_indices = start_indices.unsqueeze(1).expand(-1, sequence_len)
+
+ end_indices = ((chunk_indices + 1) * self.config.speech_encoder_chunk_size).clamp_(max=sequence_len)
+
+ end_indices = end_indices.unsqueeze(1).expand(-1, sequence_len)
+
+ indices = torch.arange(sequence_len, device=hidden_states.device).unsqueeze(0).expand(sequence_len, -1)
+
+ chunk_mask = (indices < start_indices) | (indices >= end_indices)
+ chunk_mask = chunk_mask.unsqueeze(0).unsqueeze(0)
+
+ attention_mask = chunk_mask if attention_mask is None else (attention_mask.bool() | chunk_mask)
+ attention_mask = attention_mask.to(dtype=hidden_states.dtype)
+ return attention_mask
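+
+ # A minimal illustration of the chunking above (config values chosen arbitrarily): with
+ # speech_encoder_chunk_size=2 and speech_encoder_left_chunk_num=1, a sequence of length 6 is split into
+ # chunks [0, 1], [2, 3], [4, 5]; positions 0-1 may attend to [0, 2), positions 2-3 to [0, 4) and
+ # positions 4-5 to [2, 6), i.e. each position sees its own chunk plus one chunk to the left.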
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ conv_attention_mask = attention_mask
+ if attention_mask is not None:
+ # make sure padded tokens output 0
+ hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+ # extend attention_mask
+ attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+ attention_mask = attention_mask.expand(
+ attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+ )
+
+ if self.config.speech_encoder_chunk_size is not None:
+ attention_mask = self._apply_chunk_attention(attention_mask, hidden_states)
+
+ if attention_mask is not None:
+ attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+
+ hidden_states = self.dropout(hidden_states)
+
+ synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+ for i, layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ dropout_probability = torch.rand([])
+
+ skip_the_layer = self.training and (dropout_probability < self.config.speech_encoder_layerdrop)
+ if not skip_the_layer or synced_gpus:
+ # under fsdp or deepspeed zero3 all gpus must run in sync
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ layer.__call__,
+ hidden_states,
+ attention_mask,
+ output_attentions,
+ conv_attention_mask,
+ )
+ else:
+ layer_outputs = layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ conv_attention_mask=conv_attention_mask,
+ )
+ hidden_states = layer_outputs[0]
+
+ if skip_the_layer:
+ layer_outputs = (None, None)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ hidden_states = self.layer_norm(hidden_states)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerAdapterLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerAdapterLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ embed_dim = config.hidden_size
+ dropout = config.adaptor_dropout
+
+ self.kernel_size = config.adaptor_kernel_size
+ self.stride = config.adaptor_stride
+
+ # 1. residual convolution
+ self.residual_layer_norm = nn.LayerNorm(embed_dim)
+ self.residual_conv = nn.Conv1d(
+ embed_dim,
+ 2 * embed_dim,
+ self.kernel_size,
+ stride=self.stride,
+ padding=self.stride // 2,
+ )
+ self.activation = nn.GLU(dim=1)
+
+ # Self-Attention
+ self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+ self.self_attn_conv = nn.Conv1d(
+ embed_dim,
+ 2 * embed_dim,
+ self.kernel_size,
+ stride=self.stride,
+ padding=self.stride // 2,
+ )
+ self.self_attn = SeamlessM4Tv2ConformerSelfAttention(config, use_position_embeddings=False)
+ self.self_attn_dropout = nn.Dropout(dropout)
+
+ # Feed-forward
+ self.ffn_layer_norm = nn.LayerNorm(embed_dim)
+ self.ffn = SeamlessM4Tv2ConformerFeedForward(config, act_fn="relu", dropout=dropout)
+
+ def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
+ pad = self.kernel_size // 2
+ seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
+
+ seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1
+
+ return seq_lens.floor()
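+
+ # A minimal illustration (kernel/stride values chosen arbitrarily, not read from any config): with
+ # kernel_size=8 and stride=8 (so pad=4), a row with 10 valid positions gives
+ # ((10 + 2*4 - 8) / 8) + 1 = 2.25, floored to 2 frames after the strided convolutions.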
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ):
+ residual = self.residual_layer_norm(hidden_states)
+
+ # Apply pooling to the residual to match the sequence length of the
+ # multi-head attention output.
+ # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+ residual = residual.transpose(1, 2)
+ residual = self.residual_conv(residual)
+ residual = self.activation(residual)
+ # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+ residual = residual.transpose(1, 2)
+
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+ # Apply pooling before feeding to the multihead-attention layer.
+ # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+ hidden_states = hidden_states.transpose(1, 2)
+ hidden_states = self.self_attn_conv(hidden_states)
+ hidden_states = self.activation(hidden_states)
+ # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+ hidden_states = hidden_states.transpose(1, 2)
+
+ if attention_mask is not None:
+ sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+ hidden_states.device
+ )
+ attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
+ attention_mask = _prepare_4d_attention_mask(
+ attention_mask,
+ hidden_states.dtype,
+ )
+
+ # The rest of the computation is identical to a vanilla Transformer
+ # encoder layer.
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = self.self_attn_dropout(hidden_states)
+ hidden_states = hidden_states + residual
+
+ residual = hidden_states
+
+ hidden_states = self.ffn_layer_norm(hidden_states)
+ hidden_states = self.ffn(hidden_states) + residual
+
+ return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerAdapter with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerAdapter(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+
+ self.layers = nn.ModuleList(
+ SeamlessM4Tv2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)
+ )
+
+ def forward(self, hidden_states, attention_mask):
+ # down project hidden_states if necessary
+
+ for layer in self.layers:
+ hidden_states = layer(hidden_states, attention_mask)
+
+ return hidden_states
+
+
+############ TEXT / UNITS related code ################
+
+
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ScaledWordEmbedding with M2M100->SeamlessM4Tv2
+class SeamlessM4Tv2ScaledWordEmbedding(nn.Embedding):
+ """
+ This module overrides nn.Embedding's forward by multiplying it with the embedding scale.
+ """
+
+ def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
+ super().__init__(num_embeddings, embedding_dim, padding_idx)
+ self.embed_scale = embed_scale
+
+ def forward(self, input_ids: torch.Tensor):
+ return super().forward(input_ids) * self.embed_scale
+
+
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding
+class SeamlessM4Tv2SinusoidalPositionalEmbedding(nn.Module):
+ """This module produces sinusoidal positional embeddings of any length."""
+
+ def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+ super().__init__()
+ self.offset = 2
+ self.embedding_dim = embedding_dim
+ self.padding_idx = padding_idx
+ self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+ def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+ emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+ if hasattr(self, "weights"):
+ # in forward put the weights on the correct dtype and device of the param
+ emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+ self.register_buffer("weights", emb_weights, persistent=False)
+
+ @staticmethod
+ def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+ """
+ Build sinusoidal embeddings.
+
+ This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
+ "Attention Is All You Need".
+ """
+ half_dim = embedding_dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+ emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+ if embedding_dim % 2 == 1:
+ # zero pad
+ emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+ if padding_idx is not None:
+ emb[padding_idx, :] = 0
+
+ return emb.to(torch.get_default_dtype())
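+
+ # In other words, a sketch of the formula implemented above: with `half_dim = embedding_dim // 2` and
+ # `freq_i = exp(-i * log(10000) / (half_dim - 1))`, the table holds
+ # weights[pos, i] = sin(pos * freq_i) and weights[pos, half_dim + i] = cos(pos * freq_i),
+ # with a zero column appended if `embedding_dim` is odd and the `padding_idx` row zeroed out.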
+
+ @torch.no_grad()
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ past_key_values_length: int = 0,
+ ):
+ if input_ids is not None:
+ bsz, seq_len = input_ids.size()
+ # Create the position ids from the input token ids. Any padded tokens remain padded.
+ position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
+ input_ids.device
+ )
+ else:
+ bsz, seq_len = inputs_embeds.size()[:-1]
+ position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+
+ # expand embeddings if needed
+ max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
+ if max_pos > self.weights.size(0):
+ self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+ return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
+
+ def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
+ """
+ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+ Args:
+ inputs_embeds: torch.Tensor
+
+ Returns: torch.Tensor
+ """
+ input_shape = inputs_embeds.size()[:-1]
+ sequence_length = input_shape[1]
+
+ position_ids = torch.arange(
+ self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+ )
+ return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
+
+
+class SeamlessM4Tv2Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ # Copied from transformers.models.bart.modeling_bart.BartAttention.__init__ with Bart->SeamlessM4Tv2
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ is_decoder: bool = False,
+ bias: bool = True,
+ is_causal: bool = False,
+ config: Optional[SeamlessM4Tv2Config] = None,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ self.config = config
+
+ if (self.head_dim * num_heads) != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+ f" and `num_heads`: {num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+ self.is_decoder = is_decoder
+ self.is_causal = is_causal
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+ def _shape(self, projection: torch.Tensor) -> torch.Tensor:
+ new_projection_shape = projection.size()[:-1] + (self.num_heads, self.head_dim)
+ # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
+ new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
+ return new_projection
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ is_cross_attention = encoder_hidden_states is not None
+ batch_size, seq_length = hidden_states.shape[:2]
+
+ # use encoder_hidden_states if cross attention
+ current_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+ # checking that the `sequence_length` of the `past_key_value` is the same as that of the provided
+ # `encoder_hidden_states` to support prefix tuning
+ if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]:
+ # reuse k,v, cross_attentions
+ key_states = past_key_value[0]
+ value_states = past_key_value[1]
+ else:
+ key_states = self._shape(self.k_proj(current_states))
+ value_states = self._shape(self.v_proj(current_states))
+ if past_key_value is not None and not is_cross_attention:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_value[0], key_states], dim=2)
+ value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+ query_states = self._shape(self.q_proj(hidden_states) * self.scaling)
+ attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
+ past_key_value = (key_states, value_states)
+
+ if attention_mask is not None:
+ attention_scores = attention_scores + attention_mask
+
+ # (batch_size, n_heads, seq_length, key_length)
+ attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).type_as(attention_scores)
+ attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ # attn_output = torch.bmm(attn_probs, value_states) ?
+ context_states = torch.matmul(attn_weights, value_states)
+ # attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) ?
+ context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
+ attn_output = self.out_proj(context_states)
+
+ if output_attentions:
+ return attn_output, attn_weights, past_key_value
+ else:
+ return attn_output, None, past_key_value
+
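+# Cache layout sketch for the decoder use of `SeamlessM4Tv2Attention` above (an assumption about how callers
+# consume it, mirroring the in-code comment): self-attention contributes (key_states, value_states) and the
+# cross-attention call appends its own pair, giving a 4-tuple per decoder layer of tensors shaped
+# (batch_size, num_heads, seq_len_so_far, head_dim).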
+
+# Copied from transformers.models.nllb_moe.modeling_nllb_moe.NllbMoeDenseActDense with NllbMoe->SeamlessM4Tv2,DenseActDense->FeedForwardNetwork, d_model->hidden_size
+class SeamlessM4Tv2FeedForwardNetwork(nn.Module):
+ def __init__(self, config: SeamlessM4Tv2Config, ffn_dim: int):
+ super().__init__()
+ self.fc1 = nn.Linear(config.hidden_size, ffn_dim)
+ self.fc2 = nn.Linear(ffn_dim, config.hidden_size)
+ self.dropout = nn.Dropout(config.activation_dropout)
+ self.act = ACT2FN[config.activation_function]
+
+ def forward(self, hidden_states):
+ hidden_states = self.fc1(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ if (
+ isinstance(self.fc2.weight, torch.Tensor)
+ and hidden_states.dtype != self.fc2.weight.dtype
+ and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8)
+ ):
+ hidden_states = hidden_states.to(self.fc2.weight.dtype)
+ hidden_states = self.fc2(hidden_states)
+ return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TEncoderLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2EncoderLayer(nn.Module):
+ def __init__(self, config: SeamlessM4Tv2Config, encoder_ffn_dim=None, encoder_attention_heads=None):
+ super().__init__()
+ encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim
+ encoder_attention_heads = (
+ config.encoder_attention_heads if encoder_attention_heads is None else encoder_attention_heads
+ )
+
+ self.embed_dim = config.hidden_size
+ self.self_attn = SeamlessM4Tv2Attention(
+ embed_dim=self.embed_dim,
+ num_heads=encoder_attention_heads,
+ dropout=config.attention_dropout,
+ )
+ self.attn_dropout = nn.Dropout(config.dropout)
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ self.ffn = SeamlessM4Tv2FeedForwardNetwork(config, ffn_dim=encoder_ffn_dim)
+
+ self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
+ self.ffn_dropout = nn.Dropout(config.activation_dropout)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ output_attentions: bool = False,
+ ) -> torch.Tensor:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`):
+ attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+ large negative values.
+ """
+ residual = hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+ hidden_states, attn_weights, _ = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = self.attn_dropout(hidden_states)
+ hidden_states = residual + hidden_states
+
+ residual = hidden_states
+
+ hidden_states = self.ffn_layer_norm(hidden_states)
+
+ hidden_states = self.ffn(hidden_states)
+ hidden_states = self.ffn_dropout(hidden_states)
+
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoderLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2DecoderLayer(nn.Module):
+ def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None):
+ super().__init__()
+ decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
+ decoder_attention_heads = (
+ config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
+ )
+
+ self.embed_dim = config.hidden_size
+ self.self_attn = SeamlessM4Tv2Attention(
+ embed_dim=self.embed_dim,
+ num_heads=decoder_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=True,
+ )
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.attn_dropout = nn.Dropout(config.dropout)
+
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ self.cross_attention = SeamlessM4Tv2Attention(
+ self.embed_dim, decoder_attention_heads, config.attention_dropout, is_decoder=True
+ )
+ self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ self.ffn = SeamlessM4Tv2FeedForwardNetwork(config, ffn_dim=decoder_ffn_dim)
+
+ self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
+ self.ffn_dropout = nn.Dropout(config.activation_dropout)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = True,
+ ) -> torch.Tensor:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`):
+ attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+ large negative values.
+ encoder_hidden_states (`torch.FloatTensor`):
+ cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`):
+ encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by
+ very large negative values.
+ past_key_value (`Tuple(torch.FloatTensor)`):
+ cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ # Self Attention
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+ # add present self-attn cache to positions 1,2 of present_key_value tuple
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ past_key_value=self_attn_past_key_value,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = self.attn_dropout(hidden_states)
+ hidden_states = residual + hidden_states
+
+ # Cross-Attention Block
+ cross_attn_present_key_value = None
+ cross_attn_weights = None
+ if encoder_hidden_states is not None:
+ residual = hidden_states
+ hidden_states = self.cross_attention_layer_norm(hidden_states)
+
+ # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+
+ hidden_states, cross_attn_weights, cross_attn_present_key_value = self.cross_attention(
+ hidden_states=hidden_states,
+ encoder_hidden_states=encoder_hidden_states,
+ past_key_value=cross_attn_past_key_value,
+ attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = self.attn_dropout(hidden_states)
+ hidden_states = residual + hidden_states
+
+ # add cross-attn to positions 3,4 of present_key_value tuple
+ present_key_value += cross_attn_present_key_value
+
+ # Fully Connected
+ residual = hidden_states
+
+ hidden_states = self.ffn_layer_norm(hidden_states)
+
+ hidden_states = self.ffn(hidden_states)
+ hidden_states = self.ffn_dropout(hidden_states)
+
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states, present_key_value)
+
+ if output_attentions:
+ outputs += (self_attn_weights, cross_attn_weights)
+
+ return outputs
+
+
+class SeamlessM4Tv2TextToUnitDecoderLayer(nn.Module):
+ def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None):
+ super().__init__()
+ decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
+ decoder_attention_heads = (
+ config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
+ )
+ self.dropout = config.dropout
+ self.embed_dim = config.hidden_size
+
+ self.self_attn = SeamlessM4Tv2Attention(
+ embed_dim=self.embed_dim,
+ num_heads=decoder_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=True,
+ )
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ self.conv1 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=7, stride=1, padding="same")
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=7, stride=1, padding="same")
+
+ self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
+ self.conv_dropout = nn.Dropout(self.dropout)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ padding_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor, ...]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`):
+ attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+ large negative values.
+ padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked*
+ or 0 for *masked*
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ # Conv
+ residual = hidden_states
+
+ # Apply padding mask to avoid leaking padded positions in the convolution layer
+ if padding_mask is not None:
+ hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+ hidden_states = self.conv1(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ if padding_mask is not None:
+ hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+
+ hidden_states = self.activation_fn(hidden_states)
+ hidden_states = self.conv2(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+ hidden_states = self.conv_dropout(hidden_states)
+ hidden_states = residual + hidden_states
+ hidden_states = self.conv_layer_norm(hidden_states)
+
+ outputs = (hidden_states, present_key_value)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ return outputs
+
+
+############ SUB-MODELS related code ################
+
+
+class SeamlessM4Tv2PreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = SeamlessM4Tv2Config
+ base_model_prefix = "seamless_m4t_v2"
+ supports_gradient_checkpointing = True
+ _no_split_modules = [
+ "SeamlessM4Tv2EncoderLayer",
+ "SeamlessM4Tv2DecoderLayer",
+ "SeamlessM4Tv2ConformerEncoderLayer",
+ "SeamlessM4Tv2TextToUnitDecoderLayer",
+ ]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, SeamlessM4Tv2ConformerSelfAttention):
+ if hasattr(module, "pos_bias_u"):
+ nn.init.xavier_uniform_(module.pos_bias_u)
+ if hasattr(module, "pos_bias_v"):
+ nn.init.xavier_uniform_(module.pos_bias_v)
+ elif isinstance(module, SeamlessM4Tv2ConformerFeatureProjection):
+ k = math.sqrt(1 / module.projection.in_features)
+ nn.init.uniform_(module.projection.weight, a=-k, b=k)
+ nn.init.uniform_(module.projection.bias, a=-k, b=k)
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)):
+ nn.init.kaiming_normal_(module.weight)
+ if module.bias is not None:
+ k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+ nn.init.uniform_(module.bias, a=-k, b=k)
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TPreTrainedModel._compute_sub_sample_lengths_from_attention_mask
+ def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
+ kernel_size, stride = self.config.adaptor_kernel_size, self.config.adaptor_stride
+ pad = kernel_size // 2
+ seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
+
+ seq_lens = ((seq_lens + 2 * pad - kernel_size) / stride) + 1
+
+ return seq_lens.floor()
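+
+ # Worked example for the formula above (assuming the default `adaptor_kernel_size=8` and
+ # `adaptor_stride=8`, hence `pad=4`): an attention mask with 100 valid frames gives
+ # (100 + 2 * 4 - 8) / 8 + 1 = 13.5, floored to 13 frames after the adapter convolution.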
+
+ def _indices_to_subwords(self, input_ids):
+ """
+ Returns the corresponding text string for each input id.
+ """
+ if not hasattr(self.generation_config, "id_to_text"):
+ raise ValueError(
+ """This model generation config doesn't have a `id_to_text` key which maps
+ token ids to subwords. Make sure to load the right generation config."""
+ )
+ batch_size, sequence_len = input_ids.shape
+
+ subwords_batch = []
+ for batch_id in range(batch_size):
+ subwords = []
+ for i in range(sequence_len):
+ subword = self.generation_config.id_to_text.get(str(input_ids[batch_id, i].item()))
+ subwords.append(str(subword))
+ subwords_batch.append(subwords)
+ return subwords_batch
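+
+ # Illustration of the lookup above with hypothetical ids and subwords (the real mapping comes from the
+ # loaded generation config):
+ #
+ #     input_ids = torch.tensor([[12, 54]])
+ #     # with generation_config.id_to_text == {"12": "▁Hel", "54": "lo"}
+ #     # -> subwords_batch == [["▁Hel", "lo"]]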
+
+ def _count_character_length_in_subword(
+ self,
+ input_ids,
+ subwords_batch,
+ merge_space_with_prev_subword=False,
+ pad_token_id=0,
+ unk_token_id=1,
+ space="▁",
+ ):
+ """
+ Counts the number of characters per text string associated with the input token id.
+
+ Args:
+ input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+ subwords_batch (`List[List[str]]` of shape `(batch_size, sequence_length)`):
+ Corresponding text string for each input id.
+ merge_space_with_prev_subword (`bool`, *optional*, defaults to `False`):
+ Indicates if the space character is merged with the previous subword. If `False`, it will be merged
+ with the next subword.
+ pad_token_id (`int`, *optional*, defaults to 0):
+ The id of the _padding_ text token. If it is encountered when calculating the length of a subword
+ sample, the lengths of subsequent subwords will be set to 0.
+ unk_token_id (`int`, *optional*, defaults to 1):
+ The id of the _unknown_ text token. Associated with a subword of length 1.
+ space (`str`, *optional*, defaults to `"▁"`):
+ The space character.
+ """
+ batch_size, _ = input_ids.shape
+
+ char_count_per_id = input_ids.new_zeros(input_ids.size())
+
+ subword_lens = input_ids.ne(pad_token_id).sum(1)
+
+ for batch_id in range(batch_size):
+ # We slice out the tensor till the padding index.
+ subword_indices = input_ids[batch_id, : subword_lens[batch_id]]
+ subwords = subwords_batch[batch_id][: subword_lens[batch_id]]
+
+ is_next_start_with_space = [
+ len(subwords[i + 1]) > 1 and subwords[i + 1][0] == space if i < len(subwords) - 1 else False
+ for i in range(len(subwords))
+ ]
+ is_punc = [
+ len(subwords[i]) == 1
+ and not subwords[i].isalpha()
+ and not subwords[i].isnumeric()
+ and subwords[i] != space
+ for i in range(len(subwords))
+ ]
+ for i, (subword_idx, subword) in enumerate(zip(subword_indices, subwords)):
+ if subword_idx == pad_token_id:
+ break
+
+ if subword_idx == unk_token_id:
+ # We set char_len to 1 for an unk token.
+ char_len = 1
+
+ if merge_space_with_prev_subword and is_next_start_with_space[i]:
+ char_len += 1
+ else:
+ # By default, spaces are merged with the next subword.
+ # char_len includes the space.
+ char_len = len(subword)
+
+ if merge_space_with_prev_subword:
+ # Add the space for the next subword.
+ if is_next_start_with_space[i]:
+ char_len += 1
+ # Subtract the space for the current subword.
+ if i > 0 and is_next_start_with_space[i - 1]:
+ char_len -= 1
+ else:
+ # Merge space with punctuation mark by default.
+ if is_punc[i] and is_next_start_with_space[i]:
+ char_len += 1
+ # Subtract the space for the subword succeeding the punctuation mark.
+ elif i > 0 and is_punc[i - 1] and is_next_start_with_space[i - 1]:
+ char_len -= 1
+
+ char_count_per_id[batch_id, i] = char_len
+
+ return char_count_per_id
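+
+ # Worked example for the default merging rule (spaces merged with the next subword), with hypothetical
+ # subwords:
+ #
+ #     subwords_batch[0] = ["▁Hello", "▁world", "!"]  ->  char_count_per_id[0] = [6, 6, 1]
+ #     # each "▁" counts as one character and stays attached to the subword that follows it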
+
+ def _get_char_input_ids(self, input_ids, subwords_batch, char_count_per_id, pad_token_id=0, unk_token_id=1):
+ """
+ Returns the corresponding character input id for each character of `subwords_batch`.
+
+ Args:
+ input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+ subwords_batch (`List[List[str]]` of shape `(batch_size, sequence_length)`):
+ Corresponding text string for each input id.
+ char_count_per_id (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+ Number of characters per input id.
+ pad_token_id (`int`, *optional*, defaults to 0):
+ The id of the _padding_ text token. If it is encountered when calculating the length of a subword
+ sample, the lengths of subsequent subwords will be set to 0.
+ unk_token_id (`int`, *optional*, defaults to 1):
+ The id of the _unknown_ text token. Associated with a subword of length 1.
+ Returns:
+ `torch.Tensor`: Tensor of shape `(batch_size, char_sequence_length)` containing the id of each character.
+ """
+ if not hasattr(self.generation_config, "char_to_id"):
+ raise ValueError(
+ """This model generation config doesn't have a `char_to_id` key which maps
+ characters to character ids. Make sure to load the right generation config."""
+ )
+
+ batch_size = input_ids.shape[0]
+ max_len = int(char_count_per_id.sum(1).max().item())
+
+ char_seqs = input_ids.new_zeros((batch_size, max_len)).fill_(pad_token_id)
+
+ subword_lens = input_ids.ne(pad_token_id).sum(1)
+
+ for batch_id in range(batch_size):
+ total = 0
+ subword_indices = input_ids[batch_id, : subword_lens[batch_id]]
+ subwords = subwords_batch[batch_id][: subword_lens[batch_id]]
+ for subword_idx, subword in zip(subword_indices, subwords):
+ if subword_idx == unk_token_id:
+ char_ids = [unk_token_id]
+ else:
+ # Get char token indices corresponding to the subwords.
+ char_ids = [self.generation_config.char_to_id.get(ch, unk_token_id) for ch in list(subword)]
+ char_seq_len = len(char_ids)
+ char_seqs[batch_id, total : total + char_seq_len] = torch.tensor(char_ids).to(char_seqs)
+ total += char_seq_len
+ return char_seqs
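+
+ # Sketch of the flattening above with a hypothetical `char_to_id` mapping {"▁": 5, "H": 6, "i": 7}:
+ #
+ #     subwords_batch = [["▁Hi"]], char_count_per_id = [[3]]
+ #     # -> char_seqs == [[5, 6, 7]], one character id per character, padded to the longest sample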
+
+ def _hard_upsample(self, hidden_states, durations):
+ """
+ Repeats the time dimension of each sample in the batch based on the corresponding duration.
+
+ Args:
+ hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, *)`):
+ The sequence to repeat, where `*` is any number of sequence-specific dimensions including none.
+ durations (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+ Indicates how many times to repeat time segments.
+ """
+ if hidden_states.size(0) == 1:
+ hidden_states = torch.repeat_interleave(hidden_states, durations.view(-1), dim=1)
+ else:
+ # if batched sample, need to interleave per sample, and pad -> loss of parallelism
+ if hidden_states.shape[0] > 1 and self.training:
+ logger.warning_once(
+ """`self.training=True` and you use batching. You lose parallelism during the hifigan
+ forward pass because the samples are interleaved."""
+ )
+ hidden_states = [
+ torch.repeat_interleave(hidden_state, duration, dim=0)
+ for (hidden_state, duration) in zip(hidden_states, durations)
+ ]
+
+ hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True)
+
+ return hidden_states
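+
+ # Minimal illustration of the upsampling above (hypothetical values, single-sample fast path):
+ #
+ #     hidden_states = torch.randn(1, 3, 8)  # (batch, seq_len, hidden)
+ #     durations = torch.tensor([[2, 1, 3]])
+ #     # -> shape (1, 6, 8): frame 0 repeated twice, frame 1 once, frame 2 three times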
+
+
+@add_start_docstrings(
+ """Transformer speech encoder consisting of *config.speech_encoder_layers* conformer self attention layers.
+ Each layer is a [`SeamlessM4Tv2ConformerEncoderLayer`].""",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TSpeechEncoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2SpeechEncoder(SeamlessM4Tv2PreTrainedModel):
+ main_input_name = "input_features"
+
+ def __init__(self, config: SeamlessM4Tv2Config):
+ super().__init__(config)
+
+ self.feature_projection = SeamlessM4Tv2ConformerFeatureProjection(config)
+ self.encoder = SeamlessM4Tv2ConformerEncoder(config)
+ self.intermediate_ffn = SeamlessM4Tv2ConformerFeedForward(config, act_fn="relu", dropout=0.0)
+ self.adapter = SeamlessM4Tv2ConformerAdapter(config) if config.add_adapter else None
+ self.inner_layer_norm = nn.LayerNorm(config.hidden_size)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ input_features: Optional[torch.Tensor],
+ attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_features is None:
+ raise ValueError(
+ """Both `input_features` and `inputs_embeds` are `None` in `SeamlessM4Tv2SpeechEncoder.forward`.
+ Make sure one of them is not `None`."""
+ )
+
+ hidden_states = self.feature_projection(input_features)
+
+ encoder_outputs = self.encoder(
+ hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = encoder_outputs[0]
+
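+ # intermediate feed-forward with a 0.5-scaled residual, mirroring the half-step (macaron-style)
+ # feed-forward residuals used inside the conformer blocks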
+ expanded_hidden_states = self.intermediate_ffn(hidden_states)
+ hidden_states = hidden_states + 0.5 * expanded_hidden_states
+
+ if self.adapter is not None:
+ hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)
+
+ hidden_states = self.inner_layer_norm(hidden_states)
+
+ if not return_dict:
+ return (hidden_states,) + encoder_outputs[1:]
+
+ return Wav2Vec2BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+# inspired from MBart and NllbMoe
+@add_start_docstrings(
+ "Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`SeamlessM4Tv2EncoderLayer`].",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+ """
+ embed_tokens (`nn.Embedding`, *optional*):
+ Input embedding
+ is_t2u_encoder (`bool`, *optional*, defaults to `False`):
+ indicates if it belongs to the text-to-units model, in which case it won't have input embeddings
+ """,
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TEncoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2Encoder(SeamlessM4Tv2PreTrainedModel):
+ def __init__(
+ self,
+ config: SeamlessM4Tv2Config,
+ embed_tokens: Optional[nn.Embedding] = None,
+ is_t2u_encoder: bool = False,
+ ):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+ self.layerdrop = config.encoder_layerdrop
+ self.padding_idx = config.pad_token_id
+ embed_dim = config.hidden_size
+
+ self.is_t2u_encoder = is_t2u_encoder
+ self.max_source_positions = config.max_position_embeddings
+
+ if not self.is_t2u_encoder:
+ embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+ self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding(
+ config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
+ )
+
+ if embed_tokens is not None:
+ self.embed_tokens.weight = embed_tokens.weight
+
+ self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+ self.max_source_positions,
+ embed_dim,
+ self.padding_idx,
+ )
+
+ layers = []
+ for _ in range(config.encoder_layers):
+ layers.append(
+ SeamlessM4Tv2EncoderLayer(
+ config,
+ encoder_attention_heads=config.encoder_attention_heads,
+ encoder_ffn_dim=config.encoder_ffn_dim,
+ )
+ )
+
+ self.layers = nn.ModuleList(layers)
+
+ self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Tuple, BaseModelOutput]:
+ r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+ provide it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and self.is_t2u_encoder:
+ raise ValueError(
+ "You cannot pass input_ids to the encoder of the text_to_units model. Pass inputs_embeds instead."
+ )
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input = input_ids
+ input_shape = input.shape
+ input_ids = input_ids.view(-1, input_shape[-1])
+ elif inputs_embeds is not None:
+ input = inputs_embeds[:, :, -1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ if not self.is_t2u_encoder:
+ embed_pos = self.embed_positions(input)
+
+ hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device)
+ else:
+ hidden_states = inputs_embeds
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ to_drop = False
+ if self.training:
+ dropout_probability = torch.rand([])
+ if dropout_probability < self.layerdrop: # skip the layer
+ to_drop = True
+
+ if to_drop:
+ layer_outputs = (None, None)
+ else:
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ encoder_layer.forward,
+ hidden_states,
+ attention_mask,
+ output_attentions,
+ )
+ else:
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ hidden_states = self.layer_norm(hidden_states)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+@add_start_docstrings(
+ "Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4Tv2DecoderLayer`].",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+ """
+ embed_tokens (`nn.Embedding`, *optional*):
+ Input embedding
+ """,
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2Decoder(SeamlessM4Tv2PreTrainedModel):
+ def __init__(
+ self,
+ config: SeamlessM4Tv2Config,
+ embed_tokens: Optional[nn.Embedding] = None,
+ ):
+ super().__init__(config)
+ self.dropout = config.dropout
+ self.layerdrop = config.decoder_layerdrop
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.max_target_positions = config.max_position_embeddings
+ embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
+
+ if embed_tokens is not None:
+ # if embed_tokens defined, use its shape instead
+ self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding(
+ embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx, embed_scale=embed_scale
+ )
+ self.embed_tokens.weight = embed_tokens.weight
+ else:
+ self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding(
+ self.vocab_size, config.hidden_size, self.padding_idx, embed_scale=embed_scale
+ )
+
+ self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+ self.max_target_positions,
+ config.hidden_size,
+ padding_idx=self.padding_idx,
+ )
+
+ layers = []
+ for _ in range(config.decoder_layers):
+ layers.append(
+ SeamlessM4Tv2DecoderLayer(
+ config,
+ decoder_attention_heads=config.decoder_attention_heads,
+ decoder_ffn_dim=config.decoder_ffn_dim,
+ )
+ )
+ self.layers = nn.ModuleList(layers)
+ self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+ r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+ provide it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ of the decoder.
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+ selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ input = input_ids
+ input_shape = input.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ input = inputs_embeds[:, :, -1]
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ # past_key_values_length
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ attention_mask = _prepare_4d_causal_attention_mask(
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
+ )
+
+ # expand encoder attention mask
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ encoder_attention_mask = _prepare_4d_attention_mask(
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+ )
+
+ # embed positions
+ positions = self.embed_positions(input, past_key_values_length=past_key_values_length)
+
+ hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+ if self.training:
+ dropout_probability = torch.rand([])
+ if dropout_probability < self.layerdrop:
+ continue
+
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ None,
+ output_attentions,
+ use_cache,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_value=past_key_value,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[2],)
+
+ if encoder_hidden_states is not None:
+ all_cross_attentions += (layer_outputs[3],)
+
+ hidden_states = self.layer_norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(
+ v
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+@add_start_docstrings(
+ "Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4Tv2DecoderLayer`].",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+ """
+ embed_tokens (`nn.Embedding`, *optional*):
+ Input embedding
+ """,
+)
+class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
+ def __init__(
+ self,
+ config: SeamlessM4Tv2Config,
+ embed_tokens: Optional[nn.Embedding] = None,
+ ):
+ super().__init__(config)
+ self.dropout = config.dropout
+ self.layerdrop = config.decoder_layerdrop
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+ self.max_target_positions = config.max_position_embeddings
+ self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
+
+ if embed_tokens is not None:
+ # if embed_tokens defined, use its shape instead
+ self.embed_tokens = nn.Embedding(embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx)
+ self.embed_tokens.weight = embed_tokens.weight
+ else:
+ self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx)
+
+ self.embed_char = nn.Embedding(config.char_vocab_size, config.hidden_size)
+ self.embed_char_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+ self.max_target_positions,
+ config.hidden_size,
+ padding_idx=self.padding_idx,
+ )
+
+ self.pos_emb_alpha_char = nn.Parameter(torch.ones(1))
+ self.pos_emb_alpha = nn.Parameter(torch.ones(1))
+ self.duration_predictor = SeamlessM4Tv2VariancePredictor(
+ config.variance_predictor_embed_dim,
+ config.variance_predictor_hidden_dim,
+ config.variance_predictor_kernel_size,
+ config.variance_pred_dropout,
+ )
+
+ self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+ self.max_target_positions,
+ config.hidden_size,
+ padding_idx=self.padding_idx,
+ )
+
+ layers = []
+ for _ in range(config.decoder_layers):
+ layers.append(
+ SeamlessM4Tv2TextToUnitDecoderLayer(
+ config,
+ decoder_attention_heads=config.decoder_attention_heads,
+ decoder_ffn_dim=config.decoder_ffn_dim,
+ )
+ )
+ self.layers = nn.ModuleList(layers)
+ self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.embed_tokens = value
+
+ def forward(
+ self,
+ char_input_ids: Optional[torch.LongTensor] = None,
+ char_count_per_id: Optional[torch.LongTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, SeamlessM4Tv2TextToUnitDecoderOutput]:
+ r"""
+ Args:
+ char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
+ Character indices. The correspondence between characters and indices can be found in `char_to_id`, a
+ dictionary in the generation configuration.
+ char_count_per_id (`torch.Tensor` of shape `(batch_size, encoder_sequence_length)`):
+ Number of characters per text input id.
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ of the decoder.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # create padding mask for character lengths
+ char_padding_mask = _compute_new_attention_mask(char_input_ids, char_count_per_id.sum(1))
+
+ # upsample hidden states according to characters sequence lengths
+ char_hidden_states = self._hard_upsample(encoder_hidden_states, char_count_per_id)
+ # embed char positions
+ char_positions = self.pos_emb_alpha_char * self.embed_char_positions(inputs_embeds=char_hidden_states)
+ # update char hidden states with positions and char embeddings
+ char_hidden_states = self.embed_char(char_input_ids) * self.embed_scale + char_positions + char_hidden_states
+
+ # predict duration
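+ # the predictor returns log(1 + duration); `expm1` inverts it, rounding gives integer frame counts,
+ # clamping guarantees at least one frame per character, and padded positions are zeroed out below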
+ log_dur_pred = self.duration_predictor(char_hidden_states, padding_mask=char_padding_mask)
+ dur_out = torch.clamp(torch.round((torch.expm1(log_dur_pred))).long(), min=1)
+ dur_out = dur_out.masked_fill(~char_padding_mask.bool(), 0.0)
+
+ # upsample char hidden states according to predicted duration
+ char_hidden_states = self._hard_upsample(char_hidden_states, dur_out)
+
+ positions = self.pos_emb_alpha * self.embed_positions(inputs_embeds=char_hidden_states)
+ hidden_states = char_hidden_states + positions
+
+ padding_mask = _compute_new_attention_mask(hidden_states, dur_out.sum(1))
+ attention_mask = _prepare_4d_attention_mask(padding_mask, hidden_states.dtype)
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+ if self.training:
+ dropout_probability = torch.rand([])
+ if dropout_probability < self.layerdrop:
+ continue
+
+ if self.gradient_checkpointing and self.training:
+ layer_outputs = self._gradient_checkpointing_func(
+ decoder_layer.__call__,
+ hidden_states,
+ attention_mask,
+ padding_mask,
+ output_attentions,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ padding_mask=padding_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[2],)
+
+ hidden_states = self.layer_norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attns, padding_mask] if v is not None)
+ return SeamlessM4Tv2TextToUnitDecoderOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ padding_mask=padding_mask,
+ )
+
+
+@add_start_docstrings(
+ "Transformer bare text-to-unit encoder-decoder. The encoder is a [`SeamlessM4Tv2Encoder`] without embeddings and the decoder is a [`SeamlessM4Tv2TextToUnitDecoder`].",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+ """
+ embed_tokens_decoder (`nn.Embedding`, *optional*): input embedding of the decoder.
+ """,
+)
+class SeamlessM4Tv2TextToUnitModel(SeamlessM4Tv2PreTrainedModel):
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitModel.__init__ with SeamlessM4T->SeamlessM4Tv2, Decoder->TextToUnitDecoder
+ def __init__(
+ self,
+ config: SeamlessM4Tv2Config,
+ embed_tokens_decoder: Optional[nn.Embedding] = None,
+ ):
+ super().__init__(config)
+
+ self.encoder = SeamlessM4Tv2Encoder(config, is_t2u_encoder=True)
+ self.decoder = SeamlessM4Tv2TextToUnitDecoder(config, embed_tokens_decoder)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ char_input_ids: Optional[torch.LongTensor] = None,
+ char_count_per_id: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ # decoder outputs consists of (dec_features, dec_hidden, dec_attn, padding_mask)
+ decoder_outputs = self.decoder(
+ char_input_ids=char_input_ids,
+ char_count_per_id=char_count_per_id,
+ encoder_hidden_states=encoder_outputs[0],
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ return decoder_outputs + encoder_outputs
+
+ return SeamlessM4Tv2TextToUnitOutput(
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ padding_mask=decoder_outputs.padding_mask,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ "Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4Tv2TextToUnitModel`].",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+ """
+ embed_tokens_decoder (`nn.Embedding`, *optional*): input embedding of the decoder.
+ """,
+)
+class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+ _keys_to_ignore_on_load_missing = [
+ "vocoder",
+ "speech_encoder",
+ "text_encoder",
+ "text_decoder",
+ ]
+ _tied_weights_keys = ["decoder.embed_tokens.weight", "lm_head.weight"]
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.__init__ with SeamlessM4T->SeamlessM4Tv2
+ def __init__(
+ self,
+ config: SeamlessM4Tv2Config,
+ embed_tokens_decoder: Optional[nn.Embedding] = None,
+ ):
+ # update config - used principally for bos_token_id etc.
+ config = copy.deepcopy(config)
+ for param, val in config.to_dict().items():
+ if param.startswith("t2u_"):
+ config.__setattr__(param[4:], val)
+ super().__init__(config)
+
+ self.model = SeamlessM4Tv2TextToUnitModel(config, embed_tokens_decoder)
+
+ self.lm_head = nn.Linear(config.hidden_size, config.t2u_vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_encoder
+ def get_encoder(self):
+ return self.model.encoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_decoder
+ def get_decoder(self):
+ return self.model.decoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_output_embeddings
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.set_output_embeddings
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_input_embeddings
+ def get_input_embeddings(self):
+ return self.model.decoder.embed_tokens
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.set_input_embeddings
+ def set_input_embeddings(self, value):
+ self.model.decoder.embed_tokens = value
+
+ @add_start_docstrings_to_model_forward(M4T_TEXT_TO_UNITS_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ char_input_ids: Optional[torch.LongTensor] = None,
+ char_count_per_id: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.model(
+ input_ids,
+ char_input_ids=char_input_ids,
+ char_count_per_id=char_count_per_id,
+ attention_mask=attention_mask,
+ encoder_outputs=encoder_outputs,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ lm_logits = self.lm_head(outputs[0])
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ labels = labels.to(lm_logits.device)
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (lm_logits,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return SeamlessM4Tv2TextToUnitOutput(
+ last_hidden_state=lm_logits,
+ padding_mask=outputs.padding_mask,
+ decoder_hidden_states=outputs.decoder_hidden_states,
+ decoder_attentions=outputs.decoder_attentions,
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+ encoder_hidden_states=outputs.encoder_hidden_states,
+ encoder_attentions=outputs.encoder_attentions,
+ loss=masked_lm_loss,
+ )
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration._tie_weights
+ def _tie_weights(self) -> None:
+ if getattr(self.config, "tie_word_embeddings", True):
+ output_embeddings = self.get_output_embeddings()
+ if output_embeddings is not None:
+ self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
+
+
+############ VOCODER related code ################
+
+
+HIFIGAN_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`SeamlessM4Tv2Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
+class HifiGanResidualBlock(nn.Module):
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
+ super().__init__()
+ self.leaky_relu_slope = leaky_relu_slope
+
+ self.convs1 = nn.ModuleList(
+ [
+ nn.Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ stride=1,
+ dilation=dilation[i],
+ padding=self.get_padding(kernel_size, dilation[i]),
+ )
+ for i in range(len(dilation))
+ ]
+ )
+ self.convs2 = nn.ModuleList(
+ [
+ nn.Conv1d(
+ channels,
+ channels,
+ kernel_size,
+ stride=1,
+ dilation=1,
+ padding=self.get_padding(kernel_size, 1),
+ )
+ for _ in range(len(dilation))
+ ]
+ )
+
+ def get_padding(self, kernel_size, dilation=1):
+ return (kernel_size * dilation - dilation) // 2
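+
+ # this padding keeps the sequence length unchanged for the stride-1 convolutions above, e.g.
+ # kernel_size=3, dilation=5 -> padding = (3 * 5 - 5) // 2 = 5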
+
+ def apply_weight_norm(self):
+ weight_norm = nn.utils.weight_norm
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
+ weight_norm = nn.utils.parametrizations.weight_norm
+
+ for layer in self.convs1:
+ weight_norm(layer)
+ for layer in self.convs2:
+ weight_norm(layer)
+
+ def remove_weight_norm(self):
+ for layer in self.convs1:
+ nn.utils.remove_weight_norm(layer)
+ for layer in self.convs2:
+ nn.utils.remove_weight_norm(layer)
+
+ def forward(self, hidden_states):
+ for conv1, conv2 in zip(self.convs1, self.convs2):
+ residual = hidden_states
+ hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+ hidden_states = conv1(hidden_states)
+ hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+ hidden_states = conv2(hidden_states)
+ hidden_states = hidden_states + residual
+ return hidden_states
+
+
+class SeamlessM4Tv2VariancePredictor(nn.Module):
+ def __init__(self, embed_dim, hidden_dim, kernel_size, var_pred_dropout):
+ super().__init__()
+
+ self.conv1 = nn.Conv1d(
+ embed_dim,
+ hidden_dim,
+ kernel_size=kernel_size,
+ padding="same",
+ )
+ self.activation_function = nn.ReLU()
+ self.ln1 = nn.LayerNorm(hidden_dim)
+ self.dropout_module = nn.Dropout(p=var_pred_dropout)
+ self.conv2 = nn.Conv1d(
+ hidden_dim,
+ hidden_dim,
+ kernel_size=kernel_size,
+ padding="same",
+ )
+ self.ln2 = nn.LayerNorm(hidden_dim)
+ self.proj = nn.Linear(hidden_dim, 1)
+
+ def forward(self, hidden_states: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
+ # Input: B x T x C; Output: B x T
+ if padding_mask is not None:
+ hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+ hidden_states = self.conv1(hidden_states.transpose(1, 2))
+ hidden_states = self.activation_function(hidden_states).transpose(1, 2)
+ hidden_states = self.dropout_module(self.ln1(hidden_states))
+ if padding_mask is not None:
+ hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+ hidden_states = self.conv2(hidden_states.transpose(1, 2))
+ hidden_states = self.activation_function(hidden_states).transpose(1, 2)
+ hidden_states = self.dropout_module(self.ln2(hidden_states))
+ return self.proj(hidden_states).squeeze(dim=2)
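+
+ # Shape sketch: `hidden_states` of shape (B, T, embed_dim) becomes (B, T, hidden_dim) after each
+ # conv/LayerNorm stack, and `proj` reduces it to one log-duration per position, shape (B, T)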
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4THifiGan with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2HifiGan(nn.Module):
+ def __init__(self, config: SeamlessM4Tv2Config):
+ super().__init__()
+ model_in_dim = config.unit_embed_dim + config.lang_embed_dim + config.spkr_embed_dim
+ self.leaky_relu_slope = config.leaky_relu_slope
+ self.num_kernels = len(config.resblock_kernel_sizes)
+ self.num_upsamples = len(config.upsample_rates)
+ self.conv_pre = nn.Conv1d(
+ model_in_dim,
+ config.upsample_initial_channel,
+ kernel_size=7,
+ stride=1,
+ padding=3,
+ )
+
+ self.upsampler = nn.ModuleList()
+ for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+ self.upsampler.append(
+ nn.ConvTranspose1d(
+ config.upsample_initial_channel // (2**i),
+ config.upsample_initial_channel // (2 ** (i + 1)),
+ kernel_size=kernel_size,
+ stride=upsample_rate,
+ padding=(kernel_size - upsample_rate) // 2,
+ )
+ )
+
+ self.resblocks = nn.ModuleList()
+ for i in range(len(self.upsampler)):
+ channels = config.upsample_initial_channel // (2 ** (i + 1))
+ for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+ self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
+
+ self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
+
+ def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor:
+ r"""
+ Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
+ of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
+ waveform.
+
+ Args:
+ spectrogram (`torch.FloatTensor`):
+ Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
+ model_in_dim)`, or un-batched and of shape `(sequence_length, model_in_dim)`. Note that `model_in_dim`
+ is the sum of `config.unit_embed_dim`, `config.lang_embed_dim` and `config.spkr_embed_dim`.
+
+ Returns:
+ `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
+ shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
+ """
+
+ hidden_states = self.conv_pre(input_embeds)
+ for i in range(self.num_upsamples):
+ hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+ hidden_states = self.upsampler[i](hidden_states)
+
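+ # multi-receptive-field fusion: average the `num_kernels` parallel residual blocks that share this
+ # upsampling stage but use different kernel sizes and dilations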
+ res_state = self.resblocks[i * self.num_kernels](hidden_states)
+ for j in range(1, self.num_kernels):
+ res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+ hidden_states = res_state / self.num_kernels
+
+ hidden_states = nn.functional.leaky_relu(hidden_states)
+ hidden_states = self.conv_post(hidden_states)
+ hidden_states = torch.tanh(hidden_states)
+
+ # remove the channel dim, which conv_post collapses to 1
+ waveform = hidden_states.squeeze(1)
+
+ return waveform
+
+
+@add_start_docstrings(
+ """Code HiFi-GAN vocoder as described in this [repository](https://github.com/facebookresearch/speech-resynthesis).""",
+ HIFIGAN_START_DOCSTRING,
+)
+class SeamlessM4Tv2CodeHifiGan(PreTrainedModel):
+ config_class = SeamlessM4Tv2Config
+ main_input_name = "input_embeds"
+ _no_split_modules = []
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.pad_token_id = config.t2u_pad_token_id
+ embed_dim = config.unit_embed_dim
+ kernel_size = config.variance_predictor_kernel_size
+ var_pred_dropout = config.var_pred_dropout
+ self.dur_predictor = SeamlessM4Tv2VariancePredictor(embed_dim, embed_dim, kernel_size, var_pred_dropout)
+
+ self.unit_embedding = nn.Embedding(config.unit_hifi_gan_vocab_size, config.unit_embed_dim)
+ self.speaker_embedding = nn.Embedding(config.vocoder_num_spkrs, config.spkr_embed_dim)
+ self.language_embedding = nn.Embedding(config.vocoder_num_langs, config.lang_embed_dim)
+
+ self.hifi_gan = SeamlessM4Tv2HifiGan(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_dur_output_lengths
+ def _get_dur_output_lengths(self, input_ids, dur_out):
+ """
+ Computes the output length after the duration layer.
+ """
+ unit_lengths = (input_ids != self.pad_token_id).sum(1)
+
+ # take care of edge cases where there is no padding or too much padding
+ unit_lengths = torch.clamp(unit_lengths, 0, dur_out.shape[1] - 1)
+
+ cumulative_dur_out = torch.cumsum(dur_out, dim=1)
+ unit_lengths = cumulative_dur_out.gather(dim=1, index=unit_lengths.unsqueeze(1)).squeeze()
+
+ return unit_lengths
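+
+ # Worked example (hypothetical durations): 4 input units with no padding and dur_out = [[2, 3, 1, 4]]
+ # give unit_lengths = 4, clamped to 3; cumsum(dur_out) = [[2, 5, 6, 10]], and gathering at index 3
+ # returns 10, the total number of upsampled frames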
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_output_hifigan_lengths
+ def _get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+ """
+ Computes the output length of the hifigan convolutional layers
+ """
+
+ def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
+ # 1D convolutional layer output length formula taken
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+ return (
+ torch.div(input_length + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") + 1
+ )
+
+ def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
+ return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1
+
+ # conv_pre
+ input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
+
+ # upsampler
+ for i, (upsample_rate, kernel_size) in enumerate(
+ zip(self.config.upsample_rates, self.config.upsample_kernel_sizes)
+ ):
+ input_lengths = _transpose_conv_out_length(
+ input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2
+ )
+
+ # resblock
+ for i in range(len(self.config.upsample_rates)):
+ for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes):
+ for dil in dilation:
+ input_lengths = _conv_out_length(
+ input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil
+ )
+
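+ # each residual block also applies a second stack of convolutions with a fixed dilation of 1;
+ # `dil` only controls how many such dilation-1 layers are accounted for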
+ for dil in dilation:
+ input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1)
+
+ # conv_post
+ input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
+
+ return input_lengths
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.forward with SeamlessM4T->SeamlessM4Tv2, spkr_id->speaker_id
+ def forward(
+ self, input_ids: torch.LongTensor, speaker_id: torch.Tensor, lang_id: torch.Tensor
+ ) -> Tuple[torch.Tensor]:
+ """
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`SeamlessM4Tv2TextToUnitForConditionalGeneration`]. [What are input
+ IDs?](../glossary#input-ids)
+ speaker_id (`torch.Tensor` of shape `(batch_size, 1)`):
+ The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+ lang_id (`torch.Tensor` of shape `(batch_size, 1)`):
+ The id of the target language used for speech synthesis.
+ """
+ hidden_states = self.unit_embedding(input_ids).transpose(1, 2)
+ spkr = self.speaker_embedding(speaker_id).transpose(1, 2)
+ lang = self.language_embedding(lang_id).transpose(1, 2)
+
+ log_dur_pred = self.dur_predictor(hidden_states.transpose(1, 2))
+ dur_out = torch.clamp(torch.round((torch.expm1(log_dur_pred))).long(), min=1)
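+ # e.g. (hypothetical values) log_dur_pred = [[0.7, 0.0, 1.4]] -> dur_out = [[1, 1, 3]]:
+ # expm1 undoes the log-space prediction, and round/clamp keeps every unit >= 1 frame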
+ # B x C x T
+ if hidden_states.size(0) == 1:
+ hidden_states = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2)
+ else:
+ # if batched, each sample must be interleaved and padded separately -> loss of parallelism
+ if hidden_states.shape[0] > 1 and self.training:
+ logger.warning(
+ """`self.training=True` and the input is batched: parallelism is lost during the hifigan
+ forward pass because the samples are interleaved and padded individually."""
+ )
+ hidden_states = [
+ torch.repeat_interleave(hidden_state, duration, dim=-1).transpose(0, 1)
+ for (hidden_state, duration) in zip(hidden_states, dur_out)
+ ]
+
+ hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True).transpose(1, 2)
+
+ spkr = spkr.repeat(1, 1, hidden_states.shape[-1])
+ lang = lang.repeat(1, 1, hidden_states.shape[-1])
+ hidden_states = torch.cat([lang, hidden_states, spkr], dim=1)
+
+ hidden_states = self.hifi_gan(hidden_states)
+
+ unit_lengths = self._get_dur_output_lengths(input_ids, dur_out)
+ lengths = self._get_output_hifigan_lengths(unit_lengths)
+
+ return hidden_states, lengths
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._init_weights
+ def _init_weights(self, module):
+ """Initialize the weights."""
+ if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.apply_weight_norm
+ def apply_weight_norm(self):
+ weight_norm = nn.utils.weight_norm
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
+ weight_norm = nn.utils.parametrizations.weight_norm
+
+ weight_norm(self.hifi_gan.conv_pre)
+ for layer in self.hifi_gan.upsampler:
+ weight_norm(layer)
+ for layer in self.hifi_gan.resblocks:
+ layer.apply_weight_norm()
+ weight_norm(self.hifi_gan.conv_post)
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.remove_weight_norm
+ def remove_weight_norm(self):
+ nn.utils.remove_weight_norm(self.hifi_gan.conv_pre)
+ for layer in self.hifi_gan.upsampler:
+ nn.utils.remove_weight_norm(layer)
+ for layer in self.hifi_gan.resblocks:
+ layer.remove_weight_norm()
+ nn.utils.remove_weight_norm(self.hifi_gan.conv_post)
+
+
+############ WHOLE MODEL related code ################
+
+
+@add_start_docstrings(
+ "The text-to-text SeamlessM4Tv2 Model transformer which can be used for T2TT.",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToText with SeamlessM4T->SeamlessM4Tv2,SeamlessM4Tv2Tokenizer->SeamlessM4TTokenizer, SeamlessM4Tv2Processor->SeamlessM4TProcessor
+class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+ _keys_to_ignore_on_load_missing = ["speech_encoder", "t2u_model", "vocoder"]
+ main_input_name = "input_ids"
+
+ _tied_weights_keys = [
+ "lm_head.weight",
+ "text_encoder.embed_tokens.weight",
+ "text_decoder.embed_tokens.weight",
+ ]
+
+ def __init__(self, config: SeamlessM4Tv2Config):
+ super().__init__(config)
+
+ self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+ self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+ self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_encoder(self):
+ return self.text_encoder
+
+ def get_decoder(self):
+ return self.text_decoder
+
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ def get_input_embeddings(self):
+ return self.text_decoder.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.text_encoder.embed_tokens = value
+ self.text_decoder.embed_tokens = value
+ self.shared = value
+
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.lm_head, self.shared)
+
+ @add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+ if labels is not None:
+ if use_cache:
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+ use_cache = False
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
+ decoder_input_ids = shift_tokens_right(
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
+ )
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if encoder_outputs is None:
+ encoder_outputs = self.text_encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ encoder_attention_mask = attention_mask
+
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+ decoder_outputs = self.text_decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ lm_logits = self.lm_head(decoder_outputs[0])
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ labels = labels.to(lm_logits.device)
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ outputs = decoder_outputs + encoder_outputs
+ output = (lm_logits,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return Seq2SeqLMOutput(
+ loss=masked_lm_loss,
+ logits=lm_logits,
+ past_key_values=decoder_outputs.past_key_values,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+ def generate(
+ self,
+ input_ids=None,
+ tgt_lang=None,
+ generation_config=None,
+ logits_processor=None,
+ stopping_criteria=None,
+ prefix_allowed_tokens_fn=None,
+ synced_gpus=False,
+ **kwargs,
+ ):
+ """
+ Generates sequences of token ids.
+
+ <Tip>
+
+ Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+ model's default generation configuration. You can override any `generation_config` by passing the corresponding
+ parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+ For an overview of generation strategies and code examples, check out the [following
+ guide](./generation_strategies).
+
+ </Tip>
+
+ Parameters:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ tgt_lang (`str`, *optional*):
+ The language to use as target language for translation.
+ generation_config (`~generation.GenerationConfig`, *optional*):
+ The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+ passed to generate matching the attributes of `generation_config` will override them. If
+ `generation_config` is not provided, the default will be used, which has the following loading
+ priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+ configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+ default values, whose documentation should be checked to parameterize generation.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ Custom logits processors that complement the default logits processors built from arguments and
+ generation config. If a logits processor is passed that is already created with the arguments or a
+ generation config, an error is thrown. This feature is intended for advanced users.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ Custom stopping criteria that complement the default stopping criteria built from arguments and a
+ generation config. If a stopping criterion is passed that is already created with the arguments or a
+ generation config, an error is thrown. This feature is intended for advanced users.
+ prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
+ If provided, this function constrains the beam search to allowed tokens only at each step. If not
+ provided, no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+ `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+ on the batch ID `batch_id` and the previously generated tokens `input_ids`. This argument is useful
+ for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+ Retrieval](https://arxiv.org/abs/2010.00904).
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+ `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+ kwargs (`Dict[str, Any]`, *optional*):
+ Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
+ forwarded to the `forward` function of the model.
+
+ Return:
+ [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+ or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`. The possible
+ [`~utils.ModelOutput`] types are:
+ - [`~generation.GenerateEncoderDecoderOutput`],
+ - [`~generation.GenerateBeamEncoderDecoderOutput`]
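+
+ Example:
+
+ A minimal sketch, assuming the public `facebook/seamless-m4t-v2-large` checkpoint and that
+ `"fra"` (French) is a supported target language code for this checkpoint:
+
+ ```python
+ >>> from transformers import AutoProcessor, SeamlessM4Tv2ForTextToText
+
+ >>> processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+ >>> model = SeamlessM4Tv2ForTextToText.from_pretrained("facebook/seamless-m4t-v2-large")
+
+ >>> # translate English text to French text
+ >>> inputs = processor(text="Hello, my dog is cute.", src_lang="eng", return_tensors="pt")
+ >>> output_tokens = model.generate(**inputs, tgt_lang="fra")
+ >>> translated_text = processor.decode(output_tokens[0], skip_special_tokens=True)
+ ```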
+ """
+ # prepare text_decoder_input_ids
+ text_decoder_input_ids = kwargs.pop("decoder_input_ids", None)
+ # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+ if tgt_lang is not None:
+ batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))
+
+ if hasattr(self.generation_config, "text_decoder_lang_to_code_id"):
+ # also accept __xxx__
+ tgt_lang = tgt_lang.replace("__", "")
+ if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
+ raise ValueError(
+ f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
+ {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+ )
+ # tgt_lang gets priority over decoder input ids
+ text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+ text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+ else:
+ raise ValueError(
+ """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps
+ the target language to the right token id. Make sure to load the right generation config."""
+ )
+ else:
+ # only a warning, otherwise errors appear in the tests
+ logger.warning(
+ """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get
+ a correct generation, otherwise the generation will probably make no sense."""
+ )
+
+ return super().generate(
+ input_ids,
+ generation_config,
+ logits_processor,
+ stopping_criteria,
+ prefix_allowed_tokens_fn,
+ synced_gpus,
+ decoder_input_ids=text_decoder_input_ids,
+ **kwargs,
+ )
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ # cached cross_attention states don't have to be reordered -> they are always the same
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+ )
+ return reordered_past
+
+
+@add_start_docstrings(
+ "The speech-to-text SeamlessM4Tv2 Model transformer which can be used for S2TT.",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+ _keys_to_ignore_on_load_missing = ["text_decoder", "t2u_model", "vocoder"]
+ main_input_name = "input_features"
+
+ _tied_weights_keys = [
+ "lm_head.weight",
+ "text_decoder.embed_tokens.weight",
+ ]
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.__init__ with SeamlessM4T->SeamlessM4Tv2
+ def __init__(self, config: SeamlessM4Tv2Config):
+ super().__init__(config)
+
+ self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+ self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+ self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_encoder
+ def get_encoder(self):
+ return self.speech_encoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_decoder
+ def get_decoder(self):
+ return self.text_decoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_output_embeddings
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.set_output_embeddings
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_input_embeddings
+ def get_input_embeddings(self):
+ return self.text_decoder.embed_tokens
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.set_input_embeddings
+ def set_input_embeddings(self, value):
+ self.text_decoder.embed_tokens = value
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText._tie_weights
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.lm_head, self.shared)
+
+ @add_start_docstrings_to_model_forward(M4T_SPEECH_INPUTS_DOCSTRING)
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.forward
+ def forward(
+ self,
+ input_features: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+ if labels is not None:
+ if use_cache:
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+ use_cache = False
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
+ decoder_input_ids = shift_tokens_right(
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
+ )
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if encoder_outputs is None:
+ encoder_outputs = self.speech_encoder(
+ input_features=input_features,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ encoder_attention_mask = attention_mask
+ if attention_mask is not None:
+ sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+ encoder_outputs[0].device
+ )
+ encoder_attention_mask = _compute_new_attention_mask(
+ hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+ )
+
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+ decoder_outputs = self.text_decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ lm_logits = self.lm_head(decoder_outputs[0])
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ labels = labels.to(lm_logits.device)
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ outputs = decoder_outputs + encoder_outputs
+ output = (lm_logits,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return Seq2SeqLMOutput(
+ loss=masked_lm_loss,
+ logits=lm_logits,
+ past_key_values=decoder_outputs.past_key_values,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.generate
+ def generate(
+ self,
+ input_features=None,
+ tgt_lang=None,
+ generation_config=None,
+ logits_processor=None,
+ stopping_criteria=None,
+ prefix_allowed_tokens_fn=None,
+ synced_gpus=False,
+ **kwargs,
+ ):
+ """
+ Generates sequences of token ids.
+
+ <Tip>
+
+ Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+ model's default generation configuration. You can override any `generation_config` by passing the corresponding
+ parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+ For an overview of generation strategies and code examples, check out the [following
+ guide](./generation_strategies).
+
+ </Tip>
+
+ Parameters:
+ input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+ Input audio features. These should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+ [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+
+ tgt_lang (`str`, *optional*):
+ The language to use as target language for translation.
+ generation_config (`~generation.GenerationConfig`, *optional*):
+ The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+ passed to generate matching the attributes of `generation_config` will override them. If
+ `generation_config` is not provided, the default will be used, which has the following loading
+ priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+ configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+ default values, whose documentation should be checked to parameterize generation.
+ logits_processor (`LogitsProcessorList`, *optional*):
+ Custom logits processors that complement the default logits processors built from arguments and
+ generation config. If a logits processor is passed that is already created with the arguments or a
+ generation config, an error is thrown. This feature is intended for advanced users.
+ stopping_criteria (`StoppingCriteriaList`, *optional*):
+ Custom stopping criteria that complement the default stopping criteria built from arguments and a
+ generation config. If a stopping criterion is passed that is already created with the arguments or a
+ generation config, an error is thrown. This feature is intended for advanced users.
+ prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
+ If provided, this function constrains the beam search to allowed tokens only at each step. If not
+ provided, no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+ `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+ on the batch ID `batch_id` and the previously generated tokens `input_ids`. This argument is useful
+ for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+ Retrieval](https://arxiv.org/abs/2010.00904).
+ synced_gpus (`bool`, *optional*, defaults to `False`):
+ Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+ `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+ kwargs (`Dict[str, Any]`, *optional*):
+ Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
+ forwarded to the `forward` function of the model.
+
+ Return:
+ [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+ or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`. The possible
+ [`~utils.ModelOutput`] types are:
+ - [`~generation.GenerateEncoderDecoderOutput`],
+ - [`~generation.GenerateBeamEncoderDecoderOutput`]
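+
+ Example:
+
+ A minimal sketch, assuming the public `facebook/seamless-m4t-v2-large` checkpoint; the silent
+ placeholder array stands in for a real 16 kHz utterance:
+
+ ```python
+ >>> import numpy as np
+ >>> from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToText
+
+ >>> processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+ >>> model = SeamlessM4Tv2ForSpeechToText.from_pretrained("facebook/seamless-m4t-v2-large")
+
+ >>> # 2 seconds of silence at 16 kHz as a placeholder utterance
+ >>> audio = np.zeros(32000, dtype=np.float32)
+ >>> inputs = processor(audios=audio, sampling_rate=16000, return_tensors="pt")
+ >>> output_tokens = model.generate(**inputs, tgt_lang="eng")
+ >>> transcription = processor.decode(output_tokens[0], skip_special_tokens=True)
+ ```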
+ """
+ text_decoder_input_ids = kwargs.pop("decoder_input_ids", None)
+ # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+ input_features = input_features if input_features is not None else kwargs.pop("inputs")
+ if tgt_lang is not None:
+ inputs = kwargs.get("input_embeds") if input_features is None else input_features
+ inputs = (
+ inputs
+ if inputs is not None
+ else kwargs.get("encoder_outputs", {"last_hidden_state": None})["last_hidden_state"]
+ )
+ batch_size = len(inputs)
+
+ if hasattr(self.generation_config, "text_decoder_lang_to_code_id"):
+ # also accept __xxx__
+ tgt_lang = tgt_lang.replace("__", "")
+ if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
+ raise ValueError(
+ f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
+ {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+ )
+ # tgt_lang gets priority over decoder input ids
+ text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+ text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+ else:
+ raise ValueError(
+ """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps
+ the target language to the right token id. Make sure to load the right generation config."""
+ )
+ else:
+ # only a warning, otherwise errors appear in the tests
+ logger.warning(
+ """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get
+ a correct generation, otherwise the generation will probably make no sense."""
+ )
+ return super().generate(
+ input_features,
+ generation_config,
+ logits_processor,
+ stopping_criteria,
+ prefix_allowed_tokens_fn,
+ synced_gpus,
+ decoder_input_ids=text_decoder_input_ids,
+ **kwargs,
+ )
+
+ @staticmethod
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText._reorder_cache
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ # cached cross_attention states don't have to be reordered -> they are always the same
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+ )
+ return reordered_past
+
+
+@add_start_docstrings(
+ "The text-to-speech SeamlessM4Tv2 Model transformer which can be used for T2ST.",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+ _keys_to_ignore_on_load_missing = ["speech_encoder"]
+ main_input_name = "input_ids"
+
+ _tied_weights_keys = [
+ "lm_head.weight",
+ "text_encoder.embed_tokens.weight",
+ "text_decoder.embed_tokens.weight",
+ ]
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.__init__ with SeamlessM4T->SeamlessM4Tv2
+ def __init__(self, config: SeamlessM4Tv2Config):
+ super().__init__(config)
+
+ self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+ self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+ self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+ self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_encoder
+ def get_encoder(self):
+ return self.text_encoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_decoder
+ def get_decoder(self):
+ return self.text_decoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_output_embeddings
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.set_output_embeddings
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_input_embeddings
+ def get_input_embeddings(self):
+ return self.text_decoder.embed_tokens
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.set_input_embeddings
+ def set_input_embeddings(self, value):
+ self.text_encoder.embed_tokens = value
+ self.text_decoder.embed_tokens = value
+ self.shared = value
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech._tie_weights
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.lm_head, self.shared)
+
+ @add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.forward with SeamlessM4T->SeamlessM4Tv2
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+ if labels is not None:
+ if use_cache:
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+ use_cache = False
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
+ decoder_input_ids = shift_tokens_right(
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
+ )
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if encoder_outputs is None:
+ # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+ logger.warning(
+ "This is the same forward method as `SeamlessM4Tv2ForTextToText`."
+ "It doesn't use the text-to-unit model `SeamlessM4Tv2TextToUnitForConditionalGeneration`."
+ "If you want to generate speech, use the `.generate` method."
+ )
+ encoder_outputs = self.text_encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ encoder_attention_mask = attention_mask
+
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+ decoder_outputs = self.text_decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ lm_logits = self.lm_head(decoder_outputs[0])
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ labels = labels.to(lm_logits.device)
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ outputs = decoder_outputs + encoder_outputs
+ output = (lm_logits,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return Seq2SeqLMOutput(
+ loss=masked_lm_loss,
+ logits=lm_logits,
+ past_key_values=decoder_outputs.past_key_values,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+ @torch.no_grad()
+ def generate(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ return_intermediate_token_ids: Optional[bool] = None,
+ tgt_lang: Optional[str] = None,
+ speaker_id: Optional[int] = 0,
+ **kwargs,
+ ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+ """
+ Generates translated audio waveforms.
+
+ <Tip>
+
+ This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+ arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+ that will be passed to one of them.
+
+ For example, calling `.generate(input_ids, num_beams=4, speech_do_sample=True)` will successively perform
+ beam-search decoding on the text model, and multinomial sampling on the non-autoregressive speech model.
+
+ For an overview of generation strategies and code examples, check out the [following
+ guide](./generation_strategies).
+
+ </Tip>
+
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ return_intermediate_token_ids (`bool`, *optional*):
+ If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+ to get translated text alongside the audio.
+ tgt_lang (`str`, *optional*):
+ The language to use as target language for translation.
+ speaker_id (`int`, *optional*, defaults to 0):
+ The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+ kwargs (*optional*):
+ Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+ arguments are of two types:
+
+ - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+ except for `decoder_input_ids` which will only be passed through the text components.
+ - With a *text_* or *speech_* prefix, they will be passed to the `generate` method of the
+ text model and the speech model respectively. They take priority over keywords without a prefix.
+
+ This means you can, for example, specify a generation strategy for one generation but not for the
+ other.
+
+
+ Returns:
+ `Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor]]`:
+ - If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+ - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
+ sequence_length)` and `waveform_lengths`, which gives the length of each sample.
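+
+ Example:
+
+ A minimal sketch, assuming the public `facebook/seamless-m4t-v2-large` checkpoint; note how the
+ `text_` prefix routes `num_beams` to the text sub-model only:
+
+ ```python
+ >>> from transformers import AutoProcessor, SeamlessM4Tv2ForTextToSpeech
+
+ >>> processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+ >>> model = SeamlessM4Tv2ForTextToSpeech.from_pretrained("facebook/seamless-m4t-v2-large")
+
+ >>> inputs = processor(text="Hello, my dog is cute.", src_lang="eng", return_tensors="pt")
+ >>> waveform, waveform_lengths = model.generate(
+ ... **inputs, tgt_lang="fra", text_num_beams=4, speaker_id=0
+ ... )
+ ```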
+ """
+ batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))
+
+ if tgt_lang is None:
+ raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+ else:
+ # also accept __xxx__
+ tgt_lang = tgt_lang.replace("__", "")
+ for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+ lang_code_to_id = getattr(self.generation_config, key, None)
+ if lang_code_to_id is None:
+ raise ValueError(
+ f"""This model generation config doesn't have a `{key}` key which maps the target language
+ to the right token id. Make sure to load the right generation config."""
+ )
+ elif tgt_lang not in lang_code_to_id:
+ raise ValueError(
+ f"""`tgt_lang={tgt_lang}` is not supported by this model.
+ Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+ more languages for text translation than for speech synthesis."""
+ )
+
+ kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+ kwargs_text["output_hidden_states"] = True
+ kwargs_text["return_dict_in_generate"] = True
+ kwargs_text["output_scores"] = True
+
+ text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+
+ # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+ text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+ text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+ kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+ # first generation
+ text_generation_output = super().generate(input_ids, **kwargs_text)
+ sequences = text_generation_output.sequences
+
+ # prepare second generation
+ num_return_sequences = len(sequences) // batch_size
+ attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+ if attention_mask is not None:
+ # repeat attention mask alongside batch dimension
+ attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+ encoder_hidden_states = text_generation_output.encoder_hidden_states[-1]
+
+ # repeat encoder hidden states alongside batch dimension
+ encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+ # get decoder last hidden state - must do a pass through the text decoder
+ t2u_input_embeds = self.text_decoder(
+ input_ids=sequences[:, :-1], # Manually trim the final EOS token
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=attention_mask,
+ ).last_hidden_state
+
+ pad_token_id = self.generation_config.pad_token_id
+
+ # Compute new attention mask
+ seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+ t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+ kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+ # REMOVE EOS and lang_id
+ t2u_input_ids = sequences[:, 2:-1]
+ # replace any remaining EOS tokens with pad
+ t2u_input_ids = torch.masked_fill(
+ t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+ )
+
+ # compute t2u_char_input_ids
+ t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+ t2u_char_count_per_id = self._count_character_length_in_subword(
+ t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+ )
+
+ # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+ pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+ t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+ t2u_char_input_ids = self._get_char_input_ids(
+ t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+ )
+
+ # second pass
+ t2u_output = self.t2u_model(
+ inputs_embeds=t2u_input_embeds,
+ char_input_ids=t2u_char_input_ids,
+ char_count_per_id=t2u_char_count_per_id,
+ **kwargs_speech,
+ )
+
+ t2u_logits = t2u_output[0]
+ padding_mask = t2u_output[1].bool()
+
+ # The text-to-unit model is non-autoregressive. We keep the ability to use sampling with temperature
+ temperature = kwargs_speech.get("temperature", None)
+ if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+ unit_ids = t2u_logits.argmax(dim=-1)
+ else:
+ t2u_logits = t2u_logits / temperature
+ # apply softmax
+ probs = nn.functional.softmax(t2u_logits, dim=-1)
+ # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+ probs = probs.reshape((-1, probs.shape[2]))
+ # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len)
+ unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+ output_unit_ids = unit_ids.detach().clone()
+
+ replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+ # replace eos with pad
+ unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+ # remove the offset of control symbols (pad tokens stay untouched)
+ unit_ids = torch.where(
+ unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+ )
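+ # e.g. (hypothetical values) with t2u_pad_token_id=1 and vocoder_offset=4,
+ # a unit id of 10 becomes vocoder input 6, while pad positions stay at 1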
+
+ vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+ vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+ speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device)
+
+ waveform, waveform_lengths = self.vocoder(
+ input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+ )
+
+ if return_intermediate_token_ids:
+ return SeamlessM4Tv2GenerationOutput(
+ waveform=waveform,
+ waveform_lengths=waveform_lengths,
+ sequences=sequences,
+ unit_sequences=output_unit_ids,
+ )
+
+ return waveform, waveform_lengths
+
+ @staticmethod
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech._reorder_cache
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ # cached cross_attention states don't have to be reordered -> they are always the same
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+ )
+ return reordered_past
+
+
+@add_start_docstrings(
+ "The speech-to-speech SeamlessM4Tv2 Model transformer which can be used for S2ST.",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+ _keys_to_ignore_on_load_missing = ["text_encoder"]
+ main_input_name = "input_features"
+
+ _tied_weights_keys = [
+ "lm_head.weight",
+ "text_decoder.embed_tokens.weight",
+ ]
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.__init__ with SeamlessM4T->SeamlessM4Tv2
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+ self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+ self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+ self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_encoder
+ def get_encoder(self):
+ return self.speech_encoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_decoder
+ def get_decoder(self):
+ return self.text_decoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_output_embeddings
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.set_output_embeddings
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_input_embeddings
+ def get_input_embeddings(self):
+ return self.text_decoder.embed_tokens
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.set_input_embeddings
+ def set_input_embeddings(self, value):
+ self.text_decoder.embed_tokens = value
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech._tie_weights
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.lm_head, self.shared)
+
+ @add_start_docstrings_to_model_forward(M4T_SPEECH_INPUTS_DOCSTRING)
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.forward with SeamlessM4T->SeamlessM4Tv2
+ def forward(
+ self,
+ input_features: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+ if labels is not None:
+ if use_cache:
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+ use_cache = False
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
+ decoder_input_ids = shift_tokens_right(
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
+ )
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if encoder_outputs is None:
+ # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+ logger.warning(
+ "This is the same forward method as `SeamlessM4Tv2ForSpeechToText`. It doesn't use `self.t2u_model`."
+ "If you want to generate speech, use the `generate` method."
+ )
+
+ encoder_outputs = self.speech_encoder(
+ input_features=input_features,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ encoder_attention_mask = attention_mask
+ if attention_mask is not None:
+ sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+ encoder_outputs[0].device
+ )
+ encoder_attention_mask = _compute_new_attention_mask(
+ hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+ )
+
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+ decoder_outputs = self.text_decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ lm_logits = self.lm_head(decoder_outputs[0])
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ labels = labels.to(lm_logits.device)
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ outputs = decoder_outputs + encoder_outputs
+ output = (lm_logits,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return Seq2SeqLMOutput(
+ loss=masked_lm_loss,
+ logits=lm_logits,
+ past_key_values=decoder_outputs.past_key_values,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+ @torch.no_grad()
+ def generate(
+ self,
+ input_features: Optional[torch.Tensor] = None,
+ return_intermediate_token_ids: Optional[bool] = None,
+ tgt_lang: Optional[str] = None,
+ speaker_id: Optional[int] = 0,
+ **kwargs,
+ ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+ """
+ Generates translated audio waveforms.
+
+ <Tip>
+
+ This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+ arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+ that will be passed to one of them.
+
+ For example, calling `.generate(input_features, num_beams=4, speech_do_sample=True)` will successively perform
+ beam-search decoding on the text model, and multinomial sampling on the non-autoregressive speech model.
+
+ For an overview of generation strategies and code examples, check out the [following
+ guide](./generation_strategies).
+
+ </Tip>
+
+ Args:
+ input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+ Input audio features. These should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+ [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+ return_intermediate_token_ids (`bool`, *optional*):
+ If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+ to get translated text alongside the audio.
+ tgt_lang (`str`, *optional*):
+ The language to use as target language for translation.
+ speaker_id (`int`, *optional*, defaults to 0):
+ The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+
+ kwargs (*optional*):
+ Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+ arguments are of two types:
+
+ - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+ except for `decoder_input_ids` which will only be passed through the text components.
+ - With a *text_* or *speech_* prefix, they will be passed to the `generate` method of the
+ text model and the speech model respectively. They take priority over keywords without a prefix.
+
+ This means you can, for example, specify a generation strategy for one generation but not for the
+ other.
+
+
+ Returns:
+ `Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor]]`:
+ - If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+ - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
+ sequence_length)` and `waveform_lengths`, which gives the length of each sample.
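+
+ Example:
+
+ A minimal sketch, assuming the public `facebook/seamless-m4t-v2-large` checkpoint; the silent
+ placeholder array stands in for a real 16 kHz utterance:
+
+ ```python
+ >>> import numpy as np
+ >>> from transformers import AutoProcessor, SeamlessM4Tv2ForSpeechToSpeech
+
+ >>> processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+ >>> model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained("facebook/seamless-m4t-v2-large")
+
+ >>> audio = np.zeros(32000, dtype=np.float32)  # 2 seconds of silence at 16 kHz
+ >>> inputs = processor(audios=audio, sampling_rate=16000, return_tensors="pt")
+ >>> waveform, waveform_lengths = model.generate(**inputs, tgt_lang="fra", speaker_id=0)
+ ```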
+ """
+ batch_size = len(input_features) if input_features is not None else len(kwargs.get("inputs_embeds"))
+
+ if tgt_lang is None:
+ raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+ else:
+ # also accept __xxx__
+ tgt_lang = tgt_lang.replace("__", "")
+ for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+ lang_code_to_id = getattr(self.generation_config, key, None)
+ if lang_code_to_id is None:
+ raise ValueError(
+ f"""This model generation config doesn't have a `{key}` key which maps the target language
+ to the right token id. Make sure to load the right generation config."""
+ )
+ elif tgt_lang not in lang_code_to_id:
+ raise ValueError(
+ f"""`tgt_lang={tgt_lang}` is not supported by this model.
+ Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+ more languages for text translation than for speech synthesis."""
+ )
+
+ kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+ kwargs_text["output_hidden_states"] = True
+ kwargs_text["return_dict_in_generate"] = True
+ kwargs_text["output_scores"] = True
+
+ text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+ # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+ text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+ text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+ kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+ # first generation
+ text_generation_output = super().generate(input_features, **kwargs_text)
+ sequences = text_generation_output.sequences
+
+ # prepare second generation
+ num_return_sequences = len(sequences) // batch_size
+ attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+ # get last_hidden_state from encoder
+ encoder_hidden_states = self.speech_encoder(input_features=input_features, attention_mask=attention_mask)[0]
+
+ # input modality = speech so new attention mask for the decoder
+ if attention_mask is not None:
+ sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+ encoder_hidden_states.device
+ )
+ attention_mask = _compute_new_attention_mask(
+ hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths
+ )
+
+ # repeat attention mask alongside batch dimension
+ attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+
+ # repeat encoder hidden states alongside batch dimension
+ encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+ # get decoder last hidden state - must do a pass through the text decoder
+ t2u_input_embeds = self.text_decoder(
+ input_ids=sequences[:, :-1], # Manually trim the final EOS token
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=attention_mask,
+ ).last_hidden_state
+
+ pad_token_id = self.generation_config.pad_token_id
+
+ # Compute new attention mask
+ seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+ t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+ kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+ # REMOVE EOS and lang_id
+ t2u_input_ids = sequences[:, 2:-1]
+        # replace every remaining EOS token with the pad token
+ t2u_input_ids = torch.masked_fill(
+ t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+ )
+
+ # compute t2u_char_input_ids
+ t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+ t2u_char_count_per_id = self._count_character_length_in_subword(
+ t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+ )
+
+ # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+ pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+ t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+ t2u_char_input_ids = self._get_char_input_ids(
+ t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+ )
+
+ # second pass
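+        # the non-autoregressive text-to-unit model maps decoder hidden states and char-level ids to unit logits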
+ t2u_output = self.t2u_model(
+ inputs_embeds=t2u_input_embeds,
+ char_input_ids=t2u_char_input_ids,
+ char_count_per_id=t2u_char_count_per_id,
+ **kwargs_speech,
+ )
+
+ t2u_logits = t2u_output[0]
+ padding_mask = t2u_output[1].bool()
+
+        # The text-to-unit model is non-autoregressive. We keep the ability to use sampling with temperature
+ temperature = kwargs_speech.get("temperature", None)
+ if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+ unit_ids = t2u_logits.argmax(dim=-1)
+ else:
+ t2u_logits = t2u_logits / temperature
+ # apply softmax
+ probs = nn.functional.softmax(t2u_logits, dim=-1)
+ # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+ probs = probs.reshape((-1, probs.shape[2]))
+            # multinomial then reshape: (batch_size*seq_len) -> (batch_size, seq_len)
+ unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+ output_unit_ids = unit_ids.detach().clone()
+
+ replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+        # replace EOS tokens and padded positions with the pad token
+ unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+        # account for the control-symbol offset: shift every non-pad unit id down by `vocoder_offset`
+ unit_ids = torch.where(
+ unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+ )
+
+ vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+ vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+ speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device)
+
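+        # convert the predicted unit ids into a waveform with the code HiFi-GAN vocoder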
+ waveform, waveform_lengths = self.vocoder(
+ input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+ )
+
+ if return_intermediate_token_ids:
+ return SeamlessM4Tv2GenerationOutput(
+ waveform=waveform,
+ waveform_lengths=waveform_lengths,
+ sequences=sequences,
+ unit_sequences=output_unit_ids,
+ )
+
+ return waveform, waveform_lengths
+
+ @staticmethod
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech._reorder_cache
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ # cached cross_attention states don't have to be reordered -> they are always the same
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+ )
+ return reordered_past
+
+
+@add_start_docstrings(
+    "The original SeamlessM4Tv2 Model transformer which can be used for all the tasks available (S2ST, S2TT, T2TT, T2ST).",
+ SEAMLESS_M4T_V2_START_DOCSTRING,
+ """
+ current_modality (`str`, *optional*, defaults to `"text"`):
+ Default modality. Used only to initialize the model. It can be set to `"text"` or `"speech"`.
+        This is updated automatically according to the modality passed to the `forward` and `generate` methods
+        (`input_ids` for text and `input_features` for audio).
+ """,
+)
+class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+ _tied_weights_keys = [
+ "lm_head.weight",
+ "text_encoder.embed_tokens.weight",
+ "text_decoder.embed_tokens.weight",
+ ]
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.__init__ with SeamlessM4T->SeamlessM4Tv2
+ def __init__(self, config, current_modality="text"):
+ super().__init__(config)
+
+ self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+ self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+ self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+ self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ self.current_modality = current_modality
+ if current_modality == "speech":
+ self.main_input_name = "input_features"
+
+ # these models already call post_init in their initialization
+ self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+ self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_modality
+ def set_modality(self, modality="text"):
+ if modality == "text":
+ self.main_input_name = "input_ids"
+ self.current_modality = "text"
+ elif modality == "speech":
+ self.main_input_name = "input_features"
+ self.current_modality = "speech"
+ else:
+ raise ValueError(f"`modality={modality}` is not a valid modality. It must be `text` or `speech`.")
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_encoder
+ def get_encoder(self):
+ if self.current_modality == "text":
+ return self.text_encoder
+ else:
+ return self.speech_encoder
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_output_embeddings
+ def get_output_embeddings(self):
+ return self.lm_head
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_output_embeddings
+ def set_output_embeddings(self, new_embeddings):
+ self.lm_head = new_embeddings
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_input_embeddings
+ def get_input_embeddings(self):
+ return self.text_decoder.embed_tokens
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_input_embeddings
+ def set_input_embeddings(self, value):
+ self.text_encoder.embed_tokens = value
+ self.text_decoder.embed_tokens = value
+ self.shared = value
+
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel._tie_weights
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+ self._tie_or_clone_weights(self.lm_head, self.shared)
+
+ @add_start_docstrings_to_model_forward(M4T_MODEL_INPUTS_DOCSTRING)
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.forward with SeamlessM4T->SeamlessM4Tv2
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ input_features: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if labels is not None:
+ if use_cache:
+ logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+ use_cache = False
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
+ decoder_input_ids = shift_tokens_right(
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
+ )
+
+ if input_ids is None and input_features is None and inputs_embeds is None and encoder_outputs is None:
+ raise ValueError(
+                "`input_ids`, `input_features`, `inputs_embeds` and `encoder_outputs` are all empty. Make sure at least one of them is not."
+ )
+ elif input_features is not None:
+ if input_ids is not None:
+ logger.warning(
+                    "`input_ids` is not `None` but `input_features` has been given. "
+ "`input_features` will be used in priority through the `speech_encoder`. "
+ "Make sure that `input_features` and `input_ids` are mutually exclusive."
+ )
+
+ if inputs_embeds is not None:
+ logger.warning(
+                    "`inputs_embeds` is not `None` but `input_features` has been given. "
+ "`input_features` will be used in priority through `speech_encoder`. "
+ "`inputs_embeds` will be ignored."
+ )
+
+ # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+ logger.warning(
+                "This calls the same method `forward` as `SeamlessM4Tv2ForTextToText` and `SeamlessM4Tv2ForSpeechToText` "
+ "depending on the input modality. If you want to generate speech, use the `generate` method."
+ )
+
+ self.set_modality("speech")
+
+ encoder_outputs = self.speech_encoder(
+ input_features=input_features,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ elif input_ids is not None or inputs_embeds is not None:
+ # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+ logger.warning(
+                "This calls the same method `forward` as `SeamlessM4Tv2ForTextToText` and `SeamlessM4Tv2ForSpeechToText` "
+ "depending on the input modality. If you want to generate speech, use the `generate` method."
+ )
+ self.set_modality("text")
+ encoder_outputs = self.text_encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ encoder_attention_mask = attention_mask
+ # input modality = speech so new attention mask
+ if self.current_modality == "speech" and attention_mask is not None:
+ sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+ encoder_outputs[0].device
+ )
+ encoder_attention_mask = _compute_new_attention_mask(
+ hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+ )
+
+        # decoder outputs consist of (dec_features, past_key_value, dec_hidden, dec_attn)
+ decoder_outputs = self.text_decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ lm_logits = self.lm_head(decoder_outputs[0])
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ labels = labels.to(lm_logits.device)
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ outputs = decoder_outputs + encoder_outputs
+ output = (lm_logits,) + outputs[1:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return Seq2SeqLMOutput(
+ loss=masked_lm_loss,
+ logits=lm_logits,
+ past_key_values=decoder_outputs.past_key_values,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+ @torch.no_grad()
+ def generate(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ input_features: Optional[torch.Tensor] = None,
+ return_intermediate_token_ids: Optional[bool] = None,
+ tgt_lang: Optional[str] = None,
+ speaker_id: Optional[int] = 0,
+ generate_speech: Optional[bool] = True,
+ **kwargs,
+ ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+ """
+ Generates translated token ids and/or translated audio waveforms.
+
+ This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+ arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+ that will be passed to one of them.
+
+ For example, calling `.generate(input_ids=input_ids, num_beams=4, speech_do_sample=True)` will successively
+        perform beam-search decoding on the text model, and multinomial sampling on the speech model.
+
+ For an overview of generation strategies and code examples, check out the [following
+ guide](./generation_strategies).
+
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+ [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+ return_intermediate_token_ids (`bool`, *optional*):
+ If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio. Note that if `generate_speech=False`, this parameter will be
+                ignored.
+ tgt_lang (`str`, *optional*):
+ The language to use as target language for translation.
+ speaker_id (`int`, *optional*, defaults to 0):
+ The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+ generate_speech (`bool`, *optional*, defaults to `True`):
+                If `False`, only returns the text tokens and does not generate speech.
+
+ kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+ arguments are of two types:
+
+ - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+ except for `decoder_input_ids` which will only be passed through the text components.
+                - With a *text_* or *speech_* prefix, they will be passed to the `generate` method of the
+                text model and speech model respectively. They take priority over the keywords without a prefix.
+
+                This means you can, for example, specify a generation strategy for one generation but not for the
+                other, as shown in the example below.
+
+ Returns:
+ `Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor], ModelOutput]`:
+ - If `generate_speech` and `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+ - If `generate_speech` and not `return_intermediate_token_ids`, returns a tuple composed of waveforms of
+                shape `(batch_size, sequence_length)` and `waveform_lengths`, which gives the length in samples of each waveform.
+            - If `generate_speech=False`, it returns a `ModelOutput`.
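+
+        Example (a minimal sketch; the checkpoint name used below is an assumption, substitute the SeamlessM4Tv2
+        checkpoint you actually rely on):
+
+        ```python
+        >>> from transformers import AutoProcessor, SeamlessM4Tv2Model
+
+        >>> # hypothetical checkpoint name, replace with the checkpoint you load in practice
+        >>> processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+        >>> model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
+
+        >>> inputs = processor(text="Hello, my dog is cute", src_lang="eng", return_tensors="pt")
+
+        >>> # text-to-speech: `text_num_beams` is only forwarded to the text sub-model
+        >>> waveform, waveform_lengths = model.generate(**inputs, tgt_lang="fra", text_num_beams=4)
+
+        >>> # text-to-text only: skip speech synthesis and decode the returned token ids
+        >>> output_tokens = model.generate(**inputs, tgt_lang="fra", generate_speech=False)
+        >>> translated_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
+        ```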
+ """
+ if input_ids is None and input_features is None and kwargs.get("inputs_embeds", None) is None:
+ raise ValueError(
+                "`input_ids`, `input_features` and `inputs_embeds` are all empty. Make sure at least one of them is not."
+ )
+
+ if generate_speech and tgt_lang is None:
+ raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+
+ if tgt_lang is not None:
+ # also accept __xxx__
+ tgt_lang = tgt_lang.replace("__", "")
+ if generate_speech:
+ keys_to_check = ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]
+ else:
+ keys_to_check = ["text_decoder_lang_to_code_id"]
+ for key in keys_to_check:
+ lang_code_to_id = getattr(self.generation_config, key, None)
+ if lang_code_to_id is None:
+ raise ValueError(
+ f"""This model generation config doesn't have a `{key}` key which maps the target language
+ to the right token id. Make sure to load the right generation config."""
+ )
+ elif tgt_lang not in lang_code_to_id:
+ raise ValueError(
+ f"""`tgt_lang={tgt_lang}` is not supported by this model.
+ Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+ more languages for text translation than for speech synthesis."""
+ )
+
+ batch_size = (
+ len(input_features)
+ if input_features is not None
+ else (len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")))
+ )
+
+ kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+ kwargs_text["output_hidden_states"] = True
+ kwargs_text["return_dict_in_generate"] = True
+ kwargs_text["output_scores"] = True
+
+ text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+        # overwrite text_decoder_input_ids if tgt_lang is passed; tgt_lang takes priority over decoder_input_ids.
+ if tgt_lang is not None:
+ # tgt_lang gets priority over decoder input ids
+ text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+ text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+ kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+ # first generation
+ if input_features is not None:
+ self.set_modality("speech")
+ if input_ids is not None:
+ logger.warning(
+                    "`input_features` and `input_ids` are both non-empty. `input_features` will be used in priority "
+ "through the speech encoder. Make sure `input_features=None` if you want to use the text encoder."
+ )
+ text_generation_output = super().generate(input_features=input_features, **kwargs_text)
+ else:
+ self.set_modality("text")
+ text_generation_output = super().generate(input_ids=input_ids, input_features=None, **kwargs_text)
+ sequences = text_generation_output.sequences
+
+ if not generate_speech:
+ return text_generation_output
+
+ # prepare second generation
+ num_return_sequences = len(sequences) // batch_size
+ attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+ # get encoder last hidden states
+ if self.current_modality == "speech":
+ # get last_hidden_state from encoder - must do a pass through the speech encoder
+ encoder_hidden_states = self.speech_encoder(
+ input_features=input_features, attention_mask=attention_mask
+ ).last_hidden_state
+
+ # input modality = speech so new attention mask for the decoder
+ if attention_mask is not None:
+ sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+ encoder_hidden_states.device
+ )
+ attention_mask = _compute_new_attention_mask(
+ hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths
+ )
+ else:
+ encoder_hidden_states = text_generation_output.encoder_hidden_states[-1]
+
+ if attention_mask is not None:
+ # repeat attention mask alongside batch dimension
+ attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+
+        # repeat encoder hidden states alongside batch dimension
+ encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+ # get decoder last hidden state - must do a pass through the text decoder
+ t2u_input_embeds = self.text_decoder(
+ input_ids=sequences[:, :-1], # Manually trim the final EOS token
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=attention_mask,
+ ).last_hidden_state
+
+ pad_token_id = self.generation_config.pad_token_id
+
+ # Compute new attention mask
+ seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+ t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+ kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+ # REMOVE EOS and lang_id
+ t2u_input_ids = sequences[:, 2:-1]
+        # replace every remaining EOS token with the pad token
+ t2u_input_ids = torch.masked_fill(
+ t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+ )
+
+ # compute t2u_char_input_ids
+ t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+ t2u_char_count_per_id = self._count_character_length_in_subword(
+ t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+ )
+
+ # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+ pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+ t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+ t2u_char_input_ids = self._get_char_input_ids(
+ t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+ )
+
+ # second pass
+ t2u_output = self.t2u_model(
+ inputs_embeds=t2u_input_embeds,
+ char_input_ids=t2u_char_input_ids,
+ char_count_per_id=t2u_char_count_per_id,
+ **kwargs_speech,
+ )
+
+ t2u_logits = t2u_output[0]
+ padding_mask = t2u_output[1].bool()
+
+        # The text-to-unit model is non-autoregressive. We keep the ability to use sampling with temperature
+ temperature = kwargs_speech.get("temperature", None)
+ if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+ unit_ids = t2u_logits.argmax(dim=-1)
+ else:
+ t2u_logits = t2u_logits / temperature
+ # apply softmax
+ probs = nn.functional.softmax(t2u_logits, dim=-1)
+ # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+ probs = probs.reshape((-1, probs.shape[2]))
+            # multinomial then reshape: (batch_size*seq_len) -> (batch_size, seq_len)
+ unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+ output_unit_ids = unit_ids.detach().clone()
+
+ replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+        # replace EOS tokens and padded positions with the pad token
+ unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+        # account for the control-symbol offset: shift every non-pad unit id down by `vocoder_offset`
+ unit_ids = torch.where(
+ unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+ )
+
+ vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+ vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+ speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device)
+
+ waveform, waveform_lengths = self.vocoder(
+ input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+ )
+
+ if return_intermediate_token_ids:
+ return SeamlessM4Tv2GenerationOutput(
+ waveform=waveform,
+ waveform_lengths=waveform_lengths,
+ sequences=sequences,
+ unit_sequences=output_unit_ids,
+ )
+
+ return waveform, waveform_lengths
+
+ @staticmethod
+ # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel._reorder_cache
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ # cached cross_attention states don't have to be reordered -> they are always the same
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+ )
+ return reordered_past
+
+
+__all__ = [
+ "SeamlessM4Tv2ForTextToSpeech",
+ "SeamlessM4Tv2ForSpeechToSpeech",
+ "SeamlessM4Tv2ForTextToText",
+ "SeamlessM4Tv2ForSpeechToText",
+ "SeamlessM4Tv2Model",
+ "SeamlessM4Tv2PreTrainedModel",
+]