File size: 38,723 Bytes

cc34926

# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import math
from typing import Callable, Optional, Union, Any, Dict

import wandb
import torch
from torch import nn
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.generation import GenerationMixin
from transformers.modeling_layers import GradientCheckpointingLayer
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from transformers.utils.generic import check_model_inputs
from transformers.models.auto import AutoModel, AutoModelForCausalLM
from .configuration_dixtral import DixtralConfig, DixtralEncoderConfig
from transformers.models.voxtral import  VoxtralConfig
from transformers.generation.utils import  GenerationConfig, LogitsProcessorList
from src.models.dicow.FDDT import FDDT
from src.models.dicow.layers import CustomLinear, CustomDiagonalLinear
from src.models.dixtral.decoding import CTCRescorerLogitsProcessorWithPruning


logger = logging.get_logger(__name__)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    dropout: float = 0.0,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None and attention_mask.ndim == 4:
        attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    if head_mask is not None:
        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class CTCProcessorDummy:
    def __init__(self):
        super().__init__()
        self.func = None
    def set_func(self,func):
        self.func = func
    def __call__(self, input_ids_orig: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        return self.func(input_ids_orig, scores)

class VoxtralAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        layer_idx: Optional[int] = None,
        config: Optional[VoxtralConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if (self.head_dim * num_heads) != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        if layer_idx is None and is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class."
            )
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, _ = hidden_states.size()

        # Scaling is susceptible to floating point arithmetics' inprecisions
        # which can lead to different results (this is dependent from model
        # to model, e.g. whisper is one such case). We therefore keep the
        # original order of scaling to follow the original implementation
        # and enforce no scaling (1.0) in the attention call below.
        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=1.0,
            output_attentions=output_attentions,
            head_mask=layer_head_mask,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class VoxtralEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.embed_dim = config.d_model

        self.self_attn = VoxtralAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_head_mask: torch.Tensor,
        output_attentions: bool = False,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        if hidden_states.dtype == torch.float16:
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        return hidden_states, attn_weights


@auto_docstring
class DixtralPreTrainedModel(PreTrainedModel):
    config: DixtralConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = None
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True

    def _init_weights(self, module):
        # important: this ported version of Voxtral isn't meant for training from scratch - only
        # inference and fine-tuning - so the proper init weights code has been removed
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.audio_config.initializer_range
        )

        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, (CustomLinear, CustomDiagonalLinear)):
            module.reset_parameters()


@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class DixtralEncoder(DixtralPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`VoxtralEncoderLayer`].

    Args:
        config: VoxtralEncoderConfig
    """

    # Ignore copy
    config: DixtralEncoderConfig
    main_input_name = "input_features"
    _no_split_modules = ["VoxtralEncoderLayer"]
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    def __init__(self, config: DixtralEncoderConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)

        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
        self.embed_positions.requires_grad_(False)

        self.layers = nn.ModuleList([VoxtralEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)
        # Ignore copy
        self.avg_pooler = nn.AvgPool1d(2, stride=2)

        self._init_dicow_components(config)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _init_dicow_components(self, config):
        """Initialize DiCoW-specific components"""
        if not config.use_dicow_encoder:
            return

        # FDDT components
        if config.use_fddt:
            num_fddts = (config.apply_fddt_to_n_layers
                        if config.apply_fddt_to_n_layers != -1
                        else len(self.layers))
            self.fddts = nn.ModuleList([
                FDDT(
                    d_model=config.d_model,
                    non_target_rate=1.0,
                    fddt_init=config.fddt_init,
                    is_diagonal=config.fddt_is_diagonal,
                    bias_only=config.fddt_bias_only,
                    use_silence=config.fddt_use_silence,
                    use_target=config.fddt_use_target,
                    use_overlap=config.fddt_use_overlap,
                    use_non_target=config.fddt_use_non_target,
                )
                for _ in range(num_fddts)
            ])

            if config.use_pre_pos_fddt:
                self.initial_fddt = FDDT(
                    d_model=config.d_model,
                    non_target_rate=config.non_target_fddt_value,
                    fddt_init=config.fddt_init,
                    is_diagonal=config.fddt_is_diagonal,
                    bias_only=config.fddt_bias_only,
                    use_silence=config.fddt_use_silence,
                    use_target=config.fddt_use_target,
                    use_overlap=config.fddt_use_overlap,
                    use_non_target=config.fddt_use_non_target,
                )

        # For CTC label processing
        self.first_task_token = config.vocab_size - 30 * 50 - 1 - 6

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value


    @check_model_inputs
    def forward(
        self,
        input_features,
        attention_mask=None,
        stno_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        r"""
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`)`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram are ignored.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Qwen2Audio expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
        inputs_embeds = inputs_embeds.permute(0, 2, 1)

        # Apply initial FDDT if configured
        if (self.config.use_dicow_encoder and
                self.config.use_fddt and
                self.config.use_pre_pos_fddt and
                hasattr(self, 'initial_fddt')):
            inputs_embeds = self.initial_fddt(inputs_embeds, stno_mask)

        embed_pos = self.embed_positions.weight
        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):

            if (self.config.use_dicow_encoder and
                self.config.use_fddt and
                hasattr(self, 'fddts') and
                idx < len(self.fddts)):
                hidden_states = self.fddts[idx](hidden_states, stno_mask)

            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                layer_head_mask=None,
            )
            hidden_states = layer_outputs[0]

        hidden_states = self.layer_norm(hidden_states)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
        )


    # Ignore copy
    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers and the output length of the audio encoder
        """
        input_lengths = (input_lengths - 1) // 2 + 1
        output_lengths = (input_lengths - 2) // 2 + 1
        return input_lengths, output_lengths


class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    """
)
class DixtralForConditionalGeneration(DixtralPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    _keep_in_fp32_modules_strict = ["embed_positions"]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = DixtralEncoder(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)

        self.num_soft_prompts = config.num_soft_prompts
        if self.num_soft_prompts > 0:
            self.soft_prompt_token_id = getattr(config, "soft_prompt_token_id", 23)

            self.soft_prompt = nn.Parameter(
                torch.randn(1, self.num_soft_prompts, config.text_config.hidden_size)
            )

        self._init_dicow_components(config)
        # Initialize weights and apply final processing
        self.post_init()

    def _init_dicow_components(self, config):
        self.ctc_weight = config.audio_config.ctc_weight

        # Additional layers for CTC
        if config.audio_config.additional_layer and self.ctc_weight > 0.0:
            custom_conf = copy.deepcopy(config.audio_config)
            custom_conf.d_model = config.text_config.hidden_size
            custom_conf.encoder_attention_heads = config.text_config.num_attention_heads
            custom_conf.encoder_ffn_dim = custom_conf.d_model * 2
            self.additional_layer = VoxtralEncoderLayer(custom_conf)

        if config.audio_config.additional_self_attention_layer and self.ctc_weight > 0.0:
            self.additional_self_attention_layer = VoxtralAttention(
                embed_dim=config.text_config.hidden_size,
                num_heads=config.text_config.num_attention_heads,
                dropout=config.text_config.attention_dropout,
                config=config.audio_config,  # Fixed: pass audio_config which is VoxtralConfig
            )

        # CTC head
        if self.ctc_weight > 0.0:
            self.ctc_lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
            self.ctc_lm_head.weight = self.language_model.get_input_embeddings().weight
    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def get_audio_embeds(self, input_features: torch.FloatTensor, stno_mask: torch.FloatTensor):
        """
        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.
        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        """
        audio_outputs = self.audio_tower(input_features, stno_mask=stno_mask)
        audio_hidden_states = audio_outputs.last_hidden_state
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        return audio_embeds

    def set_tokenizer(self, tokenizer):
        self.tokenizer = tokenizer


    def possibly_update_last_hidden_states(self, hidden_states):
        """DiCoW post-processing for CTC"""
        if not self.config.audio_config.use_dicow_encoder:
            return hidden_states

        if hasattr(self, "additional_layer"):
            hidden_states, _ = self.additional_layer(
                hidden_states,
                attention_mask=None,
                layer_head_mask=None,
                output_attentions=False,
            )
        elif hasattr(self, "additional_self_attention_layer"):
            hidden_states, _ = self.additional_self_attention_layer(
                hidden_states,
                attention_mask=None,
                layer_head_mask=None,
                output_attentions=False,
            )

        return hidden_states

    def get_enc_logits(self, hidden_states):
        """
        Get CTC logits from encoder hidden states.
        Applies optional additional processing layer and projects to vocabulary.

        Args:
            hidden_states: Encoder output hidden states

        Returns:
            logits: CTC logits of shape (batch_size, seq_len, vocab_size + 1)
        """
        hidden_states = self.possibly_update_last_hidden_states(hidden_states)
        logits = self.ctc_lm_head(hidden_states)
        return logits

    def right_pad_labels(self, labels, pad_value=-100):
        """
        labels: (B, L) tensor possibly left/right padded
        returns: right-padded labels only
        """
        B, L = labels.shape
        new_labels = torch.full_like(labels, pad_value)
        max_len = 1
        for b in range(B):
            valid = labels[b][labels[b] != pad_value]
            max_len = max(max_len, len(valid))
            new_labels[b, :valid.numel()] = valid

        new_labels = new_labels[:, :max_len]

        return new_labels

    def get_ctc_loss(self, logits, labels, input_lengths):

        """Compute CTC loss for DiCoW"""
        if labels.max() >= self.config.text_config.vocab_size:
            raise ValueError(f"Label values must be <= vocab_size: {self.config.text_config.vocab_size}")

        # Assuming that padded tokens are filled with -100
        labels_mask = labels >= 0
        target_lengths = labels_mask.sum(-1)

        # CTC loss doesn't support fp16
        log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

        with torch.backends.cudnn.flags(enabled=True):
            ctc_loss = nn.functional.ctc_loss(
                log_probs,
                labels,
                input_lengths,
                target_lengths,
                blank=logits.shape[-1] - 1,
                reduction=self.config.audio_config.ctc_loss_reduction,
                zero_infinity=True,
            )

        return ctc_loss

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        stno_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "audio",
                        "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
                    },
                    {"type": "text", "text": "What can you tell me about this audio?"},
                ],
            }
        ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        ctc_loss = None
        if input_features is not None:
            # Get audio encoder outputs
            audio_outputs = self.audio_tower(input_features, stno_mask=stno_mask)
            audio_hidden_states = audio_outputs.last_hidden_state

            # Project audio features for language model
            audio_hidden_states_flat = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
            audio_embeds_flat = self.multi_modal_projector(audio_hidden_states_flat)

            # Replace text-audio token placeholders with audio embeddings
            audio_token_mask = input_ids == self.config.audio_token_id
            inputs_embeds[audio_token_mask] = audio_embeds_flat

            if self.num_soft_prompts > 0:
                prompt_mask = (input_ids == self.soft_prompt_token_id)

                if prompt_mask.any():
                    batch_size = inputs_embeds.shape[0]

                    # Expand the learned soft prompts to [Batch_Size, Num_Soft_Tokens, Hidden_Size]
                    # Then flatten to [Batch_Size * Num_Soft_Tokens, Hidden_Size] to match the mask
                    prompts_expanded = self.soft_prompt.expand(batch_size, -1, -1).reshape(-1,
                                                                                           self.config.text_config.hidden_size)

                    # Replace embeddings
                    inputs_embeds[prompt_mask] = prompts_expanded

            # Compute CTC loss on projected embeddings if configured
            if (self.config.audio_config.use_dicow_encoder and
                    self.config.audio_config.ctc_weight > 0.0 and
                    labels is not None and
                    self.training and
                    audio_token_mask is not None) or hasattr(self, "ctc_rescorer"):

                # Create tensor with shape of input_ids filled with zeros
                batch_size, seq_len = input_ids.shape
                hidden_dim = audio_embeds_flat.shape[-1]
                ctc_embeds = torch.empty(
                    batch_size, seq_len, hidden_dim,
                    device=audio_embeds_flat.device,
                    dtype=audio_embeds_flat.dtype
                )

                # Fill with audio_embeds at audio_token positions
                ctc_embeds[audio_token_mask] = audio_embeds_flat
                ctc_embeds_detached = ctc_embeds.detach()

                # 2. Force it to require gradients so the additional_layer
                # builds a backward graph for its own weights
                ctc_embeds_detached.requires_grad_(True)

                # Remove values outside maximum valid range using audio_mask
                enc_output_lens = audio_token_mask.sum(dim=1)
                max_valid_len = enc_output_lens.max().item()
                first_audio_token = audio_token_mask.int().argmax(dim=1).min().item()  # First True position per batch
                ctc_embeds = ctc_embeds[:, first_audio_token:first_audio_token+max_valid_len, :]

                # Get encoder logits for CTC
                enc_logits = self.get_enc_logits(ctc_embeds)

                if hasattr(self, "ctc_rescorer"):
                    rescorer = CTCRescorerLogitsProcessorWithPruning(
                        enc_logits,
                        torch.full((enc_logits.shape[0],), fill_value=enc_logits.shape[1],
                                   device=enc_logits.device),
                        enc_logits.shape[-1] - 1,
                        self.generation_config.pad_token_id,
                        self.generation_config.eos_token_id,
                        self.generation_config.bos_token_id,
                        self.tokenizer,
                        0,
                        self.generation_config.ctc_weight,
                        self.generation_config.num_beams,
                        False,
                    )
                    self.ctc_rescorer.set_func(func=rescorer)

                if labels is not None:
                    # Prepare encoder labels
                    enc_labels = labels.clone()

                    # Replace EOS tokens with ignore index
                    enc_labels[enc_labels == self.config.text_config.eos_token_id] = -100
                    enc_labels = self.right_pad_labels(enc_labels)

                    # Compute CTC loss
                    ctc_loss = self.get_ctc_loss(enc_logits, enc_labels, enc_output_lens)

        outputs: BaseModelOutputWithPast = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if ctc_loss is not None and outputs.loss is not None:
            if wandb.run is not None:
                wandb.log({"dec_loss": outputs.loss, "ctc_loss": ctc_loss})
            total_loss = outputs.loss + self.config.audio_config.ctc_weight * ctc_loss
            outputs.loss = total_loss
        elif ctc_loss is not None:
            outputs.loss = self.config.audio_config.ctc_weight * ctc_loss


        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        # Overwritten -- we should not pass input_features/stno_mask when in cached decoding stage

        input_features = kwargs.pop("input_features", None)
        stno_mask = kwargs.pop("stno_mask", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if cache_position is not None and cache_position[0] == 0:
            # Only pass audio inputs on the first (prefill) step
            model_inputs["input_features"] = input_features
            model_inputs["stno_mask"] = stno_mask

        return model_inputs


    def _get_logits_processor(
        self,
        generation_config: GenerationConfig,
        input_ids_seq_length: Optional[int] = None,
        encoder_input_ids: torch.LongTensor = None,
        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
        logits_processor: Optional[LogitsProcessorList] = None,
        device: Optional[str] = None,
        model_kwargs: Optional[dict[str, Any]] = None,
        negative_prompt_ids: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
    ) -> LogitsProcessorList:
        # pylint: disable=no-member
        gen_config_copy = copy.deepcopy(generation_config)
        processors = super()._get_logits_processor(
            gen_config_copy,
            input_ids_seq_length,
            encoder_input_ids,
            prefix_allowed_tokens_fn,
            logits_processor,
            device,
            model_kwargs,
            negative_prompt_ids,
            negative_prompt_attention_mask,
        )
        if hasattr(generation_config, "ctc_weight") and generation_config.ctc_weight > 0:
            self.ctc_rescorer = CTCProcessorDummy
            processors.append(self.ctc_rescorer)
        return processors

    @torch.no_grad()
    def decode_ctc(
            self,
            input_ids: torch.LongTensor,
            input_features: torch.FloatTensor,
            stno_mask: Optional[torch.Tensor] = None,
    ) -> tuple[None, torch.LongTensor]:
        """
        Performs greedy CTC decoding on the audio input.
        """

        audio_outputs = self.audio_tower(input_features, stno_mask=stno_mask)
        audio_hidden_states = audio_outputs.last_hidden_state

        # Project audio features for language model
        audio_hidden_states_flat = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds_flat = self.multi_modal_projector(audio_hidden_states_flat)

        # Replace text-audio token placeholders with audio embeddings
        audio_token_mask = input_ids == self.config.audio_token_id

        # Create tensor with shape of input_ids filled with zeros
        batch_size, seq_len = input_ids.shape
        hidden_dim = audio_embeds_flat.shape[-1]
        ctc_embeds = torch.empty(
            batch_size, seq_len, hidden_dim,
            device=audio_embeds_flat.device,
            dtype=audio_embeds_flat.dtype
        )

        # Fill with audio_embeds at audio_token positions
        ctc_embeds[audio_token_mask] = audio_embeds_flat

        # Remove values outside maximum valid range using audio_mask
        enc_output_lens = audio_token_mask.sum(dim=1)
        max_valid_len = enc_output_lens.max().item()
        first_audio_token = audio_token_mask.int().argmax(dim=1).min().item()  # First True position per batch
        ctc_embeds = ctc_embeds[:, first_audio_token:first_audio_token + max_valid_len, :]

        # Get encoder logits for CTC
        logits = self.get_enc_logits(ctc_embeds)

        # 4. Greedy Decoding
        predicted_ids = torch.argmax(logits, dim=-1)

        # Blank token is the last index in the vocabulary (vocab_size - 1)
        # Based on: blank=logits.shape[-1] - 1 in get_ctc_loss
        blank_id = self.config.text_config.vocab_size - 1

        sequences = []

        for batch_idx in range(batch_size):
            ids = predicted_ids[batch_idx].cpu().tolist()

            # CTC Collapse:
            # 1. Merge adjacent duplicates
            # 2. Remove blank tokens
            collapsed_ids = []
            prev_id = -1

            for token_id in ids:
                if token_id != prev_id:
                    if token_id != blank_id:
                        collapsed_ids.append(token_id)
                    prev_id = token_id

            sequences.append(torch.tensor(collapsed_ids, dtype=torch.long))

        return None, torch.nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=-100).to(input_ids.device)

__all__ = ["DixtralPreTrainedModel", "DixtralEncoder", "DixtralForConditionalGeneration"]