flaubert commited on
Commit
d66ae9a
·
verified ·
1 Parent(s): 9dd2d61

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. modeling_pantagruel_uni.py +729 -65
modeling_pantagruel_uni.py CHANGED
@@ -52,7 +52,10 @@ from transformers.modeling_outputs import (
52
  QuestionAnsweringModelOutput,
53
  SequenceClassifierOutput,
54
  TokenClassifierOutput,
 
 
55
  )
 
56
  from .configuration_pantagruel_uni import (
57
  PantagruelUniConfig,
58
  PantagruelModalityConfig,
@@ -83,8 +86,6 @@ class PantagruelUniBaseModelOutput(ModelOutput):
83
  attentions: Optional[tuple[torch.FloatTensor, ...]] = None
84
 
85
 
86
- #################################################
87
- ### modeling_pantagruel_uni_base.py
88
  # copied from fairseq.modules.grad_multiply
89
  class GradMultiply(torch.autograd.Function):
90
  @staticmethod
@@ -98,7 +99,7 @@ class GradMultiply(torch.autograd.Function):
98
  return grad * ctx.scale, None
99
 
100
 
101
- # Copied from fairseq.modules.transpose_last.py
102
  class TransposeLast(nn.Module):
103
  def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
104
  super().__init__()
@@ -111,7 +112,7 @@ class TransposeLast(nn.Module):
111
  return x.transpose(self.tranpose_dim, -1)
112
 
113
 
114
- # Copied from fairseq.modules.layer_norm.py
115
  class Fp32LayerNorm(nn.LayerNorm):
116
  def __init__(self, *args, **kwargs):
117
  super().__init__(*args, **kwargs)
@@ -125,7 +126,7 @@ class Fp32LayerNorm(nn.LayerNorm):
125
  self.eps,
126
  )
127
  return output.type_as(input)
128
-
129
 
130
  def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
131
  return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
@@ -457,6 +458,7 @@ class BlockEncoder(nn.Module):
457
  return x
458
 
459
 
 
460
  class ModalitySpecificEncoder(nn.Module):
461
  def __init__(
462
  self,
@@ -820,6 +822,7 @@ class ModalitySpecificEncoder(nn.Module):
820
  return x
821
 
822
 
 
823
  class AudioEncoder(ModalitySpecificEncoder):
824
 
825
  modality_cfg: PantagruelAudioConfig
@@ -952,6 +955,7 @@ class AudioEncoder(ModalitySpecificEncoder):
952
  return padding_mask
953
 
954
 
 
955
  class LearnedPositionalEmbedding(nn.Embedding):
956
  """
957
  This module learns positional embeddings up to a fixed maximum size.
@@ -1001,6 +1005,7 @@ class LearnedPositionalEmbedding(nn.Embedding):
1001
  )
1002
 
1003
 
 
1004
  class SinusoidalPositionalEmbedding(nn.Module):
1005
  """This module produces sinusoidal positional embeddings of any length.
1006
 
@@ -1098,7 +1103,9 @@ class SinusoidalPositionalEmbedding(nn.Module):
1098
  .view(bsz, seq_len, -1)
1099
  .detach()
1100
  )
1101
-
 
 
1102
  def PositionalEmbedding(
1103
  num_embeddings: int,
1104
  embedding_dim: int,
@@ -1125,6 +1132,7 @@ def PositionalEmbedding(
1125
  return m
1126
 
1127
 
 
1128
  class TextLocalEncoder(nn.Module):
1129
  def __init__(
1130
  self,
@@ -1246,7 +1254,6 @@ class TextEncoder(ModalitySpecificEncoder):
1246
  ), f"{x.size(1), padding_mask.size(1), diff, self.downsample}"
1247
 
1248
  return padding_mask
1249
- #################################################
1250
 
1251
 
1252
  # copied from transformers.models.data2vec.modeling_data2vec.PantagruelUniTextPooler
@@ -1265,6 +1272,64 @@ class PantagruelUniTextPooler(nn.Module):
1265
  return pooled_output
1266
 
1267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1268
  class PantagruelUniPreTrainedModel(PreTrainedModel):
1269
  config_class = PantagruelUniConfig
1270
  base_model_prefix = "pantagruel_uni"
@@ -1310,27 +1375,60 @@ class PantagruelUniPreTrainedModel(PreTrainedModel):
1310
  else:
1311
  _init(module)
1312
 
1313
- # @classmethod
1314
- # def from_pretrained(
1315
- # cls,
1316
- # pretrained_model_name_or_path,
1317
- # *model_args,
1318
- # **kwargs,
1319
- # ):
1320
- # config = cls.config_class()
1321
- # config.from_pretrained(pretrained_model_name_or_path)
1322
- # print(f"Loading configuration from pre-trained model: {type(config)}")
1323
- # return super().from_pretrained(pretrained_model_name_or_path,
1324
- # *model_args,
1325
- # config,
1326
- # **kwargs,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1327
 
1328
 
 
1329
  class PantagruelUniModel(PantagruelUniPreTrainedModel):
1330
 
1331
  def __init__(
1332
  self, config: PantagruelUniConfig, add_pooling_layer: bool = True
1333
  ):
 
 
 
 
1334
  super().__init__(config)
1335
  self.config = config
1336
  modalities_cfg = config.modalities
@@ -1390,10 +1488,12 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
1390
  self.post_init()
1391
 
1392
  def get_input_embeddings(self):
1393
- return self.modality_encoders["TEXT"].local_encoder.embed_tokens
 
1394
 
1395
  def set_input_embeddings(self, value):
1396
- self.modality_encoders["TEXT"].local_encoder.embed_tokens = value
 
1397
 
1398
  def freeze_feature_extractor(self):
1399
  """
@@ -1414,6 +1514,14 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
1414
  """
1415
  for mod in self.modalities:
1416
  self.modality_encoders[mod]._freeze_parameters()
 
 
 
 
 
 
 
 
1417
  for block in self.blocks:
1418
  for p in block.parameters():
1419
  p.requires_grad = False
@@ -1447,6 +1555,7 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
1447
  self,
1448
  input_values=None, # audio input
1449
  input_ids=None, # text input
 
1450
  attention_mask=None,
1451
  padding_mask=None,
1452
  mask=False,
@@ -1454,12 +1563,68 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
1454
  output_hidden_states=True,
1455
  output_attn_weights=False,
1456
  return_dict=True,
1457
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1458
  if mode is None:
1459
  mode = "TEXT" if input_ids is not None else "AUDIO"
1460
 
1461
  if padding_mask is None and attention_mask is not None:
1462
- padding_mask = ~attention_mask # attention mask: 1 means to attend to (not masked), 0 means not to attend to (masked). padding mask: 1 means padded (not attend to), 0 means not padded (to attend to)
1463
 
1464
  feature_extractor = self.modality_encoders[mode]
1465
  extractor_out = feature_extractor(
@@ -1598,7 +1763,7 @@ class PantagruelTextClassificationHead(nn.Module):
1598
 
1599
  @auto_docstring
1600
  class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
1601
- _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1602
 
1603
  def __init__(self, config):
1604
  super().__init__(config)
@@ -1663,10 +1828,13 @@ class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
1663
  )
1664
 
1665
 
 
 
 
1666
  @auto_docstring(
1667
  custom_intro="""
1668
- PantagruelText Model transformer with a sequence classification/regression head on top (a linear layer on top of the
1669
- pooled output) e.g. for GLUE tasks.
1670
  """
1671
  )
1672
  class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
@@ -1674,64 +1842,157 @@ class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
1674
  super().__init__(config)
1675
  self.num_labels = config.num_labels
1676
  self.config = config
1677
-
1678
  self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
1679
- self.classifier = PantagruelTextClassificationHead(config)
 
 
 
 
 
 
 
 
 
 
1680
 
1681
  # Initialize weights and apply final processing
1682
  self.post_init()
1683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1684
  @can_return_tuple
1685
  @auto_docstring
1686
  def forward(
1687
  self,
 
1688
  input_ids: Optional[torch.LongTensor] = None,
1689
  attention_mask: Optional[torch.FloatTensor] = None,
1690
  padding_mask: Optional[torch.FloatTensor] = None,
 
 
 
1691
  labels: Optional[torch.LongTensor] = None,
1692
  **kwargs: Unpack[TransformersKwargs],
1693
  ) -> Union[tuple, SequenceClassifierOutput]:
1694
  r"""
1695
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1696
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1697
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1698
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
 
 
 
 
 
 
 
 
 
 
 
 
 
1699
  """
1700
- outputs = self.pantagruel_uni(
1701
- input_ids=input_ids,
1702
- attention_mask=attention_mask,
1703
- padding_mask=padding_mask,
1704
- mask=False,
1705
- mode="TEXT",
1706
- return_dict=True,
1707
- )
1708
- sequence_output = outputs.last_hidden_state
1709
- logits = self.classifier(sequence_output)
1710
 
1711
- loss = None
1712
- if labels is not None:
1713
- labels = labels.to(logits.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1714
 
1715
- if self.config.problem_type is None:
1716
- if self.num_labels == 1:
1717
- self.config.problem_type = "regression"
1718
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1719
- self.config.problem_type = "single_label_classification"
1720
- else:
1721
- self.config.problem_type = "multi_label_classification"
 
 
 
 
 
 
 
 
 
 
1722
 
1723
- if self.config.problem_type == "regression":
1724
- loss_fct = MSELoss()
1725
- if self.num_labels == 1:
1726
- loss = loss_fct(logits.squeeze(), labels.squeeze())
1727
- else:
1728
- loss = loss_fct(logits, labels)
1729
- elif self.config.problem_type == "single_label_classification":
 
 
 
 
 
 
1730
  loss_fct = CrossEntropyLoss()
1731
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1732
- elif self.config.problem_type == "multi_label_classification":
1733
- loss_fct = BCEWithLogitsLoss()
1734
- loss = loss_fct(logits, labels)
 
1735
 
1736
  return SequenceClassifierOutput(
1737
  loss=loss,
@@ -1952,6 +2213,408 @@ class PantagruelUniForQuestionAnswering(PantagruelUniPreTrainedModel):
1952
  attentions=outputs.attentions,
1953
  )
1954
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1955
 
1956
  __all__ = [
1957
  "PantagruelUniForMaskedLM",
@@ -1961,4 +2624,5 @@ __all__ = [
1961
  "PantagruelUniForTokenClassification",
1962
  "PantagruelUniModel",
1963
  "PantagruelUniPreTrainedModel",
 
1964
  ]
 
52
  QuestionAnsweringModelOutput,
53
  SequenceClassifierOutput,
54
  TokenClassifierOutput,
55
+ CausalLMOutput,
56
+ XVectorOutput,
57
  )
58
+ from transformers.utils import auto_docstring, is_peft_available
59
  from .configuration_pantagruel_uni import (
60
  PantagruelUniConfig,
61
  PantagruelModalityConfig,
 
86
  attentions: Optional[tuple[torch.FloatTensor, ...]] = None
87
 
88
 
 
 
89
  # copied from fairseq.modules.grad_multiply
90
  class GradMultiply(torch.autograd.Function):
91
  @staticmethod
 
99
  return grad * ctx.scale, None
100
 
101
 
102
+ # copied from fairseq.modules.transpose_last.py
103
  class TransposeLast(nn.Module):
104
  def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
105
  super().__init__()
 
112
  return x.transpose(self.tranpose_dim, -1)
113
 
114
 
115
+ # copied from fairseq.modules.layer_norm.py
116
  class Fp32LayerNorm(nn.LayerNorm):
117
  def __init__(self, *args, **kwargs):
118
  super().__init__(*args, **kwargs)
 
126
  self.eps,
127
  )
128
  return output.type_as(input)
129
+
130
 
131
  def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
132
  return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
 
458
  return x
459
 
460
 
461
+ # copied from fairseq.examples.data2vec.models
462
  class ModalitySpecificEncoder(nn.Module):
463
  def __init__(
464
  self,
 
822
  return x
823
 
824
 
825
+ # copied from fairseq.examples.data2vec.models.modalities.audio
826
  class AudioEncoder(ModalitySpecificEncoder):
827
 
828
  modality_cfg: PantagruelAudioConfig
 
955
  return padding_mask
956
 
957
 
958
+ # copied from fairseq
959
  class LearnedPositionalEmbedding(nn.Embedding):
960
  """
961
  This module learns positional embeddings up to a fixed maximum size.
 
1005
  )
1006
 
1007
 
1008
+ # copied from fairseq
1009
  class SinusoidalPositionalEmbedding(nn.Module):
1010
  """This module produces sinusoidal positional embeddings of any length.
1011
 
 
1103
  .view(bsz, seq_len, -1)
1104
  .detach()
1105
  )
1106
+
1107
+
1108
+ # copied from fairseq.modules
1109
  def PositionalEmbedding(
1110
  num_embeddings: int,
1111
  embedding_dim: int,
 
1132
  return m
1133
 
1134
 
1135
+ # copied from fairseq.examples.data2vec.modules
1136
  class TextLocalEncoder(nn.Module):
1137
  def __init__(
1138
  self,
 
1254
  ), f"{x.size(1), padding_mask.size(1), diff, self.downsample}"
1255
 
1256
  return padding_mask
 
1257
 
1258
 
1259
  # copied from transformers.models.data2vec.modeling_data2vec.PantagruelUniTextPooler
 
1272
  return pooled_output
1273
 
1274
 
1275
+ # copied from transformers.models.data2vec.modeling_data2vec_audio
1276
+ class AMSoftmaxLoss(nn.Module):
1277
+ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
1278
+ super().__init__()
1279
+ self.scale = scale
1280
+ self.margin = margin
1281
+ self.num_labels = num_labels
1282
+ self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
1283
+ self.loss = nn.CrossEntropyLoss()
1284
+
1285
+ def forward(self, hidden_states, labels):
1286
+ labels = labels.flatten()
1287
+ weight = nn.functional.normalize(self.weight, dim=0)
1288
+ hidden_states = nn.functional.normalize(hidden_states, dim=1)
1289
+ cos_theta = torch.mm(hidden_states, weight)
1290
+ psi = cos_theta - self.margin
1291
+
1292
+ onehot = nn.functional.one_hot(labels, self.num_labels)
1293
+ logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
1294
+ loss = self.loss(logits, labels)
1295
+
1296
+ return loss
1297
+
1298
+
1299
+ # copied from transformers.models.data2vec.modeling_data2vec_audio
1300
+ class TDNNLayer(nn.Module):
1301
+ def __init__(self, config, layer_id=0):
1302
+ super().__init__()
1303
+ self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
1304
+ self.out_conv_dim = config.tdnn_dim[layer_id]
1305
+ self.kernel_size = config.tdnn_kernel[layer_id]
1306
+ self.dilation = config.tdnn_dilation[layer_id]
1307
+
1308
+ self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
1309
+ self.activation = nn.ReLU()
1310
+
1311
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1312
+ if is_peft_available():
1313
+ from peft.tuners.lora import LoraLayer
1314
+
1315
+ if is_peft_available():
1316
+ if isinstance(self.kernel, LoraLayer):
1317
+ warnings.warn(
1318
+ "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
1319
+ "You should exclude TDNNLayer from LoRA's target modules.",
1320
+ )
1321
+
1322
+ # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
1323
+ hidden_states = hidden_states.transpose(1, 2)
1324
+ weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
1325
+ hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
1326
+ hidden_states = hidden_states.transpose(1, 2)
1327
+
1328
+ hidden_states = self.activation(hidden_states)
1329
+ return hidden_states
1330
+
1331
+
1332
+ @auto_docstring
1333
  class PantagruelUniPreTrainedModel(PreTrainedModel):
1334
  config_class = PantagruelUniConfig
1335
  base_model_prefix = "pantagruel_uni"
 
1375
  else:
1376
  _init(module)
1377
 
1378
+ def _get_feat_extract_output_lengths(
1379
+ self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
1380
+ ):
1381
+ """
1382
+ Computes the output length of the convolutional layers
1383
+ """
1384
+
1385
+ add_adapter = self.config.modalities.audio.add_adapter if add_adapter is None else add_adapter
1386
+
1387
+ def _conv_out_length(input_length, kernel_size, stride):
1388
+ # 1D convolutional layer output length formula taken
1389
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
1390
+ return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
1391
+
1392
+ for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
1393
+ input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
1394
+
1395
+ if add_adapter:
1396
+ for _ in range(self.config.num_adapter_layers):
1397
+ input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
1398
+
1399
+ return input_lengths
1400
+
1401
+ def _get_feature_vector_attention_mask(
1402
+ self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
1403
+ ):
1404
+ # Effectively attention_mask.sum(-1), but not inplace to be able to run
1405
+ # on inference mode.
1406
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
1407
+
1408
+ output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
1409
+ output_lengths = output_lengths.to(torch.long)
1410
+
1411
+ batch_size = attention_mask.shape[0]
1412
+
1413
+ attention_mask = torch.zeros(
1414
+ (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
1415
+ )
1416
+ # these two operations makes sure that all values before the output lengths idxs are attended to
1417
+ attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
1418
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
1419
+ return attention_mask
1420
 
1421
 
1422
+ @auto_docstring
1423
  class PantagruelUniModel(PantagruelUniPreTrainedModel):
1424
 
1425
  def __init__(
1426
  self, config: PantagruelUniConfig, add_pooling_layer: bool = True
1427
  ):
1428
+ r"""
1429
+ add_pooling_layer (bool, *optional*, defaults to `True`):
1430
+ Whether to add a pooling layer
1431
+ """
1432
  super().__init__(config)
1433
  self.config = config
1434
  modalities_cfg = config.modalities
 
1488
  self.post_init()
1489
 
1490
  def get_input_embeddings(self):
1491
+ if "TEXT" in self.modality_encoders:
1492
+ return self.modality_encoders["TEXT"].local_encoder.embed_tokens
1493
 
1494
  def set_input_embeddings(self, value):
1495
+ if "TEXT" in self.modality_encoders:
1496
+ self.modality_encoders["TEXT"].local_encoder.embed_tokens = value
1497
 
1498
  def freeze_feature_extractor(self):
1499
  """
 
1514
  """
1515
  for mod in self.modalities:
1516
  self.modality_encoders[mod]._freeze_parameters()
1517
+
1518
+ def freeze_base_model(self):
1519
+ """
1520
+ Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1521
+ not be updated during training.
1522
+ """
1523
+ for mod in self.modalities:
1524
+ self.modality_encoders[mod]._freeze_parameters()
1525
  for block in self.blocks:
1526
  for p in block.parameters():
1527
  p.requires_grad = False
 
1555
  self,
1556
  input_values=None, # audio input
1557
  input_ids=None, # text input
1558
+ token_type_ids=None,
1559
  attention_mask=None,
1560
  padding_mask=None,
1561
  mask=False,
 
1563
  output_hidden_states=True,
1564
  output_attn_weights=False,
1565
  return_dict=True,
1566
+ ) -> Union[Tuple, PantagruelUniBaseModelOutput]:
1567
+ r"""
1568
+ Performs a forward pass of the model for either audio or text inputs.
1569
+
1570
+ The modality is automatically inferred if `mode` is not provided:
1571
+ `"TEXT"` is used when `input_ids` is specified, otherwise `"AUDIO"`.
1572
+
1573
+ Args:
1574
+ input_values (`torch.FloatTensor`, *optional*):
1575
+ Audio input values of shape `(batch_size, sequence_length)`
1576
+ containing *normalized* audio samples
1577
+ Required when operating in `"AUDIO"` mode.
1578
+
1579
+ input_ids (`torch.LongTensor`, *optional*):
1580
+ Tokenized text input IDs of shape `(batch_size, sequence_length)`.
1581
+ Required when operating in `"TEXT"` mode.
1582
+
1583
+ attention_mask (`torch.LongTensor`, *optional*):
1584
+ Attention mask for text inputs, with values in `{0, 1}`:
1585
+ - `1` for tokens that should be attended to,
1586
+ - `0` for tokens that should be masked.
1587
+ If provided and `padding_mask` is `None`, it will be converted internally
1588
+ to a padding mask.
1589
+
1590
+ padding_mask (`torch.BoolTensor` or `torch.LongTensor`, *optional*):
1591
+ Padding mask indicating which positions are padded:
1592
+ - `1` (or `True`) for padded positions (not attended to),
1593
+ - `0` (or `False`) for non-padded positions.
1594
+ If not provided and `attention_mask` is given, this is inferred as
1595
+ the logical negation of `attention_mask`.
1596
+
1597
+ mask (`bool`, *optional*, defaults to `False`):
1598
+ Whether to apply input masking.
1599
+
1600
+ mode (`str`, *optional*):
1601
+ Explicitly specifies the input modality. Supported values are
1602
+ `"TEXT"` and `"AUDIO"`. If `None`, the mode is inferred from the
1603
+ provided inputs.
1604
+
1605
+ output_hidden_states (`bool`, *optional*, defaults to `True`):
1606
+ Whether to return the hidden states of all layers.
1607
+
1608
+ output_attn_weights (`bool`, *optional*, defaults to `False`):
1609
+ Whether to return attention weights.
1610
+
1611
+ return_dict (`bool`, *optional*, defaults to `True`):
1612
+ Whether to return a [`ModelOutput`] instead of a plain tuple.
1613
+
1614
+ Returns:
1615
+ [`ModelOutput`] or `tuple`:
1616
+ The model outputs. If `return_dict=True`, a [`ModelOutput`] is returned
1617
+ containing (depending on configuration) the final hidden states,
1618
+ optional hidden states from all layers, and optional attention weights.
1619
+ If `return_dict=False`, a tuple is returned with the same contents in
1620
+ a fixed order.
1621
+ """
1622
+
1623
  if mode is None:
1624
  mode = "TEXT" if input_ids is not None else "AUDIO"
1625
 
1626
  if padding_mask is None and attention_mask is not None:
1627
+ padding_mask = ~attention_mask.bool() # attention mask: 1 means to attend to (not masked), 0 means not to attend to (masked). padding mask: 1 means padded (not attend to), 0 means not padded (to attend to)
1628
 
1629
  feature_extractor = self.modality_encoders[mode]
1630
  extractor_out = feature_extractor(
 
1763
 
1764
  @auto_docstring
1765
  class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
1766
+ # _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1767
 
1768
  def __init__(self, config):
1769
  super().__init__(config)
 
1828
  )
1829
 
1830
 
1831
+ _HIDDEN_STATES_START_POSITION = 2
1832
+
1833
+
1834
  @auto_docstring(
1835
  custom_intro="""
1836
+ PantagruelUniModel with a sequence classification or regression head on top (a linear layer applied to a pooled representation of the sequence).
1837
+ This model supports text and audio modalities. The classification head and internal processing are selected automatically based on the configuration.
1838
  """
1839
  )
1840
  class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
 
1842
  super().__init__(config)
1843
  self.num_labels = config.num_labels
1844
  self.config = config
 
1845
  self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
1846
+
1847
+ if config.supported_modality == "TEXT":
1848
+ logger.info("Initializing PantagruelUniForSequenceClassification for TEXT")
1849
+ self.classifier = PantagruelTextClassificationHead(config)
1850
+ elif config.supported_modality == "AUDIO":
1851
+ logger.info("Initializing PantagruelUniForSequenceClassification for AUDIO")
1852
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1853
+ if config.modalities.audio.use_weighted_layer_sum:
1854
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1855
+ self.projector = nn.Linear(config.hidden_size, config.modalities.audio.classifier_proj_size)
1856
+ self.classifier = nn.Linear(config.modalities.audio.classifier_proj_size, config.num_labels)
1857
 
1858
  # Initialize weights and apply final processing
1859
  self.post_init()
1860
 
1861
+ def freeze_feature_extractor(self):
1862
+ """
1863
+ Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1864
+ not be updated during training.
1865
+ """
1866
+ warnings.warn(
1867
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
1868
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
1869
+ FutureWarning,
1870
+ )
1871
+ self.freeze_feature_encoder()
1872
+
1873
+ def freeze_feature_encoder(self):
1874
+ """
1875
+ Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1876
+ not be updated during training.
1877
+ """
1878
+ self.pantagruel_uni.freeze_feature_encoder()
1879
+
1880
+ def freeze_base_model(self):
1881
+ """
1882
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
1883
+ be updated during training. Only the classification head will be updated.
1884
+ """
1885
+ for param in self.pantagruel_uni.parameters():
1886
+ param.requires_grad = False
1887
+
1888
  @can_return_tuple
1889
  @auto_docstring
1890
  def forward(
1891
  self,
1892
+ input_values: Optional[torch.FloatTensor] = None,
1893
  input_ids: Optional[torch.LongTensor] = None,
1894
  attention_mask: Optional[torch.FloatTensor] = None,
1895
  padding_mask: Optional[torch.FloatTensor] = None,
1896
+ output_attentions: Optional[bool] = None,
1897
+ output_hidden_states: Optional[bool] = None,
1898
+ return_dict: Optional[bool] = None,
1899
  labels: Optional[torch.LongTensor] = None,
1900
  **kwargs: Unpack[TransformersKwargs],
1901
  ) -> Union[tuple, SequenceClassifierOutput]:
1902
  r"""
1903
+ Performs a forward pass for sequence classification or regression.
1904
+
1905
+ This method supports both **text** and **audio** inputs. The modality is inferred
1906
+ from the provided inputs and the model configuration.
1907
+
1908
+ Args:
1909
+ input_values (`torch.FloatTensor`, *optional*):
1910
+ Audio input values of shape `(batch_size, sequence_length)`
1911
+ containing *normalized* audio samples.
1912
+ input_ids (`torch.LongTensor`, *optional*):
1913
+ Tokenized text input IDs of shape `(batch_size, sequence_length)`.
1914
+ Used when the model is configured for `"TEXT"` modality.
1915
+
1916
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1917
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1918
+ If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1919
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1920
  """
1921
+ if self.config.supported_modality == "TEXT":
1922
+ outputs = self.pantagruel_uni(
1923
+ input_ids=input_ids,
1924
+ attention_mask=attention_mask,
1925
+ padding_mask=padding_mask,
1926
+ mask=False,
1927
+ mode="TEXT",
1928
+ return_dict=True,
1929
+ )
 
1930
 
1931
+ sequence_output = outputs.last_hidden_state
1932
+ logits = self.classifier(sequence_output)
1933
+
1934
+ loss = None
1935
+ if labels is not None:
1936
+ labels = labels.to(logits.device)
1937
+
1938
+ if self.config.problem_type is None:
1939
+ if self.num_labels == 1:
1940
+ self.config.problem_type = "regression"
1941
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1942
+ self.config.problem_type = "single_label_classification"
1943
+ else:
1944
+ self.config.problem_type = "multi_label_classification"
1945
+
1946
+ if self.config.problem_type == "regression":
1947
+ loss_fct = MSELoss()
1948
+ if self.num_labels == 1:
1949
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1950
+ else:
1951
+ loss = loss_fct(logits, labels)
1952
+ elif self.config.problem_type == "single_label_classification":
1953
+ loss_fct = CrossEntropyLoss()
1954
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1955
+ elif self.config.problem_type == "multi_label_classification":
1956
+ loss_fct = BCEWithLogitsLoss()
1957
+ loss = loss_fct(logits, labels)
1958
 
1959
+ else:
1960
+ outputs = self.pantagruel_uni(
1961
+ input_values=input_values,
1962
+ attention_mask=attention_mask,
1963
+ mask=False,
1964
+ mode="AUDIO",
1965
+ output_hidden_states=output_hidden_states,
1966
+ output_attn_weights=output_attentions,
1967
+ return_dict=return_dict,
1968
+ )
1969
+ if self.config.modalities.audio.use_weighted_layer_sum:
1970
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
1971
+ hidden_states = torch.stack(hidden_states, dim=1)
1972
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
1973
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
1974
+ else:
1975
+ hidden_states = outputs[0]
1976
 
1977
+ hidden_states = self.projector(hidden_states)
1978
+ if attention_mask is None:
1979
+ pooled_output = hidden_states.mean(dim=1)
1980
+ else:
1981
+ padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
1982
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
1983
+ hidden_states[~expand_padding_mask] = 0.0
1984
+ pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
1985
+
1986
+ logits = self.classifier(pooled_output)
1987
+
1988
+ loss = None
1989
+ if labels is not None:
1990
  loss_fct = CrossEntropyLoss()
1991
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1992
+
1993
+ if not return_dict:
1994
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1995
+ return ((loss,) + output) if loss is not None else output
1996
 
1997
  return SequenceClassifierOutput(
1998
  loss=loss,
 
2213
  attentions=outputs.attentions,
2214
  )
2215
 
2216
class PantagruelUniForAudioFrameClassification(PantagruelUniPreTrainedModel):
    """Pantagruel-Uni audio model with a frame-level classification head on top
    (one label per encoder frame), e.g. for speaker diarization or
    voice-activity detection."""

    def __init__(self, config):
        super().__init__(config)

        self.config = config

        # Adapter layers change the frame rate of the encoder output, which is
        # incompatible with per-frame classification.
        if hasattr(config.modalities.audio, "add_adapter") and config.modalities.audio.add_adapter:
            raise ValueError(
                "Audio frame classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)"
            )

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.modalities.audio.use_weighted_layer_sum:
            # One learnable scalar weight per layer, initialized to a uniform average.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, TokenClassifierOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted-layer-sum head needs every layer's hidden states, so force
        # them on regardless of what the caller asked for.
        output_hidden_states = (
            True if self.config.modalities.audio.use_weighted_layer_sum
            else output_hidden_states
        )

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        if self.config.modalities.audio.use_weighted_layer_sum:
            # Softmax-normalized learned combination of all layer outputs.
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # NOTE(review): assumes frame-level one-/multi-hot labels of shape
            # (batch, seq_len, num_labels); argmax recovers the per-frame class
            # index -- confirm against the data collator.
            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            # Prepend the loss when available, consistent with the other audio
            # heads in this file (CTC, x-vector).
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+
2327
+
2328
@auto_docstring(
    custom_intro="""
    PantagruelUniForCTC Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    """
)
class PantagruelUniForCTC(PantagruelUniPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.final_dropout)

        if config.modalities.audio.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `PantagruelUniForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )
        # With an adapter on the audio branch the encoder output dimension may
        # differ from the transformer hidden size.
        output_hidden_size = (
            config.modalities.audio.output_hidden_size
            if hasattr(config.modalities.audio, "add_adapter") and config.modalities.audio.add_adapter
            else config.hidden_size
        )
        self.lm_head = nn.Linear(output_hidden_size, config.modalities.audio.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None and labels.max() >= self.config.modalities.audio.vocab_size:
            raise ValueError(f"Label values must be smaller than vocab_size: {self.config.modalities.audio.vocab_size}")

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # retrieve loss input_lengths from attention_mask
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

            # assuming that padded tokens are filled with -100
            # when not being attended to
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # ctc_loss doesn't support fp16
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            # cuDNN's CTC kernel is non-deterministic / restrictive, so disable it.
            with torch.backends.cudnn.flags(enabled=False):
                # NOTE(review): blank token id is read from the top-level config
                # (self.config.pad_token_id), unlike vocab_size which comes from
                # config.modalities.audio -- confirm this is intentional.
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
+
2461
+
2462
class PantagruelUniForXVector(PantagruelUniPreTrainedModel):
    """Pantagruel-Uni audio model with an x-vector head (TDNN stack + statistics
    pooling) on top, e.g. for speaker verification."""

    def __init__(self, config):
        super().__init__(config)

        self.config = config

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.modalities.audio.use_weighted_layer_sum:
            # One learnable scalar weight per layer, initialized to a uniform average.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.projector = nn.Linear(config.hidden_size, config.modalities.audio.tdnn_dim[0])

        tdnn_layers = [
            TDNNLayer(config.modalities.audio, i) for i in range(len(config.modalities.audio.tdnn_dim))
        ]
        self.tdnn = nn.ModuleList(tdnn_layers)

        # Maps concatenated [mean, std] statistics to the x-vector embedding space.
        self.feature_extractor = nn.Linear(
            config.modalities.audio.tdnn_dim[-1] * 2, config.modalities.audio.xvector_output_dim
        )
        self.classifier = nn.Linear(
            config.modalities.audio.xvector_output_dim, config.modalities.audio.xvector_output_dim
        )

        self.objective = AMSoftmaxLoss(
            config.modalities.audio.xvector_output_dim, config.num_labels
        )

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the TDNN layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return (input_length - kernel_size) // stride + 1

        # TDNN layers use stride 1.
        for kernel_size in self.config.modalities.audio.tdnn_kernel:
            input_lengths = _conv_out_length(input_lengths, kernel_size, 1)

        return input_lengths

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, XVectorOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted-layer-sum head needs every layer's hidden states.
        output_hidden_states = True if self.config.modalities.audio.use_weighted_layer_sum else output_hidden_states

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        # Read the flag from the audio modality config, consistent with
        # __init__ and the output_hidden_states override above (the top-level
        # config does not carry `use_weighted_layer_sum`).
        if self.config.modalities.audio.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)

        for tdnn_layer in self.tdnn:
            hidden_states = tdnn_layer(hidden_states)

        # Statistic Pooling: aggregate frame-level features into a single
        # utterance-level [mean, std] vector, ignoring padded frames.
        if attention_mask is None:
            mean_features = hidden_states.mean(dim=1)
            std_features = hidden_states.std(dim=1)
        else:
            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
            mean_features = []
            std_features = []
            for i, length in enumerate(tdnn_output_lengths):
                mean_features.append(hidden_states[i, :length].mean(dim=0))
                std_features.append(hidden_states[i, :length].std(dim=0))
            mean_features = torch.stack(mean_features)
            std_features = torch.stack(std_features)
        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)

        output_embeddings = self.feature_extractor(statistic_pooling)
        logits = self.classifier(output_embeddings)

        loss = None
        if labels is not None:
            loss = self.objective(logits, labels)

        if not return_dict:
            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return XVectorOutput(
            loss=loss,
            logits=logits,
            embeddings=output_embeddings,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+
2618
 
2619
  __all__ = [
2620
  "PantagruelUniForMaskedLM",
 
2624
  "PantagruelUniForTokenClassification",
2625
  "PantagruelUniModel",
2626
  "PantagruelUniPreTrainedModel",
2627
+ "PantagruelUniForAudioFrameClassification",
2628
  ]