flaubert commited on
Commit
d66ae9a
·
verified ·
1 Parent(s): 9dd2d61

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. modeling_pantagruel_uni.py +729 -65
modeling_pantagruel_uni.py CHANGED
@@ -52,7 +52,10 @@ from transformers.modeling_outputs import (
52
  QuestionAnsweringModelOutput,
53
  SequenceClassifierOutput,
54
  TokenClassifierOutput,
 
 
55
  )
 
56
  from .configuration_pantagruel_uni import (
57
  PantagruelUniConfig,
58
  PantagruelModalityConfig,
@@ -83,8 +86,6 @@ class PantagruelUniBaseModelOutput(ModelOutput):
83
  attentions: Optional[tuple[torch.FloatTensor, ...]] = None
84
 
85
 
86
- #################################################
87
- ### modeling_pantagruel_uni_base.py
88
  # copied from fairseq.modules.grad_multiply
89
  class GradMultiply(torch.autograd.Function):
90
  @staticmethod
@@ -98,7 +99,7 @@ class GradMultiply(torch.autograd.Function):
98
  return grad * ctx.scale, None
99
 
100
 
101
- # Copied from fairseq.modules.transpose_last.py
102
  class TransposeLast(nn.Module):
103
  def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
104
  super().__init__()
@@ -111,7 +112,7 @@ class TransposeLast(nn.Module):
111
  return x.transpose(self.tranpose_dim, -1)
112
 
113
 
114
- # Copied from fairseq.modules.layer_norm.py
115
  class Fp32LayerNorm(nn.LayerNorm):
116
  def __init__(self, *args, **kwargs):
117
  super().__init__(*args, **kwargs)
@@ -125,7 +126,7 @@ class Fp32LayerNorm(nn.LayerNorm):
125
  self.eps,
126
  )
127
  return output.type_as(input)
128
-
129
 
130
  def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
131
  return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
@@ -457,6 +458,7 @@ class BlockEncoder(nn.Module):
457
  return x
458
 
459
 
 
460
  class ModalitySpecificEncoder(nn.Module):
461
  def __init__(
462
  self,
@@ -820,6 +822,7 @@ class ModalitySpecificEncoder(nn.Module):
820
  return x
821
 
822
 
 
823
  class AudioEncoder(ModalitySpecificEncoder):
824
 
825
  modality_cfg: PantagruelAudioConfig
@@ -952,6 +955,7 @@ class AudioEncoder(ModalitySpecificEncoder):
952
  return padding_mask
953
 
954
 
 
955
  class LearnedPositionalEmbedding(nn.Embedding):
956
  """
957
  This module learns positional embeddings up to a fixed maximum size.
@@ -1001,6 +1005,7 @@ class LearnedPositionalEmbedding(nn.Embedding):
1001
  )
1002
 
1003
 
 
1004
  class SinusoidalPositionalEmbedding(nn.Module):
1005
  """This module produces sinusoidal positional embeddings of any length.
1006
 
@@ -1098,7 +1103,9 @@ class SinusoidalPositionalEmbedding(nn.Module):
1098
  .view(bsz, seq_len, -1)
1099
  .detach()
1100
  )
1101
-
 
 
1102
  def PositionalEmbedding(
1103
  num_embeddings: int,
1104
  embedding_dim: int,
@@ -1125,6 +1132,7 @@ def PositionalEmbedding(
1125
  return m
1126
 
1127
 
 
1128
  class TextLocalEncoder(nn.Module):
1129
  def __init__(
1130
  self,
@@ -1246,7 +1254,6 @@ class TextEncoder(ModalitySpecificEncoder):
1246
  ), f"{x.size(1), padding_mask.size(1), diff, self.downsample}"
1247
 
1248
  return padding_mask
1249
- #################################################
1250
 
1251
 
1252
  # copied from transformers.models.data2vec.modeling_data2vec.PantagruelUniTextPooler
@@ -1265,6 +1272,64 @@ class PantagruelUniTextPooler(nn.Module):
1265
  return pooled_output
1266
 
1267
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1268
  class PantagruelUniPreTrainedModel(PreTrainedModel):
1269
  config_class = PantagruelUniConfig
1270
  base_model_prefix = "pantagruel_uni"
@@ -1310,27 +1375,60 @@ class PantagruelUniPreTrainedModel(PreTrainedModel):
1310
  else:
1311
  _init(module)
1312
 
1313
- # @classmethod
1314
- # def from_pretrained(
1315
- # cls,
1316
- # pretrained_model_name_or_path,
1317
- # *model_args,
1318
- # **kwargs,
1319
- # ):
1320
- # config = cls.config_class()
1321
- # config.from_pretrained(pretrained_model_name_or_path)
1322
- # print(f"Loading configuration from pre-trained model: {type(config)}")
1323
- # return super().from_pretrained(pretrained_model_name_or_path,
1324
- # *model_args,
1325
- # config,
1326
- # **kwargs,)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1327
 
1328
 
 
1329
  class PantagruelUniModel(PantagruelUniPreTrainedModel):
1330
 
1331
  def __init__(
1332
  self, config: PantagruelUniConfig, add_pooling_layer: bool = True
1333
  ):
 
 
 
 
1334
  super().__init__(config)
1335
  self.config = config
1336
  modalities_cfg = config.modalities
@@ -1390,10 +1488,12 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
1390
  self.post_init()
1391
 
1392
  def get_input_embeddings(self):
1393
- return self.modality_encoders["TEXT"].local_encoder.embed_tokens
 
1394
 
1395
  def set_input_embeddings(self, value):
1396
- self.modality_encoders["TEXT"].local_encoder.embed_tokens = value
 
1397
 
1398
  def freeze_feature_extractor(self):
1399
  """
@@ -1414,6 +1514,14 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
1414
  """
1415
  for mod in self.modalities:
1416
  self.modality_encoders[mod]._freeze_parameters()
 
 
 
 
 
 
 
 
1417
  for block in self.blocks:
1418
  for p in block.parameters():
1419
  p.requires_grad = False
@@ -1447,6 +1555,7 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
1447
  self,
1448
  input_values=None, # audio input
1449
  input_ids=None, # text input
 
1450
  attention_mask=None,
1451
  padding_mask=None,
1452
  mask=False,
@@ -1454,12 +1563,68 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
1454
  output_hidden_states=True,
1455
  output_attn_weights=False,
1456
  return_dict=True,
1457
- ):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1458
  if mode is None:
1459
  mode = "TEXT" if input_ids is not None else "AUDIO"
1460
 
1461
  if padding_mask is None and attention_mask is not None:
1462
- padding_mask = ~attention_mask # attention mask: 1 means to attend to (not masked), 0 means not to attend to (masked). padding mask: 1 means padded (not attend to), 0 means not padded (to attend to)
1463
 
1464
  feature_extractor = self.modality_encoders[mode]
1465
  extractor_out = feature_extractor(
@@ -1598,7 +1763,7 @@ class PantagruelTextClassificationHead(nn.Module):
1598
 
1599
  @auto_docstring
1600
  class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
1601
- _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1602
 
1603
  def __init__(self, config):
1604
  super().__init__(config)
@@ -1663,10 +1828,13 @@ class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
1663
  )
1664
 
1665
 
 
 
 
1666
  @auto_docstring(
1667
  custom_intro="""
1668
- PantagruelText Model transformer with a sequence classification/regression head on top (a linear layer on top of the
1669
- pooled output) e.g. for GLUE tasks.
1670
  """
1671
  )
1672
  class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
@@ -1674,64 +1842,157 @@ class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
1674
  super().__init__(config)
1675
  self.num_labels = config.num_labels
1676
  self.config = config
1677
-
1678
  self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
1679
- self.classifier = PantagruelTextClassificationHead(config)
 
 
 
 
 
 
 
 
 
 
1680
 
1681
  # Initialize weights and apply final processing
1682
  self.post_init()
1683
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1684
  @can_return_tuple
1685
  @auto_docstring
1686
  def forward(
1687
  self,
 
1688
  input_ids: Optional[torch.LongTensor] = None,
1689
  attention_mask: Optional[torch.FloatTensor] = None,
1690
  padding_mask: Optional[torch.FloatTensor] = None,
 
 
 
1691
  labels: Optional[torch.LongTensor] = None,
1692
  **kwargs: Unpack[TransformersKwargs],
1693
  ) -> Union[tuple, SequenceClassifierOutput]:
1694
  r"""
1695
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1696
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1697
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1698
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
 
 
 
 
 
 
 
 
 
 
 
 
 
1699
  """
1700
- outputs = self.pantagruel_uni(
1701
- input_ids=input_ids,
1702
- attention_mask=attention_mask,
1703
- padding_mask=padding_mask,
1704
- mask=False,
1705
- mode="TEXT",
1706
- return_dict=True,
1707
- )
1708
- sequence_output = outputs.last_hidden_state
1709
- logits = self.classifier(sequence_output)
1710
 
1711
- loss = None
1712
- if labels is not None:
1713
- labels = labels.to(logits.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1714
 
1715
- if self.config.problem_type is None:
1716
- if self.num_labels == 1:
1717
- self.config.problem_type = "regression"
1718
- elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1719
- self.config.problem_type = "single_label_classification"
1720
- else:
1721
- self.config.problem_type = "multi_label_classification"
 
 
 
 
 
 
 
 
 
 
1722
 
1723
- if self.config.problem_type == "regression":
1724
- loss_fct = MSELoss()
1725
- if self.num_labels == 1:
1726
- loss = loss_fct(logits.squeeze(), labels.squeeze())
1727
- else:
1728
- loss = loss_fct(logits, labels)
1729
- elif self.config.problem_type == "single_label_classification":
 
 
 
 
 
 
1730
  loss_fct = CrossEntropyLoss()
1731
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1732
- elif self.config.problem_type == "multi_label_classification":
1733
- loss_fct = BCEWithLogitsLoss()
1734
- loss = loss_fct(logits, labels)
 
1735
 
1736
  return SequenceClassifierOutput(
1737
  loss=loss,
@@ -1952,6 +2213,408 @@ class PantagruelUniForQuestionAnswering(PantagruelUniPreTrainedModel):
1952
  attentions=outputs.attentions,
1953
  )
1954
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1955
 
1956
  __all__ = [
1957
  "PantagruelUniForMaskedLM",
@@ -1961,4 +2624,5 @@ __all__ = [
1961
  "PantagruelUniForTokenClassification",
1962
  "PantagruelUniModel",
1963
  "PantagruelUniPreTrainedModel",
 
1964
  ]
 
52
  QuestionAnsweringModelOutput,
53
  SequenceClassifierOutput,
54
  TokenClassifierOutput,
55
+ CausalLMOutput,
56
+ XVectorOutput,
57
  )
58
+ from transformers.utils import auto_docstring, is_peft_available
59
  from .configuration_pantagruel_uni import (
60
  PantagruelUniConfig,
61
  PantagruelModalityConfig,
 
86
  attentions: Optional[tuple[torch.FloatTensor, ...]] = None
87
 
88
 
 
 
89
  # copied from fairseq.modules.grad_multiply
90
  class GradMultiply(torch.autograd.Function):
91
  @staticmethod
 
99
  return grad * ctx.scale, None
100
 
101
 
102
+ # copied from fairseq.modules.transpose_last.py
103
  class TransposeLast(nn.Module):
104
  def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
105
  super().__init__()
 
112
  return x.transpose(self.tranpose_dim, -1)
113
 
114
 
115
+ # copied from fairseq.modules.layer_norm.py
116
  class Fp32LayerNorm(nn.LayerNorm):
117
  def __init__(self, *args, **kwargs):
118
  super().__init__(*args, **kwargs)
 
126
  self.eps,
127
  )
128
  return output.type_as(input)
129
+
130
 
131
  def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
132
  return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
 
458
  return x
459
 
460
 
461
+ # copied from fairseq.examples.data2vec.models
462
  class ModalitySpecificEncoder(nn.Module):
463
  def __init__(
464
  self,
 
822
  return x
823
 
824
 
825
+ # copied from fairseq.examples.data2vec.models.modalities.audio
826
  class AudioEncoder(ModalitySpecificEncoder):
827
 
828
  modality_cfg: PantagruelAudioConfig
 
955
  return padding_mask
956
 
957
 
958
+ # copied from fairseq
959
  class LearnedPositionalEmbedding(nn.Embedding):
960
  """
961
  This module learns positional embeddings up to a fixed maximum size.
 
1005
  )
1006
 
1007
 
1008
+ # copied from fairseq
1009
  class SinusoidalPositionalEmbedding(nn.Module):
1010
  """This module produces sinusoidal positional embeddings of any length.
1011
 
 
1103
  .view(bsz, seq_len, -1)
1104
  .detach()
1105
  )
1106
+
1107
+
1108
+ # copied from fairseq.modules
1109
  def PositionalEmbedding(
1110
  num_embeddings: int,
1111
  embedding_dim: int,
 
1132
  return m
1133
 
1134
 
1135
+ # copied from fairseq.examples.data2vec.modules
1136
  class TextLocalEncoder(nn.Module):
1137
  def __init__(
1138
  self,
 
1254
  ), f"{x.size(1), padding_mask.size(1), diff, self.downsample}"
1255
 
1256
  return padding_mask
 
1257
 
1258
 
1259
  # copied from transformers.models.data2vec.modeling_data2vec.PantagruelUniTextPooler
 
1272
  return pooled_output
1273
 
1274
 
1275
+ # copied from transformers.models.data2vec.modeling_data2vec_audio
1276
+ class AMSoftmaxLoss(nn.Module):
1277
+ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
1278
+ super().__init__()
1279
+ self.scale = scale
1280
+ self.margin = margin
1281
+ self.num_labels = num_labels
1282
+ self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
1283
+ self.loss = nn.CrossEntropyLoss()
1284
+
1285
+ def forward(self, hidden_states, labels):
1286
+ labels = labels.flatten()
1287
+ weight = nn.functional.normalize(self.weight, dim=0)
1288
+ hidden_states = nn.functional.normalize(hidden_states, dim=1)
1289
+ cos_theta = torch.mm(hidden_states, weight)
1290
+ psi = cos_theta - self.margin
1291
+
1292
+ onehot = nn.functional.one_hot(labels, self.num_labels)
1293
+ logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
1294
+ loss = self.loss(logits, labels)
1295
+
1296
+ return loss
1297
+
1298
+
1299
+ # copied from transformers.models.data2vec.modeling_data2vec_audio
1300
+ class TDNNLayer(nn.Module):
1301
+ def __init__(self, config, layer_id=0):
1302
+ super().__init__()
1303
+ self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
1304
+ self.out_conv_dim = config.tdnn_dim[layer_id]
1305
+ self.kernel_size = config.tdnn_kernel[layer_id]
1306
+ self.dilation = config.tdnn_dilation[layer_id]
1307
+
1308
+ self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
1309
+ self.activation = nn.ReLU()
1310
+
1311
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
1312
+ if is_peft_available():
1313
+ from peft.tuners.lora import LoraLayer
1314
+
1315
+ if is_peft_available():
1316
+ if isinstance(self.kernel, LoraLayer):
1317
+ warnings.warn(
1318
+ "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
1319
+ "You should exclude TDNNLayer from LoRA's target modules.",
1320
+ )
1321
+
1322
+ # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
1323
+ hidden_states = hidden_states.transpose(1, 2)
1324
+ weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
1325
+ hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
1326
+ hidden_states = hidden_states.transpose(1, 2)
1327
+
1328
+ hidden_states = self.activation(hidden_states)
1329
+ return hidden_states
1330
+
1331
+
1332
+ @auto_docstring
1333
  class PantagruelUniPreTrainedModel(PreTrainedModel):
1334
  config_class = PantagruelUniConfig
1335
  base_model_prefix = "pantagruel_uni"
 
1375
  else:
1376
  _init(module)
1377
 
1378
+ def _get_feat_extract_output_lengths(
1379
+ self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
1380
+ ):
1381
+ """
1382
+ Computes the output length of the convolutional layers
1383
+ """
1384
+
1385
+ add_adapter = self.config.modalities.audio.add_adapter if add_adapter is None else add_adapter
1386
+
1387
+ def _conv_out_length(input_length, kernel_size, stride):
1388
+ # 1D convolutional layer output length formula taken
1389
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
1390
+ return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
1391
+
1392
+ for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
1393
+ input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
1394
+
1395
+ if add_adapter:
1396
+ for _ in range(self.config.num_adapter_layers):
1397
+ input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
1398
+
1399
+ return input_lengths
1400
+
1401
+ def _get_feature_vector_attention_mask(
1402
+ self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
1403
+ ):
1404
+ # Effectively attention_mask.sum(-1), but not inplace to be able to run
1405
+ # on inference mode.
1406
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
1407
+
1408
+ output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
1409
+ output_lengths = output_lengths.to(torch.long)
1410
+
1411
+ batch_size = attention_mask.shape[0]
1412
+
1413
+ attention_mask = torch.zeros(
1414
+ (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
1415
+ )
1416
+ # these two operations makes sure that all values before the output lengths idxs are attended to
1417
+ attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
1418
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
1419
+ return attention_mask
1420
 
1421
 
1422
+ @auto_docstring
1423
  class PantagruelUniModel(PantagruelUniPreTrainedModel):
1424
 
1425
  def __init__(
1426
  self, config: PantagruelUniConfig, add_pooling_layer: bool = True
1427
  ):
1428
+ r"""
1429
+ add_pooling_layer (bool, *optional*, defaults to `True`):
1430
+ Whether to add a pooling layer
1431
+ """
1432
  super().__init__(config)
1433
  self.config = config
1434
  modalities_cfg = config.modalities
 
1488
  self.post_init()
1489
 
1490
  def get_input_embeddings(self):
1491
+ if "TEXT" in self.modality_encoders:
1492
+ return self.modality_encoders["TEXT"].local_encoder.embed_tokens
1493
 
1494
  def set_input_embeddings(self, value):
1495
+ if "TEXT" in self.modality_encoders:
1496
+ self.modality_encoders["TEXT"].local_encoder.embed_tokens = value
1497
 
1498
  def freeze_feature_extractor(self):
1499
  """
 
1514
  """
1515
  for mod in self.modalities:
1516
  self.modality_encoders[mod]._freeze_parameters()
1517
+
1518
+ def freeze_base_model(self):
1519
+ """
1520
+ Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1521
+ not be updated during training.
1522
+ """
1523
+ for mod in self.modalities:
1524
+ self.modality_encoders[mod]._freeze_parameters()
1525
  for block in self.blocks:
1526
  for p in block.parameters():
1527
  p.requires_grad = False
 
1555
  self,
1556
  input_values=None, # audio input
1557
  input_ids=None, # text input
1558
+ token_type_ids=None,
1559
  attention_mask=None,
1560
  padding_mask=None,
1561
  mask=False,
 
1563
  output_hidden_states=True,
1564
  output_attn_weights=False,
1565
  return_dict=True,
1566
+ ) -> Union[Tuple, PantagruelUniBaseModelOutput]:
1567
+ r"""
1568
+ Performs a forward pass of the model for either audio or text inputs.
1569
+
1570
+ The modality is automatically inferred if `mode` is not provided:
1571
+ `"TEXT"` is used when `input_ids` is specified, otherwise `"AUDIO"`.
1572
+
1573
+ Args:
1574
+ input_values (`torch.FloatTensor`, *optional*):
1575
+ Audio input values of shape `(batch_size, sequence_length)`
1576
+ containing *normalized* audio samples
1577
+ Required when operating in `"AUDIO"` mode.
1578
+
1579
+ input_ids (`torch.LongTensor`, *optional*):
1580
+ Tokenized text input IDs of shape `(batch_size, sequence_length)`.
1581
+ Required when operating in `"TEXT"` mode.
1582
+
1583
+ attention_mask (`torch.LongTensor`, *optional*):
1584
+ Attention mask for text inputs, with values in `{0, 1}`:
1585
+ - `1` for tokens that should be attended to,
1586
+ - `0` for tokens that should be masked.
1587
+ If provided and `padding_mask` is `None`, it will be converted internally
1588
+ to a padding mask.
1589
+
1590
+ padding_mask (`torch.BoolTensor` or `torch.LongTensor`, *optional*):
1591
+ Padding mask indicating which positions are padded:
1592
+ - `1` (or `True`) for padded positions (not attended to),
1593
+ - `0` (or `False`) for non-padded positions.
1594
+ If not provided and `attention_mask` is given, this is inferred as
1595
+ the logical negation of `attention_mask`.
1596
+
1597
+ mask (`bool`, *optional*, defaults to `False`):
1598
+ Whether to apply input masking.
1599
+
1600
+ mode (`str`, *optional*):
1601
+ Explicitly specifies the input modality. Supported values are
1602
+ `"TEXT"` and `"AUDIO"`. If `None`, the mode is inferred from the
1603
+ provided inputs.
1604
+
1605
+ output_hidden_states (`bool`, *optional*, defaults to `True`):
1606
+ Whether to return the hidden states of all layers.
1607
+
1608
+ output_attn_weights (`bool`, *optional*, defaults to `False`):
1609
+ Whether to return attention weights.
1610
+
1611
+ return_dict (`bool`, *optional*, defaults to `True`):
1612
+ Whether to return a [`ModelOutput`] instead of a plain tuple.
1613
+
1614
+ Returns:
1615
+ [`ModelOutput`] or `tuple`:
1616
+ The model outputs. If `return_dict=True`, a [`ModelOutput`] is returned
1617
+ containing (depending on configuration) the final hidden states,
1618
+ optional hidden states from all layers, and optional attention weights.
1619
+ If `return_dict=False`, a tuple is returned with the same contents in
1620
+ a fixed order.
1621
+ """
1622
+
1623
  if mode is None:
1624
  mode = "TEXT" if input_ids is not None else "AUDIO"
1625
 
1626
  if padding_mask is None and attention_mask is not None:
1627
+ padding_mask = ~attention_mask.bool() # attention mask: 1 means to attend to (not masked), 0 means not to attend to (masked). padding mask: 1 means padded (not attend to), 0 means not padded (to attend to)
1628
 
1629
  feature_extractor = self.modality_encoders[mode]
1630
  extractor_out = feature_extractor(
 
1763
 
1764
  @auto_docstring
1765
  class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
1766
+ # _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1767
 
1768
  def __init__(self, config):
1769
  super().__init__(config)
 
1828
  )
1829
 
1830
 
1831
+ _HIDDEN_STATES_START_POSITION = 2
1832
+
1833
+
1834
  @auto_docstring(
1835
  custom_intro="""
1836
+ PantagruelUniModel with a sequence classification or regression head on top (a linear layer applied to a pooled representation of the sequence).
1837
+ This model supports text and audio modalities. The classification head and internal processing are selected automatically based on the configuration.
1838
  """
1839
  )
1840
  class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
 
1842
  super().__init__(config)
1843
  self.num_labels = config.num_labels
1844
  self.config = config
 
1845
  self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
1846
+
1847
+ if config.supported_modality == "TEXT":
1848
+ logger.info("Initializing PantagruelUniForSequenceClassification for TEXT")
1849
+ self.classifier = PantagruelTextClassificationHead(config)
1850
+ elif config.supported_modality == "AUDIO":
1851
+ logger.info("Initializing PantagruelUniForSequenceClassification for AUDIO")
1852
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1853
+ if config.modalities.audio.use_weighted_layer_sum:
1854
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1855
+ self.projector = nn.Linear(config.hidden_size, config.modalities.audio.classifier_proj_size)
1856
+ self.classifier = nn.Linear(config.modalities.audio.classifier_proj_size, config.num_labels)
1857
 
1858
  # Initialize weights and apply final processing
1859
  self.post_init()
1860
 
1861
+ def freeze_feature_extractor(self):
1862
+ """
1863
+ Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1864
+ not be updated during training.
1865
+ """
1866
+ warnings.warn(
1867
+ "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
1868
+ "Please use the equivalent `freeze_feature_encoder` method instead.",
1869
+ FutureWarning,
1870
+ )
1871
+ self.freeze_feature_encoder()
1872
+
1873
+ def freeze_feature_encoder(self):
1874
+ """
1875
+ Calling this function will disable the gradient computation for the feature encoder so that its parameter will
1876
+ not be updated during training.
1877
+ """
1878
+ self.pantagruel_uni.freeze_feature_encoder()
1879
+
1880
+ def freeze_base_model(self):
1881
+ """
1882
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
1883
+ be updated during training. Only the classification head will be updated.
1884
+ """
1885
+ for param in self.pantagruel_uni.parameters():
1886
+ param.requires_grad = False
1887
+
1888
  @can_return_tuple
1889
  @auto_docstring
1890
  def forward(
1891
  self,
1892
+ input_values: Optional[torch.FloatTensor] = None,
1893
  input_ids: Optional[torch.LongTensor] = None,
1894
  attention_mask: Optional[torch.FloatTensor] = None,
1895
  padding_mask: Optional[torch.FloatTensor] = None,
1896
+ output_attentions: Optional[bool] = None,
1897
+ output_hidden_states: Optional[bool] = None,
1898
+ return_dict: Optional[bool] = None,
1899
  labels: Optional[torch.LongTensor] = None,
1900
  **kwargs: Unpack[TransformersKwargs],
1901
  ) -> Union[tuple, SequenceClassifierOutput]:
1902
  r"""
1903
+ Performs a forward pass for sequence classification or regression.
1904
+
1905
+ This method supports both **text** and **audio** inputs. The modality is inferred
1906
+ from the provided inputs and the model configuration.
1907
+
1908
+ Args:
1909
+ input_values (`torch.FloatTensor`, *optional*):
1910
+ Audio input values of shape `(batch_size, sequence_length)`
1911
+ containing *normalized* audio samples.
1912
+ input_ids (`torch.LongTensor`, *optional*):
1913
+ Tokenized text input IDs of shape `(batch_size, sequence_length)`.
1914
+ Used when the model is configured for `"TEXT"` modality.
1915
+
1916
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1917
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1918
+ If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1919
+ If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1920
  """
1921
+ if self.config.supported_modality == "TEXT":
1922
+ outputs = self.pantagruel_uni(
1923
+ input_ids=input_ids,
1924
+ attention_mask=attention_mask,
1925
+ padding_mask=padding_mask,
1926
+ mask=False,
1927
+ mode="TEXT",
1928
+ return_dict=True,
1929
+ )
 
1930
 
1931
+ sequence_output = outputs.last_hidden_state
1932
+ logits = self.classifier(sequence_output)
1933
+
1934
+ loss = None
1935
+ if labels is not None:
1936
+ labels = labels.to(logits.device)
1937
+
1938
+ if self.config.problem_type is None:
1939
+ if self.num_labels == 1:
1940
+ self.config.problem_type = "regression"
1941
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1942
+ self.config.problem_type = "single_label_classification"
1943
+ else:
1944
+ self.config.problem_type = "multi_label_classification"
1945
+
1946
+ if self.config.problem_type == "regression":
1947
+ loss_fct = MSELoss()
1948
+ if self.num_labels == 1:
1949
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1950
+ else:
1951
+ loss = loss_fct(logits, labels)
1952
+ elif self.config.problem_type == "single_label_classification":
1953
+ loss_fct = CrossEntropyLoss()
1954
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1955
+ elif self.config.problem_type == "multi_label_classification":
1956
+ loss_fct = BCEWithLogitsLoss()
1957
+ loss = loss_fct(logits, labels)
1958
 
1959
+ else:
1960
+ outputs = self.pantagruel_uni(
1961
+ input_values=input_values,
1962
+ attention_mask=attention_mask,
1963
+ mask=False,
1964
+ mode="AUDIO",
1965
+ output_hidden_states=output_hidden_states,
1966
+ output_attn_weights=output_attentions,
1967
+ return_dict=return_dict,
1968
+ )
1969
+ if self.config.modalities.audio.use_weighted_layer_sum:
1970
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
1971
+ hidden_states = torch.stack(hidden_states, dim=1)
1972
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
1973
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
1974
+ else:
1975
+ hidden_states = outputs[0]
1976
 
1977
+ hidden_states = self.projector(hidden_states)
1978
+ if attention_mask is None:
1979
+ pooled_output = hidden_states.mean(dim=1)
1980
+ else:
1981
+ padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
1982
+ expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
1983
+ hidden_states[~expand_padding_mask] = 0.0
1984
+ pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
1985
+
1986
+ logits = self.classifier(pooled_output)
1987
+
1988
+ loss = None
1989
+ if labels is not None:
1990
  loss_fct = CrossEntropyLoss()
1991
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1992
+
1993
+ if not return_dict:
1994
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1995
+ return ((loss,) + output) if loss is not None else output
1996
 
1997
  return SequenceClassifierOutput(
1998
  loss=loss,
 
2213
  attentions=outputs.attentions,
2214
  )
2215
 
2216
class PantagruelUniForAudioFrameClassification(PantagruelUniPreTrainedModel):
    """Pantagruel-Uni audio model with a frame-level classification head on top
    (one label per encoder frame), e.g. for speaker diarization or
    voice-activity detection."""

    def __init__(self, config):
        super().__init__(config)

        self.config = config

        # Adapter layers change the frame rate of the encoder output, which is
        # incompatible with per-frame classification.
        if hasattr(config.modalities.audio, "add_adapter") and config.modalities.audio.add_adapter:
            raise ValueError(
                "Audio frame classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)"
            )

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.modalities.audio.use_weighted_layer_sum:
            # One learnable scalar weight per layer, initialized to a uniform average.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, TokenClassifierOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted-layer-sum head needs every layer's hidden states, so force
        # them on regardless of what the caller asked for.
        output_hidden_states = (
            True if self.config.modalities.audio.use_weighted_layer_sum
            else output_hidden_states
        )

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        if self.config.modalities.audio.use_weighted_layer_sum:
            # Softmax-normalized learned combination of all layer outputs.
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # NOTE(review): assumes frame-level one-/multi-hot labels of shape
            # (batch, seq_len, num_labels); argmax recovers the per-frame class
            # index -- confirm against the data collator.
            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            # Prepend the loss when available, consistent with the other audio
            # heads in this file (CTC, x-vector).
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+
2327
+
2328
@auto_docstring(
    custom_intro="""
    PantagruelUniForCTC Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    """
)
class PantagruelUniForCTC(PantagruelUniPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.final_dropout)

        if config.modalities.audio.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `PantagruelUniForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )
        # With an adapter on the audio branch the encoder output dimension may
        # differ from the transformer hidden size.
        output_hidden_size = (
            config.modalities.audio.output_hidden_size
            if hasattr(config.modalities.audio, "add_adapter") and config.modalities.audio.add_adapter
            else config.hidden_size
        )
        self.lm_head = nn.Linear(output_hidden_size, config.modalities.audio.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None and labels.max() >= self.config.modalities.audio.vocab_size:
            raise ValueError(f"Label values must be smaller than vocab_size: {self.config.modalities.audio.vocab_size}")

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # retrieve loss input_lengths from attention_mask
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

            # assuming that padded tokens are filled with -100
            # when not being attended to
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # ctc_loss doesn't support fp16
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            # cuDNN's CTC kernel is non-deterministic / restrictive, so disable it.
            with torch.backends.cudnn.flags(enabled=False):
                # NOTE(review): blank token id is read from the top-level config
                # (self.config.pad_token_id), unlike vocab_size which comes from
                # config.modalities.audio -- confirm this is intentional.
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
+
2461
+
2462
class PantagruelUniForXVector(PantagruelUniPreTrainedModel):
    """Pantagruel-Uni audio model with an x-vector head (TDNN stack + statistics
    pooling) on top, e.g. for speaker verification."""

    def __init__(self, config):
        super().__init__(config)

        self.config = config

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.modalities.audio.use_weighted_layer_sum:
            # One learnable scalar weight per layer, initialized to a uniform average.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.projector = nn.Linear(config.hidden_size, config.modalities.audio.tdnn_dim[0])

        tdnn_layers = [
            TDNNLayer(config.modalities.audio, i) for i in range(len(config.modalities.audio.tdnn_dim))
        ]
        self.tdnn = nn.ModuleList(tdnn_layers)

        # Maps concatenated [mean, std] statistics to the x-vector embedding space.
        self.feature_extractor = nn.Linear(
            config.modalities.audio.tdnn_dim[-1] * 2, config.modalities.audio.xvector_output_dim
        )
        self.classifier = nn.Linear(
            config.modalities.audio.xvector_output_dim, config.modalities.audio.xvector_output_dim
        )

        self.objective = AMSoftmaxLoss(
            config.modalities.audio.xvector_output_dim, config.num_labels
        )

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the TDNN layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return (input_length - kernel_size) // stride + 1

        # TDNN layers use stride 1.
        for kernel_size in self.config.modalities.audio.tdnn_kernel:
            input_lengths = _conv_out_length(input_lengths, kernel_size, 1)

        return input_lengths

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, XVectorOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted-layer-sum head needs every layer's hidden states.
        output_hidden_states = True if self.config.modalities.audio.use_weighted_layer_sum else output_hidden_states

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        # Read the flag from the audio modality config, consistent with
        # __init__ and the output_hidden_states override above (the top-level
        # config does not carry `use_weighted_layer_sum`).
        if self.config.modalities.audio.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)

        for tdnn_layer in self.tdnn:
            hidden_states = tdnn_layer(hidden_states)

        # Statistic Pooling: aggregate frame-level features into a single
        # utterance-level [mean, std] vector, ignoring padded frames.
        if attention_mask is None:
            mean_features = hidden_states.mean(dim=1)
            std_features = hidden_states.std(dim=1)
        else:
            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
            mean_features = []
            std_features = []
            for i, length in enumerate(tdnn_output_lengths):
                mean_features.append(hidden_states[i, :length].mean(dim=0))
                std_features.append(hidden_states[i, :length].std(dim=0))
            mean_features = torch.stack(mean_features)
            std_features = torch.stack(std_features)
        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)

        output_embeddings = self.feature_extractor(statistic_pooling)
        logits = self.classifier(output_embeddings)

        loss = None
        if labels is not None:
            loss = self.objective(logits, labels)

        if not return_dict:
            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return XVectorOutput(
            loss=loss,
            logits=logits,
            embeddings=output_embeddings,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
+
2618
 
2619
  __all__ = [
2620
  "PantagruelUniForMaskedLM",
 
2624
  "PantagruelUniForTokenClassification",
2625
  "PantagruelUniModel",
2626
  "PantagruelUniPreTrainedModel",
2627
+ "PantagruelUniForAudioFrameClassification",
2628
  ]