Upload folder using huggingface_hub
Browse files- modeling_pantagruel_uni.py +729 -65
modeling_pantagruel_uni.py
CHANGED
|
@@ -52,7 +52,10 @@ from transformers.modeling_outputs import (
|
|
| 52 |
QuestionAnsweringModelOutput,
|
| 53 |
SequenceClassifierOutput,
|
| 54 |
TokenClassifierOutput,
|
|
|
|
|
|
|
| 55 |
)
|
|
|
|
| 56 |
from .configuration_pantagruel_uni import (
|
| 57 |
PantagruelUniConfig,
|
| 58 |
PantagruelModalityConfig,
|
|
@@ -83,8 +86,6 @@ class PantagruelUniBaseModelOutput(ModelOutput):
|
|
| 83 |
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
|
| 84 |
|
| 85 |
|
| 86 |
-
#################################################
|
| 87 |
-
### modeling_pantagruel_uni_base.py
|
| 88 |
# copied from fairseq.modules.grad_multiply
|
| 89 |
class GradMultiply(torch.autograd.Function):
|
| 90 |
@staticmethod
|
|
@@ -98,7 +99,7 @@ class GradMultiply(torch.autograd.Function):
|
|
| 98 |
return grad * ctx.scale, None
|
| 99 |
|
| 100 |
|
| 101 |
-
#
|
| 102 |
class TransposeLast(nn.Module):
|
| 103 |
def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
|
| 104 |
super().__init__()
|
|
@@ -111,7 +112,7 @@ class TransposeLast(nn.Module):
|
|
| 111 |
return x.transpose(self.tranpose_dim, -1)
|
| 112 |
|
| 113 |
|
| 114 |
-
#
|
| 115 |
class Fp32LayerNorm(nn.LayerNorm):
|
| 116 |
def __init__(self, *args, **kwargs):
|
| 117 |
super().__init__(*args, **kwargs)
|
|
@@ -125,7 +126,7 @@ class Fp32LayerNorm(nn.LayerNorm):
|
|
| 125 |
self.eps,
|
| 126 |
)
|
| 127 |
return output.type_as(input)
|
| 128 |
-
|
| 129 |
|
| 130 |
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
|
| 131 |
return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
|
|
@@ -457,6 +458,7 @@ class BlockEncoder(nn.Module):
|
|
| 457 |
return x
|
| 458 |
|
| 459 |
|
|
|
|
| 460 |
class ModalitySpecificEncoder(nn.Module):
|
| 461 |
def __init__(
|
| 462 |
self,
|
|
@@ -820,6 +822,7 @@ class ModalitySpecificEncoder(nn.Module):
|
|
| 820 |
return x
|
| 821 |
|
| 822 |
|
|
|
|
| 823 |
class AudioEncoder(ModalitySpecificEncoder):
|
| 824 |
|
| 825 |
modality_cfg: PantagruelAudioConfig
|
|
@@ -952,6 +955,7 @@ class AudioEncoder(ModalitySpecificEncoder):
|
|
| 952 |
return padding_mask
|
| 953 |
|
| 954 |
|
|
|
|
| 955 |
class LearnedPositionalEmbedding(nn.Embedding):
|
| 956 |
"""
|
| 957 |
This module learns positional embeddings up to a fixed maximum size.
|
|
@@ -1001,6 +1005,7 @@ class LearnedPositionalEmbedding(nn.Embedding):
|
|
| 1001 |
)
|
| 1002 |
|
| 1003 |
|
|
|
|
| 1004 |
class SinusoidalPositionalEmbedding(nn.Module):
|
| 1005 |
"""This module produces sinusoidal positional embeddings of any length.
|
| 1006 |
|
|
@@ -1098,7 +1103,9 @@ class SinusoidalPositionalEmbedding(nn.Module):
|
|
| 1098 |
.view(bsz, seq_len, -1)
|
| 1099 |
.detach()
|
| 1100 |
)
|
| 1101 |
-
|
|
|
|
|
|
|
| 1102 |
def PositionalEmbedding(
|
| 1103 |
num_embeddings: int,
|
| 1104 |
embedding_dim: int,
|
|
@@ -1125,6 +1132,7 @@ def PositionalEmbedding(
|
|
| 1125 |
return m
|
| 1126 |
|
| 1127 |
|
|
|
|
| 1128 |
class TextLocalEncoder(nn.Module):
|
| 1129 |
def __init__(
|
| 1130 |
self,
|
|
@@ -1246,7 +1254,6 @@ class TextEncoder(ModalitySpecificEncoder):
|
|
| 1246 |
), f"{x.size(1), padding_mask.size(1), diff, self.downsample}"
|
| 1247 |
|
| 1248 |
return padding_mask
|
| 1249 |
-
#################################################
|
| 1250 |
|
| 1251 |
|
| 1252 |
# copied from transformers.models.data2vec.modeling_data2vec.PantagruelUniTextPooler
|
|
@@ -1265,6 +1272,64 @@ class PantagruelUniTextPooler(nn.Module):
|
|
| 1265 |
return pooled_output
|
| 1266 |
|
| 1267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1268 |
class PantagruelUniPreTrainedModel(PreTrainedModel):
|
| 1269 |
config_class = PantagruelUniConfig
|
| 1270 |
base_model_prefix = "pantagruel_uni"
|
|
@@ -1310,27 +1375,60 @@ class PantagruelUniPreTrainedModel(PreTrainedModel):
|
|
| 1310 |
else:
|
| 1311 |
_init(module)
|
| 1312 |
|
| 1313 |
-
|
| 1314 |
-
|
| 1315 |
-
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
|
| 1319 |
-
|
| 1320 |
-
|
| 1321 |
-
|
| 1322 |
-
|
| 1323 |
-
|
| 1324 |
-
|
| 1325 |
-
|
| 1326 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1327 |
|
| 1328 |
|
|
|
|
| 1329 |
class PantagruelUniModel(PantagruelUniPreTrainedModel):
|
| 1330 |
|
| 1331 |
def __init__(
|
| 1332 |
self, config: PantagruelUniConfig, add_pooling_layer: bool = True
|
| 1333 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1334 |
super().__init__(config)
|
| 1335 |
self.config = config
|
| 1336 |
modalities_cfg = config.modalities
|
|
@@ -1390,10 +1488,12 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
|
|
| 1390 |
self.post_init()
|
| 1391 |
|
| 1392 |
def get_input_embeddings(self):
|
| 1393 |
-
|
|
|
|
| 1394 |
|
| 1395 |
def set_input_embeddings(self, value):
|
| 1396 |
-
|
|
|
|
| 1397 |
|
| 1398 |
def freeze_feature_extractor(self):
|
| 1399 |
"""
|
|
@@ -1414,6 +1514,14 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
|
|
| 1414 |
"""
|
| 1415 |
for mod in self.modalities:
|
| 1416 |
self.modality_encoders[mod]._freeze_parameters()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1417 |
for block in self.blocks:
|
| 1418 |
for p in block.parameters():
|
| 1419 |
p.requires_grad = False
|
|
@@ -1447,6 +1555,7 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
|
|
| 1447 |
self,
|
| 1448 |
input_values=None, # audio input
|
| 1449 |
input_ids=None, # text input
|
|
|
|
| 1450 |
attention_mask=None,
|
| 1451 |
padding_mask=None,
|
| 1452 |
mask=False,
|
|
@@ -1454,12 +1563,68 @@ class PantagruelUniModel(PantagruelUniPreTrainedModel):
|
|
| 1454 |
output_hidden_states=True,
|
| 1455 |
output_attn_weights=False,
|
| 1456 |
return_dict=True,
|
| 1457 |
-
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1458 |
if mode is None:
|
| 1459 |
mode = "TEXT" if input_ids is not None else "AUDIO"
|
| 1460 |
|
| 1461 |
if padding_mask is None and attention_mask is not None:
|
| 1462 |
-
padding_mask = ~attention_mask # attention mask: 1 means to attend to (not masked), 0 means not to attend to (masked). padding mask: 1 means padded (not attend to), 0 means not padded (to attend to)
|
| 1463 |
|
| 1464 |
feature_extractor = self.modality_encoders[mode]
|
| 1465 |
extractor_out = feature_extractor(
|
|
@@ -1598,7 +1763,7 @@ class PantagruelTextClassificationHead(nn.Module):
|
|
| 1598 |
|
| 1599 |
@auto_docstring
|
| 1600 |
class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
|
| 1601 |
-
_tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
|
| 1602 |
|
| 1603 |
def __init__(self, config):
|
| 1604 |
super().__init__(config)
|
|
@@ -1663,10 +1828,13 @@ class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
|
|
| 1663 |
)
|
| 1664 |
|
| 1665 |
|
|
|
|
|
|
|
|
|
|
| 1666 |
@auto_docstring(
|
| 1667 |
custom_intro="""
|
| 1668 |
-
|
| 1669 |
-
|
| 1670 |
"""
|
| 1671 |
)
|
| 1672 |
class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
|
|
@@ -1674,64 +1842,157 @@ class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
|
|
| 1674 |
super().__init__(config)
|
| 1675 |
self.num_labels = config.num_labels
|
| 1676 |
self.config = config
|
| 1677 |
-
|
| 1678 |
self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
|
| 1679 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1680 |
|
| 1681 |
# Initialize weights and apply final processing
|
| 1682 |
self.post_init()
|
| 1683 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1684 |
@can_return_tuple
|
| 1685 |
@auto_docstring
|
| 1686 |
def forward(
|
| 1687 |
self,
|
|
|
|
| 1688 |
input_ids: Optional[torch.LongTensor] = None,
|
| 1689 |
attention_mask: Optional[torch.FloatTensor] = None,
|
| 1690 |
padding_mask: Optional[torch.FloatTensor] = None,
|
|
|
|
|
|
|
|
|
|
| 1691 |
labels: Optional[torch.LongTensor] = None,
|
| 1692 |
**kwargs: Unpack[TransformersKwargs],
|
| 1693 |
) -> Union[tuple, SequenceClassifierOutput]:
|
| 1694 |
r"""
|
| 1695 |
-
|
| 1696 |
-
|
| 1697 |
-
|
| 1698 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1699 |
"""
|
| 1700 |
-
|
| 1701 |
-
|
| 1702 |
-
|
| 1703 |
-
|
| 1704 |
-
|
| 1705 |
-
|
| 1706 |
-
|
| 1707 |
-
|
| 1708 |
-
|
| 1709 |
-
logits = self.classifier(sequence_output)
|
| 1710 |
|
| 1711 |
-
|
| 1712 |
-
|
| 1713 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1714 |
|
| 1715 |
-
|
| 1716 |
-
|
| 1717 |
-
|
| 1718 |
-
|
| 1719 |
-
|
| 1720 |
-
|
| 1721 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1722 |
|
| 1723 |
-
|
| 1724 |
-
|
| 1725 |
-
|
| 1726 |
-
|
| 1727 |
-
|
| 1728 |
-
|
| 1729 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1730 |
loss_fct = CrossEntropyLoss()
|
| 1731 |
-
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
| 1732 |
-
|
| 1733 |
-
|
| 1734 |
-
|
|
|
|
| 1735 |
|
| 1736 |
return SequenceClassifierOutput(
|
| 1737 |
loss=loss,
|
|
@@ -1952,6 +2213,408 @@ class PantagruelUniForQuestionAnswering(PantagruelUniPreTrainedModel):
|
|
| 1952 |
attentions=outputs.attentions,
|
| 1953 |
)
|
| 1954 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1955 |
|
| 1956 |
__all__ = [
|
| 1957 |
"PantagruelUniForMaskedLM",
|
|
@@ -1961,4 +2624,5 @@ __all__ = [
|
|
| 1961 |
"PantagruelUniForTokenClassification",
|
| 1962 |
"PantagruelUniModel",
|
| 1963 |
"PantagruelUniPreTrainedModel",
|
|
|
|
| 1964 |
]
|
|
|
|
| 52 |
QuestionAnsweringModelOutput,
|
| 53 |
SequenceClassifierOutput,
|
| 54 |
TokenClassifierOutput,
|
| 55 |
+
CausalLMOutput,
|
| 56 |
+
XVectorOutput,
|
| 57 |
)
|
| 58 |
+
from transformers.utils import auto_docstring, is_peft_available
|
| 59 |
from .configuration_pantagruel_uni import (
|
| 60 |
PantagruelUniConfig,
|
| 61 |
PantagruelModalityConfig,
|
|
|
|
| 86 |
attentions: Optional[tuple[torch.FloatTensor, ...]] = None
|
| 87 |
|
| 88 |
|
|
|
|
|
|
|
| 89 |
# copied from fairseq.modules.grad_multiply
|
| 90 |
class GradMultiply(torch.autograd.Function):
|
| 91 |
@staticmethod
|
|
|
|
| 99 |
return grad * ctx.scale, None
|
| 100 |
|
| 101 |
|
| 102 |
+
# copied from fairseq.modules.transpose_last.py
|
| 103 |
class TransposeLast(nn.Module):
|
| 104 |
def __init__(self, deconstruct_idx=None, tranpose_dim=-2):
|
| 105 |
super().__init__()
|
|
|
|
| 112 |
return x.transpose(self.tranpose_dim, -1)
|
| 113 |
|
| 114 |
|
| 115 |
+
# copied from fairseq.modules.layer_norm.py
|
| 116 |
class Fp32LayerNorm(nn.LayerNorm):
|
| 117 |
def __init__(self, *args, **kwargs):
|
| 118 |
super().__init__(*args, **kwargs)
|
|
|
|
| 126 |
self.eps,
|
| 127 |
)
|
| 128 |
return output.type_as(input)
|
| 129 |
+
|
| 130 |
|
| 131 |
def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
|
| 132 |
return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
|
|
|
|
| 458 |
return x
|
| 459 |
|
| 460 |
|
| 461 |
+
# copied from fairseq.examples.data2vec.models
|
| 462 |
class ModalitySpecificEncoder(nn.Module):
|
| 463 |
def __init__(
|
| 464 |
self,
|
|
|
|
| 822 |
return x
|
| 823 |
|
| 824 |
|
| 825 |
+
# copied from fairseq.examples.data2vec.models.modalities.audio
|
| 826 |
class AudioEncoder(ModalitySpecificEncoder):
|
| 827 |
|
| 828 |
modality_cfg: PantagruelAudioConfig
|
|
|
|
| 955 |
return padding_mask
|
| 956 |
|
| 957 |
|
| 958 |
+
# copied from fairseq
|
| 959 |
class LearnedPositionalEmbedding(nn.Embedding):
|
| 960 |
"""
|
| 961 |
This module learns positional embeddings up to a fixed maximum size.
|
|
|
|
| 1005 |
)
|
| 1006 |
|
| 1007 |
|
| 1008 |
+
# copied from fairseq
|
| 1009 |
class SinusoidalPositionalEmbedding(nn.Module):
|
| 1010 |
"""This module produces sinusoidal positional embeddings of any length.
|
| 1011 |
|
|
|
|
| 1103 |
.view(bsz, seq_len, -1)
|
| 1104 |
.detach()
|
| 1105 |
)
|
| 1106 |
+
|
| 1107 |
+
|
| 1108 |
+
# copied from fairseq.modules
|
| 1109 |
def PositionalEmbedding(
|
| 1110 |
num_embeddings: int,
|
| 1111 |
embedding_dim: int,
|
|
|
|
| 1132 |
return m
|
| 1133 |
|
| 1134 |
|
| 1135 |
+
# copied from fairseq.examples.data2vec.modules
|
| 1136 |
class TextLocalEncoder(nn.Module):
|
| 1137 |
def __init__(
|
| 1138 |
self,
|
|
|
|
| 1254 |
), f"{x.size(1), padding_mask.size(1), diff, self.downsample}"
|
| 1255 |
|
| 1256 |
return padding_mask
|
|
|
|
| 1257 |
|
| 1258 |
|
| 1259 |
# copied from transformers.models.data2vec.modeling_data2vec.PantagruelUniTextPooler
|
|
|
|
| 1272 |
return pooled_output
|
| 1273 |
|
| 1274 |
|
| 1275 |
+
# copied from transformers.models.data2vec.modeling_data2vec_audio
|
| 1276 |
+
class AMSoftmaxLoss(nn.Module):
|
| 1277 |
+
def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
|
| 1278 |
+
super().__init__()
|
| 1279 |
+
self.scale = scale
|
| 1280 |
+
self.margin = margin
|
| 1281 |
+
self.num_labels = num_labels
|
| 1282 |
+
self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
|
| 1283 |
+
self.loss = nn.CrossEntropyLoss()
|
| 1284 |
+
|
| 1285 |
+
def forward(self, hidden_states, labels):
|
| 1286 |
+
labels = labels.flatten()
|
| 1287 |
+
weight = nn.functional.normalize(self.weight, dim=0)
|
| 1288 |
+
hidden_states = nn.functional.normalize(hidden_states, dim=1)
|
| 1289 |
+
cos_theta = torch.mm(hidden_states, weight)
|
| 1290 |
+
psi = cos_theta - self.margin
|
| 1291 |
+
|
| 1292 |
+
onehot = nn.functional.one_hot(labels, self.num_labels)
|
| 1293 |
+
logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
|
| 1294 |
+
loss = self.loss(logits, labels)
|
| 1295 |
+
|
| 1296 |
+
return loss
|
| 1297 |
+
|
| 1298 |
+
|
| 1299 |
+
# copied from transformers.models.data2vec.modeling_data2vec_audio
|
| 1300 |
+
class TDNNLayer(nn.Module):
|
| 1301 |
+
def __init__(self, config, layer_id=0):
|
| 1302 |
+
super().__init__()
|
| 1303 |
+
self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
|
| 1304 |
+
self.out_conv_dim = config.tdnn_dim[layer_id]
|
| 1305 |
+
self.kernel_size = config.tdnn_kernel[layer_id]
|
| 1306 |
+
self.dilation = config.tdnn_dilation[layer_id]
|
| 1307 |
+
|
| 1308 |
+
self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
|
| 1309 |
+
self.activation = nn.ReLU()
|
| 1310 |
+
|
| 1311 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 1312 |
+
if is_peft_available():
|
| 1313 |
+
from peft.tuners.lora import LoraLayer
|
| 1314 |
+
|
| 1315 |
+
if is_peft_available():
|
| 1316 |
+
if isinstance(self.kernel, LoraLayer):
|
| 1317 |
+
warnings.warn(
|
| 1318 |
+
"Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
|
| 1319 |
+
"You should exclude TDNNLayer from LoRA's target modules.",
|
| 1320 |
+
)
|
| 1321 |
+
|
| 1322 |
+
# for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
|
| 1323 |
+
hidden_states = hidden_states.transpose(1, 2)
|
| 1324 |
+
weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
|
| 1325 |
+
hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
|
| 1326 |
+
hidden_states = hidden_states.transpose(1, 2)
|
| 1327 |
+
|
| 1328 |
+
hidden_states = self.activation(hidden_states)
|
| 1329 |
+
return hidden_states
|
| 1330 |
+
|
| 1331 |
+
|
| 1332 |
+
@auto_docstring
|
| 1333 |
class PantagruelUniPreTrainedModel(PreTrainedModel):
|
| 1334 |
config_class = PantagruelUniConfig
|
| 1335 |
base_model_prefix = "pantagruel_uni"
|
|
|
|
| 1375 |
else:
|
| 1376 |
_init(module)
|
| 1377 |
|
| 1378 |
+
def _get_feat_extract_output_lengths(
|
| 1379 |
+
self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
|
| 1380 |
+
):
|
| 1381 |
+
"""
|
| 1382 |
+
Computes the output length of the convolutional layers
|
| 1383 |
+
"""
|
| 1384 |
+
|
| 1385 |
+
add_adapter = self.config.modalities.audio.add_adapter if add_adapter is None else add_adapter
|
| 1386 |
+
|
| 1387 |
+
def _conv_out_length(input_length, kernel_size, stride):
|
| 1388 |
+
# 1D convolutional layer output length formula taken
|
| 1389 |
+
# from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
|
| 1390 |
+
return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
|
| 1391 |
+
|
| 1392 |
+
for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
|
| 1393 |
+
input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
|
| 1394 |
+
|
| 1395 |
+
if add_adapter:
|
| 1396 |
+
for _ in range(self.config.num_adapter_layers):
|
| 1397 |
+
input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
|
| 1398 |
+
|
| 1399 |
+
return input_lengths
|
| 1400 |
+
|
| 1401 |
+
def _get_feature_vector_attention_mask(
|
| 1402 |
+
self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
|
| 1403 |
+
):
|
| 1404 |
+
# Effectively attention_mask.sum(-1), but not inplace to be able to run
|
| 1405 |
+
# on inference mode.
|
| 1406 |
+
non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
|
| 1407 |
+
|
| 1408 |
+
output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
|
| 1409 |
+
output_lengths = output_lengths.to(torch.long)
|
| 1410 |
+
|
| 1411 |
+
batch_size = attention_mask.shape[0]
|
| 1412 |
+
|
| 1413 |
+
attention_mask = torch.zeros(
|
| 1414 |
+
(batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
|
| 1415 |
+
)
|
| 1416 |
+
# these two operations makes sure that all values before the output lengths idxs are attended to
|
| 1417 |
+
attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
|
| 1418 |
+
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
|
| 1419 |
+
return attention_mask
|
| 1420 |
|
| 1421 |
|
| 1422 |
+
@auto_docstring
|
| 1423 |
class PantagruelUniModel(PantagruelUniPreTrainedModel):
|
| 1424 |
|
| 1425 |
def __init__(
|
| 1426 |
self, config: PantagruelUniConfig, add_pooling_layer: bool = True
|
| 1427 |
):
|
| 1428 |
+
r"""
|
| 1429 |
+
add_pooling_layer (bool, *optional*, defaults to `True`):
|
| 1430 |
+
Whether to add a pooling layer
|
| 1431 |
+
"""
|
| 1432 |
super().__init__(config)
|
| 1433 |
self.config = config
|
| 1434 |
modalities_cfg = config.modalities
|
|
|
|
| 1488 |
self.post_init()
|
| 1489 |
|
| 1490 |
def get_input_embeddings(self):
|
| 1491 |
+
if "TEXT" in self.modality_encoders:
|
| 1492 |
+
return self.modality_encoders["TEXT"].local_encoder.embed_tokens
|
| 1493 |
|
| 1494 |
def set_input_embeddings(self, value):
|
| 1495 |
+
if "TEXT" in self.modality_encoders:
|
| 1496 |
+
self.modality_encoders["TEXT"].local_encoder.embed_tokens = value
|
| 1497 |
|
| 1498 |
def freeze_feature_extractor(self):
|
| 1499 |
"""
|
|
|
|
| 1514 |
"""
|
| 1515 |
for mod in self.modalities:
|
| 1516 |
self.modality_encoders[mod]._freeze_parameters()
|
| 1517 |
+
|
| 1518 |
+
def freeze_base_model(self):
|
| 1519 |
+
"""
|
| 1520 |
+
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
| 1521 |
+
not be updated during training.
|
| 1522 |
+
"""
|
| 1523 |
+
for mod in self.modalities:
|
| 1524 |
+
self.modality_encoders[mod]._freeze_parameters()
|
| 1525 |
for block in self.blocks:
|
| 1526 |
for p in block.parameters():
|
| 1527 |
p.requires_grad = False
|
|
|
|
| 1555 |
self,
|
| 1556 |
input_values=None, # audio input
|
| 1557 |
input_ids=None, # text input
|
| 1558 |
+
token_type_ids=None,
|
| 1559 |
attention_mask=None,
|
| 1560 |
padding_mask=None,
|
| 1561 |
mask=False,
|
|
|
|
| 1563 |
output_hidden_states=True,
|
| 1564 |
output_attn_weights=False,
|
| 1565 |
return_dict=True,
|
| 1566 |
+
) -> Union[Tuple, PantagruelUniBaseModelOutput]:
|
| 1567 |
+
r"""
|
| 1568 |
+
Performs a forward pass of the model for either audio or text inputs.
|
| 1569 |
+
|
| 1570 |
+
The modality is automatically inferred if `mode` is not provided:
|
| 1571 |
+
`"TEXT"` is used when `input_ids` is specified, otherwise `"AUDIO"`.
|
| 1572 |
+
|
| 1573 |
+
Args:
|
| 1574 |
+
input_values (`torch.FloatTensor`, *optional*):
|
| 1575 |
+
Audio input values of shape `(batch_size, sequence_length)`
|
| 1576 |
+
containing *normalized* audio samples
|
| 1577 |
+
Required when operating in `"AUDIO"` mode.
|
| 1578 |
+
|
| 1579 |
+
input_ids (`torch.LongTensor`, *optional*):
|
| 1580 |
+
Tokenized text input IDs of shape `(batch_size, sequence_length)`.
|
| 1581 |
+
Required when operating in `"TEXT"` mode.
|
| 1582 |
+
|
| 1583 |
+
attention_mask (`torch.LongTensor`, *optional*):
|
| 1584 |
+
Attention mask for text inputs, with values in `{0, 1}`:
|
| 1585 |
+
- `1` for tokens that should be attended to,
|
| 1586 |
+
- `0` for tokens that should be masked.
|
| 1587 |
+
If provided and `padding_mask` is `None`, it will be converted internally
|
| 1588 |
+
to a padding mask.
|
| 1589 |
+
|
| 1590 |
+
padding_mask (`torch.BoolTensor` or `torch.LongTensor`, *optional*):
|
| 1591 |
+
Padding mask indicating which positions are padded:
|
| 1592 |
+
- `1` (or `True`) for padded positions (not attended to),
|
| 1593 |
+
- `0` (or `False`) for non-padded positions.
|
| 1594 |
+
If not provided and `attention_mask` is given, this is inferred as
|
| 1595 |
+
the logical negation of `attention_mask`.
|
| 1596 |
+
|
| 1597 |
+
mask (`bool`, *optional*, defaults to `False`):
|
| 1598 |
+
Whether to apply input masking.
|
| 1599 |
+
|
| 1600 |
+
mode (`str`, *optional*):
|
| 1601 |
+
Explicitly specifies the input modality. Supported values are
|
| 1602 |
+
`"TEXT"` and `"AUDIO"`. If `None`, the mode is inferred from the
|
| 1603 |
+
provided inputs.
|
| 1604 |
+
|
| 1605 |
+
output_hidden_states (`bool`, *optional*, defaults to `True`):
|
| 1606 |
+
Whether to return the hidden states of all layers.
|
| 1607 |
+
|
| 1608 |
+
output_attn_weights (`bool`, *optional*, defaults to `False`):
|
| 1609 |
+
Whether to return attention weights.
|
| 1610 |
+
|
| 1611 |
+
return_dict (`bool`, *optional*, defaults to `True`):
|
| 1612 |
+
Whether to return a [`ModelOutput`] instead of a plain tuple.
|
| 1613 |
+
|
| 1614 |
+
Returns:
|
| 1615 |
+
[`ModelOutput`] or `tuple`:
|
| 1616 |
+
The model outputs. If `return_dict=True`, a [`ModelOutput`] is returned
|
| 1617 |
+
containing (depending on configuration) the final hidden states,
|
| 1618 |
+
optional hidden states from all layers, and optional attention weights.
|
| 1619 |
+
If `return_dict=False`, a tuple is returned with the same contents in
|
| 1620 |
+
a fixed order.
|
| 1621 |
+
"""
|
| 1622 |
+
|
| 1623 |
if mode is None:
|
| 1624 |
mode = "TEXT" if input_ids is not None else "AUDIO"
|
| 1625 |
|
| 1626 |
if padding_mask is None and attention_mask is not None:
|
| 1627 |
+
padding_mask = ~attention_mask.bool() # attention mask: 1 means to attend to (not masked), 0 means not to attend to (masked). padding mask: 1 means padded (not attend to), 0 means not padded (to attend to)
|
| 1628 |
|
| 1629 |
feature_extractor = self.modality_encoders[mode]
|
| 1630 |
extractor_out = feature_extractor(
|
|
|
|
| 1763 |
|
| 1764 |
@auto_docstring
|
| 1765 |
class PantagruelUniForMaskedLM(PantagruelUniPreTrainedModel):
|
| 1766 |
+
# _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
|
| 1767 |
|
| 1768 |
def __init__(self, config):
|
| 1769 |
super().__init__(config)
|
|
|
|
| 1828 |
)
|
| 1829 |
|
| 1830 |
|
| 1831 |
+
_HIDDEN_STATES_START_POSITION = 2
|
| 1832 |
+
|
| 1833 |
+
|
| 1834 |
@auto_docstring(
|
| 1835 |
custom_intro="""
|
| 1836 |
+
PantagruelUniModel with a sequence classification or regression head on top (a linear layer applied to a pooled representation of the sequence).
|
| 1837 |
+
This model supports text and audio modalities. The classification head and internal processing are selected automatically based on the configuration.
|
| 1838 |
"""
|
| 1839 |
)
|
| 1840 |
class PantagruelUniForSequenceClassification(PantagruelUniPreTrainedModel):
|
|
|
|
| 1842 |
super().__init__(config)
|
| 1843 |
self.num_labels = config.num_labels
|
| 1844 |
self.config = config
|
|
|
|
| 1845 |
self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
|
| 1846 |
+
|
| 1847 |
+
if config.supported_modality == "TEXT":
|
| 1848 |
+
logger.info("Initializing PantagruelUniForSequenceClassification for TEXT")
|
| 1849 |
+
self.classifier = PantagruelTextClassificationHead(config)
|
| 1850 |
+
elif config.supported_modality == "AUDIO":
|
| 1851 |
+
logger.info("Initializing PantagruelUniForSequenceClassification for AUDIO")
|
| 1852 |
+
num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
|
| 1853 |
+
if config.modalities.audio.use_weighted_layer_sum:
|
| 1854 |
+
self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
|
| 1855 |
+
self.projector = nn.Linear(config.hidden_size, config.modalities.audio.classifier_proj_size)
|
| 1856 |
+
self.classifier = nn.Linear(config.modalities.audio.classifier_proj_size, config.num_labels)
|
| 1857 |
|
| 1858 |
# Initialize weights and apply final processing
|
| 1859 |
self.post_init()
|
| 1860 |
|
| 1861 |
+
def freeze_feature_extractor(self):
|
| 1862 |
+
"""
|
| 1863 |
+
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
| 1864 |
+
not be updated during training.
|
| 1865 |
+
"""
|
| 1866 |
+
warnings.warn(
|
| 1867 |
+
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
| 1868 |
+
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
| 1869 |
+
FutureWarning,
|
| 1870 |
+
)
|
| 1871 |
+
self.freeze_feature_encoder()
|
| 1872 |
+
|
| 1873 |
+
def freeze_feature_encoder(self):
|
| 1874 |
+
"""
|
| 1875 |
+
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
| 1876 |
+
not be updated during training.
|
| 1877 |
+
"""
|
| 1878 |
+
self.pantagruel_uni.freeze_feature_encoder()
|
| 1879 |
+
|
| 1880 |
+
def freeze_base_model(self):
|
| 1881 |
+
"""
|
| 1882 |
+
Calling this function will disable the gradient computation for the base model so that its parameters will not
|
| 1883 |
+
be updated during training. Only the classification head will be updated.
|
| 1884 |
+
"""
|
| 1885 |
+
for param in self.pantagruel_uni.parameters():
|
| 1886 |
+
param.requires_grad = False
|
| 1887 |
+
|
| 1888 |
@can_return_tuple
|
| 1889 |
@auto_docstring
|
| 1890 |
def forward(
|
| 1891 |
self,
|
| 1892 |
+
input_values: Optional[torch.FloatTensor] = None,
|
| 1893 |
input_ids: Optional[torch.LongTensor] = None,
|
| 1894 |
attention_mask: Optional[torch.FloatTensor] = None,
|
| 1895 |
padding_mask: Optional[torch.FloatTensor] = None,
|
| 1896 |
+
output_attentions: Optional[bool] = None,
|
| 1897 |
+
output_hidden_states: Optional[bool] = None,
|
| 1898 |
+
return_dict: Optional[bool] = None,
|
| 1899 |
labels: Optional[torch.LongTensor] = None,
|
| 1900 |
**kwargs: Unpack[TransformersKwargs],
|
| 1901 |
) -> Union[tuple, SequenceClassifierOutput]:
|
| 1902 |
r"""
|
| 1903 |
+
Performs a forward pass for sequence classification or regression.
|
| 1904 |
+
|
| 1905 |
+
This method supports both **text** and **audio** inputs. The modality is inferred
|
| 1906 |
+
from the provided inputs and the model configuration.
|
| 1907 |
+
|
| 1908 |
+
Args:
|
| 1909 |
+
input_values (`torch.FloatTensor`, *optional*):
|
| 1910 |
+
Audio input values of shape `(batch_size, sequence_length)`
|
| 1911 |
+
containing *normalized* audio samples.
|
| 1912 |
+
input_ids (`torch.LongTensor`, *optional*):
|
| 1913 |
+
Tokenized text input IDs of shape `(batch_size, sequence_length)`.
|
| 1914 |
+
Used when the model is configured for `"TEXT"` modality.
|
| 1915 |
+
|
| 1916 |
+
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
|
| 1917 |
+
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., config.num_labels - 1]`.
|
| 1918 |
+
If `config.num_labels == 1` a regression loss is computed (Mean-Square loss),
|
| 1919 |
+
If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
|
| 1920 |
"""
|
| 1921 |
+
if self.config.supported_modality == "TEXT":
|
| 1922 |
+
outputs = self.pantagruel_uni(
|
| 1923 |
+
input_ids=input_ids,
|
| 1924 |
+
attention_mask=attention_mask,
|
| 1925 |
+
padding_mask=padding_mask,
|
| 1926 |
+
mask=False,
|
| 1927 |
+
mode="TEXT",
|
| 1928 |
+
return_dict=True,
|
| 1929 |
+
)
|
|
|
|
| 1930 |
|
| 1931 |
+
sequence_output = outputs.last_hidden_state
|
| 1932 |
+
logits = self.classifier(sequence_output)
|
| 1933 |
+
|
| 1934 |
+
loss = None
|
| 1935 |
+
if labels is not None:
|
| 1936 |
+
labels = labels.to(logits.device)
|
| 1937 |
+
|
| 1938 |
+
if self.config.problem_type is None:
|
| 1939 |
+
if self.num_labels == 1:
|
| 1940 |
+
self.config.problem_type = "regression"
|
| 1941 |
+
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
|
| 1942 |
+
self.config.problem_type = "single_label_classification"
|
| 1943 |
+
else:
|
| 1944 |
+
self.config.problem_type = "multi_label_classification"
|
| 1945 |
+
|
| 1946 |
+
if self.config.problem_type == "regression":
|
| 1947 |
+
loss_fct = MSELoss()
|
| 1948 |
+
if self.num_labels == 1:
|
| 1949 |
+
loss = loss_fct(logits.squeeze(), labels.squeeze())
|
| 1950 |
+
else:
|
| 1951 |
+
loss = loss_fct(logits, labels)
|
| 1952 |
+
elif self.config.problem_type == "single_label_classification":
|
| 1953 |
+
loss_fct = CrossEntropyLoss()
|
| 1954 |
+
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
|
| 1955 |
+
elif self.config.problem_type == "multi_label_classification":
|
| 1956 |
+
loss_fct = BCEWithLogitsLoss()
|
| 1957 |
+
loss = loss_fct(logits, labels)
|
| 1958 |
|
| 1959 |
+
else:
|
| 1960 |
+
outputs = self.pantagruel_uni(
|
| 1961 |
+
input_values=input_values,
|
| 1962 |
+
attention_mask=attention_mask,
|
| 1963 |
+
mask=False,
|
| 1964 |
+
mode="AUDIO",
|
| 1965 |
+
output_hidden_states=output_hidden_states,
|
| 1966 |
+
output_attn_weights=output_attentions,
|
| 1967 |
+
return_dict=return_dict,
|
| 1968 |
+
)
|
| 1969 |
+
if self.config.modalities.audio.use_weighted_layer_sum:
|
| 1970 |
+
hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
|
| 1971 |
+
hidden_states = torch.stack(hidden_states, dim=1)
|
| 1972 |
+
norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
|
| 1973 |
+
hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
|
| 1974 |
+
else:
|
| 1975 |
+
hidden_states = outputs[0]
|
| 1976 |
|
| 1977 |
+
hidden_states = self.projector(hidden_states)
|
| 1978 |
+
if attention_mask is None:
|
| 1979 |
+
pooled_output = hidden_states.mean(dim=1)
|
| 1980 |
+
else:
|
| 1981 |
+
padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
|
| 1982 |
+
expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
|
| 1983 |
+
hidden_states[~expand_padding_mask] = 0.0
|
| 1984 |
+
pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
|
| 1985 |
+
|
| 1986 |
+
logits = self.classifier(pooled_output)
|
| 1987 |
+
|
| 1988 |
+
loss = None
|
| 1989 |
+
if labels is not None:
|
| 1990 |
loss_fct = CrossEntropyLoss()
|
| 1991 |
+
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
|
| 1992 |
+
|
| 1993 |
+
if not return_dict:
|
| 1994 |
+
output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
|
| 1995 |
+
return ((loss,) + output) if loss is not None else output
|
| 1996 |
|
| 1997 |
return SequenceClassifierOutput(
|
| 1998 |
loss=loss,
|
|
|
|
| 2213 |
attentions=outputs.attentions,
|
| 2214 |
)
|
| 2215 |
|
| 2216 |
+
class PantagruelUniForAudioFrameClassification(PantagruelUniPreTrainedModel):
    """Pantagruel-Uni audio model with a frame-level classification head on top
    (a linear layer applied to every encoder hidden state), e.g. for tasks
    such as speaker diarization.
    """

    def __init__(self, config):
        super().__init__(config)

        self.config = config

        # Adapter layers change the temporal resolution of the encoder output,
        # which would break the one-label-per-frame correspondence needed here.
        if hasattr(config.modalities.audio, "add_adapter") and config.modalities.audio.add_adapter:
            raise ValueError(
                "Audio frame classification does not support the use of Data2VecAudio adapters (config.add_adapter=True)"
            )

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.modalities.audio.use_weighted_layer_sum:
            # One learnable scalar per layer, initialized to a uniform average.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.num_labels = config.num_labels

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, TokenClassifierOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted layer sum needs every intermediate hidden state.
        output_hidden_states = (
            True if self.config.modalities.audio.use_weighted_layer_sum
            else output_hidden_states
        )

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        if self.config.modalities.audio.use_weighted_layer_sum:
            # Softmax-normalized convex combination of all layer outputs.
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # NOTE(review): labels appear to arrive as per-frame class scores /
            # one-hot vectors of width `num_labels`; argmax recovers the class
            # indices expected by CrossEntropyLoss — confirm against callers.
            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            # Fix: include the loss in the tuple output when it was computed,
            # consistent with the other task heads in this file.
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
|
| 2326 |
+
|
| 2327 |
+
|
| 2328 |
+
@auto_docstring(
    custom_intro="""
    PantagruelUniForCTC Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    """
)
class PantagruelUniForCTC(PantagruelUniPreTrainedModel):
    def __init__(self, config):
        r"""
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`Data2VecAudioForCTC`] with adapters. Uses 'eng' by
            default.
        """
        super().__init__(config)

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.final_dropout)

        # The LM head size is taken from the audio modality config; fail early
        # if it was never configured.
        if config.modalities.audio.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that "
                "does not define the vocabulary size of the language model head. Please "
                "instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
                "or define `vocab_size` of your model's configuration."
            )
        # With an adapter, the encoder output width differs from hidden_size.
        output_hidden_size = (
            config.modalities.audio.output_hidden_size if hasattr(config.modalities.audio, "add_adapter") and config.modalities.audio.add_adapter else config.hidden_size
        )
        self.lm_head = nn.Linear(output_hidden_size, config.modalities.audio.vocab_size)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, CausalLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Reject label ids outside the LM-head vocabulary before any compute.
        if labels is not None and labels.max() >= self.config.modalities.audio.vocab_size:
            raise ValueError(f"Label values must be <= vocab_size: {self.config.modalities.audio.vocab_size}")

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        hidden_states = self.dropout(hidden_states)

        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # retrieve loss input_lengths from attention_mask
            attention_mask = (
                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
            )
            # Map raw-waveform lengths to post-feature-extractor frame counts.
            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)

            # assuming that padded tokens are filled with -100
            # when not being attended to
            labels_mask = labels >= 0
            target_lengths = labels_mask.sum(-1)
            flattened_targets = labels.masked_select(labels_mask)

            # ctc_loss doesn't support fp16
            # transpose(0, 1) yields the (time, batch, vocab) layout ctc_loss expects
            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)

            # cuDNN's CTC kernel is disabled here; presumably to avoid its
            # stricter input constraints — TODO confirm.
            with torch.backends.cudnn.flags(enabled=False):
                loss = nn.functional.ctc_loss(
                    log_probs,
                    flattened_targets,
                    input_lengths,
                    target_lengths,
                    blank=self.config.pad_token_id,
                    reduction=self.config.ctc_loss_reduction,
                    zero_infinity=self.config.ctc_zero_infinity,
                )

        if not return_dict:
            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutput(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
|
| 2460 |
+
|
| 2461 |
+
|
| 2462 |
+
class PantagruelUniForXVector(PantagruelUniPreTrainedModel):
    """Pantagruel-Uni audio model with an XVector head on top (TDNN layers and
    statistic pooling), e.g. for speaker verification.
    """

    def __init__(self, config):
        super().__init__(config)

        self.config = config

        self.pantagruel_uni = PantagruelUniModel(config, add_pooling_layer=False)
        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
        if config.modalities.audio.use_weighted_layer_sum:
            # One learnable scalar per layer, initialized to a uniform average.
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.projector = nn.Linear(config.hidden_size, config.modalities.audio.tdnn_dim[0])

        tdnn_layers = [
            TDNNLayer(config.modalities.audio, i) for i in range(len(config.modalities.audio.tdnn_dim))
        ]
        self.tdnn = nn.ModuleList(tdnn_layers)

        # `* 2` because statistic pooling concatenates mean and std features.
        self.feature_extractor = nn.Linear(
            config.modalities.audio.tdnn_dim[-1] * 2, config.modalities.audio.xvector_output_dim
        )
        self.classifier = nn.Linear(
            config.modalities.audio.xvector_output_dim, config.modalities.audio.xvector_output_dim
        )

        self.objective = AMSoftmaxLoss(
            config.modalities.audio.xvector_output_dim, config.num_labels
        )

        self.init_weights()

    def freeze_feature_extractor(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        warnings.warn(
            "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
            "Please use the equivalent `freeze_feature_encoder` method instead.",
            FutureWarning,
        )
        self.freeze_feature_encoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.pantagruel_uni.freeze_feature_encoder()

    def freeze_base_model(self):
        """
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        """
        for param in self.pantagruel_uni.parameters():
            param.requires_grad = False

    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the TDNN layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return (input_length - kernel_size) // stride + 1

        # TDNN layers use stride 1, so only the kernel sizes shrink the length.
        for kernel_size in self.config.modalities.audio.tdnn_kernel:
            input_lengths = _conv_out_length(input_lengths, kernel_size, 1)

        return input_lengths

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ) -> Union[tuple, XVectorOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`Data2VecAudioProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # The weighted layer sum needs every intermediate hidden state.
        output_hidden_states = True if self.config.modalities.audio.use_weighted_layer_sum else output_hidden_states

        outputs = self.pantagruel_uni(
            input_values=input_values,
            attention_mask=attention_mask,
            mask=False,
            mode="AUDIO",
            output_hidden_states=output_hidden_states,
            output_attn_weights=output_attentions,
            return_dict=return_dict,
        )

        # Fix: read the flag from the audio modality config, matching both the
        # request above and __init__ — the top-level config does not carry
        # `use_weighted_layer_sum` in this file.
        if self.config.modalities.audio.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        hidden_states = self.projector(hidden_states)

        for tdnn_layer in self.tdnn:
            hidden_states = tdnn_layer(hidden_states)

        # Statistic Pooling
        if attention_mask is None:
            # No padding: pool over the full time axis.
            mean_features = hidden_states.mean(dim=1)
            std_features = hidden_states.std(dim=1)
        else:
            # Pool only over real (unpadded) frames, per example.
            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
            mean_features = []
            std_features = []
            for i, length in enumerate(tdnn_output_lengths):
                mean_features.append(hidden_states[i, :length].mean(dim=0))
                std_features.append(hidden_states[i, :length].std(dim=0))
            mean_features = torch.stack(mean_features)
            std_features = torch.stack(std_features)
        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)

        output_embeddings = self.feature_extractor(statistic_pooling)
        logits = self.classifier(output_embeddings)

        loss = None
        if labels is not None:
            loss = self.objective(logits, labels)

        if not return_dict:
            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
            return ((loss,) + output) if loss is not None else output

        return XVectorOutput(
            loss=loss,
            logits=logits,
            embeddings=output_embeddings,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
|
| 2617 |
+
|
| 2618 |
|
| 2619 |
__all__ = [
|
| 2620 |
"PantagruelUniForMaskedLM",
|
|
|
|
| 2624 |
"PantagruelUniForTokenClassification",
|
| 2625 |
"PantagruelUniModel",
|
| 2626 |
"PantagruelUniPreTrainedModel",
|
| 2627 |
+
"PantagruelUniForAudioFrameClassification",
|
| 2628 |
]
|