| | from transformers import Qwen2AudioEncoder |
| | import torch |
| | from torch import nn |
| | from transformers.modeling_outputs import BaseModelOutput |
| | import torch.nn.functional as F |
| |
|
class Qwen2AudioEncoderModel(Qwen2AudioEncoder):
    """Qwen2-Audio encoder that accepts variable-length mel spectrograms.

    Unlike the upstream Whisper-style encoder, this override performs no
    fixed-sequence-length check on ``input_features``: the positional
    embedding table is simply sliced to the actual post-convolution length,
    so inputs shorter than ``max_source_positions`` are supported.
    """

    def forward(
        self,
        input_features,
        attention_mask=None,
        head_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Args:
            input_features (`torch.Tensor`):
                Log-mel spectrogram features of shape
                `(batch_size, num_mel_bins, sequence_length)`.
            attention_mask (`torch.Tensor`, *optional*):
                Qwen2Audio does not support masking of the `input_features`; this
                argument is preserved for compatibility but is not used. By default
                the silence in the input log mel spectrogram is ignored.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values
                selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention
                layers. See `attentions` under returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See
                `hidden_states` under returned tensors for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a
                plain tuple.

        Returns:
            [`~transformers.modeling_outputs.BaseModelOutput`] or `tuple`:
            The final layer-normalized hidden states, plus the per-layer hidden
            states and attentions when requested.

        Raises:
            ValueError: If `head_mask` does not provide one mask per encoder layer.
        """
        # Resolve per-call flags against the config defaults.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Match the conv stem's dtype/device (e.g. under fp16 or model sharding).
        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)

        # Two-layer convolutional stem; conv2 downsamples the time axis.
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        # (batch, channels, time) -> (batch, time, channels) for the transformer.
        inputs_embeds = inputs_embeds.permute(0, 2, 1)

        # NOTE: no fixed expected-sequence-length check here (unlike the upstream
        # encoder) — the positional embeddings are sliced to the actual length so
        # shorter inputs work.
        embed_pos = self.embed_positions.weight
        hidden_states = inputs_embeds + embed_pos[: inputs_embeds.shape[1], :]
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        # Validate with an explicit exception rather than `assert`, which is
        # stripped under `python -O`.
        if head_mask is not None and head_mask.size()[0] != len(self.layers):
            raise ValueError(
                f"The head_mask should be specified for {len(self.layers)} layers, "
                f"but it is for {head_mask.size()[0]}."
            )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            # LayerDrop (https://arxiv.org/abs/1909.11556): randomly skip whole
            # encoder layers during training only.
            to_drop = False
            if self.training:
                dropout_probability = torch.rand([])
                if dropout_probability < self.layerdrop:
                    to_drop = True

            if to_drop:
                layer_outputs = (None, None)
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                    output_attentions=output_attentions,
                )

                hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.layer_norm(hidden_states)
        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )
| |
|