BUT-FIT
/

DiCoW_v3_2

@@ -2,7 +2,19 @@ import torch
 from torch import nn
 from transformers import WhisperConfig
 from transformers.activations import ACT2FN
-from transformers.models.whisper.modeling_whisper import WHISPER_ATTENTION_CLASSES
 import torch.nn.functional as F
 from .coattention import CoAttention
 from .layers import CustomLinear, CustomDiagonalLinear, Gate

 from torch import nn
 from transformers import WhisperConfig
 from transformers.activations import ACT2FN
+try:
+    from transformers.models.whisper.modeling_whisper import WHISPER_ATTENTION_CLASSES
+except ImportError:
+    from transformers.models.whisper.modeling_whisper import WhisperAttention, WhisperSdpaAttention
+    try:
+        from transformers.models.whisper.modeling_whisper import WhisperFlashAttention2
+    except ImportError:
+        WhisperFlashAttention2 = WhisperAttention
+    WHISPER_ATTENTION_CLASSES = {
+        "eager": WhisperAttention,
+        "sdpa": WhisperSdpaAttention,
+        "flash_attention_2": WhisperFlashAttention2,
+    }
 import torch.nn.functional as F
 from .coattention import CoAttention
 from .layers import CustomLinear, CustomDiagonalLinear, Gate

encoder.py CHANGED Viewed

@@ -1,7 +1,19 @@
 import torch
 from torch import nn
 from transformers.modeling_outputs import CausalLMOutput, BaseModelOutput
-from transformers.models.whisper.modeling_whisper import WhisperEncoder, WhisperEncoderLayer, WHISPER_ATTENTION_CLASSES
 from .FDDT import FDDT
 from .config import DiCoWConfig

 import torch
 from torch import nn
 from transformers.modeling_outputs import CausalLMOutput, BaseModelOutput
+try:
+    from transformers.models.whisper.modeling_whisper import WhisperEncoder, WhisperEncoderLayer, WHISPER_ATTENTION_CLASSES
+except ImportError:
+    from transformers.models.whisper.modeling_whisper import WhisperEncoder, WhisperEncoderLayer, WhisperAttention, WhisperSdpaAttention
+    try:
+        from transformers.models.whisper.modeling_whisper import WhisperFlashAttention2
+    except ImportError:
+        WhisperFlashAttention2 = WhisperAttention
+    WHISPER_ATTENTION_CLASSES = {
+        "eager": WhisperAttention,
+        "sdpa": WhisperSdpaAttention,
+        "flash_attention_2": WhisperFlashAttention2,
+    }
 from .FDDT import FDDT
 from .config import DiCoWConfig

generation.py CHANGED Viewed

@@ -55,6 +55,8 @@ class DiCoWGenerationMixin(WhisperForConditionalGeneration):
         model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation(
             inputs_tensor, model_kwargs, model_input_name, generation_config
         )
         self.encoder_logits = model_kwargs["encoder_outputs"].logits
         return model_kwargs
@@ -1436,15 +1438,14 @@ class DiCoWGenerationMixin(WhisperForConditionalGeneration):
         gen_config_copy = copy.deepcopy(generation_config)
         gen_config_copy.forced_decoder_ids = None
         processors = super()._get_logits_processor(
-            gen_config_copy,
-            input_ids_seq_length,
-            encoder_input_ids,
-            prefix_allowed_tokens_fn,
-            logits_processor,
-            device,
-            model_kwargs,
-            negative_prompt_ids,
-            negative_prompt_attention_mask,
         )
         if hasattr(generation_config, "ctc_weight") and generation_config.ctc_weight > 0:
             enc_logits = self.encoder_logits

         model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation(
             inputs_tensor, model_kwargs, model_input_name, generation_config
         )
+        # Always set output_hidden_states=True in model_kwargs (this overrides any caller-supplied value)
+        model_kwargs["output_hidden_states"] = True
         self.encoder_logits = model_kwargs["encoder_outputs"].logits
         return model_kwargs
         gen_config_copy = copy.deepcopy(generation_config)
         gen_config_copy.forced_decoder_ids = None
         processors = super()._get_logits_processor(
+            generation_config=gen_config_copy,
+            input_ids_seq_length=input_ids_seq_length,
+            encoder_input_ids=encoder_input_ids,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            logits_processor=logits_processor,
+            model_kwargs=model_kwargs,
+            negative_prompt_ids=negative_prompt_ids,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
         )
         if hasattr(generation_config, "ctc_weight") and generation_config.ctc_weight > 0:
             enc_logits = self.encoder_logits

modeling_dicow.py CHANGED Viewed

@@ -101,7 +101,7 @@ class DiCoW(WhisperModel):
         decoder_outputs = self.decoder(
             input_ids=decoder_input_ids,
             attention_mask=decoder_attention_mask,
-            encoder_hidden_states=encoder_outputs.hidden_states[-1],
             head_mask=decoder_head_mask,
             cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
@@ -122,7 +122,7 @@ class DiCoW(WhisperModel):
             decoder_hidden_states=decoder_outputs.hidden_states,
             decoder_attentions=decoder_outputs.attentions,
             cross_attentions=decoder_outputs.cross_attentions,
-            encoder_last_hidden_state=encoder_outputs.hidden_states[-1],
             encoder_hidden_states=encoder_outputs.hidden_states,
             encoder_attentions=encoder_outputs.attentions,
             encoder_logits=encoder_outputs.logits,

         decoder_outputs = self.decoder(
             input_ids=decoder_input_ids,
             attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs.hidden_states[-1] if encoder_outputs.hidden_states is not None else encoder_outputs.last_hidden_state,
             head_mask=decoder_head_mask,
             cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             decoder_hidden_states=decoder_outputs.hidden_states,
             decoder_attentions=decoder_outputs.attentions,
             cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.hidden_states[-1] if encoder_outputs.hidden_states is not None else encoder_outputs.last_hidden_state,
             encoder_hidden_states=encoder_outputs.hidden_states,
             encoder_attentions=encoder_outputs.attentions,
             encoder_logits=encoder_outputs.logits,