Update custom model files, README, and requirements
- asr_modeling.py +4 -2
- asr_pipeline.py +3 -0
asr_modeling.py CHANGED

@@ -168,7 +168,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         decoder_kwargs = {
             "attn_implementation": config.attn_implementation,
             "trust_remote_code": True,
-            "tie_word_embeddings":
+            "tie_word_embeddings": True,
             "low_cpu_mem_usage": True,
             "dtype": dtype,
         }

@@ -342,7 +342,9 @@ class ASRModel(PreTrainedModel, GenerationMixin):

         # Create valid mask for variable-length samples and extract only real embeddings
         max_len = audio_embeds.shape[1]
-        valid_mask =
+        valid_mask = (
+            torch.arange(max_len, device=audio_embeds.device)[None, :] < projector_lengths[:, None]
+        )
         return audio_embeds[valid_mask]

     def forward(
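For background on the first hunk: tie_word_embeddings=True makes the decoder share its input embedding matrix with the output LM head instead of keeping two separate matrices. A minimal sketch of the effect, assuming a generic Hugging Face causal LM (the "gpt2" architecture id below is only for illustration and is not the decoder used here):

import re
from transformers import AutoConfig, AutoModelForCausalLM

# Build a small causal LM from config alone, with weight tying enabled.
config = AutoConfig.from_pretrained("gpt2", tie_word_embeddings=True)
model = AutoModelForCausalLM.from_config(config)

# With tied embeddings the LM head reuses the very same Parameter object
# as the token embedding table, so those weights are stored only once.
assert model.get_input_embeddings().weight is model.get_output_embeddings().weight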
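The second hunk rewrites valid_mask as a broadcast comparison: a (1, max_len) position index compared against a (batch, 1) vector of lengths yields a (batch, max_len) boolean mask that is True only at real frames, so audio_embeds[valid_mask] flattens away the padding. A standalone sketch with made-up shapes (the concrete sizes of audio_embeds and projector_lengths below are illustrative, not the repo's):

import torch

audio_embeds = torch.randn(3, 5, 8)          # (batch, max_len, hidden), padded
projector_lengths = torch.tensor([2, 5, 3])  # true frame count per sample

max_len = audio_embeds.shape[1]
valid_mask = (
    torch.arange(max_len, device=audio_embeds.device)[None, :] < projector_lengths[:, None]
)
# Boolean indexing keeps only the non-padding frames: 2 + 5 + 3 = 10 rows.
print(audio_embeds[valid_mask].shape)  # torch.Size([10, 8])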
asr_pipeline.py CHANGED

@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 from typing import Any

@@ -473,4 +474,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         tokens = tokens[0]

         text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
+        # Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
+        text = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
         return {"text": text}
|