mazesmazes
/

tiny-audio-glm

@@ -52,7 +52,7 @@ class ASRConfig(transformers.PretrainedConfig):
         # Set default generation parameters (greedy decoding only)
         generation_defaults = {
             "num_beams": 1,
-            "max_new_tokens": 96,
             "repetition_penalty": 1.0,
             "length_penalty": 1.0,
             "no_repeat_ngram_size": 0,

         # Set default generation parameters (greedy decoding only)
         generation_defaults = {
             "num_beams": 1,
+            "max_new_tokens": 256,
             "repetition_penalty": 1.0,
             "length_penalty": 1.0,
             "no_repeat_ngram_size": 0,

asr_modeling.py CHANGED Viewed

@@ -121,7 +121,10 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         self.generation_config.length_penalty = config.length_penalty
         self.generation_config.repetition_penalty = config.repetition_penalty
         self.generation_config.no_repeat_ngram_size = config.no_repeat_ngram_size
-        self.generation_config.eos_token_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
         self.generation_config.pad_token_id = self.tokenizer.pad_token_id
         # Feature extractor for audio preprocessing
@@ -145,7 +148,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         encoder_kwargs = {
             "attn_implementation": config.attn_implementation,
             "low_cpu_mem_usage": True,
-            "torch_dtype": dtype,
         }
         if "whisper" in config.audio_model_id.lower():

         self.generation_config.length_penalty = config.length_penalty
         self.generation_config.repetition_penalty = config.repetition_penalty
         self.generation_config.no_repeat_ngram_size = config.no_repeat_ngram_size
+        self.generation_config.eos_token_id = [
+            self.tokenizer.convert_tokens_to_ids("<|im_end|>"),
+            self.tokenizer.convert_tokens_to_ids("<|endoftext|>"),
+        ]
         self.generation_config.pad_token_id = self.tokenizer.pad_token_id
         # Feature extractor for audio preprocessing
         encoder_kwargs = {
             "attn_implementation": config.attn_implementation,
             "low_cpu_mem_usage": True,
+            "dtype": dtype,
         }
         if "whisper" in config.audio_model_id.lower():

asr_pipeline.py CHANGED Viewed

@@ -476,4 +476,32 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
         # Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
         text = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
         return {"text": text}

         text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
         # Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
         text = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
+        # Truncate if a word repeats more than 3 times consecutively
+        text = self._truncate_repetitions(text, max_repeats=3)
         return {"text": text}
+    def _truncate_repetitions(self, text: str, max_repeats: int = 3) -> str:
+        """Cut off a transcript at the point where one word starts looping.
+
+        Adjacent whitespace-separated words are compared case-insensitively.
+        As soon as the same word has appeared more than ``max_repeats`` times
+        in a row, that extra occurrence and everything after it are dropped,
+        and the kept words are re-joined with single spaces.
+
+        Args:
+            text: Transcript to scan for a runaway repeated word.
+            max_repeats: Longest run of one word that is left in place.
+
+        Returns:
+            ``text`` unchanged when no run exceeds ``max_repeats``; otherwise
+            the leading words up to and including the last permitted repeat.
+        """
+        tokens = text.split()
+        # Too few words for any run to exceed the limit.
+        if len(tokens) <= max_repeats:
+            return text
+
+        streak = 1  # length of the current run of identical words
+        for cut, (left, right) in enumerate(zip(tokens, tokens[1:]), start=1):
+            if left.lower() != right.lower():
+                streak = 1
+                continue
+            streak += 1
+            if streak > max_repeats:
+                # `cut` indexes the first surplus repeat; keep what precedes it.
+                return " ".join(tokens[:cut])
+        return text