mazesmazes
/

tiny-audio

@@ -534,6 +534,14 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
             if tokens.dim() > 1:
                 tokens = tokens[0]
         text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
         # Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
         text = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
@@ -565,14 +573,11 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         if not text:
             return ""
-        # 1. LOWERCASE
-        text = text.lower()
-        # 2. CHECK FOR KNOWN HALLUCINATIONS (delete entirely)
-        if text.strip() in self.HALLUCINATION_PATTERNS:
             return ""
-        # 3. CHECK FOR REGEX-BASED HALLUCINATIONS
         for pattern in self.HALLUCINATION_REGEXES:
             if pattern.search(text):
                 # If hallucination is the entire output, return empty
@@ -581,21 +586,21 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
                 # Otherwise remove the hallucinated portion
                 text = pattern.sub("", text)
-        # 4. COMBINE ACRONYMS
-        # Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
-        text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
-        # 5. NORMALIZE CURRENCY
         # Convert "eur X" to "X euros" for Whisper normalizer compatibility
-        text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
-        # 6. TRUNCATE CHARACTER REPETITIONS (e.g., "uhhhhhh" -> "uhh")
         text = self._truncate_character_repetitions(text)
-        # 7. TRUNCATE TRAILING REPEATS (word-level)
         text = self._truncate_trailing_repeats(text)
-        # 8. STRIP WHITESPACE
         return re.sub(r"\s+", " ", text).strip()
     def _truncate_trailing_repeats(self, text: str, max_ngram: int = 10) -> str:

             if tokens.dim() > 1:
                 tokens = tokens[0]
+        # Filter out eos tokens that the tokenizer doesn't recognize as special
+        # (generation_config.eos_token_id may differ from tokenizer.eos_token_id)
+        if hasattr(self, "model") and hasattr(self.model, "generation_config"):
+            eos_ids = self.model.generation_config.eos_token_id
+            if eos_ids is not None:
+                eos_set = set(eos_ids) if isinstance(eos_ids, list) else {eos_ids}
+                tokens = [t for t in tokens.tolist() if t not in eos_set]
         text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
         # Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
         text = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
         if not text:
             return ""
+        # 1. CHECK FOR KNOWN HALLUCINATIONS (delete entirely, case-insensitive)
+        if text.strip().lower() in self.HALLUCINATION_PATTERNS:
             return ""
+        # 2. CHECK FOR REGEX-BASED HALLUCINATIONS
         for pattern in self.HALLUCINATION_REGEXES:
             if pattern.search(text):
                 # If hallucination is the entire output, return empty
                 # Otherwise remove the hallucinated portion
                 text = pattern.sub("", text)
+        # 3. COMBINE ACRONYMS
+        # Merge consecutive single letters into one word (e.g., "U S A" -> "USA")
+        text = re.sub(r"\b([a-zA-Z])((?:\s+[a-zA-Z])+)\b", lambda m: m.group(0).replace(" ", ""), text, flags=re.IGNORECASE)
+        # 4. NORMALIZE CURRENCY
         # Convert "eur X" to "X euros" for Whisper normalizer compatibility
+        text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text, flags=re.IGNORECASE)
+        # 5. TRUNCATE CHARACTER REPETITIONS (e.g., "uhhhhhh" -> "uhh")
         text = self._truncate_character_repetitions(text)
+        # 6. TRUNCATE TRAILING REPEATS (word-level)
         text = self._truncate_trailing_repeats(text)
+        # 7. STRIP WHITESPACE
         return re.sub(r"\s+", " ", text).strip()
     def _truncate_trailing_repeats(self, text: str, max_ngram: int = 10) -> str: