Update custom model files, README, and requirements
Browse files
- asr_modeling.py +1 -6
- asr_pipeline.py +9 -30
asr_modeling.py
CHANGED
|
@@ -622,18 +622,13 @@ class ASRModel(PreTrainedModel, GenerationMixin):
|
|
| 622 |
user_content += " " + self.TRANSCRIBE_PROMPT
|
| 623 |
messages.append({"role": "user", "content": user_content})
|
| 624 |
|
| 625 |
-
enable_thinking_val = getattr(self.config, "enable_thinking", False)
|
| 626 |
-
print(f"[DEBUG generate] enable_thinking={enable_thinking_val}, system_prompt={system_prompt[:100] if system_prompt else None}...")
|
| 627 |
chat_result = self.tokenizer.apply_chat_template(
|
| 628 |
messages,
|
| 629 |
tokenize=True,
|
| 630 |
add_generation_prompt=True,
|
| 631 |
return_tensors="pt",
|
| 632 |
-
enable_thinking=
|
| 633 |
)
|
| 634 |
-
# Debug: show the formatted prompt
|
| 635 |
-
prompt_text = self.tokenizer.decode(chat_result.input_ids[0] if chat_result.input_ids.dim() > 1 else chat_result.input_ids)
|
| 636 |
-
print(f"[DEBUG generate] Formatted prompt: {prompt_text[:500]}...")
|
| 637 |
input_ids = chat_result.input_ids.to(device)
|
| 638 |
|
| 639 |
if input_ids.dim() == 1:
|
|
|
|
| 622 |
user_content += " " + self.TRANSCRIBE_PROMPT
|
| 623 |
messages.append({"role": "user", "content": user_content})
|
| 624 |
|
|
|
|
|
|
|
| 625 |
chat_result = self.tokenizer.apply_chat_template(
|
| 626 |
messages,
|
| 627 |
tokenize=True,
|
| 628 |
add_generation_prompt=True,
|
| 629 |
return_tensors="pt",
|
| 630 |
+
enable_thinking=getattr(self.config, "enable_thinking", False),
|
| 631 |
)
|
|
|
|
|
|
|
|
|
|
| 632 |
input_ids = chat_result.input_ids.to(device)
|
| 633 |
|
| 634 |
if input_ids.dim() == 1:
|
asr_pipeline.py
CHANGED
|
@@ -18,30 +18,13 @@ except ImportError:
|
|
| 18 |
from diarization import SpeakerDiarizer # type: ignore[no-redef]
|
| 19 |
|
| 20 |
# Re-export for backwards compatibility
|
| 21 |
-
__all__ = ["ForcedAligner", "SpeakerDiarizer", "ASRPipeline", "strip_thinking"
|
| 22 |
|
| 23 |
# Default TTS voice for Kokoro
|
| 24 |
DEFAULT_TTS_VOICE = "af_heart"
|
| 25 |
TTS_SAMPLE_RATE = 24000
|
| 26 |
|
| 27 |
|
| 28 |
-
def extract_thinking(text: str) -> tuple[str, str]:
|
| 29 |
-
"""Extract thinking content from model output.
|
| 30 |
-
|
| 31 |
-
Args:
|
| 32 |
-
text: Model output text that may contain thinking tags
|
| 33 |
-
|
| 34 |
-
Returns:
|
| 35 |
-
Tuple of (thinking_content, response_text)
|
| 36 |
-
"""
|
| 37 |
-
if not text:
|
| 38 |
-
return "", ""
|
| 39 |
-
match = re.search(r"<think>(.*?)</think>", text, flags=re.DOTALL)
|
| 40 |
-
thinking = match.group(1).strip() if match else ""
|
| 41 |
-
response = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
|
| 42 |
-
return thinking, response
|
| 43 |
-
|
| 44 |
-
|
| 45 |
def strip_thinking(text: str) -> str:
|
| 46 |
"""Remove <think>...</think> tags from model output.
|
| 47 |
|
|
@@ -51,8 +34,10 @@ def strip_thinking(text: str) -> str:
|
|
| 51 |
Returns:
|
| 52 |
Text with thinking content removed
|
| 53 |
"""
|
| 54 |
-
|
| 55 |
-
|
|
|
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
|
|
@@ -518,17 +503,11 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
|
|
| 518 |
tokens = [t for t in tokens.tolist() if t not in eos_set]
|
| 519 |
|
| 520 |
text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
|
| 521 |
-
#
|
| 522 |
-
|
| 523 |
-
print(f"[DEBUG postprocess] Raw text contains thinking: {text[:300]}...")
|
| 524 |
-
# Extract thinking content before stripping
|
| 525 |
-
thinking, response = extract_thinking(text)
|
| 526 |
# Truncate repetitions at end of text
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
if thinking:
|
| 530 |
-
result["thinking"] = thinking
|
| 531 |
-
return result
|
| 532 |
|
| 533 |
|
| 534 |
def _truncate_repetitions(text: str, min_repeats: int = 3) -> str:
|
|
|
|
| 18 |
from diarization import SpeakerDiarizer # type: ignore[no-redef]
|
| 19 |
|
| 20 |
# Re-export for backwards compatibility
|
| 21 |
+
__all__ = ["ForcedAligner", "SpeakerDiarizer", "ASRPipeline", "strip_thinking"]
|
| 22 |
|
| 23 |
# Default TTS voice for Kokoro
|
| 24 |
DEFAULT_TTS_VOICE = "af_heart"
|
| 25 |
TTS_SAMPLE_RATE = 24000
|
| 26 |
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def strip_thinking(text: str) -> str:
|
| 29 |
"""Remove <think>...</think> tags from model output.
|
| 30 |
|
|
|
|
| 34 |
Returns:
|
| 35 |
Text with thinking content removed
|
| 36 |
"""
|
| 37 |
+
if not text:
|
| 38 |
+
return text
|
| 39 |
+
text = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)
|
| 40 |
+
return text.strip()
|
| 41 |
|
| 42 |
|
| 43 |
class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
|
|
|
|
| 503 |
tokens = [t for t in tokens.tolist() if t not in eos_set]
|
| 504 |
|
| 505 |
text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
|
| 506 |
+
# Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
|
| 507 |
+
text = strip_thinking(text)
|
|
|
|
|
|
|
|
|
|
| 508 |
# Truncate repetitions at end of text
|
| 509 |
+
text = _truncate_repetitions(text)
|
| 510 |
+
return {"text": text}
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
|
| 513 |
def _truncate_repetitions(text: str, min_repeats: int = 3) -> str:
|