Update custom model files, README, and requirements
Browse files
- asr_modeling.py +4 -3
- asr_pipeline.py +10 -2
- diarization.py +1 -1
asr_modeling.py
CHANGED
|
@@ -120,6 +120,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
|
|
| 120 |
super().__init__(config)
|
| 121 |
|
| 122 |
self.system_prompt = config.system_prompt
|
|
|
|
| 123 |
target_dtype = getattr(torch, config.model_dtype)
|
| 124 |
|
| 125 |
# Audio encoder (frozen)
|
|
@@ -553,7 +554,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
|
|
| 553 |
tokenize=True,
|
| 554 |
add_generation_prompt=True,
|
| 555 |
return_tensors="pt",
|
| 556 |
-
enable_thinking=
|
| 557 |
)
|
| 558 |
input_ids = chat_result.input_ids.to(device)
|
| 559 |
|
|
@@ -631,7 +632,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
|
|
| 631 |
tokenize=True,
|
| 632 |
add_generation_prompt=True,
|
| 633 |
return_tensors="pt",
|
| 634 |
-
enable_thinking=
|
| 635 |
)
|
| 636 |
input_ids = chat_result.input_ids.to(device)
|
| 637 |
|
|
@@ -730,7 +731,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
|
|
| 730 |
tokenize=True,
|
| 731 |
add_generation_prompt=True,
|
| 732 |
return_tensors="pt",
|
| 733 |
-
enable_thinking=
|
| 734 |
).to(device)
|
| 735 |
|
| 736 |
if input_ids.dim() == 1:
|
|
|
|
| 120 |
super().__init__(config)
|
| 121 |
|
| 122 |
self.system_prompt = config.system_prompt
|
| 123 |
+
self.enable_thinking = False # Can be enabled for experimental thinking mode
|
| 124 |
target_dtype = getattr(torch, config.model_dtype)
|
| 125 |
|
| 126 |
# Audio encoder (frozen)
|
|
|
|
| 554 |
tokenize=True,
|
| 555 |
add_generation_prompt=True,
|
| 556 |
return_tensors="pt",
|
| 557 |
+
enable_thinking=self.enable_thinking,
|
| 558 |
)
|
| 559 |
input_ids = chat_result.input_ids.to(device)
|
| 560 |
|
|
|
|
| 632 |
tokenize=True,
|
| 633 |
add_generation_prompt=True,
|
| 634 |
return_tensors="pt",
|
| 635 |
+
enable_thinking=self.enable_thinking,
|
| 636 |
)
|
| 637 |
input_ids = chat_result.input_ids.to(device)
|
| 638 |
|
|
|
|
| 731 |
tokenize=True,
|
| 732 |
add_generation_prompt=True,
|
| 733 |
return_tensors="pt",
|
| 734 |
+
enable_thinking=self.enable_thinking,
|
| 735 |
).to(device)
|
| 736 |
|
| 737 |
if input_ids.dim() == 1:
|
asr_pipeline.py
CHANGED
|
@@ -446,7 +446,9 @@ def _truncate_repetitions(text: str, min_repeats: int = 3) -> str:
|
|
| 446 |
text = char_pattern.sub(r"\1", text)
|
| 447 |
|
| 448 |
# 2. Truncate repeated words at end (e.g., "the the the" -> "the")
|
| 449 |
-
word_pattern = re.compile(
|
|
|
|
|
|
|
| 450 |
while word_pattern.search(text):
|
| 451 |
text = word_pattern.sub(r"\1", text)
|
| 452 |
|
|
@@ -461,7 +463,13 @@ def _truncate_repetitions(text: str, min_repeats: int = 3) -> str:
|
|
| 461 |
# Build pattern to match repeated phrases at end
|
| 462 |
phrase_escaped = re.escape(phrase)
|
| 463 |
phrase_pattern = re.compile(
|
| 464 |
-
r"(^|.*?\s)("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
re.IGNORECASE,
|
| 466 |
)
|
| 467 |
match = phrase_pattern.match(text)
|
|
|
|
| 446 |
text = char_pattern.sub(r"\1", text)
|
| 447 |
|
| 448 |
# 2. Truncate repeated words at end (e.g., "the the the" -> "the")
|
| 449 |
+
word_pattern = re.compile(
|
| 450 |
+
r"\b(\w+)(?:\s+\1){" + str(min_repeats - 1) + r",}\s*$", re.IGNORECASE
|
| 451 |
+
)
|
| 452 |
while word_pattern.search(text):
|
| 453 |
text = word_pattern.sub(r"\1", text)
|
| 454 |
|
|
|
|
| 463 |
# Build pattern to match repeated phrases at end
|
| 464 |
phrase_escaped = re.escape(phrase)
|
| 465 |
phrase_pattern = re.compile(
|
| 466 |
+
r"(^|.*?\s)("
|
| 467 |
+
+ phrase_escaped
|
| 468 |
+
+ r")(?:\s+"
|
| 469 |
+
+ phrase_escaped
|
| 470 |
+
+ r"){"
|
| 471 |
+
+ str(min_repeats - 1)
|
| 472 |
+
+ r",}\s*$",
|
| 473 |
re.IGNORECASE,
|
| 474 |
)
|
| 475 |
match = phrase_pattern.match(text)
|
diarization.py
CHANGED
|
@@ -737,7 +737,7 @@ class SpeakerDiarizer:
|
|
| 737 |
|
| 738 |
cls._pyannote_pipeline = Pipeline.from_pretrained(
|
| 739 |
"pyannote/speaker-diarization-3.1",
|
| 740 |
-
|
| 741 |
)
|
| 742 |
cls._pyannote_pipeline.to(torch.device(_get_device()))
|
| 743 |
|
|
|
|
| 737 |
|
| 738 |
cls._pyannote_pipeline = Pipeline.from_pretrained(
|
| 739 |
"pyannote/speaker-diarization-3.1",
|
| 740 |
+
token=hf_token,
|
| 741 |
)
|
| 742 |
cls._pyannote_pipeline.to(torch.device(_get_device()))
|
| 743 |
|