mazesmazes
/

tiny-audio

@@ -51,9 +51,7 @@ def _compute_mask_indices(
         raise ValueError(f"mask_length must be >= 1, got {mask_length}")
     if mask_length > sequence_length:
-        raise ValueError(
-            f"mask_length {mask_length} must be <= sequence_length {sequence_length}"
-        )
     # Compute number of masked spans per sample
     num_masked_spans = int(mask_prob * sequence_length / mask_length + torch.rand(1).item())
@@ -190,21 +188,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
                 state_dict = load_file(model_file)
                 model.load_state_dict(state_dict, strict=False)
-            # Load LoRA adapter if present
-            adapter_config = cached_file(
-                pretrained_model_name_or_path,
-                "adapter_config.json",
-                _raise_exceptions_for_missing_entries=False,
-                **cache_kwargs,
-            )
-            if adapter_config is not None:
-                from peft import PeftModel
-                # Pass original repo ID to PEFT, let it handle caching
-                model.language_model = PeftModel.from_pretrained(
-                    model.language_model, pretrained_model_name_or_path, is_trainable=False
-                )
             return model
         finally:
             cls._is_loading_from_pretrained = False
@@ -728,14 +711,14 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         thread.start()
         # Yield tokens as they're generated, filtering out <think>...</think> blocks
-        # SmolLM3 always starts in thinking mode, so assume we're in a think block
-        in_think_block = True
         buffer = ""
         for text in streamer:
             buffer += text
-            # Check for think block start (in case model outputs multiple think blocks)
             while "<think>" in buffer:
                 in_think_block = True
                 # Yield any text before <think>

         raise ValueError(f"mask_length must be >= 1, got {mask_length}")
     if mask_length > sequence_length:
+        raise ValueError(f"mask_length {mask_length} must be <= sequence_length {sequence_length}")
     # Compute number of masked spans per sample
     num_masked_spans = int(mask_prob * sequence_length / mask_length + torch.rand(1).item())
                 state_dict = load_file(model_file)
                 model.load_state_dict(state_dict, strict=False)
             return model
         finally:
             cls._is_loading_from_pretrained = False
         thread.start()
         # Yield tokens as they're generated, filtering out <think>...</think> blocks
+        # Start assuming no think block - only filter when we see <think>
+        in_think_block = False
         buffer = ""
         for text in streamer:
             buffer += text
+            # Check for think block start (in case model outputs think blocks)
             while "<think>" in buffer:
                 in_think_block = True
                 # Yield any text before <think>

asr_pipeline.py CHANGED Viewed

@@ -507,6 +507,4 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
                     break
         # 3. STRIP WHITESPACE
-        text = re.sub(r'\s+', ' ', text).strip()
-        return text

                     break
         # 3. STRIP WHITESPACE
+        return re.sub(r"\s+", " ", text).strip()

asr_processing.py CHANGED Viewed

@@ -106,7 +106,7 @@ class ASRProcessor(ProcessorMixin):
             input_ids = tokenized
         else:
             # BatchEncoding or dict-like object
-            input_ids = tokenized["input_ids"] if "input_ids" in tokenized else tokenized.input_ids
         if input_ids.dim() == 1:
             input_ids = input_ids.unsqueeze(0)

             input_ids = tokenized
         else:
             # BatchEncoding or dict-like object
+            input_ids = tokenized.get("input_ids", tokenized.input_ids)
         if input_ids.dim() == 1:
             input_ids = input_ids.unsqueeze(0)