Update custom model files, README, and requirements

- .gitattributes +0 -1
- asr_config.py +13 -15
- asr_modeling.py +55 -116
- asr_pipeline.py +53 -6
.gitattributes
CHANGED

@@ -1,4 +1,3 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 tokenizer_config.json -filter -diff -merge text
-tokenizer.json filter=lfs diff=lfs merge=lfs -text

asr_config.py
CHANGED

@@ -25,7 +25,6 @@ class ASRConfig(transformers.PretrainedConfig):
         model_dtype: str = "bfloat16",
         num_beams: Optional[int] = None,
         system_prompt: str = "You are a helpful assistant.",
-        user_prompt: str = "Please transcribe this English audio into text: <audio>",
         encoder_dim: Optional[int] = None,
         llm_dim: Optional[int] = None,
         # Encoder conv layers: list of (padding, kernel_size, stride) tuples
@@ -51,14 +50,12 @@ class ASRConfig(transformers.PretrainedConfig):
         qformer_intermediate_size: Optional[int] = None,  # FFN size (defaults to 4x hidden)
         label_smoothing: float = 0.0,  # Label smoothing for cross-entropy loss
         inference_warmup_tokens: int = 10,
-        # SpecAugment settings
+        # SpecAugment settings
         use_specaugment: bool = False,
-        mask_time_prob: float = 0.05,
-        mask_time_length: int = 10,
-        mask_time_min_masks: int = 2,
-        mask_feature_prob: float = 0.0,
-        mask_feature_length: int = 10,  # Max length of frequency mask
-        mask_feature_min_masks: int = 0,  # Min number of frequency masks
+        num_time_masks: int = 2,
+        time_mask_length: int = 10,
+        num_freq_masks: int = 0,
+        freq_mask_length: int = 10,
         # LoRA configuration (for Stage 2 fine-tuning)
         use_lora: bool = False,
         lora_rank: int = 8,  # SALMONN default
@@ -104,7 +101,6 @@ class ASRConfig(transformers.PretrainedConfig):
         self.attn_implementation = attn_implementation
         self.model_dtype = model_dtype
         self.system_prompt = system_prompt
-        self.user_prompt = user_prompt
         self.encoder_dim = encoder_dim
         self.llm_dim = llm_dim
         # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
@@ -131,12 +127,10 @@ class ASRConfig(transformers.PretrainedConfig):
         self.inference_warmup_tokens = inference_warmup_tokens
         # SpecAugment configuration
         self.use_specaugment = use_specaugment
-        self.mask_time_prob = mask_time_prob
-        self.mask_time_length = mask_time_length
-        self.mask_time_min_masks = mask_time_min_masks
-        self.mask_feature_prob = mask_feature_prob
-        self.mask_feature_length = mask_feature_length
-        self.mask_feature_min_masks = mask_feature_min_masks
+        self.num_time_masks = num_time_masks
+        self.time_mask_length = time_mask_length
+        self.num_freq_masks = num_freq_masks
+        self.freq_mask_length = freq_mask_length
         # LoRA configuration
         self.use_lora = use_lora
         self.lora_rank = lora_rank
@@ -206,6 +200,10 @@ class ASRConfig(transformers.PretrainedConfig):
 
         super().__init__(**kwargs)
 
+        # Point encoder to audio_config so pipeline uses correct feature extractor
+        # The pipeline looks for config.encoder._name_or_path for feature extractor
+        self.encoder = self.audio_config
+
         self.auto_map = {
             "AutoConfig": "asr_config.ASRConfig",
             "AutoModel": "asr_modeling.ASRModel",

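Net effect for config users: the per-model `user_prompt` field is gone (it re-surfaces as a per-call pipeline kwarg below), and the Wav2Vec2-style probability knobs are replaced by explicit mask counts. A minimal sketch of enabling the new fields, assuming the remaining `ASRConfig` constructor arguments have workable defaults (the values here are illustrative, not from this commit):

```python
from asr_config import ASRConfig

config = ASRConfig(
    use_specaugment=True,  # forward() only applies masking when model.training is True
    num_time_masks=2,      # apply TimeMasking twice per batch
    time_mask_length=10,   # each mask hides up to 10 frames
    num_freq_masks=0,      # frequency masking stays off by default
    freq_mask_length=10,
)

# After __init__, config.encoder aliases config.audio_config, so transformers'
# pipeline() resolves the feature extractor from the audio encoder repo.
print(config.encoder is config.audio_config)  # True
```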
asr_modeling.py
CHANGED

@@ -24,120 +24,26 @@ except ImportError:
     from projectors import PROJECTOR_CLASSES  # type: ignore[no-redef]
 
 
-def _compute_mask_indices(
-    shape: tuple[int, int],
-    mask_prob: float,
-    mask_length: int,
-    min_masks: int = 0,
-    device: torch.device = None,
-) -> torch.Tensor:
-    """Compute random mask spans for SpecAugment.
-
-    Based on transformers' _compute_mask_indices for Wav2Vec2/Whisper.
-
-    Args:
-        shape: (batch_size, sequence_length)
-        mask_prob: Probability for each token to be chosen as start of mask span
-        mask_length: Maximum length of mask span
-        min_masks: Minimum number of masks per sample
-        device: Device to create tensor on
-
-    Returns:
-        Boolean mask tensor of shape (batch_size, sequence_length)
-    """
-    batch_size, sequence_length = shape
-
-    if mask_length < 1:
-        raise ValueError(f"mask_length must be >= 1, got {mask_length}")
-
-    if mask_length > sequence_length:
-        raise ValueError(f"mask_length {mask_length} must be <= sequence_length {sequence_length}")
-
-    # Compute number of masked spans per sample
-    num_masked_spans = int(mask_prob * sequence_length / mask_length + torch.rand(1).item())
-    num_masked_spans = max(num_masked_spans, min_masks)
-
-    # Clamp to ensure we don't exceed sequence length
-    if num_masked_spans * mask_length > sequence_length:
-        num_masked_spans = sequence_length // mask_length
-
-    if num_masked_spans == 0:
-        return torch.zeros((batch_size, sequence_length), dtype=torch.bool, device=device)
-
-    # Uniformly sample span start indices
-    mask = torch.zeros((batch_size, sequence_length), dtype=torch.bool, device=device)
-
-    for i in range(batch_size):
-        # Random start indices for this sample
-        spec_aug_start_indices = torch.randint(
-            0, sequence_length - mask_length + 1, (num_masked_spans,), device=device
-        )
-
-        # Create mask spans
-        for start_idx in spec_aug_start_indices:
-            mask[i, start_idx : start_idx + mask_length] = True
-
-    return mask
+from torchaudio.transforms import FrequencyMasking, TimeMasking
 
 
 def apply_specaugment(
-    input_features: torch.Tensor,
-    mask_time_prob: float = 0.05,
-    mask_time_length: int = 10,
-    mask_time_min_masks: int = 2,
-    mask_feature_prob: float = 0.0,
-    mask_feature_length: int = 10,
-    mask_feature_min_masks: int = 0,
+    x: torch.Tensor,
+    num_time_masks: int = 2,
+    time_mask_length: int = 10,
+    num_freq_masks: int = 0,
+    freq_mask_length: int = 10,
 ) -> torch.Tensor:
-    """Apply SpecAugment
-
-    Args:
-        input_features: Mel spectrogram of shape (batch, n_mels, time)
-        mask_time_prob: Probability for each token to be chosen as start of time mask
-        mask_time_length: Max length of time mask
-        mask_time_min_masks: Min number of time masks
-        mask_feature_prob: Probability for frequency masking
-        mask_feature_length: Max length of frequency mask
-        mask_feature_min_masks: Min number of frequency masks
-
-    Returns:
-        Augmented mel spectrogram with same shape
-    """
-    batch_size, n_mels, time_steps = input_features.shape
-    device = input_features.device
-
-    # Clone to avoid modifying original
-    augmented = input_features.clone()
-
-    # Time masking (along time dimension)
-    # Apply if prob > 0 OR min_masks > 0 (to support fixed mask count with prob=0)
-    if mask_time_prob > 0 or mask_time_min_masks > 0:
-        time_mask = _compute_mask_indices(
-            shape=(batch_size, time_steps),
-            mask_prob=mask_time_prob,
-            mask_length=mask_time_length,
-            min_masks=mask_time_min_masks,
-            device=device,
-        )
-        # Expand to (batch, 1, time) for broadcasting
-        time_mask = time_mask.unsqueeze(1)
-        augmented = augmented.masked_fill(time_mask, 0.0)
-
-    # Frequency masking (along mel dimension)
-    # Apply if prob > 0 OR min_masks > 0 (to support fixed mask count with prob=0)
-    if mask_feature_prob > 0 or mask_feature_min_masks > 0:
-        feature_mask = _compute_mask_indices(
-            shape=(batch_size, n_mels),
-            mask_prob=mask_feature_prob,
-            mask_length=mask_feature_length,
-            min_masks=mask_feature_min_masks,
-            device=device,
-        )
-        # Expand to (batch, n_mels, 1) for broadcasting
-        feature_mask = feature_mask.unsqueeze(2)
-        augmented = augmented.masked_fill(feature_mask, 0.0)
-
-    return augmented
+    """Apply SpecAugment using torchaudio. Input shape: (batch, n_mels, time)."""
+    if num_time_masks > 0:
+        tm = TimeMasking(time_mask_param=time_mask_length, iid_masks=True)
+        for _ in range(num_time_masks):
+            x = tm(x)
+    if num_freq_masks > 0:
+        fm = FrequencyMasking(freq_mask_param=freq_mask_length, iid_masks=True)
+        for _ in range(num_freq_masks):
+            x = fm(x)
+    return x
 
 
 class ASRModel(PreTrainedModel, GenerationMixin):
@@ -225,6 +131,10 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         )
         model.language_model = get_peft_model(model.language_model, lora_config)
 
+        # Clear base_model_name_or_path so PEFT doesn't save a reference
+        # to the base LLM. See _setup_lora for details.
+        model.language_model.peft_config["default"].base_model_name_or_path = None
+
         return model
     finally:
         cls._is_loading_from_pretrained = False
@@ -393,6 +303,11 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         )
         self.language_model = get_peft_model(self.language_model, lora_config)
 
+        # Clear base_model_name_or_path so PEFT doesn't save a reference to the
+        # base LLM (e.g. Qwen). This prevents pipeline() from redirecting to the
+        # wrong model. The correct path gets set during save_pretrained/push_to_hub.
+        self.language_model.peft_config["default"].base_model_name_or_path = None
+
     def _init_tokenizer(self, config: ASRConfig):
         """Initialize tokenizer with audio token."""
         self.tokenizer = AutoTokenizer.from_pretrained(config.text_model_id, trust_remote_code=True)
@@ -551,12 +466,10 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         if self.training and getattr(self.config, "use_specaugment", False):
             input_features = apply_specaugment(
                 input_features,
-                mask_time_prob=self.config.mask_time_prob,
-                mask_time_length=self.config.mask_time_length,
-                mask_time_min_masks=self.config.mask_time_min_masks,
-                mask_feature_prob=self.config.mask_feature_prob,
-                mask_feature_length=self.config.mask_feature_length,
-                mask_feature_min_masks=self.config.mask_feature_min_masks,
+                num_time_masks=self.config.num_time_masks,
+                time_mask_length=self.config.time_mask_length,
+                num_freq_masks=self.config.num_freq_masks,
+                freq_mask_length=self.config.freq_mask_length,
             )
 
         # Encode audio -> flattened (total_audio_tokens, hidden_dim)
@@ -841,6 +754,27 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         if hasattr(self.language_model, "peft_config"):
             self.language_model.save_pretrained(save_dir, save_embedding_layers=False)
 
+            # Fix adapter_config.json to point base_model_name_or_path to the repo itself
+            # This prevents transformers pipeline() from redirecting to the base LLM repo
+            # (like Qwen) which breaks feature extractor loading for multimodal models.
+            # See: https://huggingface.co/ibm-granite/granite-speech-3.3-2b for reference
+            adapter_config_path = save_dir / "adapter_config.json"
+            if adapter_config_path.exists():
+                with adapter_config_path.open() as f:
+                    adapter_config = json.load(f)
+
+                # Use repo_id from kwargs or config - never use checkpoint directory name
+                repo_id = (
+                    kwargs.get("repo_id")
+                    or kwargs.get("push_to_hub_model_id")
+                    or getattr(self.config, "pretrained_model_path", None)
+                )
+                if repo_id:
+                    adapter_config["base_model_name_or_path"] = repo_id
+
+                with adapter_config_path.open("w") as f:
+                    json.dump(adapter_config, f, indent=2)
+
         # Add processor auto_map to preprocessor_config.json
         config_path = save_dir / "preprocessor_config.json"
         if config_path.exists():
@@ -866,6 +800,11 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         # Copy projectors module
         shutil.copy(src_dir / "projectors.py", save_dir / "projectors.py")
 
+    def push_to_hub(self, repo_id: str, **kwargs) -> str:
+        """Push model to HuggingFace Hub, ensuring adapter_config points to repo."""
+        self.config.pretrained_model_path = repo_id  # let save_pretrained pick up the repo id
+        return super().push_to_hub(repo_id, **kwargs)
+
     def create_or_update_model_card(self, output_dir: Union[str, Path]) -> None:
        """No-op for model card creation - we use MODEL_CARD.md in repo instead."""
        pass

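The rewrite swaps roughly 95 lines of hand-rolled span sampling for torchaudio's built-in transforms. A standalone sanity check of what the new `apply_specaugment` does (shapes are illustrative, not from the commit): each `TimeMasking` call zeroes one random span of up to `time_mask_param` frames. One caveat worth knowing: in the torchaudio releases I'm aware of, `iid_masks=True` only takes effect on 4D `(batch, channel, freq, time)` input, so on the 3D `(batch, n_mels, time)` batches used here each call masks the same span across the whole batch.

```python
import torch
from torchaudio.transforms import TimeMasking

mel = torch.randn(4, 80, 3000)  # (batch, n_mels, time): Whisper-sized dummy features

# num_time_masks=2 with time_mask_length=10 -> apply TimeMasking twice
tm = TimeMasking(time_mask_param=10, iid_masks=True)
out = tm(tm(mel))

# Frames zeroed across every mel bin are the masked ones (up to 20 per example)
masked_frames = (out == 0).all(dim=1).sum(dim=-1)
print(masked_frames)  # same count for all examples, per the 3D caveat above
```

If per-example masks matter, unsqueezing to `(batch, 1, n_mels, time)` before masking and squeezing afterwards should engage torchaudio's i.i.d. path.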
asr_pipeline.py
CHANGED

@@ -332,6 +332,7 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         kwargs.pop("min_speakers", None)
         kwargs.pop("max_speakers", None)
         kwargs.pop("hf_token", None)
+        kwargs.pop("user_prompt", None)
 
         return super()._sanitize_parameters(**kwargs)
 
@@ -346,6 +347,7 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
             inputs: Audio input (file path, dict with array/sampling_rate, etc.)
             return_timestamps: If True, return word-level timestamps using forced alignment
             return_speakers: If True, return speaker labels for each word
+            user_prompt: Custom transcription prompt (default: "Transcribe: ")
             num_speakers: Exact number of speakers (if known, for diarization)
             min_speakers: Minimum number of speakers (for diarization)
             max_speakers: Maximum number of speakers (for diarization)
@@ -359,6 +361,7 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         # Extract our params before super().__call__ (which will also call _sanitize_parameters)
         return_timestamps = kwargs.pop("return_timestamps", False)
         return_speakers = kwargs.pop("return_speakers", False)
+        user_prompt = kwargs.pop("user_prompt", None)
         diarization_params = {
             "num_speakers": kwargs.pop("num_speakers", None),
             "min_speakers": kwargs.pop("min_speakers", None),
@@ -369,6 +372,12 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         if return_speakers:
             return_timestamps = True
 
+        # Set custom user prompt if provided
+        original_prompt = None
+        if user_prompt:
+            original_prompt = self.model.TRANSCRIBE_PROMPT
+            self.model.TRANSCRIBE_PROMPT = user_prompt
+
         # Store audio for timestamp alignment and diarization
         if return_timestamps or return_speakers:
             self._current_audio = self._extract_audio(inputs)
@@ -416,6 +425,8 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
 
         # Clean up
         self._current_audio = None
+        if original_prompt is not None:
+            self.model.TRANSCRIBE_PROMPT = original_prompt
 
         return result
 
@@ -523,6 +534,13 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         text = self._post_process_prediction(text)
         return {"text": text}
 
+    # Known hallucination patterns that should be deleted entirely
+    HALLUCINATION_PATTERNS = frozenset(
+        [
+            "and gt and gt",
+        ]
+    )
+
     def _post_process_prediction(self, text: str) -> str:
         """Post-process model output to fix common issues."""
         if not text:
@@ -531,22 +549,29 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         # 1. LOWERCASE
         text = text.lower()
 
-        # 2. COMBINE ACRONYMS
+        # 2. CHECK FOR KNOWN HALLUCINATIONS (delete entirely)
+        if text.strip() in self.HALLUCINATION_PATTERNS:
+            return ""
+
+        # 3. COMBINE ACRONYMS
         # Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
         text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
 
-        # 3. NORMALIZE CURRENCY
+        # 4. NORMALIZE CURRENCY
         # Convert "eur X" to "X euros" for Whisper normalizer compatibility
         text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
 
-        # 4. TRUNCATE TRAILING REPEATS
+        # 5. TRUNCATE CHARACTER REPETITIONS (e.g., "uhhhhhh" -> "uhhh")
+        text = self._truncate_character_repetitions(text)
+
+        # 6. TRUNCATE TRAILING REPEATS (word-level)
         text = self._truncate_trailing_repeats(text)
 
-        # 5. STRIP WHITESPACE
+        # 7. STRIP WHITESPACE
         return re.sub(r"\s+", " ", text).strip()
 
-    def _truncate_trailing_repeats(self, text: str, max_ngram: int =
-        """Remove trailing repeated n-grams (1-
+    def _truncate_trailing_repeats(self, text: str, max_ngram: int = 10) -> str:
+        """Remove trailing repeated n-grams (1-10 words)."""
         words = text.split()
         if len(words) < 2:
             return text
@@ -566,3 +591,25 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
                 break  # Restart from largest n-gram
 
         return " ".join(words)
+
+    def _truncate_character_repetitions(self, text: str, max_repeats: int = 3) -> str:
+        """Remove excessive character repetitions (e.g., 'uhhhhhh' -> 'uhhh').
+
+        Handles hallucinations where the model outputs the same character many times,
+        like "uhhhhhhhhhhhhhhhhhhhhhhhhh" at the end of a prediction.
+
+        Args:
+            text: Input text to clean
+            max_repeats: Maximum allowed consecutive repetitions of a character
+
+        Returns:
+            Text with character repetitions truncated
+        """
+        if not text:
+            return text
+
+        # Replace any character repeated more than max_repeats times with max_repeats
+        # Pattern: any character followed by itself N+ times
+        pattern = rf"(.)\1{{{max_repeats},}}"
+        replacement = r"\1" * max_repeats
+        return re.sub(pattern, replacement, text)
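Taken together, the pipeline changes make the transcription prompt a per-call option rather than a config field. A hypothetical invocation (the repo id and prompt string are placeholders, not from this commit; `trust_remote_code=True` is what loads this custom `ASRPipeline`):

```python
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="your-org/your-asr-model",  # placeholder repo id
    trust_remote_code=True,
)

# user_prompt temporarily overrides model.TRANSCRIBE_PROMPT for this call;
# __call__ restores the original prompt during cleanup, and
# _sanitize_parameters pops the kwarg so the base pipeline never sees it.
result = asr("sample.wav", user_prompt="Please transcribe this audio into text: <audio>")
print(result["text"])
```

The new post-processing steps are plain regex and set lookup: `(.)\1{3,}` trims any character repeated four or more times down to three ("uhhhhhh" becomes "uhhh"), while the frozenset check drops outputs that consist solely of a known hallucination string such as "and gt and gt".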