mazesmazes
/

tiny-audio

@@ -1,3 +1,4 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 tokenizer_config.json -filter -diff -merge text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 tokenizer_config.json -filter -diff -merge text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text

asr_config.py CHANGED Viewed

@@ -25,6 +25,7 @@ class ASRConfig(transformers.PretrainedConfig):
         model_dtype: str = "bfloat16",
         num_beams: Optional[int] = None,
         system_prompt: str = "You are a helpful assistant.",
         encoder_dim: Optional[int] = None,
         llm_dim: Optional[int] = None,
         # Encoder conv layers: list of (padding, kernel_size, stride) tuples
@@ -103,6 +104,7 @@ class ASRConfig(transformers.PretrainedConfig):
         self.attn_implementation = attn_implementation
         self.model_dtype = model_dtype
         self.system_prompt = system_prompt
         self.encoder_dim = encoder_dim
         self.llm_dim = llm_dim
         # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
@@ -204,10 +206,6 @@ class ASRConfig(transformers.PretrainedConfig):
         super().__init__(**kwargs)
-        # Point encoder to audio_config so pipeline uses correct feature extractor
-        # The pipeline looks for config.encoder._name_or_path for feature extractor
-        self.encoder = self.audio_config
         self.auto_map = {
             "AutoConfig": "asr_config.ASRConfig",
             "AutoModel": "asr_modeling.ASRModel",

         model_dtype: str = "bfloat16",
         num_beams: Optional[int] = None,
         system_prompt: str = "You are a helpful assistant.",
+        user_prompt: str = "Please transcribe this English audio into text: <audio>",
         encoder_dim: Optional[int] = None,
         llm_dim: Optional[int] = None,
         # Encoder conv layers: list of (padding, kernel_size, stride) tuples
         self.attn_implementation = attn_implementation
         self.model_dtype = model_dtype
         self.system_prompt = system_prompt
+        self.user_prompt = user_prompt
         self.encoder_dim = encoder_dim
         self.llm_dim = llm_dim
         # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
         super().__init__(**kwargs)
         self.auto_map = {
             "AutoConfig": "asr_config.ASRConfig",
             "AutoModel": "asr_modeling.ASRModel",

asr_modeling.py CHANGED Viewed

@@ -225,10 +225,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
                     )
                     model.language_model = get_peft_model(model.language_model, lora_config)
-                    # Clear base_model_name_or_path so PEFT doesn't save a reference
-                    # to the base LLM. See _setup_lora for details.
-                    model.language_model.peft_config["default"].base_model_name_or_path = None
             return model
         finally:
             cls._is_loading_from_pretrained = False
@@ -397,11 +393,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         )
         self.language_model = get_peft_model(self.language_model, lora_config)
-        # Clear base_model_name_or_path so PEFT doesn't save a reference to the
-        # base LLM (e.g. Qwen). This prevents pipeline() from redirecting to the
-        # wrong model. The correct path gets set during save_pretrained/push_to_hub.
-        self.language_model.peft_config["default"].base_model_name_or_path = None
     def _init_tokenizer(self, config: ASRConfig):
         """Initialize tokenizer with audio token."""
         self.tokenizer = AutoTokenizer.from_pretrained(config.text_model_id, trust_remote_code=True)
@@ -850,27 +841,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         if hasattr(self.language_model, "peft_config"):
             self.language_model.save_pretrained(save_dir, save_embedding_layers=False)
-            # Fix adapter_config.json to point base_model_name_or_path to the repo itself
-            # This prevents transformers pipeline() from redirecting to the base LLM repo
-            # (like Qwen) which breaks feature extractor loading for multimodal models.
-            # See: https://huggingface.co/ibm-granite/granite-speech-3.3-2b for reference
-            adapter_config_path = save_dir / "adapter_config.json"
-            if adapter_config_path.exists():
-                with adapter_config_path.open() as f:
-                    adapter_config = json.load(f)
-                # Use repo_id if provided, otherwise use the save directory name
-                # (which becomes the repo ID when pushed to hub)
-                repo_id = kwargs.get("repo_id") or kwargs.get("push_to_hub_model_id")
-                if repo_id:
-                    adapter_config["base_model_name_or_path"] = repo_id
-                else:
-                    # Fallback: use save_dir name (works when save_dir matches repo structure)
-                    adapter_config["base_model_name_or_path"] = save_dir.name
-                with adapter_config_path.open("w") as f:
-                    json.dump(adapter_config, f, indent=2)
         # Add processor auto_map to preprocessor_config.json
         config_path = save_dir / "preprocessor_config.json"
         if config_path.exists():
@@ -896,11 +866,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         # Copy projectors module
         shutil.copy(src_dir / "projectors.py", save_dir / "projectors.py")
-    def push_to_hub(self, repo_id: str, **kwargs) -> str:
-        """Push model to HuggingFace Hub, ensuring adapter_config points to repo."""
-        # Call parent's push_to_hub with repo_id in kwargs so save_pretrained can use it
-        return super().push_to_hub(repo_id, repo_id=repo_id, **kwargs)
     def create_or_update_model_card(self, output_dir: Union[str, Path]) -> None:
         """No-op for model card creation - we use MODEL_CARD.md in repo instead."""
         pass

                     )
                     model.language_model = get_peft_model(model.language_model, lora_config)
             return model
         finally:
             cls._is_loading_from_pretrained = False
         )
         self.language_model = get_peft_model(self.language_model, lora_config)
     def _init_tokenizer(self, config: ASRConfig):
         """Initialize tokenizer with audio token."""
         self.tokenizer = AutoTokenizer.from_pretrained(config.text_model_id, trust_remote_code=True)
         if hasattr(self.language_model, "peft_config"):
             self.language_model.save_pretrained(save_dir, save_embedding_layers=False)
         # Add processor auto_map to preprocessor_config.json
         config_path = save_dir / "preprocessor_config.json"
         if config_path.exists():
         # Copy projectors module
         shutil.copy(src_dir / "projectors.py", save_dir / "projectors.py")
     def create_or_update_model_card(self, output_dir: Union[str, Path]) -> None:
         """No-op for model card creation - we use MODEL_CARD.md in repo instead.

         Args:
             output_dir: Unused; the method body does nothing with it.

         Returns:
             None. No file is written or modified by this method.
         """
         # NOTE(review): presumably overrides a hook that would otherwise
         # generate a model card automatically - confirm against the caller.
         pass

asr_pipeline.py CHANGED Viewed

@@ -523,13 +523,6 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         text = self._post_process_prediction(text)
         return {"text": text}
-    # Known hallucination patterns that should be deleted entirely
-    HALLUCINATION_PATTERNS = frozenset(
-        [
-            "and gt and gt",
-        ]
-    )
     def _post_process_prediction(self, text: str) -> str:
         """Post-process model output to fix common issues."""
         if not text:
@@ -538,29 +531,22 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         # 1. LOWERCASE
         text = text.lower()
-        # 2. CHECK FOR KNOWN HALLUCINATIONS (delete entirely)
-        if text.strip() in self.HALLUCINATION_PATTERNS:
-            return ""
-        # 3. COMBINE ACRONYMS
         # Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
         text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
-        # 4. NORMALIZE CURRENCY
         # Convert "eur X" to "X euros" for Whisper normalizer compatibility
         text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
-        # 5. TRUNCATE CHARACTER REPETITIONS (e.g., "uhhhhhh" -> "uhh")
-        text = self._truncate_character_repetitions(text)
-        # 6. TRUNCATE TRAILING REPEATS (word-level)
         text = self._truncate_trailing_repeats(text)
-        # 7. STRIP WHITESPACE
         return re.sub(r"\s+", " ", text).strip()
-    def _truncate_trailing_repeats(self, text: str, max_ngram: int = 10) -> str:
-        """Remove trailing repeated n-grams (1-10 words)."""
         words = text.split()
         if len(words) < 2:
             return text
@@ -580,25 +566,3 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
                     break  # Restart from largest n-gram
         return " ".join(words)
-    def _truncate_character_repetitions(self, text: str, max_repeats: int = 3) -> str:
-        """Remove excessive character repetitions (e.g., 'uhhhhhh' -> 'uhh').
-        Handles hallucinations where the model outputs the same character many times,
-        like "uhhhhhhhhhhhhhhhhhhhhhhhhh" at the end of a prediction.
-        Args:
-            text: Input text to clean
-            max_repeats: Maximum allowed consecutive repetitions of a character
-        Returns:
-            Text with character repetitions truncated
-        """
-        if not text:
-            return text
-        # Replace any character repeated more than max_repeats times with max_repeats
-        # Pattern: any character followed by itself N+ times
-        pattern = rf"(.)\1{{{max_repeats},}}"
-        replacement = r"\1" * max_repeats
-        return re.sub(pattern, replacement, text)

         text = self._post_process_prediction(text)
         return {"text": text}
     def _post_process_prediction(self, text: str) -> str:
         """Post-process model output to fix common issues."""
         if not text:
         # 1. LOWERCASE
         text = text.lower()
+        # 2. COMBINE ACRONYMS
         # Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
         text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
+        # 3. NORMALIZE CURRENCY
         # Convert "eur X" to "X euros" for Whisper normalizer compatibility
         text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
+        # 4. TRUNCATE TRAILING REPEATS
         text = self._truncate_trailing_repeats(text)
+        # 5. STRIP WHITESPACE
         return re.sub(r"\s+", " ", text).strip()
+    def _truncate_trailing_repeats(self, text: str, max_ngram: int = 4) -> str:
+        """Remove trailing repeated n-grams (1-4 words)."""
         words = text.split()
         if len(words) < 2:
             return text
                     break  # Restart from largest n-gram
         return " ".join(words)