Update custom model files, README, and requirements

- .gitattributes +0 -1
- README.md +42 -106
- asr_config.py +6 -6
- asr_modeling.py +14 -30
- asr_pipeline.py +35 -11
- handler.py +8 -60
.gitattributes
CHANGED
@@ -1,4 +1,3 @@
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 tokenizer_config.json -filter -diff -merge text
-tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,123 +1,59 @@
 ---
+license: mit
+language:
+- en
+datasets:
+- speechbrain/LoquaciousSet
+base_model:
+- zai-org/GLM-ASR-Nano-2512
+- Qwen/Qwen3-0.6B
+pipeline_tag: automatic-speech-recognition
 tags:
+- asr
+- speech-recognition
+- audio
+- qwen
+- glm-asr
 ---
 
-<!-- This model card has been generated automatically according to the information the Trainer had access to. You
-should probably proofread and complete it, then remove this comment. -->
-
-It achieves the following results on the evaluation set:
-- Loss: 0.2566
-
-### Training hyperparameters
-
-- eval_batch_size: 16
-- seed: 936
-- optimizer: Use OptimizerNames.ADAMW_TORCH_FUSED with betas=(0.9,0.95) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
-- lr_scheduler_type: cosine
-- lr_scheduler_warmup_steps: 500
-- num_epochs: 1
-
-### Training results
-
-| Training Loss | Epoch | Step | Validation Loss |
-|:-------------:|:-----:|:----:|:---------------:|
-| 0.2888 | 0.0149 | 1000 | 0.2819 |
-| 0.3565 | 0.0298 | 2000 | 0.2919 |
-| 0.3189 | 0.0447 | 3000 | 0.2879 |
-| 0.3274 | 0.0596 | 4000 | 0.2929 |
-| 0.3231 | 0.0745 | 5000 | 0.2870 |
-| 0.3270 | 0.0894 | 6000 | 0.2853 |
-| 0.3486 | 0.1043 | 7000 | 0.2860 |
-| 0.3066 | 0.1192 | 8000 | 0.2865 |
-| 0.3487 | 0.1341 | 9000 | 0.2866 |
-| 0.3307 | 0.1490 | 10000 | 0.2871 |
-| 0.3419 | 0.1639 | 11000 | 0.2852 |
-| 0.3601 | 0.1788 | 12000 | 0.2848 |
-| 0.3156 | 0.1936 | 13000 | 0.2860 |
-| 0.3098 | 0.2085 | 14000 | 0.2830 |
-| 0.3133 | 0.2234 | 15000 | 0.2851 |
-| 0.3269 | 0.2383 | 16000 | 0.2826 |
-| 0.3257 | 0.2532 | 17000 | 0.2822 |
-| 0.3281 | 0.2681 | 18000 | 0.2822 |
-| 0.3941 | 0.2830 | 19000 | 0.2813 |
-| 0.3875 | 0.2979 | 20000 | 0.2854 |
-| 0.3214 | 0.3128 | 21000 | 0.2795 |
-| 0.2914 | 0.3277 | 22000 | 0.2792 |
-| 0.2951 | 0.3426 | 23000 | 0.2805 |
-| 0.3343 | 0.3575 | 24000 | 0.2779 |
-| 0.3252 | 0.3724 | 25000 | 0.2771 |
-| 0.3027 | 0.3873 | 26000 | 0.2768 |
-| 0.3287 | 0.4022 | 27000 | 0.2759 |
-| 0.3208 | 0.4171 | 28000 | 0.2749 |
-| 0.3402 | 0.4320 | 29000 | 0.2730 |
-| 0.2928 | 0.4469 | 30000 | 0.2726 |
-| 0.3085 | 0.4618 | 31000 | 0.2737 |
-| 0.3073 | 0.4767 | 32000 | 0.2705 |
-| 0.3471 | 0.4916 | 33000 | 0.2708 |
-| 0.2945 | 0.5065 | 34000 | 0.2690 |
-| 0.3294 | 0.5214 | 35000 | 0.2696 |
-| 0.3095 | 0.5363 | 36000 | 0.2679 |
-| 0.3152 | 0.5512 | 37000 | 0.2659 |
-| 0.3035 | 0.5660 | 38000 | 0.2674 |
-| 0.3342 | 0.5809 | 39000 | 0.2656 |
-| 0.3242 | 0.5958 | 40000 | 0.2653 |
-| 0.2789 | 0.6107 | 41000 | 0.2643 |
-| 0.3082 | 0.6256 | 42000 | 0.2643 |
-| 0.3174 | 0.6405 | 43000 | 0.2633 |
-| 0.2730 | 0.6554 | 44000 | 0.2628 |
-| 0.2934 | 0.6703 | 45000 | 0.2609 |
-| 0.2944 | 0.6852 | 46000 | 0.2606 |
-| 0.3111 | 0.7001 | 47000 | 0.2614 |
-| 0.3431 | 0.7150 | 48000 | 0.2605 |
-| 0.3226 | 0.7299 | 49000 | 0.2601 |
-| 0.2735 | 0.7448 | 50000 | 0.2591 |
-| 0.3208 | 0.7597 | 51000 | 0.2590 |
-| 0.3208 | 0.7746 | 52000 | 0.2584 |
-| 0.3021 | 0.7895 | 53000 | 0.2578 |
-| 0.2730 | 0.8044 | 54000 | 0.2583 |
-| 0.2938 | 0.8193 | 55000 | 0.2581 |
-| 0.2894 | 0.8342 | 56000 | 0.2574 |
-| 0.2781 | 0.8491 | 57000 | 0.2572 |
-| 0.3003 | 0.8640 | 58000 | 0.2568 |
-| 0.2719 | 0.8789 | 59000 | 0.2568 |
-| 0.2878 | 0.8938 | 60000 | 0.2567 |
-| 0.3058 | 0.9087 | 61000 | 0.2568 |
-| 0.3036 | 0.9236 | 62000 | 0.2568 |
-| 0.3050 | 0.9384 | 63000 | 0.2568 |
-| 0.3244 | 0.9533 | 64000 | 0.2567 |
-| 0.3187 | 0.9682 | 65000 | 0.2566 |
-| 0.3016 | 0.9831 | 66000 | 0.2566 |
-| 0.2697 | 0.9980 | 67000 | 0.2566 |
-
-### Framework versions
-
-- Transformers 5.0.0.dev0
-- Pytorch 2.8.0+cu128
-- Datasets 3.6.0
-- Tokenizers 0.22.1
+# Tiny Audio
+
+A speech recognition model trained in 24 hours on a single GPU for ~$12. Built with [Tiny Audio](https://github.com/alexkroman/tiny-audio)—a minimal, hackable ASR framework.
+
+## Architecture
+
+```
+Audio (16kHz) → GLM-ASR Encoder (frozen) → MLP Projector (trained) → Qwen3 (frozen) → Text
+```
+
+Only the projector is trained (~12M params). The encoder and decoder remain frozen.
+
+## Training
+
+| | |
+|---|---|
+| **Dataset** | LoquaciousSet (25,000 hours) |
+| **Hardware** | Single NVIDIA A40 |
+| **Time** | ~24 hours |
+| **Cost** | ~$12 |
+
+## Usage
+
+```python
+from transformers import pipeline
+
+pipe = pipeline("automatic-speech-recognition", model="mazesmazes/tiny-audio", trust_remote_code=True)
+result = pipe("audio.wav")
+print(result["text"])
+```
+
+## Limitations
+
+- English only
+- 16kHz audio (other sample rates resampled automatically)
+- May degrade on accented speech, noisy audio, or domain-specific terms
+
+## Links
+
+- [Train your own](https://github.com/alexkroman/tiny-audio)
+- [Free 3.5-hour course](https://github.com/alexkroman/tiny-audio/blob/main/docs/course/0-course-overview.md)
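The new README compresses the whole design into one line: only the projector between the frozen GLM-ASR encoder and frozen Qwen3 decoder learns. As a rough illustration of that trainable piece, here is a minimal two-layer MLP projector sketch; the class name, layer sizes, and activation are assumptions for illustration, not the repo's actual `projectors.py` implementation.

```python
import torch
import torch.nn as nn


class MLPProjector(nn.Module):
    """Hypothetical MLP bridging audio-encoder states to the LM embedding space."""

    def __init__(self, audio_dim: int = 1024, text_dim: int = 1024):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(audio_dim, 2 * text_dim),
            nn.GELU(),
            nn.Linear(2 * text_dim, text_dim),
        )

    def forward(self, audio_embeds: torch.Tensor) -> torch.Tensor:
        # (batch, audio_frames, audio_dim) -> (batch, audio_frames, text_dim)
        return self.net(audio_embeds)


# Parameter count stays in the low millions at these (assumed) dimensions,
# in the same ballpark as the README's "~12M params" for the real projector.
print(sum(p.numel() for p in MLPProjector().parameters()))
```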
asr_config.py
CHANGED
@@ -7,8 +7,8 @@ class ASRConfig(transformers.PretrainedConfig):
     """Configuration class for the ASR model.
 
     This config combines settings for:
-    - Audio encoder (Whisper)
-    - Text decoder (
+    - Audio encoder (GLM-ASR/Whisper)
+    - Text decoder (Qwen)
     - Projector (MLP, MOSA, MoE, QFormer)
     - Generation parameters
     - Training options (SpecAugment, LoRA)
@@ -19,8 +19,8 @@ class ASRConfig(transformers.PretrainedConfig):
 
     def __init__(
         self,
-        audio_model_id: str = "
-        text_model_id: str = "
+        audio_model_id: str = "zai-org/GLM-ASR-Nano-2512",
+        text_model_id: str = "Qwen/Qwen3-0.6B",
         attn_implementation: str = "flash_attention_2",
         model_dtype: str = "bfloat16",
         num_beams: Optional[int] = None,
@@ -74,8 +74,8 @@ class ASRConfig(transformers.PretrainedConfig):
         """Initialize ASR model configuration.
 
         Args:
-            audio_model_id: HuggingFace model ID for audio encoder (Whisper)
-            text_model_id: HuggingFace model ID for text decoder (
+            audio_model_id: HuggingFace model ID for audio encoder (GLM-ASR/Whisper)
+            text_model_id: HuggingFace model ID for text decoder (Qwen)
            attn_implementation: Attention implementation ("flash_attention_2", "sdpa", "eager")
            model_dtype: Model dtype ("bfloat16", "float16", "float32")
            projector_type: Projector architecture ("mlp", "mosa", "moe", "qformer")
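To confirm the retargeted defaults, the config can be instantiated directly. A minimal sketch, assuming `asr_config.py` is importable from a local checkout:

```python
from asr_config import ASRConfig

# A fresh config picks up the new encoder/decoder defaults from this commit
config = ASRConfig()
print(config.audio_model_id)  # zai-org/GLM-ASR-Nano-2512
print(config.text_model_id)   # Qwen/Qwen3-0.6B

# Any documented field can still be overridden, as with any PretrainedConfig
config = ASRConfig(attn_implementation="sdpa", model_dtype="float32")
```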
asr_modeling.py
CHANGED
@@ -24,26 +24,7 @@ except ImportError:
     from projectors import PROJECTOR_CLASSES  # type: ignore[no-redef]
 
 
-from torchaudio.transforms import FrequencyMasking, TimeMasking
-
-
-def apply_specaugment(
-    x: torch.Tensor,
-    num_time_masks: int = 2,
-    time_mask_length: int = 10,
-    num_freq_masks: int = 0,
-    freq_mask_length: int = 10,
-) -> torch.Tensor:
-    """Apply SpecAugment using torchaudio. Input shape: (batch, n_mels, time)."""
-    if num_time_masks > 0:
-        tm = TimeMasking(time_mask_param=time_mask_length, iid_masks=True)
-        for _ in range(num_time_masks):
-            x = tm(x)
-    if num_freq_masks > 0:
-        fm = FrequencyMasking(freq_mask_param=freq_mask_length, iid_masks=True)
-        for _ in range(num_freq_masks):
-            x = fm(x)
-    return x
+from torchaudio.transforms import SpecAugment
 
 
 class ASRModel(PreTrainedModel, GenerationMixin):
@@ -192,6 +173,17 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         if getattr(config, "freeze_projector", False):
             self.projector.requires_grad_(False)
 
+        # SpecAugment for data augmentation during training
+        if getattr(config, "use_specaugment", False):
+            self.spec_augment = SpecAugment(
+                n_time_masks=config.num_time_masks,
+                time_mask_param=config.time_mask_length,
+                n_freq_masks=config.num_freq_masks,
+                freq_mask_param=config.freq_mask_length,
+            )
+        else:
+            self.spec_augment = None
+
         # For model parallelism
         self._no_split_modules = getattr(self.language_model, "_no_split_modules", [])
 
@@ -230,8 +222,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
         full_model.language_model = None
         full_model.multi_modal_projector = None
         del full_model
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
     else:
         encoder = AutoModel.from_pretrained(config.audio_model_id, **encoder_kwargs)
 
@@ -463,14 +453,8 @@ class ASRModel(PreTrainedModel, GenerationMixin):
 
         if input_features is not None and input_ids is not None:
             # Apply SpecAugment during training if enabled
-            if self.training and self.config.use_specaugment:
-                input_features = apply_specaugment(
-                    input_features,
-                    num_time_masks=self.config.num_time_masks,
-                    time_mask_length=self.config.time_mask_length,
-                    num_freq_masks=self.config.num_freq_masks,
-                    freq_mask_length=self.config.freq_mask_length,
-                )
+            if self.training and self.spec_augment is not None:
+                input_features = self.spec_augment(input_features)
 
             # Encode audio -> flattened (total_audio_tokens, hidden_dim)
             audio_embeds = self._encode_audio(input_features, audio_attention_mask)
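This change swaps the hand-rolled masking loop for torchaudio's built-in `SpecAugment` module (available in torchaudio ≥ 2.1), constructed once in `__init__` and applied only while `self.training` is true. A standalone sketch of the new call path, with example mask settings rather than the repo's config values:

```python
import torch
from torchaudio.transforms import SpecAugment

# Same knobs the model now forwards from its config (values here are examples)
spec_augment = SpecAugment(
    n_time_masks=2,
    time_mask_param=10,
    n_freq_masks=0,
    freq_mask_param=10,
)

features = torch.randn(4, 128, 3000)  # (batch, n_mels, time), as in the old docstring
masked = spec_augment(features)
print(masked.shape)  # shape is preserved; masked time steps are zeroed
```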
asr_pipeline.py
CHANGED
@@ -14,6 +14,15 @@ except ImportError:
     from asr_modeling import ASRModel  # type: ignore[no-redef]
 
 
+def _get_device() -> str:
+    """Get best available device for non-transformers models."""
+    if torch.cuda.is_available():
+        return "cuda"
+    if torch.backends.mps.is_available():
+        return "mps"
+    return "cpu"
+
+
 class ForcedAligner:
     """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2."""
 
@@ -66,7 +75,7 @@ class ForcedAligner:
         import torchaudio
         from torchaudio.functional import forced_align, merge_tokens
 
-        device =
+        device = _get_device()
         model, labels, dictionary = cls.get_instance(device)
 
         # Convert audio to tensor (copy to ensure array is writable)
@@ -179,11 +188,8 @@ class SpeakerDiarizer:
             "pyannote/speaker-diarization-3.1",
         )
 
-        # Move to
-        if torch.cuda.is_available():
-            cls._pipeline.to(torch.device("cuda"))
-        elif torch.backends.mps.is_available():
-            cls._pipeline.to(torch.device("mps"))
+        # Move to best available device
+        cls._pipeline.to(torch.device(_get_device()))
 
         return cls._pipeline
 
@@ -539,9 +545,18 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
     HALLUCINATION_PATTERNS = frozenset(
         [
             "and gt and gt",
+            "n",  # Single character noise
         ]
     )
 
+    # Regex patterns for hallucinations (compiled for efficiency)
+    HALLUCINATION_REGEXES = [
+        # Repeating decimal hallucinations (e.g., "12.93242424242424")
+        re.compile(r"\d+\.\d*?(\d{2,})\1{3,}"),
+        # Very long repeated digit sequences (e.g., "242424242424")
+        re.compile(r"(\d{2,})\1{4,}"),
+    ]
+
     def _post_process_prediction(self, text: str) -> str:
         """Post-process model output to fix common issues."""
         if not text:
@@ -554,21 +569,30 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
         if text.strip() in self.HALLUCINATION_PATTERNS:
             return ""
 
-        # 3.
+        # 3. CHECK FOR REGEX-BASED HALLUCINATIONS
+        for pattern in self.HALLUCINATION_REGEXES:
+            if pattern.search(text):
+                # If hallucination is the entire output, return empty
+                if pattern.fullmatch(text.strip()):
+                    return ""
+                # Otherwise remove the hallucinated portion
+                text = pattern.sub("", text)
+
+        # 4. COMBINE ACRONYMS
         # Merge consecutive single letters into one word (e.g., "u s a" -> "usa")
         text = re.sub(r"\b([a-z])((?:\s+[a-z])+)\b", lambda m: m.group(0).replace(" ", ""), text)
 
-        #
+        # 5. NORMALIZE CURRENCY
         # Convert "eur X" to "X euros" for Whisper normalizer compatibility
         text = re.sub(r"\beur\s+(\d+)", r"\1 euros", text)
 
-        #
+        # 6. TRUNCATE CHARACTER REPETITIONS (e.g., "uhhhhhh" -> "uhh")
         text = self._truncate_character_repetitions(text)
 
-        #
+        # 7. TRUNCATE TRAILING REPEATS (word-level)
         text = self._truncate_trailing_repeats(text)
 
-        #
+        # 8. STRIP WHITESPACE
         return re.sub(r"\s+", " ", text).strip()
 
     def _truncate_trailing_repeats(self, text: str, max_ngram: int = 10) -> str:
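The two new compiled patterns rely on backreferences: `(\d{2,})\1{4,}` matches a digit group of length ≥ 2 repeated at least five times in a row, which is the signature of the digit-loop hallucinations. This standalone snippet (not part of the repo) replays the examples from the inline comments:

```python
import re

repeating_decimal = re.compile(r"\d+\.\d*?(\d{2,})\1{3,}")
repeated_digits = re.compile(r"(\d{2,})\1{4,}")

# The backreference \1 catches a 2+ digit group repeating itself
assert repeating_decimal.search("12.93242424242424")
assert repeated_digits.fullmatch("242424242424")  # entire output -> dropped
assert not repeated_digits.search("call me at 20 24 20 24")  # spaces break the repeat
```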
handler.py
CHANGED
@@ -2,8 +2,6 @@
 
 from typing import Any, Dict, List, Union
 
-import torch
-
 try:
     # For remote execution, imports are relative
     from .asr_modeling import ASRModel
@@ -35,35 +33,21 @@ class EndpointHandler:
 
         os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
 
-        #
-        # Also beneficial for T4 (Turing) which supports TensorFloat-32
-        torch.backends.cuda.matmul.allow_tf32 = True
-        torch.backends.cudnn.allow_tf32 = True
-
-        # Set device and dtype
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-
-        # Use float16 for better T4 compatibility (bfloat16 not well supported on T4)
-        # T4 has excellent float16 performance with tensor cores
-        self.dtype = torch.float16 if self.device == "cuda" else torch.float32
-
-        # Enable CUDA optimizations
-        if torch.cuda.is_available():
-            torch.backends.cudnn.benchmark = True
-
-        # Prepare model kwargs for pipeline
+        # Prepare model kwargs - let transformers handle device placement
         model_kwargs = {
-            "
+            "device_map": "auto",
+            "torch_dtype": "auto",
             "low_cpu_mem_usage": True,
         }
-        if
-            model_kwargs["attn_implementation"] = (
-                "flash_attention_2" if self._is_flash_attn_available() else "sdpa"
-            )
+        if self._is_flash_attn_available():
+            model_kwargs["attn_implementation"] = "flash_attention_2"
 
         # Load model (this loads the model, tokenizer, and feature extractor)
         self.model = ASRModel.from_pretrained(path, **model_kwargs)
 
+        # Get device from model for pipeline
+        self.device = next(self.model.parameters()).device
+
         # Instantiate custom pipeline - it will get feature_extractor and tokenizer from model
         self.pipe = ASRPipeline(
             model=self.model,
@@ -72,48 +56,12 @@ class EndpointHandler:
             device=self.device,
         )
 
-        # Apply torch.compile if enabled (after model is loaded by pipeline)
-        # Use "default" mode for T4 - better compatibility than "reduce-overhead"
-        # "reduce-overhead" is better for A100+ but can be slower on older GPUs
-        if torch.cuda.is_available() and os.getenv("ENABLE_TORCH_COMPILE", "1") == "1":
-            compile_mode = os.getenv("TORCH_COMPILE_MODE", "default")
-            self.model = torch.compile(self.model, mode=compile_mode)
-            self.pipe.model = self.model
-
-        # Warmup the model to trigger compilation and optimize kernels
-        if torch.cuda.is_available():
-            self._warmup()
-
     def _is_flash_attn_available(self):
         """Check if flash attention is available."""
         import importlib.util
 
         return importlib.util.find_spec("flash_attn") is not None
 
-    def _warmup(self):
-        """Warmup to trigger model compilation and allocate GPU memory."""
-        try:
-            # Create dummy audio (1 second at config sample rate)
-            sample_rate = self.pipe.model.config.audio_sample_rate
-            dummy_audio = torch.randn(sample_rate, dtype=torch.float32)
-
-            # Run inference to trigger torch.compile and kernel optimization
-            with torch.inference_mode():
-                warmup_tokens = self.pipe.model.config.inference_warmup_tokens
-                _ = self.pipe(
-                    {"raw": dummy_audio, "sampling_rate": sample_rate},
-                    max_new_tokens=warmup_tokens,
-                )
-
-            # Force CUDA synchronization to ensure kernels are compiled
-            if torch.cuda.is_available():
-                torch.cuda.synchronize()
-                # Clear cache after warmup to free memory
-                torch.cuda.empty_cache()
-
-        except Exception as e:
-            print(f"Warmup skipped due to: {e}")
-
     def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
         """Process an inference request.
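With the TF32 tuning, torch.compile, and warmup paths removed, the handler reduces to load-then-serve, deferring device and dtype selection to `device_map="auto"` and `torch_dtype="auto"`. A hypothetical local smoke test; the `{"inputs": ...}` payload shape follows the common Inference Endpoints convention and is an assumption about this handler's exact schema:

```python
import base64

from handler import EndpointHandler

# Device/dtype are now resolved inside from_pretrained via device_map="auto"
handler = EndpointHandler(path=".")

with open("audio.wav", "rb") as f:
    payload = {"inputs": base64.b64encode(f.read()).decode("utf-8")}

print(handler(payload))
```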