Update custom model files, README, and requirements

Browse files

Files changed (9) hide show

.gitattributes +2 -34
README.md +267 -0
asr_config.py +225 -0
asr_modeling.py +801 -0
asr_pipeline.py +421 -0
asr_processing.py +130 -0
handler.py +81 -0
projectors.py +483 -0
requirements.txt +5 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,3 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.safetensors filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+tokenizer_config.json -filter -diff -merge text

README.md ADDED Viewed

	@@ -0,0 +1,267 @@

+---
+license: mit
+language:
+- en
+datasets:
+- speechbrain/LoquaciousSet
+base_model:
+- zai-org/GLM-ASR-Nano-2512
+- Qwen/Qwen3-0.6B
+pipeline_tag: automatic-speech-recognition
+tags:
+- asr
+- speech-recognition
+- audio
+- qwen
+- glm-asr
+library_name: transformers
+---
+# Tiny Audio
+A speech recognition model trained in 24 hours on a single GPU for ~$12. Built with [Tiny Audio](https://github.com/alexkroman/tiny-audio)—a minimal, hackable ASR framework.
+## Quick Start
+```python
+from transformers import pipeline
+pipe = pipeline("automatic-speech-recognition", model="mazesmazes/tiny-audio", trust_remote_code=True)
+result = pipe("audio.wav")
+print(result["text"])
+```
+## Usage Examples
+### Basic Transcription
+```python
+from transformers import pipeline
+pipe = pipeline("automatic-speech-recognition", model="mazesmazes/tiny-audio", trust_remote_code=True)
+# From file
+result = pipe("audio.wav")
+print(result["text"])
+# From URL
+result = pipe("https://example.com/audio.mp3")
+# From numpy array (must be 16kHz)
+import numpy as np
+audio = np.random.randn(16000).astype(np.float32)  # 1 second
+result = pipe(audio)
+```
+### Batch Processing
+```python
+# Process multiple files
+files = ["audio1.wav", "audio2.wav", "audio3.wav"]
+results = pipe(files, batch_size=4)
+for r in results:
+    print(r["text"])
+```
+### Word-Level Timestamps
+```python
+result = pipe("audio.wav", return_timestamps="word")
+# Returns:
+# {
+#   "text": "hello world",
+#   "chunks": [
+#     {"text": "hello", "timestamp": (0.0, 0.5)},
+#     {"text": "world", "timestamp": (0.6, 1.0)}
+#   ]
+# }
+```
+### Streaming Inference
+```python
+from tiny_audio import ASRModel, ASRProcessor
+import torch
+model = ASRModel.from_pretrained("mazesmazes/tiny-audio")
+processor = ASRProcessor.from_pretrained("mazesmazes/tiny-audio")
+# Load and process audio
+import librosa
+audio, sr = librosa.load("audio.wav", sr=16000)
+inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+# Stream tokens
+for token in model.generate_streaming(inputs["input_features"]):
+    print(token, end="", flush=True)
+```
+### Using with torch directly
+```python
+from tiny_audio import ASRModel, ASRProcessor
+import torch
+import librosa
+# Load model and processor
+model = ASRModel.from_pretrained("mazesmazes/tiny-audio")
+processor = ASRProcessor.from_pretrained("mazesmazes/tiny-audio")
+# Load audio (16kHz)
+audio, sr = librosa.load("audio.wav", sr=16000)
+# Process
+inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+# Generate
+with torch.no_grad():
+    output = model.generate(
+        input_features=inputs["input_features"],
+        attention_mask=inputs["attention_mask"],
+        max_new_tokens=256
+    )
+# Decode
+text = processor.batch_decode(output, skip_special_tokens=True)[0]
+print(text)
+```
+### GPU Inference
+```python
+import torch
+from transformers import pipeline
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model="mazesmazes/tiny-audio",
+    trust_remote_code=True,
+    device="cuda"  # or device=0
+)
+```
+### Half Precision
+```python
+import torch
+from transformers import pipeline
+pipe = pipeline(
+    "automatic-speech-recognition",
+    model="mazesmazes/tiny-audio",
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    device="cuda"
+)
+```
+## Architecture
+```
+Audio (16kHz) → GLM-ASR Encoder (frozen) → MLP Projector (trained) → Qwen3 (frozen) → Text
+```
+Only the projector is trained (~12M params). The encoder and decoder remain frozen, leveraging their pretrained knowledge.
+| Component | Model | Parameters | Status |
+|-----------|-------|------------|--------|
+| Audio Encoder | GLM-ASR-Nano-2512 | ~600M | Frozen |
+| Projector | 2-layer MLP | ~12M | Trained |
+| Language Model | Qwen3-0.6B | ~600M | Frozen |
+### How It Works
+1. **Audio Encoder**: GLM-ASR converts 16kHz audio into frame-level embeddings (768-dim)
+2. **Projector**: A 2-layer MLP with frame stacking bridges the audio and text embedding spaces
+3. **Language Model**: Qwen3 generates text autoregressively, conditioned on the projected audio
+The projector reduces sequence length via frame stacking: `output_len = (input_len - 5) // 5 + 1`
+## Model Specifications
+| Specification | Value |
+|---------------|-------|
+| Input | Audio (16kHz mono) |
+| Output | Text transcription |
+| Max Audio Length | ~30 seconds (limited by encoder) |
+| Vocabulary | Qwen3 tokenizer |
+| Languages | English only |
+| Generation | Greedy decoding (num_beams=1, do_sample=False) |
+## Training Details
+| | |
+|---|---|
+| **Dataset** | LoquaciousSet (25,000 hours) |
+| **Hardware** | Single NVIDIA A40 |
+| **Time** | ~24 hours |
+| **Cost** | ~$12 |
+| **Optimizer** | AdamW |
+| **Learning Rate** | 1e-4 |
+| **Batch Size** | 4 |
+| **Steps** | 50,000 |
+## Limitations
+- **English only**: Not trained on other languages
+- **Sample rate**: Expects 16kHz audio (other rates resampled automatically)
+- **Audio length**: Best for clips under 30 seconds
+- **Accuracy**: May degrade on:
+  - Heavily accented speech
+  - Noisy or low-quality audio
+  - Domain-specific terminology
+  - Overlapping speakers
+- **No punctuation**: Output is lowercase without punctuation by default
+## Requirements
+```
+transformers>=4.40.0
+torch>=2.0.0
+torchaudio>=2.0.0
+```
+Optional for streaming:
+```
+librosa
+soundfile
+```
+## Files
+| File | Description |
+|------|-------------|
+| `config.json` | Model configuration |
+| `model.safetensors` | Projector weights (~48MB) |
+| `preprocessor_config.json` | Audio preprocessing config |
+| `tokenizer.json` | Tokenizer |
+| `tokenizer_config.json` | Tokenizer config |
+| `special_tokens_map.json` | Special tokens |
+Note: Only the projector weights are stored. The encoder (GLM-ASR) and decoder (Qwen3) are loaded from their respective HuggingFace repos.
+## Citation
+If you use this model, please cite:
+```bibtex
+@misc{tinyaudio2024,
+  author = {Alex Kroman},
+  title = {Tiny Audio: Minimal ASR Training},
+  year = {2024},
+  publisher = {GitHub},
+  url = {https://github.com/alexkroman/tiny-audio}
+}
+```
+## Links
+- [GitHub Repository](https://github.com/alexkroman/tiny-audio) - Train your own model
+- [Free 3.5-hour Course](https://github.com/alexkroman/tiny-audio/blob/main/docs/course/0-course-overview.md) - Learn ASR from scratch
+- [Live Demo](https://huggingface.co/spaces/mazesmazes/tiny-audio) - Try it in your browser
+## Acknowledgments
+- [GLM-ASR](https://huggingface.co/zai-org/GLM-ASR-Nano-2512) for the audio encoder
+- [Qwen3](https://huggingface.co/Qwen/Qwen3-0.6B) for the language model
+- [LoquaciousSet](https://huggingface.co/datasets/speechbrain/LoquaciousSet) for training data
+## License
+MIT

asr_config.py ADDED Viewed

	@@ -0,0 +1,225 @@

+from typing import Optional
+import transformers
+class ASRConfig(transformers.PretrainedConfig):
+    """Configuration class for the ASR model.
+    This config combines settings for:
+    - Audio encoder (GLM-ASR/Whisper)
+    - Text decoder (Qwen)
+    - Projector (MLP, MOSA, MoE, QFormer)
+    - Generation parameters
+    - Training options (SpecAugment, LoRA)
+    """
+    model_type = "asr_model"
+    is_composition = True
+    def __init__(
+        self,
+        audio_model_id: str = "zai-org/GLM-ASR-Nano-2512",
+        text_model_id: str = "Qwen/Qwen3-0.6B",
+        attn_implementation: str = "flash_attention_2",
+        model_dtype: str = "bfloat16",
+        num_beams: Optional[int] = None,
+        system_prompt: str = "You are a helpful assistant.",
+        encoder_dim: Optional[int] = None,
+        llm_dim: Optional[int] = None,
+        # Encoder conv layers: list of (padding, kernel_size, stride) tuples
+        # Default is Whisper/GLM-ASR structure: conv1(k=3,s=1,p=1) + conv2(k=3,s=2,p=1)
+        encoder_conv_layers: Optional[list] = None,
+        audio_sample_rate: int = 16000,
+        projector_pool_stride: int = 4,
+        downsample_rate: int = 5,  # Granite default
+        projector_hidden_dim: Optional[int] = None,
+        projector_type: str = "mlp",  # "mlp", "mosa", "moe", "qformer"
+        projector_num_layers: int = 2,  # Number of layers in MLP projector
+        projector_init_std: float = 0.02,  # Weight initialization std
+        projector_dropout: float = 0.0,  # Dropout rate for projector layers
+        # MoE-specific configuration
+        num_experts: int = 4,  # Number of experts in MoE projectors
+        num_experts_per_tok: int = 2,  # Top-k experts per token
+        router_aux_loss_coef: float = 0.01,  # Auxiliary loss coefficient for load balancing
+        # QFormer-specific configuration (Granite defaults)
+        qformer_window_size: int = 15,  # Window size for QFormer processing
+        qformer_hidden_size: Optional[int] = None,  # QFormer hidden size (defaults to encoder_dim)
+        qformer_num_layers: int = 2,  # Number of QFormer transformer layers
+        qformer_num_heads: int = 16,  # Number of attention heads in QFormer
+        qformer_intermediate_size: Optional[int] = None,  # FFN size (defaults to 4x hidden)
+        label_smoothing: float = 0.0,  # Label smoothing for cross-entropy loss
+        inference_warmup_tokens: int = 10,
+        # SpecAugment settings
+        use_specaugment: bool = False,
+        num_time_masks: int = 2,
+        time_mask_length: int = 10,
+        num_freq_masks: int = 0,
+        freq_mask_length: int = 10,
+        # LoRA configuration (for Stage 2 fine-tuning)
+        use_lora: bool = False,
+        lora_rank: int = 8,  # SALMONN default
+        lora_alpha: int = 32,  # SALMONN default (scaling factor 4.0)
+        lora_dropout: float = 0.0,
+        lora_target_modules: Optional[list] = None,  # Default: all linear layers
+        freeze_projector: bool = False,  # True for Stage 2 (LoRA-only training)
+        max_new_tokens: Optional[int] = None,
+        min_new_tokens: Optional[int] = None,
+        repetition_penalty: Optional[float] = None,
+        length_penalty: Optional[float] = None,
+        no_repeat_ngram_size: Optional[int] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs,
+    ):
+        """Initialize ASR model configuration.
+        Args:
+            audio_model_id: HuggingFace model ID for audio encoder (GLM-ASR/Whisper)
+            text_model_id: HuggingFace model ID for text decoder (Qwen)
+            attn_implementation: Attention implementation ("flash_attention_2", "sdpa", "eager")
+            model_dtype: Model dtype ("bfloat16", "float16", "float32")
+            projector_type: Projector architecture ("mlp", "mosa", "moe", "qformer")
+            use_lora: Enable LoRA adapters for Stage 2 fine-tuning
+            use_specaugment: Enable SpecAugment data augmentation
+        """
+        # Set default generation parameters (greedy decoding only)
+        generation_defaults = {
+            "num_beams": 1,
+            "max_new_tokens": 128,
+            "min_new_tokens": 0,
+            "repetition_penalty": 1.0,
+            "length_penalty": 1.0,
+            "no_repeat_ngram_size": 0,  # Prevent repeating 3-grams like "so so so"
+            "use_cache": True,
+        }
+        # Apply defaults (config.json values take precedence)
+        kwargs = {**generation_defaults, **kwargs}
+        self.audio_model_id = audio_model_id
+        self.text_model_id = text_model_id
+        self.attn_implementation = attn_implementation
+        self.model_dtype = model_dtype
+        self.system_prompt = system_prompt
+        self.encoder_dim = encoder_dim
+        self.llm_dim = llm_dim
+        # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
+        self.encoder_conv_layers = encoder_conv_layers or [(1, 3, 1), (1, 3, 2)]
+        self.audio_sample_rate = audio_sample_rate
+        self.projector_init_std = projector_init_std
+        self.projector_pool_stride = projector_pool_stride
+        self.downsample_rate = downsample_rate
+        self.projector_hidden_dim = projector_hidden_dim
+        self.projector_type = projector_type
+        self.projector_num_layers = projector_num_layers
+        self.projector_dropout = projector_dropout
+        # MoE-specific configuration
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.router_aux_loss_coef = router_aux_loss_coef
+        # QFormer-specific configuration
+        self.qformer_window_size = qformer_window_size
+        self.qformer_hidden_size = qformer_hidden_size
+        self.qformer_num_layers = qformer_num_layers
+        self.qformer_num_heads = qformer_num_heads
+        self.qformer_intermediate_size = qformer_intermediate_size
+        self.label_smoothing = label_smoothing
+        self.inference_warmup_tokens = inference_warmup_tokens
+        # SpecAugment configuration
+        self.use_specaugment = use_specaugment
+        self.num_time_masks = num_time_masks
+        self.time_mask_length = time_mask_length
+        self.num_freq_masks = num_freq_masks
+        self.freq_mask_length = freq_mask_length
+        # LoRA configuration
+        self.use_lora = use_lora
+        self.lora_rank = lora_rank
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.lora_target_modules = lora_target_modules or [
+            "q_proj",
+            "k_proj",
+            "v_proj",
+            "o_proj",
+            "gate_proj",
+            "up_proj",
+            "down_proj",
+        ]
+        self.freeze_projector = freeze_projector
+        # Generation parameters (use explicit value if provided, else use default)
+        self.num_beams = num_beams if num_beams is not None else generation_defaults["num_beams"]
+        self.max_new_tokens = (
+            max_new_tokens if max_new_tokens is not None else generation_defaults["max_new_tokens"]
+        )
+        self.min_new_tokens = (
+            min_new_tokens if min_new_tokens is not None else generation_defaults["min_new_tokens"]
+        )
+        self.repetition_penalty = (
+            repetition_penalty
+            if repetition_penalty is not None
+            else generation_defaults["repetition_penalty"]
+        )
+        self.length_penalty = (
+            length_penalty if length_penalty is not None else generation_defaults["length_penalty"]
+        )
+        self.no_repeat_ngram_size = (
+            no_repeat_ngram_size
+            if no_repeat_ngram_size is not None
+            else generation_defaults["no_repeat_ngram_size"]
+        )
+        self.use_cache = use_cache if use_cache is not None else generation_defaults["use_cache"]
+        if "audio_config" not in kwargs:
+            self.audio_config = transformers.AutoConfig.from_pretrained(audio_model_id)
+            # Override dtype to match model_dtype
+            self.audio_config.dtype = model_dtype
+        else:
+            self.audio_config = kwargs.pop("audio_config")
+        if "text_config" not in kwargs:
+            self.text_config = transformers.AutoConfig.from_pretrained(
+                text_model_id, trust_remote_code=True
+            )
+            # Override dtype to match model_dtype
+            self.text_config.dtype = model_dtype
+        else:
+            self.text_config = kwargs.pop("text_config")
+        if isinstance(self.text_config, dict):
+            # Reconstruct config from dict using the model_type stored in the dict
+            model_type = self.text_config["model_type"]
+            config_class = transformers.AutoConfig.for_model(model_type).__class__
+            self.text_config = config_class(**self.text_config)
+        if isinstance(self.audio_config, dict):
+            model_type = self.audio_config.get("model_type")
+            if model_type:
+                config_class = transformers.AutoConfig.for_model(model_type).__class__
+                self.audio_config = config_class(**self.audio_config)
+        super().__init__(**kwargs)
+        # Point encoder to audio_config so pipeline uses correct feature extractor
+        # The pipeline looks for config.encoder._name_or_path for feature extractor
+        self.encoder = self.audio_config
+        self.auto_map = {
+            "AutoConfig": "asr_config.ASRConfig",
+            "AutoModel": "asr_modeling.ASRModel",
+            "AutoModelForSpeechSeq2Seq": "asr_modeling.ASRModel",
+            "AutoProcessor": "asr_processing.ASRProcessor",
+        }
+        self.custom_pipelines = {
+            "automatic-speech-recognition": {
+                "impl": "asr_pipeline.ASRPipeline",
+                "pt": ["AutoModelForSpeechSeq2Seq"],
+                "tf": [],
+                "type": "audio",
+            }
+        }
+        self.architectures = ["ASRModel"]
+        self.pipeline_tag = "automatic-speech-recognition"
+transformers.AutoConfig.register("asr_model", ASRConfig)

asr_modeling.py ADDED Viewed

	@@ -0,0 +1,801 @@

+import json
+from pathlib import Path
+from threading import Thread
+from typing import Iterator, Optional, Union
+import torch
+import torch.nn as nn
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    PreTrainedModel,
+    TextIteratorStreamer,
+)
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithPast
+try:
+    from .asr_config import ASRConfig
+    from .projectors import PROJECTOR_CLASSES
+except ImportError:
+    from asr_config import ASRConfig  # type: ignore[no-redef]
+    from projectors import PROJECTOR_CLASSES  # type: ignore[no-redef]
+from torchaudio.transforms import SpecAugment
+class ASRModel(PreTrainedModel, GenerationMixin):
+    """Audio-to-text model combining an audio encoder, projector, and language model."""
+    config_class = ASRConfig
+    base_model_prefix = "model"
+    main_input_name = "input_features"
+    _supports_flash_attn_2 = True
+    supports_gradient_checkpointing = True
+    _is_loading_from_pretrained: bool = False
+    _pretrained_model_path: Optional[str] = None
+    TRANSCRIBE_PROMPT = "Please transcribe this audio into text: "
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> "ASRModel":
+        """Load model from pretrained, handling device placement correctly."""
+        from safetensors.torch import load_file
+        from transformers.utils.hub import cached_file
+        config = kwargs.pop("config", None)
+        if config is None:
+            config = ASRConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        # Set flag to avoid device_map="auto" in sub-model loaders
+        cls._is_loading_from_pretrained = True
+        cls._pretrained_model_path = pretrained_model_name_or_path
+        try:
+            model = cls(config, **kwargs)
+            # Load projector weights from safetensors
+            subfolder = kwargs.get("subfolder")
+            revision = kwargs.get("revision")
+            cache_kwargs = {}
+            if subfolder:
+                cache_kwargs["subfolder"] = subfolder
+            if revision:
+                cache_kwargs["revision"] = revision
+            model_file = cached_file(
+                pretrained_model_name_or_path,
+                "model.safetensors",
+                _raise_exceptions_for_missing_entries=False,
+                **cache_kwargs,
+            )
+            if model_file is not None:
+                state_dict = load_file(model_file)
+                model.load_state_dict(state_dict, strict=False)
+            # Load LoRA adapters if use_lora is enabled
+            if getattr(config, "use_lora", False):
+                # Check for adapter_config.json (required by PEFT to load adapters)
+                adapter_config_file = cached_file(
+                    pretrained_model_name_or_path,
+                    "adapter_config.json",
+                    _raise_exceptions_for_missing_entries=False,
+                    **cache_kwargs,
+                )
+                if adapter_config_file is not None:
+                    # Load saved adapter weights using the original repo_id/path
+                    # PEFT handles Hub downloads and caching internally
+                    from peft import PeftModel
+                    model.language_model = PeftModel.from_pretrained(
+                        model.language_model,
+                        pretrained_model_name_or_path,
+                        is_trainable=True,
+                        **cache_kwargs,
+                    )
+                else:
+                    # No saved adapters - initialize fresh LLM LoRA for training
+                    from peft import LoraConfig, get_peft_model
+                    lora_config = LoraConfig(
+                        r=config.lora_rank,
+                        lora_alpha=config.lora_alpha,
+                        target_modules=config.lora_target_modules,
+                        lora_dropout=config.lora_dropout,
+                        bias="none",
+                        task_type="CAUSAL_LM",
+                    )
+                    model.language_model = get_peft_model(model.language_model, lora_config)
+            return model
+        finally:
+            cls._is_loading_from_pretrained = False
+            cls._pretrained_model_path = None
+    def __init__(self, config: ASRConfig, **kwargs) -> None:
+        super().__init__(config)
+        self.system_prompt = config.system_prompt
+        target_dtype = getattr(torch, config.model_dtype)
+        # Audio encoder (frozen)
+        self.audio_tower = self._load_audio_encoder(config, target_dtype)
+        # Language model (frozen)
+        self.language_model = self._load_language_model(config, target_dtype)
+        # Initialize tokenizer and special tokens
+        self._init_tokenizer(config)
+        # Set up generation config with greedy decoding defaults
+        self.generation_config = self.language_model.generation_config
+        self.generation_config.max_new_tokens = config.max_new_tokens
+        self.generation_config.min_new_tokens = config.min_new_tokens
+        self.generation_config.num_beams = config.num_beams
+        self.generation_config.do_sample = False
+        # Clear sampling params (inherited from LLM) since we use greedy decoding
+        self.generation_config.temperature = None
+        self.generation_config.top_p = None
+        self.generation_config.top_k = None
+        self.generation_config.use_cache = config.use_cache
+        self.generation_config.length_penalty = config.length_penalty
+        self.generation_config.repetition_penalty = config.repetition_penalty
+        self.generation_config.no_repeat_ngram_size = config.no_repeat_ngram_size
+        self.generation_config.eos_token_id = [
+            self.tokenizer.convert_tokens_to_ids("<|im_end|>"),
+            self.tokenizer.convert_tokens_to_ids("<|endoftext|>"),
+        ]
+        self.generation_config.pad_token_id = self.tokenizer.pad_token_id
+        # Feature extractor for audio preprocessing
+        self.feature_extractor = self._create_feature_extractor(config)
+        # Audio projector (trainable unless freeze_projector is set)
+        self.projector = self._create_projector(config, target_dtype)
+        # Setup LoRA if enabled (Stage 2 fine-tuning)
+        # Skip if loading from pretrained - from_pretrained will handle adapter loading
+        if getattr(config, "use_lora", False) and not getattr(
+            self.__class__, "_is_loading_from_pretrained", False
+        ):
+            self._setup_lora(config)
+        # Freeze projector if specified (for Stage 2 LoRA-only training)
+        if getattr(config, "freeze_projector", False):
+            self.projector.requires_grad_(False)
+        # SpecAugment for data augmentation during training
+        if getattr(config, "use_specaugment", False):
+            self.spec_augment = SpecAugment(
+                n_time_masks=config.num_time_masks,
+                time_mask_param=config.time_mask_length,
+                n_freq_masks=config.num_freq_masks,
+                freq_mask_param=config.freq_mask_length,
+            )
+        else:
+            self.spec_augment = None
+        # For model parallelism
+        self._no_split_modules = getattr(self.language_model, "_no_split_modules", [])
+    def _create_feature_extractor(self, config: ASRConfig):
+        """Create the appropriate feature extractor for the audio encoder."""
+        from transformers import AutoFeatureExtractor
+        feature_extractor = AutoFeatureExtractor.from_pretrained(config.audio_model_id)
+        # Disable padding by default - use actual audio length
+        feature_extractor.padding = False
+        return feature_extractor
+    @classmethod
+    def _load_audio_encoder(cls, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
+        """Load and freeze the audio encoder."""
+        encoder_kwargs = {
+            "attn_implementation": config.attn_implementation,
+            "low_cpu_mem_usage": True,
+            "dtype": dtype,
+        }
+        if "whisper" in config.audio_model_id.lower():
+            from transformers import WhisperModel
+            full_model = WhisperModel.from_pretrained(config.audio_model_id, **encoder_kwargs)
+            encoder = full_model.encoder
+            del full_model
+        elif "glm" in config.audio_model_id.lower():
+            # GLM-ASR models use audio_tower as the encoder
+            # Requires transformers >= 5.x or installed from source
+            from transformers import AutoModelForSeq2SeqLM
+            full_model = AutoModelForSeq2SeqLM.from_pretrained(
+                config.audio_model_id, trust_remote_code=True, **encoder_kwargs
+            )
+            # GLM stores encoder at audio_tower (GlmAsrEncoder)
+            encoder = full_model.audio_tower
+            # Clear references to free VRAM from the LLM decoder
+            full_model.language_model = None
+            full_model.multi_modal_projector = None
+            del full_model
+        else:
+            encoder = AutoModel.from_pretrained(config.audio_model_id, **encoder_kwargs)
+        encoder.requires_grad_(False)
+        encoder.eval()
+        return encoder
+    @classmethod
+    def _load_language_model(cls, config: ASRConfig, dtype: torch.dtype) -> PreTrainedModel:
+        """Load and freeze the language model."""
+        decoder_kwargs = {
+            "attn_implementation": config.attn_implementation,
+            "trust_remote_code": True,
+            "tie_word_embeddings": False,
+            "low_cpu_mem_usage": True,
+            "dtype": dtype,
+        }
+        decoder = AutoModelForCausalLM.from_pretrained(config.text_model_id, **decoder_kwargs)
+        decoder.config.use_cache = getattr(config, "use_cache", True)
+        decoder.requires_grad_(False)
+        decoder.eval()
+        return decoder
+    def _create_projector(self, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
+        """Create the trainable audio projector."""
+        # Auto-detect dimensions if not specified
+        if config.encoder_dim is None:
+            enc_cfg = self.audio_tower.config
+            config.encoder_dim = getattr(enc_cfg, "hidden_size", None) or getattr(
+                enc_cfg, "d_model", None
+            )
+            if config.encoder_dim is None:
+                raise ValueError("Could not auto-detect encoder_dim. Please specify in config.")
+        if config.llm_dim is None:
+            dec_cfg = self.language_model.config
+            config.llm_dim = getattr(dec_cfg, "hidden_size", None) or getattr(
+                dec_cfg, "d_model", None
+            )
+            if config.llm_dim is None:
+                raise ValueError("Could not auto-detect llm_dim. Please specify in config.")
+        # Select projector type based on config
+        projector_type = getattr(config, "projector_type", "mlp")
+        projector_class = PROJECTOR_CLASSES.get(projector_type)
+        if projector_class is None:
+            raise ValueError(
+                f"Unknown projector_type: {projector_type}. "
+                f"Valid options: {list(PROJECTOR_CLASSES.keys())}"
+            )
+        projector = projector_class(config)
+        # Move projector to same device as language model (important when using quantization)
+        device = next(self.language_model.parameters()).device
+        return projector.to(device=device, dtype=dtype)
+    def _setup_lora(self, config: ASRConfig):
+        """Apply LoRA adapters to the language model for Stage 2 fine-tuning."""
+        from peft import LoraConfig, get_peft_model
+        lora_config = LoraConfig(
+            r=config.lora_rank,
+            lora_alpha=config.lora_alpha,
+            target_modules=config.lora_target_modules,
+            lora_dropout=config.lora_dropout,
+            bias="none",
+            task_type="CAUSAL_LM",
+        )
+        self.language_model = get_peft_model(self.language_model, lora_config)
+    def _init_tokenizer(self, config: ASRConfig):
+        """Initialize tokenizer with audio token."""
+        self.tokenizer = AutoTokenizer.from_pretrained(config.text_model_id, trust_remote_code=True)
+        # Set pad token
+        if (
+            self.tokenizer.pad_token is None
+            or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
+        ) and "<|finetune_right_pad_id|>" in self.tokenizer.get_vocab():
+            self.tokenizer.pad_token = "<|finetune_right_pad_id|>"
+        # Add audio token
+        existing_special = getattr(self.tokenizer, "additional_special_tokens", None) or []
+        if "<audio>" not in existing_special:
+            self.tokenizer.add_special_tokens(
+                {"additional_special_tokens": existing_special + ["<audio>"]}
+            )
+            self.language_model.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)
+        self.audio_token_id = self.tokenizer.convert_tokens_to_ids("<audio>")
+        self.tokenizer.padding_side = "right"
+        # Sync token IDs to configs
+        for cfg in [self.config.text_config, self.language_model.config, self.generation_config]:
+            if cfg is not None:
+                cfg.pad_token_id = self.tokenizer.pad_token_id
+                cfg.eos_token_id = self.tokenizer.eos_token_id
+                cfg.bos_token_id = self.tokenizer.bos_token_id
+    def _init_weights(self, _module):
+        """Weight initialization (projector weights are initialized in MoEAudioProjector)."""
+        pass
+    def _set_gradient_checkpointing(self, enable: bool = True, gradient_checkpointing_func=None):
+        """Enable/disable gradient checkpointing for the language model."""
+        # The LLM still stores activations during forward for backprop to projector
+        # Gradient checkpointing trades compute for memory by recomputing activations
+        if hasattr(self.language_model, "_set_gradient_checkpointing"):
+            self.language_model._set_gradient_checkpointing(enable, gradient_checkpointing_func)
+        elif hasattr(self.language_model, "gradient_checkpointing_enable") and enable:
+            self.language_model.gradient_checkpointing_enable(
+                gradient_checkpointing_kwargs={"use_reentrant": False}
+            )
+        elif hasattr(self.language_model, "gradient_checkpointing_disable") and not enable:
+            self.language_model.gradient_checkpointing_disable()
+    def get_input_embeddings(self) -> nn.Module:
+        return self.language_model.get_input_embeddings()
+    def set_input_embeddings(self, value: nn.Module) -> None:
+        self.language_model.set_input_embeddings(value)
+    def get_output_embeddings(self) -> nn.Module:
+        return self.language_model.get_output_embeddings()
+    def set_output_embeddings(self, value: nn.Module) -> None:
+        self.language_model.set_output_embeddings(value)
+    def get_processor(self):
+        """Get the processor for this model."""
+        try:
+            from .asr_processing import ASRProcessor
+        except ImportError:
+            from asr_processing import ASRProcessor  # type: ignore[no-redef]
+        return ASRProcessor(
+            feature_extractor=self.feature_extractor,
+            tokenizer=self.tokenizer,
+            projector=self.projector,
+            encoder_conv_layers=self.config.encoder_conv_layers,
+        )
+    def state_dict(self, *args, **kwargs) -> dict[str, torch.Tensor]:
+        """Only save trainable projector weights."""
+        return {f"projector.{k}": v for k, v in self.projector.state_dict().items()}
+    def _compute_encoder_output_lengths(
+        self,
+        audio_attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """Compute per-sample encoder output lengths using conv layer formulas.
+        Args:
+            audio_attention_mask: Mask indicating real vs padded mel frames (batch, mel_len)
+        Returns:
+            Tensor of encoder output lengths per sample (batch,)
+        """
+        # Get mel frame lengths from attention mask
+        lengths = audio_attention_mask.sum(dim=-1)
+        # Apply conv layer formulas: output = (input + 2*pad - (kernel-1) - 1) // stride + 1
+        for padding, kernel_size, stride in self.config.encoder_conv_layers:
+            lengths = (lengths + 2 * padding - (kernel_size - 1) - 1) // stride + 1
+        return lengths
+    def _encode_audio(
+        self,
+        audio_features: torch.Tensor,
+        audio_attention_mask: torch.Tensor,
+    ) -> torch.Tensor:
+        """Encode audio and project to LLM embedding space.
+        Args:
+            audio_features: Mel spectrogram features (batch, n_mels, mel_len)
+            audio_attention_mask: Mask indicating real vs padded mel frames (batch, mel_len)
+        Returns:
+            Flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
+        """
+        with torch.no_grad():
+            encoder_out = self.audio_tower(input_features=audio_features)
+            hidden_states = encoder_out.last_hidden_state
+        # Compute per-sample encoder output lengths using conv formulas
+        encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
+        # Project to LLM space
+        audio_embeds = self.projector(hidden_states)
+        # Compute per-sample projector output lengths
+        projector_lengths = torch.tensor(
+            [self.projector.get_output_length(int(length.item())) for length in encoder_lengths],
+            device=audio_embeds.device,
+        )
+        # Create valid mask for variable-length samples and extract only real embeddings
+        max_len = audio_embeds.shape[1]
+        valid_mask = (
+            torch.arange(max_len, device=audio_embeds.device)[None, :] < projector_lengths[:, None]
+        )
+        return audio_embeds[valid_mask]
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        audio_attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> CausalLMOutputWithPast:
+        """Forward pass for training and inference."""
+        # Get text embeddings if not provided
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+        if input_features is not None and input_ids is not None:
+            # Apply SpecAugment during training if enabled
+            if self.training and self.spec_augment is not None:
+                input_features = self.spec_augment(input_features)
+            # Encode audio -> flattened (total_audio_tokens, hidden_dim)
+            audio_embeds = self._encode_audio(input_features, audio_attention_mask)
+            # Replace <audio> token placeholders with audio embeddings using masked_scatter
+            audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
+            inputs_embeds = inputs_embeds.masked_scatter(
+                audio_token_mask.to(inputs_embeds.device),
+                audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),
+            )
+        # Run through language model (let it compute loss if labels provided)
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        # Add auxiliary loss from MoE projectors if available
+        if outputs.loss is not None and hasattr(self.projector, "get_aux_loss"):
+            aux_loss = self.projector.get_aux_loss()
+            if aux_loss is not None and aux_loss.numel() > 0:
+                outputs.loss = outputs.loss + aux_loss.to(outputs.loss.device)
+        return outputs
+    def prepare_inputs_for_generation(self, *args, **kwargs):
+        """Prepare inputs for generation, handling audio features for cached decoding."""
+        input_features = kwargs.pop("input_features", None)
+        cache_position = kwargs.get("cache_position")
+        model_inputs = self.language_model.prepare_inputs_for_generation(*args, **kwargs)
+        # Only pass audio features on the first generation step (cache_position[0] == 0)
+        if cache_position is not None and cache_position[0] == 0 and input_features is not None:
+            model_inputs["input_features"] = input_features
+        return model_inputs
+    def _get_num_audio_tokens(
+        self,
+        audio_attention_mask: torch.Tensor,
+    ) -> int:
+        """Calculate number of audio tokens based on actual audio length.
+        Uses attention mask to get real audio length, then computes:
+        mel_frames -> encoder_frames (via conv formulas) -> projector output tokens
+        """
+        encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
+        # Use max length for batch (all samples should have same token count for generation)
+        encoder_output_len = int(encoder_lengths.max().item())
+        return int(self.projector.get_output_length(encoder_output_len))
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        audio_attention_mask: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        system_prompt: Optional[str] = None,
+        **generate_kwargs,
+    ) -> torch.Tensor:
+        """Generate transcription from audio input.
+        Can be called in two ways:
+        1. With input_ids containing <audio> tokens (from processor)
+        2. With just audio, and we build the prompt internally
+        """
+        if input_features is None:
+            raise ValueError("input_features required for generation")
+        if audio_attention_mask is None:
+            raise ValueError("audio_attention_mask required for generation")
+        device = input_features.device
+        batch_size = input_features.shape[0]
+        # Encode audio -> flattened embeddings
+        audio_embeds = self._encode_audio(input_features, audio_attention_mask)
+        # If input_ids not provided, build prompt with correct number of audio tokens
+        if input_ids is None:
+            num_audio_tokens = self._get_num_audio_tokens(audio_attention_mask)
+            audio_placeholder = "<audio>" * num_audio_tokens
+            system_prompt = system_prompt or self.system_prompt
+            messages: list[dict[str, str]] = []
+            if system_prompt:
+                messages.append({"role": "system", "content": system_prompt})
+            messages.append({"role": "user", "content": self.TRANSCRIBE_PROMPT + audio_placeholder})
+            chat_result = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                return_tensors="pt",
+                enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
+            )
+            input_ids = chat_result.input_ids.to(device)
+            if input_ids.dim() == 1:
+                input_ids = input_ids.unsqueeze(0)
+            if input_ids.shape[0] == 1 and batch_size > 1:
+                input_ids = input_ids.expand(batch_size, -1)
+            attention_mask = torch.ones_like(input_ids)
+        # Get text embeddings and replace audio tokens with audio embeddings
+        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+        audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
+        inputs_embeds = inputs_embeds.masked_scatter(
+            audio_token_mask.to(inputs_embeds.device),
+            audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),
+        )
+        # Generate using language model
+        output = self.language_model.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            generation_config=self.generation_config,
+            **generate_kwargs,
+        )
+        # When using inputs_embeds without input_ids, generate returns only new tokens
+        if isinstance(output, torch.Tensor):
+            return output
+        return output.sequences
+    def generate_streaming(
+        self,
+        input_features: torch.Tensor,
+        audio_attention_mask: torch.Tensor,
+        system_prompt: Optional[str] = None,
+        **generate_kwargs,
+    ) -> Iterator[str]:
+        """Generate transcription with streaming token output.
+        Yields partial transcript strings as tokens are generated.
+        Reduces time-to-first-word by streaming tokens as they're decoded.
+        Args:
+            input_features: Mel spectrogram features (batch, n_mels, mel_len)
+            audio_attention_mask: Mask for real vs padded mel frames (batch, mel_len)
+            system_prompt: Optional system prompt override
+            **generate_kwargs: Additional generation arguments
+        Yields:
+            Partial transcript text as each token is generated
+        """
+        device = input_features.device
+        batch_size = input_features.shape[0]
+        # Encode audio -> flattened embeddings
+        audio_embeds = self._encode_audio(input_features, audio_attention_mask)
+        # Build prompt with correct number of audio tokens
+        num_audio_tokens = self._get_num_audio_tokens(audio_attention_mask)
+        audio_placeholder = "<audio>" * num_audio_tokens
+        system_prompt = system_prompt or self.system_prompt
+        messages: list[dict[str, str]] = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": self.TRANSCRIBE_PROMPT + audio_placeholder})
+        chat_result = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=True,
+            return_tensors="pt",
+            enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
+        )
+        input_ids = chat_result.input_ids.to(device)
+        if input_ids.dim() == 1:
+            input_ids = input_ids.unsqueeze(0)
+        if input_ids.shape[0] == 1 and batch_size > 1:
+            input_ids = input_ids.expand(batch_size, -1)
+        attention_mask = torch.ones_like(input_ids)
+        # Get text embeddings and replace audio tokens with audio embeddings
+        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+        audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
+        inputs_embeds = inputs_embeds.masked_scatter(
+            audio_token_mask.to(inputs_embeds.device),
+            audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),
+        )
+        # Setup streamer for token-by-token output
+        streamer = TextIteratorStreamer(
+            self.tokenizer,
+            skip_prompt=True,
+            skip_special_tokens=True,
+        )
+        # Prepare generation kwargs
+        gen_kwargs = {
+            "inputs_embeds": inputs_embeds,
+            "attention_mask": attention_mask,
+            "generation_config": self.generation_config,
+            "streamer": streamer,
+            **generate_kwargs,
+        }
+        # Run generation in background thread
+        thread = Thread(target=self.language_model.generate, kwargs=gen_kwargs)
+        thread.start()
+        # Yield tokens as they're generated, filtering out <think>...</think> blocks
+        # Start assuming no think block - only filter when we see <think>
+        in_think_block = False
+        buffer = ""
+        for text in streamer:
+            buffer += text
+            # Check for think block start (in case model outputs think blocks)
+            while "<think>" in buffer:
+                in_think_block = True
+                # Yield any text before <think>
+                before_think = buffer.split("<think>")[0]
+                if before_think:
+                    yield before_think
+                buffer = buffer.split("<think>", 1)[-1]
+            # Check for think block end
+            while in_think_block and "</think>" in buffer:
+                in_think_block = False
+                buffer = buffer.split("</think>", 1)[-1]
+            # Yield text if not in think block
+            if not in_think_block and buffer:
+                yield buffer
+                buffer = ""
+        # Yield any remaining buffer
+        if buffer and not in_think_block:
+            yield buffer
+        thread.join()
+    def save_pretrained(self, save_directory: Union[str, Path], **kwargs) -> None:
+        """Save model, tokenizer, and processor."""
+        import shutil
+        from pathlib import Path as PathlibPath
+        save_dir = PathlibPath(save_directory)
+        save_dir.mkdir(parents=True, exist_ok=True)
+        # Update config with actual vocab size
+        self.config.vocab_size = self.language_model.config.vocab_size
+        self.config.text_config.vocab_size = self.language_model.config.vocab_size
+        if hasattr(self.audio_tower.config, "num_mel_bins"):
+            self.config.audio_config.num_mel_bins = self.audio_tower.config.num_mel_bins
+        # Save model (temporarily remove non-serializable attributes)
+        tokenizer = self.tokenizer
+        del self.tokenizer
+        try:
+            super().save_pretrained(save_dir, **kwargs)
+        finally:
+            self.tokenizer = tokenizer
+        # Save tokenizer and feature extractor
+        self.tokenizer.save_pretrained(save_dir)
+        self.feature_extractor.save_pretrained(save_dir)
+        # Save LoRA adapters if present (creates adapter_model.safetensors and adapter_config.json)
+        # Don't save embedding layers - the <audio> token embedding is never used
+        # (it's replaced with projected audio embeddings before the LLM sees it)
+        if hasattr(self.language_model, "peft_config"):
+            self.language_model.save_pretrained(save_dir, save_embedding_layers=False)
+            # Clear base_model_name_or_path in adapter_config.json to prevent HF pipeline
+            # from redirecting to the base LLM repo (like Qwen) which breaks feature
+            # extractor loading for multimodal models. If a repo_id is provided, use that
+            # so the model can be loaded directly from the Hub.
+            adapter_config_path = save_dir / "adapter_config.json"
+            if adapter_config_path.exists():
+                with adapter_config_path.open() as f:
+                    adapter_config = json.load(f)
+                # Use repo_id if available, otherwise clear to prevent redirect.
+                # Use empty string instead of None to avoid str(None) -> "None" bug
+                # in some transformers/PEFT versions.
+                repo_id = (
+                    kwargs.get("repo_id")
+                    or kwargs.get("push_to_hub_model_id")
+                    or getattr(self.config, "pretrained_model_path", None)
+                    or ""  # Use empty string instead of None
+                )
+                adapter_config["base_model_name_or_path"] = repo_id
+                with adapter_config_path.open("w") as f:
+                    json.dump(adapter_config, f, indent=2)
+        # Add processor auto_map to preprocessor_config.json
+        config_path = save_dir / "preprocessor_config.json"
+        if config_path.exists():
+            with config_path.open() as f:
+                processor_config = json.load(f)
+        else:
+            processor_config = {}
+        processor_config.update(
+            {
+                "processor_class": "ASRProcessor",
+                "auto_map": {"AutoProcessor": "asr_processing.ASRProcessor"},
+            }
+        )
+        with config_path.open("w") as f:
+            json.dump(processor_config, f, indent=2)
+        # Copy source files for auto-loading
+        src_dir = PathlibPath(__file__).parent
+        for asr_file in src_dir.glob("asr_*.py"):
+            shutil.copy(asr_file, save_dir / asr_file.name)
+        # Copy projectors module
+        shutil.copy(src_dir / "projectors.py", save_dir / "projectors.py")
+    def push_to_hub(self, repo_id: str, **kwargs) -> str:
+        """Push model to HuggingFace Hub, ensuring adapter_config points to repo.
+        IMPORTANT: Sets base_model_name_or_path in adapter_config.json to repo_id
+        so that transformers pipeline() can load the model correctly. Without this,
+        the pipeline tries to load from "None" which fails.
+        """
+        # Store repo_id in config so save_pretrained can access it
+        self.config.pretrained_model_path = repo_id
+        # Call parent's push_to_hub with repo_id in kwargs
+        return super().push_to_hub(repo_id, repo_id=repo_id, **kwargs)
+    def create_or_update_model_card(self, output_dir: Union[str, Path]) -> None:
+        """No-op for model card creation - we use MODEL_CARD.md in repo instead."""
+        pass
+# Register with transformers Auto classes: the "asr_model" model type is bound
+# to ASRConfig on AutoConfig, and ASRConfig is bound to ASRModel on AutoModel.
+# These calls run at import time.
+AutoConfig.register("asr_model", ASRConfig)
+AutoModel.register(ASRConfig, ASRModel)

asr_pipeline.py ADDED Viewed

	@@ -0,0 +1,421 @@

+"""ASR pipeline for audio-to-text transcription with optional timestamps and diarization."""
+import re
+from pathlib import Path
+from typing import Any
+import numpy as np
+import torch
+import transformers
+try:
+    from .asr_modeling import ASRModel
+except ImportError:
+    from asr_modeling import ASRModel  # type: ignore[no-redef]
+def _get_device() -> str:
+    """Get best available device for non-transformers models."""
+    if torch.cuda.is_available():
+        return "cuda"
+    if torch.backends.mps.is_available():
+        return "mps"
+    return "cpu"
+class ForcedAligner:
+    """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2."""
+    _bundle = None
+    _model = None
+    _labels = None
+    _dictionary = None
+    @classmethod
+    def get_instance(cls, device: str = "cuda"):
+        """Get or create the forced alignment model (singleton).
+        Args:
+            device: Device to run model on ("cuda" or "cpu")
+        Returns:
+            Tuple of (model, labels, dictionary)
+        """
+        if cls._model is None:
+            import torchaudio
+            cls._bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
+            cls._model = cls._bundle.get_model().to(device)
+            cls._model.eval()
+            cls._labels = cls._bundle.get_labels()
+            cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
+        return cls._model, cls._labels, cls._dictionary
+    @classmethod
+    def align(
+        cls,
+        audio: np.ndarray,
+        text: str,
+        sample_rate: int = 16000,
+        _language: str = "eng",
+        _batch_size: int = 16,
+    ) -> list[dict]:
+        """Align transcript to audio and return word-level timestamps.
+        Args:
+            audio: Audio waveform as numpy array
+            text: Transcript text to align
+            sample_rate: Audio sample rate (default 16000)
+            _language: ISO-639-3 language code (default "eng" for English, unused)
+            _batch_size: Batch size for alignment model (unused)
+        Returns:
+            List of dicts with 'word', 'start', 'end' keys
+        """
+        import torchaudio
+        from torchaudio.functional import forced_align, merge_tokens
+        device = _get_device()
+        model, labels, dictionary = cls.get_instance(device)
+        # Convert audio to tensor (copy to ensure array is writable)
+        if isinstance(audio, np.ndarray):
+            waveform = torch.from_numpy(audio.copy()).float()
+        else:
+            waveform = audio.clone().float()
+        # Ensure 2D (channels, time)
+        if waveform.dim() == 1:
+            waveform = waveform.unsqueeze(0)
+        # Resample if needed (wav2vec2 expects 16kHz)
+        if sample_rate != cls._bundle.sample_rate:
+            waveform = torchaudio.functional.resample(
+                waveform, sample_rate, cls._bundle.sample_rate
+            )
+        waveform = waveform.to(device)
+        # Get emissions from model
+        with torch.inference_mode():
+            emissions, _ = model(waveform)
+            emissions = torch.log_softmax(emissions, dim=-1)
+        emission = emissions[0].cpu()
+        # Normalize text: uppercase, keep only valid characters
+        transcript = text.upper()
+        # Build tokens from transcript
+        tokens = []
+        for char in transcript:
+            if char in dictionary:
+                tokens.append(dictionary[char])
+            elif char == " ":
+                tokens.append(dictionary.get("|", dictionary.get(" ", 0)))
+        if not tokens:
+            return []
+        targets = torch.tensor([tokens], dtype=torch.int32)
+        # Run forced alignment
+        # Note: forced_align is deprecated in torchaudio 2.6+ and will be removed in 2.9 (late 2025)
+        # No official replacement announced yet. See https://github.com/pytorch/audio/issues/3902
+        aligned_tokens, scores = forced_align(emission.unsqueeze(0), targets, blank=0)
+        # Use torchaudio's merge_tokens to get token spans (removes blanks and merges repeats)
+        token_spans = merge_tokens(aligned_tokens[0], scores[0])
+        # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
+        frame_duration = 320 / cls._bundle.sample_rate
+        # Group token spans into words based on pipe separator
+        words = text.split()
+        word_timestamps = []
+        current_word_start = None
+        current_word_end = None
+        word_idx = 0
+        for span in token_spans:
+            token_char = labels[span.token]
+            if token_char == "|":  # Word separator
+                if current_word_start is not None and word_idx < len(words):
+                    word_timestamps.append(
+                        {
+                            "word": words[word_idx],
+                            "start": current_word_start * frame_duration,
+                            "end": current_word_end * frame_duration,
+                        }
+                    )
+                    word_idx += 1
+                current_word_start = None
+                current_word_end = None
+            else:
+                if current_word_start is None:
+                    current_word_start = span.start
+                current_word_end = span.end
+        # Don't forget the last word
+        if current_word_start is not None and word_idx < len(words):
+            word_timestamps.append(
+                {
+                    "word": words[word_idx],
+                    "start": current_word_start * frame_duration,
+                    "end": current_word_end * frame_duration,
+                }
+            )
+        return word_timestamps
+try:
+    from .diarization import SpeakerDiarizer
+except ImportError:
+    from diarization import SpeakerDiarizer  # type: ignore[no-redef]
+# Public names of this module. ForcedAligner and SpeakerDiarizer are listed
+# here (re-exported) for backwards compatibility.
+__all__ = ["ForcedAligner", "SpeakerDiarizer", "ASRPipeline"]
+class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
+    """ASR Pipeline for audio-to-text transcription."""
+    model: ASRModel
+    def __init__(self, model: ASRModel, **kwargs):
+        """Initialize ASR pipeline.
+        Args:
+            model: ASRModel instance for transcription
+            **kwargs: Additional arguments (feature_extractor, tokenizer, device)
+        """
+        feature_extractor = kwargs.pop("feature_extractor", None)
+        tokenizer = kwargs.pop("tokenizer", model.tokenizer)
+        if feature_extractor is None:
+            feature_extractor = model.get_processor().feature_extractor
+        super().__init__(
+            model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, **kwargs
+        )
+        self._current_audio = None
+    def _sanitize_parameters(self, **kwargs):
+        """Intercept our custom parameters before parent class validates them."""
+        # Remove our custom parameters so parent doesn't see them
+        kwargs.pop("return_timestamps", None)
+        kwargs.pop("return_speakers", None)
+        kwargs.pop("num_speakers", None)
+        kwargs.pop("min_speakers", None)
+        kwargs.pop("max_speakers", None)
+        kwargs.pop("hf_token", None)
+        kwargs.pop("user_prompt", None)
+        kwargs.pop("diarization_backend", None)
+        return super()._sanitize_parameters(**kwargs)
+    def __call__(
+        self,
+        inputs,
+        **kwargs,
+    ):
+        """Transcribe audio with optional word-level timestamps and speaker diarization.
+        Args:
+            inputs: Audio input (file path, dict with array/sampling_rate, etc.)
+            return_timestamps: If True, return word-level timestamps using forced alignment
+            return_speakers: If True, return speaker labels for each word
+            user_prompt: Custom transcription prompt (default: "Transcribe: ")
+            num_speakers: Exact number of speakers (if known, for diarization)
+            min_speakers: Minimum number of speakers (for diarization)
+            max_speakers: Maximum number of speakers (for diarization)
+            hf_token: HuggingFace token for pyannote models (or set HF_TOKEN env var)
+            diarization_backend: Backend for diarization ("pyannote" or "local")
+            **kwargs: Additional arguments passed to the pipeline
+        Returns:
+            Dict with 'text' key, 'words' key if return_timestamps=True,
+            and speaker labels on words if return_speakers=True
+        """
+        # Extract our params before super().__call__ (which will also call _sanitize_parameters)
+        return_timestamps = kwargs.pop("return_timestamps", False)
+        return_speakers = kwargs.pop("return_speakers", False)
+        user_prompt = kwargs.pop("user_prompt", None)
+        diarization_params = {
+            "num_speakers": kwargs.pop("num_speakers", None),
+            "min_speakers": kwargs.pop("min_speakers", None),
+            "max_speakers": kwargs.pop("max_speakers", None),
+            "hf_token": kwargs.pop("hf_token", None),
+            "backend": kwargs.pop("diarization_backend", "pyannote"),
+        }
+        if return_speakers:
+            return_timestamps = True
+        # Set custom user prompt if provided
+        original_prompt = None
+        if user_prompt:
+            original_prompt = self.model.TRANSCRIBE_PROMPT
+            self.model.TRANSCRIBE_PROMPT = user_prompt
+        # Store audio for timestamp alignment and diarization
+        if return_timestamps or return_speakers:
+            self._current_audio = self._extract_audio(inputs)
+        # Run standard transcription
+        result = super().__call__(inputs, **kwargs)
+        # Add timestamps if requested
+        if return_timestamps and self._current_audio is not None:
+            text = result.get("text", "")
+            if text:
+                try:
+                    words = ForcedAligner.align(
+                        self._current_audio["array"],
+                        text,
+                        sample_rate=self._current_audio.get("sampling_rate", 16000),
+                    )
+                    result["words"] = words
+                except Exception as e:
+                    result["words"] = []
+                    result["timestamp_error"] = str(e)
+            else:
+                result["words"] = []
+        # Add speaker diarization if requested
+        if return_speakers and self._current_audio is not None:
+            try:
+                # Run diarization
+                speaker_segments = SpeakerDiarizer.diarize(
+                    self._current_audio["array"],
+                    sample_rate=self._current_audio.get("sampling_rate", 16000),
+                    **{k: v for k, v in diarization_params.items() if v is not None},
+                )
+                result["speaker_segments"] = speaker_segments
+                # Assign speakers to words
+                if result.get("words"):
+                    result["words"] = SpeakerDiarizer.assign_speakers_to_words(
+                        result["words"],
+                        speaker_segments,
+                    )
+            except Exception as e:
+                result["speaker_segments"] = []
+                result["diarization_error"] = str(e)
+        # Clean up
+        self._current_audio = None
+        if original_prompt is not None:
+            self.model.TRANSCRIBE_PROMPT = original_prompt
+        return result
+    def _extract_audio(self, inputs) -> dict | None:
+        """Extract audio array from various input formats using HF utilities."""
+        from transformers.pipelines.audio_utils import ffmpeg_read
+        if isinstance(inputs, dict):
+            if "array" in inputs:
+                return {
+                    "array": inputs["array"],
+                    "sampling_rate": inputs.get("sampling_rate", 16000),
+                }
+            if "raw" in inputs:
+                return {
+                    "array": inputs["raw"],
+                    "sampling_rate": inputs.get("sampling_rate", 16000),
+                }
+        elif isinstance(inputs, str):
+            # File path - load audio using ffmpeg (same as HF pipeline)
+            with Path(inputs).open("rb") as f:
+                audio = ffmpeg_read(f.read(), sampling_rate=16000)
+            return {"array": audio, "sampling_rate": 16000}
+        elif isinstance(inputs, bytes):
+            audio = ffmpeg_read(inputs, sampling_rate=16000)
+            return {"array": audio, "sampling_rate": 16000}
+        elif isinstance(inputs, np.ndarray):
+            return {"array": inputs, "sampling_rate": 16000}
+        return None
+    def preprocess(self, inputs, **preprocess_params):
+        """Preprocess audio inputs for the model.
+        Args:
+            inputs: Audio input (dict with array, file path, etc.)
+            **preprocess_params: Additional preprocessing parameters
+        Yields:
+            Model input dicts with input_features and attention_mask
+        """
+        # Handle dict with "array" key (from datasets)
+        if isinstance(inputs, dict) and "array" in inputs:
+            inputs = {
+                "raw": inputs["array"],
+                "sampling_rate": inputs.get("sampling_rate", self.feature_extractor.sampling_rate),
+            }
+        for item in super().preprocess(inputs, **preprocess_params):
+            if "is_last" not in item:
+                item["is_last"] = True
+            yield item
+    def _forward(self, model_inputs, **generate_kwargs) -> dict[str, Any]:
+        """Run model forward pass to generate transcription.
+        Args:
+            model_inputs: Dict with input_features and attention_mask
+            **generate_kwargs: Generation parameters
+        Returns:
+            Dict with generated token IDs
+        """
+        # Extract audio features and is_last flag
+        is_last = model_inputs.pop("is_last", True) if isinstance(model_inputs, dict) else True
+        input_features = model_inputs["input_features"].to(self.model.device)
+        audio_attention_mask = model_inputs["attention_mask"].to(self.model.device)
+        generated_ids = self.model.generate(
+            input_features=input_features,
+            audio_attention_mask=audio_attention_mask,
+            **generate_kwargs,
+        )
+        return {"tokens": generated_ids, "is_last": is_last}
+    def postprocess(self, model_outputs, **kwargs) -> dict[str, str]:
+        """Convert model output tokens to text.
+        Args:
+            model_outputs: Dict with 'tokens' key containing generated IDs
+            **kwargs: Additional postprocessing parameters
+        Returns:
+            Dict with 'text' key containing transcription
+        """
+        # Handle list of outputs (from chunking)
+        if isinstance(model_outputs, list):
+            model_outputs = model_outputs[0] if model_outputs else {}
+        tokens = model_outputs.get("tokens")
+        if tokens is None:
+            return super().postprocess(model_outputs, **kwargs)
+        if torch.is_tensor(tokens):
+            tokens = tokens.cpu()
+            if tokens.dim() > 1:
+                tokens = tokens[0]
+        # Filter out eos tokens that the tokenizer doesn't recognize as special
+        # (generation_config.eos_token_id may differ from tokenizer.eos_token_id)
+        if hasattr(self, "model") and hasattr(self.model, "generation_config"):
+            eos_ids = self.model.generation_config.eos_token_id
+            if eos_ids is not None:
+                eos_set = set(eos_ids) if isinstance(eos_ids, list) else {eos_ids}
+                tokens = [t for t in tokens.tolist() if t not in eos_set]
+        text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
+        # Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
+        text = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
+        return {"text": text}

asr_processing.py ADDED Viewed

	@@ -0,0 +1,130 @@

+from typing import Optional, Union
+import torch
+import transformers
+from transformers import ProcessorMixin
+try:
+    from .asr_config import ASRConfig
+except ImportError:
+    from asr_config import ASRConfig  # type: ignore[no-redef]
+class ASRProcessor(ProcessorMixin):
+    """Processor for Whisper-based ASR models."""
+    attributes = ["feature_extractor", "tokenizer"]
+    feature_extractor_class = "AutoFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+    AUDIO_TOKEN = "<audio>"
+    TRANSCRIBE_PROMPT = "Transcribe: "
+    # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
+    DEFAULT_ENCODER_CONV_LAYERS = [(1, 3, 1), (1, 3, 2)]
+    def __init__(
+        self,
+        feature_extractor,
+        tokenizer,
+        projector=None,
+        encoder_conv_layers: Optional[list] = None,
+    ):
+        """Initialize the ASR processor.
+        Args:
+            feature_extractor: Audio feature extractor (WhisperFeatureExtractor)
+            tokenizer: Text tokenizer for the language model
+            projector: Audio projector module (for computing output lengths)
+            encoder_conv_layers: Conv layer specs [(pad, kernel, stride), ...]
+        """
+        self.feature_extractor = feature_extractor
+        self.tokenizer = tokenizer
+        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.AUDIO_TOKEN)
+        self.projector = projector
+        self.encoder_conv_layers = encoder_conv_layers or self.DEFAULT_ENCODER_CONV_LAYERS
+    def _compute_encoder_output_length(self, mel_length: int) -> int:
+        """Compute encoder output length using conv layer formulas."""
+        length = mel_length
+        for padding, kernel_size, stride in self.encoder_conv_layers:
+            length = (length + 2 * padding - (kernel_size - 1) - 1) // stride + 1
+        return length
+    def __call__(
+        self,
+        audio: Optional[Union[list, "torch.Tensor"]] = None,
+        text: Optional[str] = None,
+        system_prompt: Optional[str] = None,
+        return_tensors: str = "pt",
+        **kwargs,
+    ) -> dict:
+        """Process audio and text inputs for inference.
+        Args:
+            audio: Raw audio waveform(s)
+            text: Target transcription (optional, for training - but use DataCollator instead)
+            system_prompt: Optional system prompt
+            return_tensors: Return format ("pt" for PyTorch)
+        Returns:
+            Dict with input_features, input_ids, attention_mask
+        """
+        result = {}
+        # Process audio
+        if audio is not None:
+            audio_inputs = self.feature_extractor(
+                audio,
+                sampling_rate=getattr(self.feature_extractor, "sampling_rate", 16000),
+                return_attention_mask=True,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
+            result["input_features"] = audio_inputs["input_features"]
+            result["audio_attention_mask"] = audio_inputs["attention_mask"]
+            # Use actual audio length (from attention mask) for token count
+            real_mel_len = int(audio_inputs["attention_mask"].sum(dim=-1).max().item())
+            encoder_output_len = self._compute_encoder_output_length(real_mel_len)
+            num_audio_tokens = self.projector.get_output_length(encoder_output_len)
+        else:
+            num_audio_tokens = 0
+        # Build prompt with audio token placeholders
+        user_content = self.TRANSCRIBE_PROMPT
+        if num_audio_tokens > 0:
+            user_content += self.AUDIO_TOKEN * num_audio_tokens
+        messages = []
+        if system_prompt:
+            messages.append({"role": "system", "content": system_prompt})
+        messages.append({"role": "user", "content": user_content})
+        if text is not None:
+            messages.append({"role": "assistant", "content": text})
+        # Tokenize
+        tokenized = self.tokenizer.apply_chat_template(
+            messages,
+            tokenize=True,
+            add_generation_prompt=(text is None),
+            return_tensors=return_tensors,
+            enable_thinking=False,  # Disable Qwen3 thinking mode for ASR
+        )
+        # Handle both tensor and BatchEncoding returns
+        if isinstance(tokenized, torch.Tensor):
+            input_ids = tokenized
+        else:
+            # BatchEncoding or dict-like object
+            input_ids = tokenized.get("input_ids", tokenized.input_ids)
+        if input_ids.dim() == 1:
+            input_ids = input_ids.unsqueeze(0)
+        result["input_ids"] = input_ids
+        result["attention_mask"] = torch.ones_like(input_ids)
+        return result
+ASRProcessor.register_for_auto_class()
+transformers.AutoProcessor.register(ASRConfig, ASRProcessor)

handler.py ADDED Viewed

	@@ -0,0 +1,81 @@

+"""Custom inference handler for HuggingFace Inference Endpoints."""
+from typing import Any, Dict, List, Union
+try:
+    # For remote execution, imports are relative
+    from .asr_modeling import ASRModel
+    from .asr_pipeline import ASRPipeline
+except ImportError:
+    # For local execution, imports are not relative
+    from asr_modeling import ASRModel  # type: ignore[no-redef]
+    from asr_pipeline import ASRPipeline  # type: ignore[no-redef]
+class EndpointHandler:
+    """HuggingFace Inference Endpoints handler for ASR model.
+    Handles model loading, warmup, and inference requests for deployment
+    on HuggingFace Inference Endpoints or similar services.
+    """
+    def __init__(self, path: str = ""):
+        """Initialize the endpoint handler.
+        Args:
+            path: Path to model directory or HuggingFace model ID
+        """
+        import os
+        import nltk
+        nltk.download("punkt_tab", quiet=True)
+        os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")
+        # Prepare model kwargs - let transformers handle device placement
+        model_kwargs = {
+            "device_map": "auto",
+            "torch_dtype": "auto",
+            "low_cpu_mem_usage": True,
+        }
+        if self._is_flash_attn_available():
+            model_kwargs["attn_implementation"] = "flash_attention_2"
+        # Load model (this loads the model, tokenizer, and feature extractor)
+        self.model = ASRModel.from_pretrained(path, **model_kwargs)
+        # Get device from model for pipeline
+        self.device = next(self.model.parameters()).device
+        # Instantiate custom pipeline - it will get feature_extractor and tokenizer from model
+        self.pipe = ASRPipeline(
+            model=self.model,
+            feature_extractor=self.model.feature_extractor,
+            tokenizer=self.model.tokenizer,
+            device=self.device,
+        )
+    def _is_flash_attn_available(self):
+        """Check if flash attention is available."""
+        import importlib.util
+        return importlib.util.find_spec("flash_attn") is not None
+    def __call__(self, data: Dict[str, Any]) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
+        """Process an inference request.
+        Args:
+            data: Request data containing 'inputs' (audio path/bytes) and optional 'parameters'
+        Returns:
+            Transcription result with 'text' key
+        """
+        inputs = data.get("inputs")
+        if inputs is None:
+            raise ValueError("Missing 'inputs' in request data")
+        # Pass through any parameters from request, let model config provide defaults
+        params = data.get("parameters", {})
+        return self.pipe(inputs, **params)

projectors.py ADDED Viewed

	@@ -0,0 +1,483 @@

+"""Audio projector modules for bridging encoder and decoder embeddings.
+This module contains all projector architectures:
+- MLPAudioProjector: Simple 2-layer MLP with frame stacking downsampling
+- MOSAProjector: MOSA-style dense mixture of experts
+- MoEAudioProjector: Shared expert + sparse routed experts
+- QFormerAudioProjector: BLIP-2 QFormer with learnable queries (Granite-style)
+"""
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F  # noqa: N812
+from transformers import AutoModel, Blip2QFormerConfig
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
+# =============================================================================
+# MLP Projector
+# =============================================================================
+class MLPAudioProjector(nn.Module):
+    """2-layer MLP projector with frame-stacking downsampling (matches GLM-ASR)."""
+    def __init__(self, config):
+        """Initialize MLP projector.
+        Args:
+            config: ASRConfig with encoder_dim, llm_dim, projector_pool_stride
+        """
+        super().__init__()
+        encoder_dim = getattr(config, "encoder_dim", 768)
+        llm_dim = getattr(config, "llm_dim", 2048)
+        self.k = getattr(config, "projector_pool_stride", 2)
+        # Frame stacking: concat k adjacent frames then project
+        in_dim = encoder_dim * self.k
+        hidden_dim = llm_dim
+        self.linear_1 = nn.Linear(in_dim, hidden_dim)
+        self.act = nn.GELU()
+        self.linear_2 = nn.Linear(hidden_dim, llm_dim)
+    def get_output_length(self, input_length: int) -> int:
+        """Calculate output sequence length given input length (matches GLM-ASR)."""
+        # GLM-ASR formula: (L - merge_factor) // merge_factor + 1
+        return (input_length - self.k) // self.k + 1
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project audio features to LLM embedding space.
+        Args:
+            x: Audio encoder output of shape [batch, seq_len, encoder_dim]
+        Returns:
+            Projected features of shape [batch, (seq_len - k) // k + 1, llm_dim]
+        """
+        batch, seq, dim = x.shape
+        # Truncate to match GLM-ASR: use (seq - k) // k + 1 frames
+        # This drops trailing frames that don't fill a complete k-frame window
+        out_len = (seq - self.k) // self.k + 1
+        x = x[:, : out_len * self.k, :]  # Truncate to exact multiple
+        x = x.reshape(batch, out_len, dim * self.k)
+        x = self.linear_1(x)
+        x = self.act(x)
+        return self.linear_2(x)
+# =============================================================================
+# MoE Projector (MOSA-style)
+# =============================================================================
+class SimpleAdapter(nn.Module):
+    """Simple 2-layer GELU adapter (from MOSA paper)."""
+    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
+        super().__init__()
+        self.fc1 = nn.Linear(input_dim, hidden_dim)
+        self.act = nn.GELU()
+        self.fc2 = nn.Linear(hidden_dim, output_dim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.fc2(self.act(self.fc1(x)))
+class MOSAProjector(nn.Module):
+    """MOSA-Base projector: simple 2-layer ReLU router with 4 simple adapters.
+    Based on "MOSA: Mixtures of Simple Adapters" (arXiv:2508.18998).
+    Uses softmax gating over all experts (dense MoE) with only cross-entropy loss.
+    Uses Conv1d for downsampling (2 layers, stride 2 each = 4x total).
+    """
+    def __init__(self, config):
+        """Initialize MOSA projector.
+        Args:
+            config: ASRConfig with encoder_dim, llm_dim, num_experts
+        """
+        super().__init__()
+        self.encoder_dim = getattr(config, "encoder_dim", None) or 1280
+        self.llm_dim = getattr(config, "llm_dim", None) or 2048
+        self.num_experts = getattr(config, "num_experts", None) or 4  # MOSA-Base uses 4
+        adapter_hidden = getattr(config, "adapter_hidden_dim", None) or 4096
+        router_hidden = getattr(config, "router_hidden_dim", None) or 512
+        # --- 1. Conv1d Downsampler (4x reduction) ---
+        # 2 layers of stride-2 convolution
+        self.downsampler = nn.Sequential(
+            nn.Conv1d(self.encoder_dim, self.encoder_dim, kernel_size=3, stride=2, padding=1),
+            nn.GELU(),
+            nn.Conv1d(self.encoder_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
+            nn.GELU(),
+        )
+        # --- 2. Simple Router (MOSA-Base: 2 layers with ReLU) ---
+        # Takes downsampled features (llm_dim) -> 512 -> num_experts
+        self.router = nn.Sequential(
+            nn.Linear(self.llm_dim, router_hidden),
+            nn.ReLU(),
+            nn.Linear(router_hidden, self.num_experts),
+        )
+        # --- 3. Experts (Simple 2-layer GELU adapters) ---
+        # Each expert: llm_dim -> hidden -> llm_dim (much smaller than frame-stacking)
+        self.experts = nn.ModuleList(
+            [
+                SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim)
+                for _ in range(self.num_experts)
+            ]
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project audio features using mixture of experts.
+        Args:
+            x: Audio encoder output of shape [batch, seq_len, encoder_dim]
+        Returns:
+            Projected features of shape [batch, out_len, llm_dim]
+        """
+        # --- 1. Conv1d Downsampling ---
+        # Permute for Conv1d: [B, S, D] -> [B, D, S]
+        x = x.transpose(1, 2)
+        x = self.downsampler(x)
+        # Permute back: [B, D, S] -> [B, S, D]
+        x = x.transpose(1, 2)
+        # --- 2. Routing ---
+        routing_weights = F.softmax(self.router(x), dim=-1)  # (B, out_len, num_experts)
+        # --- 3. Expert Mixture (Dense Execution) ---
+        expert_outputs = torch.stack([expert(x) for expert in self.experts])  # (E, B, out_len, D)
+        return torch.einsum("ebsd, bse -> bsd", expert_outputs, routing_weights)
+    def get_output_length(self, input_length: int) -> int:
+        """Calculate output sequence length after Conv1d downsampling (4x reduction)."""
+        # Conv1d with stride 2, kernel 3, padding 1: out = (in + 2*1 - 3) // 2 + 1 = (in - 1) // 2 + 1
+        # Applied twice for 4x total reduction
+        after_conv1 = (input_length + 2 * 1 - 3) // 2 + 1
+        return (after_conv1 + 2 * 1 - 3) // 2 + 1
+# =============================================================================
+# MoE Projector (Shared Expert + Sparse Routed Experts)
+# =============================================================================
+class SharedMoEBlock(nn.Module):
+    """MoE block with Shared + Sigmoid-Routed Experts."""
+    def __init__(
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_experts: int = 4,
+        top_k: int = 2,
+    ):
+        super().__init__()
+        self.num_experts = num_experts
+        self.top_k = top_k
+        self.output_dim = output_dim
+        # RMSNorm before routing
+        self.norm = LlamaRMSNorm(input_dim, eps=1e-8)
+        self.router = nn.Linear(input_dim, num_experts, bias=False)
+        nn.init.normal_(self.router.weight, mean=0.0, std=0.02)
+        self.shared_expert = SimpleAdapter(input_dim, hidden_dim, output_dim)
+        self.experts = nn.ModuleList(
+            [SimpleAdapter(input_dim, hidden_dim, output_dim) for _ in range(num_experts)]
+        )
+        self.last_router_logits = None
+        self.last_router_probs = None
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_len, dim = hidden_states.shape
+        # 1. Apply Shared Expert
+        normed_states = self.norm(hidden_states)
+        shared_out = self.shared_expert(normed_states)
+        # 2. Router Logic (Sigmoid Style)
+        flat_hidden = normed_states.view(-1, dim)
+        router_logits = self.router(flat_hidden)
+        # Sigmoid routing
+        router_probs = torch.sigmoid(router_logits)
+        self.last_router_logits = router_logits
+        self.last_router_probs = router_probs
+        # 3. Top-K Selection
+        top_k_scores, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
+        # Normalize weights
+        top_k_weights = top_k_scores / (top_k_scores.sum(dim=-1, keepdim=True) + 1e-6)
+        top_k_weights = top_k_weights.to(hidden_states.dtype)
+        # 4. Dispatch
+        routed_out = self._dispatch_experts(flat_hidden, top_k_indices, top_k_weights)
+        routed_out = routed_out.view(batch_size, seq_len, -1)
+        return shared_out + routed_out
+    def _dispatch_experts(
+        self,
+        hidden_states: torch.Tensor,
+        top_k_indices: torch.Tensor,
+        top_k_weights: torch.Tensor,
+    ) -> torch.Tensor:
+        num_tokens = hidden_states.shape[0]
+        output = torch.zeros(
+            num_tokens, self.output_dim, device=hidden_states.device, dtype=hidden_states.dtype
+        )
+        for expert_idx, expert in enumerate(self.experts):
+            expert_mask = top_k_indices == expert_idx
+            if not expert_mask.any():
+                continue
+            token_indices, slot_indices = torch.where(expert_mask)
+            expert_input = hidden_states[token_indices]
+            expert_output = expert(expert_input).to(output.dtype)
+            weights = top_k_weights[token_indices, slot_indices].unsqueeze(-1)
+            output.index_add_(0, token_indices, expert_output * weights)
+        return output
+def load_balancing_loss(router_probs: torch.Tensor, num_experts: int, top_k: int) -> torch.Tensor:
+    """Auxiliary loss to encourage balanced expert usage."""
+    prob_per_expert = router_probs.mean(dim=0)
+    target_mean = prob_per_expert.mean()
+    return (prob_per_expert - target_mean).square().sum() * num_experts
+def z_loss(router_logits: torch.Tensor) -> torch.Tensor:
+    """Z-loss to prevent router logits from growing too large."""
+    return torch.logsumexp(router_logits.float(), dim=-1).square().mean()
+class MoEAudioProjector(nn.Module):
+    """MoE projector with shared expert + sparse routed experts."""
+    def __init__(self, config):
+        """Initialize MoE projector.
+        Args:
+            config: ASRConfig with encoder_dim, llm_dim, num_experts, num_experts_per_tok
+        """
+        super().__init__()
+        self.k = getattr(config, "projector_pool_stride", 4)
+        encoder_dim = config.encoder_dim
+        # Depthwise Conv for temporal mixing
+        self.temporal_conv = nn.Conv1d(
+            encoder_dim, encoder_dim, kernel_size=3, padding=1, groups=encoder_dim
+        )
+        in_dim = encoder_dim * self.k
+        out_dim = config.llm_dim
+        hidden_dim = getattr(config, "projector_hidden_dim", None) or in_dim
+        self.num_experts = getattr(config, "num_experts", 4)
+        self.top_k = getattr(config, "num_experts_per_tok", 2)
+        self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.02)
+        self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)
+        self.moe = SharedMoEBlock(in_dim, hidden_dim, out_dim, self.num_experts, self.top_k)
+        self._init_weights()
+    def _init_weights(self):
+        with torch.no_grad():
+            nn.init.orthogonal_(self.moe.shared_expert.fc1.weight)
+            nn.init.orthogonal_(self.moe.shared_expert.fc2.weight, gain=0.5)
+            for expert in self.moe.experts:
+                nn.init.orthogonal_(expert.fc1.weight)
+                nn.init.orthogonal_(expert.fc2.weight, gain=0.01)
+    def get_output_length(self, input_length: int) -> int:
+        """Calculate output sequence length given input length."""
+        # Temporal pooling with stride k
+        if input_length % self.k:
+            input_length += self.k - input_length % self.k
+        return input_length // self.k
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Project audio features using shared + sparse MoE.
+        Args:
+            x: Audio encoder output of shape [batch, seq_len, encoder_dim]
+        Returns:
+            Projected features of shape [batch, out_len, llm_dim]
+        """
+        batch_size, seq_len, dim = x.size()
+        target_dtype = self.moe.shared_expert.fc1.weight.dtype
+        if x.dtype != target_dtype:
+            x = x.to(target_dtype)
+        # Temporal Context Injection
+        x_ctx = x.transpose(1, 2)
+        x_ctx = self.temporal_conv(x_ctx)
+        x = x + x_ctx.transpose(1, 2)
+        if seq_len % self.k:
+            x = F.pad(x, (0, 0, 0, self.k - seq_len % self.k))
+        x = x.view(batch_size, -1, dim * self.k)
+        return self.moe(x)
+    def get_aux_loss(self) -> torch.Tensor:
+        if self.moe.last_router_logits is None:
+            return torch.tensor(0.0, device=self.moe.router.weight.device)
+        balance = load_balancing_loss(self.moe.last_router_probs, self.num_experts, self.top_k)
+        z = z_loss(self.moe.last_router_logits)
+        return self.aux_loss_coef * balance + self.z_loss_coef * z
+# =============================================================================
+# QFormer Projector (Granite-style)
+# =============================================================================
+class QFormerAudioProjector(nn.Module):
+    """
+    BLIP-2 QFormer projector with learnable queries.
+    Based on GraniteSpeechEncoderProjector - uses a QFormer model with learnable
+    query embeddings to compress and project audio encoder outputs. The audio
+    sequence is processed in windows and downsampled via cross-attention.
+    """
+    def __init__(self, config):
+        """Initialize QFormer projector.
+        Args:
+            config: ASRConfig with encoder_dim, llm_dim, qformer_* settings
+        """
+        super().__init__()
+        encoder_dim = config.encoder_dim
+        llm_dim = config.llm_dim
+        # Window and downsampling parameters (Granite defaults: window=15, downsample=5)
+        self.window_size = getattr(config, "qformer_window_size", 15)
+        self.downsample_rate = getattr(config, "downsample_rate", 5)
+        self.num_queries = self.window_size // self.downsample_rate
+        # QFormer hidden size (matches encoder for cross-attention)
+        qformer_hidden = getattr(config, "qformer_hidden_size", None) or encoder_dim
+        qformer_num_layers = getattr(config, "qformer_num_layers", 2)
+        qformer_num_heads = getattr(config, "qformer_num_heads", 16)
+        qformer_intermediate = getattr(config, "qformer_intermediate_size", None) or (
+            qformer_hidden * 4
+        )
+        # Learnable query embeddings (Granite uses std=1.0)
+        self.query = nn.Parameter(torch.zeros(1, self.num_queries, qformer_hidden))
+        self.query.data.normal_(mean=0.0, std=1.0)
+        # Optional projection if encoder dim != qformer hidden
+        if encoder_dim != qformer_hidden:
+            self.encoder_proj = nn.Linear(encoder_dim, qformer_hidden, bias=False)
+        else:
+            self.encoder_proj = None
+        # Configure QFormer to match Granite's exact config
+        qformer_config = Blip2QFormerConfig(
+            hidden_size=qformer_hidden,
+            num_hidden_layers=qformer_num_layers,
+            num_attention_heads=qformer_num_heads,
+            intermediate_size=qformer_intermediate,
+            encoder_hidden_size=qformer_hidden,
+            cross_attention_frequency=1,
+            # Granite-specific settings
+            hidden_act="gelu",
+            attention_probs_dropout_prob=0.1,
+            hidden_dropout_prob=0.1,
+            layer_norm_eps=1e-12,
+            initializer_range=0.02,
+        )
+        self.qformer = AutoModel.from_config(qformer_config)
+        # Final projection to LLM dimension (Granite uses bias=True)
+        self.linear = nn.Linear(qformer_hidden, llm_dim)
+    def get_output_length(self, input_length: int) -> int:
+        """Calculate output sequence length given input length."""
+        # QFormer uses window-based processing with num_queries per window
+        nblocks = math.ceil(input_length / self.window_size)
+        return nblocks * self.num_queries
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """
+        Args:
+            hidden_states: [batch_size, seq_len, encoder_dim]
+        Returns:
+            projected: [batch_size, num_output_tokens, llm_dim]
+        """
+        batch_size, seq_len, dim = hidden_states.size()
+        # Ensure float dtype for QFormer
+        target_dtype = self.query.dtype
+        if hidden_states.dtype != target_dtype:
+            hidden_states = hidden_states.to(target_dtype)
+        # Optional encoder projection
+        if self.encoder_proj is not None:
+            hidden_states = self.encoder_proj(hidden_states)
+        # Compute number of windows and pad to fit
+        nblocks = math.ceil(seq_len / self.window_size)
+        pad = nblocks * self.window_size - seq_len
+        if pad > 0:
+            hidden_states = F.pad(hidden_states, (0, 0, 0, pad), "constant", 0)
+        # Reshape to process each window: [batch*nblocks, window_size, dim]
+        effective_batch = batch_size * nblocks
+        hidden_states = hidden_states.view(effective_batch, self.window_size, -1)
+        # Expand queries to match batch size
+        query_embeds = self.query.expand(effective_batch, -1, -1)
+        # QFormer cross-attention
+        query_output = self.qformer(
+            query_embeds=query_embeds,
+            encoder_hidden_states=hidden_states,
+            return_dict=True,
+        )
+        # Reshape back: [batch, nblocks * num_queries, hidden]
+        output_tokens = nblocks * self.num_queries
+        query_proj = query_output.last_hidden_state.view(batch_size, output_tokens, -1)
+        # Project to LLM dimension
+        return self.linear(query_proj)
+# =============================================================================
+# Projector Registry
+# =============================================================================
+# Registry mapping a projector type name to the nn.Module class implementing it.
+# NOTE(review): presumably looked up with a projector-type setting from the config - confirm against caller.
+PROJECTOR_CLASSES = {
+    "mlp": MLPAudioProjector,
+    "mosa": MOSAProjector,
+    "moe": MoEAudioProjector,
+    "qformer": QFormerAudioProjector,
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+# Core dependencies for tiny-audio model inference
+# This file is pushed to the HuggingFace model repository
+# Transformers - main library for model loading and inference
+transformers>=4.57.0