mazesmazes committed (verified)
Commit 3f3ad46 · 1 Parent(s): cbe595b

Training in progress - step 1000

asr_config.py CHANGED
@@ -11,9 +11,8 @@ class ASRConfig(transformers.PretrainedConfig):
         self,
         audio_model_id: str = "openai/whisper-large-v3-turbo",
         text_model_id: str = "HuggingFaceTB/SmolLM3-3B",
-        attn_implementation: str = "sdpa",
+        attn_implementation: str = "flash_attention_2",
         model_dtype: str = "bfloat16",
-        audio_downsample_rate: int = 5,  # Deprecated: use projector_pool_stride instead
         num_beams: Optional[int] = None,
         system_prompt: str = "/no_think /system_override",
         user_prompt: str = "Transcribe: <audio>",
@@ -22,8 +21,18 @@ class ASRConfig(transformers.PretrainedConfig):
         audio_sample_rate: int = 16000,
         projector_init_std: float = 0.02,
         projector_pool_stride: int = 2,
+        downsample_rate: int = 16,
         projector_hidden_dim: Optional[int] = None,
-        projector_dropout: float = 0.0,  # Dropout rate for projector layers
+        projector_type: str = "moe",  # "moe", "swiglu", "residual", "shared_moe", "mlp"
+        projector_num_layers: int = 2,  # Number of layers (for residual projector)
+        projector_dropout: float = 0.05,  # Dropout rate for projector layers
+        projector_input_noise: float = 0.02,  # Input noise for projector
+        # MoE-specific configuration
+        num_experts: int = 4,  # Number of experts in MoE projectors
+        num_experts_per_tok: int = 2,  # Top-k experts per token
+        router_aux_loss_coef: float = 0.01,  # Auxiliary loss coefficient for load balancing
+        use_specaugment: bool = True,  # Apply SpecAugment during training
+        label_smoothing: float = 0.0,  # Label smoothing for cross-entropy loss
         inference_diversity_penalty: float = 0.0,
         inference_warmup_tokens: int = 10,
         max_new_tokens: Optional[int] = None,
@@ -42,10 +51,12 @@ class ASRConfig(transformers.PretrainedConfig):
         # Set default generation parameters
         generation_defaults = {
             "num_beams": 1,
-            "max_new_tokens": 128,
-            "min_new_tokens": 1,
+            "max_new_tokens": 96,
+            "min_new_tokens": 0,
             "do_sample": False,
-            "repetition_penalty": 1.05,
+            "temperature": 0.1,
+            "repetition_penalty": 1.0,
+            "length_penalty": 1.0,
             "no_repeat_ngram_size": 0,
             "use_cache": True,
         }
@@ -57,7 +68,6 @@ class ASRConfig(transformers.PretrainedConfig):
         self.text_model_id = text_model_id
         self.attn_implementation = attn_implementation
         self.model_dtype = model_dtype
-        self.audio_downsample_rate = audio_downsample_rate
         self.system_prompt = system_prompt
         self.user_prompt = user_prompt
         self.encoder_dim = encoder_dim
@@ -65,12 +75,55 @@ class ASRConfig(transformers.PretrainedConfig):
         self.audio_sample_rate = audio_sample_rate
         self.projector_init_std = projector_init_std
         self.projector_pool_stride = projector_pool_stride
+        self.downsample_rate = downsample_rate
         self.projector_hidden_dim = projector_hidden_dim
+        self.projector_type = projector_type
+        self.projector_num_layers = projector_num_layers
         self.projector_dropout = projector_dropout
+        self.projector_input_noise = projector_input_noise
+        # MoE-specific configuration
+        self.num_experts = num_experts
+        self.num_experts_per_tok = num_experts_per_tok
+        self.router_aux_loss_coef = router_aux_loss_coef
+        self.use_specaugment = use_specaugment
+        self.label_smoothing = label_smoothing
         self.inference_diversity_penalty = inference_diversity_penalty
         self.inference_warmup_tokens = inference_warmup_tokens
+
+        # Generation parameters (use explicit value if provided, else use default)
+        self.num_beams = num_beams if num_beams is not None else generation_defaults["num_beams"]
+        self.max_new_tokens = (
+            max_new_tokens if max_new_tokens is not None else generation_defaults["max_new_tokens"]
+        )
+        self.min_new_tokens = (
+            min_new_tokens if min_new_tokens is not None else generation_defaults["min_new_tokens"]
+        )
+        self.do_sample = do_sample if do_sample is not None else generation_defaults["do_sample"]
+        self.repetition_penalty = (
+            repetition_penalty
+            if repetition_penalty is not None
+            else generation_defaults["repetition_penalty"]
+        )
+        self.length_penalty = (
+            length_penalty if length_penalty is not None else generation_defaults["length_penalty"]
+        )
+        self.no_repeat_ngram_size = (
+            no_repeat_ngram_size
+            if no_repeat_ngram_size is not None
+            else generation_defaults["no_repeat_ngram_size"]
+        )
+        self.use_cache = use_cache if use_cache is not None else generation_defaults["use_cache"]
+        self.temperature = (
+            temperature if temperature is not None else generation_defaults["temperature"]
+        )
+        self.top_k = top_k
+        self.top_p = top_p
+        self.early_stopping = early_stopping
+
         if "audio_config" not in kwargs:
             self.audio_config = transformers.AutoConfig.from_pretrained(audio_model_id)
+            # Override dtype to match model_dtype
+            self.audio_config.dtype = model_dtype
         else:
             self.audio_config = kwargs.pop("audio_config")

@@ -78,20 +131,16 @@ class ASRConfig(transformers.PretrainedConfig):
             self.text_config = transformers.AutoConfig.from_pretrained(
                 text_model_id, trust_remote_code=True
             )
+            # Override dtype to match model_dtype
+            self.text_config.dtype = model_dtype
         else:
             self.text_config = kwargs.pop("text_config")

         if isinstance(self.text_config, dict):
             # Reconstruct config from dict using the model_type stored in the dict
-            model_type = self.text_config.get("model_type")
-            if model_type:
-                config_class = transformers.AutoConfig.for_model(model_type).__class__
-                self.text_config = config_class(**self.text_config)
-            else:
-                # Fallback: try to load from model_id
-                self.text_config = transformers.AutoConfig.from_pretrained(
-                    text_model_id, trust_remote_code=True
-                )
+            model_type = self.text_config["model_type"]
+            config_class = transformers.AutoConfig.for_model(model_type).__class__
+            self.text_config = config_class(**self.text_config)

         if isinstance(self.audio_config, dict):
             model_type = self.audio_config.get("model_type")
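
Note: the constructor resolves every generation parameter with one rule, an explicit argument wins and `None` falls back to `generation_defaults`. A minimal sketch of that rule, using only names from the diff above:

```python
# Resolution rule used by ASRConfig for generation parameters.
generation_defaults = {"num_beams": 1, "max_new_tokens": 96, "min_new_tokens": 0}

def resolve(explicit, key):
    return explicit if explicit is not None else generation_defaults[key]

assert resolve(None, "max_new_tokens") == 96  # falls back to the default
assert resolve(4, "num_beams") == 4           # explicit value wins
```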
asr_modeling.py CHANGED
@@ -1,148 +1,78 @@
 from pathlib import Path
 from typing import Optional, Union

 import torch
 import torch.nn as nn
-import torch.nn.functional as F  # noqa: N812
 from transformers import (
     AutoConfig,
     AutoModel,
     AutoModelForCausalLM,
     AutoTokenizer,
     PreTrainedModel,
-    Wav2Vec2FeatureExtractor,
 )
-from transformers.generation.utils import (
-    GenerateBeamDecoderOnlyOutput,
-    GenerateBeamEncoderDecoderOutput,
-    GenerateDecoderOnlyOutput,
-    GenerateEncoderDecoderOutput,
 )

 try:
     from .asr_config import ASRConfig
 except ImportError:
     from asr_config import ASRConfig  # type: ignore[no-redef]


-class SwiGLU(nn.Module):
-    def __init__(self, in_features, hidden_features, out_features, bias=False, dropout=0.0):
-        super().__init__()
-        self.w1 = nn.Linear(in_features, hidden_features, bias=bias)
-        self.w2 = nn.Linear(in_features, hidden_features, bias=bias)
-        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
-        self.act = nn.SiLU()
-        self.dropout = nn.Dropout(dropout)
-
-    def forward(self, x):
-        x_gate = self.act(self.w1(x))
-        x_val = self.w2(x)
-        x = x_gate * x_val
-        x = self.dropout(x)
-        return self.w3(x)
-
-
-class AudioProjector(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.k = getattr(config, "projector_pool_stride", 2)  # Downsampling rate
-        in_dim = config.encoder_dim * self.k
-        out_dim = config.llm_dim
-        hidden_dim = config.projector_hidden_dim
-        if hidden_dim is None:
-            hidden_dim = config.encoder_dim * 4
-
-        dropout_rate = getattr(config, "projector_dropout", 0.0)
-
-        from transformers.models.llama.modeling_llama import LlamaRMSNorm
-
-        self.ln_pre = LlamaRMSNorm(in_dim, eps=1e-6)
-        self.proj = SwiGLU(in_dim, hidden_dim, out_dim, dropout=dropout_rate)
-        self.ln_post = LlamaRMSNorm(out_dim, eps=1e-6)
-        self.output_dropout = nn.Dropout(dropout_rate)
-
-        with torch.no_grad():
-            std = getattr(config, "projector_init_std", 0.02)
-            self.ln_pre.weight.data.fill_(1.0)
-            self.ln_post.weight.data.fill_(1.0)
-            nn.init.normal_(self.proj.w1.weight, mean=0.0, std=std)
-            nn.init.normal_(self.proj.w2.weight, mean=0.0, std=std)
-            nn.init.normal_(self.proj.w3.weight, mean=0.0, std=std)
-
-    def forward(self, x):
-        batch_size, seq_len, dim = x.size()
-
-        target_dtype = self.proj.w1.weight.dtype
-        if x.dtype != target_dtype:
-            x = x.to(target_dtype)
-
-        remainder = seq_len % self.k
-        if remainder:
-            pad_len = self.k - remainder
-            x = F.pad(x, (0, 0, 0, pad_len))
-
-        x = x.contiguous().view(batch_size, -1, dim * self.k)
-        x = self.ln_pre(x)
-        x = self.proj(x)
-        x = self.ln_post(x)
-
-        return self.output_dropout(x)
-
-
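For reference, the removed AudioProjector downsampled by stacking `projector_pool_stride` adjacent encoder frames into one wider vector before the SwiGLU projection. A small sketch of that reshape (the 1280-dim shape matches the whisper-large-v3-turbo encoder and is illustrative only):

```python
import torch

# Stack k adjacent frames: (batch, seq, dim) -> (batch, seq // k, dim * k).
batch_size, seq_len, dim, k = 2, 1500, 1280, 2
x = torch.randn(batch_size, seq_len, dim)
stacked = x.contiguous().view(batch_size, seq_len // k, dim * k)
print(stacked.shape)  # torch.Size([2, 750, 2560]): half the frames, twice the width
```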
-class ASRModel(PreTrainedModel):
     config_class = ASRConfig
     base_model_prefix = "model"
-    main_input_name = "input_values"
     _supports_flash_attn_2 = True
     supports_gradient_checkpointing = True
     _is_loading_from_pretrained: bool = False
     _pretrained_model_path: Optional[str] = None

-    # Task to prompt mapping for generation
-    TASK_PROMPTS = {
-        "transcribe": "Transcribe: <audio>",
-        "continue": "Continue: <audio>",
-        "describe": "Describe: <audio>",
-        "emotion": "Emotion: <audio>",
-    }
-
-    @staticmethod
-    def _create_feature_extractor(audio_model_id: str):
-        """Factory method to create the appropriate feature extractor."""
-        is_whisper = "whisper" in audio_model_id.lower()
-        if is_whisper:
-            from transformers import WhisperConfig, WhisperFeatureExtractor
-
-            encoder_config = WhisperConfig.from_pretrained(audio_model_id)
-            num_mel_bins = encoder_config.num_mel_bins
-            return WhisperFeatureExtractor.from_pretrained(
-                audio_model_id,
-                feature_size=num_mel_bins,
-            )
-        return Wav2Vec2FeatureExtractor.from_pretrained(audio_model_id)

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        from transformers import AutoFeatureExtractor

         config = kwargs.pop("config", None)
         if config is None:
             config = ASRConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

-        # Load feature extractor from saved model directory
-        kwargs["feature_extractor"] = AutoFeatureExtractor.from_pretrained(
-            pretrained_model_name_or_path, **kwargs
-        )
-
         cls._is_loading_from_pretrained = True
         cls._pretrained_model_path = pretrained_model_name_or_path

         try:
-            from safetensors.torch import load_file
-            from transformers.utils.hub import cached_file
-
             model = cls(config, **kwargs)

             subfolder = kwargs.get("subfolder")
             revision = kwargs.get("revision")
             cache_kwargs = {}
@@ -158,102 +88,76 @@ class ASRModel(PreTrainedModel):
                 **cache_kwargs,
             )

-            if not model_file:
-                raise FileNotFoundError(
-                    f"model.safetensors not found in {pretrained_model_name_or_path}. "
-                    "The repository may not have been trained yet."
-                )
-
-            state_dict = load_file(model_file)
-            model.load_state_dict(state_dict, strict=False, assign=True)
-
-            target_dtype = getattr(torch, config.model_dtype)
-            model.projector = model.projector.to(dtype=target_dtype)
-
-            device = kwargs.get("device")
-            if device is not None:
-                model = model.to(device)

             return model
         finally:
             cls._is_loading_from_pretrained = False
-            del cls._pretrained_model_path

     def __init__(self, config: ASRConfig, **kwargs):
         super().__init__(config)

-        feature_extractor = kwargs.pop("feature_extractor", None)
-
         self.system_prompt = config.system_prompt

-        self.encoder = self._create_encoder(config)
-
-        is_whisper = "whisper" in config.audio_model_id.lower() or (
-            hasattr(self.encoder.config, "model_type")
-            and "whisper" in self.encoder.config.model_type.lower()
-        )
-
-        if is_whisper:
-            self.main_input_name = "input_features"
-        else:
-            self.main_input_name = "input_values"
-
-        if feature_extractor is not None:
-            self.feature_extractor = feature_extractor
         else:
-            self.feature_extractor = self._create_feature_extractor(config.audio_model_id)
-
-        self.decoder = self._create_decoder(config)
-        self.generation_config = self.decoder.generation_config
-
-        self._init_tokenizer()
-
-        from types import SimpleNamespace

-        encoder_dim = config.encoder_dim
-        if encoder_dim is None:
-            if hasattr(self.encoder.config, "hidden_size"):
-                encoder_dim = self.encoder.config.hidden_size
-            elif hasattr(self.encoder.config, "d_model"):
-                encoder_dim = self.encoder.config.d_model
-            else:
-                raise ValueError("Could not auto-detect encoder_dim. Please specify in config.")

-        llm_dim = config.llm_dim
-        if llm_dim is None:
-            if hasattr(self.decoder.config, "hidden_size"):
-                llm_dim = self.decoder.config.hidden_size
-            elif hasattr(self.decoder.config, "d_model"):
-                llm_dim = self.decoder.config.d_model
-            else:
-                raise ValueError("Could not auto-detect llm_dim. Please specify in config.")

-        projector_config = SimpleNamespace(
-            encoder_dim=encoder_dim,
-            llm_dim=llm_dim,
-            projector_pool_stride=getattr(config, "projector_pool_stride", 2),
-            projector_hidden_dim=getattr(config, "projector_hidden_dim", None),
-            projector_init_std=getattr(config, "projector_init_std", 0.02),
-            projector_dropout=getattr(config, "projector_dropout", 0.0),
-        )
-        self.projector = AudioProjector(projector_config)

-        target_dtype = getattr(torch, config.model_dtype)
-        self.projector = self.projector.to(dtype=target_dtype)

-        self._no_split_modules = self.decoder._no_split_modules

     @classmethod
-    def _create_encoder(cls, config: ASRConfig):
-        target_dtype = getattr(torch, config.model_dtype)
-
         encoder_kwargs = {
             "attn_implementation": config.attn_implementation,
-            "dtype": target_dtype,
             "low_cpu_mem_usage": True,
         }
-        if not cls._is_loading_from_pretrained:
-            encoder_kwargs["device_map"] = "auto"

         if "whisper" in config.audio_model_id.lower():
             from transformers import WhisperModel
@@ -264,471 +168,414 @@ class ASRModel(PreTrainedModel):
         else:
             encoder = AutoModel.from_pretrained(config.audio_model_id, **encoder_kwargs)

-        is_whisper = "whisper" in config.audio_model_id.lower() or (
-            hasattr(encoder.config, "model_type") and "whisper" in encoder.config.model_type.lower()
-        )
-
-        original_forward = encoder.forward
-        input_key = "input_features" if is_whisper else "input_values"
-
-        def safe_encoder_forward(self_encoder, input_values=None, **kwargs):
-            kwargs.pop("input_ids", None)
-            return original_forward(**{input_key: input_values}, **kwargs)
-
-        import types
-
-        encoder.forward = types.MethodType(safe_encoder_forward, encoder)
         encoder.requires_grad_(False)
-
         return encoder

     @classmethod
-    def _create_decoder(cls, config: ASRConfig):
-        target_dtype = getattr(torch, config.model_dtype)
-
         decoder_kwargs = {
             "attn_implementation": config.attn_implementation,
-            "dtype": target_dtype,
             "trust_remote_code": True,
         }

         decoder = AutoModelForCausalLM.from_pretrained(config.text_model_id, **decoder_kwargs)
-        decoder.config.use_cache = config.use_cache
         decoder.requires_grad_(False)
-
         return decoder

-    def _init_weights(self, module):
-        pass

-    def can_generate(self) -> bool:
-        return True

-    @property
-    def _tied_weights_keys(self):
-        if hasattr(self.decoder, "_tied_weights_keys"):
-            return [f"decoder.{k}" for k in self.decoder._tied_weights_keys]
-        return []

-    def _init_tokenizer(self):
-        model_path = (
-            self.__class__._pretrained_model_path
-            if self._is_loading_from_pretrained
-            else self.config.text_model_id
-        )

-        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

         if (
             self.tokenizer.pad_token is None
             or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
         ) and "<|finetune_right_pad_id|>" in self.tokenizer.get_vocab():
             self.tokenizer.pad_token = "<|finetune_right_pad_id|>"

         existing_special = self.tokenizer.additional_special_tokens or []
-
         if "<audio>" not in existing_special:
-            special_tokens = {"additional_special_tokens": existing_special + ["<audio>"]}
-            num_added_tokens = self.tokenizer.add_special_tokens(special_tokens)
-            if num_added_tokens > 0:
-                self.decoder.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)
-
-        current_embed_size = self.decoder.get_input_embeddings().weight.shape[0]
-        expected_size = len(self.tokenizer)
-        if current_embed_size != expected_size:
-            self.decoder.resize_token_embeddings(expected_size, mean_resizing=False)

         self.audio_token_id = self.tokenizer.convert_tokens_to_ids("<audio>")
-
         self.tokenizer.padding_side = "right"

-        for cfg in [self.config.text_config, self.decoder.config, self.generation_config]:
-            if isinstance(cfg, dict):
-                cfg["pad_token_id"] = self.tokenizer.pad_token_id
-                cfg["eos_token_id"] = self.tokenizer.eos_token_id
-                cfg["bos_token_id"] = self.tokenizer.bos_token_id
-            else:
                 cfg.pad_token_id = self.tokenizer.pad_token_id
                 cfg.eos_token_id = self.tokenizer.eos_token_id
                 cfg.bos_token_id = self.tokenizer.bos_token_id

-    def get_processor(self):
-        try:
-            from .asr_processing import ASRProcessor
-        except ImportError:
-            from asr_processing import ASRProcessor  # type: ignore[no-redef]
-
-        return ASRProcessor(feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)
-
-    def state_dict(self, *args, **kwargs):
-        return self._get_trainable_state_dict()
-
-    def _get_trainable_state_dict(self):
-        state = {}
-
-        projector_state = self.projector.state_dict()
-        for name, tensor in projector_state.items():
-            state[f"projector.{name}"] = tensor

-        return state

     def get_input_embeddings(self):
-        return self.decoder.get_input_embeddings()

     def set_input_embeddings(self, value):
-        self.decoder.set_input_embeddings(value)

     def get_output_embeddings(self):
-        return self.decoder.get_output_embeddings()

     def set_output_embeddings(self, value):
-        self.decoder.set_output_embeddings(value)

-    def _encode_audio(
-        self,
-        input_values: torch.Tensor,
-        audio_attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        encoder_device = next(self.encoder.parameters()).device
-        encoder_dtype = next(self.encoder.parameters()).dtype
-        input_values = input_values.clone().to(device=encoder_device, dtype=encoder_dtype)
-
-        with torch.no_grad():
-            audio_features = self.encoder(
-                input_values=input_values,
-                attention_mask=audio_attention_mask,
-            ).last_hidden_state
-
-        audio_embeds = self.projector(audio_features)
-
-        decoder_dtype = next(self.decoder.parameters()).dtype
-        if audio_embeds.dtype != decoder_dtype:
-            audio_embeds = audio_embeds.to(dtype=decoder_dtype)
-
-        return audio_embeds
-
-    def _get_audio_expansion_details(self, input_ids: torch.Tensor, num_audio_tokens: int) -> dict:
-        batch_size, seq_len = input_ids.shape
-        device = input_ids.device
-        audio_mask = input_ids == self.audio_token_id
-
-        audio_counts = audio_mask.sum(dim=1)
-        if not (audio_counts == 1).all():
-            missing = (audio_counts == 0).any()
-            multiple = (audio_counts > 1).any()
-            if missing:
-                raise ValueError("Some samples are missing audio token")
-            if multiple:
-                raise ValueError("Some samples have multiple audio tokens")
-
-        token_counts = torch.where(audio_mask, num_audio_tokens, 1)
-        cumsum_counts = torch.cumsum(token_counts, dim=1)
-        new_start_positions = torch.cat(
-            [
-                torch.zeros(batch_size, 1, dtype=torch.long, device=device),
-                cumsum_counts[:, :-1],
-            ],
-            dim=1,
-        )

-        new_seq_len = seq_len - 1 + num_audio_tokens

-        return {
-            "new_seq_len": new_seq_len,
-            "new_start_positions": new_start_positions,
-            "audio_mask": audio_mask,
-        }

-    def _expand_tensor_for_audio(
         self,
-        input_ids: torch.Tensor,
-        tensor_to_expand: Optional[torch.Tensor],
-        num_audio_tokens: int,
-        fill_value: Optional[Union[int, float]] = None,
-        audio_fill_value: Optional[Union[int, float]] = None,
     ) -> torch.Tensor:
-        batch_size, seq_len = input_ids.shape
-        device = input_ids.device
-
-        details = self._get_audio_expansion_details(input_ids, num_audio_tokens)
-        new_seq_len = details["new_seq_len"]
-        new_start_positions = details["new_start_positions"]
-        audio_mask = details["audio_mask"]
-
-        if tensor_to_expand is None:
-            tensor_to_expand = input_ids
-            fill_value = fill_value or self.tokenizer.pad_token_id
-            audio_fill_value = audio_fill_value or self.audio_token_id
-        else:
-            if fill_value is None:
-                raise ValueError("fill_value must be provided when expanding non-input_ids tensors")
-            if audio_fill_value is None:
-                audio_fill_value = fill_value
-
-        assert tensor_to_expand is not None
-
-        expanded = torch.full(
-            (batch_size, new_seq_len),
-            fill_value,
-            dtype=tensor_to_expand.dtype,
-            device=device,
-        )
-
-        batch_indices = torch.arange(batch_size, device=device).unsqueeze(1).expand(-1, seq_len)
-        non_audio_mask = ~audio_mask
-        expanded[batch_indices[non_audio_mask], new_start_positions[non_audio_mask]] = (
-            tensor_to_expand[non_audio_mask]
-        )
-
-        if audio_fill_value != fill_value:
-            audio_positions = audio_mask.int().argmax(dim=1)
-            audio_new_start = new_start_positions[
-                torch.arange(batch_size, device=device), audio_positions
-            ]
-            audio_token_indices = torch.arange(num_audio_tokens, device=device).unsqueeze(0)
-            audio_positions_expanded = audio_new_start.unsqueeze(1) + audio_token_indices
-            batch_idx_expanded = (
-                torch.arange(batch_size, device=device).unsqueeze(1).expand(-1, num_audio_tokens)
             )
-            expanded[batch_idx_expanded, audio_positions_expanded] = audio_fill_value
-
-        return expanded

-    def _expand_audio_tokens(self, input_ids: torch.Tensor, num_audio_tokens: int) -> torch.Tensor:
-        return self._expand_tensor_for_audio(input_ids, None, num_audio_tokens)

-    def _expand_for_audio_tokens(
         self,
-        input_ids: torch.Tensor,
-        tensor_to_expand: torch.Tensor,
-        num_audio_tokens: int,
-        fill_value: Union[int, float],
     ) -> torch.Tensor:
-        return self._expand_tensor_for_audio(
-            input_ids, tensor_to_expand, num_audio_tokens, fill_value
-        )

-    def _prepare_audio_inputs_embeds(
-        self, expanded_input_ids: torch.Tensor, audio_embeds: torch.Tensor
-    ) -> torch.Tensor:
-        inputs_embeds = self.decoder.get_input_embeddings()(expanded_input_ids)
-        special_audio_mask = (expanded_input_ids == self.audio_token_id).unsqueeze(-1)
-        special_audio_mask = special_audio_mask.expand_as(inputs_embeds)
-        audio_embeds_flat = audio_embeds.reshape(-1, audio_embeds.shape[-1])
-        return inputs_embeds.masked_scatter(special_audio_mask, audio_embeds_flat)

     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
-        input_values: Optional[torch.Tensor] = None,
-        input_features: Optional[torch.Tensor] = None,  # For Whisper
-        labels: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
-        num_items_in_batch: Optional[
-            int
-        ] = None,  # HF Trainer provides this for gradient accumulation
         **kwargs,
-    ):
-        audio_inputs = input_values if input_values is not None else input_features
-        if audio_inputs is not None:
-            if input_ids is None:
-                raise ValueError(
-                    "forward() requires both audio inputs and input_ids (for training). "
-                    "For inference, use the generate() method instead, or use the pipeline "
-                    "which will automatically call generate()."
-                )
-
-            audio_attention_mask = kwargs.pop("audio_attention_mask", None)
-
-            kwargs.pop("past_key_values", None)
-            use_cache = kwargs.pop("use_cache", None)
-
-            audio_embeds = self._encode_audio(
-                input_values=audio_inputs,  # Will be mapped to input_features for Whisper by safe_encoder_forward
-                audio_attention_mask=audio_attention_mask,
         )

-        if self.audio_token_id is None:
-            raise ValueError(f"Audio token not properly initialized: {self.audio_token_id}")

-        vocab_size = self.decoder.get_input_embeddings().weight.shape[0]
-        if self.audio_token_id >= vocab_size:
-            raise ValueError(
-                f"Audio token ID out of range. ID: {self.audio_token_id}, Vocab size: {vocab_size}"
-            )

-        if not (input_ids == self.audio_token_id).any():
-            raise ValueError("Audio token <audio> must be present in input")

-        num_audio_tokens = audio_embeds.shape[1]
-        expanded_input_ids = self._expand_audio_tokens(input_ids, num_audio_tokens)

-        inputs_embeds = self._prepare_audio_inputs_embeds(expanded_input_ids, audio_embeds)

-        if attention_mask is not None:
-            full_attention_mask = self._expand_for_audio_tokens(
-                input_ids, attention_mask, num_audio_tokens, fill_value=1
-            )
-        else:
-            full_attention_mask = None

-        if labels is not None:
-            labels = self._expand_for_audio_tokens(
-                input_ids, labels, num_audio_tokens, fill_value=-100
-            )
-        else:
-            inputs_embeds = self.decoder.get_input_embeddings()(input_ids)
-            full_attention_mask = attention_mask
-            use_cache = kwargs.pop("use_cache", None)

-        return self.decoder(
-            inputs_embeds=inputs_embeds,
-            attention_mask=full_attention_mask,
-            labels=labels,
-            use_cache=use_cache if use_cache is not None else False,
-            **kwargs,
-        )

     @torch.no_grad()
     def generate(
         self,
-        input_values: Optional[torch.Tensor] = None,
-        input_features: Optional[torch.Tensor] = None,  # For Whisper
         system_prompt: Optional[str] = None,
-        user_prompt: Optional[str] = None,
-        task: Optional[str] = None,
         **generate_kwargs,
-    ) -> Union[
-        torch.Tensor,
-        GenerateDecoderOnlyOutput,
-        GenerateEncoderDecoderOutput,
-        GenerateBeamDecoderOnlyOutput,
-        GenerateBeamEncoderDecoderOutput,
-    ]:
-        audio_inputs = input_values if input_values is not None else input_features
-        if audio_inputs is None:
-            raise ValueError("input_values or input_features must be provided for generation")
-
-        audio_embeds = self._encode_audio(audio_inputs)
-        batch_size = audio_embeds.shape[0]
-        device = audio_embeds.device
-
-        if system_prompt is None:
-            system_prompt = self.system_prompt
-
-        if user_prompt is None:
-            user_prompt = (
-                self.TASK_PROMPTS.get(task, self.config.user_prompt or "Transcribe: <audio>")
-                or "Transcribe: <audio>"
-            )
-
-        messages = []
-        if system_prompt:
-            messages.append({"role": "system", "content": system_prompt})
-        messages.append(
-            {
-                "role": "user",
-                "content": user_prompt,
-            }
-        )
-
-        prompt_ids = self.tokenizer.apply_chat_template(
-            messages,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_tensors="pt",
-            enable_thinking=False,
-        ).to(device)
-
-        if len(prompt_ids.shape) == 1:
-            prompt_ids = prompt_ids.unsqueeze(0)
-
-        if prompt_ids.shape[0] == 1 and batch_size > 1:
-            prompt_ids = prompt_ids.expand(batch_size, -1)
-
-        if not (prompt_ids == self.audio_token_id).any():
-            raise ValueError("Audio token <audio> not found in prompt")
-
-        num_audio_tokens = audio_embeds.shape[1]
-        expanded_prompt_ids = self._expand_audio_tokens(prompt_ids, num_audio_tokens)
-        inputs_embeds = self._prepare_audio_inputs_embeds(expanded_prompt_ids, audio_embeds)
-        total_seq_len = inputs_embeds.shape[1]
-        attention_mask = torch.ones(batch_size, total_seq_len, dtype=torch.long, device=device)
-        config_params = [
-            "max_new_tokens",
-            "min_new_tokens",
-            "num_beams",
-            "do_sample",
-            "temperature",
-            "top_k",
-            "top_p",
-            "repetition_penalty",
-            "length_penalty",
-            "no_repeat_ngram_size",
-            "early_stopping",
-        ]
-        for param in config_params:
-            if hasattr(self.config, param) and getattr(self.config, param) is not None:
-                generate_kwargs.setdefault(param, getattr(self.config, param))
-
-        generate_kwargs.setdefault("use_cache", True)
-        generate_kwargs.setdefault(
-            "eos_token_id", self.tokenizer.convert_tokens_to_ids("<|im_end|>")
         )
-        generate_kwargs.setdefault("pad_token_id", self.tokenizer.pad_token_id)
-        prompt_length = expanded_prompt_ids.shape[1]

-        generated_ids = self.decoder.generate(
-            input_ids=expanded_prompt_ids,
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
             **generate_kwargs,
         )

-        return generated_ids[:, prompt_length:]

     def save_pretrained(self, save_directory: Union[str, Path], **kwargs):
         import shutil
         from pathlib import Path as PathlibPath

         save_dir = PathlibPath(save_directory)
         save_dir.mkdir(parents=True, exist_ok=True)

-        actual_vocab_size = self.decoder.config.vocab_size
-        self.config.vocab_size = actual_vocab_size
-        self.config.text_config.vocab_size = actual_vocab_size

-        if hasattr(self.encoder.config, "num_mel_bins"):
-            self.config.audio_config.num_mel_bins = self.encoder.config.num_mel_bins

-        feature_extractor = self.feature_extractor
         tokenizer = self.tokenizer
-        del self.feature_extractor
         del self.tokenizer

         try:
             super().save_pretrained(save_dir, **kwargs)
         finally:
-            self.feature_extractor = feature_extractor
             self.tokenizer = tokenizer

         self.tokenizer.save_pretrained(save_dir)

-        if hasattr(self.encoder.config, "num_mel_bins"):
-            # For Whisper models, explicitly set the correct feature_size before saving
-            num_mel_bins = self.encoder.config.num_mel_bins
-            self.feature_extractor.feature_size = num_mel_bins
-            self.feature_extractor.num_mel_bins = num_mel_bins  # Explicitly set num_mel_bins
-            if hasattr(self.feature_extractor, "n_mels"):
-                self.feature_extractor.n_mels = num_mel_bins
-            self.feature_extractor.nb_max_frames = 3000  # Whisper's max frames

-        self.get_processor().save_pretrained(save_dir)

         src_dir = PathlibPath(__file__).parent
         for asr_file in src_dir.glob("asr_*.py"):
             shutil.copy(asr_file, save_dir / asr_file.name)


 AutoConfig.register("asr_model", ASRConfig)
 AutoModel.register(ASRConfig, ASRModel)
+import json
 from pathlib import Path
 from typing import Optional, Union

 import torch
 import torch.nn as nn
 from transformers import (
     AutoConfig,
     AutoModel,
     AutoModelForCausalLM,
     AutoTokenizer,
     PreTrainedModel,
 )
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.models.whisper.modeling_whisper import (
+    _compute_mask_indices,
 )

 try:
     from .asr_config import ASRConfig
+    from .mlp_projector import MLPAudioProjector
+    from .moe_projector import MoEAudioProjector
+    from .residual_projector import ResidualAudioProjector
+    from .shared_moe_projector import SharedMoEAudioProjector
+    from .swiglu_projector import AudioProjector
 except ImportError:
     from asr_config import ASRConfig  # type: ignore[no-redef]
+    from mlp_projector import MLPAudioProjector  # type: ignore[no-redef]
+    from moe_projector import MoEAudioProjector  # type: ignore[no-redef]
+    from residual_projector import ResidualAudioProjector  # type: ignore[no-redef]
+    from shared_moe_projector import SharedMoEAudioProjector  # type: ignore[no-redef]
+    from swiglu_projector import AudioProjector  # type: ignore[no-redef]

+# Map projector type names to classes
+PROJECTOR_CLASSES = {
+    "swiglu": AudioProjector,
+    "residual": ResidualAudioProjector,
+    "moe": MoEAudioProjector,
+    "shared_moe": SharedMoEAudioProjector,
+    "mlp": MLPAudioProjector,
+}
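The projector modules themselves (moe_projector.py and friends) are not part of this commit. A hypothetical sketch of the interface ASRModel relies on: a module built from the config that maps `(batch, seq, encoder_dim)` to `(batch, seq', llm_dim)` and exposes `get_aux_loss()` for the router load-balancing term (downsampling omitted here for brevity, and the penalty shown is one simple variant, not necessarily the repo's):

```python
import torch
import torch.nn as nn

class TinyMoEProjector(nn.Module):
    """Hypothetical stand-in for moe_projector.MoEAudioProjector (not in this commit)."""

    def __init__(self, config):
        super().__init__()
        self.router = nn.Linear(config.encoder_dim, config.num_experts, bias=False)
        self.experts = nn.ModuleList(
            nn.Linear(config.encoder_dim, config.llm_dim) for _ in range(config.num_experts)
        )
        self.top_k = config.num_experts_per_tok
        self.aux_coef = config.router_aux_loss_coef
        self._aux_loss = None

    def forward(self, x):  # x: (batch, seq, encoder_dim)
        probs = self.router(x).softmax(dim=-1)         # (batch, seq, num_experts)
        weights, idx = probs.topk(self.top_k, dim=-1)  # route each frame to top-k experts
        # Simple load-balancing penalty: pushes mean router mass toward uniform.
        mean_prob = probs.mean(dim=(0, 1))
        self._aux_loss = self.aux_coef * probs.shape[-1] * (mean_prob**2).sum()
        # Dense expert evaluation for clarity (real MoE layers dispatch sparsely).
        out = x.new_zeros(*x.shape[:-1], self.experts[0].out_features)
        for e, expert in enumerate(self.experts):
            gate = (weights * (idx == e)).sum(dim=-1, keepdim=True)  # 0 if not routed to e
            out = out + gate * expert(x)
        return out

    def get_aux_loss(self):
        return self._aux_loss
```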
+class ASRModel(PreTrainedModel, GenerationMixin):
+    """Audio-to-text model combining an audio encoder, projector, and language model."""

     config_class = ASRConfig
     base_model_prefix = "model"
+    main_input_name = "input_features"
     _supports_flash_attn_2 = True
     supports_gradient_checkpointing = True
     _is_loading_from_pretrained: bool = False
     _pretrained_model_path: Optional[str] = None

+    TRANSCRIBE_PROMPT = "Transcribe: "

     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        """Load model from pretrained, handling device placement correctly."""
+        from safetensors.torch import load_file
+        from transformers.utils.hub import cached_file

         config = kwargs.pop("config", None)
         if config is None:
             config = ASRConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)

+        # Set flag to avoid device_map="auto" in sub-model loaders
         cls._is_loading_from_pretrained = True
         cls._pretrained_model_path = pretrained_model_name_or_path

         try:
             model = cls(config, **kwargs)

+            # Load projector weights from safetensors
             subfolder = kwargs.get("subfolder")
             revision = kwargs.get("revision")
             cache_kwargs = {}

                 **cache_kwargs,
             )

+            if model_file is not None:
+                state_dict = load_file(model_file)
+                model.load_state_dict(state_dict, strict=False)

             return model
         finally:
             cls._is_loading_from_pretrained = False
+            cls._pretrained_model_path = None

     def __init__(self, config: ASRConfig, **kwargs):
         super().__init__(config)

         self.system_prompt = config.system_prompt
+        target_dtype = getattr(torch, config.model_dtype)

+        # Audio encoder (frozen)
+        self.audio_tower = self._load_audio_encoder(config, target_dtype)
+
+        # Language model (frozen)
+        self.language_model = self._load_language_model(config, target_dtype)
+
+        # Initialize tokenizer and special tokens
+        self._init_tokenizer(config)
+
+        # Set up generation config with our defaults
+        self.generation_config = self.language_model.generation_config
+        self.generation_config.max_new_tokens = config.max_new_tokens
+        self.generation_config.num_beams = config.num_beams
+        self.generation_config.do_sample = config.do_sample
+        self.generation_config.use_cache = config.use_cache
+        self.generation_config.length_penalty = config.length_penalty
+        self.generation_config.repetition_penalty = config.repetition_penalty
+        self.generation_config.no_repeat_ngram_size = config.no_repeat_ngram_size
+        # Only set sampling params when do_sample=True, otherwise clear them
+        if config.do_sample:
+            self.generation_config.temperature = config.temperature
+            if config.top_k is not None:
+                self.generation_config.top_k = config.top_k
+            if config.top_p is not None:
+                self.generation_config.top_p = config.top_p
         else:
+            self.generation_config.temperature = None
+            self.generation_config.top_k = None
+            self.generation_config.top_p = None
+        self.generation_config.eos_token_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
+        self.generation_config.pad_token_id = self.tokenizer.pad_token_id

+        # Feature extractor for audio preprocessing
+        self.feature_extractor = self._create_feature_extractor(config)

+        # Audio projector (trainable)
+        self.projector = self._create_projector(config, target_dtype)

+        # For model parallelism
+        self._no_split_modules = getattr(self.language_model, "_no_split_modules", [])

+    def _create_feature_extractor(self, config: ASRConfig):
+        """Create the appropriate feature extractor for the audio encoder."""
+        from transformers import AutoFeatureExtractor

+        return AutoFeatureExtractor.from_pretrained(config.audio_model_id)

     @classmethod
+    def _load_audio_encoder(cls, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
+        """Load and freeze the audio encoder."""
         encoder_kwargs = {
             "attn_implementation": config.attn_implementation,
             "low_cpu_mem_usage": True,
+            "dtype": dtype,
         }

         if "whisper" in config.audio_model_id.lower():
             from transformers import WhisperModel

         else:
             encoder = AutoModel.from_pretrained(config.audio_model_id, **encoder_kwargs)

         encoder.requires_grad_(False)
+        encoder.eval()
         return encoder

     @classmethod
+    def _load_language_model(cls, config: ASRConfig, dtype: torch.dtype) -> PreTrainedModel:
+        """Load and freeze the language model."""
         decoder_kwargs = {
             "attn_implementation": config.attn_implementation,
             "trust_remote_code": True,
+            "tie_word_embeddings": True,
+            "low_cpu_mem_usage": True,
+            "dtype": dtype,
         }

         decoder = AutoModelForCausalLM.from_pretrained(config.text_model_id, **decoder_kwargs)
+        decoder.config.use_cache = getattr(config, "use_cache", True)
         decoder.requires_grad_(False)
+        decoder.eval()
         return decoder

+    def _create_projector(self, config: ASRConfig, dtype: torch.dtype) -> nn.Module:
+        """Create the trainable audio projector."""
+        # Auto-detect dimensions if not specified
+        if config.encoder_dim is None:
+            enc_cfg = self.audio_tower.config
+            config.encoder_dim = getattr(enc_cfg, "hidden_size", None) or getattr(
+                enc_cfg, "d_model", None
+            )
+            if config.encoder_dim is None:
+                raise ValueError("Could not auto-detect encoder_dim. Please specify in config.")

+        if config.llm_dim is None:
+            dec_cfg = self.language_model.config
+            config.llm_dim = getattr(dec_cfg, "hidden_size", None) or getattr(
+                dec_cfg, "d_model", None
+            )
+            if config.llm_dim is None:
+                raise ValueError("Could not auto-detect llm_dim. Please specify in config.")

+        # Select projector type based on config
+        projector_type = getattr(config, "projector_type", "moe")
+        projector_class = PROJECTOR_CLASSES.get(projector_type)
+        if projector_class is None:
+            raise ValueError(
+                f"Unknown projector_type: {projector_type}. "
+                f"Valid options: {list(PROJECTOR_CLASSES.keys())}"
+            )
+        projector = projector_class(config)

+        # Move projector to same device as language model (important when using quantization)
+        device = next(self.language_model.parameters()).device
+        return projector.to(device=device, dtype=dtype)

+    def _init_tokenizer(self, config: ASRConfig):
+        """Initialize tokenizer with audio token."""
+        self.tokenizer = AutoTokenizer.from_pretrained(config.text_model_id, trust_remote_code=True)

+        # Set pad token
         if (
             self.tokenizer.pad_token is None
             or self.tokenizer.pad_token_id == self.tokenizer.eos_token_id
         ) and "<|finetune_right_pad_id|>" in self.tokenizer.get_vocab():
             self.tokenizer.pad_token = "<|finetune_right_pad_id|>"

+        # Add audio token
         existing_special = self.tokenizer.additional_special_tokens or []
         if "<audio>" not in existing_special:
+            self.tokenizer.add_special_tokens(
+                {"additional_special_tokens": existing_special + ["<audio>"]}
+            )
+            self.language_model.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)

         self.audio_token_id = self.tokenizer.convert_tokens_to_ids("<audio>")
         self.tokenizer.padding_side = "right"

+        # Sync token IDs to configs
+        for cfg in [self.config.text_config, self.language_model.config, self.generation_config]:
+            if cfg is not None:
                 cfg.pad_token_id = self.tokenizer.pad_token_id
                 cfg.eos_token_id = self.tokenizer.eos_token_id
                 cfg.bos_token_id = self.tokenizer.bos_token_id

+    def _init_weights(self, module):
+        """Weight initialization (projector weights are initialized in MoEAudioProjector)."""
+        pass

+    def _set_gradient_checkpointing(self, enable: bool = True, gradient_checkpointing_func=None):
+        """Enable/disable gradient checkpointing for the language model."""
+        # The LLM still stores activations during forward for backprop to projector
+        # Gradient checkpointing trades compute for memory by recomputing activations
+        if hasattr(self.language_model, "_set_gradient_checkpointing"):
+            self.language_model._set_gradient_checkpointing(enable, gradient_checkpointing_func)
+        elif hasattr(self.language_model, "gradient_checkpointing_enable") and enable:
+            self.language_model.gradient_checkpointing_enable(
+                gradient_checkpointing_kwargs={"use_reentrant": False}
+            )
+        elif hasattr(self.language_model, "gradient_checkpointing_disable") and not enable:
+            self.language_model.gradient_checkpointing_disable()

     def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()

     def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)

     def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()

     def set_output_embeddings(self, value):
+        self.language_model.set_output_embeddings(value)

+    def get_processor(self):
+        """Get the processor for this model."""
+        try:
+            from .asr_processing import ASRProcessor
+        except ImportError:
+            from asr_processing import ASRProcessor  # type: ignore[no-redef]

+        return ASRProcessor(feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)

+    def state_dict(self, *args, **kwargs):
+        """Only save trainable projector weights."""
+        return {f"projector.{k}": v for k, v in self.projector.state_dict().items()}

+    def _apply_specaugment(
         self,
+        input_features: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        if not getattr(self.config, "use_specaugment", False):
+            return input_features
+
+        if not self.training:
+            return input_features
+
+        # Input shape: (batch_size, num_mel_bins, sequence_length) for Whisper
+        batch_size, hidden_size, sequence_length = input_features.size()
+
+        mask_time_prob = getattr(self.config, "mask_time_prob", 0.05)
+        mask_time_length = getattr(self.config, "mask_time_length", 10)
+        mask_feature_prob = getattr(self.config, "mask_feature_prob", 0.0)
+        mask_feature_length = getattr(self.config, "mask_feature_length", 10)
+
+        # Time masking
+        if mask_time_prob > 0:
+            mask_time_np = _compute_mask_indices(
+                (batch_size, sequence_length),
+                mask_prob=mask_time_prob,
+                mask_length=mask_time_length,
+                attention_mask=attention_mask,
+                min_masks=2,
             )
+            mask_time_indices = torch.tensor(
+                mask_time_np, device=input_features.device, dtype=torch.bool
+            )
+            # Expand to cover all features: (batch, seq) -> (batch, features, seq)
+            mask_time_expanded = mask_time_indices[:, None].expand(-1, hidden_size, -1)
+            input_features = input_features.masked_fill(mask_time_expanded, 0.0)
+
+        # Feature masking
+        if mask_feature_prob > 0:
+            mask_feature_np = _compute_mask_indices(
+                (batch_size, hidden_size),
+                mask_prob=mask_feature_prob,
+                mask_length=mask_feature_length,
+                min_masks=2,
+            )
+            mask_feature_indices = torch.tensor(
+                mask_feature_np, device=input_features.device, dtype=torch.bool
+            )
+            # Expand: (batch, features) -> (batch, features, seq)
+            mask_feature_expanded = mask_feature_indices[:, :, None].expand(-1, -1, sequence_length)
+            input_features = input_features.masked_fill(mask_feature_expanded, 0.0)

+        return input_features
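`_compute_mask_indices` is the same helper Wav2Vec2 uses for SpecAugment: it returns a boolean numpy array of shape `(batch, length)` with `mask_length`-long spans set True. A quick check of those semantics (values illustrative):

```python
import torch
from transformers.models.whisper.modeling_whisper import _compute_mask_indices

# Each row gets randomly placed 10-step spans; min_masks=2 guarantees at least
# two spans even for short inputs. mask_prob ~ fraction of steps masked.
mask = _compute_mask_indices((2, 3000), mask_prob=0.05, mask_length=10, min_masks=2)
mask = torch.from_numpy(mask)
print(mask.shape, mask.sum(dim=1))  # torch.Size([2, 3000]), roughly 150 masked steps per row
```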
+    def _encode_audio(
         self,
+        audio_features: torch.Tensor,
+        audio_attention_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
+        """Encode audio and project to LLM embedding space.
+
+        Returns flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
+        """
+        # Apply SpecAugment during training (before encoding)
+        audio_features = self._apply_specaugment(audio_features, audio_attention_mask)
+
+        with torch.no_grad():
+            encoder_out = self.audio_tower(
+                input_features=audio_features, attention_mask=audio_attention_mask
+            )
+            hidden_states = encoder_out.last_hidden_state
+
+        audio_embeds = self.projector(hidden_states)
+
+        # Flatten: (batch, seq, hidden) -> (batch * seq, hidden)
+        # This allows masked_scatter to do 1:1 replacement
+        return audio_embeds.reshape(-1, audio_embeds.shape[-1])
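The flattening matters because `masked_scatter` fills the True positions of the mask, in order, from a flat source tensor. A self-contained illustration of the one-to-one replacement used below in `forward()` and `generate()`:

```python
import torch

hidden = 4
embeds = torch.zeros(1, 5, hidden)                             # prompt embeddings
audio_mask = torch.tensor([[False, True, True, True, False]])  # three <audio> slots
audio = torch.arange(3 * hidden, dtype=torch.float).view(3, hidden)

filled = embeds.masked_scatter(audio_mask.unsqueeze(-1), audio)
assert torch.equal(filled[0, 1:4], audio)  # audio rows landed in order
```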
     def forward(
         self,
         input_ids: Optional[torch.Tensor] = None,
+        input_features: Optional[torch.Tensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        audio_attention_mask: Optional[torch.Tensor] = None,
         **kwargs,
+    ) -> CausalLMOutputWithPast:
+        """Forward pass for training and inference."""
+        # Get text embeddings if not provided
+        if inputs_embeds is None:
+            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+
+        if input_features is not None and input_ids is not None:
+            # Encode audio -> flattened (total_audio_tokens, hidden_dim)
+            audio_embeds = self._encode_audio(input_features, audio_attention_mask)
+
+            # Replace <audio> token placeholders with audio embeddings using masked_scatter
+            audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
+            inputs_embeds = inputs_embeds.masked_scatter(
+                audio_token_mask.to(inputs_embeds.device),
+                audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),
             )

+        # Run through language model (let it compute loss if labels provided)
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )

+        # Add auxiliary loss from MoE projectors if available
+        if outputs.loss is not None and hasattr(self.projector, "get_aux_loss"):
+            aux_loss = self.projector.get_aux_loss()
+            if aux_loss is not None and aux_loss.numel() > 0:
+                outputs.loss = outputs.loss + aux_loss.to(outputs.loss.device)

+        return outputs
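A hypothetical training step against this forward signature (the batch keys and the -100 label convention follow standard HF practice and are illustrative, not code from this commit):

```python
# batch is assumed to come from a collator that inserts one <audio> token per
# projector output frame and sets labels to -100 everywhere except target text.
out = model(
    input_ids=batch["input_ids"],
    input_features=batch["input_features"],  # (B, n_mels, 3000) log-mel frames
    attention_mask=batch["attention_mask"],
    labels=batch["labels"],
)
out.loss.backward()  # LM loss plus MoE aux loss; only projector params carry grads
```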
+    def prepare_inputs_for_generation(self, *args, **kwargs):
+        """Prepare inputs for generation, handling audio features for cached decoding."""
+        input_features = kwargs.pop("input_features", None)
+        cache_position = kwargs.get("cache_position")

+        model_inputs = self.language_model.prepare_inputs_for_generation(*args, **kwargs)

+        # Only pass audio features on the first generation step (cache_position[0] == 0)
+        if cache_position is not None and cache_position[0] == 0 and input_features is not None:
+            model_inputs["input_features"] = input_features

+        return model_inputs

+    def _get_num_audio_tokens(self, input_features: torch.Tensor) -> int:
+        """Calculate number of audio tokens based on input shape.
+
+        Whisper: input_features shape is (batch, n_mels, mel_len)
+        Encoder output is mel_len // 2 due to stride-2 conv
+        MLP projector adds another stride-2 for 4x total downsampling
+        """
+        mel_len = input_features.shape[-1]
+        return mel_len // 4
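Worked example of that 4x token budget for a padded 30 second Whisper window:

```python
mel_len = 3000                  # 30 s of 16 kHz audio at a 10 ms hop
encoder_frames = mel_len // 2   # 1500 after Whisper's stride-2 conv frontend
audio_tokens = mel_len // 4     # 750 after the projector's additional stride-2
```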
     @torch.no_grad()
     def generate(
         self,
+        input_ids: Optional[torch.Tensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        audio_attention_mask: Optional[torch.Tensor] = None,
         system_prompt: Optional[str] = None,
         **generate_kwargs,
+    ) -> torch.Tensor:
+        """Generate transcription from audio input.
+
+        Can be called in two ways:
+        1. With input_ids containing <audio> tokens (from processor)
+        2. With just audio, and we build the prompt internally
+        """
+        if input_features is None:
+            raise ValueError("input_features required for generation")
+
+        device = input_features.device
+        batch_size = input_features.shape[0]
+
+        # Encode audio -> flattened embeddings
+        audio_embeds = self._encode_audio(input_features, audio_attention_mask)
+
+        # If input_ids not provided, build prompt with correct number of audio tokens
+        if input_ids is None:
+            num_audio_tokens = self._get_num_audio_tokens(input_features)
+            audio_placeholder = "<audio>" * num_audio_tokens
+
+            system_prompt = system_prompt or self.system_prompt
+
+            messages: list[dict[str, str]] = []
+            if system_prompt:
+                messages.append({"role": "system", "content": system_prompt})
+            messages.append({"role": "user", "content": self.TRANSCRIBE_PROMPT + audio_placeholder})
+
+            input_ids = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=True,
+                add_generation_prompt=True,
+                return_tensors="pt",
+            ).to(device)
+
+            if input_ids.dim() == 1:
+                input_ids = input_ids.unsqueeze(0)
+            if input_ids.shape[0] == 1 and batch_size > 1:
+                input_ids = input_ids.expand(batch_size, -1)
+
+            attention_mask = torch.ones_like(input_ids)
+
+        # Get text embeddings and replace audio tokens with audio embeddings
+        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+        audio_token_mask = (input_ids == self.audio_token_id).unsqueeze(-1)
+        inputs_embeds = inputs_embeds.masked_scatter(
+            audio_token_mask.to(inputs_embeds.device),
+            audio_embeds.to(inputs_embeds.device, dtype=inputs_embeds.dtype),
         )

+        # Generate using language model
+        output = self.language_model.generate(
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
+            generation_config=self.generation_config,
             **generate_kwargs,
         )

+        # When using inputs_embeds without input_ids, generate returns only new tokens
+        if isinstance(output, torch.Tensor):
+            return output
+        return output.sequences
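A sketch of end-to-end inference. The processor call signature is an assumption (its implementation lives in asr_processing.py, which is not shown here); `generate()` itself only needs `input_features`:

```python
import torch

# waveform: 1-D float tensor at 16 kHz (assumed variable name)
processor = model.get_processor()
inputs = processor(audio=waveform, sampling_rate=16000, return_tensors="pt")
with torch.inference_mode():
    ids = model.generate(input_features=inputs["input_features"].to(model.device))
print(model.tokenizer.batch_decode(ids, skip_special_tokens=True)[0])
```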
  def save_pretrained(self, save_directory: Union[str, Path], **kwargs):
516
+ """Save model, tokenizer, and processor."""
517
  import shutil
518
  from pathlib import Path as PathlibPath
519
 
520
  save_dir = PathlibPath(save_directory)
521
  save_dir.mkdir(parents=True, exist_ok=True)
522
 
523
+ # Update config with actual vocab size
524
+ self.config.vocab_size = self.language_model.config.vocab_size
525
+ self.config.text_config.vocab_size = self.language_model.config.vocab_size
526
 
527
+ if hasattr(self.audio_tower.config, "num_mel_bins"):
528
+ self.config.audio_config.num_mel_bins = self.audio_tower.config.num_mel_bins
529
 
530
+ # Save model (temporarily remove non-serializable attributes)
531
  tokenizer = self.tokenizer
 
532
  del self.tokenizer
533
 
534
  try:
535
  super().save_pretrained(save_dir, **kwargs)
536
  finally:
 
537
  self.tokenizer = tokenizer
538
 
539
+ # Save tokenizer and feature extractor
540
  self.tokenizer.save_pretrained(save_dir)
541
+ self.feature_extractor.save_pretrained(save_dir)
542
+
543
+ # Add processor auto_map to preprocessor_config.json
544
+ config_path = save_dir / "preprocessor_config.json"
545
+ if config_path.exists():
546
+ with config_path.open() as f:
547
+ processor_config = json.load(f)
548
+ else:
549
+ processor_config = {}
550
 
551
+ processor_config.update(
552
+ {
553
+ "processor_class": "ASRProcessor",
554
+ "auto_map": {"AutoProcessor": "asr_processing.ASRProcessor"},
555
+ }
556
+ )
 
 
557
 
558
+ with config_path.open("w") as f:
559
+ json.dump(processor_config, f, indent=2)
560
 
561
+ # Copy source files for auto-loading
562
  src_dir = PathlibPath(__file__).parent
563
  for asr_file in src_dir.glob("asr_*.py"):
564
  shutil.copy(asr_file, save_dir / asr_file.name)
565
+ # Copy projector files
566
+ projector_files = [
567
+ "mlp_projector.py",
568
+ "moe_projector.py",
569
+ "residual_projector.py",
570
+ "swiglu_projector.py",
571
+ "shared_moe_projector.py",
572
+ ]
573
+ for projector_file in projector_files:
574
+ src_path = src_dir / projector_file
575
+ if src_path.exists():
576
+ shutil.copy(src_path, save_dir / projector_file)
577
 
578
 
579
+ # Register with transformers Auto classes
580
  AutoConfig.register("asr_model", ASRConfig)
581
  AutoModel.register(ASRConfig, ASRModel)
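
A minimal loading sketch (not part of the diff): since save_pretrained() above copies the asr_*.py sources and writes auto_map entries, a saved checkpoint should be loadable through the Auto classes. The path is a placeholder, and trust_remote_code is required for the shipped sources.

import transformers

ckpt = "./checkpoint-1000"  # hypothetical save_pretrained() output directory
model = transformers.AutoModel.from_pretrained(ckpt, trust_remote_code=True)
processor = transformers.AutoProcessor.from_pretrained(ckpt, trust_remote_code=True)
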
asr_pipeline.py CHANGED
@@ -1,8 +1,7 @@
1
- from typing import Any, Dict
2
 
3
  import torch
4
  import transformers
5
- from truecase import get_true_case
6
 
7
  try:
8
  from .asr_modeling import ASRModel
@@ -11,284 +10,58 @@ except ImportError:
11
 
12
 
13
  class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
14
  model: ASRModel
15
 
16
  def __init__(self, model: ASRModel, **kwargs):
17
- feature_extractor = kwargs.pop("feature_extractor", model.feature_extractor)
18
  tokenizer = kwargs.pop("tokenizer", model.tokenizer)
19
 
20
  super().__init__(
21
  model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, **kwargs
22
  )
23
 
24
- # Initialize text normalizer (same as train.py)
25
- if hasattr(tokenizer, "normalize"):
26
- self.text_normalizer = tokenizer
27
- else:
28
- # Fallback to whisper-tiny tokenizer for its normalize() method only
29
- from transformers import WhisperTokenizer
30
-
31
- self.text_normalizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny")
32
-
33
- def __call__(self, inputs, **kwargs):
34
- generate_kwargs = {}
35
- for key in [
36
- "max_new_tokens",
37
- "num_beams",
38
- "do_sample",
39
- "length_penalty",
40
- "repetition_penalty",
41
- "no_repeat_ngram_size",
42
- "early_stopping",
43
- "num_beam_groups",
44
- "diversity_penalty",
45
- "top_k",
46
- "temperature",
47
- "top_p",
48
- "user_prompt",
49
- "task",
50
- "text_input",
51
- ]:
52
- if key in kwargs:
53
- generate_kwargs[key] = kwargs.pop(key)
54
-
55
- # Handle text-only mode
56
- task = generate_kwargs.get("task")
57
- if task == "text" or generate_kwargs.get("text_input"):
58
- return self._process_text_only(generate_kwargs)
59
-
60
- if isinstance(inputs, list):
61
- results = []
62
- for single_input in inputs:
63
- result = self.__call__(single_input, **kwargs, **generate_kwargs)
64
- results.append(result)
65
- return results
66
-
67
- model_inputs = self.preprocess(inputs, **kwargs)
68
-
69
- from collections.abc import Iterator
70
-
71
- if isinstance(model_inputs, Iterator):
72
- # Convert iterator to list to process chunks
73
- chunks = list(model_inputs)
74
-
75
- all_outputs = []
76
- for _chunk_num, chunk in enumerate(chunks, start=1):
77
- chunk_output = self._forward(chunk, **generate_kwargs)
78
- # Move tensors to CPU before adding to outputs
79
- for key, value in chunk_output.items():
80
- if torch.is_tensor(value):
81
- chunk_output[key] = value.cpu()
82
- all_outputs.append(chunk_output)
83
-
84
- # Merge chunks and decode ourselves to ensure skip_special_tokens=True
85
- all_tokens: list[int] = []
86
- for output in all_outputs:
87
- tokens = output.get("tokens")
88
- if tokens is None:
89
- tokens = output.get("generated_ids")
90
- if tokens is not None:
91
- if torch.is_tensor(tokens):
92
- tokens = tokens.cpu()
93
- if len(tokens.shape) > 1:
94
- tokens = tokens[0]
95
- all_tokens.extend(tokens.tolist() if torch.is_tensor(tokens) else tokens)
96
-
97
- # Decode the merged tokens with skip_special_tokens
98
- text = self.tokenizer.decode(all_tokens, skip_special_tokens=True)
99
- text = text.strip()
100
-
101
- # Apply Whisper normalization (matches training)
102
- text = self.text_normalizer.normalize(text)
103
-
104
- # Apply truecasing for proper capitalization
105
- text = get_true_case(text)
106
-
107
- return {"text": text}
108
-
109
- model_outputs = self._forward(model_inputs, **generate_kwargs)
110
- return self.postprocess(model_outputs)
111
-
112
  def preprocess(self, inputs, **preprocess_params):
113
- if isinstance(inputs, list):
114
- raise ValueError("Lists should not reach preprocess - bug in __call__")
115
 
116
- # Set default chunking to 30 seconds with 5 second overlap
117
- preprocess_params.setdefault("chunk_length_s", 30)
118
- preprocess_params.setdefault("stride_length_s", (5, 5))
119
-
120
- # Handle different formats from datasets
121
- if isinstance(inputs, dict):
122
- if "bytes" in inputs:
123
- # Decode bytes to audio array using torchcodec
124
- import tempfile
125
-
126
- from torchcodec.decoders import AudioDecoder
127
-
128
- wav_bytes = inputs["bytes"]
129
- # Write to temp file for torchcodec to read
130
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
131
- f.write(wav_bytes)
132
- temp_path = f.name
133
- try:
134
- decoder = AudioDecoder(temp_path)
135
- # Get all audio samples
136
- audio_result = decoder.get_all_samples()
137
- audio_tensor = audio_result.data
138
- sample_rate = audio_result.sample_rate
139
- inputs = {"raw": audio_tensor.squeeze().numpy(), "sampling_rate": sample_rate}
140
- finally:
141
- from pathlib import Path
142
-
143
- Path(temp_path).unlink()
144
- elif "array" in inputs:
145
- # Convert "array" key to "raw" key
146
- inputs = {"raw": inputs["array"], "sampling_rate": inputs["sampling_rate"]}
147
- # If it already has "raw" and "sampling_rate", it's good to go
148
- elif hasattr(inputs, "array") and hasattr(inputs, "sampling_rate"):
149
- # Audio object with attributes (not dict)
150
- inputs = {"raw": inputs.array, "sampling_rate": inputs.sampling_rate}
151
- elif hasattr(inputs, "__array__") and not isinstance(inputs, (dict, bytes, str)):
152
- inputs = {"raw": inputs, "sampling_rate": self.model.config.audio_sample_rate}
153
- elif torch.is_tensor(inputs):
154
  inputs = {
155
- "raw": inputs.cpu().numpy(),
156
- "sampling_rate": self.model.config.audio_sample_rate,
157
  }
158
 
159
  return super().preprocess(inputs, **preprocess_params)
160
 
161
- def _forward(self, model_inputs, **generate_kwargs):
162
- # Extract task and set sampling parameters
163
- task = generate_kwargs.pop("task", None)
164
-
165
- # Task-specific sampling parameters
166
- task_params: Dict[str, Dict[str, Any]] = {
167
- "transcribe": {"do_sample": False},
168
- "emotion": {"do_sample": True, "temperature": 0.7},
169
- "describe": {"do_sample": True, "temperature": 0.7},
170
- "continue": {"do_sample": True, "temperature": 1.0},
171
- }
172
-
173
- if task in task_params:
174
- for key, value in task_params[task].items():
175
- generate_kwargs.setdefault(key, value)
176
-
177
- # Extract audio inputs from various formats
178
- is_last = True
179
- audio_inputs = None
180
- is_whisper = False # Track if this is Whisper input
181
-
182
- # Normalize model_inputs to dict format
183
- if isinstance(model_inputs, torch.Tensor):
184
- audio_inputs = model_inputs
185
- elif isinstance(model_inputs, (list, tuple)) and model_inputs:
186
- model_inputs = (
187
- model_inputs[0]
188
- if isinstance(model_inputs[0], dict)
189
- else {"input_values": model_inputs[0]}
190
- )
191
-
192
  if isinstance(model_inputs, dict):
193
- # Pop metadata fields
194
- is_last = model_inputs.pop("is_last", True)
195
- model_inputs.pop("stride", None)
196
- # Get audio input (Whisper uses input_features, others use input_values)
197
- if "input_features" in model_inputs:
198
- audio_inputs = model_inputs["input_features"]
199
- is_whisper = True
200
- else:
201
- audio_inputs = model_inputs.get("input_values")
202
-
203
- if audio_inputs is None:
204
- raise ValueError(
205
- f"Could not extract input_values or input_features from {type(model_inputs)}"
206
- )
207
-
208
- if isinstance(audio_inputs, torch.Tensor):
209
- audio_inputs = audio_inputs.to(self.model.device)
210
  else:
211
- raise ValueError(f"audio inputs must be a tensor, got {type(audio_inputs)}")
212
-
213
- im_end_id = self.model.tokenizer.convert_tokens_to_ids("<|im_end|>")
214
- generate_kwargs.setdefault("eos_token_id", im_end_id)
215
- generate_kwargs.setdefault("max_new_tokens", self.model.config.max_new_tokens)
216
-
217
- # Pass the appropriate input type to generate
218
- if is_whisper:
219
- # Whisper model - use input_features
220
- generated_ids = self.model.generate(
221
- input_features=audio_inputs,
222
- system_prompt=self.model.config.system_prompt,
223
- task=task,
224
- **generate_kwargs,
225
- )
226
- else:
227
- # Wav2Vec2/HuBERT model - use input_values
228
- generated_ids = self.model.generate(
229
- input_values=audio_inputs,
230
- system_prompt=self.model.config.system_prompt,
231
- task=task,
232
- **generate_kwargs,
233
- )
234
-
235
- return {"tokens": generated_ids, "is_last": is_last}
236
-
237
- def _process_text_only(self, generate_kwargs):
238
- """Process text-only input without audio encoding."""
239
- text_input = generate_kwargs.pop("text_input", None)
240
- if text_input is None:
241
- raise ValueError("text_input is required for text task")
242
 
243
- # Remove task from generate_kwargs to avoid duplicate argument
244
- generate_kwargs.pop("task", None)
245
-
246
- # Generate text using the model
247
- generated_ids = self.model.generate(task="text", text_input=text_input, **generate_kwargs)
248
-
249
- # Decode the generated text
250
- generated_text = self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)
251
-
252
- return {"text": generated_text}
253
-
254
- def postprocess(
255
- self, model_outputs: Dict[str, Any], return_timestamps=None, return_language=None
256
- ):
257
- # Handle chunked outputs from iterator
258
- if isinstance(model_outputs, list):
259
- # Move all tensors to CPU before calling parent postprocess
260
- for output_dict in model_outputs:
261
- for key, value in output_dict.items():
262
- if torch.is_tensor(value):
263
- output_dict[key] = value.cpu()
264
- return super().postprocess(model_outputs)
265
 
266
- if "is_last" in model_outputs:
267
- model_outputs.pop("is_last")
268
 
 
269
  tokens = model_outputs.get("tokens")
270
  if tokens is None:
271
- tokens = model_outputs.get("generated_ids")
272
 
273
- if tokens is None:
274
- raise ValueError(
275
- f"Expected 'tokens' or 'generated_ids' in model_outputs, got: {model_outputs.keys()}"
276
- )
277
-
278
- # Move to CPU if on MPS or other device
279
- if torch.is_tensor(tokens) and tokens.device.type != "cpu":
280
  tokens = tokens.cpu()
 
 
281
 
282
- if len(tokens.shape) > 1:
283
- tokens = tokens[0]
284
-
285
- text = self.tokenizer.decode(tokens, skip_special_tokens=True)
286
- text = text.strip()
287
-
288
- # Apply Whisper normalization (matches training)
289
- text = self.text_normalizer.normalize(text)
290
-
291
- # Apply truecasing for proper capitalization
292
- text = get_true_case(text)
293
-
294
  return {"text": text}
 
1
+ from typing import Any
2
 
3
  import torch
4
  import transformers
 
5
 
6
  try:
7
  from .asr_modeling import ASRModel
 
10
 
11
 
12
  class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
13
+ """ASR Pipeline for audio-to-text transcription."""
14
+
15
  model: ASRModel
16
 
17
  def __init__(self, model: ASRModel, **kwargs):
18
+ feature_extractor = kwargs.pop("feature_extractor", None)
19
  tokenizer = kwargs.pop("tokenizer", model.tokenizer)
20
 
21
+ if feature_extractor is None:
22
+ feature_extractor = model.get_processor().feature_extractor
23
+
24
  super().__init__(
25
  model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, **kwargs
26
  )
27
 
 
28
  def preprocess(self, inputs, **preprocess_params):
29
+ preprocess_params.setdefault("chunk_length_s", 0)
 
30
 
31
+ # Handle dict with "array" key (from datasets)
32
+ if isinstance(inputs, dict) and "array" in inputs:
33
  inputs = {
34
+ "raw": inputs["array"],
35
+ "sampling_rate": inputs.get("sampling_rate", self.feature_extractor.sampling_rate),
36
  }
37
 
38
  return super().preprocess(inputs, **preprocess_params)
39
 
40
+ def _forward(self, model_inputs, **generate_kwargs) -> dict[str, Any]:
41
+ # Extract audio features
 
42
  if isinstance(model_inputs, dict):
43
+ input_features = model_inputs.get("input_features")
44
+ if input_features is not None:
45
+ input_features = input_features.to(self.model.device)
46
  else:
47
+ input_features = model_inputs.to(self.model.device)
48
 
49
+ generated_ids = self.model.generate(
50
+ input_features=input_features,
51
+ **generate_kwargs,
52
+ )
53
 
54
+ return {"tokens": generated_ids}
 
55
 
56
+ def postprocess(self, model_outputs, **kwargs) -> dict[str, str]:
57
  tokens = model_outputs.get("tokens")
58
  if tokens is None:
59
+ return super().postprocess(model_outputs, **kwargs)
60
 
61
+ if torch.is_tensor(tokens):
62
  tokens = tokens.cpu()
63
+ if tokens.dim() > 1:
64
+ tokens = tokens[0]
65
 
66
+ text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
67
  return {"text": text}
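
A sketch of driving the simplified pipeline above; the waveform is a placeholder and `model` is assumed to be an already-loaded ASRModel.

import numpy as np

audio = {"array": np.zeros(16000, dtype=np.float32), "sampling_rate": 16000}  # 1 s of silence
pipe = ASRPipeline(model=model)
print(pipe(audio)["text"])
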
asr_processing.py CHANGED
@@ -1,7 +1,9 @@
1
  import transformers
2
- from transformers import AutoTokenizer, ProcessorMixin
3
 
4
- # Handle both package and standalone imports
5
  try:
6
  from .asr_config import ASRConfig
7
  except ImportError:
@@ -9,69 +11,81 @@ except ImportError:
9
 
10
 
11
  class ASRProcessor(ProcessorMixin):
12
- """Generic processor that can handle both Wav2Vec2 and Whisper feature extractors."""
13
 
 
14
  feature_extractor_class = "AutoFeatureExtractor"
15
  tokenizer_class = "AutoTokenizer"
 
 
16
 
17
  def __init__(self, feature_extractor, tokenizer):
18
  self.feature_extractor = feature_extractor
19
  self.tokenizer = tokenizer
20
-
21
- @classmethod
22
- def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
23
- from transformers import AutoFeatureExtractor
24
-
25
- # Load feature extractor and tokenizer from saved model directory
26
- feature_extractor = AutoFeatureExtractor.from_pretrained(
27
- pretrained_model_name_or_path, **kwargs
28
- )
29
-
30
- tokenizer = AutoTokenizer.from_pretrained(
31
- pretrained_model_name_or_path, trust_remote_code=True, **kwargs
32
  )
33
 
34
- return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
 
35
 
36
- def save_pretrained(self, save_directory, **kwargs):
37
- """Override save_pretrained to avoid attribute errors from base class."""
38
- import json
39
- from pathlib import Path
40
-
41
- save_path = Path(save_directory)
42
- save_path.mkdir(parents=True, exist_ok=True)
43
-
44
- # Save the feature extractor (this creates preprocessor_config.json with all feature extractor settings)
45
- if self.feature_extractor is not None:
46
- self.feature_extractor.save_pretrained(save_directory)
47
-
48
- # Save the tokenizer
49
- if self.tokenizer is not None:
50
- self.tokenizer.save_pretrained(save_directory)
51
-
52
- # Load the existing preprocessor_config.json and add processor-specific metadata
53
- config_path = save_path / "preprocessor_config.json"
54
- if config_path.exists():
55
- with config_path.open() as f:
56
- processor_config = json.load(f)
57
- else:
58
- processor_config = {}
59
-
60
- # Add/update processor metadata while preserving feature extractor settings
61
- feature_extractor_type = self.feature_extractor.__class__.__name__
62
- processor_config.update(
63
- {
64
- "processor_class": self.__class__.__name__,
65
- "feature_extractor_class": self.feature_extractor_class,
66
- "tokenizer_class": self.tokenizer_class,
67
- "feature_extractor_type": feature_extractor_type, # Dynamic based on actual type
68
- "auto_map": {"AutoProcessor": "asr_processing.ASRProcessor"},
69
- }
70
- )
71
 
72
- # Save the merged config
73
- with config_path.open("w") as f:
74
- json.dump(processor_config, f, indent=2)
75
 
76
 
77
  ASRProcessor.register_for_auto_class()
 
1
+ from typing import Optional, Union
2
+
3
+ import torch
4
  import transformers
5
+ from transformers import ProcessorMixin
6
 
 
7
  try:
8
  from .asr_config import ASRConfig
9
  except ImportError:
 
11
 
12
 
13
  class ASRProcessor(ProcessorMixin):
14
+ """Processor for Whisper-based ASR models."""
15
 
16
+ attributes = ["feature_extractor", "tokenizer"]
17
  feature_extractor_class = "AutoFeatureExtractor"
18
  tokenizer_class = "AutoTokenizer"
19
+ AUDIO_TOKEN = "<audio>"
20
+ TRANSCRIBE_PROMPT = "Transcribe: "
21
 
22
  def __init__(self, feature_extractor, tokenizer):
23
  self.feature_extractor = feature_extractor
24
  self.tokenizer = tokenizer
25
+ self.audio_token_id = tokenizer.convert_tokens_to_ids(self.AUDIO_TOKEN)
26
+
27
+ def __call__(
28
+ self,
29
+ audio: Optional[Union[list, "torch.Tensor"]] = None,
30
+ text: Optional[str] = None,
31
+ system_prompt: Optional[str] = None,
32
+ return_tensors: str = "pt",
33
+ **kwargs,
34
+ ) -> dict:
35
+ """Process audio and text inputs for inference.
36
+
37
+ Args:
38
+ audio: Raw audio waveform(s)
39
+ text: Target transcription (optional; for training, prefer the DataCollator)
40
+ system_prompt: Optional system prompt
41
+ return_tensors: Return format ("pt" for PyTorch)
42
+
43
+ Returns:
44
+ Dict with input_features, input_ids, attention_mask
45
+ """
46
+ result = {}
47
+
48
+ # Process audio
49
+ if audio is not None:
50
+ audio_inputs = self.feature_extractor(
51
+ audio,
52
+ sampling_rate=getattr(self.feature_extractor, "sampling_rate", 16000),
53
+ return_tensors=return_tensors,
54
+ **kwargs,
55
+ )
56
+ result["input_features"] = audio_inputs["input_features"]
57
+ # Whisper encoder output length = mel_len // 2 (stride-2 conv)
58
+ num_audio_tokens = audio_inputs["input_features"].shape[-1] // 2
59
+ else:
60
+ num_audio_tokens = 0
61
+
62
+ # Build prompt with audio token placeholders
63
+ user_content = self.TRANSCRIBE_PROMPT
64
+ if num_audio_tokens > 0:
65
+ user_content += self.AUDIO_TOKEN * num_audio_tokens
66
+
67
+ messages = []
68
+ if system_prompt:
69
+ messages.append({"role": "system", "content": system_prompt})
70
+ messages.append({"role": "user", "content": user_content})
71
+ if text is not None:
72
+ messages.append({"role": "assistant", "content": text})
73
+
74
+ # Tokenize
75
+ input_ids = self.tokenizer.apply_chat_template(
76
+ messages,
77
+ tokenize=True,
78
+ add_generation_prompt=(text is None),
79
+ return_tensors=return_tensors,
80
  )
81
 
82
+ if isinstance(input_ids, torch.Tensor) and input_ids.dim() == 1:
83
+ input_ids = input_ids.unsqueeze(0)
84
 
85
+ result["input_ids"] = input_ids
86
+ result["attention_mask"] = torch.ones_like(input_ids)
87
 
88
+ return result
 
 
89
 
90
 
91
  ASRProcessor.register_for_auto_class()
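
A usage sketch for the processor above; the waveform is a placeholder and `feature_extractor`/`tokenizer` are assumed to be loaded elsewhere.

import numpy as np

processor = ASRProcessor(feature_extractor, tokenizer)
batch = processor(audio=np.zeros(16000, dtype=np.float32))
# batch["input_features"]: log-mel features for the Whisper encoder
# batch["input_ids"]: chat-formatted prompt with one <audio> token per expected
# encoder frame (mel frames // 2), ready to pass to model.generate()
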
chat_template.jinja CHANGED
@@ -1,6 +1,94 @@
1
- {% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system
2
- You are a helpful AI assistant named SmolLM, trained by Hugging Face<|im_end|>
3
- ' }}{% endif %}{{'<|im_start|>' + message['role'] + '
4
- ' + message['content'] + '<|im_end|>' + '
5
- '}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
6
- ' }}{% endif %}
1
+ {# ───── defaults ───── #}
2
+ {%- if enable_thinking is not defined -%}
3
+ {%- set enable_thinking = true -%}
4
+ {%- endif -%}
5
+
6
+ {# ───── reasoning mode ───── #}
7
+ {%- if enable_thinking -%}
8
+ {%- set reasoning_mode = "/think" -%}
9
+ {%- else -%}
10
+ {%- set reasoning_mode = "/no_think" -%}
11
+ {%- endif -%}
12
+
13
+ {# ───── header (system message) ───── #}
14
+ {{- "<|im_start|>system\n" -}}
15
+
16
+ {%- if messages[0].role == "system" -%}
17
+ {%- set system_message = messages[0].content -%}
18
+ {%- if "/no_think" in system_message -%}
19
+ {%- set reasoning_mode = "/no_think" -%}
20
+ {%- elif "/think" in system_message -%}
21
+ {%- set reasoning_mode = "/think" -%}
22
+ {%- endif -%}
23
+ {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
24
+ {%- endif -%}
25
+
26
+ {%- if "/system_override" in system_message -%}
27
+ {{- custom_instructions.replace("/system_override", "").rstrip() -}}
28
+ {{- "<|im_end|>\n" -}}
29
+ {%- else -%}
30
+ {{- "## Metadata\n\n" -}}
31
+ {{- "Knowledge Cutoff Date: June 2025\n" -}}
32
+ {%- set today = strftime_now("%d %B %Y") -%}
33
+ {{- "Today Date: " ~ today ~ "\n" -}}
34
+ {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}}
35
+
36
+ {{- "## Custom Instructions\n\n" -}}
37
+ {%- if custom_instructions -%}
38
+ {{- custom_instructions + "\n\n" -}}
39
+ {%- elif reasoning_mode == "/think" -%}
40
+ {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}}
41
+ {%- else -%}
42
+ {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}}
43
+ {%- endif -%}
44
+
45
+ {%- if xml_tools or python_tools or tools -%}
46
+ {{- "### Tools\n\n" -}}
47
+ {%- if xml_tools or tools -%}
48
+ {%- if tools -%}
49
+ {%- set xml_tools = tools -%}
50
+ {%- endif -%}
51
+ {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
52
+ {%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
53
+ {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
54
+ {%- endfor -%}
55
+ {%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
56
+ {{- xml_tool_string -}}
57
+ {%- endif -%}
58
+ {%- if python_tools -%}
59
+ {%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
60
+ {%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
61
+ {%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
62
+ {%- endfor -%}
63
+ {%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
64
+ {{- python_tool_string -}}
65
+ {%- endif -%}
66
+ {{- "\n\n" -}}
67
+ {{- "<|im_end|>\n" -}}
68
+ {%- endif -%}
69
+ {%- endif -%}
70
+ {# ───── main loop ───── #}
71
+ {%- for message in messages -%}
72
+ {%- set content = message.content if message.content is string else "" -%}
73
+ {%- if message.role == "user" -%}
74
+ {{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }}
75
+ {%- elif message.role == "assistant" -%}
76
+ {% generation %}
77
+ {%- if reasoning_mode == "/think" -%}
78
+ {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }}
79
+ {%- else -%}
80
+ {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }}
81
+ {%- endif -%}
82
+ {% endgeneration %}
83
+ {%- elif message.role == "tool" -%}
84
+ {{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }}
85
+ {%- endif -%}
86
+ {%- endfor -%}
87
+ {# ───── generation prompt ───── #}
88
+ {%- if add_generation_prompt -%}
89
+ {%- if reasoning_mode == "/think" -%}
90
+ {{ "<|im_start|>assistant\n" }}
91
+ {%- else -%}
92
+ {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" }}
93
+ {%- endif -%}
94
+ {%- endif -%}
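
A sketch of the template's mode switches, assuming a tokenizer that carries this chat template:

messages = [
    {"role": "system", "content": "/no_think /system_override You transcribe audio."},
    {"role": "user", "content": "Transcribe: <audio>"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# "/system_override" emits the custom instructions verbatim instead of the
# metadata header; "/no_think" makes the generation prompt open with an empty
# <think>\n\n</think> block - which is what the config's default system_prompt
# ("/no_think /system_override") relies on.
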
mlp_projector.py ADDED
@@ -0,0 +1,42 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ class MLPAudioProjector(nn.Module):
5
+ """2-layer MLP projector with Qwen-style 2x temporal downsampling."""
6
+
7
+ def __init__(self, config):
8
+ super().__init__()
9
+
10
+ encoder_dim = getattr(config, "encoder_dim", 768)
11
+ llm_dim = getattr(config, "llm_dim", 2048)
12
+
13
+ self.downsample = nn.Conv1d(
14
+ encoder_dim, encoder_dim, kernel_size=3, stride=2, padding=1, bias=False
15
+ )
16
+ self.linear_1 = nn.Linear(encoder_dim, llm_dim, bias=False)
17
+ self.act = nn.GELU()
18
+ self.linear_2 = nn.Linear(llm_dim, llm_dim, bias=False)
19
+
20
+ self.apply(self._init_weights)
21
+
22
+ def _init_weights(self, module):
23
+ if isinstance(module, nn.Linear):
24
+ nn.init.normal_(module.weight, mean=0.0, std=0.02)
25
+ elif isinstance(module, nn.Conv1d):
26
+ nn.init.normal_(module.weight, mean=0.0, std=0.02)
27
+ if module.bias is not None:
28
+ nn.init.zeros_(module.bias)
29
+
30
+ def forward(self, x):
31
+ """
32
+ x: [Batch, Seq_Len, Dim]
33
+ Returns: [Batch, Seq_Len // 2, llm_dim]
34
+ """
35
+ # Conv1d expects [Batch, Channels, Seq_Len]
36
+ x = x.transpose(1, 2)
37
+ x = self.downsample(x)
38
+ x = x.transpose(1, 2)
39
+
40
+ x = self.linear_1(x)
41
+ x = self.act(x)
42
+ return self.linear_2(x)
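
A quick shape check for the projector above, using a stand-in config (real values come from ASRConfig):

import types
import torch

cfg = types.SimpleNamespace(encoder_dim=1280, llm_dim=2048)
proj = MLPAudioProjector(cfg)
x = torch.randn(2, 1500, 1280)  # Whisper encoder output: [batch, frames, dim]
print(proj(x).shape)            # torch.Size([2, 750, 2048]): stride-2 conv halves T
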
moe_projector.py ADDED
@@ -0,0 +1,162 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F # noqa: N812
4
+ from transformers.models.llama.modeling_llama import LlamaRMSNorm
5
+
6
+
7
+ class SimpleAdapter(nn.Module):
8
+ """
9
+ MOSA Section III-B:
10
+ "consists of two linear layers with a ReLU activation in between,
11
+ projecting the hidden dimension from 3072 to 4096 and back to 3072."
12
+ """
13
+
14
+ def __init__(self, in_features, hidden_features, out_features, dropout=0.0):
15
+ super().__init__()
16
+ self.fc1 = nn.Linear(in_features, hidden_features)
17
+ self.relu = nn.ReLU()
18
+ self.dropout = nn.Dropout(dropout)
19
+ self.fc2 = nn.Linear(hidden_features, out_features)
20
+
21
+ def forward(self, x):
22
+ x = self.fc1(x)
23
+ x = self.relu(x)
24
+ x = self.dropout(x)
25
+ return self.fc2(x)
26
+
27
+
28
+ class MoEAudioProjector(nn.Module):
29
+ """
30
+ MOSA-style projector: Mixture of Simple Adapters.
31
+
32
+ From paper (arXiv:2508.18998):
33
+ - Dense mixture (softmax over ALL experts) instead of sparse Top-K
34
+ - Simple Linear->ReLU->Linear adapters (3072->4096->3072)
35
+ - No auxiliary losses - just cross-entropy on transcripts
36
+ - Conv downsampling: stride 4 total (two conv layers, stride 2 each)
37
+ """
38
+
39
+ def __init__(self, config):
40
+ super().__init__()
41
+
42
+ # Dimensions:
43
+ # Whisper-large-v3 encoder_dim = 1280
44
+ # SmolLM3-3B hidden_size = 2048
45
+ self.encoder_dim = config.encoder_dim # 1280
46
+ self.llm_dim = config.llm_dim # 2048
47
+
48
+ # Number of experts: Base=4, Large=8
49
+ self.num_experts = getattr(config, "num_experts", 4)
50
+
51
+ # Adapter hidden dim: paper uses 4096
52
+ adapter_hidden = getattr(config, "projector_hidden_dim", None) or 4096
53
+
54
+ # Dropout rate for experts (not applied to router)
55
+ self.dropout_rate = getattr(config, "projector_dropout", 0.1)
56
+
57
+ # --- Convolutional Subsampling (Section III-B) ---
58
+ # "two convolutional layers, each with a kernel size of 3 and a stride of 2"
59
+ # Maps encoder_dim (1280) -> llm_dim (2048 here; 3072 in the paper), total stride=4
60
+ self.conv = nn.Sequential(
61
+ nn.Conv1d(self.encoder_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
62
+ nn.ReLU(),
63
+ nn.Conv1d(self.llm_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
64
+ nn.ReLU(),
65
+ )
66
+
67
+ # --- Router (Section III-B) ---
68
+ # Base: "two linear layers... mapping from 1280 to 512 and finally to 4"
69
+ router_hidden = 512
70
+ self.router = nn.Sequential(
71
+ nn.Linear(self.encoder_dim, router_hidden),
72
+ nn.ReLU(),
73
+ nn.Linear(router_hidden, self.num_experts),
74
+ )
75
+
76
+ # --- Experts / Adapters (Section III-B) ---
77
+ # "projecting the hidden dimension from 3072 to 4096 and back to 3072"
78
+ self.experts = nn.ModuleList(
79
+ [
80
+ SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim, dropout=self.dropout_rate)
81
+ for _ in range(self.num_experts)
82
+ ]
83
+ )
84
+
85
+ # Normalization for stability (not in original MOSA, but guards against overflow/NaN)
86
+ self.ln_post = LlamaRMSNorm(self.llm_dim, eps=1e-6)
87
+
88
+ # Initialize weights
89
+ self._init_weights()
90
+
91
+ def _init_weights(self):
92
+ """Initialize weights for stable training."""
93
+ std = 0.02
94
+ with torch.no_grad():
95
+ # Conv layers
96
+ for module in self.conv:
97
+ if isinstance(module, nn.Conv1d):
98
+ nn.init.normal_(module.weight, mean=0.0, std=std)
99
+ if module.bias is not None:
100
+ nn.init.zeros_(module.bias)
101
+
102
+ # Router
103
+ for module in self.router:
104
+ if isinstance(module, nn.Linear):
105
+ nn.init.normal_(module.weight, mean=0.0, std=std)
106
+ if module.bias is not None:
107
+ nn.init.zeros_(module.bias)
108
+
109
+ # Experts
110
+ for expert in self.experts:
111
+ nn.init.normal_(expert.fc1.weight, mean=0.0, std=std)
112
+ nn.init.normal_(expert.fc2.weight, mean=0.0, std=std)
113
+ if expert.fc1.bias is not None:
114
+ nn.init.zeros_(expert.fc1.bias)
115
+ if expert.fc2.bias is not None:
116
+ nn.init.zeros_(expert.fc2.bias)
117
+
118
+ # LayerNorm
119
+ self.ln_post.weight.data.fill_(1.0)
120
+
121
+ def forward(self, x):
122
+ """
123
+ Args:
124
+ x: [batch_size, seq_len, encoder_dim] from Whisper encoder (1280)
125
+
126
+ Returns:
127
+ output: [batch_size, seq_len // 4, llm_dim] (2048 here)
128
+ """
129
+ batch_size, seq_len, _ = x.shape
130
+
131
+ # Pad to be divisible by stride (4)
132
+ pad_amt = (4 - (seq_len % 4)) % 4
133
+ if pad_amt > 0:
134
+ x = F.pad(x, (0, 0, 0, pad_amt))
135
+ seq_len = x.shape[1]
136
+
137
+ # 1. Convolutional Downsampling
138
+ # (B, T, C) -> (B, C, T) -> conv -> (B, C, T//4) -> (B, T//4, C)
139
+ h_conv = self.conv(x.permute(0, 2, 1)).permute(0, 2, 1)
140
+
141
+ # 2. Router on high-res input, then downsample weights
142
+ router_logits = self.router(x) # [B, T, num_experts]
143
+ # Average over stride window to match conv output
144
+ router_logits = router_logits.view(batch_size, seq_len // 4, 4, self.num_experts).mean(
145
+ dim=2
146
+ )
147
+ # Dense softmax
148
+ routing_weights = F.softmax(router_logits, dim=-1) # [B, T//4, num_experts]
149
+
150
+ # 3. Weighted sum of expert outputs (Eq. 2: y = sum(w_i * E_i(x)))
151
+ # Use in-place add to reduce memory allocations
152
+ final_out = torch.zeros_like(h_conv)
153
+ for i, expert in enumerate(self.experts):
154
+ expert_out = expert(h_conv)
155
+ expert_weight = routing_weights[:, :, i : i + 1]
156
+ final_out.add_(expert_out * expert_weight)
157
+
158
+ return self.ln_post(final_out)
159
+
160
+ def get_aux_loss(self) -> torch.Tensor:
161
+ """Return auxiliary loss (none for dense MoE - all experts always used)."""
162
+ return torch.tensor(0.0)
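
The dense mixture in forward() is y = sum_i w_i * E_i(h) with softmax weights over all experts; a minimal numeric sketch of the window-averaged routing step:

import torch
import torch.nn.functional as F

B, T, E = 2, 8, 4                          # batch, frames after stride-4 conv, experts
router_logits = torch.randn(B, T * 4, E)   # router runs on the high-res frames
weights = F.softmax(router_logits.view(B, T, 4, E).mean(dim=2), dim=-1)
assert torch.allclose(weights.sum(-1), torch.ones(B, T))  # dense: weights sum to 1
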
preprocessor_config.json CHANGED
@@ -7,14 +7,11 @@
7
  "n_fft": 400,
8
  "n_samples": 480000,
9
  "nb_max_frames": 3000,
10
- "num_mel_bins": 128,
11
  "padding_side": "right",
12
  "padding_value": 0.0,
13
  "processor_class": "ASRProcessor",
14
  "return_attention_mask": false,
15
  "sampling_rate": 16000,
16
- "feature_extractor_class": "AutoFeatureExtractor",
17
- "tokenizer_class": "AutoTokenizer",
18
  "auto_map": {
19
  "AutoProcessor": "asr_processing.ASRProcessor"
20
  }
 
7
  "n_fft": 400,
8
  "n_samples": 480000,
9
  "nb_max_frames": 3000,
 
10
  "padding_side": "right",
11
  "padding_value": 0.0,
12
  "processor_class": "ASRProcessor",
13
  "return_attention_mask": false,
14
  "sampling_rate": 16000,
 
 
15
  "auto_map": {
16
  "AutoProcessor": "asr_processing.ASRProcessor"
17
  }
residual_projector.py ADDED
@@ -0,0 +1,153 @@
1
+ """Residual MLP projector for Whisper → LLM feature space translation.
2
+
3
+ Philosophy: Whisper features are already information-complete. The projector
4
+ learns a nonlinear correction/refinement to align them with the LLM's expected
5
+ input distribution, rather than replacing them entirely.
6
+ """
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F # noqa: N812
11
+
12
+
13
+ class ResidualMLP(nn.Module):
14
+ """MLP block with residual connection.
15
+
16
+ Output = x + MLP(x)
17
+
18
+ At initialization (weights near zero), output ≈ input, providing a stable
19
+ starting point. The network learns to add nonlinear corrections as needed.
20
+ """
21
+
22
+ def __init__(self, dim, hidden_dim, dropout=0.0):
23
+ super().__init__()
24
+ self.fc1 = nn.Linear(dim, hidden_dim)
25
+ self.fc2 = nn.Linear(hidden_dim, dim)
26
+ self.act = nn.GELU()
27
+ self.dropout = nn.Dropout(dropout)
28
+
29
+ def forward(self, x):
30
+ residual = x
31
+ x = self.fc1(x)
32
+ x = self.act(x)
33
+ x = self.dropout(x)
34
+ x = self.fc2(x)
35
+ x = self.dropout(x)
36
+ return residual + x
37
+
38
+
39
+ class ResidualAudioProjector(nn.Module):
40
+ """Residual MLP projector for audio-to-LLM feature translation.
41
+
42
+ Architecture:
43
+ 1. Temporal pooling (concatenate k consecutive frames)
44
+ 2. Linear projection to LLM dimension
45
+ 3. N residual MLP blocks for nonlinear refinement
46
+ 4. Final layer norm
47
+
48
+ The linear projection handles dimension matching, while residual MLPs
49
+ learn the nonlinear corrections needed to align acoustic features
50
+ with semantic embedding space.
51
+ """
52
+
53
+ def __init__(self, config):
54
+ super().__init__()
55
+
56
+ # Temporal downsampling factor
57
+ self.k = getattr(config, "projector_pool_stride", 4)
58
+
59
+ # Dimensions
60
+ in_dim = config.encoder_dim * self.k # After concatenating k frames
61
+ out_dim = config.llm_dim
62
+ hidden_dim = getattr(config, "projector_hidden_dim", None) or out_dim * 4
63
+
64
+ # Number of residual blocks
65
+ self.num_layers = getattr(config, "projector_num_layers", 2)
66
+
67
+ dropout_rate = getattr(config, "projector_dropout", 0.0)
68
+
69
+ from transformers.models.llama.modeling_llama import LlamaRMSNorm
70
+
71
+ # Initial projection: encoder_dim * k → llm_dim
72
+ self.input_proj = nn.Linear(in_dim, out_dim)
73
+ self.ln_input = LlamaRMSNorm(out_dim, eps=1e-6)
74
+
75
+ # Residual MLP blocks for nonlinear refinement
76
+ self.layers = nn.ModuleList(
77
+ [ResidualMLP(out_dim, hidden_dim, dropout=dropout_rate) for _ in range(self.num_layers)]
78
+ )
79
+
80
+ # Per-layer norms (applied after each residual block)
81
+ self.layer_norms = nn.ModuleList(
82
+ [LlamaRMSNorm(out_dim, eps=1e-6) for _ in range(self.num_layers)]
83
+ )
84
+
85
+ self.output_dropout = nn.Dropout(dropout_rate)
86
+
87
+ # Initialize for stable training
88
+ self._init_weights(config)
89
+
90
+ def _init_weights(self, config):
91
+ """Initialize weights for stable residual learning.
92
+
93
+ Key insight: Initialize fc2 of each residual block to near-zero
94
+ so that initially output ≈ input (identity function).
95
+ """
96
+ std = getattr(config, "projector_init_std", 0.02)
97
+
98
+ with torch.no_grad():
99
+ # Input projection: standard init
100
+ nn.init.normal_(self.input_proj.weight, mean=0.0, std=std)
101
+ if self.input_proj.bias is not None:
102
+ nn.init.zeros_(self.input_proj.bias)
103
+
104
+ # Layer norms
105
+ self.ln_input.weight.data.fill_(1.0)
106
+ for ln in self.layer_norms:
107
+ ln.weight.data.fill_(1.0)
108
+
109
+ # Residual blocks: small init on output projection
110
+ for layer in self.layers:
111
+ nn.init.normal_(layer.fc1.weight, mean=0.0, std=std)
112
+ # Initialize fc2 smaller so residual starts near identity
113
+ nn.init.normal_(layer.fc2.weight, mean=0.0, std=std * 0.1)
114
+ if layer.fc1.bias is not None:
115
+ nn.init.zeros_(layer.fc1.bias)
116
+ if layer.fc2.bias is not None:
117
+ nn.init.zeros_(layer.fc2.bias)
118
+
119
+ def forward(self, x):
120
+ """
121
+ Args:
122
+ x: [batch_size, seq_len, encoder_dim] from Whisper encoder
123
+
124
+ Returns:
125
+ [batch_size, seq_len // k, llm_dim] projected features
126
+ """
127
+ batch_size, seq_len, dim = x.size()
128
+
129
+ # Ensure correct dtype
130
+ target_dtype = self.input_proj.weight.dtype
131
+ if x.dtype != target_dtype:
132
+ x = x.to(target_dtype)
133
+
134
+ # Pad sequence to be divisible by k
135
+ remainder = seq_len % self.k
136
+ if remainder:
137
+ pad_len = self.k - remainder
138
+ x = F.pad(x, (0, 0, 0, pad_len))
139
+
140
+ # Temporal pooling: concatenate k consecutive frames
141
+ # [B, T, D] → [B, T//k, D*k]
142
+ x = x.contiguous().view(batch_size, -1, dim * self.k)
143
+
144
+ # Project to LLM dimension
145
+ x = self.input_proj(x)
146
+ x = self.ln_input(x)
147
+
148
+ # Apply residual MLP blocks
149
+ for layer, ln in zip(self.layers, self.layer_norms):
150
+ x = layer(x)
151
+ x = ln(x)
152
+
153
+ return self.output_dropout(x)
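
A shape sketch for the residual projector above with a stand-in config (1500 Whisper frames pool down by k=4):

import types
import torch

cfg = types.SimpleNamespace(
    encoder_dim=1280, llm_dim=2048, projector_pool_stride=4,
    projector_num_layers=2, projector_hidden_dim=None,
    projector_dropout=0.0, projector_init_std=0.02,
)
proj = ResidualAudioProjector(cfg)
print(proj(torch.randn(2, 1500, 1280)).shape)  # torch.Size([2, 375, 2048])
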
shared_moe_projector.py ADDED
@@ -0,0 +1,182 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F # noqa: N812
4
+
5
+
6
+ class SwiGLUExpert(nn.Module):
7
+ """SwiGLU expert MLP (used for both shared and routed experts)."""
8
+
9
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
10
+ super().__init__()
11
+ self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
12
+ self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
13
+ self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)
14
+ self.act = nn.SiLU()
15
+
16
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
17
+ return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
18
+
19
+
20
+ class SharedMoEBlock(nn.Module):
21
+ """MoE block with shared expert + sparse routed experts."""
22
+
23
+ def __init__(
24
+ self,
25
+ input_dim: int,
26
+ hidden_dim: int,
27
+ output_dim: int,
28
+ num_experts: int = 4,
29
+ top_k: int = 2,
30
+ ):
31
+ super().__init__()
32
+ self.num_experts = num_experts
33
+ self.top_k = top_k
34
+ self.output_dim = output_dim
35
+
36
+ # Router: zero-initialized so routing starts uniform and is learned from scratch
37
+ self.router = nn.Linear(input_dim, num_experts, bias=False)
38
+ nn.init.zeros_(self.router.weight)
39
+
40
+ # Shared expert (always active)
41
+ self.shared_expert = SwiGLUExpert(input_dim, hidden_dim, output_dim)
42
+
43
+ # Routed experts (sparse)
44
+ self.experts = nn.ModuleList(
45
+ [SwiGLUExpert(input_dim, hidden_dim, output_dim) for _ in range(num_experts)]
46
+ )
47
+
48
+ # For auxiliary loss (cached to avoid recomputation)
49
+ self.last_router_logits = None
50
+ self.last_router_probs = None
51
+
52
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
53
+ batch_size, seq_len, dim = hidden_states.shape
54
+
55
+ # Shared expert output (all tokens)
56
+ shared_out = self.shared_expert(hidden_states)
57
+
58
+ # Routing
59
+ flat_hidden = hidden_states.view(-1, dim)
60
+ router_logits = self.router(flat_hidden)
61
+ router_probs = F.softmax(router_logits.float(), dim=-1)
62
+
63
+ # Cache for aux loss
64
+ self.last_router_logits = router_logits
65
+ self.last_router_probs = router_probs
66
+
67
+ # Top-k selection and renormalization
68
+ top_k_weights, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
69
+ top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)
70
+ top_k_weights = top_k_weights.to(hidden_states.dtype)
71
+
72
+ # Routed expert output via token dispatch
73
+ routed_out = self._dispatch_experts(flat_hidden, top_k_indices, top_k_weights)
74
+ routed_out = routed_out.view(batch_size, seq_len, -1)
75
+
76
+ # Combine: shared expert baseline + routed experts (grow in via near-zero down_proj init)
77
+ return shared_out + routed_out
78
+
79
+ def _dispatch_experts(
80
+ self,
81
+ hidden_states: torch.Tensor,
82
+ top_k_indices: torch.Tensor,
83
+ top_k_weights: torch.Tensor,
84
+ ) -> torch.Tensor:
85
+ """Token dispatch - gather tokens per expert, process, scatter back."""
86
+ num_tokens = hidden_states.shape[0]
87
+ output = torch.zeros(
88
+ num_tokens, self.output_dim, device=hidden_states.device, dtype=hidden_states.dtype
89
+ )
90
+
91
+ for expert_idx, expert in enumerate(self.experts):
92
+ expert_mask = top_k_indices == expert_idx
93
+ if not expert_mask.any():
94
+ continue
95
+
96
+ token_indices, slot_indices = torch.where(expert_mask)
97
+ expert_input = hidden_states[token_indices]
98
+ expert_output = expert(expert_input)
99
+ weights = top_k_weights[token_indices, slot_indices].unsqueeze(-1)
100
+ output.index_add_(0, token_indices, expert_output * weights)
101
+
102
+ return output
103
+
104
+
105
+ def load_balancing_loss(router_probs: torch.Tensor, num_experts: int, top_k: int) -> torch.Tensor:
106
+ """Auxiliary loss to encourage balanced expert usage."""
107
+ _, selected = torch.topk(router_probs, top_k, dim=-1)
108
+ expert_mask = F.one_hot(selected, num_experts).float()
109
+ tokens_per_expert = expert_mask.mean(dim=(0, 1))
110
+ prob_per_expert = router_probs.mean(dim=0)
111
+ return (tokens_per_expert * prob_per_expert).sum() * num_experts
112
+
113
+
114
+ def z_loss(router_logits: torch.Tensor) -> torch.Tensor:
115
+ """Z-loss to prevent router logits from growing too large."""
116
+ return torch.logsumexp(router_logits.float(), dim=-1).square().mean()
117
+
118
+
119
+ class SharedMoEAudioProjector(nn.Module):
120
+ def __init__(self, config):
121
+ super().__init__()
122
+
123
+ # Temporal downsampling
124
+ self.k = getattr(config, "projector_pool_stride", 4)
125
+
126
+ # Dimensions
127
+ encoder_dim = config.encoder_dim
128
+ in_dim = encoder_dim * self.k
129
+ out_dim = config.llm_dim
130
+ hidden_dim = getattr(config, "projector_hidden_dim", None) or in_dim
131
+
132
+ # MoE config
133
+ self.num_experts = getattr(config, "num_experts", 4)
134
+ self.top_k = getattr(config, "num_experts_per_tok", 2)
135
+ self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.02)
136
+ self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)
137
+
138
+ # Layers
139
+ self.moe = SharedMoEBlock(in_dim, hidden_dim, out_dim, self.num_experts, self.top_k)
140
+
141
+ # Init
142
+ self._init_weights(in_dim)
143
+
144
+ def _init_weights(self, in_dim: int):
145
+ with torch.no_grad():
146
+ # Shared expert - orthogonal init for stable condition numbers
147
+ nn.init.orthogonal_(self.moe.shared_expert.gate_proj.weight)
148
+ nn.init.orthogonal_(self.moe.shared_expert.up_proj.weight)
149
+ nn.init.orthogonal_(self.moe.shared_expert.down_proj.weight, gain=0.5)
150
+
151
+ # Routed experts - orthogonal for gate/up, tiny orthogonal for down (grow-in)
152
+ # gain=0.01 gives ~1% initial contribution while maintaining good conditioning
153
+ for expert in self.moe.experts:
154
+ nn.init.orthogonal_(expert.gate_proj.weight)
155
+ nn.init.orthogonal_(expert.up_proj.weight)
156
+ nn.init.orthogonal_(expert.down_proj.weight, gain=0.01)
157
+
158
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
159
+ batch_size, seq_len, dim = x.size()
160
+
161
+ target_dtype = self.moe.shared_expert.gate_proj.weight.dtype
162
+ if x.dtype != target_dtype:
163
+ x = x.to(target_dtype)
164
+
165
+ # Pad for pooling (at most k-1 frames -> 1 extra token, negligible impact)
166
+ if seq_len % self.k:
167
+ x = F.pad(x, (0, 0, 0, self.k - seq_len % self.k))
168
+
169
+ # Temporal pooling
170
+ x = x.view(batch_size, -1, dim * self.k)
171
+
172
+ return self.moe(x)
173
+
174
+ def get_aux_loss(self) -> torch.Tensor:
175
+ """Get auxiliary losses (call after forward)."""
176
+ if self.moe.last_router_logits is None:
177
+ return torch.tensor(0.0, device=self.moe.router.weight.device)
178
+
179
+ balance = load_balancing_loss(self.moe.last_router_probs, self.num_experts, self.top_k)
180
+ z = z_loss(self.moe.last_router_logits)
181
+
182
+ return self.aux_loss_coef * balance + self.z_loss_coef * z
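
A sketch of the intended training-time usage with a stand-in config: run forward, then fold the already-scaled auxiliary loss into the main objective.

import types
import torch

cfg = types.SimpleNamespace(encoder_dim=1280, llm_dim=2048, projector_pool_stride=4)
proj = SharedMoEAudioProjector(cfg)
out = proj(torch.randn(2, 100, 1280))  # [2, 25, 2048]
aux = proj.get_aux_loss()              # balance + z-loss, pre-scaled by the coefs
# total_loss = ce_loss + aux           # the trainer is expected to add this term
print(out.shape, float(aux))
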
special_tokens_map.json CHANGED
@@ -1,15 +1,13 @@
1
  {
2
  "additional_special_tokens": [
3
- "<|im_start|>",
4
- "<|im_end|>"
5
  ],
6
- "bos_token": {
7
- "content": "<|im_start|>",
8
- "lstrip": false,
9
- "normalized": false,
10
- "rstrip": false,
11
- "single_word": false
12
- },
13
  "eos_token": {
14
  "content": "<|im_end|>",
15
  "lstrip": false,
@@ -17,18 +15,5 @@
17
  "rstrip": false,
18
  "single_word": false
19
  },
20
- "pad_token": {
21
- "content": "<|im_end|>",
22
- "lstrip": false,
23
- "normalized": false,
24
- "rstrip": false,
25
- "single_word": false
26
- },
27
- "unk_token": {
28
- "content": "<|endoftext|>",
29
- "lstrip": false,
30
- "normalized": false,
31
- "rstrip": false,
32
- "single_word": false
33
- }
34
  }
 
1
  {
2
  "additional_special_tokens": [
3
+ {
4
+ "content": "<audio>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
  ],
11
  "eos_token": {
12
  "content": "<|im_end|>",
13
  "lstrip": false,
 
15
  "rstrip": false,
16
  "single_word": false
17
  },
18
+ "pad_token": "<|finetune_right_pad_id|>"
19
  }
swiglu_projector.py ADDED
@@ -0,0 +1,68 @@
1
+ """Simple SwiGLU-based audio projector."""
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F # noqa: N812
6
+
7
+
8
+ class SwiGLU(nn.Module):
9
+ def __init__(self, in_features, hidden_features, out_features, bias=False, dropout=0.0):
10
+ super().__init__()
11
+ self.w1 = nn.Linear(in_features, hidden_features, bias=bias)
12
+ self.w2 = nn.Linear(in_features, hidden_features, bias=bias)
13
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
14
+ self.act = nn.SiLU()
15
+ self.dropout = nn.Dropout(dropout)
16
+
17
+ def forward(self, x):
18
+ x_gate = self.act(self.w1(x))
19
+ x_val = self.w2(x)
20
+ x = x_gate * x_val
21
+ x = self.dropout(x)
22
+ return self.w3(x)
23
+
24
+
25
+ class AudioProjector(nn.Module):
26
+ def __init__(self, config):
27
+ super().__init__()
28
+ self.k = getattr(config, "projector_pool_stride", 4)
29
+ in_dim = config.encoder_dim * self.k
30
+ out_dim = config.llm_dim
31
+ hidden_dim = config.projector_hidden_dim
32
+ if hidden_dim is None:
33
+ hidden_dim = config.encoder_dim * 2
34
+
35
+ dropout_rate = getattr(config, "projector_dropout", 0.0)
36
+
37
+ self.proj1 = SwiGLU(in_dim, hidden_dim, hidden_dim, dropout=dropout_rate)
38
+ self.proj2 = SwiGLU(hidden_dim, hidden_dim, out_dim, dropout=dropout_rate)
39
+ self.output_dropout = nn.Dropout(dropout_rate)
40
+
41
+ with torch.no_grad():
42
+ std = getattr(config, "projector_init_std", 0.02)
43
+ # Initialize first layer
44
+ nn.init.normal_(self.proj1.w1.weight, mean=0.0, std=std)
45
+ nn.init.normal_(self.proj1.w2.weight, mean=0.0, std=std)
46
+ nn.init.normal_(self.proj1.w3.weight, mean=0.0, std=std)
47
+ # Initialize second layer
48
+ nn.init.normal_(self.proj2.w1.weight, mean=0.0, std=std)
49
+ nn.init.normal_(self.proj2.w2.weight, mean=0.0, std=std)
50
+ nn.init.normal_(self.proj2.w3.weight, mean=0.0, std=std)
51
+
52
+ def forward(self, x):
53
+ batch_size, seq_len, dim = x.size()
54
+
55
+ target_dtype = self.proj1.w1.weight.dtype
56
+ if x.dtype != target_dtype:
57
+ x = x.to(target_dtype)
58
+
59
+ remainder = seq_len % self.k
60
+ if remainder:
61
+ pad_len = self.k - remainder
62
+ x = F.pad(x, (0, 0, 0, pad_len))
63
+
64
+ x = x.contiguous().view(batch_size, -1, dim * self.k)
65
+ x = self.proj1(x)
66
+ x = self.proj2(x)
67
+
68
+ return self.output_dropout(x)
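
A one-line equivalence check for the SwiGLU block above, which computes down(SiLU(W1 x) * W2 x):

import torch
import torch.nn.functional as F

blk = SwiGLU(in_features=8, hidden_features=16, out_features=8)
x = torch.randn(3, 8)
assert torch.allclose(blk(x), blk.w3(F.silu(blk.w1(x)) * blk.w2(x)))
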
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d9a0a439f19c272474f9c9213ea2665d1f1cf90eb7f2f6a71b40a919554f078c
3
- size 15781850
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4aeaf198f783cbf58d8cd59812baac429ffe49147bf9648f6618de20b8d4a4c
3
+ size 17209003
tokenizer_config.json CHANGED
Binary files a/tokenizer_config.json and b/tokenizer_config.json differ