mazesmazes committed
Commit 47f9dbe · verified · 1 parent: b943993

Training in progress - step 500
asr_config.py CHANGED
@@ -14,29 +14,34 @@ class ASRConfig(transformers.PretrainedConfig):
14
  attn_implementation: str = "flash_attention_2",
15
  model_dtype: str = "bfloat16",
16
  num_beams: Optional[int] = None,
17
- system_prompt: str = "/no_think /system_override",
18
- user_prompt: str = "Transcribe: <audio>",
19
  encoder_dim: Optional[int] = None,
20
  llm_dim: Optional[int] = None,
 
 
 
21
  audio_sample_rate: int = 16000,
22
  projector_init_std: float = 0.02,
23
- projector_pool_stride: int = 2,
24
- downsample_rate: int = 16,
25
  projector_hidden_dim: Optional[int] = None,
26
- projector_type: str = "moe", # "moe", "swiglu", "residual", "shared_moe", "mlp"
27
  projector_num_layers: int = 2, # Number of layers (for residual projector)
28
- projector_dropout: float = 0.05, # Dropout rate for projector layers
29
- projector_input_noise: float = 0.02, # Input noise for projector
30
  # MoE-specific configuration
31
  num_experts: int = 4, # Number of experts in MoE projectors
32
  num_experts_per_tok: int = 2, # Top-k experts per token
33
  router_aux_loss_coef: float = 0.01, # Auxiliary loss coefficient for load balancing
34
- use_specaugment: bool = True, # Apply SpecAugment during training
 
 
35
  label_smoothing: float = 0.0, # Label smoothing for cross-entropy loss
36
- inference_diversity_penalty: float = 0.0,
37
  inference_warmup_tokens: int = 10,
38
  max_new_tokens: Optional[int] = None,
39
- min_new_tokens: Optional[int] = None,
40
  repetition_penalty: Optional[float] = None,
41
  length_penalty: Optional[float] = None,
42
  no_repeat_ngram_size: Optional[int] = None,
@@ -46,8 +51,7 @@ class ASRConfig(transformers.PretrainedConfig):
46
  # Set default generation parameters (greedy decoding only)
47
  generation_defaults = {
48
  "num_beams": 1,
49
- "max_new_tokens": 96,
50
- "min_new_tokens": 0,
51
  "repetition_penalty": 1.0,
52
  "length_penalty": 1.0,
53
  "no_repeat_ngram_size": 0,
@@ -65,6 +69,8 @@ class ASRConfig(transformers.PretrainedConfig):
65
  self.user_prompt = user_prompt
66
  self.encoder_dim = encoder_dim
67
  self.llm_dim = llm_dim
 
 
68
  self.audio_sample_rate = audio_sample_rate
69
  self.projector_init_std = projector_init_std
70
  self.projector_pool_stride = projector_pool_stride
@@ -73,14 +79,17 @@ class ASRConfig(transformers.PretrainedConfig):
73
  self.projector_type = projector_type
74
  self.projector_num_layers = projector_num_layers
75
  self.projector_dropout = projector_dropout
76
- self.projector_input_noise = projector_input_noise
77
  # MoE-specific configuration
78
  self.num_experts = num_experts
79
  self.num_experts_per_tok = num_experts_per_tok
80
  self.router_aux_loss_coef = router_aux_loss_coef
81
- self.use_specaugment = use_specaugment
 
 
82
  self.label_smoothing = label_smoothing
83
- self.inference_diversity_penalty = inference_diversity_penalty
84
  self.inference_warmup_tokens = inference_warmup_tokens
85
 
86
  # Generation parameters (use explicit value if provided, else use default)
@@ -88,9 +97,6 @@ class ASRConfig(transformers.PretrainedConfig):
88
  self.max_new_tokens = (
89
  max_new_tokens if max_new_tokens is not None else generation_defaults["max_new_tokens"]
90
  )
91
- self.min_new_tokens = (
92
- min_new_tokens if min_new_tokens is not None else generation_defaults["min_new_tokens"]
93
- )
94
  self.repetition_penalty = (
95
  repetition_penalty
96
  if repetition_penalty is not None
 
14
  attn_implementation: str = "flash_attention_2",
15
  model_dtype: str = "bfloat16",
16
  num_beams: Optional[int] = None,
17
+ system_prompt: str = "You are a helpful assistant.",
18
+ user_prompt: str = "Please transcribe this English audio into text: <audio>",
19
  encoder_dim: Optional[int] = None,
20
  llm_dim: Optional[int] = None,
21
+ # Encoder conv layers: list of (padding, kernel_size, stride) tuples
22
+ # Default is Whisper/GLM-ASR structure: conv1(k=3,s=1,p=1) + conv2(k=3,s=2,p=1)
23
+ encoder_conv_layers: Optional[list] = None,
24
  audio_sample_rate: int = 16000,
25
  projector_init_std: float = 0.02,
26
+ projector_pool_stride: int = 4,
27
+ downsample_rate: int = 5, # Granite default
28
  projector_hidden_dim: Optional[int] = None,
29
+ projector_type: str = "moe", # "moe", "swiglu", "residual", "shared_moe", "mlp", "qformer"
30
  projector_num_layers: int = 2, # Number of layers (for residual projector)
31
+ projector_dropout: float = 0.0, # Dropout rate for projector layers
 
32
  # MoE-specific configuration
33
  num_experts: int = 4, # Number of experts in MoE projectors
34
  num_experts_per_tok: int = 2, # Top-k experts per token
35
  router_aux_loss_coef: float = 0.01, # Auxiliary loss coefficient for load balancing
36
+ # QFormer-specific configuration (Granite defaults)
37
+ qformer_window_size: int = 15, # Window size for QFormer processing
38
+ qformer_hidden_size: Optional[int] = None, # QFormer hidden size (defaults to encoder_dim)
39
+ qformer_num_layers: int = 2, # Number of QFormer transformer layers
40
+ qformer_num_heads: int = 16, # Number of attention heads in QFormer
41
+ qformer_intermediate_size: Optional[int] = None, # FFN size (defaults to 4x hidden)
42
  label_smoothing: float = 0.0, # Label smoothing for cross-entropy loss
 
43
  inference_warmup_tokens: int = 10,
44
  max_new_tokens: Optional[int] = None,
 
45
  repetition_penalty: Optional[float] = None,
46
  length_penalty: Optional[float] = None,
47
  no_repeat_ngram_size: Optional[int] = None,
 
51
  # Set default generation parameters (greedy decoding only)
52
  generation_defaults = {
53
  "num_beams": 1,
54
+ "max_new_tokens": 256,
 
55
  "repetition_penalty": 1.0,
56
  "length_penalty": 1.0,
57
  "no_repeat_ngram_size": 0,
 
69
  self.user_prompt = user_prompt
70
  self.encoder_dim = encoder_dim
71
  self.llm_dim = llm_dim
72
+ # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
73
+ self.encoder_conv_layers = encoder_conv_layers or [(1, 3, 1), (1, 3, 2)]
74
  self.audio_sample_rate = audio_sample_rate
75
  self.projector_init_std = projector_init_std
76
  self.projector_pool_stride = projector_pool_stride
 
79
  self.projector_type = projector_type
80
  self.projector_num_layers = projector_num_layers
81
  self.projector_dropout = projector_dropout
 
82
  # MoE-specific configuration
83
  self.num_experts = num_experts
84
  self.num_experts_per_tok = num_experts_per_tok
85
  self.router_aux_loss_coef = router_aux_loss_coef
86
+ # QFormer-specific configuration
87
+ self.qformer_window_size = qformer_window_size
88
+ self.qformer_hidden_size = qformer_hidden_size
89
+ self.qformer_num_layers = qformer_num_layers
90
+ self.qformer_num_heads = qformer_num_heads
91
+ self.qformer_intermediate_size = qformer_intermediate_size
92
  self.label_smoothing = label_smoothing
 
93
  self.inference_warmup_tokens = inference_warmup_tokens
94
 
95
  # Generation parameters (use explicit value if provided, else use default)
 
97
  self.max_new_tokens = (
98
  max_new_tokens if max_new_tokens is not None else generation_defaults["max_new_tokens"]
99
  )
 
 
 
100
  self.repetition_penalty = (
101
  repetition_penalty
102
  if repetition_penalty is not None
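For reference, a minimal sketch (not part of this commit) of how the new encoder_conv_layers tuples map mel frames to encoder frames; it reuses the conv output-length formula applied later in asr_modeling.py and asr_processing.py, and the 3000-frame input is purely illustrative:

def conv_output_length(length: int, conv_layers) -> int:
    # Standard conv1d output-length formula, applied per (padding, kernel_size, stride) tuple.
    for padding, kernel_size, stride in conv_layers:
        length = (length + 2 * padding - (kernel_size - 1) - 1) // stride + 1
    return length

whisper_glm_default = [(1, 3, 1), (1, 3, 2)]  # conv1 k=3,s=1,p=1; conv2 k=3,s=2,p=1
print(conv_output_length(3000, whisper_glm_default))  # 1500 encoder frames for a 30 s mel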
asr_modeling.py CHANGED
@@ -13,9 +13,6 @@ from transformers import (
13
  )
14
  from transformers.generation import GenerationMixin
15
  from transformers.modeling_outputs import CausalLMOutputWithPast
16
- from transformers.models.whisper.modeling_whisper import (
17
- _compute_mask_indices,
18
- )
19
 
20
  try:
21
  from .asr_config import ASRConfig
@@ -75,6 +72,21 @@ class ASRModel(PreTrainedModel, GenerationMixin):
75
  state_dict = load_file(model_file)
76
  model.load_state_dict(state_dict, strict=False)
77
 
 
78
  return model
79
  finally:
80
  cls._is_loading_from_pretrained = False
@@ -108,7 +120,10 @@ class ASRModel(PreTrainedModel, GenerationMixin):
108
  self.generation_config.length_penalty = config.length_penalty
109
  self.generation_config.repetition_penalty = config.repetition_penalty
110
  self.generation_config.no_repeat_ngram_size = config.no_repeat_ngram_size
111
- self.generation_config.eos_token_id = self.tokenizer.convert_tokens_to_ids("<|im_end|>")
 
 
 
112
  self.generation_config.pad_token_id = self.tokenizer.pad_token_id
113
 
114
  # Feature extractor for audio preprocessing
@@ -141,6 +156,22 @@ class ASRModel(PreTrainedModel, GenerationMixin):
141
  full_model = WhisperModel.from_pretrained(config.audio_model_id, **encoder_kwargs)
142
  encoder = full_model.encoder
143
  del full_model
 
 
144
  else:
145
  encoder = AutoModel.from_pretrained(config.audio_model_id, **encoder_kwargs)
146
 
@@ -210,12 +241,15 @@ class ASRModel(PreTrainedModel, GenerationMixin):
210
  self.tokenizer.pad_token = "<|finetune_right_pad_id|>"
211
 
212
  # Add audio token
213
- existing_special = self.tokenizer.additional_special_tokens or []
214
  if "<audio>" not in existing_special:
215
  self.tokenizer.add_special_tokens(
216
  {"additional_special_tokens": existing_special + ["<audio>"]}
217
  )
218
  self.language_model.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)
 
 
 
219
 
220
  self.audio_token_id = self.tokenizer.convert_tokens_to_ids("<audio>")
221
  self.tokenizer.padding_side = "right"
@@ -263,92 +297,80 @@ class ASRModel(PreTrainedModel, GenerationMixin):
263
  except ImportError:
264
  from asr_processing import ASRProcessor # type: ignore[no-redef]
265
 
266
- return ASRProcessor(feature_extractor=self.feature_extractor, tokenizer=self.tokenizer)
 
 
267
 
268
  def state_dict(self, *args, **kwargs):
269
  """Only save trainable projector weights."""
270
  return {f"projector.{k}": v for k, v in self.projector.state_dict().items()}
271
 
272
- def _apply_specaugment(
273
  self,
274
- input_features: torch.Tensor,
275
- attention_mask: Optional[torch.Tensor] = None,
276
  ) -> torch.Tensor:
277
- if not getattr(self.config, "use_specaugment", False):
278
- return input_features
279
-
280
- if not self.training:
281
- return input_features
282
-
283
- # Input shape: (batch_size, num_mel_bins, sequence_length) for Whisper
284
- batch_size, hidden_size, sequence_length = input_features.size()
285
-
286
- mask_time_prob = getattr(self.config, "mask_time_prob", 0.05)
287
- mask_time_length = getattr(self.config, "mask_time_length", 10)
288
- mask_feature_prob = getattr(self.config, "mask_feature_prob", 0.0)
289
- mask_feature_length = getattr(self.config, "mask_feature_length", 10)
290
-
291
- # Time masking
292
- if mask_time_prob > 0:
293
- mask_time_np = _compute_mask_indices(
294
- (batch_size, sequence_length),
295
- mask_prob=mask_time_prob,
296
- mask_length=mask_time_length,
297
- attention_mask=attention_mask,
298
- min_masks=2,
299
- )
300
- mask_time_indices = torch.tensor(
301
- mask_time_np, device=input_features.device, dtype=torch.bool
302
- )
303
- # Expand to cover all features: (batch, seq) -> (batch, features, seq)
304
- mask_time_expanded = mask_time_indices[:, None].expand(-1, hidden_size, -1)
305
- input_features = input_features.masked_fill(mask_time_expanded, 0.0)
306
-
307
- # Feature masking
308
- if mask_feature_prob > 0:
309
- mask_feature_np = _compute_mask_indices(
310
- (batch_size, hidden_size),
311
- mask_prob=mask_feature_prob,
312
- mask_length=mask_feature_length,
313
- min_masks=2,
314
- )
315
- mask_feature_indices = torch.tensor(
316
- mask_feature_np, device=input_features.device, dtype=torch.bool
317
- )
318
- # Expand: (batch, features) -> (batch, features, seq)
319
- mask_feature_expanded = mask_feature_indices[:, :, None].expand(-1, -1, sequence_length)
320
- input_features = input_features.masked_fill(mask_feature_expanded, 0.0)
321
 
322
- return input_features
 
 
323
 
324
  def _encode_audio(
325
  self,
326
  audio_features: torch.Tensor,
327
- audio_attention_mask: Optional[torch.Tensor] = None,
328
  ) -> torch.Tensor:
329
  """Encode audio and project to LLM embedding space.
330
 
331
- Returns flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
332
- """
333
- # Apply SpecAugment during training (before encoding)
334
- audio_features = self._apply_specaugment(audio_features, audio_attention_mask)
335
 
 
 
 
336
  with torch.no_grad():
337
- encoder_out = self.audio_tower(
338
- input_features=audio_features, attention_mask=audio_attention_mask
339
- )
340
  hidden_states = encoder_out.last_hidden_state
341
 
 
 
 
 
342
  audio_embeds = self.projector(hidden_states)
343
 
344
- # Flatten: (batch, seq, hidden) -> (batch * seq, hidden)
345
- # This allows masked_scatter to do 1:1 replacement
346
- return audio_embeds.reshape(-1, audio_embeds.shape[-1])
 
 
347
 
348
  def forward(
349
  self,
350
  input_ids: Optional[torch.Tensor] = None,
351
  input_features: Optional[torch.Tensor] = None,
 
352
  attention_mask: Optional[torch.Tensor] = None,
353
  position_ids: Optional[torch.Tensor] = None,
354
  past_key_values: Optional[torch.Tensor] = None,
@@ -356,7 +378,6 @@ class ASRModel(PreTrainedModel, GenerationMixin):
356
  labels: Optional[torch.Tensor] = None,
357
  use_cache: Optional[bool] = None,
358
  cache_position: Optional[torch.Tensor] = None,
359
- audio_attention_mask: Optional[torch.Tensor] = None,
360
  **kwargs,
361
  ) -> CausalLMOutputWithPast:
362
  """Forward pass for training and inference."""
@@ -408,23 +429,27 @@ class ASRModel(PreTrainedModel, GenerationMixin):
408
 
409
  return model_inputs
410
 
411
- def _get_num_audio_tokens(self, input_features: torch.Tensor) -> int:
412
- """Calculate number of audio tokens based on input shape.
 
 
 
413
 
414
- Whisper: input_features shape is (batch, n_mels, mel_len)
415
- Encoder output is mel_len // 2 due to stride-2 conv
416
- MLP projector adds another stride-2 for 4x total downsampling
417
  """
418
- mel_len = input_features.shape[-1]
419
- return mel_len // 4
 
 
420
 
421
  @torch.no_grad()
422
  def generate(
423
  self,
424
  input_ids: Optional[torch.Tensor] = None,
425
  input_features: Optional[torch.Tensor] = None,
426
- attention_mask: Optional[torch.Tensor] = None,
427
  audio_attention_mask: Optional[torch.Tensor] = None,
 
428
  system_prompt: Optional[str] = None,
429
  **generate_kwargs,
430
  ) -> torch.Tensor:
@@ -436,6 +461,8 @@ class ASRModel(PreTrainedModel, GenerationMixin):
436
  """
437
  if input_features is None:
438
  raise ValueError("input_features required for generation")
 
 
439
 
440
  device = input_features.device
441
  batch_size = input_features.shape[0]
@@ -445,7 +472,7 @@ class ASRModel(PreTrainedModel, GenerationMixin):
445
 
446
  # If input_ids not provided, build prompt with correct number of audio tokens
447
  if input_ids is None:
448
- num_audio_tokens = self._get_num_audio_tokens(input_features)
449
  audio_placeholder = "<audio>" * num_audio_tokens
450
 
451
  system_prompt = system_prompt or self.system_prompt
@@ -455,12 +482,13 @@ class ASRModel(PreTrainedModel, GenerationMixin):
455
  messages.append({"role": "system", "content": system_prompt})
456
  messages.append({"role": "user", "content": self.TRANSCRIBE_PROMPT + audio_placeholder})
457
 
458
- input_ids = self.tokenizer.apply_chat_template(
459
  messages,
460
  tokenize=True,
461
  add_generation_prompt=True,
462
  return_tensors="pt",
463
- ).to(device)
 
464
 
465
  if input_ids.dim() == 1:
466
  input_ids = input_ids.unsqueeze(0)
 
13
  )
14
  from transformers.generation import GenerationMixin
15
  from transformers.modeling_outputs import CausalLMOutputWithPast
 
 
 
16
 
17
  try:
18
  from .asr_config import ASRConfig
 
72
  state_dict = load_file(model_file)
73
  model.load_state_dict(state_dict, strict=False)
74
 
75
+ # Load LoRA adapter if present
76
+ adapter_config = cached_file(
77
+ pretrained_model_name_or_path,
78
+ "adapter_config.json",
79
+ _raise_exceptions_for_missing_entries=False,
80
+ **cache_kwargs,
81
+ )
82
+ if adapter_config is not None:
83
+ from peft import PeftModel
84
+
85
+ # Pass original repo ID to PEFT, let it handle caching
86
+ model.language_model = PeftModel.from_pretrained(
87
+ model.language_model, pretrained_model_name_or_path, is_trainable=False
88
+ )
89
+
90
  return model
91
  finally:
92
  cls._is_loading_from_pretrained = False
 
120
  self.generation_config.length_penalty = config.length_penalty
121
  self.generation_config.repetition_penalty = config.repetition_penalty
122
  self.generation_config.no_repeat_ngram_size = config.no_repeat_ngram_size
123
+ self.generation_config.eos_token_id = [
124
+ self.tokenizer.convert_tokens_to_ids("<|im_end|>"),
125
+ self.tokenizer.convert_tokens_to_ids("<|endoftext|>"),
126
+ ]
127
  self.generation_config.pad_token_id = self.tokenizer.pad_token_id
128
 
129
  # Feature extractor for audio preprocessing
 
156
  full_model = WhisperModel.from_pretrained(config.audio_model_id, **encoder_kwargs)
157
  encoder = full_model.encoder
158
  del full_model
159
+ elif "glm" in config.audio_model_id.lower():
160
+ # GLM-ASR models use audio_tower as the encoder
161
+ # Requires transformers >= 5.x or installed from source
162
+ from transformers import AutoModelForSeq2SeqLM
163
+
164
+ full_model = AutoModelForSeq2SeqLM.from_pretrained(
165
+ config.audio_model_id, trust_remote_code=True, **encoder_kwargs
166
+ )
167
+ # GLM stores encoder at audio_tower (GlmAsrEncoder)
168
+ encoder = full_model.audio_tower
169
+ # Clear references to free VRAM from the LLM decoder
170
+ full_model.language_model = None
171
+ full_model.multi_modal_projector = None
172
+ del full_model
173
+ if torch.cuda.is_available():
174
+ torch.cuda.empty_cache()
175
  else:
176
  encoder = AutoModel.from_pretrained(config.audio_model_id, **encoder_kwargs)
177
 
 
241
  self.tokenizer.pad_token = "<|finetune_right_pad_id|>"
242
 
243
  # Add audio token
244
+ existing_special = getattr(self.tokenizer, "additional_special_tokens", None) or []
245
  if "<audio>" not in existing_special:
246
  self.tokenizer.add_special_tokens(
247
  {"additional_special_tokens": existing_special + ["<audio>"]}
248
  )
249
  self.language_model.resize_token_embeddings(len(self.tokenizer), mean_resizing=False)
250
+ # Ensure lm_head stays tied to embeddings (e.g., SmolLM3)
251
+ if hasattr(self.language_model, "tie_weights"):
252
+ self.language_model.tie_weights()
253
 
254
  self.audio_token_id = self.tokenizer.convert_tokens_to_ids("<audio>")
255
  self.tokenizer.padding_side = "right"
 
297
  except ImportError:
298
  from asr_processing import ASRProcessor # type: ignore[no-redef]
299
 
300
+ return ASRProcessor(
301
+ feature_extractor=self.feature_extractor,
302
+ tokenizer=self.tokenizer,
303
+ projector=self.projector,
304
+ encoder_conv_layers=self.config.encoder_conv_layers,
305
+ )
306
 
307
  def state_dict(self, *args, **kwargs):
308
  """Only save trainable projector weights."""
309
  return {f"projector.{k}": v for k, v in self.projector.state_dict().items()}
310
 
311
+ def _compute_encoder_output_lengths(
312
  self,
313
+ audio_attention_mask: torch.Tensor,
 
314
  ) -> torch.Tensor:
315
+ """Compute per-sample encoder output lengths using conv layer formulas.
316
+
317
+ Args:
318
+ audio_attention_mask: Mask indicating real vs padded mel frames (batch, mel_len)
 
 
319
 
320
+ Returns:
321
+ Tensor of encoder output lengths per sample (batch,)
322
+ """
323
+ # Get mel frame lengths from attention mask
324
+ lengths = audio_attention_mask.sum(dim=-1)
325
+
326
+ # Apply conv layer formulas: output = (input + 2*pad - (kernel-1) - 1) // stride + 1
327
+ for padding, kernel_size, stride in self.config.encoder_conv_layers:
328
+ lengths = (lengths + 2 * padding - (kernel_size - 1) - 1) // stride + 1
329
+
330
+ return lengths
331
 
332
  def _encode_audio(
333
  self,
334
  audio_features: torch.Tensor,
335
+ audio_attention_mask: torch.Tensor,
336
  ) -> torch.Tensor:
337
  """Encode audio and project to LLM embedding space.
338
 
339
+ Args:
340
+ audio_features: Mel spectrogram features (batch, n_mels, mel_len)
341
+ audio_attention_mask: Mask indicating real vs padded mel frames (batch, mel_len)
 
342
 
343
+ Returns:
344
+ Flattened audio embeddings of shape (total_audio_tokens, hidden_dim).
345
+ """
346
  with torch.no_grad():
347
+ encoder_out = self.audio_tower(input_features=audio_features)
 
 
348
  hidden_states = encoder_out.last_hidden_state
349
 
350
+ # Compute per-sample encoder output lengths using conv formulas
351
+ encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
352
+
353
+ # Project to LLM space
354
  audio_embeds = self.projector(hidden_states)
355
 
356
+ # Compute per-sample projector output lengths
357
+ projector_lengths = torch.tensor(
358
+ [self.projector.get_output_length(int(length.item())) for length in encoder_lengths],
359
+ device=audio_embeds.device,
360
+ )
361
+
362
+ # Create valid mask for variable-length samples and extract only real embeddings
363
+ max_len = audio_embeds.shape[1]
364
+ valid_mask = (
365
+ torch.arange(max_len, device=audio_embeds.device)[None, :] < projector_lengths[:, None]
366
+ )
367
+ return audio_embeds[valid_mask]
368
 
369
  def forward(
370
  self,
371
  input_ids: Optional[torch.Tensor] = None,
372
  input_features: Optional[torch.Tensor] = None,
373
+ audio_attention_mask: Optional[torch.Tensor] = None,
374
  attention_mask: Optional[torch.Tensor] = None,
375
  position_ids: Optional[torch.Tensor] = None,
376
  past_key_values: Optional[torch.Tensor] = None,
 
378
  labels: Optional[torch.Tensor] = None,
379
  use_cache: Optional[bool] = None,
380
  cache_position: Optional[torch.Tensor] = None,
 
381
  **kwargs,
382
  ) -> CausalLMOutputWithPast:
383
  """Forward pass for training and inference."""
 
429
 
430
  return model_inputs
431
 
432
+ def _get_num_audio_tokens(
433
+ self,
434
+ audio_attention_mask: torch.Tensor,
435
+ ) -> int:
436
+ """Calculate number of audio tokens based on actual audio length.
437
 
438
+ Uses attention mask to get real audio length, then computes:
439
+ mel_frames -> encoder_frames (via conv formulas) -> projector output tokens
 
440
  """
441
+ encoder_lengths = self._compute_encoder_output_lengths(audio_attention_mask)
442
+ # Use max length for batch (all samples should have same token count for generation)
443
+ encoder_output_len = int(encoder_lengths.max().item())
444
+ return int(self.projector.get_output_length(encoder_output_len))
445
 
446
  @torch.no_grad()
447
  def generate(
448
  self,
449
  input_ids: Optional[torch.Tensor] = None,
450
  input_features: Optional[torch.Tensor] = None,
 
451
  audio_attention_mask: Optional[torch.Tensor] = None,
452
+ attention_mask: Optional[torch.Tensor] = None,
453
  system_prompt: Optional[str] = None,
454
  **generate_kwargs,
455
  ) -> torch.Tensor:
 
461
  """
462
  if input_features is None:
463
  raise ValueError("input_features required for generation")
464
+ if audio_attention_mask is None:
465
+ raise ValueError("audio_attention_mask required for generation")
466
 
467
  device = input_features.device
468
  batch_size = input_features.shape[0]
 
472
 
473
  # If input_ids not provided, build prompt with correct number of audio tokens
474
  if input_ids is None:
475
+ num_audio_tokens = self._get_num_audio_tokens(audio_attention_mask)
476
  audio_placeholder = "<audio>" * num_audio_tokens
477
 
478
  system_prompt = system_prompt or self.system_prompt
 
482
  messages.append({"role": "system", "content": system_prompt})
483
  messages.append({"role": "user", "content": self.TRANSCRIBE_PROMPT + audio_placeholder})
484
 
485
+ chat_result = self.tokenizer.apply_chat_template(
486
  messages,
487
  tokenize=True,
488
  add_generation_prompt=True,
489
  return_tensors="pt",
490
+ )
491
+ input_ids = chat_result.input_ids.to(device)
492
 
493
  if input_ids.dim() == 1:
494
  input_ids = input_ids.unsqueeze(0)
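A hedged sketch of the new token-count path that replaces the old fixed mel_len // 4 rule: real mel frames from the attention mask, through the conv output-length formula, into the projector's get_output_length. The projector method is assumed only because the diff calls it; conv_layers and the tensors here are placeholders.

import torch

def num_audio_tokens(audio_attention_mask: torch.Tensor, conv_layers, projector) -> int:
    # Real (unpadded) mel frames per sample, then the conv output-length formula per layer.
    lengths = audio_attention_mask.sum(dim=-1)
    for padding, kernel_size, stride in conv_layers:
        lengths = (lengths + 2 * padding - (kernel_size - 1) - 1) // stride + 1
    # One shared count for the batch, as _get_num_audio_tokens does for generation.
    return int(projector.get_output_length(int(lengths.max().item())))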
asr_pipeline.py CHANGED
@@ -1,5 +1,8 @@
 
 
1
  from typing import Any
2
 
 
3
  import torch
4
  import transformers
5
 
@@ -9,6 +12,284 @@ except ImportError:
9
  from asr_modeling import ASRModel # type: ignore[no-redef]
10
 
11
 
 
 
12
  class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
13
  """ASR Pipeline for audio-to-text transcription."""
14
 
@@ -24,6 +305,131 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
24
  super().__init__(
25
  model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, **kwargs
26
  )
 
 
27
 
28
  def preprocess(self, inputs, **preprocess_params):
29
  # Handle dict with "array" key (from datasets)
@@ -42,15 +448,12 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
42
  # Extract audio features and is_last flag
43
  is_last = model_inputs.pop("is_last", True) if isinstance(model_inputs, dict) else True
44
 
45
- if isinstance(model_inputs, dict):
46
- input_features = model_inputs.get("input_features")
47
- if input_features is not None:
48
- input_features = input_features.to(self.model.device)
49
- else:
50
- input_features = model_inputs.to(self.model.device)
51
 
52
  generated_ids = self.model.generate(
53
  input_features=input_features,
 
54
  **generate_kwargs,
55
  )
56
 
@@ -71,4 +474,34 @@ class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
71
  tokens = tokens[0]
72
 
73
  text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
 
 
 
 
74
  return {"text": text}
 
 
1
+ import re
2
+ from pathlib import Path
3
  from typing import Any
4
 
5
+ import numpy as np
6
  import torch
7
  import transformers
8
 
 
12
  from asr_modeling import ASRModel # type: ignore[no-redef]
13
 
14
 
15
+ class ForcedAligner:
16
+ """Lazy-loaded forced aligner for word-level timestamps using torchaudio wav2vec2."""
17
+
18
+ _bundle = None
19
+ _model = None
20
+ _labels = None
21
+ _dictionary = None
22
+
23
+ @classmethod
24
+ def get_instance(cls, device: str = "cuda"):
25
+ if cls._model is None:
26
+ import torchaudio
27
+
28
+ cls._bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
29
+ cls._model = cls._bundle.get_model().to(device)
30
+ cls._model.eval()
31
+ cls._labels = cls._bundle.get_labels()
32
+ cls._dictionary = {c: i for i, c in enumerate(cls._labels)}
33
+ return cls._model, cls._labels, cls._dictionary
34
+
35
+ @classmethod
36
+ def align(
37
+ cls,
38
+ audio: np.ndarray,
39
+ text: str,
40
+ sample_rate: int = 16000,
41
+ language: str = "eng",
42
+ batch_size: int = 16,
43
+ ) -> list[dict]:
44
+ """Align transcript to audio and return word-level timestamps.
45
+
46
+ Args:
47
+ audio: Audio waveform as numpy array
48
+ text: Transcript text to align
49
+ sample_rate: Audio sample rate (default 16000)
50
+ language: ISO-639-3 language code (default "eng" for English, unused)
51
+ batch_size: Batch size for alignment model (unused)
52
+
53
+ Returns:
54
+ List of dicts with 'word', 'start', 'end' keys
55
+ """
56
+ import torchaudio
57
+ from torchaudio.functional import forced_align, merge_tokens
58
+
59
+ device = "cuda" if torch.cuda.is_available() else "cpu"
60
+ model, labels, dictionary = cls.get_instance(device)
61
+
62
+ # Convert audio to tensor (copy to ensure array is writable)
63
+ if isinstance(audio, np.ndarray):
64
+ waveform = torch.from_numpy(audio.copy()).float()
65
+ else:
66
+ waveform = audio.clone().float()
67
+
68
+ # Ensure 2D (channels, time)
69
+ if waveform.dim() == 1:
70
+ waveform = waveform.unsqueeze(0)
71
+
72
+ # Resample if needed (wav2vec2 expects 16kHz)
73
+ if sample_rate != cls._bundle.sample_rate:
74
+ waveform = torchaudio.functional.resample(
75
+ waveform, sample_rate, cls._bundle.sample_rate
76
+ )
77
+
78
+ waveform = waveform.to(device)
79
+
80
+ # Get emissions from model
81
+ with torch.inference_mode():
82
+ emissions, _ = model(waveform)
83
+ emissions = torch.log_softmax(emissions, dim=-1)
84
+
85
+ emission = emissions[0].cpu()
86
+
87
+ # Normalize text: uppercase, keep only valid characters
88
+ transcript = text.upper()
89
+ # Build tokens from transcript
90
+ tokens = []
91
+ for char in transcript:
92
+ if char in dictionary:
93
+ tokens.append(dictionary[char])
94
+ elif char == " ":
95
+ tokens.append(dictionary.get("|", dictionary.get(" ", 0)))
96
+
97
+ if not tokens:
98
+ return []
99
+
100
+ targets = torch.tensor([tokens], dtype=torch.int32)
101
+
102
+ # Run forced alignment
103
+ # Note: forced_align is deprecated in torchaudio 2.6+ and will be removed in 2.9 (late 2025)
104
+ # No official replacement announced yet. See https://github.com/pytorch/audio/issues/3902
105
+ aligned_tokens, scores = forced_align(emission.unsqueeze(0), targets, blank=0)
106
+
107
+ # Use torchaudio's merge_tokens to get token spans (removes blanks and merges repeats)
108
+ token_spans = merge_tokens(aligned_tokens[0], scores[0])
109
+
110
+ # Convert frame indices to time (model stride is 320 samples at 16kHz = 20ms)
111
+ frame_duration = 320 / cls._bundle.sample_rate
112
+
113
+ # Group token spans into words based on pipe separator
114
+ words = text.split()
115
+ word_timestamps = []
116
+ current_word_start = None
117
+ current_word_end = None
118
+ word_idx = 0
119
+
120
+ for span in token_spans:
121
+ token_char = labels[span.token]
122
+ if token_char == "|": # Word separator
123
+ if current_word_start is not None and word_idx < len(words):
124
+ word_timestamps.append(
125
+ {
126
+ "word": words[word_idx],
127
+ "start": current_word_start * frame_duration,
128
+ "end": current_word_end * frame_duration,
129
+ }
130
+ )
131
+ word_idx += 1
132
+ current_word_start = None
133
+ current_word_end = None
134
+ else:
135
+ if current_word_start is None:
136
+ current_word_start = span.start
137
+ current_word_end = span.end
138
+
139
+ # Don't forget the last word
140
+ if current_word_start is not None and word_idx < len(words):
141
+ word_timestamps.append(
142
+ {
143
+ "word": words[word_idx],
144
+ "start": current_word_start * frame_duration,
145
+ "end": current_word_end * frame_duration,
146
+ }
147
+ )
148
+
149
+ return word_timestamps
150
+
151
+
152
+ class SpeakerDiarizer:
153
+ """Lazy-loaded speaker diarization using pyannote-audio."""
154
+
155
+ _pipeline = None
156
+
157
+ @classmethod
158
+ def get_instance(cls, hf_token: str | None = None):
159
+ """Get or create the diarization pipeline.
160
+
161
+ Args:
162
+ hf_token: HuggingFace token with access to pyannote models.
163
+ Can also be set via HF_TOKEN environment variable.
164
+ """
165
+ if cls._pipeline is None:
166
+ from pyannote.audio import Pipeline
167
+
168
+ cls._pipeline = Pipeline.from_pretrained(
169
+ "pyannote/speaker-diarization-3.1",
170
+ )
171
+
172
+ # Move to GPU if available
173
+ if torch.cuda.is_available():
174
+ cls._pipeline.to(torch.device("cuda"))
175
+ elif torch.backends.mps.is_available():
176
+ cls._pipeline.to(torch.device("mps"))
177
+
178
+ return cls._pipeline
179
+
180
+ @classmethod
181
+ def diarize(
182
+ cls,
183
+ audio: np.ndarray | str,
184
+ sample_rate: int = 16000,
185
+ num_speakers: int | None = None,
186
+ min_speakers: int | None = None,
187
+ max_speakers: int | None = None,
188
+ hf_token: str | None = None,
189
+ ) -> list[dict]:
190
+ """Run speaker diarization on audio.
191
+
192
+ Args:
193
+ audio: Audio waveform as numpy array or path to audio file
194
+ sample_rate: Audio sample rate (default 16000)
195
+ num_speakers: Exact number of speakers (if known)
196
+ min_speakers: Minimum number of speakers
197
+ max_speakers: Maximum number of speakers
198
+ hf_token: HuggingFace token for pyannote models
199
+
200
+ Returns:
201
+ List of dicts with 'speaker', 'start', 'end' keys
202
+ """
203
+ pipeline = cls.get_instance(hf_token)
204
+
205
+ # Prepare audio input
206
+ if isinstance(audio, np.ndarray):
207
+ # pyannote expects {"waveform": tensor, "sample_rate": int}
208
+ waveform = torch.from_numpy(audio).unsqueeze(0) # Add channel dim
209
+ if waveform.dim() == 1:
210
+ waveform = waveform.unsqueeze(0)
211
+ audio_input = {"waveform": waveform, "sample_rate": sample_rate}
212
+ else:
213
+ # File path
214
+ audio_input = audio
215
+
216
+ # Run diarization
217
+ diarization_args = {}
218
+ if num_speakers is not None:
219
+ diarization_args["num_speakers"] = num_speakers
220
+ if min_speakers is not None:
221
+ diarization_args["min_speakers"] = min_speakers
222
+ if max_speakers is not None:
223
+ diarization_args["max_speakers"] = max_speakers
224
+
225
+ diarization = pipeline(audio_input, **diarization_args)
226
+
227
+ # Handle different pyannote return types
228
+ # pyannote 3.x returns DiarizeOutput dataclass, older versions return Annotation
229
+ if hasattr(diarization, "itertracks"):
230
+ annotation = diarization
231
+ elif hasattr(diarization, "speaker_diarization"):
232
+ # pyannote 3.x DiarizeOutput dataclass
233
+ annotation = diarization.speaker_diarization
234
+ elif isinstance(diarization, tuple):
235
+ # Some versions return (annotation, embeddings) tuple
236
+ annotation = diarization[0]
237
+ else:
238
+ raise TypeError(f"Unexpected diarization output type: {type(diarization)}")
239
+
240
+ # Convert to simple format
241
+ segments = []
242
+ for turn, _, speaker in annotation.itertracks(yield_label=True):
243
+ segments.append(
244
+ {
245
+ "speaker": speaker,
246
+ "start": turn.start,
247
+ "end": turn.end,
248
+ }
249
+ )
250
+
251
+ return segments
252
+
253
+ @classmethod
254
+ def assign_speakers_to_words(
255
+ cls,
256
+ words: list[dict],
257
+ speaker_segments: list[dict],
258
+ ) -> list[dict]:
259
+ """Assign speaker labels to words based on timestamp overlap.
260
+
261
+ Args:
262
+ words: List of word dicts with 'word', 'start', 'end' keys
263
+ speaker_segments: List of speaker dicts with 'speaker', 'start', 'end' keys
264
+
265
+ Returns:
266
+ Words list with 'speaker' key added to each word
267
+ """
268
+ for word in words:
269
+ word_mid = (word["start"] + word["end"]) / 2
270
+
271
+ # Find the speaker segment that contains this word's midpoint
272
+ best_speaker = None
273
+ for seg in speaker_segments:
274
+ if seg["start"] <= word_mid <= seg["end"]:
275
+ best_speaker = seg["speaker"]
276
+ break
277
+
278
+ # If no exact match, find closest segment
279
+ if best_speaker is None and speaker_segments:
280
+ min_dist = float("inf")
281
+ for seg in speaker_segments:
282
+ seg_mid = (seg["start"] + seg["end"]) / 2
283
+ dist = abs(word_mid - seg_mid)
284
+ if dist < min_dist:
285
+ min_dist = dist
286
+ best_speaker = seg["speaker"]
287
+
288
+ word["speaker"] = best_speaker
289
+
290
+ return words
291
+
292
+
293
  class ASRPipeline(transformers.AutomaticSpeechRecognitionPipeline):
294
  """ASR Pipeline for audio-to-text transcription."""
295
 
 
305
  super().__init__(
306
  model=model, feature_extractor=feature_extractor, tokenizer=tokenizer, **kwargs
307
  )
308
+ self._current_audio = None
309
+
310
+ def _sanitize_parameters(self, **kwargs):
311
+ """Intercept our custom parameters before parent class validates them."""
312
+ # Remove our custom parameters so parent doesn't see them
313
+ kwargs.pop("return_timestamps", None)
314
+ kwargs.pop("return_speakers", None)
315
+ kwargs.pop("num_speakers", None)
316
+ kwargs.pop("min_speakers", None)
317
+ kwargs.pop("max_speakers", None)
318
+ kwargs.pop("hf_token", None)
319
+
320
+ return super()._sanitize_parameters(**kwargs)
321
+
322
+ def __call__(
323
+ self,
324
+ inputs,
325
+ **kwargs,
326
+ ):
327
+ """Transcribe audio with optional word-level timestamps and speaker diarization.
328
+
329
+ Args:
330
+ inputs: Audio input (file path, dict with array/sampling_rate, etc.)
331
+ return_timestamps: If True, return word-level timestamps using forced alignment
332
+ return_speakers: If True, return speaker labels for each word
333
+ num_speakers: Exact number of speakers (if known, for diarization)
334
+ min_speakers: Minimum number of speakers (for diarization)
335
+ max_speakers: Maximum number of speakers (for diarization)
336
+ hf_token: HuggingFace token for pyannote models (or set HF_TOKEN env var)
337
+ **kwargs: Additional arguments passed to the pipeline
338
+
339
+ Returns:
340
+ Dict with 'text' key, 'words' key if return_timestamps=True,
341
+ and speaker labels on words if return_speakers=True
342
+ """
343
+ # Extract our params before super().__call__ (which will also call _sanitize_parameters)
344
+ return_timestamps = kwargs.pop("return_timestamps", False)
345
+ return_speakers = kwargs.pop("return_speakers", False)
346
+ diarization_params = {
347
+ "num_speakers": kwargs.pop("num_speakers", None),
348
+ "min_speakers": kwargs.pop("min_speakers", None),
349
+ "max_speakers": kwargs.pop("max_speakers", None),
350
+ "hf_token": kwargs.pop("hf_token", None),
351
+ }
352
+
353
+ if return_speakers:
354
+ return_timestamps = True
355
+
356
+ # Store audio for timestamp alignment and diarization
357
+ if return_timestamps or return_speakers:
358
+ self._current_audio = self._extract_audio(inputs)
359
+
360
+ # Run standard transcription
361
+ result = super().__call__(inputs, **kwargs)
362
+
363
+ # Add timestamps if requested
364
+ if return_timestamps and self._current_audio is not None:
365
+ text = result.get("text", "")
366
+ if text:
367
+ try:
368
+ words = ForcedAligner.align(
369
+ self._current_audio["array"],
370
+ text,
371
+ sample_rate=self._current_audio.get("sampling_rate", 16000),
372
+ )
373
+ result["words"] = words
374
+ except Exception as e:
375
+ result["words"] = []
376
+ result["timestamp_error"] = str(e)
377
+ else:
378
+ result["words"] = []
379
+
380
+ # Add speaker diarization if requested
381
+ if return_speakers and self._current_audio is not None:
382
+ try:
383
+ # Run diarization
384
+ speaker_segments = SpeakerDiarizer.diarize(
385
+ self._current_audio["array"],
386
+ sample_rate=self._current_audio.get("sampling_rate", 16000),
387
+ **{k: v for k, v in diarization_params.items() if v is not None},
388
+ )
389
+ result["speaker_segments"] = speaker_segments
390
+
391
+ # Assign speakers to words
392
+ if result.get("words"):
393
+ result["words"] = SpeakerDiarizer.assign_speakers_to_words(
394
+ result["words"],
395
+ speaker_segments,
396
+ )
397
+ except Exception as e:
398
+ result["speaker_segments"] = []
399
+ result["diarization_error"] = str(e)
400
+
401
+ # Clean up
402
+ self._current_audio = None
403
+
404
+ return result
405
+
406
+ def _extract_audio(self, inputs) -> dict | None:
407
+ """Extract audio array from various input formats using HF utilities."""
408
+ from transformers.pipelines.audio_utils import ffmpeg_read
409
+
410
+ if isinstance(inputs, dict):
411
+ if "array" in inputs:
412
+ return {
413
+ "array": inputs["array"],
414
+ "sampling_rate": inputs.get("sampling_rate", 16000),
415
+ }
416
+ if "raw" in inputs:
417
+ return {
418
+ "array": inputs["raw"],
419
+ "sampling_rate": inputs.get("sampling_rate", 16000),
420
+ }
421
+ elif isinstance(inputs, str):
422
+ # File path - load audio using ffmpeg (same as HF pipeline)
423
+ with Path(inputs).open("rb") as f:
424
+ audio = ffmpeg_read(f.read(), sampling_rate=16000)
425
+ return {"array": audio, "sampling_rate": 16000}
426
+ elif isinstance(inputs, bytes):
427
+ audio = ffmpeg_read(inputs, sampling_rate=16000)
428
+ return {"array": audio, "sampling_rate": 16000}
429
+ elif isinstance(inputs, np.ndarray):
430
+ return {"array": inputs, "sampling_rate": 16000}
431
+
432
+ return None
433
 
434
  def preprocess(self, inputs, **preprocess_params):
435
  # Handle dict with "array" key (from datasets)
 
448
  # Extract audio features and is_last flag
449
  is_last = model_inputs.pop("is_last", True) if isinstance(model_inputs, dict) else True
450
 
451
+ input_features = model_inputs["input_features"].to(self.model.device)
452
+ audio_attention_mask = model_inputs["attention_mask"].to(self.model.device)
 
 
 
 
453
 
454
  generated_ids = self.model.generate(
455
  input_features=input_features,
456
+ audio_attention_mask=audio_attention_mask,
457
  **generate_kwargs,
458
  )
459
 
 
474
  tokens = tokens[0]
475
 
476
  text = self.tokenizer.decode(tokens, skip_special_tokens=True).strip()
477
+ # Strip <think>...</think> tags (Qwen3 doesn't respect /no_think prompt)
478
+ text = re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL).strip()
479
+ # Truncate if a word repeats more than 3 times consecutively
480
+ text = self._truncate_repetitions(text, max_repeats=3)
481
  return {"text": text}
482
+
483
+ def _truncate_repetitions(self, text: str, max_repeats: int = 3) -> str:
484
+ """Truncate text when a word repeats more than max_repeats times consecutively.
485
+
486
+ Args:
487
+ text: Input text to check for repetitions
488
+ max_repeats: Maximum allowed consecutive repetitions (default 3)
489
+
490
+ Returns:
491
+ Truncated text if repetition detected, otherwise original text
492
+ """
493
+ words = text.split()
494
+ if len(words) <= max_repeats:
495
+ return text
496
+
497
+ repeat_count = 1
498
+ for i in range(1, len(words)):
499
+ if words[i].lower() == words[i - 1].lower():
500
+ repeat_count += 1
501
+ if repeat_count > max_repeats:
502
+ # Keep up to max_repeats of the repeated word
503
+ return " ".join(words[:i])
504
+ else:
505
+ repeat_count = 1
506
+
507
+ return text
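A hedged usage sketch of the extended pipeline; the checkpoint id and audio file are placeholders, and it assumes the custom pipeline is wired up via the repo's auto_map (plus torchaudio for timestamps, and pyannote-audio with an authorized HF token for diarization):

import transformers

asr = transformers.pipeline(
    "automatic-speech-recognition",
    model="your-namespace/your-asr-checkpoint",  # placeholder repo id
    trust_remote_code=True,
)
result = asr(
    "meeting.wav",
    return_timestamps=True,   # word-level timestamps via wav2vec2 forced alignment
    return_speakers=True,     # per-word speaker labels via pyannote diarization
    min_speakers=2,
    max_speakers=4,
)
print(result["text"])
for w in result.get("words", []):
    print(f'{w.get("speaker")}: {w["word"]} [{w["start"]:.2f}-{w["end"]:.2f}]')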
asr_processing.py CHANGED
@@ -18,11 +18,28 @@ class ASRProcessor(ProcessorMixin):
18
  tokenizer_class = "AutoTokenizer"
19
  AUDIO_TOKEN = "<audio>"
20
  TRANSCRIBE_PROMPT = "Transcribe: "
 
 
21
 
22
- def __init__(self, feature_extractor, tokenizer):
 
 
23
  self.feature_extractor = feature_extractor
24
  self.tokenizer = tokenizer
25
  self.audio_token_id = tokenizer.convert_tokens_to_ids(self.AUDIO_TOKEN)
 
 
26
 
27
  def __call__(
28
  self,
@@ -50,12 +67,17 @@ class ASRProcessor(ProcessorMixin):
50
  audio_inputs = self.feature_extractor(
51
  audio,
52
  sampling_rate=getattr(self.feature_extractor, "sampling_rate", 16000),
 
53
  return_tensors=return_tensors,
54
  **kwargs,
55
  )
56
  result["input_features"] = audio_inputs["input_features"]
57
- # Whisper encoder output length = mel_len // 2 (stride-2 conv)
58
- num_audio_tokens = audio_inputs["input_features"].shape[-1] // 2
 
 
 
 
59
  else:
60
  num_audio_tokens = 0
61
 
 
18
  tokenizer_class = "AutoTokenizer"
19
  AUDIO_TOKEN = "<audio>"
20
  TRANSCRIBE_PROMPT = "Transcribe: "
21
+ # Default conv layers for Whisper/GLM-ASR: [(pad, kernel, stride), ...]
22
+ DEFAULT_ENCODER_CONV_LAYERS = [(1, 3, 1), (1, 3, 2)]
23
 
24
+ def __init__(
25
+ self,
26
+ feature_extractor,
27
+ tokenizer,
28
+ projector=None,
29
+ encoder_conv_layers: Optional[list] = None,
30
+ ):
31
  self.feature_extractor = feature_extractor
32
  self.tokenizer = tokenizer
33
  self.audio_token_id = tokenizer.convert_tokens_to_ids(self.AUDIO_TOKEN)
34
+ self.projector = projector
35
+ self.encoder_conv_layers = encoder_conv_layers or self.DEFAULT_ENCODER_CONV_LAYERS
36
+
37
+ def _compute_encoder_output_length(self, mel_length: int) -> int:
38
+ """Compute encoder output length using conv layer formulas."""
39
+ length = mel_length
40
+ for padding, kernel_size, stride in self.encoder_conv_layers:
41
+ length = (length + 2 * padding - (kernel_size - 1) - 1) // stride + 1
42
+ return length
43
 
44
  def __call__(
45
  self,
 
67
  audio_inputs = self.feature_extractor(
68
  audio,
69
  sampling_rate=getattr(self.feature_extractor, "sampling_rate", 16000),
70
+ return_attention_mask=True,
71
  return_tensors=return_tensors,
72
  **kwargs,
73
  )
74
  result["input_features"] = audio_inputs["input_features"]
75
+ result["audio_attention_mask"] = audio_inputs["attention_mask"]
76
+
77
+ # Use actual audio length (from attention mask) for token count
78
+ real_mel_len = int(audio_inputs["attention_mask"].sum(dim=-1).max().item())
79
+ encoder_output_len = self._compute_encoder_output_length(real_mel_len)
80
+ num_audio_tokens = self.projector.get_output_length(encoder_output_len)
81
  else:
82
  num_audio_tokens = 0
83
 
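A small worked comparison (illustrative numbers, not from this commit) of the placeholder-token math this processor change fixes: the old count came from the padded mel length, the new one from the real length in the attention mask, run through the conv formula before the projector.

padded_mel_len, real_mel_len = 3000, 1000   # e.g. a 10 s clip padded to 30 s
old_tokens = padded_mel_len // 2            # old rule: mel_len // 2, ignores padding

length = real_mel_len
for padding, kernel_size, stride in [(1, 3, 1), (1, 3, 2)]:
    length = (length + 2 * padding - (kernel_size - 1) - 1) // stride + 1

# 1500 vs 500 encoder frames; projector.get_output_length(500) then gives the token count
print(old_tokens, length)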
chat_template.jinja CHANGED
@@ -1,94 +1,89 @@
1
- {# ───── defaults ───── #}
2
- {%- if enable_thinking is not defined -%}
3
- {%- set enable_thinking = true -%}
4
- {%- endif -%}
5
-
6
- {# ───── reasoning mode ───── #}
7
- {%- if enable_thinking -%}
8
- {%- set reasoning_mode = "/think" -%}
9
- {%- else -%}
10
- {%- set reasoning_mode = "/no_think" -%}
11
- {%- endif -%}
12
-
13
- {# ───── header (system message) ───── #}
14
- {{- "<|im_start|>system\n" -}}
15
-
16
- {%- if messages[0].role == "system" -%}
17
- {%- set system_message = messages[0].content -%}
18
- {%- if "/no_think" in system_message -%}
19
- {%- set reasoning_mode = "/no_think" -%}
20
- {%- elif "/think" in system_message -%}
21
- {%- set reasoning_mode = "/think" -%}
22
- {%- endif -%}
23
- {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
24
- {%- endif -%}
25
-
26
- {%- if "/system_override" in system_message -%}
27
- {{- custom_instructions.replace("/system_override", "").rstrip() -}}
28
- {{- "<|im_end|>\n" -}}
29
- {%- else -%}
30
- {{- "## Metadata\n\n" -}}
31
- {{- "Knowledge Cutoff Date: June 2025\n" -}}
32
- {%- set today = strftime_now("%d %B %Y") -%}
33
- {{- "Today Date: " ~ today ~ "\n" -}}
34
- {{- "Reasoning Mode: " + reasoning_mode + "\n\n" -}}
35
-
36
- {{- "## Custom Instructions\n\n" -}}
37
- {%- if custom_instructions -%}
38
- {{- custom_instructions + "\n\n" -}}
39
- {%- elif reasoning_mode == "/think" -%}
40
- {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracking, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> Thought section </think> Solution section. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion.\n\n" -}}
41
- {%- else -%}
42
- {{- "You are a helpful AI assistant named SmolLM, trained by Hugging Face.\n\n" -}}
43
- {%- endif -%}
44
-
45
- {%- if xml_tools or python_tools or tools -%}
46
- {{- "### Tools\n\n" -}}
47
- {%- if xml_tools or tools -%}
48
- {%- if tools -%}
49
- {%- set xml_tools = tools -%}
50
- {%- endif -%}
51
- {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
52
- {%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
53
- {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
54
- {%- endfor -%}
55
- {%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
56
- {{- xml_tool_string -}}
57
- {%- endif -%}
58
- {%- if python_tools -%}
59
- {%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
60
- {%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
61
- {%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
62
- {%- endfor -%}
63
- {%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
64
- {{- python_tool_string -}}
65
- {%- endif -%}
66
- {{- "\n\n" -}}
67
- {{- "<|im_end|>\n" -}}
68
- {%- endif -%}
69
- {%- endif -%}
70
- {# ───── main loop ───── #}
71
- {%- for message in messages -%}
72
- {%- set content = message.content if message.content is string else "" -%}
73
- {%- if message.role == "user" -%}
74
- {{ "<|im_start|>" + message.role + "\n" + content + "<|im_end|>\n" }}
75
- {%- elif message.role == "assistant" -%}
76
- {% generation %}
77
- {%- if reasoning_mode == "/think" -%}
78
- {{ "<|im_start|>assistant\n" + content.lstrip("\n") + "<|im_end|>\n" }}
79
- {%- else -%}
80
- {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" + content.lstrip("\n") + "<|im_end|>\n" }}
81
- {%- endif -%}
82
- {% endgeneration %}
83
- {%- elif message.role == "tool" -%}
84
- {{ "<|im_start|>" + "user\n" + content + "<|im_end|>\n" }}
85
- {%- endif -%}
86
- {%- endfor -%}
87
- {# ───── generation prompt ───── #}
88
- {%- if add_generation_prompt -%}
89
- {%- if reasoning_mode == "/think" -%}
90
- {{ "<|im_start|>assistant\n" }}
91
- {%- else -%}
92
- {{ "<|im_start|>assistant\n" + "<think>\n\n</think>\n" }}
93
- {%- endif -%}
94
- {%- endif -%}
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {{- messages[0].content + '\n\n' }}
5
+ {%- endif %}
6
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
7
+ {%- for tool in tools %}
8
+ {{- "\n" }}
9
+ {{- tool | tojson }}
10
+ {%- endfor %}
11
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
12
+ {%- else %}
13
+ {%- if messages[0].role == 'system' %}
14
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
15
+ {%- endif %}
16
+ {%- endif %}
17
+ {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
18
+ {%- for message in messages[::-1] %}
19
+ {%- set index = (messages|length - 1) - loop.index0 %}
20
+ {%- if ns.multi_step_tool and message.role == "user" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
21
+ {%- set ns.multi_step_tool = false %}
22
+ {%- set ns.last_query_index = index %}
23
+ {%- endif %}
24
+ {%- endfor %}
25
+ {%- for message in messages %}
26
+ {%- if message.content is string %}
27
+ {%- set content = message.content %}
28
+ {%- else %}
29
+ {%- set content = '' %}
30
+ {%- endif %}
31
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
32
+ {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
33
+ {%- elif message.role == "assistant" %}
34
+ {%- set reasoning_content = '' %}
35
+ {%- if message.reasoning_content is string %}
36
+ {%- set reasoning_content = message.reasoning_content %}
37
+ {%- else %}
38
+ {%- if '</think>' in content %}
39
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
40
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
41
+ {%- endif %}
42
+ {%- endif %}
43
+ {%- if loop.index0 > ns.last_query_index %}
44
+ {%- if loop.last or (not loop.last and reasoning_content) %}
45
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
46
+ {%- else %}
47
+ {{- '<|im_start|>' + message.role + '\n' + content }}
48
+ {%- endif %}
49
+ {%- else %}
50
+ {{- '<|im_start|>' + message.role + '\n' + content }}
51
+ {%- endif %}
52
+ {%- if message.tool_calls %}
53
+ {%- for tool_call in message.tool_calls %}
54
+ {%- if (loop.first and content) or (not loop.first) %}
55
+ {{- '\n' }}
56
+ {%- endif %}
57
+ {%- if tool_call.function %}
58
+ {%- set tool_call = tool_call.function %}
59
+ {%- endif %}
60
+ {{- '<tool_call>\n{"name": "' }}
61
+ {{- tool_call.name }}
62
+ {{- '", "arguments": ' }}
63
+ {%- if tool_call.arguments is string %}
64
+ {{- tool_call.arguments }}
65
+ {%- else %}
66
+ {{- tool_call.arguments | tojson }}
67
+ {%- endif %}
68
+ {{- '}\n</tool_call>' }}
69
+ {%- endfor %}
70
+ {%- endif %}
71
+ {{- '<|im_end|>\n' }}
72
+ {%- elif message.role == "tool" %}
73
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
74
+ {{- '<|im_start|>user' }}
75
+ {%- endif %}
76
+ {{- '\n<tool_response>\n' }}
77
+ {{- content }}
78
+ {{- '\n</tool_response>' }}
79
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
80
+ {{- '<|im_end|>\n' }}
81
+ {%- endif %}
82
+ {%- endif %}
83
+ {%- endfor %}
84
+ {%- if add_generation_prompt %}
85
+ {{- '<|im_start|>assistant\n' }}
86
+ {%- if enable_thinking is defined and enable_thinking is false %}
87
+ {{- '<think>\n\n</think>\n\n' }}
88
+ {%- endif %}
89
+ {%- endif %}
 
 
 
 
 
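A hedged sketch of driving the replacement template: the old SmolLM3 /think //no_think header logic is gone, and the new Qwen-style template reads enable_thinking and emits an empty think block when it is false. The tokenizer variable is a placeholder, and the extra keyword is assumed to reach the template through apply_chat_template.

prompt = tokenizer.apply_chat_template(
    [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Please transcribe this English audio into text: <audio>"},
    ],
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=False,  # template appends "<think>\n\n</think>\n\n" after the assistant header
)
print(prompt)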
config.json CHANGED
@@ -4,49 +4,126 @@
4
  ],
5
  "attn_implementation": "flash_attention_2",
6
  "audio_config": {
7
- "_name_or_path": "openai/whisper-large-v3-turbo",
8
- "activation_dropout": 0.0,
9
- "activation_function": "gelu",
10
- "apply_spec_augment": false,
11
  "architectures": [
12
- "WhisperForConditionalGeneration"
13
  ],
14
- "attention_dropout": 0.0,
15
- "bos_token_id": 50257,
16
- "classifier_proj_size": 256,
17
- "d_model": 1280,
18
- "decoder_attention_heads": 20,
19
- "decoder_ffn_dim": 5120,
20
- "decoder_layerdrop": 0.0,
21
- "decoder_layers": 4,
22
- "decoder_start_token_id": 50258,
23
- "dropout": 0.0,
 
 
24
  "dtype": "bfloat16",
25
- "encoder_attention_heads": 20,
26
- "encoder_ffn_dim": 5120,
27
- "encoder_layerdrop": 0.0,
28
- "encoder_layers": 32,
29
- "eos_token_id": 50257,
30
- "init_std": 0.02,
31
- "mask_feature_length": 10,
32
- "mask_feature_min_masks": 0,
33
- "mask_feature_prob": 0.0,
34
- "mask_time_length": 10,
35
- "mask_time_min_masks": 2,
36
- "mask_time_prob": 0.05,
37
- "max_source_positions": 1500,
38
- "max_target_positions": 448,
39
- "median_filter_width": 7,
40
- "model_type": "whisper",
41
- "num_hidden_layers": 32,
42
  "num_mel_bins": 128,
43
- "pad_token_id": 50257,
44
- "scale_embedding": false,
45
- "use_cache": true,
46
- "use_weighted_layer_sum": false,
47
- "vocab_size": 51866
 
 
48
  },
49
- "audio_model_id": "openai/whisper-large-v3-turbo",
50
  "audio_sample_rate": 16000,
51
  "auto_map": {
52
  "AutoConfig": "asr_config.ASRConfig",
@@ -64,17 +141,34 @@
64
  "type": "audio"
65
  }
66
  },
67
- "downsample_rate": 16,
68
  "dtype": "bfloat16",
 
 
69
  "encoder_dim": 1280,
70
- "inference_diversity_penalty": 0.0,
71
  "inference_warmup_tokens": 10,
72
  "label_smoothing": 0.0,
 
73
  "llm_dim": 2048,
74
- "max_new_tokens": 96,
75
- "min_new_tokens": 0,
 
 
 
76
  "model_dtype": "bfloat16",
77
  "model_type": "asr_model",
 
 
78
  "num_experts": 4,
79
  "num_experts_per_tok": 2,
80
  "pipeline_tag": "automatic-speech-recognition",
@@ -83,24 +177,30 @@
83
  "projector_init_std": 0.02,
84
  "projector_input_noise": 0.0,
85
  "projector_num_layers": 2,
86
- "projector_pool_stride": 2,
87
  "projector_type": "mlp",
88
  "router_aux_loss_coef": 0.01,
89
  "system_prompt": "/no_think /system_override",
90
  "text_config": {
91
- "_name_or_path": "HuggingFaceTB/SmolLM3-3B",
92
  "architectures": [
93
- "SmolLM3ForCausalLM"
94
  ],
95
  "attention_bias": false,
96
  "attention_dropout": 0.0,
97
- "bos_token_id": null,
98
  "dtype": "bfloat16",
99
- "eos_token_id": 128012,
 
100
  "hidden_act": "silu",
101
  "hidden_size": 2048,
102
  "initializer_range": 0.02,
103
- "intermediate_size": 11008,
104
  "layer_types": [
105
  "full_attention",
106
  "full_attention",
@@ -129,75 +229,31 @@
129
  "full_attention",
130
  "full_attention",
131
  "full_attention",
132
- "full_attention",
133
- "full_attention",
134
- "full_attention",
135
- "full_attention",
136
- "full_attention",
137
- "full_attention",
138
- "full_attention",
139
- "full_attention",
140
  "full_attention"
141
  ],
142
- "max_position_embeddings": 65536,
143
  "max_window_layers": 28,
144
- "mlp_bias": false,
145
- "model_type": "smollm3",
146
- "no_rope_layer_interval": 4,
147
- "no_rope_layers": [
148
- 1,
149
- 1,
150
- 1,
151
- 0,
152
- 1,
153
- 1,
154
- 1,
155
- 0,
156
- 1,
157
- 1,
158
- 1,
159
- 0,
160
- 1,
161
- 1,
162
- 1,
163
- 0,
164
- 1,
165
- 1,
166
- 1,
167
- 0,
168
- 1,
169
- 1,
170
- 1,
171
- 0,
172
- 1,
173
- 1,
174
- 1,
175
- 0,
176
- 1,
177
- 1,
178
- 1,
179
- 0,
180
- 1,
181
- 1,
182
- 1,
183
- 0
184
- ],
185
  "num_attention_heads": 16,
186
- "num_hidden_layers": 36,
187
- "num_key_value_heads": 4,
188
- "pretraining_tp": 2,
189
  "rms_norm_eps": 1e-06,
190
- "rope_scaling": null,
191
- "rope_theta": 5000000.0,
 
 
192
  "sliding_window": null,
193
- "use_cache": false,
 
194
  "use_sliding_window": false,
195
- "vocab_size": 128257
196
  },
197
- "text_model_id": "HuggingFaceTB/SmolLM3-3B",
198
- "transformers_version": "4.57.3",
199
  "use_cache": false,
 
200
  "use_specaugment": true,
201
- "user_prompt": "Transcribe: <audio>",
202
- "vocab_size": 128257
203
  }
 
4
  ],
5
  "attn_implementation": "flash_attention_2",
6
  "audio_config": {
7
+ "_name_or_path": "zai-org/GLM-ASR-Nano-2512",
8
  "architectures": [
9
+ "GlmAsrForConditionalGeneration"
10
  ],
11
+ "audio_config": {
12
+ "_name_or_path": "",
13
+ "add_cross_attention": false,
14
+ "architectures": null,
15
+ "attention_dropout": 0.0,
16
+ "bos_token_id": null,
17
+ "chunk_size_feed_forward": 0,
18
+ "cross_attention_hidden_size": null,
19
+ "decoder_start_token_id": null,
20
+ "dtype": null,
21
+ "eos_token_id": null,
22
+ "finetuning_task": null,
23
+ "head_dim": 64,
24
+ "hidden_act": "gelu",
25
+ "hidden_size": 1280,
26
+ "id2label": {
27
+ "0": "LABEL_0",
28
+ "1": "LABEL_1"
29
+ },
30
+ "initializer_range": 0.02,
31
+ "intermediate_size": 5120,
32
+ "is_decoder": false,
33
+ "is_encoder_decoder": false,
34
+ "label2id": {
35
+ "LABEL_0": 0,
36
+ "LABEL_1": 1
37
+ },
38
+ "max_position_embeddings": 1500,
39
+ "model_type": "glmasr_encoder",
40
+ "num_attention_heads": 20,
41
+ "num_hidden_layers": 32,
42
+ "num_key_value_heads": 20,
43
+ "num_mel_bins": 128,
44
+ "output_attentions": false,
45
+ "output_hidden_states": false,
46
+ "pad_token_id": null,
47
+ "partial_rotary_factor": 0.5,
48
+ "prefix": null,
49
+ "problem_type": null,
50
+ "return_dict": true,
51
+ "rope_parameters": {
52
+ "partial_rotary_factor": 0.5,
53
+ "rope_theta": 10000.0,
54
+ "rope_type": "default"
55
+ },
56
+ "sep_token_id": null,
57
+ "task_specific_params": null,
58
+ "tie_word_embeddings": true,
59
+ "tokenizer_class": null
60
+ },
61
+ "audio_token_id": 59260,
62
  "dtype": "bfloat16",
63
+ "hidden_size": 2048,
64
+ "model_type": "glmasr",
65
  "num_mel_bins": 128,
66
+ "projector_hidden_act": "gelu",
67
+ "text_config": {
68
+ "_name_or_path": "",
69
+ "add_cross_attention": false,
70
+ "architectures": null,
71
+ "attention_bias": false,
72
+ "attention_dropout": 0.0,
73
+ "bos_token_id": 1,
74
+ "chunk_size_feed_forward": 0,
75
+ "cross_attention_hidden_size": null,
76
+ "decoder_start_token_id": null,
77
+ "dtype": null,
78
+ "eos_token_id": [
79
+ 59246,
80
+ 59253,
81
+ 59255
82
+ ],
83
+ "finetuning_task": null,
84
+ "head_dim": 128,
85
+ "hidden_act": "silu",
86
+ "hidden_size": 2048,
87
+ "id2label": {
88
+ "0": "LABEL_0",
89
+ "1": "LABEL_1"
90
+ },
91
+ "initializer_range": 0.02,
92
+ "intermediate_size": 6144,
93
+ "is_decoder": false,
94
+ "is_encoder_decoder": false,
95
+ "label2id": {
96
+ "LABEL_0": 0,
97
+ "LABEL_1": 1
98
+ },
99
+ "max_position_embeddings": 8192,
100
+ "mlp_bias": false,
101
+ "model_type": "llama",
102
+ "num_attention_heads": 16,
103
+ "num_hidden_layers": 28,
104
+ "num_key_value_heads": 4,
105
+ "output_attentions": false,
106
+ "output_hidden_states": false,
107
+ "pad_token_id": null,
108
+ "prefix": null,
109
+ "pretraining_tp": 1,
110
+ "problem_type": null,
111
+ "return_dict": true,
112
+ "rms_norm_eps": 1e-05,
113
+ "rope_parameters": {
114
+ "rope_theta": 10000.0,
115
+ "rope_type": "default"
116
+ },
117
+ "sep_token_id": null,
118
+ "task_specific_params": null,
119
+ "tie_word_embeddings": false,
120
+ "tokenizer_class": null,
121
+ "use_cache": true,
122
+ "vocab_size": 59264
123
+ },
124
+ "vocab_size": 59264
125
  },
126
+ "audio_model_id": "zai-org/GLM-ASR-Nano-2512",
127
  "audio_sample_rate": 16000,
128
  "auto_map": {
129
  "AutoConfig": "asr_config.ASRConfig",
 
141
  "type": "audio"
142
  }
143
  },
144
+ "downsample_rate": 5,
145
  "dtype": "bfloat16",
146
+ "encoder_conv_layers": [
147
+ [
148
+ 1,
149
+ 3,
150
+ 1
151
+ ],
152
+ [
153
+ 1,
154
+ 3,
155
+ 2
156
+ ]
157
+ ],
158
  "encoder_dim": 1280,
 
159
  "inference_warmup_tokens": 10,
160
  "label_smoothing": 0.0,
161
+ "length_penalty": 1.0,
162
  "llm_dim": 2048,
163
+ "lora_alpha": 128,
164
+ "lora_dropout": 0.05,
165
+ "lora_r": 64,
166
+ "lora_target_modules": "all-linear",
167
+ "max_new_tokens": 256,
168
  "model_dtype": "bfloat16",
169
  "model_type": "asr_model",
170
+ "no_repeat_ngram_size": 0,
171
+ "num_beams": 1,
172
  "num_experts": 4,
173
  "num_experts_per_tok": 2,
174
  "pipeline_tag": "automatic-speech-recognition",
 
177
  "projector_init_std": 0.02,
178
  "projector_input_noise": 0.0,
179
  "projector_num_layers": 2,
180
+ "projector_pool_stride": 4,
181
  "projector_type": "mlp",
182
+ "qformer_hidden_size": null,
183
+ "qformer_intermediate_size": null,
184
+ "qformer_num_heads": 16,
185
+ "qformer_num_layers": 2,
186
+ "qformer_window_size": 15,
187
+ "repetition_penalty": 1.0,
188
  "router_aux_loss_coef": 0.01,
189
  "system_prompt": "/no_think /system_override",
190
  "text_config": {
191
+ "_name_or_path": "Qwen/Qwen3-1.7B",
192
  "architectures": [
193
+ "Qwen3ForCausalLM"
194
  ],
195
  "attention_bias": false,
196
  "attention_dropout": 0.0,
 
197
  "dtype": "bfloat16",
198
+ "eos_token_id": 151645,
199
+ "head_dim": 128,
200
  "hidden_act": "silu",
201
  "hidden_size": 2048,
202
  "initializer_range": 0.02,
203
+ "intermediate_size": 6144,
204
  "layer_types": [
205
  "full_attention",
206
  "full_attention",
 
229
  "full_attention",
230
  "full_attention",
231
  "full_attention",
232
  "full_attention"
233
  ],
234
+ "max_position_embeddings": 40960,
235
  "max_window_layers": 28,
236
+ "model_type": "qwen3",
237
  "num_attention_heads": 16,
238
+ "num_hidden_layers": 28,
239
+ "num_key_value_heads": 8,
240
+ "pad_token_id": 151643,
241
  "rms_norm_eps": 1e-06,
242
+ "rope_parameters": {
243
+ "rope_theta": 1000000,
244
+ "rope_type": "default"
245
+ },
246
  "sliding_window": null,
247
+ "tie_word_embeddings": true,
248
+ "use_cache": true,
249
  "use_sliding_window": false,
250
+ "vocab_size": 151670
251
  },
252
+ "text_model_id": "Qwen/Qwen3-1.7B",
253
+ "transformers_version": "5.0.0.dev0",
254
  "use_cache": false,
255
+ "use_lora": true,
256
  "use_specaugment": true,
257
+ "user_prompt": "Please transcribe this English audio into text: <audio>",
258
+ "vocab_size": 151670
259
  }
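A minimal sketch of loading this config through the auto_map shown above (the local path is a placeholder); fields printed are those listed in this diff:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./checkpoint-500", trust_remote_code=True)
print(cfg.audio_model_id)                              # zai-org/GLM-ASR-Nano-2512
print(cfg.text_model_id)                               # Qwen/Qwen3-1.7B
print(cfg.projector_type, cfg.projector_pool_stride)   # mlp 4
print(cfg.max_new_tokens)                              # 256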
generation_config.json CHANGED
@@ -1,10 +1,14 @@
1
  {
2
- "bos_token_id": 128000,
3
- "eos_token_id": 128012,
4
- "max_new_tokens": 96,
5
- "pad_token_id": 128004,
6
- "temperature": null,
7
- "top_k": null,
8
- "top_p": null,
9
- "transformers_version": "4.57.3"
10
  }
 
1
  {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "length_penalty": 1.0,
8
+ "max_new_tokens": 256,
9
+ "no_repeat_ngram_size": 0,
10
+ "num_beams": 1,
11
+ "pad_token_id": 151643,
12
+ "repetition_penalty": 1.0,
13
+ "transformers_version": "5.0.0.dev0"
14
  }
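A minimal sketch of reading the updated generation defaults (path is a placeholder); with a list of EOS ids, decoding stops at whichever appears first:

from transformers import GenerationConfig

gen = GenerationConfig.from_pretrained("./checkpoint-500")
print(gen.eos_token_id)    # [151645, 151643]
print(gen.max_new_tokens)  # 256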
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bc986c3239fe8c22e3ee77fac1eb766f6c4c55bf11d3910107ebbad8dddba637
3
- size 23462224
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f325deceeb565a0764abd09b46d51706bff2d643c0dd96b38070f246b0410de
3
+ size 58732960
preprocessor_config.json CHANGED
@@ -9,9 +9,9 @@
9
  "nb_max_frames": 3000,
10
  "padding_side": "right",
11
  "padding_value": 0.0,
12
- "processor_class": "ASRProcessor",
13
  "return_attention_mask": false,
14
  "sampling_rate": 16000,
 
15
  "auto_map": {
16
  "AutoProcessor": "asr_processing.ASRProcessor"
17
  }
 
9
  "nb_max_frames": 3000,
10
  "padding_side": "right",
11
  "padding_value": 0.0,
 
12
  "return_attention_mask": false,
13
  "sampling_rate": 16000,
14
+ "processor_class": "ASRProcessor",
15
  "auto_map": {
16
  "AutoProcessor": "asr_processing.ASRProcessor"
17
  }
projectors.py CHANGED
@@ -1,16 +1,18 @@
1
  """Audio projector modules for bridging encoder and decoder embeddings.
2
 
3
  This module contains all projector architectures:
4
- - MLPAudioProjector: Simple 2-layer MLP with conv downsampling
5
- - MoEAudioProjector: MOSA-style dense mixture of experts
6
- - SwiGLUAudioProjector: SwiGLU-based projector with temporal pooling
7
- - ResidualAudioProjector: Residual MLP blocks with linear projection
8
  - SharedMoEAudioProjector: Shared expert + sparse routed experts
 
9
  """
10
 
 
 
11
  import torch
12
  import torch.nn as nn
13
  import torch.nn.functional as F # noqa: N812
 
14
  from transformers.models.llama.modeling_llama import LlamaRMSNorm
15
 
16
  # =============================================================================
@@ -19,40 +21,36 @@ from transformers.models.llama.modeling_llama import LlamaRMSNorm
19
 
20
 
21
  class MLPAudioProjector(nn.Module):
22
- """2-layer MLP projector with conv-based 2x temporal downsampling."""
23
 
24
  def __init__(self, config):
25
  super().__init__()
26
 
27
  encoder_dim = getattr(config, "encoder_dim", 768)
28
  llm_dim = getattr(config, "llm_dim", 2048)
 
29
 
30
- self.downsample = nn.Conv1d(
31
- encoder_dim, encoder_dim, kernel_size=3, stride=2, padding=1, bias=False
32
- )
33
- self.linear_1 = nn.Linear(encoder_dim, llm_dim, bias=False)
 
34
  self.act = nn.GELU()
35
- self.linear_2 = nn.Linear(llm_dim, llm_dim, bias=False)
36
-
37
- self.apply(self._init_weights)
38
 
39
- def _init_weights(self, module):
40
- if isinstance(module, nn.Linear):
41
- nn.init.normal_(module.weight, mean=0.0, std=0.02)
42
- elif isinstance(module, nn.Conv1d):
43
- nn.init.normal_(module.weight, mean=0.0, std=0.02)
44
- if module.bias is not None:
45
- nn.init.zeros_(module.bias)
46
 
47
  def forward(self, x):
48
  """
49
  x: [Batch, Seq_Len, Dim]
50
- Returns: [Batch, Seq_Len // 2, llm_dim]
51
  """
52
- # Conv1d expects [Batch, Channels, Seq_Len]
53
- x = x.transpose(1, 2)
54
- x = self.downsample(x)
55
- x = x.transpose(1, 2)
56
 
57
  x = self.linear_1(x)
58
  x = self.act(x)
@@ -65,291 +63,146 @@ class MLPAudioProjector(nn.Module):
65
 
66
 
67
  class SimpleAdapter(nn.Module):
68
- """Simple adapter: Linear -> ReLU -> Dropout -> Linear."""
69
 
70
- def __init__(self, in_features, hidden_features, out_features, dropout=0.0):
71
  super().__init__()
72
- self.fc1 = nn.Linear(in_features, hidden_features)
73
- self.relu = nn.ReLU()
74
- self.dropout = nn.Dropout(dropout)
75
- self.fc2 = nn.Linear(hidden_features, out_features)
76
 
77
- def forward(self, x):
78
- x = self.fc1(x)
79
- x = self.relu(x)
80
- x = self.dropout(x)
81
- return self.fc2(x)
82
 
83
 
84
- class MoEAudioProjector(nn.Module):
85
- """
86
- MOSA-style projector: Mixture of Simple Adapters.
 
87
 
88
- From paper (arXiv:2508.18998):
89
- - Dense mixture (softmax over ALL experts) instead of sparse Top-K
90
- - Simple Linear->ReLU->Linear adapters
91
- - No auxiliary losses - just cross-entropy on transcripts
92
- - Conv downsampling: stride 4 total (two conv layers, stride 2 each)
93
- """
94
 
 
95
  def __init__(self, config):
96
  super().__init__()
 
97
 
98
- self.encoder_dim = config.encoder_dim
99
- self.llm_dim = config.llm_dim
100
- self.num_experts = getattr(config, "num_experts", 4)
101
- adapter_hidden = getattr(config, "projector_hidden_dim", None) or 4096
102
- self.dropout_rate = getattr(config, "projector_dropout", 0.1)
 
 
103
 
104
- # Convolutional Subsampling (stride 4 total)
105
  self.conv = nn.Sequential(
106
  nn.Conv1d(self.encoder_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
107
- nn.ReLU(),
108
  nn.Conv1d(self.llm_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
109
- nn.ReLU(),
110
  )
111
 
112
- # Router
113
- router_hidden = 512
114
  self.router = nn.Sequential(
115
- nn.Linear(self.encoder_dim, router_hidden),
  nn.ReLU(),
117
- nn.Linear(router_hidden, self.num_experts),
118
  )
119
 
120
- # Experts
121
  self.experts = nn.ModuleList(
122
  [
123
- SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim, dropout=self.dropout_rate)
124
  for _ in range(self.num_experts)
125
  ]
126
  )
127
 
128
- self.ln_post = LlamaRMSNorm(self.llm_dim, eps=1e-6)
129
- self._init_weights()
 
130
 
131
- def _init_weights(self):
132
- std = 0.02
133
- with torch.no_grad():
134
- for module in self.conv:
135
- if isinstance(module, nn.Conv1d):
136
- nn.init.normal_(module.weight, mean=0.0, std=std)
137
- if module.bias is not None:
138
- nn.init.zeros_(module.bias)
139
-
140
- for module in self.router:
141
- if isinstance(module, nn.Linear):
142
- nn.init.normal_(module.weight, mean=0.0, std=std)
143
- if module.bias is not None:
144
- nn.init.zeros_(module.bias)
145
-
146
- for expert in self.experts:
147
- nn.init.normal_(expert.fc1.weight, mean=0.0, std=std)
148
- nn.init.normal_(expert.fc2.weight, mean=0.0, std=std)
149
- if expert.fc1.bias is not None:
150
- nn.init.zeros_(expert.fc1.bias)
151
- if expert.fc2.bias is not None:
152
- nn.init.zeros_(expert.fc2.bias)
153
-
154
- self.ln_post.weight.data.fill_(1.0)
155
 
156
  def forward(self, x):
 
157
  batch_size, seq_len, _ = x.shape
158
 
159
- # Pad to be divisible by stride (4)
160
- pad_amt = (4 - (seq_len % 4)) % 4
161
- if pad_amt > 0:
162
- x = F.pad(x, (0, 0, 0, pad_amt))
163
- seq_len = x.shape[1]
164
-
165
- # Convolutional Downsampling
166
- h_conv = self.conv(x.permute(0, 2, 1)).permute(0, 2, 1)
167
-
168
- # Router on high-res input, then downsample weights
169
- router_logits = self.router(x)
170
- router_logits = router_logits.view(batch_size, seq_len // 4, 4, self.num_experts).mean(
171
- dim=2
172
- )
173
- routing_weights = F.softmax(router_logits, dim=-1)
174
-
175
- # Weighted sum of expert outputs
176
- final_out = torch.zeros_like(h_conv)
177
- for i, expert in enumerate(self.experts):
178
- expert_out = expert(h_conv)
179
- expert_weight = routing_weights[:, :, i : i + 1]
180
- final_out.add_(expert_out * expert_weight)
181
-
182
- return self.ln_post(final_out)
183
-
184
- def get_aux_loss(self) -> torch.Tensor:
185
- """Return auxiliary loss (none for dense MoE)."""
186
- return torch.tensor(0.0)
187
-
188
-
189
- # =============================================================================
190
- # SwiGLU Projector
191
- # =============================================================================
192
-
193
-
194
- class SwiGLU(nn.Module):
195
- def __init__(self, in_features, hidden_features, out_features, bias=False, dropout=0.0):
196
- super().__init__()
197
- self.w1 = nn.Linear(in_features, hidden_features, bias=bias)
198
- self.w2 = nn.Linear(in_features, hidden_features, bias=bias)
199
- self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
200
- self.act = nn.SiLU()
201
- self.dropout = nn.Dropout(dropout)
202
-
203
- def forward(self, x):
204
- x_gate = self.act(self.w1(x))
205
- x_val = self.w2(x)
206
- x = x_gate * x_val
207
- x = self.dropout(x)
208
- return self.w3(x)
209
-
210
-
211
- class SwiGLUAudioProjector(nn.Module):
212
- """SwiGLU-based projector with temporal pooling."""
213
-
214
- def __init__(self, config):
215
- super().__init__()
216
- self.k = getattr(config, "projector_pool_stride", 4)
217
- in_dim = config.encoder_dim * self.k
218
- out_dim = config.llm_dim
219
- hidden_dim = config.projector_hidden_dim
220
- if hidden_dim is None:
221
- hidden_dim = config.encoder_dim * 2
222
-
223
- dropout_rate = getattr(config, "projector_dropout", 0.0)
224
-
225
- self.proj1 = SwiGLU(in_dim, hidden_dim, hidden_dim, dropout=dropout_rate)
226
- self.proj2 = SwiGLU(hidden_dim, hidden_dim, out_dim, dropout=dropout_rate)
227
- self.output_dropout = nn.Dropout(dropout_rate)
228
-
229
- with torch.no_grad():
230
- std = getattr(config, "projector_init_std", 0.02)
231
- nn.init.normal_(self.proj1.w1.weight, mean=0.0, std=std)
232
- nn.init.normal_(self.proj1.w2.weight, mean=0.0, std=std)
233
- nn.init.normal_(self.proj1.w3.weight, mean=0.0, std=std)
234
- nn.init.normal_(self.proj2.w1.weight, mean=0.0, std=std)
235
- nn.init.normal_(self.proj2.w2.weight, mean=0.0, std=std)
236
- nn.init.normal_(self.proj2.w3.weight, mean=0.0, std=std)
237
-
238
- def forward(self, x):
239
- batch_size, seq_len, dim = x.size()
240
-
241
- target_dtype = self.proj1.w1.weight.dtype
242
- if x.dtype != target_dtype:
243
- x = x.to(target_dtype)
244
-
245
- remainder = seq_len % self.k
246
- if remainder:
247
- pad_len = self.k - remainder
248
- x = F.pad(x, (0, 0, 0, pad_len))
249
-
250
- x = x.contiguous().view(batch_size, -1, dim * self.k)
251
- x = self.proj1(x)
252
- x = self.proj2(x)
253
-
254
- return self.output_dropout(x)
255
-
256
-
257
- # Alias for backwards compatibility
258
- AudioProjector = SwiGLUAudioProjector
259
-
260
-
261
- # =============================================================================
262
- # Residual Projector
263
- # =============================================================================
264
-
265
-
266
- class ResidualMLP(nn.Module):
267
- """MLP block with residual connection: Output = x + MLP(x)."""
268
 
269
- def __init__(self, dim, hidden_dim, dropout=0.0):
270
- super().__init__()
271
- self.fc1 = nn.Linear(dim, hidden_dim)
272
- self.fc2 = nn.Linear(hidden_dim, dim)
273
- self.act = nn.GELU()
274
- self.dropout = nn.Dropout(dropout)
275
 
276
- def forward(self, x):
277
- residual = x
278
- x = self.fc1(x)
279
- x = self.act(x)
280
- x = self.dropout(x)
281
- x = self.fc2(x)
282
- x = self.dropout(x)
283
- return residual + x
284
 
 
286
- class ResidualAudioProjector(nn.Module):
287
- """Residual MLP projector for audio-to-LLM feature translation."""
288
 
289
- def __init__(self, config):
290
- super().__init__()
291
 
292
- self.k = getattr(config, "projector_pool_stride", 4)
293
- in_dim = config.encoder_dim * self.k
294
- out_dim = config.llm_dim
295
- hidden_dim = getattr(config, "projector_hidden_dim", None) or out_dim * 4
296
- self.num_layers = getattr(config, "projector_num_layers", 2)
297
- dropout_rate = getattr(config, "projector_dropout", 0.0)
298
 
299
- self.input_proj = nn.Linear(in_dim, out_dim)
300
- self.ln_input = LlamaRMSNorm(out_dim, eps=1e-6)
 
301
 
302
- self.layers = nn.ModuleList(
303
- [ResidualMLP(out_dim, hidden_dim, dropout=dropout_rate) for _ in range(self.num_layers)]
304
- )
305
- self.layer_norms = nn.ModuleList(
306
- [LlamaRMSNorm(out_dim, eps=1e-6) for _ in range(self.num_layers)]
307
- )
308
 
309
- self.output_dropout = nn.Dropout(dropout_rate)
310
- self._init_weights(config)
 
311
 
312
- def _init_weights(self, config):
313
- std = getattr(config, "projector_init_std", 0.02)
314
 
315
- with torch.no_grad():
316
- nn.init.normal_(self.input_proj.weight, mean=0.0, std=std)
317
- if self.input_proj.bias is not None:
318
- nn.init.zeros_(self.input_proj.bias)
319
-
320
- self.ln_input.weight.data.fill_(1.0)
321
- for ln in self.layer_norms:
322
- ln.weight.data.fill_(1.0)
323
-
324
- for layer in self.layers:
325
- nn.init.normal_(layer.fc1.weight, mean=0.0, std=std)
326
- nn.init.normal_(layer.fc2.weight, mean=0.0, std=std * 0.1)
327
- if layer.fc1.bias is not None:
328
- nn.init.zeros_(layer.fc1.bias)
329
- if layer.fc2.bias is not None:
330
- nn.init.zeros_(layer.fc2.bias)
331
 
332
- def forward(self, x):
333
- batch_size, seq_len, dim = x.size()
334
-
335
- target_dtype = self.input_proj.weight.dtype
336
- if x.dtype != target_dtype:
337
- x = x.to(target_dtype)
338
-
339
- remainder = seq_len % self.k
340
- if remainder:
341
- pad_len = self.k - remainder
342
- x = F.pad(x, (0, 0, 0, pad_len))
343
 
344
- x = x.contiguous().view(batch_size, -1, dim * self.k)
345
- x = self.input_proj(x)
346
- x = self.ln_input(x)
347
 
348
- for layer, ln in zip(self.layers, self.layer_norms):
349
- x = layer(x)
350
- x = ln(x)
351
 
352
- return self.output_dropout(x)
353
 
354
 
355
  # =============================================================================
@@ -357,22 +210,8 @@ class ResidualAudioProjector(nn.Module):
357
  # =============================================================================
358
 
359
 
360
- class SwiGLUExpert(nn.Module):
361
- """SwiGLU expert MLP."""
362
-
363
- def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
364
- super().__init__()
365
- self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
366
- self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
367
- self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)
368
- self.act = nn.SiLU()
369
-
370
- def forward(self, x: torch.Tensor) -> torch.Tensor:
371
- return self.down_proj(self.act(self.gate_proj(x)) * self.up_proj(x))
372
-
373
-
374
  class SharedMoEBlock(nn.Module):
375
- """MoE block with shared expert + sparse routed experts."""
376
 
377
  def __init__(
378
  self,
@@ -387,8 +226,11 @@ class SharedMoEBlock(nn.Module):
387
  self.top_k = top_k
388
  self.output_dim = output_dim
389
 
 
 
 
390
  self.router = nn.Linear(input_dim, num_experts, bias=False)
391
- nn.init.zeros_(self.router.weight)
392
 
393
  self.shared_expert = SwiGLUExpert(input_dim, hidden_dim, output_dim)
394
  self.experts = nn.ModuleList(
@@ -401,19 +243,28 @@ class SharedMoEBlock(nn.Module):
401
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
402
  batch_size, seq_len, dim = hidden_states.shape
403
 
404
- shared_out = self.shared_expert(hidden_states)
 
 
405
 
406
- flat_hidden = hidden_states.view(-1, dim)
 
407
  router_logits = self.router(flat_hidden)
408
- router_probs = F.softmax(router_logits.float(), dim=-1)
 
 
409
 
410
  self.last_router_logits = router_logits
411
  self.last_router_probs = router_probs
412
 
413
- top_k_weights, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
414
- top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)
 
 
 
415
  top_k_weights = top_k_weights.to(hidden_states.dtype)
416
 
 
417
  routed_out = self._dispatch_experts(flat_hidden, top_k_indices, top_k_weights)
418
  routed_out = routed_out.view(batch_size, seq_len, -1)
419
 
@@ -437,7 +288,7 @@ class SharedMoEBlock(nn.Module):
437
 
438
  token_indices, slot_indices = torch.where(expert_mask)
439
  expert_input = hidden_states[token_indices]
440
- expert_output = expert(expert_input)
441
  weights = top_k_weights[token_indices, slot_indices].unsqueeze(-1)
442
  output.index_add_(0, token_indices, expert_output * weights)
443
 
@@ -446,11 +297,9 @@ class SharedMoEBlock(nn.Module):
446
 
447
  def load_balancing_loss(router_probs: torch.Tensor, num_experts: int, top_k: int) -> torch.Tensor:
448
  """Auxiliary loss to encourage balanced expert usage."""
449
- _, selected = torch.topk(router_probs, top_k, dim=-1)
450
- expert_mask = F.one_hot(selected, num_experts).float()
451
- tokens_per_expert = expert_mask.mean(dim=(0, 1))
452
  prob_per_expert = router_probs.mean(dim=0)
453
- return (tokens_per_expert * prob_per_expert).sum() * num_experts
 
454
 
455
 
456
  def z_loss(router_logits: torch.Tensor) -> torch.Tensor:
@@ -465,8 +314,13 @@ class SharedMoEAudioProjector(nn.Module):
465
  super().__init__()
466
 
467
  self.k = getattr(config, "projector_pool_stride", 4)
468
-
469
  encoder_dim = config.encoder_dim
 
 
 
 
 
 
470
  in_dim = encoder_dim * self.k
471
  out_dim = config.llm_dim
472
  hidden_dim = getattr(config, "projector_hidden_dim", None) or in_dim
@@ -477,9 +331,9 @@ class SharedMoEAudioProjector(nn.Module):
477
  self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)
478
 
479
  self.moe = SharedMoEBlock(in_dim, hidden_dim, out_dim, self.num_experts, self.top_k)
480
- self._init_weights(in_dim)
481
 
482
- def _init_weights(self, in_dim: int):
483
  with torch.no_grad():
484
  nn.init.orthogonal_(self.moe.shared_expert.gate_proj.weight)
485
  nn.init.orthogonal_(self.moe.shared_expert.up_proj.weight)
@@ -490,6 +344,13 @@ class SharedMoEAudioProjector(nn.Module):
490
  nn.init.orthogonal_(expert.up_proj.weight)
491
  nn.init.orthogonal_(expert.down_proj.weight, gain=0.01)
492
 
 
 
 
 
 
 
 
493
  def forward(self, x: torch.Tensor) -> torch.Tensor:
494
  batch_size, seq_len, dim = x.size()
495
 
@@ -497,6 +358,11 @@ class SharedMoEAudioProjector(nn.Module):
497
  if x.dtype != target_dtype:
498
  x = x.to(target_dtype)
499
 
 
 
 
 
 
500
  if seq_len % self.k:
501
  x = F.pad(x, (0, 0, 0, self.k - seq_len % self.k))
502
 
@@ -514,14 +380,129 @@ class SharedMoEAudioProjector(nn.Module):
514
  return self.aux_loss_coef * balance + self.z_loss_coef * z
515
 
516
 
517
  # =============================================================================
518
  # Projector Registry
519
  # =============================================================================
520
 
521
  PROJECTOR_CLASSES = {
522
  "mlp": MLPAudioProjector,
523
- "moe": MoEAudioProjector,
524
- "swiglu": SwiGLUAudioProjector,
525
- "residual": ResidualAudioProjector,
526
  "shared_moe": SharedMoEAudioProjector,
 
527
  }
 
1
  """Audio projector modules for bridging encoder and decoder embeddings.
2
 
3
  This module contains all projector architectures:
4
+ - MLPAudioProjector: Simple 2-layer MLP with frame stacking downsampling
5
+ - MOSAProjector: MOSA-style dense mixture of experts
 
 
6
  - SharedMoEAudioProjector: Shared expert + sparse routed experts
7
+ - QFormerAudioProjector: BLIP-2 QFormer with learnable queries (Granite-style)
8
  """
9
 
10
+ import math
11
+
12
  import torch
13
  import torch.nn as nn
14
  import torch.nn.functional as F # noqa: N812
15
+ from transformers import AutoModel, Blip2QFormerConfig
16
  from transformers.models.llama.modeling_llama import LlamaRMSNorm
17
 
18
  # =============================================================================
 
21
 
22
 
23
  class MLPAudioProjector(nn.Module):
24
+ """2-layer MLP projector with frame-stacking downsampling (matches GLM-ASR)."""
25
 
26
  def __init__(self, config):
27
  super().__init__()
28
 
29
  encoder_dim = getattr(config, "encoder_dim", 768)
30
  llm_dim = getattr(config, "llm_dim", 2048)
31
+ self.k = getattr(config, "projector_pool_stride", 4)
32
 
33
+ # Frame stacking: concat k adjacent frames then project
34
+ # Matches GLM-ASR: in_dim -> 2*llm_dim -> llm_dim
35
+ in_dim = encoder_dim * self.k
36
+ hidden_dim = llm_dim * 2
37
+ self.linear_1 = nn.Linear(in_dim, hidden_dim)
38
  self.act = nn.GELU()
39
+ self.linear_2 = nn.Linear(hidden_dim, llm_dim)
 
 
40
 
41
+ def get_output_length(self, input_length: int) -> int:
42
+ """Calculate output sequence length given input length."""
43
+ return (input_length + self.k - 1) // self.k  # ceil division; forward pads to a multiple of k
44
 
45
  def forward(self, x):
46
  """
47
  x: [Batch, Seq_Len, Dim]
48
+ Returns: [Batch, Seq_Len // k, llm_dim]
49
  """
50
+ batch, seq, dim = x.shape
51
+ # Pad so seq is divisible by k; the reshape below requires it
+ if seq % self.k:
+ x = F.pad(x, (0, 0, 0, self.k - seq % self.k))
52
+ # Reshape to combine k frames: [B, S, D] -> [B, ceil(S/k), D*k]
+ # -1 infers the new sequence length, implicitly downsampling by factor k
53
+ x = x.reshape(batch, -1, dim * self.k)
54
 
55
  x = self.linear_1(x)
56
  x = self.act(x)
 
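A quick standalone shape check of the frame-stacking path above, using the dimensions from this config (k=4, encoder_dim=1280, llm_dim=2048):

import torch

k, encoder_dim, llm_dim = 4, 1280, 2048
x = torch.randn(2, 100, encoder_dim)             # [B, S, D]
stacked = x.reshape(2, -1, encoder_dim * k)      # [B, S//k, D*k] = [2, 25, 5120]
proj = torch.nn.Sequential(
    torch.nn.Linear(encoder_dim * k, llm_dim * 2),
    torch.nn.GELU(),
    torch.nn.Linear(llm_dim * 2, llm_dim),
)
print(proj(stacked).shape)  # torch.Size([2, 25, 2048])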
63
 
64
 
65
  class SimpleAdapter(nn.Module):
66
+ """Simple 2-layer ReLU adapter (from MOSA paper)."""
67
 
68
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
69
  super().__init__()
70
+ self.fc1 = nn.Linear(input_dim, hidden_dim)
71
+ self.act = nn.ReLU()
72
+ self.fc2 = nn.Linear(hidden_dim, output_dim)
 
73
 
74
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
75
+ return self.fc2(self.act(self.fc1(x)))
76
 
77
 
78
+ class SwiGLUExpert(nn.Module):
79
+ """SwiGLU expert (gated MLP with SiLU activation)."""
80
+
81
+ def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
82
+ super().__init__()
83
+ self.gate_proj = nn.Linear(input_dim, hidden_dim, bias=False)
84
+ self.up_proj = nn.Linear(input_dim, hidden_dim, bias=False)
85
+ self.down_proj = nn.Linear(hidden_dim, output_dim, bias=False)
86
+
87
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
88
+ return self.down_proj(F.silu(self.gate_proj(x)) * self.up_proj(x))
89
 
90
 
91
+ class MOSAProjector(nn.Module):
92
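+ """MOSA-style projector: dense mixture of simple adapters (softmax over all experts, no sparse top-k)."""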
  def __init__(self, config):
93
  super().__init__()
94
+ self.encoder_dim = getattr(config, "encoder_dim", None) or 1280
95
+ self.llm_dim = getattr(config, "llm_dim", None) or 2048
96
+ self.num_experts = getattr(config, "num_experts", None) or 8
97
+ adapter_hidden = getattr(config, "adapter_hidden_dim", None) or 4096  # ASRConfig defines projector_hidden_dim, so this usually falls back to 4096
98
 
99
+ # Auxiliary loss coefficients (MOSA paper uses only cross-entropy, no aux losses)
100
+ self.aux_loss_coef = getattr(config, "router_aux_loss_coef", 0.0)
101
+ self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.0)
102
+
103
+ # Store router state for aux loss computation
104
+ self.last_router_logits = None
105
+ self.last_routing_weights = None
106
 
107
+ # --- 1. Pre-Norms (CRITICAL for stability) ---
108
+ self.in_norm = LlamaRMSNorm(self.encoder_dim, eps=1e-8)
109
+
110
+ # --- 2. Convolutional Subsampling (Stride 4) ---
111
  self.conv = nn.Sequential(
112
  nn.Conv1d(self.encoder_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
113
+ nn.SiLU(),
114
  nn.Conv1d(self.llm_dim, self.llm_dim, kernel_size=3, stride=2, padding=1),
115
+ nn.SiLU(),
116
  )
117
 
118
+ # --- 3. Deep Router (ReLU per MOSA paper) ---
 
119
  self.router = nn.Sequential(
120
+ nn.Linear(self.encoder_dim, 2560),
121
+ nn.ReLU(),
122
+ nn.Linear(2560, 5120),
123
+ nn.ReLU(),
124
+ nn.Linear(5120, 2560),
125
+ nn.ReLU(),
126
+ nn.Linear(2560, 1280),
127
  nn.ReLU(),
128
+ nn.Linear(1280, self.num_experts),
129
  )
130
 
131
+ # --- 4. Experts (Simple 2-layer ReLU adapters per MOSA paper) ---
132
  self.experts = nn.ModuleList(
133
  [
134
+ SimpleAdapter(self.llm_dim, adapter_hidden, self.llm_dim)
135
  for _ in range(self.num_experts)
136
  ]
137
  )
138
 
139
+ # --- 5. Output Norm ---
140
+ # Projector outputs often drift in magnitude; this clamps them before the LLM.
141
+ self.out_norm = LlamaRMSNorm(self.llm_dim, eps=1e-8)
142
 
143
+ # Using PyTorch default initialization (like MOSA paper)
144
 
145
  def forward(self, x):
146
+ # x: (B, S, 1280)
147
  batch_size, seq_len, _ = x.shape
148
 
149
+ # Apply Input Norm
150
+ x = self.in_norm(x)
 
151
 
152
+ # --- 1. Conv Branch ---
153
+ x_trans = x.permute(0, 2, 1) # (B, D, S)
154
+ h_conv = self.conv(x_trans).permute(0, 2, 1) # (B, S//4, llm_dim)
155
 
156
+ # --- 2. Router Branch ---
157
+ pad_amt = (4 - (seq_len % 4)) % 4
158
+ x_padded = F.pad(x, (0, 0, 0, pad_amt)) if pad_amt > 0 else x
159
 
160
+ # Mean pool to align receptive fields
161
+ x_pooled = x_padded.view(batch_size, -1, 4, self.encoder_dim).mean(dim=2) # (B, S//4, D)
162
 
163
+ # Router Logits
164
+ router_logits = self.router(x_pooled) # (B, S//4, num_experts)
165
 
166
+ # Softmax for Dense MoE (Soft Mixing)
167
+ routing_weights = F.softmax(router_logits, dim=-1)
168
 
169
+ # Store for aux loss computation
170
+ self.last_router_logits = router_logits
171
+ self.last_routing_weights = routing_weights
172
 
173
+ # --- 3. Expert Mixture (Dense Execution) ---
174
+ # Warning: High VRAM usage. Runs all experts.
175
+ # h_conv: (B, S//4, llm_dim)
176
 
177
+ # Stack approach is clean but memory hungry.
178
+ # Checkpointing could be added here if OOM occurs.
179
+ expert_outputs = torch.stack([expert(h_conv) for expert in self.experts]) # (E, B, S//4, D)
180
 
181
+ # Weighted Sum
182
+ # (Experts, Batch, Seq, Dim) * (Batch, Seq, Experts) -> (Batch, Seq, Dim)
183
+ final_out = torch.einsum("ebsd, bse -> bsd", expert_outputs, routing_weights)
184
 
185
+ return self.out_norm(final_out)
 
186
 
187
+ def get_output_length(self, input_length: int) -> int:
188
+ """Calculate output sequence length given input length."""
189
+ # Two conv layers with stride=2 each = stride 4 total
190
+ padded = input_length + (4 - input_length % 4) % 4
191
+ return padded // 4
 
192
 
193
+ def get_aux_loss(self) -> torch.Tensor:
194
+ """Compute auxiliary losses: load balancing + z-loss."""
195
+ if self.last_router_logits is None:
196
+ return torch.tensor(0.0, device=self.conv[0].weight.device)
197
 
198
+ # Flatten for loss computation: (B, S, E) -> (B*S, E)
199
+ logits_flat = self.last_router_logits.view(-1, self.num_experts)
200
+ probs_flat = self.last_routing_weights.view(-1, self.num_experts)
201
 
202
+ balance = load_balancing_loss(probs_flat, self.num_experts, top_k=self.num_experts)
203
+ z = z_loss(logits_flat)
 
204
 
205
+ return self.aux_loss_coef * balance + self.z_loss_coef * z
206
 
207
 
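The dense mixture above weighs every expert's output by the router's softmax. A quick standalone check that the einsum matches an explicit weighted sum (tensor sizes are illustrative):

import torch

E, B, S, D = 4, 2, 10, 8
expert_outputs = torch.randn(E, B, S, D)
routing_weights = torch.softmax(torch.randn(B, S, E), dim=-1)

mixed = torch.einsum("ebsd, bse -> bsd", expert_outputs, routing_weights)
manual = sum(expert_outputs[i] * routing_weights[..., i : i + 1] for i in range(E))
print(torch.allclose(mixed, manual, atol=1e-6))  # True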
208
  # =============================================================================
 
210
  # =============================================================================
211
 
212
 
213
  class SharedMoEBlock(nn.Module):
214
+ """MoE block with Shared + Sigmoid-Routed Experts."""
215
 
216
  def __init__(
217
  self,
 
226
  self.top_k = top_k
227
  self.output_dim = output_dim
228
 
229
+ # RMSNorm before routing
230
+ self.norm = LlamaRMSNorm(input_dim, eps=1e-8)
231
+
232
  self.router = nn.Linear(input_dim, num_experts, bias=False)
233
+ nn.init.normal_(self.router.weight, mean=0.0, std=0.02)
234
 
235
  self.shared_expert = SwiGLUExpert(input_dim, hidden_dim, output_dim)
236
  self.experts = nn.ModuleList(
 
243
  def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
244
  batch_size, seq_len, dim = hidden_states.shape
245
 
246
+ # 1. Apply Shared Expert
247
+ normed_states = self.norm(hidden_states)
248
+ shared_out = self.shared_expert(normed_states)
249
 
250
+ # 2. Router Logic (Sigmoid Style)
251
+ flat_hidden = normed_states.view(-1, dim)
252
  router_logits = self.router(flat_hidden)
253
+
254
+ # Sigmoid routing
255
+ router_probs = torch.sigmoid(router_logits)
256
 
257
  self.last_router_logits = router_logits
258
  self.last_router_probs = router_probs
259
 
260
+ # 3. Top-K Selection
261
+ top_k_scores, top_k_indices = torch.topk(router_probs, self.top_k, dim=-1)
262
+
263
+ # Normalize weights
264
+ top_k_weights = top_k_scores / (top_k_scores.sum(dim=-1, keepdim=True) + 1e-6)
265
  top_k_weights = top_k_weights.to(hidden_states.dtype)
266
 
267
+ # 4. Dispatch
268
  routed_out = self._dispatch_experts(flat_hidden, top_k_indices, top_k_weights)
269
  routed_out = routed_out.view(batch_size, seq_len, -1)
270
 
 
288
 
289
  token_indices, slot_indices = torch.where(expert_mask)
290
  expert_input = hidden_states[token_indices]
291
+ expert_output = expert(expert_input).to(output.dtype)
292
  weights = top_k_weights[token_indices, slot_indices].unsqueeze(-1)
293
  output.index_add_(0, token_indices, expert_output * weights)
294
 
 
297
 
298
  def load_balancing_loss(router_probs: torch.Tensor, num_experts: int, top_k: int) -> torch.Tensor:
299
  """Auxiliary loss to encourage balanced expert usage."""
 
 
 
300
  prob_per_expert = router_probs.mean(dim=0)
301
+ target_mean = prob_per_expert.mean()
302
+ return (prob_per_expert - target_mean).square().sum() * num_experts
303
 
304
 
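A quick numeric check of this penalty, standalone: it vanishes for perfectly uniform routing and grows as routing collapses onto one expert:

import torch

probs = torch.tensor([[0.25, 0.25, 0.25, 0.25]])  # perfectly balanced
mean = probs.mean(dim=0)
print(((mean - mean.mean()).square().sum() * 4).item())  # 0.0

probs = torch.tensor([[0.7, 0.1, 0.1, 0.1]])      # collapsed routing
mean = probs.mean(dim=0)
print(((mean - mean.mean()).square().sum() * 4).item())  # 1.08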
305
  def z_loss(router_logits: torch.Tensor) -> torch.Tensor:
 
314
  super().__init__()
315
 
316
  self.k = getattr(config, "projector_pool_stride", 4)
 
317
  encoder_dim = config.encoder_dim
318
+
319
+ # Depthwise Conv for temporal mixing
320
+ self.temporal_conv = nn.Conv1d(
321
+ encoder_dim, encoder_dim, kernel_size=3, padding=1, groups=encoder_dim
322
+ )
323
+
324
  in_dim = encoder_dim * self.k
325
  out_dim = config.llm_dim
326
  hidden_dim = getattr(config, "projector_hidden_dim", None) or in_dim
 
331
  self.z_loss_coef = getattr(config, "router_z_loss_coef", 0.001)
332
 
333
  self.moe = SharedMoEBlock(in_dim, hidden_dim, out_dim, self.num_experts, self.top_k)
334
+ self._init_weights()
335
 
336
+ def _init_weights(self):
337
  with torch.no_grad():
338
  nn.init.orthogonal_(self.moe.shared_expert.gate_proj.weight)
339
  nn.init.orthogonal_(self.moe.shared_expert.up_proj.weight)
 
344
  nn.init.orthogonal_(expert.up_proj.weight)
345
  nn.init.orthogonal_(expert.down_proj.weight, gain=0.01)
346
 
347
+ def get_output_length(self, input_length: int) -> int:
348
+ """Calculate output sequence length given input length."""
349
+ # Temporal pooling with stride k
350
+ if input_length % self.k:
351
+ input_length += self.k - input_length % self.k
352
+ return input_length // self.k
353
+
354
  def forward(self, x: torch.Tensor) -> torch.Tensor:
355
  batch_size, seq_len, dim = x.size()
356
 
 
358
  if x.dtype != target_dtype:
359
  x = x.to(target_dtype)
360
 
361
+ # Temporal Context Injection
362
+ x_ctx = x.transpose(1, 2)
363
+ x_ctx = self.temporal_conv(x_ctx)
364
+ x = x + x_ctx.transpose(1, 2)
365
+
366
  if seq_len % self.k:
367
  x = F.pad(x, (0, 0, 0, self.k - seq_len % self.k))
368
 
 
380
  return self.aux_loss_coef * balance + self.z_loss_coef * z
381
 
382
 
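For reference, the sigmoid-routed top-k selection in SharedMoEBlock reduces to this standalone sketch (tensor sizes are illustrative):

import torch

logits = torch.randn(6, 4)                    # [tokens, experts]
probs = torch.sigmoid(logits)                 # independent per-expert scores
scores, idx = torch.topk(probs, k=2, dim=-1)  # pick the 2 strongest experts
weights = scores / (scores.sum(dim=-1, keepdim=True) + 1e-6)
print(weights.sum(dim=-1))                    # ~1.0 per token after renormalization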
383
+ # =============================================================================
384
+ # QFormer Projector (Granite-style)
385
+ # =============================================================================
386
+
387
+
388
+ class QFormerAudioProjector(nn.Module):
389
+ """
390
+ BLIP-2 QFormer projector with learnable queries.
391
+
392
+ Based on GraniteSpeechEncoderProjector - uses a QFormer model with learnable
393
+ query embeddings to compress and project audio encoder outputs. The audio
394
+ sequence is processed in windows and downsampled via cross-attention.
395
+ """
396
+
397
+ def __init__(self, config):
398
+ super().__init__()
399
+
400
+ encoder_dim = config.encoder_dim
401
+ llm_dim = config.llm_dim
402
+
403
+ # Window and downsampling parameters (Granite defaults: window=15, downsample=5)
404
+ self.window_size = getattr(config, "qformer_window_size", 15)
405
+ self.downsample_rate = getattr(config, "downsample_rate", 5)
406
+ self.num_queries = self.window_size // self.downsample_rate
407
+
408
+ # QFormer hidden size (matches encoder for cross-attention)
409
+ qformer_hidden = getattr(config, "qformer_hidden_size", None) or encoder_dim
410
+ qformer_num_layers = getattr(config, "qformer_num_layers", 2)
411
+ qformer_num_heads = getattr(config, "qformer_num_heads", 16)
412
+ qformer_intermediate = getattr(config, "qformer_intermediate_size", None) or (
413
+ qformer_hidden * 4
414
+ )
415
+
416
+ # Learnable query embeddings (Granite uses std=1.0)
417
+ self.query = nn.Parameter(torch.zeros(1, self.num_queries, qformer_hidden))
418
+ self.query.data.normal_(mean=0.0, std=1.0)
419
+
420
+ # Optional projection if encoder dim != qformer hidden
421
+ if encoder_dim != qformer_hidden:
422
+ self.encoder_proj = nn.Linear(encoder_dim, qformer_hidden, bias=False)
423
+ else:
424
+ self.encoder_proj = None
425
+
426
+ # Configure QFormer to match Granite's exact config
427
+ qformer_config = Blip2QFormerConfig(
428
+ hidden_size=qformer_hidden,
429
+ num_hidden_layers=qformer_num_layers,
430
+ num_attention_heads=qformer_num_heads,
431
+ intermediate_size=qformer_intermediate,
432
+ encoder_hidden_size=qformer_hidden,
433
+ cross_attention_frequency=1,
434
+ # Granite-specific settings
435
+ hidden_act="gelu",
436
+ attention_probs_dropout_prob=0.1,
437
+ hidden_dropout_prob=0.1,
438
+ layer_norm_eps=1e-12,
439
+ initializer_range=0.02,
440
+ )
441
+ self.qformer = AutoModel.from_config(qformer_config)
442
+
443
+ # Final projection to LLM dimension (Granite uses bias=True)
444
+ self.linear = nn.Linear(qformer_hidden, llm_dim)
445
+
446
+ def get_output_length(self, input_length: int) -> int:
447
+ """Calculate output sequence length given input length."""
448
+ # QFormer uses window-based processing with num_queries per window
449
+ nblocks = math.ceil(input_length / self.window_size)
450
+ return nblocks * self.num_queries
451
+
452
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
453
+ """
454
+ Args:
455
+ hidden_states: [batch_size, seq_len, encoder_dim]
456
+
457
+ Returns:
458
+ projected: [batch_size, num_output_tokens, llm_dim]
459
+ """
460
+ batch_size, seq_len, dim = hidden_states.size()
461
+
462
+ # Ensure float dtype for QFormer
463
+ target_dtype = self.query.dtype
464
+ if hidden_states.dtype != target_dtype:
465
+ hidden_states = hidden_states.to(target_dtype)
466
+
467
+ # Optional encoder projection
468
+ if self.encoder_proj is not None:
469
+ hidden_states = self.encoder_proj(hidden_states)
470
+
471
+ # Compute number of windows and pad to fit
472
+ nblocks = math.ceil(seq_len / self.window_size)
473
+ pad = nblocks * self.window_size - seq_len
474
+ if pad > 0:
475
+ hidden_states = F.pad(hidden_states, (0, 0, 0, pad), "constant", 0)
476
+
477
+ # Reshape to process each window: [batch*nblocks, window_size, dim]
478
+ effective_batch = batch_size * nblocks
479
+ hidden_states = hidden_states.view(effective_batch, self.window_size, -1)
480
+
481
+ # Expand queries to match batch size
482
+ query_embeds = self.query.expand(effective_batch, -1, -1)
483
+
484
+ # QFormer cross-attention
485
+ query_output = self.qformer(
486
+ query_embeds=query_embeds,
487
+ encoder_hidden_states=hidden_states,
488
+ return_dict=True,
489
+ )
490
+
491
+ # Reshape back: [batch, nblocks * num_queries, hidden]
492
+ output_tokens = nblocks * self.num_queries
493
+ query_proj = query_output.last_hidden_state.view(batch_size, output_tokens, -1)
494
+
495
+ # Project to LLM dimension
496
+ return self.linear(query_proj)
497
+
498
+
499
  # =============================================================================
500
  # Projector Registry
501
  # =============================================================================
502
 
503
  PROJECTOR_CLASSES = {
504
  "mlp": MLPAudioProjector,
505
+ "mosa": MOSAProjector,
 
 
506
  "shared_moe": SharedMoEAudioProjector,
507
+ "qformer": QFormerAudioProjector,
508
  }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4aeaf198f783cbf58d8cd59812baac429ffe49147bf9648f6618de20b8d4a4c
3
- size 17209003
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33b674fb8444e2553eae8f1b261093371920a28ef75b5c18f4deb3f9217ed0ba
3
+ size 11422834
tokenizer_config.json CHANGED
Binary files a/tokenizer_config.json and b/tokenizer_config.json differ