# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""HyperCLOVAX-SEED Audio Encoder model.

Extends WhisperEncoder with the following design choices:
  - Trained from random initialization with CTC loss (not using pretrained ASR weights)
  - Temporal pooling (Conv1d, kernel=5, stride=5) applied after the encoder
    to reduce output rate from 50 Hz to 10 Hz for multimodal integration

Acknowledgements:
  - Audio encoder uses WhisperEncoder from the HuggingFace transformers library
    (https://github.com/huggingface/transformers), Apache 2.0 License.
    Original Whisper model: https://github.com/openai/whisper (MIT License).
"""

from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
from transformers import AutoModel, PreTrainedModel, WhisperConfig
from transformers.modeling_outputs import BaseModelOutput

try:
    from transformers import WhisperEncoder
except ImportError:
    from transformers.models.whisper.modeling_whisper import WhisperEncoder

from .configuration_hyperclovax_seed_audio_encoder import HyperCLOVAXSeedAudioEncoderConfig


class HyperCLOVAXSeedAudioEncoder(PreTrainedModel):
    """Audio encoder based on WhisperEncoder with temporal pooling (50Hz -> 10Hz).

    The model runs a ``WhisperEncoder`` over mel-spectrogram features, then
    applies a strided ``Conv1d`` along the time axis followed by a
    ``LayerNorm`` over the feature dimension.
    """

    config_class = HyperCLOVAXSeedAudioEncoderConfig
    # The principal input is the mel spectrogram, not token ids.
    main_input_name = "input_features"
    supports_gradient_checkpointing = True

    def __init__(self, config: HyperCLOVAXSeedAudioEncoderConfig):
        """Build the Whisper encoder, the temporal pooling conv and the final norm.

        Args:
            config: Encoder configuration. The Whisper-related fields are
                copied into a ``WhisperConfig``; ``pool_kernel_size`` and
                ``pool_stride`` parameterize the temporal pooling layer.
        """
        super().__init__(config)

        # Whisper encoder (using HF WhisperEncoder directly)
        whisper_config = WhisperConfig(
            d_model=config.d_model,
            encoder_layers=config.encoder_layers,
            encoder_attention_heads=config.encoder_attention_heads,
            encoder_ffn_dim=config.encoder_ffn_dim,
            num_mel_bins=config.num_mel_bins,
            max_source_positions=config.max_source_positions,
            dropout=config.dropout,
            attention_dropout=config.attention_dropout,
        )
        self.encoder = WhisperEncoder(whisper_config)

        # Temporal pooling: 50Hz -> 10Hz
        self.temporal_pool = nn.Conv1d(
            config.d_model,
            config.d_model,
            kernel_size=config.pool_kernel_size,
            stride=config.pool_stride,
        )
        self.layer_norm = nn.LayerNorm(config.d_model)

        # Compatibility: modeling_vlm.py accesses audio_model.conv1.weight.{dtype,device}
        # NOTE: this registers the same module under a second name, so its
        # parameters appear under both ``conv1.*`` and ``encoder.conv1.*`` in
        # ``state_dict()``. Kept as-is so existing checkpoint keys stay valid.
        self.conv1 = self.encoder.conv1

        self.post_init()

    def _get_feat_extract_output_lengths(
        self, input_lengths: torch.LongTensor
    ) -> Tuple[torch.LongTensor, torch.LongTensor]:
        """Compute output sequence lengths after Whisper conv + temporal pooling.

        Whisper conv frontend:
            Conv1d(128, 768, k=3, s=1, p=1) -> same length
            Conv1d(768, 768, k=3, s=2, p=1) -> (L - 1) // 2 + 1
        Temporal pool:
            Conv1d(768, 768, k=pool_kernel_size, s=pool_stride) -> (L - k) // s + 1

        Args:
            input_lengths: (B,) number of valid mel frames per sample

        Returns:
            (feat_lengths, output_lengths) - encoder output lengths and
            post-pooling lengths. Post-pooling lengths are clamped at 0, so a
            sample shorter than the pooling kernel reports 0 frames instead
            of a negative count.
        """
        # After Whisper conv frontend (second conv has stride 2, padding 1)
        feat_lengths = (input_lengths - 1) // 2 + 1

        # After temporal pooling (no padding, so short inputs can yield <= 0)
        kernel = self.config.pool_kernel_size
        stride = self.config.pool_stride
        output_lengths = (feat_lengths - kernel) // stride + 1

        # A length can never be negative; this matters when kernel > stride.
        if isinstance(output_lengths, torch.Tensor):
            output_lengths = output_lengths.clamp(min=0)
        else:
            # Plain Python ints keep working, as they did with pure arithmetic.
            output_lengths = max(output_lengths, 0)

        return feat_lengths, output_lengths

    def forward(
        self,
        input_features: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """Encode mel spectrogram features and apply temporal pooling.

        Args:
            input_features: (B, num_mel_bins, T) mel spectrogram, e.g. (B, 128, 3000).
                Cast to the dtype of the encoder's first conv weight before encoding.
            attention_mask: (B, T) binary mask of valid mel frames; forwarded to
                WhisperEncoder.
            output_attentions: Forwarded to WhisperEncoder; when attentions are
                produced they are included in the output.
            output_hidden_states: Forwarded to WhisperEncoder; when hidden states
                are produced they are included in the output.
            return_dict: Whether to return a ``BaseModelOutput`` instead of a
                tuple. Defaults to ``self.config.use_return_dict``.

        Returns:
            BaseModelOutput with last_hidden_state of shape (B, T_10hz, d_model).
            ``hidden_states`` / ``attentions`` are those of the Whisper encoder,
            i.e. taken before temporal pooling. With ``return_dict=False`` a
            tuple is returned whose first element is the pooled hidden state,
            followed by whichever of those extras are not None.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Whisper encoder: (B, 128, 3000) -> (B, 1500, 768)
        input_features = input_features.to(self.encoder.conv1.weight.dtype)
        encoder_output = self.encoder(
            input_features,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        x = encoder_output.last_hidden_state

        # Temporal pooling: (B, 1500, 768) -> (B, 300, 768)
        x = x.transpose(1, 2)  # (B, d_model, T_enc)
        x = self.temporal_pool(x)  # (B, d_model, T_pool)
        x = x.transpose(1, 2)  # (B, T_pool, d_model)
        x = self.layer_norm(x)

        # Propagate what the caller asked the encoder for instead of dropping it.
        hidden_states = encoder_output.hidden_states
        attentions = encoder_output.attentions

        if not return_dict:
            # Stays ``(x,)`` when no extras were requested.
            return tuple(v for v in (x, hidden_states, attentions) if v is not None)
        return BaseModelOutput(
            last_hidden_state=x,
            hidden_states=hidden_states,
            attentions=attentions,
        )


AutoModel.register(HyperCLOVAXSeedAudioEncoderConfig, HyperCLOVAXSeedAudioEncoder)

__all__ = ["HyperCLOVAXSeedAudioEncoder"]