# coding=utf-8
# Copyright 2026 NAVER Cloud Corp. and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
HyperCLOVAX-SEED Audio Processor

Implements Whisper-compatible audio feature extraction:
- Log-mel spectrogram extraction from waveform
- Chunked processing for long audio clips
- Attention mask generation for padded sequences
- Discrete audio token count calculation (conv-based)
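
Example (illustrative; the random clip below stands in for real 16 kHz mono audio):

    import numpy as np

    processor = HyperCLOVAXSeedAudioProcessor()
    clip = np.random.randn(16000 * 5).astype(np.float32)  # 5 s clip
    features = processor([clip])
    # features["audio_values"]     -> (1, 128, 3000) log-mel features (one padded 30 s chunk)
    # features["audio_masks"]      -> (1, 3000) frame-level validity mask
    # features["num_audio_tokens"] -> tensor([300])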
"""

from typing import List, Optional, Tuple, Union

import numpy as np
import torch
try:
    from transformers.feature_extraction_utils import BatchFeature
except ImportError:
    from transformers import BatchFeature
try:
    from torchaudio.functional import melscale_fbanks as _melscale_fbanks
except (ImportError, AttributeError):
    # fallback: transformers mel_filter_bank wrapped to return torch.Tensor
    from transformers.audio_utils import mel_filter_bank as _mel_filter_bank
    def _melscale_fbanks(n_freqs, f_min, f_max, n_mels, sample_rate, norm, mel_scale):
        return torch.from_numpy(_mel_filter_bank(
            num_frequency_bins=n_freqs,
            num_mel_filters=n_mels,
            min_frequency=f_min,
            max_frequency=f_max,
            sampling_rate=sample_rate,
            norm=norm,
            mel_scale=mel_scale,
        ))
from transformers.feature_extraction_sequence_utils import SequenceFeatureExtractor
try:
    from transformers.processing_utils import AudioKwargs
except ImportError:
    from typing import TypedDict as AudioKwargs  # transformers < 4.46


def _conv_output_length(
    input_length: int,
    kernel_size: int = 3,
    stride: int = 2,
    padding: int = 1,
    dilation: int = 1,
) -> int:
    """Compute output length of a 1D convolution.

    Formula: (input + 2*padding - dilation*(kernel-1) - 1) // stride + 1
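
    Example with the default parameters (kernel_size=3, stride=2, padding=1)::

        >>> _conv_output_length(3000)   # one stride-2 conv roughly halves the length
        1500
        >>> _conv_output_length(_conv_output_length(3000))   # two stacked convs
        750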
    """
    return (input_length + 2 * padding - dilation * (kernel_size - 1) - 1) // stride + 1


class HyperCLOVAXSeedAudioKwargs(AudioKwargs, total=False):
    feature_size: Optional[int]
    hop_length: Optional[int]
    chunk_length: Optional[int]
    n_fft: Optional[int]
    n_samples: Optional[int]
    nb_max_frames: Optional[int]
    chunk_unit: Optional[int]
    min_chunk_size: Optional[int]
    dither: Optional[float]
    # Token parameters
    audio_token: Optional[str]
    audio_start_token: Optional[str]
    audio_end_token: Optional[str]
    # Discrete audio parameters
    use_discrete_token: Optional[bool]
    discrete_audio_token: Optional[str]
    discrete_audio_start_token: Optional[str]
    discrete_audio_end_token: Optional[str]


class HyperCLOVAXSeedAudioProcessor(SequenceFeatureExtractor):
    """Audio processor for HyperCLOVAX-SEED.

    Extracts Whisper-compatible log-mel spectrogram features and computes
    attention masks for the audio encoder. Also supports discrete audio
    token count calculation.
    """

    model_input_names = ["audio_values", "audio_masks", "discrete_audio_values"]

    def __init__(
        self,
        feature_size: int = 128,
        sampling_rate: int = 16000,
        hop_length: int = 160,
        chunk_length: int = 30,
        n_fft: int = 400,
        padding_value: float = 0.0,
        padding_side: str = "right",
        dither: float = 0.0,
        return_attention_mask: bool = False,
        n_samples: int = 480000,
        nb_max_frames: int = 3000,
        chunk_unit: int = 80,
        min_chunk_size: int = 1600,
        # Temporal pooling parameters
        pool_kernel_size: int = 5,
        pool_stride: int = 5,
        # Token parameters
        audio_token: str = "<|AUDIO_PAD|>",
        audio_start_token: str = "<|audio_start|>",
        audio_end_token: str = "<|audio_end|>",
        video_audio_pool_size: int = 25,
        # Discrete audio parameters
        use_discrete_token: bool = False,
        discrete_audio_token: str = "<|DISCRETE_AUDIO_PAD|>",
        discrete_audio_start_token: str = "<|discrete_audio_start|>",
        discrete_audio_end_token: str = "<|discrete_audio_end|>",
        **kwargs,
    ):
        super().__init__(
            feature_size=feature_size,
            sampling_rate=sampling_rate,
            hop_length=hop_length,
            chunk_length=chunk_length,
            n_fft=n_fft,
            padding_value=padding_value,
            padding_side=padding_side,
            dither=dither,
            return_attention_mask=return_attention_mask,
            n_samples=n_samples,
            nb_max_frames=nb_max_frames,
            chunk_unit=chunk_unit,
            min_chunk_size=min_chunk_size,
            # Token parameters
            audio_token=audio_token,
            audio_start_token=audio_start_token,
            audio_end_token=audio_end_token,
            video_audio_pool_size=video_audio_pool_size,
            pool_kernel_size=pool_kernel_size,
            pool_stride=pool_stride,
            # Discrete audio parameters
            use_discrete_token=use_discrete_token,
            discrete_audio_token=discrete_audio_token,
            discrete_audio_start_token=discrete_audio_start_token,
            discrete_audio_end_token=discrete_audio_end_token,
            **kwargs,
        )

        # Mel filter bank (Whisper-compatible) — torchaudio primary, transformers fallback
        self.mel_filters = _melscale_fbanks(
            n_freqs=1 + n_fft // 2,
            f_min=0.0,
            f_max=8000.0,
            n_mels=feature_size,
            sample_rate=sampling_rate,
            norm="slaney",
            mel_scale="slaney",
        )  # torch.Tensor, shape (n_freqs, n_mels)

    def _extract_fbank_features(
        self,
        waveform_batch: np.ndarray,
        device: str = "cpu",
    ) -> np.ndarray:
        """Extract log-mel spectrogram features from a waveform batch.

        Follows the OpenAI Whisper feature extraction pipeline.
        Reference: https://github.com/openai/whisper (MIT License)

        Adapted from WhisperFeatureExtractor._torch_extract_fbank_features.

        Args:
            waveform_batch: Waveform array of shape (batch_size, n_samples).
            device: Device for computation. Defaults to "cpu".

        Returns:
            Log-mel spectrogram of shape (batch_size, feature_size, num_frames).
        """
        waveform = torch.from_numpy(waveform_batch).to(device, torch.float32)
        window = torch.hann_window(self.n_fft, device=device)

        if self.dither != 0.0:
            # Avoid the in-place add: on CPU, `from_numpy(...).to(...)` can alias the caller's array.
            waveform = waveform + self.dither * torch.randn(waveform.shape, dtype=waveform.dtype, device=waveform.device)

        stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True)
        magnitudes = stft[..., :-1].abs() ** 2

        mel_filters = self.mel_filters.to(device=device, dtype=torch.float32)
        mel_spec = mel_filters.T @ magnitudes

        log_spec = torch.clamp(mel_spec, min=1e-10).log10()
        if waveform.dim() == 2:
            max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0]
            log_spec = torch.maximum(log_spec, max_val - 8.0)
        else:
            log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
        log_spec = (log_spec + 4.0) / 4.0

        if device != "cpu":
            log_spec = log_spec.detach().cpu()

        return log_spec.numpy()

    def _pad_and_extract_features(
        self,
        chunks: List[np.ndarray],
        sampling_rate: int,
    ) -> dict:
        """Pad audio chunks and extract mel-spectrogram features.

        Each chunk is padded to n_samples length, then mel-spectrogram is
        extracted and an attention mask is generated.

        Args:
            chunks: List of 1D numpy arrays, each representing an audio chunk.
            sampling_rate: Audio sampling rate.

        Returns:
            Dictionary with:
                - "input_features": Array of shape (num_chunks, feature_size, nb_max_frames).
                - "attention_mask": Array of shape (num_chunks, nb_max_frames).
        """
        n_samples = self.chunk_length * sampling_rate
        nb_max_frames = n_samples // self.hop_length

        padded_waveforms = []
        attention_masks = []

        for chunk in chunks:
            chunk = np.asarray(chunk, dtype=np.float32)
            chunk_len = len(chunk)

            # Pad or truncate
            if chunk_len < n_samples:
                padded = np.full(n_samples, self.padding_value, dtype=np.float32)
                padded[:chunk_len] = chunk
            else:
                padded = chunk[:n_samples]
                chunk_len = n_samples

            padded_waveforms.append(padded)

            # Attention mask (sample-level -> frame-level)
            sample_mask = np.zeros(n_samples, dtype=np.int32)
            sample_mask[:chunk_len] = 1
            frame_mask = sample_mask[:: self.hop_length]
            if len(frame_mask) > nb_max_frames:
                frame_mask = frame_mask[:nb_max_frames]
            elif len(frame_mask) < nb_max_frames:
                frame_mask = np.pad(frame_mask, (0, nb_max_frames - len(frame_mask)))
            attention_masks.append(frame_mask)

        waveform_batch = np.stack(padded_waveforms, axis=0)
        input_features = self._extract_fbank_features(waveform_batch)
        attention_mask = np.stack(attention_masks, axis=0)

        return {
            "input_features": input_features,
            "attention_mask": attention_mask,
        }

    def _get_feature_lengths(self, audio_masks: torch.Tensor) -> torch.Tensor:
        """Compute feature lengths after conv downsampling.

        Args:
            audio_masks: Attention mask of shape (batch, nb_max_frames).

        Returns:
            Feature lengths tensor of shape (batch,).
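
        For a fully valid 30 s chunk (mask sum 3000), this yields
        (3000 - 1) // 2 + 1 = 1500, i.e. one stride-2 convolution over the frames.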
        """
        return (audio_masks.sum(-1) - 1) // 2 + 1

    def _get_attention_mask(self, audio_masks: torch.Tensor) -> torch.Tensor:
        """Generate the padding attention mask for the audio encoder.

        Builds a key-padding mask, broadcast over all query positions, in which
        padded key positions are -inf and valid positions are 0.

        Args:
            audio_masks: Attention mask of shape (batch, nb_max_frames).

        Returns:
            Float attention mask of shape (batch, 1, max_seq_len, max_seq_len).
        """
        feature_lengths = self._get_feature_lengths(audio_masks=audio_masks)
        max_seq_len = (self.nb_max_frames - 2) // 2 + 1
        padding_mask = torch.arange(max_seq_len) >= feature_lengths.unsqueeze(1)
        padding_mask = padding_mask[:, None, None, :].expand(padding_mask.shape[0], 1, max_seq_len, max_seq_len)
        # masked_fill on a bool tensor cannot store -inf; fill a float tensor instead.
        attention_mask = torch.zeros_like(padding_mask, dtype=torch.float32)
        attention_mask = attention_mask.masked_fill(padding_mask, float("-inf"))
        return attention_mask

    def _preprocess_continuous_audio(
        self,
        audio_clips: List[np.ndarray],
        sampling_rate: Optional[int] = None,
        chunk_length: Optional[int] = None,
    ) -> dict:
        """Preprocess audio clips for continuous audio features.

        Splits each audio clip into chunks of chunk_length seconds, extracts
        mel-spectrogram features, and computes token counts from attention masks.

        Args:
            audio_clips: List of audio clips, each a 1D numpy array (mono, float32).
            sampling_rate: Audio sampling rate. Defaults to self.sampling_rate.
            chunk_length: Chunk duration in seconds. Defaults to self.chunk_length.

        Returns:
            Dictionary with:
                - "audio_values": Tensor of shape (num_total_chunks, feature_size, nb_max_frames).
                - "audio_masks": Tensor of shape (num_total_chunks, nb_max_frames).
                - "audio_attention_mask": Tensor of shape (num_total_chunks, max_seq_len, max_seq_len).
                - "num_audio_tokens": Tensor of shape (N,) with per-clip continuous token counts.
        """
        if sampling_rate is None:
            sampling_rate = self.sampling_rate
        if chunk_length is None:
            chunk_length = self.chunk_length

        if len(audio_clips) == 0:
            max_seq_len = (self.nb_max_frames - 2) // 2 + 1
            return {
                "audio_values": torch.zeros(0, self.feature_size, self.nb_max_frames),
                "audio_masks": torch.zeros(0, self.nb_max_frames),
                "audio_attention_mask": torch.zeros(0, max_seq_len, max_seq_len),
                "num_audio_tokens": torch.tensor([], dtype=torch.long),
            }

        _audio_values, _audio_masks, _num_audio_tokens = [], [], []
        for _audio in audio_clips:
            chunks = []
            chunk_samples = chunk_length * sampling_rate
            for i in range(0, len(_audio), chunk_samples):
                chunks.append(_audio[i : i + chunk_samples])

            result = self._pad_and_extract_features(chunks, sampling_rate)

            _audio_value = result["input_features"]
            _audio_mask = result["attention_mask"]
            _num_audio_token = 0
            for _mask in _audio_mask:
                # Every chunk is padded to nb_max_frames, so the count is fixed per chunk:
                # a stride-2 conv halves the frames, then temporal pooling reduces them again.
                _input_length = (_mask.shape[-1] - 1) // 2 + 1
                _num_audio_token += (_input_length - self.pool_kernel_size) // self.pool_stride + 1

            _audio_values.append(torch.from_numpy(_audio_value))
            _audio_masks.append(torch.from_numpy(_audio_mask))
            _num_audio_tokens.append(_num_audio_token)

        _audio_values = torch.cat(_audio_values, dim=0)
        _audio_masks = torch.cat(_audio_masks, dim=0)
        _audio_attention_mask = self._get_attention_mask(audio_masks=_audio_masks)

        return {
            "audio_values": _audio_values,
            "audio_masks": _audio_masks,
            "audio_attention_mask": _audio_attention_mask,
            "num_audio_tokens": torch.tensor(_num_audio_tokens, dtype=torch.long),
        }

    def _preprocess_discrete_audio(
        self,
        audio_clips: List[np.ndarray],
        sampling_rate: Optional[int] = None,
        chunk_unit: Optional[int] = None,
        min_chunk_size: Optional[int] = None,
    ) -> dict:
        """Preprocess audio clips for discrete audio tokens.

        Validates each audio clip and computes the number of discrete tokens
        based on conv layer downsampling. Returns padded waveform tensors.

        Args:
            audio_clips: List of audio clips, each a 1D numpy array (mono, float32).
            sampling_rate: Audio sampling rate. Defaults to self.sampling_rate.
            chunk_unit: Chunk duration in seconds for long audio. Defaults to self.chunk_unit.
            min_chunk_size: Minimum audio length in samples. Defaults to self.min_chunk_size.

        Returns:
            Dictionary with:
                - "discrete_audio_values": Tensor of shape (N, max_audio_len).
                - "num_discrete_audio_tokens": Tensor of shape (N,) with per-clip discrete token counts.
        """
        if sampling_rate is None:
            sampling_rate = self.sampling_rate
        if chunk_unit is None:
            chunk_unit = self.chunk_unit
        if min_chunk_size is None:
            min_chunk_size = self.min_chunk_size

        _discrete_audio_values, _num_discrete_audio_tokens = [], []
        for _audio in audio_clips:
            audio_length = len(_audio)
            max_audio_length = 600 * sampling_rate
            audio_duration_sec = audio_length / sampling_rate

            if audio_length < min_chunk_size:
                raise ValueError(f"Discrete audio too short: {audio_length}")
            if np.isnan(_audio).any() or np.isinf(_audio).any():
                raise ValueError("Discrete audio contains NaN/Inf")
            if audio_length > max_audio_length:
                raise ValueError(
                    f"Discrete audio too long: {audio_length} samples = ({audio_duration_sec:.2f}s > 600s)"
                )

            audio_min, audio_max = _audio.min().item(), _audio.max().item()
            if audio_min < -100.0 or audio_max > 100.0:
                raise ValueError(f"Discrete audio values out of range: min {audio_min}, max {audio_max}")

            if audio_length > chunk_unit * sampling_rate:
                total_code_len = 0
                chunk_size = chunk_unit * sampling_rate
                for start in range(0, audio_length, chunk_size):
                    end = min(start + chunk_size, audio_length)
                    if end < audio_length and audio_length - end < min_chunk_size:
                        end = audio_length
                    chunk_len = end - start
                    mel_len = chunk_len // self.hop_length
                    after_conv1 = _conv_output_length(mel_len)
                    code_len = _conv_output_length(after_conv1)
                    total_code_len += code_len
                    if end >= audio_length:
                        break
                _num_discrete = total_code_len
            else:
                mel_len = audio_length // self.hop_length
                after_conv1 = _conv_output_length(mel_len)
                code_len = _conv_output_length(after_conv1)
                _num_discrete = code_len

            _discrete_audio_values.append(torch.tensor(_audio))
            _num_discrete_audio_tokens.append(_num_discrete)

        return {
            # Pad variable-length clips to the longest so the batch stacks to (N, max_audio_len),
            # matching the documented return shape (torch.cat would flatten everything to 1D).
            "discrete_audio_values": torch.nn.utils.rnn.pad_sequence(
                _discrete_audio_values, batch_first=True, padding_value=self.padding_value
            ),
            "num_discrete_audio_tokens": torch.tensor(_num_discrete_audio_tokens, dtype=torch.long),
        }

    def preprocess(
        self,
        audios: List[np.ndarray],
        sampling_rate: Optional[int] = None,
        chunk_length: Optional[int] = None,
        chunk_unit: Optional[int] = None,
        min_chunk_size: Optional[int] = None,
        use_discrete_token: Optional[bool] = None,
        prefix: Optional[str] = None,
        **kwargs,
    ) -> BatchFeature:
        """Preprocess a list of audio clips.

        Resolves all kwargs at the entry point, then routes to
        ``_preprocess_continuous_audio`` and optionally
        ``_preprocess_discrete_audio``.

        Args:
            audios: List of audio clips, each a 1D numpy array.
            sampling_rate: Audio sampling rate. Defaults to self.sampling_rate.
            chunk_length: Chunk duration in seconds for continuous processing.
                Defaults to self.chunk_length.
            chunk_unit: Chunk duration in seconds for discrete processing.
                Defaults to self.chunk_unit.
            min_chunk_size: Minimum audio length in samples for discrete processing.
                Defaults to self.min_chunk_size.
            use_discrete_token: Whether to run discrete audio processing.
                Defaults to self.use_discrete_token.
            prefix: Optional string to prefix all output keys. Keys starting with
                ``"num_"`` get the prefix inserted after ``"num_"`` (e.g. prefix
                ``"video_"`` turns ``"num_audio_tokens"`` into
                ``"num_video_audio_tokens"``); all other keys are simply prepended
                (e.g. ``"audio_values"`` → ``"video_audio_values"``).
                ``None`` (default) leaves keys unchanged.

        Returns:
            BatchFeature with:
                - audio_values: Tensor of shape (num_total_chunks, feature_size, nb_max_frames).
                - audio_masks: Tensor of shape (num_total_chunks, nb_max_frames).
                - audio_attention_mask: Tensor of shape (num_total_chunks, 1, max_seq_len, max_seq_len).
                - num_audio_tokens: Tensor of shape (N,) with per-clip continuous token counts.
                - discrete_audio_values (optional): Tensor of shape (N, max_audio_len).
                - num_discrete_audio_tokens (optional): Tensor of shape (N,) with per-clip discrete token counts.

            All keys are renamed according to ``prefix`` when provided.
        """
        # 1. Resolve all kwargs at the entry point
        sampling_rate = sampling_rate if sampling_rate is not None else self.sampling_rate
        chunk_length = chunk_length if chunk_length is not None else self.chunk_length
        chunk_unit = chunk_unit if chunk_unit is not None else self.chunk_unit
        min_chunk_size = min_chunk_size if min_chunk_size is not None else self.min_chunk_size
        use_discrete = use_discrete_token if use_discrete_token is not None else self.use_discrete_token

        # 2. Route to continuous sub-processor
        continuous_result = self._preprocess_continuous_audio(
            audios,
            sampling_rate=sampling_rate,
            chunk_length=chunk_length,
        )
        data = {
            "audio_values": continuous_result["audio_values"],
            "audio_attention_mask": continuous_result["audio_attention_mask"],
            "audio_masks": continuous_result["audio_masks"],
            "num_audio_tokens": continuous_result["num_audio_tokens"],
        }

        # 3. Optionally route to discrete sub-processor
        if use_discrete:
            discrete_result = self._preprocess_discrete_audio(
                audios,
                sampling_rate=sampling_rate,
                chunk_unit=chunk_unit,
                min_chunk_size=min_chunk_size,
            )
            data["discrete_audio_values"] = discrete_result["discrete_audio_values"]
            data["num_discrete_audio_tokens"] = discrete_result["num_discrete_audio_tokens"]

        if prefix is not None:
            data = {
                (f"num_{prefix}{k[len('num_'):]}" if k.startswith("num_") else f"{prefix}{k}"): v
                for k, v in data.items()
            }

        return BatchFeature(data=data, tensor_type="pt")

    def __call__(self, audios: List[np.ndarray], **kwargs) -> BatchFeature:
        """Alias for :meth:`preprocess`."""
        return self.preprocess(audios, **kwargs)

    def get_num_audio_tokens(
        self,
        audio_masks: torch.Tensor,
        discrete_audio_values: Optional[torch.Tensor] = None,
        include_boundary_tokens: bool = False,
        chunk_unit: Optional[int] = None,
        sampling_rate: Optional[int] = None,
        return_tuple: Optional[bool] = None,
    ) -> Union[int, Tuple[int, int]]:
        """Compute the number of audio tokens for the given input.

        Args:
            audio_masks: Attention mask for continuous audio. Shape (N,) or (num_chunks, N).
            discrete_audio_values: Discrete audio waveform. None to skip discrete computation.
            include_boundary_tokens: Whether to include start/end boundary tokens.
            chunk_unit: Chunk duration in seconds for discrete processing.
                Defaults to self.chunk_unit.
            sampling_rate: Audio sampling rate. Defaults to self.sampling_rate.
            return_tuple: If True, return (continuous, discrete) tuple.
                Otherwise return the sum.

        Returns:
            Token count as int, or (continuous, discrete) tuple if return_tuple is True.
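
        For example, one padded 30 s chunk (mask length 3000) yields
        (3000 - 1) // 2 + 1 = 1500 encoder frames and (1500 - 5) // 5 + 1 = 300
        continuous tokens, or 302 with boundary tokens included.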
        """
        chunk_unit = chunk_unit if chunk_unit is not None else self.chunk_unit
        sampling_rate = sampling_rate if sampling_rate is not None else self.sampling_rate

        def _compute_continuous_tokens(audio_mask: torch.Tensor) -> int:
            input_length = (audio_mask.shape[-1] - 1) // 2 + 1
            return (input_length - self.pool_kernel_size) // self.pool_stride + 1

        num_continuous_tokens, num_discrete_tokens = 0, 0
        if len(audio_masks.shape) == 1:
            num_continuous_tokens = _compute_continuous_tokens(audio_masks)
        else:
            num_continuous_tokens = sum(_compute_continuous_tokens(m) for m in audio_masks)
        if include_boundary_tokens:
            num_continuous_tokens += 2

        if self.use_discrete_token and discrete_audio_values is not None:
            audio_length = len(discrete_audio_values)
            chunk_size = chunk_unit * sampling_rate
            for _start in range(0, audio_length, chunk_size):
                _end = min(_start + chunk_size, audio_length)
                _chunked_length = _end - _start
                mel_len = _chunked_length // self.hop_length
                after_conv1 = _conv_output_length(mel_len)
                code_len = _conv_output_length(after_conv1)
                num_discrete_tokens += code_len
            if include_boundary_tokens:
                num_discrete_tokens += 2

        if return_tuple:
            return (num_continuous_tokens, num_discrete_tokens)
        else:
            return num_continuous_tokens + num_discrete_tokens