# File size: 7,535 Bytes
# Copyright 2026 Patrick Lumbantobing, Vertox-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Utility functions and classes for audio processing.
"""

from __future__ import annotations

import numpy as np
import numpy.typing as npt


def hz_to_mel(freq: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
    """
    Map frequencies from Hz onto the mel scale (HTK variant).

    Evaluates ``mel = 2595 * log10(1 + f / 700)`` element-wise.

    Args:
        freq: Frequencies in Hz.

    Returns:
        The corresponding values in mel.
    """
    ratio = freq / 700.0
    return np.log10(ratio + 1.0) * 2595.0


def mel_to_hz(mels: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]:
    """
    Map mel values back to frequencies in Hz (HTK variant).

    Evaluates ``f = 700 * (10 ** (mel / 2595) - 1)`` element-wise.

    Args:
        mels: Values on the mel scale.

    Returns:
        The corresponding frequencies in Hz.
    """
    exponent = mels / 2595.0
    return (10.0**exponent - 1.0) * 700.0


def librosa_style_mel_filterbank(
    *,
    sr: int,
    n_fft: int,
    n_mels: int,
    fmin: float,
    fmax: float | None = None,
    norm: str | None = "slaney",
) -> npt.NDArray[np.float32]:
    """
    Build a triangular mel filterbank in the style of librosa.filters.mel.

    Band edges are spaced uniformly on the mel scale produced by ``hz_to_mel``
    (HTK formula) between ``fmin`` and ``fmax``; each filter is a triangle over
    three consecutive edges, evaluated on the linearly spaced FFT bin
    frequencies from 0 to ``sr / 2``.

    NOTE(review): librosa's default mel scale is the Slaney scale (htk=False);
    numerical parity with librosa presumably requires ``htk=True`` - confirm.

    Args:
        sr: Sample rate.
        n_fft: FFT size.
        n_mels: Number of mel bins.
        fmin: Minimum frequency in Hz.
        fmax: Maximum frequency in Hz. If None, defaults to sr / 2.
        norm: If "slaney", scale every filter by ``2 / (right - left)`` (area
            normalization). Any other value leaves the triangles unscaled.

    Returns:
        Mel filterbank with shape [n_mels, n_fft // 2 + 1], dtype float32.
    """
    nyquist = sr / 2.0
    upper_hz = nyquist if fmax is None else fmax

    n_bins = n_fft // 2 + 1
    bin_hz = np.linspace(0.0, nyquist, n_bins, dtype=np.float64)

    # n_mels triangles need n_mels + 2 edges, equally spaced in mel.
    mel_lo = hz_to_mel(np.array([fmin], dtype=np.float64))[0]
    mel_hi = hz_to_mel(np.array([upper_hz], dtype=np.float64))[0]
    edges_hz = mel_to_hz(np.linspace(mel_lo, mel_hi, n_mels + 2, dtype=np.float64))

    bank = np.zeros((n_mels, n_bins), dtype=np.float64)
    triangles = zip(edges_hz[:-2], edges_hz[1:-1], edges_hz[2:])
    for row, (lower, peak, upper) in enumerate(triangles):
        # The small epsilon guards against a zero-width edge interval.
        rising = (bin_hz - lower) / (peak - lower + 1e-10)
        falling = (upper - bin_hz) / (upper - peak + 1e-10)
        bank[row] = np.maximum(0.0, np.minimum(rising, falling))

    if norm == "slaney":
        # Slaney-style area normalization: divide by half the band width in Hz.
        band_scale = 2.0 / (edges_hz[2:] - edges_hz[:-2])
        bank *= band_scale[:, None]

    return bank.astype(np.float32)


def dynamic_range_compression_np(
    x: npt.NDArray[np.float32],
    C: float = 1.0,
    clip_val: float = 1e-5,
) -> npt.NDArray[np.float32]:
    """
    NumPy equivalent of torch.log(torch.clamp(x, min=clip_val) * C).

    The input is clamped from below first and only then scaled by ``C``, so
    the smallest possible output is ``log(clip_val * C)``. For the default
    ``C = 1.0`` the order of clamping and scaling makes no difference.

    Args:
        x: Input array.
        C: Multiplicative constant applied after clamping.
        clip_val: Minimum allowed value of ``x`` before scaling and log.

    Returns:
        Log-compressed array, cast to float32.
    """
    # Clamp before scaling: clip(x * C) would floor the *scaled* value instead
    # and disagree with the documented torch expression whenever C != 1.
    clamped = np.maximum(x, clip_val)
    return np.log(clamped * C).astype(np.float32)


def _reflect_pad_1d(x: npt.NDArray[np.float32], pad: int) -> npt.NDArray[np.float32]:
    """
    Reflect-pad a [1, T] waveform along the time axis.

    Args:
        x: Waveform with shape [1, T].
        pad: Number of samples to pad on each side.

    Returns:
        Padded waveform with shape [1, T + 2 * pad].
    """
    if pad == 0:
        return x
    left = x[:, 1 : pad + 1][:, ::-1]
    right = x[:, -pad - 1 : -1][:, ::-1]
    return np.concatenate([left, x, right], axis=1)


def _stft_magnitude(
    y: npt.NDArray[np.float32],
    *,
    n_fft: int,
    hop_size: int,
    win_size: int,
    center: bool,
) -> npt.NDArray[np.float32]:
    """
    Compute magnitude STFT for a single-channel waveform.

    Args:
        y: Input waveform of shape [1, T].
        n_fft: FFT size.
        hop_size: Hop size between frames.
        win_size: Window size.
        center: Whether to pad the input before framing.

    Returns:
        Magnitude spectrogram with shape [1, frames, n_fft // 2 + 1].
    """
    if y.ndim != 2 or y.shape[0] != 1:
        raise ValueError("Expected waveform shape [1, T].")

    x = y.astype(np.float32, copy=False)

    if center:
        pad = n_fft // 2
        x = _reflect_pad_1d(x, pad)

    if x.shape[1] < n_fft:
        raise ValueError("Input is too short for the requested n_fft.")

    num_frames = 1 + (x.shape[1] - n_fft) // hop_size
    frame_starts = hop_size * np.arange(num_frames, dtype=np.int64)
    frame_offsets = np.arange(n_fft, dtype=np.int64)

    frames = x[:, frame_starts[:, None] + frame_offsets[None, :]]  # [1, frames, n_fft]

    window = np.hanning(win_size).astype(np.float32)
    if n_fft > win_size:
        pad_left = (n_fft - win_size) // 2
        pad_right = n_fft - win_size - pad_left
        window = np.pad(window, (pad_left, pad_right))
    elif n_fft < win_size:
        window = window[:n_fft]

    frames = frames * window[None, None, :]

    spec = np.fft.rfft(frames, n=n_fft, axis=-1)
    mag = np.sqrt(np.real(spec) ** 2 + np.imag(spec) ** 2 + 1e-9).astype(np.float32)
    return mag


def mel_spectrogram_numpy(
    y: npt.NDArray[np.float32],
    n_fft: int,
    num_mels: int,
    sampling_rate: int,
    hop_size: int,
    win_size: int,
    fmin: int,
    fmax: int | None = None,
    center: bool = False,
    clip_val: float = 1e-5,
) -> npt.NDArray[np.float32]:
    """
    Compute a log-mel spectrogram in pure NumPy.

    Pipeline:
      - mel filterbank from ``librosa_style_mel_filterbank`` with norm="slaney"
      - Hann-window STFT magnitude (not power) from ``_stft_magnitude``
      - projection of the magnitudes onto the mel filters
      - natural-log compression with values clipped from below at ``clip_val``

    Args:
        y: Waveform with shape [T] or [1, T].
        n_fft: FFT size.
        num_mels: Number of mel bins.
        sampling_rate: Sampling rate in Hz.
        hop_size: Hop size between frames.
        win_size: Window size.
        fmin: Minimum mel frequency in Hz.
        fmax: Maximum mel frequency in Hz. If None, defaults to sr / 2.
        center: Whether to pad the signal before framing.
        clip_val: Minimum value before log compression.

    Returns:
        Log-mel spectrogram with shape [1, frames, num_mels] (batch, time,
        mel), dtype float32.

    Raises:
        ValueError: If ``y`` is 2-D with a leading dimension other than 1, or
            has more than two dimensions. Errors raised by the STFT helper
            (e.g. input shorter than ``n_fft``) propagate unchanged.
    """
    if y.ndim == 1:
        y = np.expand_dims(y, axis=0)
    elif y.ndim == 2 and y.shape[0] != 1:
        raise ValueError("Expected waveform shape [1, T].")
    elif y.ndim > 2:
        raise ValueError("Expected waveform ndim <= 2.")

    mel_basis = librosa_style_mel_filterbank(
        sr=sampling_rate,
        n_fft=n_fft,
        n_mels=num_mels,
        fmin=float(fmin),
        fmax=float(fmax) if fmax is not None else None,
        norm="slaney",
    )  # [num_mels, n_fft // 2 + 1]

    spec = _stft_magnitude(
        y,
        n_fft=n_fft,
        hop_size=hop_size,
        win_size=win_size,
        center=center,
    )  # [1, frames, freq]

    # [1, frames, freq] @ [freq, num_mels] -> [1, frames, num_mels]; this yields
    # the time-major layout directly, with no transpose round trip.
    mel_spec = np.matmul(spec, mel_basis.T)

    # C is left at 1.0, so this is log(max(mel_spec, clip_val)) as float32.
    return dynamic_range_compression_np(mel_spec, clip_val=clip_val)