# Copyright 2026 Patrick Lumbantobing, Vertox-AI # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Utilities functions and classes for audio processing. """ from __future__ import annotations import numpy as np import numpy.typing as npt def hz_to_mel(freq: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: """ Convert Hz to mel using the HTK formula. Args: freq: Frequencies in Hz. Returns: Frequencies in mel. """ return 2595.0 * np.log10(1.0 + freq / 700.0) def mel_to_hz(mels: npt.NDArray[np.float64]) -> npt.NDArray[np.float64]: """ Convert mel to Hz using the HTK formula. Args: mels: Values in mel. Returns: Frequencies in Hz. """ return 700.0 * (10.0 ** (mels / 2595.0) - 1.0) def librosa_style_mel_filterbank( *, sr: int, n_fft: int, n_mels: int, fmin: float, fmax: float | None = None, norm: str | None = "slaney", ) -> npt.NDArray[np.float32]: """ Build a mel filterbank compatible with librosa.filters.mel using Slaney normalization. Args: sr: Sample rate. n_fft: FFT size. n_mels: Number of mel bins. fmin: Minimum frequency in Hz. fmax: Maximum frequency in Hz. If None, defaults to sr / 2. norm: If "slaney", apply area normalization. Returns: Mel filterbank with shape [n_mels, n_fft // 2 + 1]. """ if fmax is None: fmax = sr / 2.0 n_freqs = n_fft // 2 + 1 freqs = np.linspace(0.0, sr / 2.0, n_freqs, dtype=np.float64) m_min = hz_to_mel(np.array([fmin], dtype=np.float64))[0] m_max = hz_to_mel(np.array([fmax], dtype=np.float64))[0] m_pts = np.linspace(m_min, m_max, n_mels + 2, dtype=np.float64) hz_pts = mel_to_hz(m_pts) fb = np.zeros((n_mels, n_freqs), dtype=np.float64) for i in range(n_mels): left, center, right = hz_pts[i], hz_pts[i + 1], hz_pts[i + 2] left_slope = (freqs - left) / (center - left + 1e-10) right_slope = (right - freqs) / (right - center + 1e-10) fb[i] = np.maximum(0.0, np.minimum(left_slope, right_slope)) if norm == "slaney": # Match Slaney-style area normalization used by librosa/torchaudio. enorm = 2.0 / (hz_pts[2:] - hz_pts[:-2]) fb *= enorm[:, None] return fb.astype(np.float32) def dynamic_range_compression_np( x: npt.NDArray[np.float32], C: float = 1.0, clip_val: float = 1e-5, ) -> npt.NDArray[np.float32]: """ NumPy equivalent of torch.log(torch.clamp(x, min=clip_val) * C). Args: x: Input array. C: Multiplicative constant. clip_val: Minimum allowed value before log. Returns: Log-compressed array. """ return np.log(np.clip(x * C, a_min=clip_val, a_max=None)).astype(np.float32) def _reflect_pad_1d(x: npt.NDArray[np.float32], pad: int) -> npt.NDArray[np.float32]: """ Reflect-pad a [1, T] waveform along the time axis. Args: x: Waveform with shape [1, T]. pad: Number of samples to pad on each side. Returns: Padded waveform with shape [1, T + 2 * pad]. """ if pad == 0: return x left = x[:, 1 : pad + 1][:, ::-1] right = x[:, -pad - 1 : -1][:, ::-1] return np.concatenate([left, x, right], axis=1) def _stft_magnitude( y: npt.NDArray[np.float32], *, n_fft: int, hop_size: int, win_size: int, center: bool, ) -> npt.NDArray[np.float32]: """ Compute magnitude STFT for a single-channel waveform. Args: y: Input waveform of shape [1, T]. n_fft: FFT size. hop_size: Hop size between frames. win_size: Window size. center: Whether to pad the input before framing. Returns: Magnitude spectrogram with shape [1, frames, n_fft // 2 + 1]. """ if y.ndim != 2 or y.shape[0] != 1: raise ValueError("Expected waveform shape [1, T].") x = y.astype(np.float32, copy=False) if center: pad = n_fft // 2 x = _reflect_pad_1d(x, pad) if x.shape[1] < n_fft: raise ValueError("Input is too short for the requested n_fft.") num_frames = 1 + (x.shape[1] - n_fft) // hop_size frame_starts = hop_size * np.arange(num_frames, dtype=np.int64) frame_offsets = np.arange(n_fft, dtype=np.int64) frames = x[:, frame_starts[:, None] + frame_offsets[None, :]] # [1, frames, n_fft] window = np.hanning(win_size).astype(np.float32) if n_fft > win_size: pad_left = (n_fft - win_size) // 2 pad_right = n_fft - win_size - pad_left window = np.pad(window, (pad_left, pad_right)) elif n_fft < win_size: window = window[:n_fft] frames = frames * window[None, None, :] spec = np.fft.rfft(frames, n=n_fft, axis=-1) mag = np.sqrt(np.real(spec) ** 2 + np.imag(spec) ** 2 + 1e-9).astype(np.float32) return mag def mel_spectrogram_numpy( y: npt.NDArray[np.float32], n_fft: int, num_mels: int, sampling_rate: int, hop_size: int, win_size: int, fmin: int, fmax: int | None = None, center: bool = False, clip_val: float = 1e-5, ) -> npt.NDArray[np.float32]: """ Compute a mel spectrogram in pure NumPy, matching the torch/torchaudio pipeline. This mirrors: - librosa.filters.mel(..., norm="slaney") - Hann window STFT - power-magnitude spectrogram - log compression with clipping Args: y: Waveform with shape [1, T]. n_fft: FFT size. num_mels: Number of mel bins. sampling_rate: Sampling rate in Hz. hop_size: Hop size between frames. win_size: Window size. fmin: Minimum mel frequency in Hz. fmax: Maximum mel frequency in Hz. If None, defaults to sr / 2. center: Whether to pad the signal before framing. clip_val: Minimum value before log compression. Returns: Mel spectrogram with shape [1, num_mels, frames]. """ if y.ndim == 1: y = np.expand_dims(y, axis=0) elif y.ndim == 2 and y.shape[0] != 1: raise ValueError("Expected waveform shape [1, T].") elif y.ndim > 2: raise ValueError("Expected waveform ndim <= 2.") if np.min(y) < -1.0: pass if np.max(y) > 1.0: pass mel_basis = librosa_style_mel_filterbank( sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=float(fmin), fmax=float(fmax) if fmax is not None else None, norm="slaney", ) # [num_mels, n_fft//2 + 1] spec = _stft_magnitude( y, n_fft=n_fft, hop_size=hop_size, win_size=win_size, center=center, ) # [1, frames, freq] mel_spec = np.matmul(mel_basis[None, :, :], np.transpose(spec, (0, 2, 1))) mel_spec = np.transpose(mel_spec, (0, 1, 2)) # [1, num_mels, frames] mel_spec = np.log(np.clip(mel_spec, a_min=clip_val, a_max=None)).astype(np.float32) return mel_spec.transpose(0, 2, 1) # B x T x n_mels