"""
音频处理模块 - 加载、保存和处理音频文件
"""
|
| | import numpy as np |
| | import librosa |
| | import soundfile as sf |
| | from typing import Tuple, Optional |
| |
|
| |
|
def load_audio(path: str, sr: int = 16000) -> np.ndarray:
    """Load an audio file as mono float32, resampled to the target rate.

    Args:
        path: Path to the audio file.
        sr: Target sample rate (default 16000).

    Returns:
        np.ndarray: Mono float32 samples at ``sr``.
    """
    # Load at the file's native rate first so we resample at most once.
    samples, native_rate = librosa.load(path, sr=None, mono=True)

    if native_rate != sr:
        samples = librosa.resample(samples, orig_sr=native_rate, target_sr=sr)

    return samples.astype(np.float32)
|
| |
|
| |
|
def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
    """Write audio samples to a file, hard-limiting them to [-1, 1] first.

    Args:
        path: Output file path.
        audio: Audio samples to write.
        sr: Sample rate (default 48000).
    """
    # Guard against out-of-range samples before handing off to soundfile.
    limited = np.clip(audio, -1.0, 1.0)
    sf.write(path, limited, sr)
| |
|
| |
|
def soft_clip(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """Smoothly limit peaks above ``threshold`` while preserving loudness.

    Samples whose magnitude is at or below ``threshold`` pass through
    untouched; louder samples are compressed toward ``ceiling`` along a
    tanh curve, keeping their original sign.

    Args:
        audio: Input audio.
        threshold: Magnitude at which compression starts.
        ceiling: Soft-clip upper bound.

    Returns:
        np.ndarray: Processed float32 audio.

    Raises:
        ValueError: If ``threshold`` <= 0 or ``ceiling`` <= ``threshold``.
    """
    samples = np.asarray(audio, dtype=np.float32)

    if threshold <= 0:
        raise ValueError("threshold 必须大于 0")
    if ceiling <= threshold:
        raise ValueError("ceiling 必须大于 threshold")

    out = samples.copy()
    magnitude = np.abs(out)
    loud = magnitude > threshold
    if not loud.any():
        # Nothing exceeds the threshold; return the untouched copy.
        return out

    headroom = ceiling - threshold
    # Normalized distance above the threshold; epsilon guards the division.
    excess = (magnitude[loud] - threshold) / (headroom + 1e-8)
    out[loud] = np.sign(out[loud]) * (threshold + headroom * np.tanh(excess))
    return out.astype(np.float32, copy=False)
| |
|
| |
|
def soft_clip_array(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """Array-oriented alias of ``soft_clip``; handles mono or multichannel input."""
    # Pure delegation — soft_clip already operates elementwise on any shape.
    return soft_clip(audio, threshold=threshold, ceiling=ceiling)
| |
|
| |
|
def get_audio_info(path: str) -> dict:
    """Read basic metadata from an audio file without decoding samples.

    Args:
        path: Path to the audio file.

    Returns:
        dict: Keys ``duration`` (seconds), ``sample_rate``, ``channels``
        and ``format``.
    """
    meta = sf.info(path)
    return {
        "duration": meta.duration,
        "sample_rate": meta.samplerate,
        "channels": meta.channels,
        "format": meta.format,
    }
|
| |
|
| |
|
def normalize_audio(audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """Scale audio so its RMS level matches ``target_db``, then hard-limit.

    Args:
        audio: Input audio samples (array-like; converted to ndarray).
        target_db: Target RMS loudness in dB relative to full scale
            (default -20.0).

    Returns:
        np.ndarray: Normalized audio clipped to [-1.0, 1.0]. Empty or
        silent input is returned unscaled (still clipped).
    """
    audio = np.asarray(audio)
    # Empty input: np.mean([]) would emit a RuntimeWarning and yield NaN,
    # so short-circuit before computing the RMS.
    if audio.size == 0:
        return np.clip(audio, -1.0, 1.0)

    rms = np.sqrt(np.mean(audio ** 2))
    if rms > 0:
        # Convert the dB target to a linear RMS value and rescale.
        target_rms = 10 ** (target_db / 20)
        audio = audio * (target_rms / rms)

    # Hard limit to the valid sample range; a large boost may clip peaks.
    return np.clip(audio, -1.0, 1.0)
|
| |
|
| |
|
def trim_silence(audio: np.ndarray, sr: int = 16000,
                 top_db: int = 30) -> np.ndarray:
    """Strip leading and trailing silence from an audio signal.

    Args:
        audio: Input audio samples.
        sr: Sample rate (accepted for API symmetry with the other helpers;
            the trim itself does not use it).
        top_db: Level below peak (in dB) treated as silence.

    Returns:
        np.ndarray: Audio with leading and trailing silence removed.
    """
    # librosa returns (trimmed_audio, [start, end]); the interval is discarded.
    result, _interval = librosa.effects.trim(audio, top_db=top_db)
    return result
|
| |
|