File size: 3,437 Bytes
# -*- coding: utf-8 -*-
"""
音频处理模块 - 加载、保存和处理音频文件
"""
import numpy as np
import librosa
import soundfile as sf
from typing import Tuple, Optional
def load_audio(path: str, sr: int = 16000) -> np.ndarray:
    """Load an audio file as mono and bring it to the requested sample rate.

    The file is first decoded with ``librosa.load(..., sr=None, mono=True)``
    and is only resampled when the rate reported by librosa differs from
    ``sr``.

    Args:
        path: Path of the audio file to read.
        sr: Target sample rate (default 16000).

    Returns:
        np.ndarray: The audio samples cast to float32.
    """
    samples, native_sr = librosa.load(path, sr=None, mono=True)

    # Nothing to convert when the file is already at the requested rate.
    if native_sr == sr:
        return samples.astype(np.float32)

    resampled = librosa.resample(samples, orig_sr=native_sr, target_sr=sr)
    return resampled.astype(np.float32)
def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
    """Write audio samples to a file.

    Sample values are limited to the range [-1, 1] before being handed to
    ``soundfile.write``; the caller's array is not modified.

    Args:
        path: Destination file path.
        audio: Audio samples to store.
        sr: Sample rate to write (default 48000).
    """
    # Hard-limit to [-1, 1] on the way out so the written data stays in range.
    sf.write(path, np.clip(audio, -1.0, 1.0), sr)
def soft_clip(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """Tame peaks with a smooth tanh knee while leaving quieter samples alone.

    Samples whose magnitude is at most ``threshold`` pass through unchanged.
    Louder samples are mapped to
    ``threshold + (ceiling - threshold) * tanh(overshoot)``, where
    ``overshoot`` is the excess over ``threshold`` scaled by the headroom, and
    keep their original sign.

    Args:
        audio: Input audio (any array-like convertible to float32).
        threshold: Magnitude at which compression starts; must be > 0.
        ceiling: Upper bound approached by compressed samples; must be
            greater than ``threshold``.

    Returns:
        np.ndarray: A new float32 array with the same shape as the input.

    Raises:
        ValueError: If ``threshold <= 0`` or ``ceiling <= threshold``.
    """
    # Work on a private float32 copy so the caller's data is never touched.
    clipped = np.asarray(audio, dtype=np.float32).copy()

    if threshold <= 0:
        raise ValueError("threshold 必须大于 0")
    if ceiling <= threshold:
        raise ValueError("ceiling 必须大于 threshold")

    magnitude = np.abs(clipped)
    too_loud = magnitude > threshold

    if np.any(too_loud):
        headroom = ceiling - threshold
        # Excess over the threshold in units of headroom; the tiny epsilon
        # keeps the division well-defined.
        excess = (magnitude[too_loud] - threshold) / (headroom + 1e-8)
        limited = threshold + headroom * np.tanh(excess)
        clipped[too_loud] = np.sign(clipped[too_loud]) * limited

    return clipped
def soft_clip_array(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """Array-flavoured alias of ``soft_clip``.

    Forwards all arguments unchanged to ``soft_clip`` and returns its result.

    Args:
        audio: Input audio.
        threshold: Magnitude at which compression starts.
        ceiling: Upper bound for compressed samples.

    Returns:
        np.ndarray: Whatever ``soft_clip`` returns for the same arguments.
    """
    return soft_clip(audio, threshold, ceiling)
def get_audio_info(path: str) -> dict:
    """Read basic metadata for an audio file.

    Args:
        path: Path of the audio file to inspect.

    Returns:
        dict: The ``duration``, ``samplerate``, ``channels`` and ``format``
            attributes of the object returned by ``soundfile.info``, stored
            under the keys ``"duration"``, ``"sample_rate"``, ``"channels"``
            and ``"format"``.
    """
    meta = sf.info(path)
    # Output key -> attribute name on the object returned by sf.info.
    fields = (
        ("duration", "duration"),
        ("sample_rate", "samplerate"),
        ("channels", "channels"),
        ("format", "format"),
    )
    return {key: getattr(meta, attr) for key, attr in fields}
def normalize_audio(audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """Scale audio so that its RMS level matches a target, then limit to [-1, 1].

    The gain is ``10 ** (target_db / 20) / rms``. Input that is empty, all
    zero, or whose RMS is NaN is not scaled, only clipped.

    Args:
        audio: Input audio. Any array-like is accepted; integer and boolean
            data is promoted to float64 before processing, floating-point
            arrays keep their dtype.
        target_db: Target RMS level in dB (default -20.0).

    Returns:
        np.ndarray: A new array containing the normalized samples, hard-limited
            to the range [-1, 1].
    """
    samples = np.asarray(audio)

    # Integer PCM would overflow when squared (e.g. int16 30000 ** 2), so do
    # all arithmetic on a floating-point copy instead.
    if samples.dtype.kind in "iub":
        samples = samples.astype(np.float64)

    rms = 0.0
    if samples.size:  # np.mean of an empty array warns and yields NaN
        # Accumulate in float64 so long float32 signals do not lose precision.
        mean_square = np.mean(np.square(samples, dtype=np.float64))
        rms = float(np.sqrt(mean_square))

    if rms > 0:
        gain = (10 ** (target_db / 20)) / rms
        # A Python float gain keeps the array's own floating dtype.
        samples = samples * gain

    return np.clip(samples, -1.0, 1.0)
def trim_silence(audio: np.ndarray, sr: int = 16000,
                 top_db: int = 30) -> np.ndarray:
    """Strip leading and trailing silence from an audio signal.

    Args:
        audio: Input audio.
        sr: Sample rate. Accepted as a parameter but not used by this
            function.
        top_db: Silence threshold in dB, forwarded to
            ``librosa.effects.trim``.

    Returns:
        np.ndarray: The trimmed audio.
    """
    # librosa.effects.trim returns a pair; only its first element (the
    # trimmed signal) is needed here.
    return librosa.effects.trim(audio, top_db=top_db)[0]
|