Spaces:
Build error
Build error
| import librosa | |
| import numpy as np | |
| import torch | |
| from typing import Tuple | |
| import os | |
| import subprocess | |
| import tempfile | |
| from ..configs.config import AudioConfig | |
class AudioProcessor:
    """Audio processor: handles audio preprocessing and feature extraction.

    All spectral parameters (sample rate, FFT size, hop/window length, mel
    band count and frequency range) are read from the ``config`` object
    passed to the constructor.
    """

    # Number of spectrogram frames returned by ``preprocess_audio``.
    TARGET_TIME_STEPS = 32

    def __init__(self, config: "AudioConfig"):
        """Store the configuration and the fixed clip length.

        Args:
            config: Object providing ``sample_rate``, ``n_fft``,
                ``hop_length``, ``win_length``, ``n_mels``, ``mel_fmin`` and
                ``mel_fmax``.
        """
        self.config = config
        # Fixed clip length in samples.
        # NOTE(review): this equals one second only when
        # config.sample_rate is 16000 -- confirm that is always the case.
        self.target_length = 16000

    def _read_audio(self, file_path) -> np.ndarray:
        """Decode ``file_path`` at the configured sample rate.

        ``.m4a`` files are first converted to a temporary mono 16-bit PCM WAV
        with ``ffmpeg``; everything else is handed straight to
        ``librosa.load``. Exceptions are propagated unchanged so each public
        entry point can decide how to report them.
        """
        sample_rate = self.config.sample_rate

        # Only path-like inputs can carry an extension; anything else that
        # librosa accepts is passed through untouched.
        is_m4a = (
            isinstance(file_path, (str, os.PathLike))
            and str(file_path).lower().endswith('.m4a')
        )
        if not is_m4a:
            audio, _ = librosa.load(file_path, sr=sample_rate)
            return audio

        # Reserve a temp file name, then close the handle so ffmpeg can
        # overwrite the file (hence '-y').
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            temp_wav_path = temp_wav.name
        try:
            command = [
                'ffmpeg',
                '-i', os.fspath(file_path),
                '-acodec', 'pcm_s16le',
                '-ar', str(sample_rate),
                '-ac', '1',  # downmix to mono
                '-y',  # overwrite the pre-created temp file
                temp_wav_path,
            ]
            subprocess.run(command, check=True, capture_output=True)
            audio, _ = librosa.load(temp_wav_path, sr=sample_rate)
        finally:
            # Always remove the temp file, including when conversion fails.
            if os.path.exists(temp_wav_path):
                os.unlink(temp_wav_path)
        return audio

    def load_audio(self, file_path: str) -> np.ndarray:
        """Load an audio file, including ``.m4a`` via ffmpeg.

        Args:
            file_path: Path of the audio file to decode.

        Returns:
            The waveform resampled to ``config.sample_rate``.

        Raises:
            RuntimeError: If the file cannot be converted or decoded; the
                underlying exception is attached as ``__cause__``.
        """
        try:
            return self._read_audio(file_path)
        except Exception as e:
            raise RuntimeError(f"Error loading audio file {file_path}: {str(e)}") from e

    def preprocess_audio(self, audio_path: str) -> Tuple[np.ndarray, np.ndarray]:
        """Load a file and turn it into a fixed-size clip and mel spectrogram.

        Steps:
            1. Load the audio at the configured sample rate (``.m4a`` goes
               through the same ffmpeg conversion as ``load_audio``).
            2. Randomly crop, or zero-pad at the end, to ``target_length``
               samples.
            3. Compute a mel power spectrogram and convert it to dB relative
               to its maximum.
            4. Truncate or zero-pad the time axis to ``TARGET_TIME_STEPS``.

        Args:
            audio_path: Path of the audio file to process.

        Returns:
            ``(audio, mel_spec)``: the fixed-length waveform and a spectrogram
            whose second axis has ``TARGET_TIME_STEPS`` frames.
        """
        audio = self._read_audio(audio_path)

        if len(audio) > self.target_length:
            # randint's upper bound is exclusive, so add 1 to make the final
            # window (ending exactly at the last sample) selectable.
            start = np.random.randint(0, len(audio) - self.target_length + 1)
            audio = audio[start:start + self.target_length]
        else:
            audio = self.pad_or_trim(audio, self.target_length)

        mel_spec = librosa.feature.melspectrogram(
            y=audio,
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax,
        )
        mel_spec = librosa.power_to_db(mel_spec, ref=np.max)

        mel_spec = self._fit_time_axis(mel_spec, self.TARGET_TIME_STEPS)
        return audio, mel_spec

    @staticmethod
    def _fit_time_axis(mel_spec: np.ndarray, time_steps: int) -> np.ndarray:
        """Truncate or zero-pad axis 1 of ``mel_spec`` to ``time_steps``."""
        n_frames = mel_spec.shape[1]
        if n_frames > time_steps:
            return mel_spec[:, :time_steps]
        # NOTE(review): the caller's spectrogram is in dB relative to its own
        # maximum, so a pad value of 0 equals the peak level rather than
        # silence. Kept as-is so existing features do not change -- confirm
        # whether padding with the minimum was intended.
        pad_width = ((0, 0), (0, time_steps - n_frames))
        return np.pad(mel_spec, pad_width, mode='constant')

    def audio_to_mel_spec(self, audio: np.ndarray) -> np.ndarray:
        """Convert a waveform to a natural-log mel magnitude spectrogram.

        Unlike ``preprocess_audio`` this uses the STFT magnitude (not power),
        a natural logarithm with a 1e-5 floor, and does not fix the number of
        frames.

        Args:
            audio: Waveform samples.

        Returns:
            Log-mel spectrogram, one row per mel band.
        """
        stft = librosa.stft(
            y=audio,
            n_fft=self.config.n_fft,
            hop_length=self.config.hop_length,
            win_length=self.config.win_length,
        )
        magnitude = np.abs(stft)

        mel_basis = librosa.filters.mel(
            sr=self.config.sample_rate,
            n_fft=self.config.n_fft,
            n_mels=self.config.n_mels,
            fmin=self.config.mel_fmin,
            fmax=self.config.mel_fmax,
        )
        mel_spec = np.dot(mel_basis, magnitude)

        # Floor the values before the log so silent bins do not become -inf.
        return np.log(np.clip(mel_spec, a_min=1e-5, a_max=None))

    def normalize_audio(self, audio: np.ndarray) -> np.ndarray:
        """Scale ``audio`` so its largest absolute sample is 1.

        Silent (all-zero) or empty input has no peak to scale by and is
        returned as an unchanged copy instead of producing NaNs or raising.

        Args:
            audio: Waveform samples.

        Returns:
            A new array holding the peak-normalized waveform.
        """
        peak = np.max(np.abs(audio)) if audio.size else 0
        # Dividing by 1 keeps the dtype promotion of the normal path while
        # avoiding 0/0.
        return audio / (1 if peak == 0 else peak)

    def pad_or_trim(self, audio: np.ndarray, target_length: int) -> np.ndarray:
        """Force ``audio`` to exactly ``target_length`` samples.

        Longer input keeps its first ``target_length`` samples; shorter input
        is zero-padded at the end.
        """
        if len(audio) > target_length:
            return audio[:target_length]
        return np.pad(audio, (0, target_length - len(audio)), mode='constant')