import librosa
import numpy as np
import torch

from config import CONFIG


def preprocess_audio(path, device):
    """Load an audio file and turn it into a normalised log-mel tensor.

    The waveform is loaded at ``CONFIG["sample_rate"]``, forced to exactly
    ``CONFIG["sample_rate"] * CONFIG["duration"]`` samples (longer clips are
    truncated, shorter clips are zero-padded at the end), converted to a mel
    spectrogram, expressed in dB relative to its own peak, and standardised
    with the spectrogram's global mean and standard deviation.

    Args:
        path: Location of the audio file, passed straight to ``librosa.load``.
        device: Target device handed to ``Tensor.to``.

    Returns:
        The standardised log-mel spectrogram as a tensor on ``device`` with
        two singleton axes prepended to the spectrogram's own axes.
    """
    sample_rate = CONFIG["sample_rate"]

    waveform, _ = librosa.load(path, sr=sample_rate)

    # Fix the clip length so every input yields the same number of frames.
    target_len = int(sample_rate * CONFIG["duration"])
    if len(waveform) > target_len:
        waveform = waveform[:target_len]
    else:
        # np.pad defaults to constant zero padding; a zero-width pad is a no-op.
        waveform = np.pad(waveform, (0, target_len - len(waveform)))

    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=CONFIG["n_fft"],
        hop_length=CONFIG["hop_length"],
        n_mels=CONFIG["n_mels"],
    )
    log_mel = librosa.power_to_db(mel_power, ref=np.max)

    # Standardise over the whole spectrogram; the epsilon avoids a division
    # by zero when the spectrogram is constant (e.g. pure silence).
    centred = log_mel - log_mel.mean()
    spread = log_mel.std() + 1e-9
    standardised = centred / spread

    # Prepend two singleton axes (presumably batch and channel -- confirm
    # against the consuming model's expected input layout).
    features = torch.from_numpy(standardised)
    features = features.unsqueeze(0)
    features = features.unsqueeze(0)
    return features.to(device)