clstm_fastAPI / audio_utils.py
abedir's picture
Upload 7 files
3461076 verified
raw
history blame
714 Bytes
import librosa
import numpy as np
import torch
from config import CONFIG
def preprocess_audio(path, device):
    """Load an audio file and turn it into a normalised log-mel tensor.

    The waveform is loaded at ``CONFIG["sample_rate"]`` and forced to a
    fixed length of ``sample_rate * duration`` samples (truncated when
    longer, right-padded otherwise). A mel spectrogram is computed from
    it, converted to dB relative to its own maximum, and standardised
    with the spectrogram's global mean and standard deviation.

    Args:
        path: Location of the audio file, passed straight to
            ``librosa.load``.
        device: Target passed to ``Tensor.to`` for the returned tensor.

    Returns:
        The standardised log-mel spectrogram as a torch tensor on
        ``device``, with two singleton dimensions prepended
        (presumably batch and channel -- confirm against the model).
    """
    sample_rate = CONFIG["sample_rate"]
    waveform, _ = librosa.load(path, sr=sample_rate)

    # Force every clip to the same number of samples so the spectrogram
    # has a fixed number of frames.
    target_len = int(sample_rate * CONFIG["duration"])
    if len(waveform) > target_len:
        waveform = waveform[:target_len]
    else:
        waveform = np.pad(waveform, (0, target_len - len(waveform)))

    power_spec = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=CONFIG["n_fft"],
        hop_length=CONFIG["hop_length"],
        n_mels=CONFIG["n_mels"],
    )
    log_spec = librosa.power_to_db(power_spec, ref=np.max)

    # Standardise over the whole spectrogram; the epsilon keeps the
    # division finite when the input is constant (e.g. pure silence).
    centered = log_spec - log_spec.mean()
    normalised = centered / (log_spec.std() + 1e-9)

    # Prepend two singleton axes, same as unsqueeze(0) applied twice.
    features = torch.from_numpy(normalised)[None, None]
    return features.to(device)