File size: 896 Bytes
960b635 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 |
import librosa
import numpy as np
import cv2
# Audio front-end settings consumed by audio_to_spectrogram.
SR: int = 16_000        # sample rate handed to librosa.load and melspectrogram
DURATION: float = 4.0   # clip length; SR * DURATION is the target sample count
N_MELS: int = 192       # number of mel bands (n_mels)
N_FFT: int = 2048       # FFT window length (n_fft)
HOP_LENGTH: int = 160   # step between successive frames (hop_length)
IMG_SIZE: int = 224     # width and height of the resized output image
def audio_to_spectrogram(wav_path):
    """Convert an audio file into a fixed-size log-mel spectrogram image.

    The clip is loaded at ``SR``, trimmed with ``librosa.effects.trim``
    (``top_db=30``), and forced to exactly ``int(SR * DURATION)`` samples:
    shorter clips are zero-padded on both sides, longer clips keep only
    their first ``target`` samples. A mel power spectrogram is then computed,
    converted to dB relative to its maximum, min-max scaled to 0..255 and
    resized to ``IMG_SIZE`` x ``IMG_SIZE``.

    Args:
        wav_path: Path of the audio file; passed straight to ``librosa.load``.

    Returns:
        The spectrogram as a ``uint8`` image produced by
        ``cv2.cvtColor(..., cv2.COLOR_GRAY2BGR)``, i.e. the grayscale
        spectrogram replicated over three channels. A clip whose log-mel
        spectrogram is constant (for example digital silence) yields an
        all-zero image.
    """
    signal, _ = librosa.load(wav_path, sr=SR)
    signal, _ = librosa.effects.trim(signal, top_db=30)

    # Force a fixed length so every clip produces the same number of frames.
    target = int(SR * DURATION)
    missing = target - len(signal)
    if missing > 0:
        left = missing // 2
        # Split the padding as evenly as possible; the extra sample (if any)
        # goes on the right.
        signal = np.pad(signal, (left, missing - left))
    else:
        signal = signal[:target]

    mel = librosa.feature.melspectrogram(
        y=signal,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
    )
    logmel = librosa.power_to_db(mel, ref=np.max)

    # Min-max scale to [0, 1]. A constant spectrogram has zero range; dividing
    # by it would give NaN and an undefined uint8 cast, so map it to zeros.
    floor = logmel.min()
    span = logmel.max() - floor
    if span > 0:
        scaled = (logmel - floor) / span
    else:
        scaled = np.zeros_like(logmel)

    img = (scaled * 255).astype(np.uint8)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    return cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
|