import cv2
import librosa
import numpy as np
# Audio front-end configuration used by audio_to_spectrogram().
SR = 16000        # sample rate (Hz) the audio is resampled to on load
DURATION = 4.0    # clip length in seconds; SR * DURATION samples are kept
N_MELS = 192      # number of mel bands in the spectrogram
N_FFT = 2048      # FFT window size, in samples
HOP_LENGTH = 160  # hop between frames, in samples (10 ms at 16 kHz)
IMG_SIZE = 224    # side length, in pixels, of the square output image
def audio_to_spectrogram(wav_path):
    """Convert an audio file into a fixed-size log-mel spectrogram image.

    The audio is loaded at ``SR`` Hz, trimmed of leading/trailing silence
    (30 dB threshold), then centre-padded with zeros or truncated to exactly
    ``SR * DURATION`` samples. A mel power spectrogram is computed, converted
    to dB relative to its own maximum, min-max scaled to 0..255, resized to
    ``IMG_SIZE`` x ``IMG_SIZE`` and replicated into three channels.

    Args:
        wav_path: Path of the audio file, passed straight to ``librosa.load``.

    Returns:
        A ``uint8`` array of shape ``(IMG_SIZE, IMG_SIZE, 3)`` whose three
        channels are identical (grayscale expanded via ``COLOR_GRAY2BGR``).
    """
    y, _ = librosa.load(wav_path, sr=SR)
    y, _ = librosa.effects.trim(y, top_db=30)

    # Force every clip to the same number of samples so all spectrograms
    # have the same number of frames before resizing.
    target = int(SR * DURATION)
    if len(y) < target:
        pad = target - len(y)
        # Split the padding between both ends to keep the signal centred.
        y = np.pad(y, (pad // 2, pad - pad // 2))
    else:
        y = y[:target]

    mel = librosa.feature.melspectrogram(
        y=y,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
    )
    logmel = librosa.power_to_db(mel, ref=np.max)

    # Min-max scale to [0, 1]. A constant spectrogram (e.g. digital silence)
    # has zero dynamic range; dividing by it would produce NaNs and an
    # undefined uint8 cast, so emit an all-zero image in that case instead.
    low = logmel.min()
    span = logmel.max() - low
    if span > 0:
        logmel = (logmel - low) / span
    else:
        logmel = np.zeros_like(logmel)

    img = (logmel * 255).astype(np.uint8)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    return img