|
|
import librosa
|
|
|
import numpy as np
|
|
|
import cv2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Sample rate (Hz) that every input file is resampled to when it is loaded.
SR = 16000

# Length in seconds that each clip is padded or cut to before analysis.
DURATION = 4.0

# Shortest clip, in seconds after silence trimming, that is still accepted.
MIN_DURATION = 1.5

# Number of mel bands in the spectrogram (its height before resizing).
N_MELS = 192

# FFT window size in samples (128 ms at 16 kHz).
N_FFT = 2048

# Hop between successive frames in samples (10 ms at 16 kHz).
HOP_LENGTH = 160

# Side length in pixels of the square image that is returned.
IMG_SIZE = 224
|
|
|
|
|
|
|
|
|
def audio_to_spectrogram(audio_path) -> np.ndarray:
    """Turn an audio file into a fixed-size log-mel spectrogram image.

    The file is decoded by ``librosa.load`` as mono audio resampled to
    ``SR`` Hz, so the accepted containers are whatever librosa's decoding
    backends support (the original notes list WAV, MP3, FLAC, M4A and OGG).
    Leading and trailing silence is trimmed, and the clip is then centre
    padded with zeros or cut to its first ``DURATION`` seconds.

    Args:
        audio_path: Location of the audio file, passed straight to
            ``librosa.load``.

    Returns:
        A ``uint8`` array of shape ``(IMG_SIZE, IMG_SIZE, 3)`` holding the
        normalised log-mel spectrogram replicated over three BGR channels.

    Raises:
        RuntimeError: If the file cannot be decoded; the underlying
            exception is attached as ``__cause__``.
        ValueError: If fewer than ``MIN_DURATION`` seconds remain after
            silence trimming.
    """
    try:
        y, _ = librosa.load(audio_path, sr=SR, mono=True)
    except Exception as e:
        # Chain explicitly so the decoder's own traceback stays attached.
        raise RuntimeError(f"Audio decoding failed: {e}") from e

    # Drop leading/trailing segments more than 30 dB below the peak.
    y, _ = librosa.effects.trim(y, top_db=30)

    if len(y) < int(MIN_DURATION * SR):
        raise ValueError("Audio too short for reliable analysis")

    # Force every clip to exactly `target` samples.
    target = int(SR * DURATION)
    if len(y) < target:
        pad = target - len(y)
        # Split the zero padding evenly; the odd sample goes on the right.
        y = np.pad(y, (pad // 2, pad - pad // 2))
    else:
        y = y[:target]

    mel = librosa.feature.melspectrogram(
        y=y,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0,
    )
    # Decibels relative to the loudest bin, so the maximum maps to 0 dB.
    logmel = librosa.power_to_db(mel, ref=np.max)

    # Standardise, then rescale to [0, 1]; the epsilons guard against a
    # zero denominator when the spectrogram is constant.
    logmel = (logmel - np.mean(logmel)) / (np.std(logmel) + 1e-6)
    logmel = (logmel - logmel.min()) / (logmel.max() - logmel.min() + 1e-8)

    # Quantise to 8 bits, resize to the square target size and replicate
    # the single channel into a 3-channel BGR image.
    img = (logmel * 255).astype(np.uint8)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    return img
|
|
|
|