import librosa
import numpy as np
import cv2

# ======================================================
# CONFIG (DO NOT CHANGE – must match training)
# ======================================================
SR = 16000          # sample rate every clip is resampled to, in Hz
DURATION = 4.0      # fixed clip length fed to the spectrogram, in seconds
MIN_DURATION = 1.5  # clips shorter than this (after trimming) are rejected, in seconds
N_MELS = 192        # number of mel bands
N_FFT = 2048        # FFT window size, in samples
HOP_LENGTH = 160    # hop between frames, in samples (10 ms at 16 kHz)
IMG_SIZE = 224      # side length of the square output image, in pixels


def audio_to_spectrogram(audio_path) -> np.ndarray:
    """Convert an audio file into a fixed-size log-mel spectrogram image.

    The file is decoded with ``librosa.load`` (the original author lists
    WAV, MP3, FLAC, M4A and OGG as supported; actual format support depends
    on the decoding backends available to librosa) and normalised to:

    - mono
    - ``SR`` Hz
    - exactly ``DURATION`` seconds (centre-padded with zeros or truncated)

    Args:
        audio_path: Path (or other source accepted by ``librosa.load``) of
            the audio to process.

    Returns:
        A ``uint8`` image of ``IMG_SIZE`` x ``IMG_SIZE`` pixels with three
        identical channels (grayscale expanded via ``cv2.COLOR_GRAY2BGR``).

    Raises:
        RuntimeError: If the audio cannot be decoded. The underlying
            decoder exception is attached as ``__cause__``.
        ValueError: If, after silence trimming, the clip is shorter than
            ``MIN_DURATION`` seconds.
    """
    # -------- Load audio (format-agnostic) --------
    # Only the decode call lives in the try block so that errors from later
    # stages are never mislabelled as decoding failures.
    try:
        y, _sr = librosa.load(
            audio_path,
            sr=SR,      # force 16 kHz
            mono=True,  # force mono
        )
    except Exception as e:
        # Chain explicitly so the original decoder error is reported as the
        # direct cause rather than as incidental context.
        raise RuntimeError(f"Audio decoding failed: {e}") from e

    # -------- Trim silence --------
    # Leading/trailing audio more than 30 dB below the peak is removed.
    y, _ = librosa.effects.trim(y, top_db=30)

    # -------- Reject very short clips --------
    if len(y) < int(MIN_DURATION * SR):
        raise ValueError("Audio too short for reliable analysis")

    # -------- Fix duration --------
    target = int(SR * DURATION)
    if len(y) < target:
        # Centre the clip: split the zero padding between both ends, giving
        # the extra sample (if any) to the tail.
        pad = target - len(y)
        y = np.pad(y, (pad // 2, pad - pad // 2))
    else:
        # Keep only the first DURATION seconds.
        y = y[:target]

    # -------- Log-mel spectrogram --------
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0,
    )
    # dB scale relative to the loudest bin of this clip.
    logmel = librosa.power_to_db(mel, ref=np.max)

    # -------- Normalize (stable) --------
    # Standardise, then rescale to [0, 1]. The epsilons guard against a
    # division by zero on a constant spectrogram. Both steps are kept
    # exactly as-is because the output must match what training produced.
    logmel = (logmel - np.mean(logmel)) / (np.std(logmel) + 1e-6)
    logmel = (logmel - logmel.min()) / (logmel.max() - logmel.min() + 1e-8)

    # -------- Convert to image --------
    # astype truncates toward zero (no rounding); preserved to match training.
    img = (logmel * 255).astype(np.uint8)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    return img