# atharvaballa's picture
# Update audio processing logic to support multiple formats
# 7c86e01
import librosa
import numpy as np
import cv2
# ======================================================
# CONFIG (DO NOT CHANGE – must match training)
# ======================================================
SR = 16000  # sample rate in Hz; audio is resampled to this rate on load
DURATION = 4.0  # fixed clip length in seconds; clips are padded or truncated to this
MIN_DURATION = 1.5  # minimum length in seconds after silence trimming; shorter clips are rejected
N_MELS = 192  # number of mel bands in the spectrogram
N_FFT = 2048  # FFT size (n_fft) used for the mel spectrogram
HOP_LENGTH = 160  # hop between frames in samples (10 ms at SR = 16000)
IMG_SIZE = 224  # side length in pixels of the square output image
def audio_to_spectrogram(audio_path):
    """Convert an audio file into a fixed-size log-mel spectrogram image.

    Universal audio preprocessing: supports WAV, MP3, FLAC, M4A and OGG.
    Every input is internally converted to:
        - mono
        - 16 kHz (``SR``)
        - a fixed duration of ``DURATION`` seconds

    Pipeline: decode -> trim leading/trailing silence -> reject clips that
    are too short -> centre-pad or truncate to the fixed length -> log-mel
    spectrogram -> normalise to [0, 1] -> 8-bit image resized to
    ``IMG_SIZE`` x ``IMG_SIZE`` with three identical channels.

    Args:
        audio_path: Audio source handed straight to ``librosa.load``
            (typically a file path).

    Returns:
        A ``numpy.ndarray`` of dtype ``uint8`` and shape
        ``(IMG_SIZE, IMG_SIZE, 3)`` holding the spectrogram as a BGR image
        (the grayscale plane replicated across the three channels).

    Raises:
        RuntimeError: If the audio cannot be decoded. The original decoding
            exception is chained as ``__cause__``.
        ValueError: If, after silence trimming, the clip is shorter than
            ``MIN_DURATION`` seconds.
    """
    # -------- Load audio (format-agnostic) --------
    try:
        y, _ = librosa.load(
            audio_path,
            sr=SR,      # force 16 kHz
            mono=True,  # force mono
        )
    except Exception as e:
        # Chain the cause so the underlying decoder error stays visible
        # in tracebacks instead of being flattened into the message only.
        raise RuntimeError(f"Audio decoding failed: {e}") from e

    # -------- Trim silence --------
    # Drops leading/trailing audio more than 30 dB below the peak.
    y, _ = librosa.effects.trim(y, top_db=30)

    # -------- Reject very short clips --------
    # Checked after trimming, so silence does not count towards the length.
    if len(y) < int(MIN_DURATION * SR):
        raise ValueError("Audio too short for reliable analysis")

    # -------- Fix duration --------
    target = int(SR * DURATION)
    if len(y) < target:
        # Centre the clip: zero-pad both sides, the extra sample (if the
        # padding is odd) goes to the end.
        pad = target - len(y)
        y = np.pad(y, (pad // 2, pad - pad // 2))
    else:
        # Keep only the first DURATION seconds.
        y = y[:target]

    # -------- Log-mel spectrogram --------
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0,
    )
    # dB scale relative to the clip's own peak power.
    logmel = librosa.power_to_db(mel, ref=np.max)

    # -------- Normalize (stable) --------
    # Z-score, then min-max rescale into [0, 1]; the epsilons guard against
    # division by zero on constant input.
    # NOTE(review): both steps are kept exactly as-is; presumably this must
    # stay numerically identical to the training-time preprocessing.
    logmel = (logmel - np.mean(logmel)) / (np.std(logmel) + 1e-6)
    logmel = (logmel - logmel.min()) / (logmel.max() - logmel.min() + 1e-8)

    # -------- Convert to image --------
    img = (logmel * 255).astype(np.uint8)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    # Replicate the single grayscale plane into three channels.
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    return img