File size: 1,986 Bytes
960b635
 
 
 
7c86e01
 
 
960b635
 
7c86e01
 
960b635
 
 
 
 
 
7c86e01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
960b635
7c86e01
960b635
 
7c86e01
 
 
 
 
960b635
 
 
 
 
 
 
 
7c86e01
960b635
 
 
 
 
7c86e01
 
960b635
 
 
 
7c86e01
 
 
960b635
7c86e01
960b635
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import librosa
import numpy as np
import cv2

# ------------------------------------------------------
# Preprocessing settings.
# Frozen: every value below has to stay identical to the
# one used when the model was trained.
# ------------------------------------------------------
SR = 16_000         # sample rate (Hz) audio is resampled to on load
DURATION = 4.0      # seconds every clip is padded or cut to
MIN_DURATION = 1.5  # seconds; shorter clips (after trimming) are rejected

N_MELS = 192        # number of mel bands in the spectrogram
N_FFT = 2048        # FFT window size, in samples
HOP_LENGTH = 160    # hop between successive frames, in samples
IMG_SIZE = 224      # side length (pixels) of the square output image


def audio_to_spectrogram(audio_path) -> np.ndarray:
    """Convert an audio file into a fixed-size log-mel spectrogram image.

    The clip is decoded with ``librosa.load`` as mono audio at ``SR`` Hz,
    stripped of leading/trailing silence, zero-padded (centred) or truncated
    to exactly ``DURATION`` seconds, and rendered as a normalised log-mel
    spectrogram.

    Intended for WAV, MP3, FLAC, M4A and OGG input; the formats actually
    supported are whatever ``librosa.load`` can decode in the running
    environment.

    Args:
        audio_path: Location of the audio file, passed unchanged to
            ``librosa.load``.

    Returns:
        A ``uint8`` array of shape ``(IMG_SIZE, IMG_SIZE, 3)``: the grayscale
        spectrogram replicated into three channels by
        ``cv2.COLOR_GRAY2BGR``.

    Raises:
        RuntimeError: If the file cannot be decoded. The underlying error is
            chained as ``__cause__``.
        ValueError: If the clip is shorter than ``MIN_DURATION`` seconds,
            either as decoded or after silence trimming.
    """
    min_samples = int(MIN_DURATION * SR)

    # -------- Load audio (format-agnostic) --------
    try:
        y, _ = librosa.load(
            audio_path,
            sr=SR,        # force 16 kHz
            mono=True     # force mono
        )
    except Exception as e:
        # Chain explicitly so the decoder's traceback is preserved.
        raise RuntimeError(f"Audio decoding failed: {e}") from e

    # -------- Reject clips that are too short even before trimming --------
    # Trimming can only shorten the signal, so anything already below the
    # minimum would be rejected afterwards anyway; failing here also keeps
    # empty decodes out of the framing code inside ``trim``.
    if len(y) < min_samples:
        raise ValueError("Audio too short for reliable analysis")

    # -------- Trim silence --------
    y, _ = librosa.effects.trim(y, top_db=30)

    # -------- Reject very short clips --------
    if len(y) < min_samples:
        raise ValueError("Audio too short for reliable analysis")

    # -------- Fix duration --------
    target = int(SR * DURATION)

    if len(y) < target:
        # Centre the clip: split the zero padding between both ends, with
        # the extra sample (odd padding) going to the tail.
        pad = target - len(y)
        y = np.pad(y, (pad // 2, pad - pad // 2))
    else:
        y = y[:target]

    # -------- Log-mel spectrogram --------
    mel = librosa.feature.melspectrogram(
        y=y,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0
    )

    logmel = librosa.power_to_db(mel, ref=np.max)

    # -------- Normalize (stable) --------
    # Standardise first, then rescale to [0, 1]. Both steps (and their
    # epsilons) are kept exactly as-is because the output must match what
    # the model saw during training.
    logmel = (logmel - np.mean(logmel)) / (np.std(logmel) + 1e-6)
    logmel = (logmel - logmel.min()) / (logmel.max() - logmel.min() + 1e-8)

    # -------- Convert to image --------
    img = (logmel * 255).astype(np.uint8)
    img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    return img