File size: 1,557 Bytes
05b56c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import io
from PIL import Image

# Audio / spectrogram configuration used by the preprocessing code in this file.
SR = 16_000  # target sample rate (Hz); audio at any other rate is resampled to it
N_FFT = 1_024  # FFT window size handed to the mel-spectrogram
HOP_LENGTH = 512  # hop between successive spectrogram frames, in samples
N_MELS = 128  # number of mel bands
TARGET_DURATION = 5.0  # length of one chunk, in seconds
TARGET_LENGTH = int(SR * TARGET_DURATION)  # length of one chunk, in samples

def _mel_spectrogram_image(chunk):
    """Render one audio chunk as an RGBA mel-spectrogram image.

    Args:
        chunk: 1-D array of audio samples at sample rate ``SR``.

    Returns:
        A ``PIL.Image.Image`` in RGBA mode holding the rendered spectrogram
        (axes hidden, no padding around the plot).
    """
    S = librosa.feature.melspectrogram(
        y=chunk, sr=SR, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS
    )
    # Decibels relative to this chunk's own maximum power.
    S_dB = librosa.power_to_db(S, ref=np.max)

    fig = plt.figure(figsize=(3, 3))
    try:
        librosa.display.specshow(S_dB, sr=SR, hop_length=HOP_LENGTH, cmap="magma")
        plt.axis("off")

        with io.BytesIO() as buf:
            fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
            buf.seek(0)
            # Image.open is lazy; convert() forces the decode and returns a
            # detached copy, so the buffer can be released right afterwards.
            with Image.open(buf) as png:
                return png.convert("RGBA")  # 4 channels
    finally:
        # pyplot keeps every figure alive until it is closed explicitly, so
        # close it even when rendering or saving fails.
        plt.close(fig)


def preprocess_audio(file_path):
    """Turn an audio file into a list of mel-spectrogram images.

    The audio is loaded as mono at its native rate, peak-normalized to 0.99,
    resampled to ``SR`` if needed, and cut into consecutive chunks of
    ``TARGET_LENGTH`` samples. The final chunk is zero-padded to full length.
    Each chunk is rendered as an RGBA spectrogram image.

    Args:
        file_path: Audio source passed straight to ``librosa.load``.

    Returns:
        A list of RGBA ``PIL.Image.Image`` objects, one per chunk, in
        chronological order. An audio file with no samples yields ``[]``.
    """
    # Load audio (force mono), keeping the file's native sample rate.
    y, sr = librosa.load(file_path, sr=None, mono=True)

    # No samples means no chunks; bail out before the peak computation,
    # because max() of an empty array raises ValueError.
    if y.size == 0:
        return []

    # Normalize amplitude (skipped for all-zero audio to avoid dividing by 0).
    peak = np.abs(y).max()
    if peak > 0:
        y = y / peak * 0.99

    # Resample to the common rate.
    if sr != SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=SR)

    # Split into fixed-length chunks and render each one.
    chunks = []
    for start in range(0, len(y), TARGET_LENGTH):
        chunk = y[start:start + TARGET_LENGTH]
        shortfall = TARGET_LENGTH - len(chunk)
        if shortfall > 0:
            # Only the last chunk can be short; pad its tail with silence.
            chunk = np.pad(chunk, (0, shortfall), mode="constant")
        chunks.append(_mel_spectrogram_image(chunk))

    return chunks