File size: 2,665 Bytes
c1e8004
 
96b9863
 
 
 
76e83c1
 
96b9863
76e83c1
96b9863
 
 
 
c8fcc4c
 
 
 
 
 
 
 
 
 
 
 
 
a07b202
 
 
 
 
 
 
 
 
 
 
c1e8004
 
 
 
96b9863
 
76e83c1
c1e8004
96b9863
 
 
76e83c1
 
96b9863
 
 
 
 
 
 
 
 
 
 
 
 
76e83c1
96b9863
a2e21e4
96b9863
 
 
 
76e83c1
96b9863
76e83c1
96b9863
 
76e83c1
c1e8004
c8fcc4c
76e83c1
c1e8004
 
96b9863
c3f5d70
96b9863
76e83c1
96b9863
 
76e83c1
a2e21e4
a07b202
 
 
c1e8004
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import gradio as gr
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
import io

# Parameters
SR = 22050            # sample rate (Hz) used for loading audio and spectrograms
N_MELS = 128          # number of mel frequency bands
TARGET_FRAMES = 216   # fixed time-axis width the model expects
# Short class codes, in the order of the model's output vector.
LABELS = ['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio', 'voi']
# Mapping from class code to the Polish display name shown in the UI.
polskie_nazwy = {
    'cel': 'wiolonczela',
    'cla': 'klawesyn',
    'flu': 'flet',
    'gac': 'gitara klasyczna',
    'gel': 'gitara elektryczna',
    'org': 'organy',
    'pia': 'fortepian',
    'sax': 'saksofon',
    'tru': 'trąbka',
    'vio': 'skrzypce',
    'voi': 'głos ludzki'
}
# Dark Gradio theme used by the Interface below.
dark_theme = gr.themes.Base(
    primary_hue="blue",
    neutral_hue="gray",
    font="sans"
).set(
    body_background_fill="#121212",
    block_background_fill="#1E1E1E",
    body_text_color="#ffffff",
    button_primary_background_fill="#2d72d9",
    button_primary_text_color="#ffffff"
)

# Load the trained Keras model (expects model.h5 next to this script).
model = tf.keras.models.load_model("model.h5")

def compute_melspectrogram(y, sr=SR, n_mels=N_MELS):
    """Return the mel power spectrogram of waveform *y* converted to dB.

    The dB scale is referenced to the spectrogram's peak (ref=np.max),
    so the loudest bin sits at 0 dB and everything else is negative.
    """
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return mel_db

def resize_spectrogram(S, target_frames=TARGET_FRAMES):
    """Force *S* to exactly *target_frames* columns along the time axis.

    Longer spectrograms are center-cropped; shorter ones are center-padded
    with zeros (an odd deficit puts the extra column on the right).

    NOTE(review): constant-0 padding in a dB spectrogram inserts peak-level
    frames, not silence — presumably this matches the model's training
    preprocessing; confirm before changing.
    """
    width = S.shape[1]
    if width == target_frames:
        return S
    if width > target_frames:
        offset = (width - target_frames) // 2
        return S[:, offset:offset + target_frames]
    deficit = target_frames - width
    pad_left = deficit // 2
    pad_right = deficit - pad_left
    return np.pad(S, ((0, 0), (pad_left, pad_right)), mode='constant')

def predict_and_plot(audio_path):
    """Classify the instrument in *audio_path* and render its mel-spectrogram.

    Returns a tuple of:
      * dict mapping Polish instrument names to float probabilities,
      * PIL image of the full (uncropped) mel-spectrogram.
    """
    waveform, _ = librosa.load(audio_path, sr=SR)

    spec_full = compute_melspectrogram(waveform)
    spec_fixed = resize_spectrogram(spec_full)

    # Add batch and channel axes: (1, n_mels, frames, 1).
    batch = spec_fixed[np.newaxis, ..., np.newaxis]
    probabilities = model.predict(batch, verbose=0)[0]

    # Plot the full-length spectrogram (not the model's cropped/padded input).
    fig, ax = plt.subplots(figsize=(8, 4))
    librosa.display.specshow(spec_full, sr=SR, x_axis='time', y_axis='mel', cmap='magma', ax=ax)
    ax.set_title("Mel-spektrogram")
    plt.tight_layout()

    # Render the figure to an in-memory PNG and hand it to PIL.
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format='png')
    plt.close(fig)
    png_buffer.seek(0)
    spectrogram_image = Image.open(png_buffer)

    scores = {}
    for code, prob in zip(LABELS, probabilities):
        scores[polskie_nazwy[code]] = float(prob)
    return scores, spectrogram_image

# Gradio UI: audio file in, top-5 class probabilities + spectrogram image out.
demo = gr.Interface(
    fn=predict_and_plot,
    inputs=gr.Audio(type="filepath", label="Wgraj plik WAV"),
    outputs=[
        gr.Label(num_top_classes=5, label="Predykcja"),
        gr.Image(label="Spektrogram")
    ],
    title="Rozpoznawanie instrumentów",
    description="Model klasyfikuje dźwięki do kilku z klas instrumentów.",
    theme=dark_theme,
    submit_btn="Zatwierdź",
    clear_btn="Wyczyść"
)

# Start the local Gradio server when run as a script.
if __name__ == "__main__":
    demo.launch()