File size: 2,847 Bytes
c1e8004
 
96b9863
 
 
 
76e83c1
 
96b9863
76e83c1
96b9863
 
 
 
c8fcc4c
 
 
 
 
 
 
 
 
 
 
 
 
a07b202
 
 
 
 
 
 
 
 
 
 
c1e8004
 
 
 
96b9863
 
76e83c1
c1e8004
96b9863
 
 
76e83c1
 
96b9863
 
 
 
 
 
 
76e83c1
96b9863
 
76e83c1
96b9863
 
 
 
 
76e83c1
96b9863
76e83c1
96b9863
 
 
 
76e83c1
96b9863
76e83c1
96b9863
 
76e83c1
c1e8004
c8fcc4c
 
76e83c1
c1e8004
 
96b9863
c3f5d70
96b9863
76e83c1
96b9863
 
76e83c1
a07b202
 
 
 
c1e8004
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import gradio as gr
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
import io

# Audio and feature-extraction parameters.
SR = 22_050          # sample rate (Hz) used when loading audio and computing mel features
N_MELS = 128         # number of mel bands requested from librosa
TARGET_FRAMES = 216  # spectrogram width (frames) produced by resize_spectrogram
# Class codes; their order is zipped against the model outputs in predict_and_plot.
LABELS = [
    'cel', 'cla', 'flu', 'gac', 'gel', 'org',
    'pia', 'sax', 'tru', 'vio', 'voi',
]
# Polish display names for each class code; these are the keys shown to the
# user in the prediction widget.
# NOTE(review): the codes look like the IRMAS dataset abbreviations, in which
# 'cla' stands for clarinet ("klarnet"), not harpsichord ("klawesyn") --
# confirm against the data the model was trained on.
polskie_nazwy = {
    'cel': 'wiolonczela',
    'cla': 'klarnet',
    'flu': 'flet',
    'gac': 'gitara klasyczna',
    'gel': 'gitara elektryczna',
    'org': 'organy',
    'pia': 'fortepian',
    'sax': 'saksofon',
    'tru': 'trąbka',
    'vio': 'skrzypce',
    'voi': 'głos ludzki',
}
# Dark colour scheme for the Gradio UI: built on the Base theme, then the
# page, block, text and primary-button colours are overridden.
# NOTE(review): "sans" is not a CSS generic family name ("sans-serif" is);
# confirm the intended font resolves in the browser.
dark_theme = gr.themes.Base(
    font="sans",
    neutral_hue="gray",
    primary_hue="blue",
).set(
    # Page and block backgrounds.
    body_background_fill="#121212",
    block_background_fill="#1E1E1E",
    # Text and primary-button colours.
    body_text_color="#ffffff",
    button_primary_text_color="#ffffff",
    button_primary_background_fill="#2d72d9",
)

# Load the trained classifier once, at import time.
# compile=False: the model is only used through predict() below, so restoring
# the training configuration (optimizer, loss, metrics) is unnecessary work.
model = tf.keras.models.load_model("model.h5", compile=False)

def compute_melspectrogram(y, sr=SR, n_mels=N_MELS):
    """Return the mel-spectrogram of ``y`` expressed in decibels.

    Args:
        y: Audio signal handed to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y``.
        n_mels: Number of mel bands to compute.

    Returns:
        The output of ``librosa.power_to_db`` with ``ref=np.max``, i.e. the
        levels are expressed relative to the spectrogram's own maximum.
    """
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return mel_db

def resize_spectrogram(S, target_frames=TARGET_FRAMES):
    """Force the second axis of ``S`` to exactly ``target_frames`` columns.

    Shorter inputs are zero-padded on both sides (the extra column, when the
    padding is odd, goes on the right); longer inputs are centre-cropped.
    An input that already has the requested width is returned as is.

    Args:
        S: 2-D array whose second axis is resized.
        target_frames: Desired number of columns.

    Returns:
        An array with the same number of rows as ``S`` and ``target_frames``
        columns.
    """
    n_frames = S.shape[1]

    if n_frames == target_frames:
        return S

    if n_frames > target_frames:
        # Too long: keep the middle ``target_frames`` columns.
        offset = (n_frames - target_frames) // 2
        return S[:, offset:offset + target_frames]

    # Too short: split the missing columns between both sides.
    # NOTE(review): the fill value is 0, which for a dB spectrogram computed
    # with ref=np.max equals the peak level -- confirm this matches how the
    # model was trained before changing it.
    missing = target_frames - n_frames
    before = missing // 2
    after = missing - before
    return np.pad(S, ((0, 0), (before, after)), mode='constant')

def predict_and_plot(audio_path):
    """Classify an audio file and render its mel-spectrogram.

    Args:
        audio_path: Filesystem path of the recording to analyse. May be
            ``None`` or empty when the form is submitted without a file.

    Returns:
        A ``(pred_dict, image)`` tuple. ``pred_dict`` maps the Polish
        instrument name of every entry in ``LABELS`` to the corresponding
        model output as a ``float``; ``image`` is a PIL image of the
        full-length (not resized) mel-spectrogram.

    Raises:
        gr.Error: If no file was supplied or the decoded signal is empty, so
            the user gets a readable message instead of a raw traceback.
    """
    if not audio_path:
        raise gr.Error("Nie wybrano pliku audio.")

    # Load the audio, resampled to the rate used for feature extraction.
    y, _ = librosa.load(audio_path, sr=SR)
    if y.size == 0:
        raise gr.Error("Plik audio jest pusty.")

    # Full-length spectrogram for display, fixed-width copy for the model.
    S_full = compute_melspectrogram(y)
    S = resize_spectrogram(S_full)

    # Add a leading and a trailing singleton axis (presumably the batch and
    # channel axes the model expects -- TODO confirm), then take the single
    # result row.
    x = S[np.newaxis, ..., np.newaxis]
    preds = model.predict(x, verbose=0)[0]

    # Render the spectrogram into an in-memory PNG. Only figure-level calls
    # are used (no pyplot "current figure" state) so concurrent requests do
    # not interfere, and the figure is closed even if drawing fails.
    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        librosa.display.specshow(
            S_full, sr=SR, x_axis='time', y_axis='mel', cmap='magma', ax=ax
        )
        ax.set_title("Mel-spektrogram")
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        plt.close(fig)

    buf.seek(0)
    image = Image.open(buf)
    # Image.open is lazy; decode now so the result is fully materialised.
    image.load()

    # Predictions keyed by the Polish display names.
    pred_dict = {polskie_nazwy[label]: float(p) for label, p in zip(LABELS, preds)}
    return pred_dict, image

# Gradio front end: one audio upload in, a top-5 label widget and the
# spectrogram image out. All user-facing texts are in Polish.
demo = gr.Interface(
    fn=predict_and_plot,
    # The callback receives the uploaded file as a path on disk.
    inputs=gr.Audio(label="Wgraj plik WAV", type="filepath"),
    outputs=[
        gr.Label(label="Predykcja", num_top_classes=5),
        gr.Image(label="Spektrogram"),
    ],
    theme=dark_theme,
    title="Rozpoznawanie instrumentów",
    description="Model klasyfikuje dźwięki do jednej z klas instrumentów.",
    clear_btn="Wyczyść",
    submit_btn="Zatwierdź",
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()