import gradio as gr
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
import io
# Parameters
SR = 22050            # sampling rate used when loading audio
N_MELS = 128          # number of mel bands
TARGET_FRAMES = 216   # fixed spectrogram width the model expects (~5 s at SR=22050 with librosa's default hop of 512)
LABELS = ['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio', 'voi']
# Polish display names for the label codes
polskie_nazwy = {
    'cel': 'wiolonczela',        # cello
    'cla': 'klarnet',            # clarinet
    'flu': 'flet',               # flute
    'gac': 'gitara klasyczna',   # classical guitar
    'gel': 'gitara elektryczna', # electric guitar
    'org': 'organy',             # organ
    'pia': 'fortepian',          # piano
    'sax': 'saksofon',           # saxophone
    'tru': 'trąbka',             # trumpet
    'vio': 'skrzypce',           # violin
    'voi': 'głos ludzki'         # human voice
}
dark_theme = gr.themes.Base(
primary_hue="blue",
neutral_hue="gray",
font="sans"
).set(
body_background_fill="#121212",
block_background_fill="#1E1E1E",
body_text_color="#ffffff",
button_primary_background_fill="#2d72d9",
button_primary_text_color="#ffffff"
)
# Load the trained Keras model (expects input of shape (N_MELS, TARGET_FRAMES, 1))
model = tf.keras.models.load_model("model.h5")
def compute_melspectrogram(y, sr=SR, n_mels=N_MELS):
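    """Compute a mel spectrogram and convert power to decibels."""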
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
return librosa.power_to_db(S, ref=np.max)
def resize_spectrogram(S, target_frames=TARGET_FRAMES):
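    """Center-pad with zeros or center-crop the spectrogram to target_frames columns."""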
if S.shape[1] < target_frames:
pad = target_frames - S.shape[1]
left = pad // 2
right = pad - left
S = np.pad(S, ((0, 0), (left, right)), mode='constant')
elif S.shape[1] > target_frames:
start = (S.shape[1] - target_frames) // 2
S = S[:, start:start+target_frames]
return S
def predict_and_plot(audio_path):
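    """Classify an audio file and render its mel spectrogram as a PIL image."""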
y, _ = librosa.load(audio_path, sr=SR)
S_full = compute_melspectrogram(y)
S = resize_spectrogram(S_full)
    x = S[np.newaxis, ..., np.newaxis]  # add batch and channel dims: (1, N_MELS, TARGET_FRAMES, 1)
preds = model.predict(x, verbose=0)[0]
fig, ax = plt.subplots(figsize=(8, 4))
librosa.display.specshow(S_full, sr=SR, x_axis='time', y_axis='mel', cmap='magma', ax=ax)
ax.set_title("Mel-spektrogram")
plt.tight_layout()
buf = io.BytesIO()
fig.savefig(buf, format='png')
plt.close(fig)
buf.seek(0)
image = Image.open(buf)
pred_dict = {polskie_nazwy[label]: float(p) for label, p in zip(LABELS, preds)}
return pred_dict, image
demo = gr.Interface(
fn=predict_and_plot,
inputs=gr.Audio(type="filepath", label="Wgraj plik WAV"),
outputs=[
gr.Label(num_top_classes=5, label="Predykcja"),
gr.Image(label="Spektrogram")
],
title="Rozpoznawanie instrumentów",
description="Model klasyfikuje dźwięki do kilku z klas instrumentów.",
theme=dark_theme,
submit_btn="Zatwierdź",
clear_btn="Wyczyść"
)
if __name__ == "__main__":
    demo.launch()
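
# Quick local sanity check (a sketch; "example.wav" is a hypothetical file name):
#   preds, img = predict_and_plot("example.wav")
#   print(max(preds, key=preds.get))  # print the most likely class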