import gradio as gr
import numpy as np
import librosa
import librosa.display
import tensorflow as tf
import matplotlib.pyplot as plt
from PIL import Image
import io

# Parameters
SR = 22050           # sample rate (Hz) used for loading audio and for the spectrogram
N_MELS = 128         # number of mel bands in the spectrogram
TARGET_FRAMES = 216  # number of time frames the spectrogram is padded/cropped to

# Class codes, zipped position-by-position with the model output vector, so the
# order here presumably has to match the order used at training time.
# The codes match the IRMAS dataset abbreviations.
LABELS = ['cel', 'cla', 'flu', 'gac', 'gel', 'org', 'pia', 'sax', 'tru', 'vio', 'voi']

# Polish display names shown in the UI for each class code.
# 'cla' is clarinet ("klarnet"); it was previously mislabeled as harpsichord.
# NOTE(review): in IRMAS 'gac' is "acoustic guitar" - confirm that
# 'gitara klasyczna' is the wording wanted here.
polskie_nazwy = {
    'cel': 'wiolonczela',
    'cla': 'klarnet',
    'flu': 'flet',
    'gac': 'gitara klasyczna',
    'gel': 'gitara elektryczna',
    'org': 'organy',
    'pia': 'fortepian',
    'sax': 'saksofon',
    'tru': 'trąbka',
    'vio': 'skrzypce',
    'voi': 'głos ludzki'
}
# Dark colour scheme for the UI: start from the plain Base theme, then
# override individual colour variables.
# NOTE(review): "sans" is passed as a font family name; the CSS generic family
# is spelled "sans-serif" - confirm this renders as intended on all platforms.
dark_theme = gr.themes.Base(primary_hue="blue", neutral_hue="gray", font="sans")
dark_theme = dark_theme.set(
    # page and block backgrounds
    body_background_fill="#121212",
    block_background_fill="#1E1E1E",
    # text on the page
    body_text_color="#ffffff",
    # primary (submit) button
    button_primary_background_fill="#2d72d9",
    button_primary_text_color="#ffffff",
)

# Load the model
# This runs once at import time; predict_and_plot reuses the same instance on
# every call. "model.h5" is a relative path, so it is resolved against the
# current working directory of the process.
model = tf.keras.models.load_model("model.h5")

def compute_melspectrogram(y, sr=SR, n_mels=N_MELS):
    """Return the mel spectrogram of ``y`` on a decibel scale.

    Args:
        y: Audio signal, forwarded to ``librosa.feature.melspectrogram``.
        sr: Sample rate of ``y``.
        n_mels: Number of mel bands to compute.

    Returns:
        The output of ``librosa.power_to_db`` applied to the mel spectrogram,
        using the spectrogram's own maximum as the reference value.
    """
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    # ref=np.max makes the scale relative to the loudest bin of this clip.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return mel_db

def resize_spectrogram(S, target_frames=TARGET_FRAMES):
    """Centre-pad or centre-crop ``S`` along axis 1 to ``target_frames`` columns.

    Args:
        S: 2-D array; axis 1 is the one that gets resized.
        target_frames: Desired size of axis 1.

    Returns:
        ``S`` itself when it already has ``target_frames`` columns, a
        zero-padded copy when it is narrower, or a centred slice of ``S``
        when it is wider.
    """
    missing = target_frames - S.shape[1]

    if missing > 0:
        # Too short: split the padding evenly; the odd column goes to the right.
        before = missing // 2
        after = missing - before
        # NOTE(review): constant mode pads with 0. If S is a dB spectrogram
        # referenced to its maximum, 0 is the loudest value rather than
        # silence - confirm this matches how the model was trained.
        return np.pad(S, ((0, 0), (before, after)), mode='constant')

    if missing < 0:
        # Too long: keep the central window of target_frames columns.
        offset = (-missing) // 2
        return S[:, offset:offset + target_frames]

    return S

def _spectrogram_to_image(S):
    """Render spectrogram ``S`` with librosa/matplotlib and return it as a PIL image."""
    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        librosa.display.specshow(S, sr=SR, x_axis='time', y_axis='mel', cmap='magma', ax=ax)
        ax.set_title("Mel-spektrogram")
        # Lay out this figure explicitly: plt.tight_layout() targets pyplot's
        # global "current figure", which may belong to another request when
        # several calls run concurrently.
        fig.tight_layout()

        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        # Always release the figure, even if drawing or saving fails;
        # pyplot keeps figures alive until they are closed.
        plt.close(fig)

    buf.seek(0)
    return Image.open(buf)


def predict_and_plot(audio_path):
    """Classify an audio file and render its mel spectrogram.

    Args:
        audio_path: Path of the audio file supplied by the Gradio ``Audio``
            input; ``None`` (or empty) when nothing was uploaded.

    Returns:
        A ``(scores, image)`` tuple. ``scores`` maps each Polish instrument
        name from ``polskie_nazwy`` to the corresponding model output as a
        float; ``image`` is a PIL image of the full-length spectrogram.

    Raises:
        gr.Error: If no file was provided or the decoded signal is empty.
    """
    # Submitting without a file passes None; report that to the user instead
    # of letting librosa fail with an opaque traceback.
    if not audio_path:
        raise gr.Error("Nie wybrano pliku audio.")

    y, _ = librosa.load(audio_path, sr=SR)
    if y.size == 0:
        raise gr.Error("Plik audio jest pusty.")

    S_full = compute_melspectrogram(y)
    # The model receives a fixed-width version; the plot shows the full clip.
    S = resize_spectrogram(S_full)

    # Add a leading and a trailing singleton axis (presumably batch and
    # channel, as expected by the model - confirm against the training code).
    x = S[np.newaxis, ..., np.newaxis]
    preds = model.predict(x, verbose=0)[0]

    image = _spectrogram_to_image(S_full)

    pred_dict = {polskie_nazwy[label]: float(p) for label, p in zip(LABELS, preds)}
    return pred_dict, image

# Gradio UI: one audio upload in, top predictions and the spectrogram image out.
demo = gr.Interface(
    # page chrome
    title="Rozpoznawanie instrumentów",
    description="Model klasyfikuje dźwięki do kilku z klas instrumentów.",
    theme=dark_theme,
    # wiring: the callback receives the uploaded file as a path on disk
    fn=predict_and_plot,
    inputs=gr.Audio(type="filepath", label="Wgraj plik WAV"),
    outputs=[
        # first return value of the callback: label -> score mapping
        gr.Label(num_top_classes=5, label="Predykcja"),
        # second return value of the callback: rendered spectrogram
        gr.Image(label="Spektrogram"),
    ],
    # button captions
    submit_btn="Zatwierdź",
    clear_btn="Wyczyść",
)

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()