import os

# Must be set before TensorFlow is imported to take effect.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import warnings

import gradio as gr
import librosa
import numpy as np
import tensorflow as tf
from tensorflow import keras

warnings.filterwarnings('ignore')

# --- 1. MODEL LOADING ---
MODEL_PATH = 'best_model1_weights.h5'
try:
    model = keras.models.load_model(MODEL_PATH)
    print("✅ Model loaded!")
except Exception as e:
    print(f"❌ Load error: {e}")
    model = None  # predict_emotion checks for this and degrades gracefully

# --- 2. THE CORRECT ALPHABETICAL ORDER ---
# This is the order LabelEncoder uses by default
EMOTIONS = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']

# Sentinel returned by preprocess_audio() when the clip is effectively silent.
SILENT = "SILENT"

# Feature-vector length the model was trained on (base features stacked x3,
# then padded/truncated) — TODO confirm against the training pipeline.
FEATURE_LEN = 2376


def extract_features(data, sr):
    """Return a 1-D feature vector: time-averaged ZCR, RMS, 40 MFCCs and
    mel-spectrogram bins, concatenated."""
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)
    return np.hstack((zcr, rms, mfcc, mel))


def preprocess_audio(audio_path):
    """Load an audio file and build the model input of shape (1, FEATURE_LEN, 1).

    Returns:
        SILENT   -- if the clip's overall RMS is below the silence threshold,
        None     -- on any loading/processing error,
        ndarray  -- otherwise, standardized features shaped (1, FEATURE_LEN, 1).
    """
    try:
        data, sr = librosa.load(audio_path, duration=2.5, offset=0.6,
                                res_type='kaiser_fast')

        # Silence check: near-zero RMS means no usable speech in the clip.
        if np.sqrt(np.mean(data ** 2)) < 0.002:
            return SILENT

        # Stack the base features 3x to approach the model's expected length,
        # then pad/truncate to exactly FEATURE_LEN.
        base = extract_features(data, sr)
        features = np.concatenate((base, base, base))
        if len(features) < FEATURE_LEN:
            features = np.pad(features, (0, FEATURE_LEN - len(features)),
                              'constant')
        else:
            features = features[:FEATURE_LEN]

        # Standardize (zero mean, unit variance); guard against zero std.
        std = np.std(features)
        if std > 0:
            features = (features - np.mean(features)) / std

        return features.reshape(1, FEATURE_LEN, 1)
    except Exception as e:
        # BUG FIX: the original discarded the exception silently; surface it
        # so preprocessing failures are diagnosable from the console.
        print(f"⚠️ Preprocessing error: {e}")
        return None


def predict_emotion(audio_filepath):
    """Classify one audio file.

    Returns a 3-tuple: (emotion label, confidence string, per-class prob dict)
    matching the three Gradio output components.
    """
    if audio_filepath is None:
        return "No audio", "0%", {}
    if model is None:
        # BUG FIX: the loader sets model = None on failure, but the original
        # called model.predict unconditionally (AttributeError at runtime).
        return "Model not loaded", "0%", {}

    feat = preprocess_audio(audio_filepath)
    # BUG FIX: original used `feat is "SILENT"` — identity comparison with a
    # string literal (SyntaxWarning; relies on CPython interning). Also avoids
    # accidental elementwise `==` on the ndarray branch via the isinstance guard.
    if isinstance(feat, str) and feat == SILENT:
        return "NEUTRAL (Silence)", "100%", {"neutral": 1.0}
    if feat is None:
        return "Error", "0%", {}

    preds = model.predict(feat, verbose=0)[0]
    idx = int(np.argmax(preds))

    # Map probabilities to the ALPHABETICAL list
    prob_dict = {EMOTIONS[i]: float(preds[i]) for i in range(len(EMOTIONS))}
    return EMOTIONS[idx].upper(), f"{preds[idx] * 100:.2f}%", prob_dict


# --- 3. INTERFACE ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🎙️ Speech Emotion Recognition (Fixed Labels)")
    with gr.Row():
        audio_input = gr.Audio(sources=["microphone", "upload"],
                               type="filepath")
        with gr.Column():
            emotion_out = gr.Textbox(label="Detected Emotion")
            conf_out = gr.Textbox(label="Confidence")
            label_chart = gr.Label(label="All Probabilities",
                                   num_top_classes=7)
    btn = gr.Button("Analyze", variant="primary")
    btn.click(predict_emotion, inputs=audio_input,
              outputs=[emotion_out, conf_out, label_chart])

if __name__ == "__main__":
    demo.launch(ssr_mode=False)