Spaces:

E-motionAssistant
/

ser-wav2vec

Running

App Files Community

Raemih committed 17 days ago

Commit

e437e52

1 Parent(s): 3370729

Upload app.py

Browse files

Files changed (1) hide show

app.py +99 -0

app.py ADDED Viewed

	@@ -0,0 +1,99 @@

+import gradio as gr
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from model_utils import load_models, predict
+# Load the models at import time, before the UI below is constructed.
+print("Loading models...")
+load_models(model_dir=".")
+print("Ready.")
+
+# Emotion class names used as keys by the lookup tables below.
+EMOTION_LABELS = ["neutral", "happy", "sad", "angry", "fear"]
+
+# Emoji shown in front of the top predicted emotion.
+EMOJI = {
+    "neutral": "😐",
+    "happy": "😊",
+    "sad": "😢",
+    "angry": "😠",
+    "fear": "😨",
+}
+
+# Bar colour (hex) used for each emotion in the probability chart.
+COLORS = {
+    "neutral": "#95a5a6",
+    "happy": "#2ecc71",
+    "sad": "#3498db",
+    "angry": "#e74c3c",
+    "fear": "#e67e22",
+}
+def run_inference(audio_path, language, mode):
+    """Run emotion prediction on an audio file and build the UI outputs.
+
+    Args:
+        audio_path: Path of the audio file to analyse, or ``None`` when the
+            user has not supplied any audio.
+        language: Language choice, forwarded unchanged to ``predict``.
+        mode: Inference mode, forwarded unchanged to ``predict``.
+
+    Returns:
+        A ``(markdown, figure)`` tuple. ``markdown`` summarises the top
+        emotion and its confidence; ``figure`` is a horizontal bar chart of
+        all probabilities. ``figure`` is ``None`` when there is no audio,
+        when ``predict`` raises, or when it returns no probabilities.
+    """
+    if audio_path is None:
+        return "Please upload or record audio first.", None
+
+    try:
+        # ``predict`` is consumed below as a mapping of label -> probability.
+        probs = predict(audio_path, language=language, mode=mode)
+    except Exception as e:
+        # UI boundary: report the failure in the output panel instead of
+        # letting the handler raise.
+        return f"Error: {e}", None
+
+    if not probs:
+        # Without this guard, indexing the first entry below would raise
+        # IndexError on an empty result.
+        return "Error: the model returned no probabilities.", None
+
+    # Highest probability first.
+    sorted_probs = sorted(probs.items(), key=lambda x: -x[1])
+    top, top_conf = sorted_probs[0]
+    result_md = (
+        f"## {EMOJI.get(top, '')} {top.upper()}\n\n"
+        f"**Confidence:** {top_conf:.1%}\n\n"
+        f"**Language:** {language}  |  **Mode:** {mode}"
+    )
+
+    fig, ax = plt.subplots(figsize=(6, 3.2))
+    emos = [e for e, _ in sorted_probs]
+    vals = [p for _, p in sorted_probs]
+    cols = [COLORS.get(e, "#bdc3c7") for e in emos]
+    bars = ax.barh(emos, vals, color=cols, height=0.5, edgecolor="none")
+    # Print the percentage just to the right of each bar.
+    for bar, val in zip(bars, vals):
+        ax.text(val + 0.01, bar.get_y() + bar.get_height() / 2,
+                f"{val:.1%}", va="center", fontsize=9)
+    ax.set_xlim(0, 1.05)
+    ax.set_xlabel("Probability")
+    ax.set_title("Emotion Probabilities", fontweight="bold")
+    # barh draws the first entry at the bottom; invert so the top emotion
+    # appears first.
+    ax.invert_yaxis()
+    ax.spines[["top", "right", "left"]].set_visible(False)
+    # Lay out this specific figure rather than pyplot's global "current
+    # figure", which another concurrent request may have replaced.
+    fig.tight_layout()
+    # Drop the figure from pyplot's registry so figures do not accumulate
+    # across requests; the Figure object itself stays usable for rendering.
+    plt.close(fig)
+    return result_md, fig
+# Build the Gradio UI: inputs in the left column, results in the right one.
+with gr.Blocks(title="Multilingual SER", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # Multilingual Speech Emotion Recognition
+    Detects emotion in **Sinhala**, **Tamil**, and **English** speech.
+    """)
+    with gr.Row():
+        with gr.Column():
+            # type="filepath" hands run_inference a path on disk.
+            audio_in = gr.Audio(
+                sources=["upload", "microphone"],
+                type="filepath",
+                label="Audio Input (WAV/MP3, max 6s)"
+            )
+            language = gr.Radio(
+                choices=["english", "tamil", "sinhala"],
+                value="english",
+                label="Language",
+                info="Select the language spoken — affects normalization"
+            )
+            mode = gr.Radio(
+                choices=["fusion", "gemaps", "ensemble"],
+                value="ensemble",
+                label="Inference Mode",
+                info="ensemble is most robust | gemaps is fastest | fusion is highest accuracy on English/Tamil"
+            )
+            btn = gr.Button("Detect Emotion", variant="primary")
+        with gr.Column():
+            out_text = gr.Markdown()
+            out_plot = gr.Plot(label="Confidence")
+    # Input order must match run_inference's parameters; output order must
+    # match its (markdown, figure) return tuple.
+    btn.click(
+        fn=run_inference,
+        inputs=[audio_in, language, mode],
+        outputs=[out_text, out_plot],
+    )
+    # Blank lines separate the Markdown blocks; without them the
+    # "Emotions" and "Modes" lines collapse into a single paragraph.
+    gr.Markdown("""
+    ---
+
+    **Emotions:** Neutral · Happy · Sad · Angry · Fear
+
+    **Modes:**
+
+    - `fusion` — Whisper-tiny encoder + eGeMAPS (best on English & Tamil)
+    - `gemaps` — 88 acoustic features only, language-agnostic, ~50ms
+    - `ensemble` — 60% fusion + 40% gemaps (recommended for Sinhala)
+
+    > Selecting the correct language is important — the model applies
+    > language-specific normalization that was learned during training.
+    """)
+
+
+if __name__ == "__main__":
+    demo.launch()