Raemih committed on
Commit
e437e52
·
1 Parent(s): 3370729

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -0
app.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import matplotlib
# Select the non-interactive Agg backend BEFORE importing pyplot so figures
# can be rendered in a headless server environment (no display required).
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from model_utils import load_models, predict

# Load models once at import time so requests don't pay the startup cost.
# NOTE(review): presumably loads weights from the current directory —
# confirm against model_utils.load_models.
print("Loading models...")
load_models(model_dir=".")
print("Ready.")

# Closed set of emotion classes, with per-class display emoji and bar colors
# used by the probability chart in run_inference.
EMOTION_LABELS = ['neutral', 'happy', 'sad', 'angry', 'fear']
EMOJI = {'neutral': '😐', 'happy': '😊', 'sad': '😢', 'angry': '😠', 'fear': '😨'}
COLORS = {'neutral': '#95a5a6', 'happy': '#2ecc71', 'sad': '#3498db', 'angry': '#e74c3c', 'fear': '#e67e22'}
def run_inference(audio_path, language, mode):
    """Predict the emotion in an audio clip and build the result view.

    Parameters
    ----------
    audio_path : str | None
        Filesystem path to the uploaded/recorded clip (Gradio
        ``type="filepath"``), or ``None`` when nothing was provided.
    language : str
        Spoken language; forwarded to ``predict``, which (per the UI text)
        applies language-specific normalization.
    mode : str
        Inference mode ("fusion" | "gemaps" | "ensemble"); forwarded to
        ``predict``.

    Returns
    -------
    tuple
        ``(markdown_summary, figure)`` where ``figure`` is a matplotlib
        bar chart of class probabilities, or ``(error_message, None)``
        when input is missing or prediction fails.
    """
    if audio_path is None:
        return "Please upload or record audio first.", None
    try:
        # assumes predict returns a mapping of emotion label -> probability
        # — TODO confirm against model_utils.predict
        probs = predict(audio_path, language=language, mode=mode)
    except Exception as e:
        # UI boundary: surface the failure as text instead of crashing the app.
        return f"Error: {e}", None

    # Rank classes by descending probability; the first entry is the winner.
    sorted_probs = sorted(probs.items(), key=lambda x: -x[1])
    top, top_conf = sorted_probs[0]

    result_md = (
        f"## {EMOJI.get(top, '')} {top.upper()}\n\n"
        f"**Confidence:** {top_conf:.1%}\n\n"
        f"**Language:** {language} | **Mode:** {mode}"
    )

    # Horizontal bar chart, highest probability on top (invert_yaxis below).
    fig, ax = plt.subplots(figsize=(6, 3.2))
    emos = [e for e, _ in sorted_probs]
    vals = [p for _, p in sorted_probs]
    cols = [COLORS.get(e, "#bdc3c7") for e in emos]
    bars = ax.barh(emos, vals, color=cols, height=0.5, edgecolor="none")
    for bar, val in zip(bars, vals):
        # Annotate each bar with its probability just past the bar's end.
        ax.text(val + 0.01, bar.get_y() + bar.get_height() / 2,
                f"{val:.1%}", va="center", fontsize=9)
    ax.set_xlim(0, 1.05)  # small headroom so the 100% label isn't clipped
    ax.set_xlabel("Probability")
    ax.set_title("Emotion Probabilities", fontweight="bold")
    ax.invert_yaxis()
    ax.spines[["top", "right", "left"]].set_visible(False)
    plt.tight_layout()
    # Fix: detach the figure from pyplot's global registry so repeated
    # requests don't accumulate open figures (matplotlib keeps every
    # un-closed figure alive and warns after ~20). The Figure object is
    # still valid and is rendered by gr.Plot.
    plt.close(fig)

    return result_md, fig
# --- Gradio UI -------------------------------------------------------------
# Two-column layout: inputs + button on the left, result text and the
# probability chart on the right. The declaration is purely configuration;
# run_inference does the actual work.
with gr.Blocks(title="Multilingual SER", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Multilingual Speech Emotion Recognition
    Detects emotion in **Sinhala**, **Tamil**, and **English** speech.
    """)

    with gr.Row():
        with gr.Column():
            # type="filepath" hands run_inference a path string (or None).
            audio_in = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Audio Input (WAV/MP3, max 6s)"
            )
            language = gr.Radio(
                choices=["english", "tamil", "sinhala"],
                value="english",
                label="Language",
                info="Select the language spoken — affects normalization"
            )
            mode = gr.Radio(
                choices=["fusion", "gemaps", "ensemble"],
                value="ensemble",
                label="Inference Mode",
                info="ensemble is most robust | gemaps is fastest | fusion is highest accuracy on English/Tamil"
            )
            btn = gr.Button("Detect Emotion", variant="primary")

        with gr.Column():
            out_text = gr.Markdown()
            out_plot = gr.Plot(label="Confidence")

    # Wire the button: (audio, language, mode) -> (markdown, figure).
    btn.click(run_inference, [audio_in, language, mode], [out_text, out_plot])

    # Static footer documenting the label set and the three inference modes.
    gr.Markdown("""
    ---
    **Emotions:** Neutral · Happy · Sad · Angry · Fear

    **Modes:**
    - `fusion` — Whisper-tiny encoder + eGeMAPS (best on English & Tamil)
    - `gemaps` — 88 acoustic features only, language-agnostic, ~50ms
    - `ensemble` — 60% fusion + 40% gemaps (recommended for Sinhala)

    > Selecting the correct language is important — the model applies
    > language-specific normalization that was learned during training.
    """)


# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()