st192011 committed on
Commit
18da8b7
·
verified ·
1 Parent(s): 3a1b154

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.linear_model import LogisticRegression
6
+ from datasets import load_dataset
7
+ import random
8
+
9
print("📦 Phase 1: Loading Pre-calculated Assets...")
# Cached per-sample table (true label, speaker, zero-shot prediction) and the
# embedding matrix. process_sample() reads both with the same positional index.
df_cached = pd.read_csv("emodb_full_zeroshot_predictions.csv")
X_embeddings = np.load("emodb_full_embeddings.npy")

print("🧠 Phase 2: Dynamically Training Both Linear Classification Heads...")
# --- Head A: The Global 80/20 Head ---
# Stratified 80/20 split over all samples; only the training part is used to
# fit the head in the visible code (the held-out part is kept for reference).
labels = df_cached['True_Emotion'].values
indices = np.arange(len(labels))
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_embeddings, labels, indices, test_size=0.20, random_state=42, stratify=labels
)
global_head = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
global_head.fit(X_train, y_train)

# --- Head B: The Cross-Speaker Head (Train on Speaker 31 & 34) ---
train_speakers = ['Speaker_31.0', 'Speaker_34.0']
cross_train_mask = df_cached['Speaker_Info'].isin(train_speakers)
# Index the NumPy matrix with a plain boolean array, and select the labels
# with a single .loc lookup instead of chained indexing.
X_train_cross = X_embeddings[cross_train_mask.to_numpy()]
y_train_cross = df_cached.loc[cross_train_mask, 'True_Emotion'].values

cross_head = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
cross_head.fit(X_train_cross, y_train_cross)

print("🌍 Phase 3: Attaching to EmoDB on Hugging Face Hub for Audio Streaming...")
hf_dataset = load_dataset("harritaylor/er_emodb", split="train")
34
+
35
+ # --- UI Functions ---
36
def process_sample(index):
    """Collect the audio, metadata and predictions for one sample.

    Args:
        index: Position of the sample; coerced with ``int()`` before use, so
            numeric values such as floats are accepted.

    Returns:
        A 7-tuple: ``(sampling_rate, audio_array)``, the row's
        ``Speaker_Info``, ``True_Emotion`` and ``Model_Prediction`` values,
        the global head's prediction, the cross-speaker head's prediction,
        and a ``{class_label: probability}`` dict from the global head.
    """
    position = int(index)

    # Audio comes from the Hub dataset; metadata from the cached CSV row.
    audio = hf_dataset[position]['audio']
    playback = (audio['sampling_rate'], audio['array'])
    cached_row = df_cached.iloc[position]

    # Both heads expect a 2-D (1, n_features) input.
    features = X_embeddings[position].reshape(1, -1)
    prediction_global = global_head.predict(features)[0]
    prediction_cross = cross_head.predict(features)[0]

    # Per-class probabilities of the global head, keyed by class label.
    class_probs = global_head.predict_proba(features)[0]
    confidences = {
        label: float(prob)
        for label, prob in zip(global_head.classes_, class_probs)
    }

    return (
        playback,
        cached_row['Speaker_Info'],
        cached_row['True_Emotion'],
        cached_row['Model_Prediction'],
        prediction_global,
        prediction_cross,
        confidences,
    )
61
+
62
def pick_random_index(max_index=None):
    """Return a random sample index between 0 and ``max_index`` inclusive.

    Args:
        max_index: Largest index that may be returned. When omitted, the last
            row position of ``df_cached`` is used, so the range follows the
            loaded data instead of a hard-coded 534.

    Returns:
        int: A random index in ``[0, max_index]``.

    Raises:
        ValueError: Propagated from ``random.randint`` when ``max_index`` is
            negative (for example, an empty table).
    """
    if max_index is None:
        max_index = len(df_cached) - 1
    return random.randint(0, int(max_index))
64
+
65
+ # --- GRADIO INTERFACE ---
66
# Highest selectable sample index, derived from the cached table rather than a
# hard-coded 534 so the slider follows the loaded data.
last_sample_index = len(df_cached) - 1

with gr.Blocks(theme=gr.themes.Default(primary_hue="orange", secondary_hue="neutral")) as demo:
    gr.Markdown("# 🚀 Audio LLM Hidden Space Decoder")
    gr.Markdown("### Evidence that an Audio LLM's internal mathematical representations vastly outshine its text outputs.")

    with gr.Tabs():
        # TAB 1: INTERACTIVE EXPLORER
        with gr.TabItem("📊 Dataset Explorer & Evaluation"):
            gr.Markdown("Pick an index manually or hit 'Pick Random Sample' to stream audio directly from EmoDB and compare all three execution layers.")

            with gr.Row():
                index_slider = gr.Slider(minimum=0, maximum=last_sample_index, step=1, value=0, label="Select Audio Index")
                random_btn = gr.Button("🎲 Pick Random Sample", variant="secondary")
                analyze_btn = gr.Button("⚡ Analyze Sample", variant="primary")

            with gr.Row():
                audio_player = gr.Audio(label="Audio Playback (Streamed from HF)")
                speaker_out = gr.Textbox(label="Speaker ID")
                true_out = gr.Textbox(label="Ground Truth (Human Label)")

            with gr.Row():
                zs_out = gr.Textbox(label="1. Zero-Shot Text Generation Prediction")
                global_out = gr.Textbox(label="2. Global 80/20 Embedding Head Prediction")
                cross_out = gr.Textbox(label="3. Leave-Speakers-Out Head Prediction")

            confidence_chart = gr.Label(label="Global Embedding Classifier Class Probabilities")

            # Components filled by process_sample, in the order of its return
            # tuple. Defined once so both buttons stay in sync.
            sample_outputs = [
                audio_player,
                speaker_out,
                true_out,
                zs_out,
                global_out,
                cross_out,
                confidence_chart,
            ]

            # Button mappings
            analyze_btn.click(
                process_sample,
                inputs=[index_slider],
                outputs=sample_outputs,
            )
            # The random button first moves the slider, then analyzes the
            # sample at the new slider value.
            random_btn.click(pick_random_index, outputs=[index_slider]).then(
                process_sample,
                inputs=[index_slider],
                outputs=sample_outputs,
            )

        # TAB 2: TECHNICAL REPORT & STATS
        with gr.TabItem("📜 Methodological Report & Statistics"):
            gr.Markdown("""
            ## Decoding Audio LLM Hidden Spaces
            **An Empirical Comparison of Information Extraction Methods on Qwen2-Audio-7B-Instruct**

            ### 📌 Executive Summary
            When forcing a large multimodal model to output speech interpretations as text tokens, a massive **information bottleneck** occurs. This dashboard showcases that extracting the raw mathematical embeddings hidden behind the model's text decoder unlocks an entirely new layer of granular acoustic intelligence.

            ### 📊 Comparative Performance Summary
            """)

            # Main comparison table
            gr.Markdown("""
            | Evaluation Architecture | Test Method | Dataset Coverage | Accuracy |
            | :--- | :--- | :--- | :--- |
            | **Zero-Shot Text Prompting** | Direct Generation | Full Dataset (535 files) | **67.3%** |
            | **Linear Embedding Classifier** | Stratified 80/20 Split | Unseen 20% Subset | **97.2%** |
            | **Linear Embedding Classifier** | Cross-Speaker Generalization | 6 Unseen Speakers (Blind) | **92.2%** |
            """)

            gr.Markdown("""
            ### 🌍 Cross-Speaker Generalization Breakdown
            To determine if the internal representation generalizes across unique human vocal anatomies, accents, and pitches, we trained a linear classifier **strictly on 2 speakers** (Speaker 31 and 34) and evaluated blindly on the remaining **6 unseen speakers**.

            The results confirm a highly robust, universal acoustic map:
            """)

            # Speaker breakdown table
            gr.Markdown("""
            | Unseen Test Speaker ID | Extracted Audio Samples | Downstream Classification Accuracy |
            | :--- | :--- | :--- |
            | **Speaker_21.0** | 43 samples | **88.4%** |
            | **Speaker_32.0** | 99 samples | **91.9%** |
            | **Speaker_26.0** | 55 samples | **85.5%** |
            | **Speaker_30.0** | 35 samples | **91.4%** |
            | **Speaker_35.0** | 69 samples | **97.1%** |
            | **Speaker_25.0** | 56 samples | **96.4%** |
            | **COMBINED BLIND AVERAGE** | **357 samples** | **92.2%** |
            """)

            gr.Markdown("""
            ### 🔑 Primary Insights & Observations
            1. **The Linear Advantage:** Complex non-linear architectures (XGBoost, Random Forests) easily fall prey to overfitting due to high dimensionality ($4096\\text{D}$) and low sample sizes. Simple `LogisticRegression` bounds generalize beautifully.
            2. **Acoustic Edge Cases:** Misclassifications are bounded tightly by the physics of sound. The embedding head's rare failures occur strictly between acoustic "twins" like *Boredom/Neutral* (shared low-energy profiles) or *Anger/Fear* (shared high-energy profiles).
            3. **The Synergistic Save:** In rare instances where raw audio signals blur high-arousal acoustics, the textual deep reasoning layers of Qwen occasionally navigate structural nuances to succeed where raw vectors misalign.
            """)

demo.launch()