File size: 9,992 Bytes
18da8b7
 
 
 
 
83b508c
18da8b7
83b508c
18da8b7
 
 
 
 
 
2987b4b
2e1a0bf
18da8b7
2e1a0bf
 
18da8b7
 
 
 
 
 
 
 
2e1a0bf
 
18da8b7
2e1a0bf
18da8b7
 
 
2e1a0bf
 
83b508c
 
2987b4b
83b508c
 
18da8b7
 
 
 
 
83b508c
 
 
 
 
 
 
 
 
18da8b7
 
 
 
 
 
 
 
 
 
 
 
83b508c
18da8b7
 
 
 
 
 
 
 
 
 
 
 
83b508c
 
18da8b7
 
 
 
 
 
 
 
 
 
 
 
 
 
83b508c
18da8b7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83b508c
18da8b7
 
83b508c
 
 
 
75c2596
83b508c
 
 
 
 
75c2596
83b508c
75c2596
83b508c
 
 
 
 
 
 
 
 
 
548a7be
83b508c
 
 
18da8b7
 
 
 
83b508c
18da8b7
 
 
 
 
 
 
 
 
83b508c
 
 
 
 
 
 
 
 
 
18da8b7
 
83b508c
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
import os
import random
import tempfile

import gradio as gr
import numpy as np
import pandas as pd
from datasets import Audio, load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

print("📦 Phase 1: Loading Pre-calculated Assets...")
# Pre-computed zero-shot predictions (one row per audio sample) and the matching
# embedding matrix. The rest of the script pairs them purely by row position.
df_cached = pd.read_csv("emodb_full_zeroshot_predictions.csv")
X_embeddings = np.load("emodb_full_embeddings.npy")

# Fail fast with a clear message if the two assets are out of sync, instead of
# surfacing later as an opaque shape error during training or indexing.
if len(df_cached) != len(X_embeddings):
    raise ValueError(
        f"Asset mismatch: predictions CSV has {len(df_cached)} rows but the "
        f"embedding matrix has {len(X_embeddings)} rows."
    )

print("🧠 Phase 2: Dynamically Training Both Linear Classification Heads...")
# Cleanse PyArrow strings into native NumPy string arrays to avoid Python 3.13 indexing crashes
labels = df_cached['True_Emotion'].to_numpy().astype(str)
indices = np.arange(len(labels))

# --- Head A: The Global 80/20 Head ---
# Stratified split so every emotion keeps its proportion in both partitions.
# The row indices are split alongside the data so each partition can be traced
# back to its rows in df_cached.
X_train, X_test, y_train, y_test, idx_train, idx_test = train_test_split(
    X_embeddings, labels, indices, test_size=0.20, random_state=42, stratify=labels
)
global_head = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
global_head.fit(X_train, y_train)

# --- Head B: The Cross-Speaker Head (Train on Speaker 31 & 34) ---
train_speakers = ['Speaker_31.0', 'Speaker_34.0']
cross_train_mask = df_cached['Speaker_Info'].isin(train_speakers).to_numpy()

# Guard against a renamed/reformatted speaker column: an all-False mask would
# otherwise reach fit() as an empty training set and fail with an unclear error.
if not cross_train_mask.any():
    raise ValueError(
        f"No rows in 'Speaker_Info' match the training speakers {train_speakers}."
    )

X_train_cross = X_embeddings[cross_train_mask]
y_train_cross = labels[cross_train_mask]

cross_head = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
cross_head.fit(X_train_cross, y_train_cross)
print("✅ Classification heads successfully trained with native NumPy types!")

print("🌍 Phase 3: Attaching to EmoDB on Hugging Face Hub (Metadata Only)...")
# NOTE(review): load_dataset is called without streaming=True, so despite the
# log wording the split is presumably fetched/cached in full — confirm.
hf_dataset = load_dataset("renumics/emodb", split="train")
# Force decode=False to bypass torchcodec/soundfile requirements completely:
# samples then expose the raw encoded file instead of a decoded waveform.
hf_dataset = hf_dataset.cast_column("audio", Audio(decode=False))
print("✅ Dataset streaming connected successfully without audio engine dependencies!")

# --- UI Functions ---
# Maps dataset index -> path of the temporary WAV file already written for that
# sample, so repeated requests reuse one file instead of leaking a new
# never-deleted temp file on every click.
_AUDIO_PATH_CACHE = {}


def _audio_path_for(idx):
    """Return a filesystem path containing the raw audio of dataset sample ``idx``."""
    cached_path = _AUDIO_PATH_CACHE.get(idx)
    if cached_path is not None and os.path.exists(cached_path):
        return cached_path

    # With decode=False the audio column yields the undecoded file as a dict.
    audio = hf_dataset[idx]['audio']
    audio_bytes = audio['bytes']

    if audio_bytes is None:
        # Some dataset layouts reference a file on disk instead of embedding bytes.
        source_path = audio.get('path')
        if source_path and os.path.exists(source_path):
            return source_path
        raise ValueError(
            f"Sample {idx} has neither embedded audio bytes nor a readable file path."
        )

    # delete=False: the file must outlive this call so the browser can play it.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
        temp_file.write(audio_bytes)

    _AUDIO_PATH_CACHE[idx] = temp_file.name
    return temp_file.name


def process_sample(index):
    """Gather playback audio, cached labels and both head predictions for one sample.

    Args:
        index: Position of the sample (e.g. the slider value); coerced with ``int()``.

    Returns:
        A 7-tuple matching the UI outputs, in order: audio file path, speaker
        info, true emotion, cached zero-shot prediction, global-head
        prediction, cross-speaker-head prediction, and a ``{label: probability}``
        dict from the global head.

    Raises:
        ValueError: If the sample provides no audio bytes and no readable path.
    """
    idx = int(index)
    audio_path = _audio_path_for(idx)

    row = df_cached.iloc[idx]

    # Run inference on the stored embedding using both heads (sklearn expects 2-D input)
    vector = X_embeddings[idx].reshape(1, -1)
    global_pred = global_head.predict(vector)[0]
    cross_pred = cross_head.predict(vector)[0]

    # Class probabilities of the global head for visual feedback
    probs = global_head.predict_proba(vector)[0]
    prob_dict = {
        str(label): float(prob)
        for label, prob in zip(global_head.classes_, probs)
    }

    return (
        audio_path,
        row['Speaker_Info'],
        row['True_Emotion'],
        row['Model_Prediction'],
        global_pred,
        cross_pred,
        prob_dict
    )

def pick_random_index(max_index=None):
    """Return a uniformly random sample index in ``[0, max_index]`` (inclusive).

    Args:
        max_index: Largest index that may be returned. When omitted, it is
            derived from the cached predictions table (``len(df_cached) - 1``)
            rather than a hard-coded dataset size.

    Returns:
        A random ``int`` between 0 and ``max_index``, both ends included.

    Raises:
        ValueError: If ``max_index`` is negative (raised by ``random.randint``).
    """
    if max_index is None:
        max_index = len(df_cached) - 1
    return random.randint(0, int(max_index))

# --- GRADIO INTERFACE ---
# Note: Theme configuration moved to launch() to adhere to Gradio 6.0 standards
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 Audio LLM Hidden Space Decoder")
    gr.Markdown("### Evidence that an Audio LLM's internal mathematical representations vastly outshine its text outputs.")

    with gr.Tabs():
        # TAB 1: INTERACTIVE EXPLORER
        with gr.TabItem("📊 Dataset Explorer & Evaluation"):
            gr.Markdown("Pick an index manually or hit 'Pick Random Sample' to stream audio directly from EmoDB and compare all three execution layers.")

            # Highest selectable index, derived from the cached predictions table
            # so the slider always matches the data actually loaded.
            last_index = len(df_cached) - 1

            with gr.Row():
                index_slider = gr.Slider(minimum=0, maximum=last_index, step=1, value=0, label="Select Audio Index")
                random_btn = gr.Button("🎲 Pick Random Sample", variant="secondary")
                analyze_btn = gr.Button("⚡ Analyze Sample", variant="primary")

            with gr.Row():
                audio_player = gr.Audio(label="Audio Playback (Streamed from HF)", type="filepath")
                speaker_out = gr.Textbox(label="Speaker ID")
                true_out = gr.Textbox(label="Ground Truth (Human Label)")

            with gr.Row():
                zs_out = gr.Textbox(label="1. Zero-Shot Text Generation Prediction")
                global_out = gr.Textbox(label="2. Global 80/20 Embedding Head Prediction")
                cross_out = gr.Textbox(label="3. Leave-Speakers-Out Head Prediction")

            confidence_chart = gr.Label(label="Global Embedding Classifier Class Probabilities")

            # Components filled by process_sample, in the order of its return tuple.
            # Shared by both buttons so the two wirings cannot drift apart.
            sample_outputs = [
                audio_player,
                speaker_out,
                true_out,
                zs_out,
                global_out,
                cross_out,
                confidence_chart,
            ]

            # Button mappings
            analyze_btn.click(
                process_sample,
                inputs=[index_slider],
                outputs=sample_outputs
            )
            # Random pick first moves the slider, then analyzes the newly selected index.
            random_btn.click(pick_random_index, outputs=[index_slider]).then(
                process_sample,
                inputs=[index_slider],
                outputs=sample_outputs
            )

        # TAB 2: FULL TECHNICAL REPORT
        with gr.TabItem("📜 Methodological Report & Statistics"):
            gr.Markdown("""
            # 📝 Technical Report: Decoding Audio LLM Hidden Spaces
            **A Comparative Study of Zero-Shot Text Generation vs. Downstream Embedding Classification on EmoDB**

            ### Abstract
            This report evaluates the capacity of the **Qwen2-Audio-7B-Instruct** model to interpret human vocal emotion across 535 audio samples from the Berlin Emotional Speech Database (EmoDB). We contrast direct zero-shot text-prompting against a downstream machine learning layer trained on the model's final hidden-state embeddings (**4096D**). Our experiments demonstrate that while the text-generation layer suffers from an informational bottleneck, the internal embedding space contains an incredibly robust, speaker-independent acoustic map of human emotion.

            ---

            ### Core Findings

            1. **The Representation Is Superior to the Output:** Direct zero-shot text generation yields an overall accuracy of **67.3%**. Extracting the raw **4096D** mathematical vectors and fitting a simple linear classification head achieves **97.2%** accuracy—a **+29.9% absolute performance leap**.
            2. **Universal Cross-Speaker Generalization:** When trained on only two speakers (178 samples) and evaluated blindly on six entirely unseen speakers (357 samples), the embedding head maintains a remarkable **92.2% accuracy**. This proves the model identifies universal acoustic physics of emotion rather than speaker-specific identities.
            3. **The Power of Linear Restraint:** Due to the extreme high-dimensionality low-sample size nature of the data (`N << D`), simple **Logistic Regression** completely outperforms flexible non-linear algorithms (Random Forest, SVM, XGBoost) by resisting overfitting.
            4. **Complementary Cognitive Profiles:** In the rare instances where the embedding head fails on acoustic "twins" (e.g., mistaking a high-arousal *Anger* sample for *Fear*), the deep reasoning layers of the full text-generation pipeline occasionally correct the mistake.

            ---

            ### Quantitative Performance Comparison

            | Evaluation Strategy | Test Configuration | Dataset Coverage | Accuracy (%) | Error Characteristics |
            | :--- | :--- | :--- | :--- | :--- |
            | **Zero-Shot Text Prompting** | Direct Generation | Full Dataset (535 files) | 67.3% | High variance across vocal pitches |
            | **Linear Embedding Head** | Stratified 80/20 Split | Unseen 20% Subset | **97.2%** | Rare confusion on acoustic twins |
            | **Linear Embedding Head** | Cross-Speaker (Leave-6-Out) | 6 Unseen Speakers (Blind) | **92.2%** | Robust across unique vocal anatomy |

            ---

            ### 🌍 Cross-Speaker Generalization Breakdown
            To determine if the internal representation generalizes across unique human vocal anatomies, accents, and pitches, we trained a linear classifier **strictly on 2 speakers** (Speaker 31 and 34) and evaluated blindly on the remaining **6 unseen speakers**. 

            The results confirm a highly robust, universal acoustic map:

            | Unseen Test Speaker ID | Extracted Audio Samples | Downstream Classification Accuracy |
            | :--- | :--- | :--- |
            | **Speaker_21.0** | 43 samples | **88.4%** |
            | **Speaker_32.0** | 99 samples | **91.9%** |
            | **Speaker_26.0** | 55 samples | **85.5%** |
            | **Speaker_30.0** | 35 samples | **91.4%** |
            | **Speaker_35.0** | 69 samples | **97.1%** |
            | **Speaker_25.0** | 56 samples | **96.4%** |
            | **COMBINED BLIND AVERAGE** | **357 samples** | **92.2%** |

            ---

            ### Key Acoustic Insights

            > 📌 **The Information Bottleneck:** Forcing a 7-billion parameter audio model to compress its total comprehension into a single word token discards massive amounts of emotional nuance. The internal embeddings "know" far more than the text decoder outputs.
            > 
            > 📌 **Acoustic Twins:** The few misclassifications occur strictly between high-arousal pairs (*Anger* vs. *Fear*) or low-arousal pairs (*Boredom* vs. *Neutral*), where the raw physical properties of speech sound nearly identical.
            >
            > 📌 **The Synergistic Save:** In rare instances where raw audio signals blur high-arousal acoustics, the textual deep reasoning layers of Qwen occasionally navigate structural nuances to succeed where raw vectors misalign.
            """)

# The theme is supplied at launch time, following the Gradio 6.0 guidelines.
app_theme = gr.themes.Default(
    primary_hue="orange",
    secondary_hue="neutral",
)
demo.launch(theme=app_theme)