Pant0x committed on
Commit
4f15c75
·
verified ·
1 Parent(s): 574ac6a

Upload app.py

Files changed (1)
  1. app.py +71 -148
app.py CHANGED
@@ -1,148 +1,71 @@
- import gradio as gr
- import torch
- import torchaudio
- import numpy as np
- from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, AutoConfig
- import matplotlib.pyplot as plt
-
- # =========================
- # CONFIG
- # =========================
- MODEL_NAME = "Hatman/audio-emotion-detection"
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-
- # =========================
- # LOAD MODEL & FEATURE EXTRACTOR
- # =========================
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
- model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
- model.eval()
-
- # Use the model's label mapping directly
- config = AutoConfig.from_pretrained(MODEL_NAME)
- LABELS = [config.id2label[i] for i in range(len(config.id2label))]
-
- # Map some emojis to each emotion for fun UI
- EMOJIS = {
-     "Angry": "😡",
-     "Disgusted": "🤢",
-     "Fearful": "😨",
-     "Happy": "😄",
-     "Neutral": "😐",
-     "Sad": "😢",
-     "Surprised": "😲"
- }
-
- # =========================
- # PREDICTION FUNCTION
- # =========================
- def predict(audio):
-     try:
-         if audio is None:
-             return {"Error": "No audio provided"}, None
-
-         sr, data = audio
-         data = np.array(data, dtype=np.float32)
-
-         # Stereo -> Mono
-         if len(data.shape) > 1:
-             data = np.mean(data, axis=1)
-
-         # Resample to 16kHz
-         if sr != 16000:
-             data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
-             sr = 16000
-
-         # Improved normalization - normalize to [-1, 1] range
-         # Check if data is in int16 range or already normalized
-         if np.abs(data).max() > 1.0:
-             data = data / np.abs(data).max()  # Normalize by max value
-
-         # Apply gentle audio preprocessing to improve feature extraction
-         # Remove DC offset
-         data = data - np.mean(data)
-
-         # Apply light pre-emphasis filter to balance frequencies
-         pre_emphasis = 0.97
-         data = np.append(data[0], data[1:] - pre_emphasis * data[:-1])
-
-         # Feature extraction with proper padding
-         inputs = feature_extractor(
-             data,
-             sampling_rate=sr,
-             return_tensors="pt",
-             padding=True,
-             max_length=16000 * 10,  # Max 10 seconds
-             truncation=True
-         )
-
-         # Move tensors to device
-         for k in inputs:
-             inputs[k] = inputs[k].to(DEVICE)
-
-         # Forward pass
-         with torch.no_grad():
-             logits = model(**inputs).logits
-
-         # Apply temperature scaling to reduce overconfidence
-         # Higher temperature = more uniform distribution
-         temperature = 1.5
-         logits = logits / temperature
-
-         probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()
-
-         # Show ALL emotions with their scores (not just top 3)
-         result = {}
-         for i, label in enumerate(LABELS):
-             emoji = EMOJIS.get(label, '')
-             result[f"{label} {emoji}"] = round(float(probs[i]), 4)
-
-         # Sort by probability
-         result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
-
-         # Generate waveform plot
-         fig, ax = plt.subplots(figsize=(8, 3))
-         time_axis = np.linspace(0, len(data) / sr, len(data))
-         ax.plot(time_axis, data, color='purple', linewidth=0.5)
-         ax.set_title("Audio Waveform", fontsize=12, fontweight='bold')
-         ax.set_xlabel("Time (seconds)")
-         ax.set_ylabel("Amplitude")
-         ax.grid(True, alpha=0.3)
-         plt.tight_layout()
-
-         return result, fig
-
-     except Exception as e:
-         return {"Error": str(e)}, None
-
- # =========================
- # GRADIO APP
- # =========================
- demo = gr.Interface(
-     fn=predict,
-     inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Upload or Record Audio"),
-     outputs=[
-         gr.Label(num_top_classes=7, label="Emotion Probabilities"),
-         gr.Plot(label="Waveform Visualization")
-     ],
-     title="🎧 Audio Emotion Detection",
-     description=(
-         "Fine-tuned Wav2Vec2 model for emotion recognition from voice. "
-         "Detects: **Angry, Disgusted, Fearful, Happy, Neutral, Sad, Surprised**.\n\n"
-         "**Tips for better results:**\n"
-         "- Speak clearly and naturally\n"
-         "- Record at least 2-3 seconds of audio\n"
-         "- Avoid background noise\n"
-         "- Try exaggerating emotions for testing\n\n"
-         "Audio is automatically resampled to 16kHz and normalized."
-     ),
-     examples=[],
-     allow_flagging="never",
-     theme=gr.themes.Soft()
- )
-
- # =========================
- # LAUNCH
- # =========================
- if __name__ == "__main__":
-     demo.launch()
 
+ import gradio as gr
+ import torch
+ from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
+ import numpy as np
+ import torchaudio
+
+ # =========================
+ # CONFIG
+ # =========================
+ MODEL_NAME = "your-username/Audio-Emotion-Detection"  # <- replace with your repo name
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # =========================
+ # LOAD MODEL & PROCESSOR
+ # =========================
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
+ model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
+
+ # Emotion labels in the same order used during training
+ LABELS = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
+
+ # =========================
+ # PREDICTION PIPELINE
+ # =========================
+ def predict(audio):
+     # audio: tuple (sample_rate, numpy array)
+     sr, data = audio
+
+     # Resample to 16 kHz if necessary
+     if sr != 16000:
+         data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
+         sr = 16000
+
+     # Process input (max_length is required when truncation=True)
+     inputs = processor(
+         data,
+         sampling_rate=sr,
+         return_tensors="pt",
+         padding=True,
+         max_length=16000 * 10,  # Max 10 seconds
+         truncation=True
+     ).to(DEVICE)
+
+     # Forward pass
+     with torch.no_grad():
+         logits = model(**inputs).logits
+         probs = torch.nn.functional.softmax(logits, dim=-1)[0]
+
+     return {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
+
+ # =========================
+ # GRADIO INTERFACE
+ # =========================
+ demo = gr.Interface(
+     fn=predict,
+     inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="Upload or Record Audio"),
+     outputs=gr.Label(num_top_classes=3),
+     title="Audio Emotion Detection 🎧",
+     description=(
+         "Fine-tuned Wav2Vec2 model for detecting emotions from voice. "
+         "Supports 7 emotions: Angry, Disgusted, Fearful, Happy, Neutral, Sad, and Surprised. "
+         "Audio is automatically resampled to 16 kHz."
+     ),
+     allow_flagging="never",
+ )
+
+ # =========================
+ # LAUNCH APP
+ # =========================
+ if __name__ == "__main__":
+     demo.launch()
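A quick way to sanity-check the slimmed-down predict() above is the hypothetical smoke test sketched below; it is not part of the commit. It assumes app.py is importable from the working directory and that MODEL_NAME has been replaced with a real checkpoint (the committed placeholder will not load). The 440 Hz tone is synthetic input for a shape/type check, not a meaningful emotion sample.

# Hypothetical smoke test for predict() -- assumes app.py is on the path
# and MODEL_NAME points at a real fine-tuned checkpoint.
import numpy as np

from app import predict  # importing app.py loads the model once

SR = 16000  # already at the target rate, so predict() skips resampling
t = np.linspace(0.0, 2.0, 2 * SR, endpoint=False)
tone = (0.1 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)

scores = predict((SR, tone))  # Gradio delivers (sample_rate, samples) tuples
assert set(scores) == {"Angry", "Disgusted", "Fearful", "Happy",
                       "Neutral", "Sad", "Surprised"}
print(max(scores, key=scores.get), scores)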