Spaces: Update app.py

app.py CHANGED
@@ -1,8 +1,9 @@
 import gradio as gr
 import torch
-from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
-import numpy as np
 import torchaudio
+import numpy as np
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, AutoConfig
+import matplotlib.pyplot as plt
 
 # =========================
 # CONFIG
@@ -11,61 +12,105 @@ MODEL_NAME = "Hatman/audio-emotion-detection"
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 # =========================
-# LOAD MODEL &
+# LOAD MODEL & FEATURE EXTRACTOR
 # =========================
-
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
+model.eval()
 
-#
-
+# Use the model’s label mapping directly
+config = AutoConfig.from_pretrained(MODEL_NAME)
+LABELS = [config.id2label[i] for i in range(len(config.id2label))]
+
+# Map some emojis to each emotion for fun UI
+EMOJIS = {
+    "Angry": "😡",
+    "Disgusted": "🤢",
+    "Fearful": "😨",
+    "Happy": "😄",
+    "Neutral": "😐",
+    "Sad": "😢",
+    "Surprised": "😲"
+}
 
 # =========================
-# PREDICTION
+# PREDICTION FUNCTION
 # =========================
 def predict(audio):
-    … [24 removed lines: the old predict() body is not recoverable from this capture; only the fragment "data =" survives]
+    try:
+        if audio is None:
+            return {"Error": "No audio provided"}, None
+
+        sr, data = audio
+        data = np.array(data, dtype=np.float32)
+
+        # Stereo -> Mono
+        if len(data.shape) > 1:
+            data = np.mean(data, axis=1)
+
+        # Resample to 16kHz
+        if sr != 16000:
+            data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
+            sr = 16000
+
+        # Normalize for Wav2Vec2
+        data = data / 32768.0
+
+        # Feature extraction
+        inputs = feature_extractor(
+            data,
+            sampling_rate=sr,
+            return_tensors="pt",
+            padding=True
+        )
+
+        # Move tensors to device
+        for k in inputs:
+            inputs[k] = inputs[k].to(DEVICE)
+
+        # Forward pass
+        with torch.no_grad():
+            logits = model(**inputs).logits
+            probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()
+
+        # Format top 3 results with emojis
+        top_idx = np.argsort(probs)[::-1][:3]
+        result = {f"{LABELS[i]} {EMOJIS.get(LABELS[i], '')}": round(float(probs[i]), 4) for i in top_idx}
+
+        # Generate waveform plot
+        fig, ax = plt.subplots(figsize=(6,2))
+        ax.plot(data, color='purple')
+        ax.set_title("Audio Waveform")
+        ax.set_xlabel("Samples")
+        ax.set_ylabel("Amplitude")
+        ax.set_xticks([])
+        ax.set_yticks([])
+        plt.tight_layout()
+
+        return result, fig
+
+    except Exception as e:
+        return {"Error": str(e)}, None
 
 # =========================
-# GRADIO
+# GRADIO APP
 # =========================
 demo = gr.Interface(
     fn=predict,
-    inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="Upload or Record Audio"),
-    outputs=gr.Label(num_top_classes=3),
+    inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Upload or Record Audio"),
+    outputs=[gr.Label(num_top_classes=3), gr.Plot()],
     title="Audio Emotion Detection 🎧",
     description=(
-        "Wav2Vec2 emotion … [the rest of the old three-line description is truncated in this capture]
+        "Fine-tuned Wav2Vec2 model (`Hatman/audio-emotion-detection`) "
+        "for emotion recognition from voice. "
+        "Detects: Angry, Disgusted, Fearful, Happy, Neutral, Sad, Surprised. "
+        "Audio is auto-resampled to 16kHz."
     ),
     allow_flagging="never",
 )
 
 # =========================
-# LAUNCH
+# LAUNCH
 # =========================
 if __name__ == "__main__":
     demo.launch()
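Summary of the change, as far as the capture shows: the previous build imported Wav2Vec2Processor and numpy (its predict() body is not recoverable from this capture); the new build loads a Wav2Vec2FeatureExtractor next to the model (an audio-classification checkpoint has no tokenizer, so the bare feature extractor is the right wrapper), switches the model to eval mode, derives LABELS from config.id2label instead of hard-coding the list, decorates labels with emojis, wraps inference in try/except so failures surface in the UI instead of crashing the Space, and returns a waveform plot through a second output component. One caveat: allow_flagging="never" is deprecated in recent Gradio releases (replaced by flagging_mode), so that line may log a warning depending on the Space's Gradio version. The full file after the change follows below, then a local smoke test.

One detail worth checking (a sketch, not part of the commit): EMOJIS is keyed by capitalized label names, and EMOJIS.get(label, '') silently drops the emoji when the checkpoint's id2label uses different casing. To confirm the actual spelling:

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("Hatman/audio-emotion-detection")
    print(config.id2label)  # lowercase labels here would mean the emoji lookup always falls back to ''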
app.py after this commit:

import gradio as gr
import torch
import torchaudio
import numpy as np
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, AutoConfig
import matplotlib.pyplot as plt

# =========================
# CONFIG
# =========================
MODEL_NAME = "Hatman/audio-emotion-detection"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# =========================
# LOAD MODEL & FEATURE EXTRACTOR
# =========================
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()

# Use the model’s label mapping directly
config = AutoConfig.from_pretrained(MODEL_NAME)
LABELS = [config.id2label[i] for i in range(len(config.id2label))]

# Map some emojis to each emotion for fun UI
EMOJIS = {
    "Angry": "😡",
    "Disgusted": "🤢",
    "Fearful": "😨",
    "Happy": "😄",
    "Neutral": "😐",
    "Sad": "😢",
    "Surprised": "😲"
}

# =========================
# PREDICTION FUNCTION
# =========================
def predict(audio):
    try:
        if audio is None:
            return {"Error": "No audio provided"}, None

        sr, data = audio
        data = np.array(data, dtype=np.float32)

        # Stereo -> Mono
        if len(data.shape) > 1:
            data = np.mean(data, axis=1)

        # Resample to 16kHz
        if sr != 16000:
            data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
            sr = 16000

        # Normalize for Wav2Vec2
        data = data / 32768.0

        # Feature extraction
        inputs = feature_extractor(
            data,
            sampling_rate=sr,
            return_tensors="pt",
            padding=True
        )

        # Move tensors to device
        for k in inputs:
            inputs[k] = inputs[k].to(DEVICE)

        # Forward pass
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()

        # Format top 3 results with emojis
        top_idx = np.argsort(probs)[::-1][:3]
        result = {f"{LABELS[i]} {EMOJIS.get(LABELS[i], '')}": round(float(probs[i]), 4) for i in top_idx}

        # Generate waveform plot
        fig, ax = plt.subplots(figsize=(6,2))
        ax.plot(data, color='purple')
        ax.set_title("Audio Waveform")
        ax.set_xlabel("Samples")
        ax.set_ylabel("Amplitude")
        ax.set_xticks([])
        ax.set_yticks([])
        plt.tight_layout()

        return result, fig

    except Exception as e:
        return {"Error": str(e)}, None

# =========================
# GRADIO APP
# =========================
demo = gr.Interface(
    fn=predict,
    inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Upload or Record Audio"),
    outputs=[gr.Label(num_top_classes=3), gr.Plot()],
    title="Audio Emotion Detection 🎧",
    description=(
        "Fine-tuned Wav2Vec2 model (`Hatman/audio-emotion-detection`) "
        "for emotion recognition from voice. "
        "Detects: Angry, Disgusted, Fearful, Happy, Neutral, Sad, Surprised. "
        "Audio is auto-resampled to 16kHz."
    ),
    allow_flagging="never",
)

# =========================
# LAUNCH
# =========================
if __name__ == "__main__":
    demo.launch()
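A minimal smoke test for the new predict() outside the Gradio UI. This is a sketch under two assumptions: the file is saved as app.py, and importing it is acceptable (the model is downloaded and the Interface built at import time; demo.launch() stays behind the __main__ guard, so no server starts). gr.Audio(type="numpy") hands predict a (sample_rate, array) tuple, typically int16, which the synthetic tone below imitates:

    import numpy as np
    from app import predict, LABELS  # importing app loads the model; slow on first run

    # One second of a 440 Hz tone, int16-scaled like Gradio's "numpy" audio payloads
    sr = 16000
    t = np.linspace(0.0, 1.0, sr, endpoint=False)
    wave = (0.3 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)

    result, fig = predict((sr, wave))
    print(LABELS)   # label order comes from config.id2label
    print(result)   # top-3 {"Label emoji": probability} mapping

The int16 cast is deliberate: predict() divides by 32768.0 on the assumption that Gradio delivers int16-range samples, so the test feeds the same range.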