Spaces:

Pant0x
/

Voice_model

Sleeping

App Files Community

Pant0x committed on Nov 13, 2025

Commit

63e9297

verified ·

1 Parent(s): e351a45

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -29

app.py CHANGED Viewed

@@ -2,7 +2,8 @@ import gradio as gr
 import torch
 import torchaudio
 import numpy as np
-from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 # =========================
 # CONFIG
@@ -13,14 +14,24 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # =========================
 # LOAD MODEL & FEATURE EXTRACTOR
 # =========================
-print(f"Loading model: {MODEL_NAME}")
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
-print("Model loaded successfully.")
-# Emotion labels in correct order
-LABELS = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
 # =========================
 # PREDICTION FUNCTION
@@ -28,31 +39,22 @@ LABELS = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised
 def predict(audio):
     try:
         if audio is None:
-            return {"Error": "No audio provided"}
         sr, data = audio
-        if isinstance(data, list):
-            data = np.array(data, dtype=np.float32)
-        # Convert stereo -> mono
         if len(data.shape) > 1:
             data = np.mean(data, axis=1)
-        # Resample if needed
         if sr != 16000:
-            waveform = torch.tensor(data, dtype=torch.float32)
-            data = torchaudio.functional.resample(waveform, sr, 16000).numpy()
             sr = 16000
-        # Normalize
-        if np.abs(data).max() > 0:
-            data = data / np.abs(data).max()
-        # Make sure dtype and shape are clean
-        data = np.array(data, dtype=np.float32).flatten()
-        # Debug info
-        print(f"Sample rate: {sr}, Data shape: {data.shape}, Device: {DEVICE}")
         # Feature extraction
         inputs = feature_extractor(
@@ -62,7 +64,7 @@ def predict(audio):
             padding=True
         )
-        # Move to device
         for k in inputs:
             inputs[k] = inputs[k].to(DEVICE)
@@ -71,13 +73,24 @@ def predict(audio):
             logits = model(**inputs).logits
             probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()
-        result = {LABELS[i]: round(float(probs[i]), 4) for i in range(len(LABELS))}
-        print(f"Predicted: {result}")
-        return result
     except Exception as e:
-        print(f"ERROR: {str(e)}")
-        return {"Error": str(e)}
 # =========================
 # GRADIO APP
@@ -85,12 +98,12 @@ def predict(audio):
 demo = gr.Interface(
     fn=predict,
     inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Upload or Record Audio"),
-    outputs=gr.Label(num_top_classes=3),
     title="Audio Emotion Detection 🎧",
     description=(
         "Fine-tuned Wav2Vec2 model (`Hatman/audio-emotion-detection`) "
         "for emotion recognition from voice. "
-        "Detects: Angry, Disgusted, Fearful, Happy, Neutral, Sad, and Surprised. "
         "Audio is auto-resampled to 16kHz."
     ),
     allow_flagging="never",

 import torch
 import torchaudio
 import numpy as np
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, AutoConfig
+import matplotlib.pyplot as plt
 # =========================
 # CONFIG
 # =========================
 # LOAD MODEL & FEATURE EXTRACTOR
 # =========================
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
+# Use the model’s label mapping directly
+config = AutoConfig.from_pretrained(MODEL_NAME)
+LABELS = [config.id2label[i] for i in range(len(config.id2label))]
+# Map some emojis to each emotion for fun UI
+EMOJIS = {
+    "Angry": "😡",
+    "Disgusted": "🤢",
+    "Fearful": "😨",
+    "Happy": "😄",
+    "Neutral": "😐",
+    "Sad": "😢",
+    "Surprised": "😲"
+}
 # =========================
 # PREDICTION FUNCTION
 def predict(audio):
     try:
         if audio is None:
+            return {"Error": "No audio provided"}, None
         sr, data = audio
+        data = np.array(data, dtype=np.float32)
+        # Stereo -> Mono
         if len(data.shape) > 1:
             data = np.mean(data, axis=1)
+        # Resample to 16kHz
         if sr != 16000:
+            data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
             sr = 16000
+        # Normalize for Wav2Vec2
+        data = data / 32768.0
         # Feature extraction
         inputs = feature_extractor(
             padding=True
         )
+        # Move tensors to device
         for k in inputs:
             inputs[k] = inputs[k].to(DEVICE)
             logits = model(**inputs).logits
             probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()
+        # Format top 3 results with emojis
+        top_idx = np.argsort(probs)[::-1][:3]
+        result = {f"{LABELS[i]} {EMOJIS.get(LABELS[i], '')}": round(float(probs[i]), 4) for i in top_idx}
+        # Generate waveform plot
+        fig, ax = plt.subplots(figsize=(6,2))
+        ax.plot(data, color='purple')
+        ax.set_title("Audio Waveform")
+        ax.set_xlabel("Samples")
+        ax.set_ylabel("Amplitude")
+        ax.set_xticks([])
+        ax.set_yticks([])
+        plt.tight_layout()
+        return result, fig
     except Exception as e:
+        return {"Error": str(e)}, None
 # =========================
 # GRADIO APP
 demo = gr.Interface(
     fn=predict,
     inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Upload or Record Audio"),
+    outputs=[gr.Label(num_top_classes=3), gr.Plot()],
     title="Audio Emotion Detection 🎧",
     description=(
         "Fine-tuned Wav2Vec2 model (`Hatman/audio-emotion-detection`) "
         "for emotion recognition from voice. "
+        "Detects: Angry, Disgusted, Fearful, Happy, Neutral, Sad, Surprised. "
         "Audio is auto-resampled to 16kHz."
     ),
     allow_flagging="never",