Pant0x committed
Commit 9ee8bab · verified · 1 Parent(s): d961cfc

Update app.py

Files changed (1)
  1. app.py +24 -51
app.py CHANGED
@@ -1,8 +1,7 @@
 import gradio as gr
 import torch
-import torchaudio
-import numpy as np
 from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
+import torchaudio
 
 # =========================
 # CONFIG
@@ -15,65 +14,39 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # =========================
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
-model.eval()
 
-# Emotion labels in correct order
+# Emotion labels in model's order
 LABELS = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
 
 # =========================
-# PREDICTION FUNCTION
+# PREDICTION PIPELINE
 # =========================
 def predict(audio):
-    try:
-        if audio is None:
-            return {"Error": "No audio provided"}
-
-        sr, data = audio
-
-        # Ensure it's a NumPy array
-        if isinstance(data, list):
-            data = np.array(data)
-
-        # Convert stereo -> mono if needed
-        if len(data.shape) > 1:
-            data = np.mean(data, axis=1)
-
-        # Remove NaNs or infs
-        if np.isnan(data).any() or np.isinf(data).any():
-            data = np.nan_to_num(data)
-
-        # Resample to 16kHz if necessary
-        if sr != 16000:
-            data = torchaudio.functional.resample(
-                torch.tensor(data), sr, 16000
-            ).numpy()
-            sr = 16000
-
-        # Normalize
-        data = data / np.abs(data).max() if np.abs(data).max() > 0 else data
+    sr, data = audio
 
-        # Feature extraction
-        inputs = feature_extractor(
-            data,
-            sampling_rate=sr,
-            return_tensors="pt",
-            padding=True
-        ).to(DEVICE)
+    # Resample to 16kHz if needed
+    if sr != 16000:
+        data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
+        sr = 16000
 
-        # Forward pass
-        with torch.no_grad():
-            logits = model(**inputs).logits
-            probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()
+    # Extract features
+    inputs = feature_extractor(
+        data,
+        sampling_rate=sr,
+        return_tensors="pt",
+        padding=True
+    ).to(DEVICE)
 
-        # Format output
-        result = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
-        return result
+    # Forward pass
+    with torch.no_grad():
+        logits = model(**inputs).logits
+        probs = torch.nn.functional.softmax(logits, dim=-1)[0]
+        pred_idx = torch.argmax(probs).item()
 
-    except Exception as e:
-        return {"Error": str(e)}
+    return {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
 
 # =========================
-# GRADIO APP
+# GRADIO INTERFACE
 # =========================
 demo = gr.Interface(
     fn=predict,
@@ -84,13 +57,13 @@ demo = gr.Interface(
         "Fine-tuned Wav2Vec2 model (`Hatman/audio-emotion-detection`) "
        "for emotion recognition from voice. "
        "Detects: Angry, Disgusted, Fearful, Happy, Neutral, Sad, and Surprised. "
-        "Audio must be 16kHz (auto-resampled)."
+        "Audio should be 16kHz for best accuracy."
     ),
     allow_flagging="never",
 )
 
 # =========================
-# LAUNCH
+# LAUNCH APP
 # =========================
 if __name__ == "__main__":
     demo.launch()
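
The slimmed-down predict now relies on Gradio handing it a (sample_rate, samples) tuple and keeps only the torchaudio resample as preprocessing. A minimal sanity check of that path outside the UI (not part of this commit; it assumes app.py is importable as app and substitutes a synthetic float32 waveform for real microphone input):

import numpy as np

from app import predict  # importing app.py also loads the model onto DEVICE

# Synthetic one-second clip at 44.1 kHz, shaped like the tuple gr.Audio delivers;
# float32 samples avoid dtype surprises in torchaudio's resample.
sr = 44100
samples = np.random.uniform(-1.0, 1.0, size=sr).astype(np.float32)

scores = predict((sr, samples))      # dict: each of the 7 emotion labels -> probability
print(max(scores, key=scores.get))   # most likely label for the clip

Note that real Gradio input may arrive as int16 samples, which the removed numpy preprocessing used to tolerate; converting to float before calling predict sidesteps that.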