Update app.py
app.py (CHANGED)
@@ -1,7 +1,8 @@
 import gradio as gr
 import torch
-from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 import torchaudio
+import numpy as np
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 
 # =========================
 # CONFIG
@@ -14,39 +15,65 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # =========================
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
+model.eval()
 
-# Emotion labels in
+# Emotion labels in correct order
 LABELS = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
 
 # =========================
-# PREDICTION
+# PREDICTION FUNCTION
 # =========================
 def predict(audio):
-
+    try:
+        if audio is None:
+            return {"Error": "No audio provided"}
+
+        sr, data = audio
+
+        # Ensure it's a NumPy array
+        if isinstance(data, list):
+            data = np.array(data)
+
+        # Convert stereo -> mono if needed
+        if len(data.shape) > 1:
+            data = np.mean(data, axis=1)
+
+        # Remove NaNs or infs
+        if np.isnan(data).any() or np.isinf(data).any():
+            data = np.nan_to_num(data)
+
+        # Resample to 16kHz if necessary
+        if sr != 16000:
+            data = torchaudio.functional.resample(
+                torch.tensor(data, dtype=torch.float32), sr, 16000  # resample requires float input
+            ).numpy()
+            sr = 16000
+
+        # Normalize
+        data = data / np.abs(data).max() if np.abs(data).max() > 0 else data
 
-
-
-
-
+        # Feature extraction
+        inputs = feature_extractor(
+            data,
+            sampling_rate=sr,
+            return_tensors="pt",
+            padding=True
+        ).to(DEVICE)
 
-
-
-
-        return_tensors="pt",
-        padding=True
-    ).to(DEVICE)
+        # Forward pass
+        with torch.no_grad():
+            logits = model(**inputs).logits
+            probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()
 
-
-
-
-    probs = torch.nn.functional.softmax(logits, dim=-1)[0]
-    pred_idx = torch.argmax(probs).item()
+        # Format output
+        result = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
+        return result
 
-
+    except Exception as e:
+        return {"Error": str(e)}
 
 # =========================
-# GRADIO
+# GRADIO APP
 # =========================
 demo = gr.Interface(
     fn=predict,
@@ -57,13 +84,13 @@ demo = gr.Interface(
         "Fine-tuned Wav2Vec2 model (`Hatman/audio-emotion-detection`) "
         "for emotion recognition from voice. "
         "Detects: Angry, Disgusted, Fearful, Happy, Neutral, Sad, and Surprised. "
-        "Audio
+        "Audio must be 16kHz (auto-resampled)."
     ),
     allow_flagging="never",
 )
 
 # =========================
-# LAUNCH
+# LAUNCH
 # =========================
 if __name__ == "__main__":
     demo.launch()
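One behavioral detail in the resampling branch: Gradio's numpy-type audio typically arrives as an int16 array, and `torchaudio.functional.resample` accepts only floating-point tensors, which is why the waveform is cast to float32 before resampling in the hunk above (without the cast, the call raises and the `except` branch returns an error dict). A minimal sketch of the constraint:

```python
import torch
import torchaudio

# int16 waveform, shaped like Gradio's numpy audio
wave = torch.randint(-32768, 32767, (8000,), dtype=torch.int16)

try:
    torchaudio.functional.resample(wave, 8000, 16000)
except (TypeError, RuntimeError) as e:
    print("integer input is rejected:", e)

# Casting to float32 first succeeds
resampled = torchaudio.functional.resample(wave.float(), 8000, 16000)
print(resampled.shape)  # torch.Size([16000])
```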
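Because `predict` takes the raw `(sample_rate, data)` tuple that a numpy-type `gr.Audio` input produces, the new code path can be smoke-tested without launching the interface. A minimal sketch, assuming `app.py` is importable and the model weights download; the file name `smoke_test.py` and the 8 kHz tone are illustrative, not part of the commit:

```python
# smoke_test.py - hypothetical local check, not part of the commit
import numpy as np

from app import predict  # assumes app.py is on the import path

# One second of a 440 Hz tone at 8 kHz, int16, like Gradio's numpy audio
sr = 8000
t = np.linspace(0, 1, sr, endpoint=False)
tone = (0.5 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)

result = predict((sr, tone))  # sr != 16000 exercises the resampling branch
assert "Error" not in result, result
print(max(result, key=result.get), result)
```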