Spaces:

Pant0x
/

Voice_model

Sleeping

App Files Files Community

Pant0x committed on Nov 13, 2025

Commit

e351a45

verified ·

1 Parent(s): 9ee8bab

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -24

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import gradio as gr
 import torch
-from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 import torchaudio
 # =========================
 # CONFIG
@@ -12,41 +13,74 @@ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # =========================
 # LOAD MODEL & FEATURE EXTRACTOR
 # =========================
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
-# Emotion labels in model's order
 LABELS = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
 # =========================
-# PREDICTION PIPELINE
 # =========================
 def predict(audio):
-    sr, data = audio
-    # Resample to 16kHz if needed
-    if sr != 16000:
-        data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
-        sr = 16000
-    # Extract features
-    inputs = feature_extractor(
-        data,
-        sampling_rate=sr,
-        return_tensors="pt",
-        padding=True
-    ).to(DEVICE)
-    # Forward pass
-    with torch.no_grad():
-        logits = model(**inputs).logits
-        probs = torch.nn.functional.softmax(logits, dim=-1)[0]
-        pred_idx = torch.argmax(probs).item()
-    return {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
 # =========================
-# GRADIO INTERFACE
 # =========================
 demo = gr.Interface(
     fn=predict,
@@ -57,13 +91,13 @@ demo = gr.Interface(
         "Fine-tuned Wav2Vec2 model (`Hatman/audio-emotion-detection`) "
         "for emotion recognition from voice. "
         "Detects: Angry, Disgusted, Fearful, Happy, Neutral, Sad, and Surprised. "
-        "Audio should be 16kHz for best accuracy."
     ),
     allow_flagging="never",
 )
 # =========================
-# LAUNCH APP
 # =========================
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import torch
 import torchaudio
+import numpy as np
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
 # =========================
 # CONFIG
 # =========================
 # LOAD MODEL & FEATURE EXTRACTOR
 # =========================
+print(f"Loading model: {MODEL_NAME}")
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
+model.eval()
+print("Model loaded successfully.")
+# Emotion labels, listed in the model's output (logit index) order
 LABELS = ["Angry", "Disgusted", "Fearful", "Happy", "Neutral", "Sad", "Surprised"]
 # =========================
+# PREDICTION FUNCTION
 # =========================
 def predict(audio):
     """Classify the emotion in an audio clip and return per-label scores.

     Args:
         audio: ``None`` or a ``(sample_rate, samples)`` pair. ``samples`` may
             be a list or a NumPy array; input with more than one dimension is
             averaged over axis 1 (assumes a ``(samples, channels)`` layout --
             TODO confirm against the audio input component).

     Returns:
         A dict mapping each entry of ``LABELS`` to its softmax probability
         rounded to 4 decimals, or ``{"Error": message}`` when no usable audio
         was supplied or processing failed.
     """
     # NOTE(review): failures are reported as a {"Error": <str>} dict; the
     # output component is not visible here -- confirm it accepts a string value.
     try:
         if audio is None:
             return {"Error": "No audio provided"}
         sr, data = audio
         # Accept plain lists as well as arrays.
         data = np.asarray(data)
         # An empty recording would otherwise fail inside ``max()`` below with
         # an opaque NumPy reduction error; report it as missing audio instead.
         if data.size == 0:
             return {"Error": "No audio provided"}
         # Convert stereo -> mono
         if data.ndim > 1:
             data = np.mean(data, axis=1)
         # Resample to the 16 kHz rate the pipeline feeds the extractor.
         if sr != 16000:
             waveform = torch.tensor(data, dtype=torch.float32)
             data = torchaudio.functional.resample(waveform, sr, 16000).numpy()
             sr = 16000
         # Peak-normalize; silent input (peak == 0) is left untouched to avoid
         # dividing by zero.
         peak = np.abs(data).max()
         if peak > 0:
             data = data / peak
         # Make sure dtype and shape are clean
         data = np.asarray(data, dtype=np.float32).flatten()
         # Debug info
         print(f"Sample rate: {sr}, Data shape: {data.shape}, Device: {DEVICE}")
         # Feature extraction
         inputs = feature_extractor(
             data,
             sampling_rate=sr,
             return_tensors="pt",
             padding=True,
         )
         # Move every input tensor to the model's device.
         for key in inputs:
             inputs[key] = inputs[key].to(DEVICE)
         # Forward pass without gradient tracking; take the first batch item.
         with torch.no_grad():
             logits = model(**inputs).logits
             probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()
         result = {
             label: round(float(probs[i]), 4) for i, label in enumerate(LABELS)
         }
         print(f"Predicted: {result}")
         return result
     except Exception as e:
         # Handler boundary: log the failure and hand the message back to the
         # caller instead of letting the exception propagate.
         print(f"ERROR: {e}")
         return {"Error": str(e)}
 # =========================
+# GRADIO APP
 # =========================
 demo = gr.Interface(
     fn=predict,
         "Fine-tuned Wav2Vec2 model (`Hatman/audio-emotion-detection`) "
         "for emotion recognition from voice. "
         "Detects: Angry, Disgusted, Fearful, Happy, Neutral, Sad, and Surprised. "
+        "Audio is auto-resampled to 16kHz."
     ),
     allow_flagging="never",
 )
 # =========================
+# LAUNCH
 # =========================
 if __name__ == "__main__":
     demo.launch()