Update app.py
app.py CHANGED
@@ -18,7 +18,7 @@ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME).to(DEVICE)
 model.eval()
 
-# Use the model
+# Use the model's label mapping directly
 config = AutoConfig.from_pretrained(MODEL_NAME)
 LABELS = [config.id2label[i] for i in range(len(config.id2label))]
 
@@ -40,55 +40,78 @@ def predict(audio):
     try:
         if audio is None:
             return {"Error": "No audio provided"}, None
 
         sr, data = audio
         data = np.array(data, dtype=np.float32)
 
         # Stereo -> Mono
         if len(data.shape) > 1:
             data = np.mean(data, axis=1)
 
         # Resample to 16kHz
         if sr != 16000:
             data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
             sr = 16000
 
-        #
-
-
-
+        # Improved normalization - normalize to [-1, 1] range
+        # Check if data is in int16 range or already normalized
+        if np.abs(data).max() > 1.0:
+            data = data / np.abs(data).max()  # Normalize by max value
+
+        # Apply gentle audio preprocessing to improve feature extraction
+        # Remove DC offset
+        data = data - np.mean(data)
+
+        # Apply light pre-emphasis filter to balance frequencies
+        pre_emphasis = 0.97
+        data = np.append(data[0], data[1:] - pre_emphasis * data[:-1])
+
+        # Feature extraction with proper padding
         inputs = feature_extractor(
             data,
             sampling_rate=sr,
             return_tensors="pt",
-            padding=True
+            padding=True,
+            max_length=16000 * 10,  # Max 10 seconds
+            truncation=True
         )
 
         # Move tensors to device
         for k in inputs:
             inputs[k] = inputs[k].to(DEVICE)
 
         # Forward pass
         with torch.no_grad():
             logits = model(**inputs).logits
+
+        # Apply temperature scaling to reduce overconfidence
+        # Higher temperature = more uniform distribution
+        temperature = 1.5
+        logits = logits / temperature
+
         probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu().numpy()
 
-        #
-
-
-
+        # Show ALL emotions with their scores (not just top 3)
+        result = {}
+        for i, label in enumerate(LABELS):
+            emoji = EMOJIS.get(label, '')
+            result[f"{label} {emoji}"] = round(float(probs[i]), 4)
+
+        # Sort by probability
+        result = dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
+
         # Generate waveform plot
-        fig, ax = plt.subplots(figsize=(
-
-        ax.
-        ax.
+        fig, ax = plt.subplots(figsize=(8, 3))
+        time_axis = np.linspace(0, len(data) / sr, len(data))
+        ax.plot(time_axis, data, color='purple', linewidth=0.5)
+        ax.set_title("Audio Waveform", fontsize=12, fontweight='bold')
+        ax.set_xlabel("Time (seconds)")
         ax.set_ylabel("Amplitude")
-        ax.
-        ax.set_yticks([])
+        ax.grid(True, alpha=0.3)
         plt.tight_layout()
 
         return result, fig
 
     except Exception as e:
         return {"Error": str(e)}, None
 
@@ -98,19 +121,28 @@ def predict(audio):
 demo = gr.Interface(
     fn=predict,
     inputs=gr.Audio(sources=["upload", "microphone"], type="numpy", label="🎤 Upload or Record Audio"),
-    outputs=[
-
+    outputs=[
+        gr.Label(num_top_classes=7, label="Emotion Probabilities"),
+        gr.Plot(label="Waveform Visualization")
+    ],
+    title="🎧 Audio Emotion Detection",
     description=(
-        "Fine-tuned Wav2Vec2 model
-        "
-        "
-        "
+        "Fine-tuned Wav2Vec2 model for emotion recognition from voice. "
+        "Detects: **Angry, Disgusted, Fearful, Happy, Neutral, Sad, Surprised**.\n\n"
+        "**Tips for better results:**\n"
+        "- Speak clearly and naturally\n"
+        "- Record at least 2-3 seconds of audio\n"
+        "- Avoid background noise\n"
+        "- Try exaggerating emotions for testing\n\n"
+        "Audio is automatically resampled to 16kHz and normalized."
     ),
+    examples=[],
     allow_flagging="never",
+    theme=gr.themes.Soft()
 )
 
 # =========================
 # LAUNCH
 # =========================
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
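
For context on the preprocessing changes: `gr.Audio(type="numpy")` hands `predict()` a `(sample_rate, samples)` tuple, and recordings typically arrive as int16 samples, which is why the new `np.abs(data).max() > 1.0` check exists. Below is a small, self-contained sketch of just the preprocessing branches from the diff, using a made-up stereo test tone instead of a real recording; the model itself is not loaded here.

```python
import numpy as np
import torch
import torchaudio

# Stand-in for what gr.Audio(type="numpy") passes to predict():
# a (sample_rate, samples) tuple; stereo int16 at 44.1 kHz here so that
# every branch below (mono mix-down, resampling, normalization) is exercised.
sr = 44100
t = np.arange(sr * 2) / sr                                      # 2 seconds
tone = (0.3 * np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
audio = (sr, np.stack([tone, tone], axis=1))                    # shape (N, 2)

sr, data = audio
data = np.array(data, dtype=np.float32)

# Stereo -> Mono (as in the diff)
if len(data.shape) > 1:
    data = np.mean(data, axis=1)

# Resample to 16kHz (as in the diff)
if sr != 16000:
    data = torchaudio.functional.resample(torch.tensor(data), sr, 16000).numpy()
    sr = 16000

# int16-range samples are far outside [-1, 1], so the new normalization branch fires
if np.abs(data).max() > 1.0:
    data = data / np.abs(data).max()

print(sr, data.shape, float(data.min()), float(data.max()))    # 16000, (32000,), about -1.0 and 1.0
```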
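
The pre-emphasis line added before feature extraction is a standard first-order FIR high-pass step, y[n] = x[n] - 0.97 * x[n-1]. A quick cross-check sketch, assuming a synthetic sine input and using scipy only to verify the numpy one-liner from the diff:

```python
import numpy as np
from scipy.signal import lfilter

# Synthetic 1-second, 220 Hz tone standing in for speech (made up for this check).
sr = 16000
x = np.sin(2 * np.pi * 220 * np.arange(sr) / sr).astype(np.float32)

pre_emphasis = 0.97
y_numpy = np.append(x[0], x[1:] - pre_emphasis * x[:-1])   # the exact expression from the diff
y_scipy = lfilter([1.0, -pre_emphasis], [1.0], x)          # equivalent first-order FIR filter

# Both attenuate low frequencies relative to high ones; the two outputs match.
print(np.allclose(y_numpy, y_scipy))                       # True
```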
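
The commit also divides the logits by a fixed temperature of 1.5 before the softmax. A minimal sketch of the effect, with hypothetical logit values that are not taken from the model:

```python
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())        # shift by the max for numerical stability
    return e / e.sum()

# Hypothetical 7-class logits (illustration only).
logits = np.array([3.2, 1.1, 0.3, -0.5, -1.0, -1.2, -2.0])

for temperature in (1.0, 1.5, 3.0):
    probs = softmax(logits / temperature)   # same operation as `logits / temperature` in app.py
    print(f"T={temperature}: top probability = {probs.max():.3f}")

# T=1.0 is the raw softmax; T=1.5 (the value in the commit) and larger shrink the
# gaps between logits, so the top probability drops and the distribution flattens.
```

Temperatures above 1 flatten the reported confidences, which is why the top class no longer dominates the `gr.Label` output; temperatures below 1 would sharpen them instead.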