othsueh
/

clean-jazz-186

Audio Classification

wav2vec2-emodualhead

emotion-recognition

Model card Files Files and versions

othsueh committed on May 13, 2025

Commit

2b7a995

·

verified ·

1 Parent(s): aa9901a

Update handler.py

Files changed (1) hide show

handler.py +20 -4

handler.py CHANGED Viewed

@@ -10,6 +10,7 @@ class EndpointHandler():
     def __init__(self, model_dir: str, **kwargs: Any) -> None:
         # Load config and model with trust_remote_code
         device = 'cuda'
         self.model = UpstreamFinetune.from_pretrained(
             model_dir,
             device=device,
@@ -25,14 +26,29 @@ class EndpointHandler():
         waveform, sr = torchaudio.load(io.BytesIO(audio))
         if sr != sampling_rate:
             waveform = torchaudio.functional.resample(waveform, sr, sampling_rate)
         # Forward pass
         with torch.no_grad():
             cat_logits, reg_outputs = self.model(
                 waveform,
                 sampling_rate
             )
-        # Postprocess to Python types
-        return [
-            { "label": "arousal", "score" : reg_outputs[0]},
-            { "label": "valence", "score": reg_outputs[1]}
         ]

     def __init__(self, model_dir: str, **kwargs: Any) -> None:
         # Load config and model with trust_remote_code
         device = 'cuda'
+        self.emotions = ['angry', 'sad', 'disgust', 'contempt', 'fear', 'neutral', 'surprise', 'happy']
         self.model = UpstreamFinetune.from_pretrained(
             model_dir,
             device=device,
         waveform, sr = torchaudio.load(io.BytesIO(audio))
         if sr != sampling_rate:
             waveform = torchaudio.functional.resample(waveform, sr, sampling_rate)
         # Forward pass
         with torch.no_grad():
             cat_logits, reg_outputs = self.model(
                 waveform,
                 sampling_rate
             )
+        # Convert logits to probabilities using softmax
+        emotion_probs = torch.nn.functional.softmax(cat_logits, dim=1)
+        # Create emotion predictions
+        emotion_predictions = []
+        for i, emotion in enumerate(self.emotions):
+            emotion_predictions.append({
+                "label": emotion,
+                "score": float(emotion_probs[0, i])  # Convert tensor to float
+            })
+        # Add arousal and valence predictions
+        result = emotion_predictions + [
+            {"label": "arousal", "score": float(reg_outputs[0, 0])},
+            {"label": "valence", "score": float(reg_outputs[0, 1])}
         ]
+        return result