# CombineCorpus_ORG / handler.py
# Custom inference handler (HuggingFace Inference Endpoints).
# Author: othsueh — commit cdc1bf3 ("Create handler.py"), 1.93 kB.
import base64
import io
import os
from typing import Any, Dict, List

import torch
import torchaudio
from transformers import AutoConfig, AutoProcessor

from modeling_upstream_finetune import UpstreamFinetune
class EndpointHandler():
    """Speech-emotion inference handler.

    Loads a fine-tuned ``UpstreamFinetune`` checkpoint once at startup and,
    per request, decodes an audio clip, runs a forward pass, and returns
    categorical emotion probabilities plus arousal/valence regression scores.
    """

    def __init__(self, model_dir: str, **kwargs: Any) -> None:
        """Load the model from *model_dir* onto GPU when available.

        Args:
            model_dir: directory containing the pretrained checkpoint.
            **kwargs: ignored; accepted for endpoint-runner compatibility.
        """
        # Fall back to CPU so the handler also works on CPU-only endpoints
        # (hard-coding 'cuda' crashes when no GPU is present).
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # Label order must match the model's categorical output head.
        self.emotions = ['neutral','happy','sad','angry','surprise','contempt']
        self.model = UpstreamFinetune.from_pretrained(
            model_dir,
            device=device,
        )
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run inference on a single audio clip.

        Args:
            data: request payload. ``data["inputs"]`` holds raw MP3/WAV bytes
                or a base64-encoded string of those bytes; the optional
                ``data["sampling_rate"]`` is the model's expected rate
                (default 16000).

        Returns:
            A list of ``{"label", "score"}`` dicts: one entry per emotion
            (softmax probabilities over the categorical head), followed by
            ``arousal`` and ``valence`` regression outputs.
        """
        audio = data["inputs"]
        # The endpoint may deliver the payload base64-encoded; the original
        # code passed the str straight to BytesIO and raised TypeError.
        if isinstance(audio, str):
            audio = base64.b64decode(audio)
        sampling_rate = data.get("sampling_rate", 16000)
        # Decode MP3/WAV bytes -> (channels, samples) waveform tensor.
        waveform, sr = torchaudio.load(io.BytesIO(audio))
        if sr != sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, sampling_rate)
        # Forward pass; no gradients needed at inference time.
        with torch.no_grad():
            cat_logits, reg_outputs = self.model(
                waveform,
                sampling_rate,
            )
        # Softmax over the class dimension -> per-emotion probabilities.
        emotion_probs = torch.nn.functional.softmax(cat_logits, dim=1)
        result = [
            {"label": emotion, "score": float(emotion_probs[0, i])}
            for i, emotion in enumerate(self.emotions)
        ]
        # Append the two regression heads: arousal then valence.
        result += [
            {"label": "arousal", "score": float(reg_outputs[0, 0])},
            {"label": "valence", "score": float(reg_outputs[0, 1])},
        ]
        return result