import base64
import io
import os
from typing import Any, Dict, List

import torch
import torchaudio
from transformers import AutoConfig, AutoProcessor

from modeling_upstream_finetune import UpstreamFinetune


class EndpointHandler():
    """Inference handler that scores an audio clip for emotion, arousal and valence.

    The handler loads an ``UpstreamFinetune`` model once at construction time
    and, on each call, decodes the submitted audio, runs a forward pass and
    returns a flat list of ``{"label", "score"}`` entries.
    """

    def __init__(self, model_dir: str, **kwargs: Any) -> None:
        """Load the model from ``model_dir`` and switch it to eval mode.

        Args:
            model_dir: Directory passed to ``UpstreamFinetune.from_pretrained``.
            **kwargs: Accepted for interface compatibility; not used here.
        """
        # Prefer the GPU, but fall back to CPU so the handler can still be
        # constructed on hosts without CUDA.
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Labels reported for the categorical head, by output index.
        # NOTE(review): presumably this order matches the model's training
        # label order -- confirm against the model definition.
        self.emotions = ['neutral', 'happy', 'sad', 'angry', 'surprise', 'contempt']

        self.model = UpstreamFinetune.from_pretrained(
            model_dir,
            device=device,
        )
        self.model.eval()

    @staticmethod
    def _decode_audio(audio: Any) -> io.BytesIO:
        """Wrap the request payload in a file-like object for ``torchaudio.load``.

        A ``str`` payload is treated as base64 text and decoded to bytes;
        anything else is handed to ``io.BytesIO`` unchanged, which accepts
        bytes-like objects and raises ``TypeError`` otherwise.
        """
        if isinstance(audio, str):
            audio = base64.b64decode(audio)
        return io.BytesIO(audio)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run inference on one audio payload.

        Args:
            data: Request payload. ``data["inputs"]`` holds the encoded audio
                as raw bytes or a base64 string. ``data["sampling_rate"]`` is
                optional and gives the rate the audio is resampled to before
                inference (default 16000).

        Returns:
            A list of ``{"label": str, "score": float}`` dicts: one entry per
            name in ``self.emotions`` carrying its softmax probability,
            followed by ``"arousal"`` and ``"valence"`` entries taken from the
            model's regression output.

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` key.
        """
        audio = data["inputs"]
        sampling_rate = data.get("sampling_rate", 16000)

        # Decode the encoded audio into a waveform tensor and its native rate.
        waveform, sr = torchaudio.load(self._decode_audio(audio))
        if sr != sampling_rate:
            waveform = torchaudio.functional.resample(waveform, sr, sampling_rate)

        # NOTE(review): the waveform is passed through as decoded (all
        # channels, on the CPU); only index 0 of each model output is
        # reported below. Confirm the model's expected input layout and
        # whether it moves inputs to its own device.
        with torch.no_grad():
            cat_logits, reg_outputs = self.model(
                waveform,
                sampling_rate
            )

        # Convert logits to probabilities and pull the first row to the host
        # in a single transfer rather than one element at a time.
        emotion_probs = torch.nn.functional.softmax(cat_logits, dim=1)
        first_row_probs = emotion_probs[0].tolist()

        # Indexing by position (rather than zip) keeps a too-short model
        # output a loud IndexError instead of a silently truncated result.
        emotion_predictions = [
            {"label": emotion, "score": first_row_probs[i]}
            for i, emotion in enumerate(self.emotions)
        ]

        # Append the two regression outputs after the categorical scores.
        result = emotion_predictions + [
            {"label": "arousal", "score": float(reg_outputs[0, 0])},
            {"label": "valence", "score": float(reg_outputs[0, 1])}
        ]

        return result