import base64
import io
import os
from typing import Any, Dict, List

import torch
import torchaudio
from transformers import AutoConfig, AutoProcessor

from modeling_upstream_finetune import UpstreamFinetune

class EndpointHandler:
    """Inference handler that scores an audio clip for emotion.

    The model is loaded once at construction. Each call decodes the audio
    payload, resamples it when its rate differs from the requested one, runs
    the model, and returns one ``{"label", "score"}`` entry per emotion
    followed by ``arousal`` and ``valence`` entries.
    """

    def __init__(self, model_dir: str, **kwargs: Any) -> None:
        """Load the fine-tuned model from ``model_dir``.

        Args:
            model_dir: Location passed to ``UpstreamFinetune.from_pretrained``.
            **kwargs: Accepted for caller compatibility; not used here.
        """
        # Fall back to CPU so the handler still starts on hosts without a GPU.
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

        # Column i of the categorical output is reported under emotions[i],
        # so this order has to match the order the model was trained with.
        self.emotions = ['neutral', 'happy', 'sad', 'angry', 'surprise', 'contempt']

        self.model = UpstreamFinetune.from_pretrained(
            model_dir,
            device=self.device,
        )
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run emotion inference on one encoded audio clip.

        Args:
            data: Request payload. ``data["inputs"]`` holds the encoded audio
                file either as a bytes-like object or as a base64 string.
                ``data["sampling_rate"]`` is optional and defaults to 16000;
                the audio is resampled to it before the forward pass.

        Returns:
            A list of ``{"label": str, "score": float}`` dicts: one per entry
            of ``self.emotions`` (softmax probabilities), then ``"arousal"``
            and ``"valence"``.

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` key.
            TypeError: If ``data["inputs"]`` is neither bytes-like nor ``str``.
            ValueError: If a string payload is not valid base64.
        """
        audio_bytes = self._to_audio_bytes(data["inputs"])
        # JSON clients may send the rate as a string; normalise it once.
        sampling_rate = int(data.get("sampling_rate", 16000))

        # Decode the encoded audio file into a waveform tensor.
        waveform, source_rate = torchaudio.load(io.BytesIO(audio_bytes))
        if source_rate != sampling_rate:
            waveform = torchaudio.functional.resample(
                waveform, source_rate, sampling_rate
            )

        # NOTE(review): the waveform is passed as loaded (CPU); presumably the
        # model moves it to its own device -- confirm against UpstreamFinetune.
        # NOTE(review): only row 0 of the outputs is reported below; confirm
        # how the model treats multi-channel audio.
        with torch.no_grad():
            cat_logits, reg_outputs = self.model(waveform, sampling_rate)

        return self._format_predictions(cat_logits, reg_outputs)

    @staticmethod
    def _to_audio_bytes(payload: Any) -> bytes:
        """Normalise the request payload to raw bytes of the audio file.

        Bytes-like payloads are returned as ``bytes``; ``str`` payloads are
        treated as base64 and decoded.
        """
        if isinstance(payload, (bytes, bytearray, memoryview)):
            return bytes(payload)
        if isinstance(payload, str):
            try:
                return base64.b64decode(payload)
            except ValueError as err:  # binascii.Error is a ValueError
                raise ValueError(
                    "`inputs` is a string but not valid base64-encoded audio"
                ) from err
        raise TypeError(
            "`inputs` must be bytes-like or a base64 string, got "
            f"{type(payload).__name__}"
        )

    def _format_predictions(
        self, cat_logits: torch.Tensor, reg_outputs: torch.Tensor
    ) -> List[Dict[str, Any]]:
        """Convert row 0 of the model outputs into label/score dicts.

        ``cat_logits`` is softmaxed over dimension 1; ``reg_outputs`` is
        assumed to hold arousal in column 0 and valence in column 1.
        """
        # One host transfer per tensor instead of one per label.
        probabilities = torch.nn.functional.softmax(cat_logits, dim=1)[0].tolist()
        regression = reg_outputs[0].tolist()

        predictions: List[Dict[str, Any]] = [
            {"label": emotion, "score": probabilities[index]}
            for index, emotion in enumerate(self.emotions)
        ]
        predictions.append({"label": "arousal", "score": regression[0]})
        predictions.append({"label": "valence", "score": regression[1]})
        return predictions