# handler.py — custom inference endpoint handler for the emotion-av model.
import json
import base64
import io
import os
import sys
from typing import Dict, List, Any

import numpy as np
import torch

# Make the model-repo directory importable so the custom modeling /
# feature-extraction modules that ship next to this handler can be found.
current_dir = os.path.dirname(os.path.abspath(__file__))
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

# Import the generic loaders unconditionally: the handler's fallback paths
# reference AutoModel / AutoFeatureExtractor even when the custom imports
# below succeed. Previously these were imported only inside the except
# branch, which raised NameError whenever the custom load failed at
# from_pretrained time rather than at import time.
from transformers import AutoModel, AutoConfig, AutoFeatureExtractor

try:
    from modeling_emotion_av import EmotionAVModel, EmotionAVConfig
    from feature_extraction_emotion_av import EmotionAVFeatureExtractor
except ImportError as e:
    print(f"Warning: Could not import custom modules: {e}")
    # None sentinels: attribute access on None raises inside __init__'s
    # try blocks, which then fall back to the Auto* classes above.
    EmotionAVModel = EmotionAVConfig = EmotionAVFeatureExtractor = None
class EndpointHandler:
    """Inference endpoint handler for the emotion-av audio model.

    Loads the repo-local custom model / feature extractor from a model
    directory (falling back to the generic transformers Auto* loaders) and
    serves predictions in the Hugging Face audio-classification format:
    ``[{"label": str, "score": float}, ...]`` sorted by descending score.
    """

    def __init__(self, model_dir: str = ""):
        """
        Initialize the handler for the emotion-av model.

        Args:
            model_dir (str): Path to the model directory containing
                ``config.json`` and the model weights.

        Raises:
            FileNotFoundError: If ``config.json`` is missing.
            ValueError: If ``config.json`` is empty.
            json.JSONDecodeError: If ``config.json`` is not valid JSON.
        """
        try:
            print(f"Initializing handler with model_dir: {model_dir}")

            # Fail fast with a clear message if the config is missing,
            # empty, or malformed JSON.
            config_path = os.path.join(model_dir, "config.json")
            if not os.path.exists(config_path):
                raise FileNotFoundError(f"Config file not found: {config_path}")
            with open(config_path, 'r', encoding='utf-8') as f:
                config_content = f.read().strip()
            if not config_content:
                raise ValueError("Config file is empty")
            config_data = json.loads(config_content)
            print(f"Successfully loaded config with keys: {list(config_data.keys())}")

            # Prefer the repo-local custom model class; fall back to the
            # generic AutoModel loader. The Auto* names are imported locally
            # in the fallback branches so they are guaranteed to be in scope
            # even when the module-level custom imports succeeded.
            try:
                self.model = EmotionAVModel.from_pretrained(
                    model_dir,
                    trust_remote_code=True,
                    local_files_only=True
                )
                print("Successfully loaded EmotionAVModel")
            except Exception as e:
                print(f"Failed to load with EmotionAVModel: {e}")
                from transformers import AutoModel
                self.model = AutoModel.from_pretrained(
                    model_dir,
                    trust_remote_code=True,
                    local_files_only=True
                )
                print("Successfully loaded with AutoModel")

            # Same custom-first / Auto-fallback strategy for the extractor.
            try:
                self.feature_extractor = EmotionAVFeatureExtractor.from_pretrained(
                    model_dir,
                    trust_remote_code=True,
                    local_files_only=True
                )
                print("Successfully loaded EmotionAVFeatureExtractor")
            except Exception as e:
                print(f"Failed to load with EmotionAVFeatureExtractor: {e}")
                from transformers import AutoFeatureExtractor
                self.feature_extractor = AutoFeatureExtractor.from_pretrained(
                    model_dir,
                    trust_remote_code=True,
                    local_files_only=True
                )
                print("Successfully loaded with AutoFeatureExtractor")

            # Inference only: disable dropout / batch-norm updates.
            self.model.eval()
            print("Handler initialization completed successfully")
        except Exception as e:
            print(f"Error during handler initialization: {e}")
            raise

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        Args:
            data (Dict): Request payload. ``data["inputs"]`` is either a
                base64-encoded audio file or a raw audio sample array;
                optional ``data["parameters"]["sampling_rate"]`` defaults
                to 16000.

        Returns:
            List[Dict]: All emotions sorted by descending confidence as
            ``[{"label": str, "score": float}, ...]``, or a one-element
            ``[{"error": str}]`` list on failure (never raises).
        """
        try:
            inputs = data.get("inputs", data)
            parameters = data.get("parameters", {})

            # Accept either a base64-encoded audio file or a raw sample array.
            if isinstance(inputs, str):
                try:
                    audio_bytes = base64.b64decode(inputs)
                    audio_data = self._process_audio_bytes(audio_bytes)
                except Exception as e:
                    return [{"error": f"Failed to decode base64 audio: {str(e)}"}]
            elif isinstance(inputs, (list, np.ndarray)):
                audio_data = np.array(inputs, dtype=np.float32)
            else:
                return [{"error": "Invalid input format. Expected base64 string or audio array."}]

            features = self.feature_extractor(
                audio_data,
                sampling_rate=parameters.get("sampling_rate", 16000),
                return_tensors="pt"
            )

            with torch.no_grad():
                outputs = self.model(features["input_features"])

            emotion_probs = torch.softmax(outputs.emotion_logits, dim=-1)

            # Denormalize arousal-valence from [0, 1] to [-1, 1].
            # NOTE(review): these values are computed but never returned —
            # the HF audio-classification format only allows label/score
            # pairs. Kept so models lacking this head fail the same way.
            arousal_valence = outputs.arousal_valence
            arousal = (arousal_valence[0, 0].item() * 2) - 1
            valence = (arousal_valence[0, 1].item() * 2) - 1

            # HF-compatible output: all emotions sorted by confidence.
            probs_sorted, indices = torch.sort(emotion_probs[0], descending=True)
            id2label = self.model.config.id2label
            results = []
            for rank in range(len(indices)):
                idx = indices[rank].item()
                # config.json round-trips can leave id2label keyed by str
                # instead of int; accept either form.
                label = id2label[idx] if idx in id2label else id2label[str(idx)]
                results.append({
                    "label": label,
                    "score": probs_sorted[rank].item()
                })
            return results
        except Exception as e:
            return [{"error": f"Inference failed: {str(e)}"}]

    def _process_audio_bytes(self, audio_bytes: bytes) -> np.ndarray:
        """
        Process audio bytes and convert to numpy array.

        Tries soundfile first, then librosa (which also resamples to 16 kHz).

        Args:
            audio_bytes (bytes): Encoded audio file content (e.g. WAV/FLAC).

        Returns:
            np.ndarray: 1-D float32 array of mono audio samples.

        Raises:
            Exception: If both decoding backends fail.
        """
        try:
            import soundfile as sf
            audio_data, _sample_rate = sf.read(io.BytesIO(audio_bytes))
            # Downmix multi-channel audio to mono.
            if audio_data.ndim > 1:
                audio_data = np.mean(audio_data, axis=1)
            # NOTE(review): this path does not resample — assumes the payload
            # is already at the expected sampling rate; confirm upstream.
            return audio_data.astype(np.float32)
        except Exception:
            # soundfile failed; fall back to librosa, which handles more
            # formats and resamples to the model's 16 kHz rate.
            try:
                import librosa
                audio_data, _ = librosa.load(io.BytesIO(audio_bytes), sr=16000, mono=True)
                return audio_data.astype(np.float32)
            except Exception as e2:
                raise Exception(f"Failed to process audio: {str(e2)}") from e2