# pph-emotion-classification-model / modeling_emotion_av.py
# Commit b575114: "Fix handler for HF Inference API compatibility"
import torch
import torch.nn as nn
import numpy as np
import librosa
from dataclasses import dataclass
from transformers import PreTrainedModel, PretrainedConfig, Wav2Vec2Processor, Wav2Vec2Model
from transformers.utils import ModelOutput
from typing import Optional, Tuple, Dict, Any, Union
import logging
logger = logging.getLogger(__name__)
class EmotionAVConfig(PretrainedConfig):
"""Configuration class for EmotionAV model."""
model_type = "emotion_av"
def __init__(
self,
input_dim: int = 787, # wav2vec2 (768) + mfcc (13) + prosodic (6)
num_emotion_classes: int = 6,
hidden_size: int = 1024,
intermediate_size: int = 512,
final_size: int = 256,
dropout_rate: float = 0.4,
emotion_to_av_mapping: Optional[Dict[str, Dict[str, float]]] = None,
emotion_labels: Optional[list] = None,
**kwargs
):
super().__init__(**kwargs)
self.input_dim = input_dim
self.num_emotion_classes = num_emotion_classes
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.final_size = final_size
self.dropout_rate = dropout_rate
        # Default emotion-to-AV mapping (values taken from the original training run)
self.emotion_to_av_mapping = emotion_to_av_mapping or {
'angry': {'arousal': -1.0, 'valence': -0.9269662921348314},
'disgust': {'arousal': 1.0, 'valence': 0.22539062733339038},
'fear': {'arousal': -1.0, 'valence': -0.0003170637456042718},
'happy': {'arousal': -0.5347432024169184, 'valence': 1.0},
'neutral': {'arousal': 0.1546223286796688, 'valence': -1.0},
'sad': {'arousal': 0.06459984477929674, 'valence': -1.0}
}
self.emotion_labels = emotion_labels or ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad']
self.id2label = {i: label for i, label in enumerate(self.emotion_labels)}
self.label2id = {label: i for i, label in enumerate(self.emotion_labels)}
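# A quick sanity check of the defaults above (the values are the model's own,
# not assumptions):
#
#     config = EmotionAVConfig()
#     assert config.input_dim == 768 + 13 + 6  # 787
#     assert config.id2label[3] == "happy"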
@dataclass
class EmotionAVModelOutput(ModelOutput):
"""
Output class for EmotionAV model.
Args:
emotion_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_emotions)`):
Emotion classification logits.
arousal_valence (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
Arousal and valence regression outputs.
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`):
Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
attentions (:obj:`tuple(torch.FloatTensor)`, `optional`):
Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
:obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
"""
emotion_logits: torch.FloatTensor = None
arousal_valence: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class EmotionAVModel(PreTrainedModel):
"""
Audio emotion classification model that predicts both discrete emotions
and continuous arousal-valence values.
"""
config_class = EmotionAVConfig
base_model_prefix = "emotion_av"
def __init__(self, config: EmotionAVConfig):
super().__init__(config)
self.config = config
# Emotion classification branch
self.emotion_branch = nn.Sequential(
nn.Linear(config.input_dim, config.hidden_size),
nn.BatchNorm1d(config.hidden_size),
nn.ReLU(),
nn.Dropout(config.dropout_rate),
nn.Linear(config.hidden_size, config.intermediate_size),
nn.BatchNorm1d(config.intermediate_size),
nn.ReLU(),
nn.Dropout(config.dropout_rate * 0.75),
nn.Linear(config.intermediate_size, config.final_size),
nn.BatchNorm1d(config.final_size),
nn.ReLU(),
nn.Dropout(config.dropout_rate * 0.5)
)
# Arousal-Valence regression branch
self.av_branch = nn.Sequential(
nn.Linear(config.input_dim, config.hidden_size),
nn.BatchNorm1d(config.hidden_size),
nn.ReLU(),
nn.Dropout(config.dropout_rate),
nn.Linear(config.hidden_size, config.intermediate_size),
nn.BatchNorm1d(config.intermediate_size),
nn.ReLU(),
nn.Dropout(config.dropout_rate * 0.75),
nn.Linear(config.intermediate_size, config.final_size),
nn.BatchNorm1d(config.final_size),
nn.ReLU(),
nn.Dropout(config.dropout_rate * 0.5)
)
# Output layers
self.emotion_classifier = nn.Linear(config.final_size, config.num_emotion_classes)
self.av_regressor = nn.Linear(config.final_size, 2) # arousal, valence
# Initialize weights
self.init_weights()
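    # Shape flow: (batch, 787) -> each branch -> (batch, 256) -> heads ->
    # emotion logits (batch, 6) and arousal/valence (batch, 2). A quick
    # smoke test of the untrained model:
    #
    #     model = EmotionAVModel(EmotionAVConfig()).eval()
    #     out = model(torch.randn(2, 787))
    #     assert out.emotion_logits.shape == (2, 6)
    #     assert out.arousal_valence.shape == (2, 2)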
def forward(
self,
input_features: torch.Tensor,
labels: Optional[torch.Tensor] = None,
arousal_valence: Optional[torch.Tensor] = None,
**kwargs
) -> EmotionAVModelOutput:
"""
Forward pass of the model.
Args:
input_features: Tensor of shape (batch_size, input_dim) containing audio features
            labels: Optional emotion labels, accepted for interface compatibility (no loss is computed here)
            arousal_valence: Optional arousal-valence targets, likewise accepted but unused in this forward pass
Returns:
EmotionAVModelOutput containing emotion logits and arousal-valence predictions
"""
# Process through both branches
emotion_features = self.emotion_branch(input_features)
av_features = self.av_branch(input_features)
# Get predictions
emotion_logits = self.emotion_classifier(emotion_features)
arousal_valence_pred = self.av_regressor(av_features)
return EmotionAVModelOutput(
emotion_logits=emotion_logits,
arousal_valence=arousal_valence_pred,
hidden_states=None,
attentions=None
)
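    # Note: ``forward`` accepts ``labels``/``arousal_valence`` but computes no
    # loss. A minimal sketch of the combined objective the two heads imply
    # (the 1:1 weighting is an assumption, not the original training recipe):
    #
    #     def combined_loss(outputs, labels, av_targets, av_weight=1.0):
    #         ce = nn.functional.cross_entropy(outputs.emotion_logits, labels)
    #         mse = nn.functional.mse_loss(outputs.arousal_valence, av_targets)
    #         return ce + av_weight * mse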
    def predict_emotion(self, input_features: torch.Tensor) -> Union[Dict[str, Any], list]:
        """
        Predict emotion and arousal-valence values from audio features.
        Args:
            input_features: Tensor of shape (batch_size, input_dim) containing audio features
        Returns:
            A prediction dict for a single example, or a list of dicts for a batch
        """
self.eval()
with torch.no_grad():
outputs = self.forward(input_features)
# Get emotion predictions
emotion_probs = torch.softmax(outputs.emotion_logits, dim=-1)
predicted_emotion_id = torch.argmax(emotion_probs, dim=-1)
# Convert to labels
predicted_emotions = [self.config.id2label[idx.item()] for idx in predicted_emotion_id]
            # Map AV predictions from the [0, 1] scale used for training targets back to [-1, 1]
av_normalized = outputs.arousal_valence
arousal = av_normalized[:, 0] * 2 - 1
valence = av_normalized[:, 1] * 2 - 1
results = []
for i in range(len(predicted_emotions)):
emotion = predicted_emotions[i]
confidence = emotion_probs[i].max().item()
# Create result dictionary
result = {
'emotion': emotion,
'confidence': confidence,
'arousal': arousal[i].item(),
'valence': valence[i].item(),
'emotion_probabilities': {
self.config.id2label[j]: prob.item()
for j, prob in enumerate(emotion_probs[i])
}
}
results.append(result)
return results[0] if len(results) == 1 else results
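# The librosa and Wav2Vec2 imports above imply a feature-extraction step that
# is not included in this file. Below is a minimal sketch of how the 787-dim
# input (768 wav2vec2 means + 13 MFCC means + 6 prosodic statistics) could be
# assembled; the checkpoint name and the specific prosodic features are
# illustrative assumptions, not the original training recipe.
def extract_features_sketch(
    waveform: np.ndarray,
    sampling_rate: int = 16000,
    wav2vec2_name: str = "facebook/wav2vec2-base-960h",  # assumed checkpoint
) -> torch.Tensor:
    """Build a (1, 787) feature tensor from a mono float32 waveform."""
    processor = Wav2Vec2Processor.from_pretrained(wav2vec2_name)
    encoder = Wav2Vec2Model.from_pretrained(wav2vec2_name).eval()
    inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
    with torch.no_grad():
        hidden = encoder(inputs.input_values).last_hidden_state  # (1, T, 768)
    wav2vec2_mean = hidden.mean(dim=1).squeeze(0)  # (768,)
    # 13 MFCC coefficients, averaged over time
    mfcc = librosa.feature.mfcc(y=waveform, sr=sampling_rate, n_mfcc=13)
    mfcc_mean = torch.from_numpy(mfcc.mean(axis=1)).float()  # (13,)
    # Six illustrative prosodic statistics (assumed): pitch, energy, and
    # zero-crossing-rate means and standard deviations.
    f0 = librosa.yin(waveform, fmin=65, fmax=400, sr=sampling_rate)
    rms = librosa.feature.rms(y=waveform)[0]
    zcr = librosa.feature.zero_crossing_rate(waveform)[0]
    prosodic = torch.tensor([
        float(np.mean(f0)), float(np.std(f0)),
        float(rms.mean()), float(rms.std()),
        float(zcr.mean()), float(zcr.std()),
    ])
    return torch.cat([wav2vec2_mean, mfcc_mean, prosodic]).unsqueeze(0)  # (1, 787)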
# Register the config and model with Transformers so AutoConfig/AutoModel
# can resolve the custom "emotion_av" model type.
try:
    from transformers import AutoConfig, AutoModel
    AutoConfig.register("emotion_av", EmotionAVConfig)
    AutoModel.register(EmotionAVConfig, EmotionAVModel)
except (ImportError, ValueError):
    # ValueError is raised if "emotion_av" is already registered (e.g. the
    # module is imported twice); ImportError is kept as a defensive fallback.
    pass
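# Example usage (the repo id below mirrors this repository's name and is
# illustrative; real inputs should come from the same feature-extraction
# pipeline used in training):
#
#     model = AutoModel.from_pretrained(
#         "vishrutjha/pph-emotion-classification-model", trust_remote_code=True
#     )
#     features = torch.randn(1, 787)  # stand-in for real extracted features
#     print(model.predict_emotion(features))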