import torch
import torch.nn as nn
from dataclasses import dataclass
from transformers import PreTrainedModel, PretrainedConfig
from transformers.utils import ModelOutput
from typing import Any, Dict, List, Optional, Tuple, Union
import logging

logger = logging.getLogger(__name__)


class EmotionAVConfig(PretrainedConfig):
    """Configuration class for the EmotionAV model."""

    model_type = "emotion_av"

    def __init__(
        self,
        input_dim: int = 787,
        num_emotion_classes: int = 6,
        hidden_size: int = 1024,
        intermediate_size: int = 512,
        final_size: int = 256,
        dropout_rate: float = 0.4,
        emotion_to_av_mapping: Optional[Dict[str, Dict[str, float]]] = None,
        emotion_labels: Optional[List[str]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.input_dim = input_dim
        self.num_emotion_classes = num_emotion_classes
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.final_size = final_size
        self.dropout_rate = dropout_rate

        # Default anchor points mapping each discrete emotion to an
        # (arousal, valence) pair; override via the constructor if needed.
        self.emotion_to_av_mapping = emotion_to_av_mapping or {
            'angry': {'arousal': -1.0, 'valence': -0.9269662921348314},
            'disgust': {'arousal': 1.0, 'valence': 0.22539062733339038},
            'fear': {'arousal': -1.0, 'valence': -0.0003170637456042718},
            'happy': {'arousal': -0.5347432024169184, 'valence': 1.0},
            'neutral': {'arousal': 0.1546223286796688, 'valence': -1.0},
            'sad': {'arousal': 0.06459984477929674, 'valence': -1.0}
        }

        self.emotion_labels = emotion_labels or ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad']
        self.id2label = {i: label for i, label in enumerate(self.emotion_labels)}
        self.label2id = {label: i for i, label in enumerate(self.emotion_labels)}


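# Nothing in this module consumes `emotion_to_av_mapping` directly; one
# plausible use (a sketch, not part of the trained pipeline) is to snap a
# predicted (arousal, valence) point to the nearest emotion anchor. This
# helper is hypothetical and purely illustrative.
def closest_emotion_to_av(config: EmotionAVConfig, arousal: float, valence: float) -> str:
    """Return the emotion whose (arousal, valence) anchor is nearest in Euclidean distance."""
    best_label, best_dist = None, float("inf")
    for label, anchor in config.emotion_to_av_mapping.items():
        dist = (anchor['arousal'] - arousal) ** 2 + (anchor['valence'] - valence) ** 2
        if dist < best_dist:
            best_label, best_dist = label, dist
    return best_label

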
@dataclass
class EmotionAVModelOutput(ModelOutput):
    """
    Output class for the EmotionAV model.

    Args:
        emotion_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_emotion_classes)`):
            Emotion classification logits.
        arousal_valence (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
            Arousal and valence regression outputs.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Currently always ``None`` for this model.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Currently always ``None`` for this model.
    """
    emotion_logits: Optional[torch.FloatTensor] = None
    arousal_valence: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class EmotionAVModel(PreTrainedModel):
    """
    Audio emotion classification model that predicts both discrete emotions
    and continuous arousal-valence values.
    """

    config_class = EmotionAVConfig
    base_model_prefix = "emotion_av"

    def __init__(self, config: EmotionAVConfig):
        super().__init__(config)
        self.config = config

        # Both task branches share the same topology: three
        # Linear -> BatchNorm -> ReLU -> Dropout blocks with progressively
        # lighter dropout. They do not share weights.
        def build_branch() -> nn.Sequential:
            return nn.Sequential(
                nn.Linear(config.input_dim, config.hidden_size),
                nn.BatchNorm1d(config.hidden_size),
                nn.ReLU(),
                nn.Dropout(config.dropout_rate),
                nn.Linear(config.hidden_size, config.intermediate_size),
                nn.BatchNorm1d(config.intermediate_size),
                nn.ReLU(),
                nn.Dropout(config.dropout_rate * 0.75),
                nn.Linear(config.intermediate_size, config.final_size),
                nn.BatchNorm1d(config.final_size),
                nn.ReLU(),
                nn.Dropout(config.dropout_rate * 0.5),
            )

        self.emotion_branch = build_branch()
        self.av_branch = build_branch()

        # Task heads: discrete emotion logits and a 2-dim (arousal, valence) output.
        self.emotion_classifier = nn.Linear(config.final_size, config.num_emotion_classes)
        self.av_regressor = nn.Linear(config.final_size, 2)

        self.init_weights()

    def forward(
        self,
        input_features: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
        arousal_valence: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> EmotionAVModelOutput:
        """
        Forward pass of the model.

        Args:
            input_features: Tensor of shape (batch_size, input_dim) containing audio features.
            labels: Optional emotion labels. Accepted for training-loop
                compatibility; no loss is computed inside the model.
            arousal_valence: Optional arousal-valence targets. Accepted for
                training-loop compatibility; no loss is computed inside the model.

        Returns:
            EmotionAVModelOutput containing emotion logits and arousal-valence predictions.
        """
        emotion_features = self.emotion_branch(input_features)
        av_features = self.av_branch(input_features)

        emotion_logits = self.emotion_classifier(emotion_features)
        arousal_valence_pred = self.av_regressor(av_features)

        return EmotionAVModelOutput(
            emotion_logits=emotion_logits,
            arousal_valence=arousal_valence_pred,
            hidden_states=None,
            attentions=None,
        )

    def predict_emotion(self, input_features: torch.Tensor) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """
        Predict emotion and arousal-valence values from audio features.

        Args:
            input_features: Tensor of shape (batch_size, input_dim) containing audio features.

        Returns:
            A dictionary of predictions for a single example, or a list of
            dictionaries for a batch.
        """
        # eval() is required: BatchNorm1d then uses running statistics, which
        # also makes single-example batches work.
        self.eval()
        with torch.no_grad():
            outputs = self.forward(input_features)

        emotion_probs = torch.softmax(outputs.emotion_logits, dim=-1)
        predicted_emotion_id = torch.argmax(emotion_probs, dim=-1)
        predicted_emotions = [self.config.id2label[idx.item()] for idx in predicted_emotion_id]

        # Map the regression outputs from [0, 1] to [-1, 1]. This assumes the
        # regressor was trained against targets normalized to [0, 1].
        av_normalized = outputs.arousal_valence
        arousal = av_normalized[:, 0] * 2 - 1
        valence = av_normalized[:, 1] * 2 - 1

        results = []
        for i, emotion in enumerate(predicted_emotions):
            results.append({
                'emotion': emotion,
                'confidence': emotion_probs[i].max().item(),
                'arousal': arousal[i].item(),
                'valence': valence[i].item(),
                'emotion_probabilities': {
                    self.config.id2label[j]: prob.item()
                    for j, prob in enumerate(emotion_probs[i])
                },
            })

        return results[0] if len(results) == 1 else results


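# forward() accepts `labels` and `arousal_valence` but does not compute a
# loss, so training code must combine the two heads externally. The helper
# below is a minimal sketch of one way to do that; the task weighting
# (`av_weight`) is an assumption, not a value taken from the original
# training setup.
def compute_emotion_av_loss(
    outputs: EmotionAVModelOutput,
    labels: torch.Tensor,
    arousal_valence: torch.Tensor,
    av_weight: float = 1.0,  # hypothetical weighting between the two tasks
) -> torch.Tensor:
    """Cross-entropy on the emotion head plus weighted MSE on the AV head."""
    classification_loss = nn.functional.cross_entropy(outputs.emotion_logits, labels)
    regression_loss = nn.functional.mse_loss(outputs.arousal_valence, arousal_valence)
    return classification_loss + av_weight * regression_loss

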
try:
    from transformers import AutoConfig, AutoModel

    AutoConfig.register("emotion_av", EmotionAVConfig)
    AutoModel.register(EmotionAVConfig, EmotionAVModel)
except (ImportError, ValueError):
    # Registration is best-effort: older transformers releases may lack the
    # register() API, and re-registering an existing model_type raises
    # ValueError. The classes above remain usable directly either way.
    pass
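

# Quick smoke test. The model expects pre-extracted feature vectors of size
# config.input_dim; the pipeline that produces the 787-dim features used in
# training is not defined in this module, so random features stand in here
# purely to exercise the forward pass and the output format.
if __name__ == "__main__":
    config = EmotionAVConfig()
    model = EmotionAVModel(config)

    dummy_features = torch.randn(2, config.input_dim)
    predictions = model.predict_emotion(dummy_features)  # list of dicts for a batch of 2
    for pred in predictions:
        print(pred['emotion'], pred['confidence'], pred['arousal'], pred['valence'])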