import torch
import torch.nn as nn
from dataclasses import dataclass
from transformers import PreTrainedModel, PretrainedConfig
from transformers.utils import ModelOutput
from typing import Any, Dict, List, Optional, Tuple, Union
import logging

logger = logging.getLogger(__name__)


class EmotionAVConfig(PretrainedConfig):
    """Configuration class for the EmotionAV model."""

    model_type = "emotion_av"

    def __init__(
        self,
        input_dim: int = 787,
        num_emotion_classes: int = 6,
        hidden_size: int = 1024,
        intermediate_size: int = 512,
        final_size: int = 256,
        dropout_rate: float = 0.4,
        emotion_to_av_mapping: Optional[Dict[str, Dict[str, float]]] = None,
        emotion_labels: Optional[List[str]] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.input_dim = input_dim
        self.num_emotion_classes = num_emotion_classes
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.final_size = final_size
        self.dropout_rate = dropout_rate

        # Default anchor points mapping each discrete emotion to an
        # (arousal, valence) pair; override via the constructor if needed.
        self.emotion_to_av_mapping = emotion_to_av_mapping or {
            'angry': {'arousal': -1.0, 'valence': -0.9269662921348314},
            'disgust': {'arousal': 1.0, 'valence': 0.22539062733339038},
            'fear': {'arousal': -1.0, 'valence': -0.0003170637456042718},
            'happy': {'arousal': -0.5347432024169184, 'valence': 1.0},
            'neutral': {'arousal': 0.1546223286796688, 'valence': -1.0},
            'sad': {'arousal': 0.06459984477929674, 'valence': -1.0}
        }

        self.emotion_labels = emotion_labels or ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad']
        self.id2label = {i: label for i, label in enumerate(self.emotion_labels)}
        self.label2id = {label: i for i, label in enumerate(self.emotion_labels)}


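# Nothing in this module consumes `emotion_to_av_mapping` directly; one
# plausible use (a sketch, not part of the trained pipeline) is to snap a
# predicted (arousal, valence) point to the nearest emotion anchor. This
# helper is hypothetical and purely illustrative.
def closest_emotion_to_av(config: EmotionAVConfig, arousal: float, valence: float) -> str:
    """Return the emotion whose (arousal, valence) anchor is nearest in Euclidean distance."""
    best_label, best_dist = None, float("inf")
    for label, anchor in config.emotion_to_av_mapping.items():
        dist = (anchor['arousal'] - arousal) ** 2 + (anchor['valence'] - valence) ** 2
        if dist < best_dist:
            best_label, best_dist = label, dist
    return best_label

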
@dataclass
class EmotionAVModelOutput(ModelOutput):
    """
    Output class for the EmotionAV model.

    Args:
        emotion_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_emotion_classes)`):
            Emotion classification logits.
        arousal_valence (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
            Arousal and valence regression outputs.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`. Currently always ``None`` for this model.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Currently always ``None`` for this model.
    """
    emotion_logits: Optional[torch.FloatTensor] = None
    arousal_valence: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None


class EmotionAVModel(PreTrainedModel):
    """
    Audio emotion classification model that predicts both discrete emotions
    and continuous arousal-valence values.
    """

    config_class = EmotionAVConfig
    base_model_prefix = "emotion_av"

    def __init__(self, config: EmotionAVConfig):
        super().__init__(config)
        self.config = config

        # Both task branches share the same topology: three
        # Linear -> BatchNorm -> ReLU -> Dropout blocks with progressively
        # lighter dropout. They do not share weights.
        def build_branch() -> nn.Sequential:
            return nn.Sequential(
                nn.Linear(config.input_dim, config.hidden_size),
                nn.BatchNorm1d(config.hidden_size),
                nn.ReLU(),
                nn.Dropout(config.dropout_rate),
                nn.Linear(config.hidden_size, config.intermediate_size),
                nn.BatchNorm1d(config.intermediate_size),
                nn.ReLU(),
                nn.Dropout(config.dropout_rate * 0.75),
                nn.Linear(config.intermediate_size, config.final_size),
                nn.BatchNorm1d(config.final_size),
                nn.ReLU(),
                nn.Dropout(config.dropout_rate * 0.5),
            )

        self.emotion_branch = build_branch()
        self.av_branch = build_branch()

        # Task heads: discrete emotion logits and a 2-dim (arousal, valence) output.
        self.emotion_classifier = nn.Linear(config.final_size, config.num_emotion_classes)
        self.av_regressor = nn.Linear(config.final_size, 2)

        self.init_weights()

    def forward(
        self,
        input_features: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
        arousal_valence: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> EmotionAVModelOutput:
        """
        Forward pass of the model.

        Args:
            input_features: Tensor of shape (batch_size, input_dim) containing audio features.
            labels: Optional emotion labels. Accepted for training-loop
                compatibility; no loss is computed inside the model.
            arousal_valence: Optional arousal-valence targets. Accepted for
                training-loop compatibility; no loss is computed inside the model.

        Returns:
            EmotionAVModelOutput containing emotion logits and arousal-valence predictions.
        """
        emotion_features = self.emotion_branch(input_features)
        av_features = self.av_branch(input_features)

        emotion_logits = self.emotion_classifier(emotion_features)
        arousal_valence_pred = self.av_regressor(av_features)

        return EmotionAVModelOutput(
            emotion_logits=emotion_logits,
            arousal_valence=arousal_valence_pred,
            hidden_states=None,
            attentions=None,
        )

    def predict_emotion(self, input_features: torch.Tensor) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """
        Predict emotion and arousal-valence values from audio features.

        Args:
            input_features: Tensor of shape (batch_size, input_dim) containing audio features.

        Returns:
            A dictionary of predictions for a single example, or a list of
            dictionaries for a batch.
        """
        # eval() is required: BatchNorm1d then uses running statistics, which
        # also makes single-example batches work.
        self.eval()
        with torch.no_grad():
            outputs = self.forward(input_features)

        emotion_probs = torch.softmax(outputs.emotion_logits, dim=-1)
        predicted_emotion_id = torch.argmax(emotion_probs, dim=-1)
        predicted_emotions = [self.config.id2label[idx.item()] for idx in predicted_emotion_id]

        # Map the regression outputs from [0, 1] to [-1, 1]. This assumes the
        # regressor was trained against targets normalized to [0, 1].
        av_normalized = outputs.arousal_valence
        arousal = av_normalized[:, 0] * 2 - 1
        valence = av_normalized[:, 1] * 2 - 1

        results = []
        for i, emotion in enumerate(predicted_emotions):
            results.append({
                'emotion': emotion,
                'confidence': emotion_probs[i].max().item(),
                'arousal': arousal[i].item(),
                'valence': valence[i].item(),
                'emotion_probabilities': {
                    self.config.id2label[j]: prob.item()
                    for j, prob in enumerate(emotion_probs[i])
                },
            })

        return results[0] if len(results) == 1 else results


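# forward() accepts `labels` and `arousal_valence` but does not compute a
# loss, so training code must combine the two heads externally. The helper
# below is a minimal sketch of one way to do that; the task weighting
# (`av_weight`) is an assumption, not a value taken from the original
# training setup.
def compute_emotion_av_loss(
    outputs: EmotionAVModelOutput,
    labels: torch.Tensor,
    arousal_valence: torch.Tensor,
    av_weight: float = 1.0,  # hypothetical weighting between the two tasks
) -> torch.Tensor:
    """Cross-entropy on the emotion head plus weighted MSE on the AV head."""
    classification_loss = nn.functional.cross_entropy(outputs.emotion_logits, labels)
    regression_loss = nn.functional.mse_loss(outputs.arousal_valence, arousal_valence)
    return classification_loss + av_weight * regression_loss

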
try:
    from transformers import AutoConfig, AutoModel

    AutoConfig.register("emotion_av", EmotionAVConfig)
    AutoModel.register(EmotionAVConfig, EmotionAVModel)
except (ImportError, ValueError):
    # Registration is best-effort: older transformers releases may lack the
    # register() API, and re-registering an existing model_type raises
    # ValueError. The classes above remain usable directly either way.
    pass
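

# Quick smoke test. The model expects pre-extracted feature vectors of size
# config.input_dim; the pipeline that produces the 787-dim features used in
# training is not defined in this module, so random features stand in here
# purely to exercise the forward pass and the output format.
if __name__ == "__main__":
    config = EmotionAVConfig()
    model = EmotionAVModel(config)

    dummy_features = torch.randn(2, config.input_dim)
    predictions = model.predict_emotion(dummy_features)  # list of dicts for a batch of 2
    for pred in predictions:
        print(pred['emotion'], pred['confidence'], pred['arousal'], pred['valence'])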