| """ |
| Emotion detection ensemble models. |
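
Example (illustrative sketch; assumes ``BaseEnsemble`` exposes the constructed
models as ``.models`` and that each model is loaded before prediction)::

    import numpy as np

    ensemble = EmotionEnsemble(mode="balanced", device="cpu", voting_strategy="weighted")
    for model in ensemble.models:
        model.load()

    audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
    result = ensemble.predict(audio, sample_rate=16000)
    print(result["label"], result["confidence"])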
| """ |
|
|
| import numpy as np |
| import torch |
| from typing import Dict, Any, Optional |
| import logging |
|
|
| from ..base import BaseModel, BaseEnsemble |
| from ..voting import get_voting_strategy |
|
|
| logger = logging.getLogger(__name__) |
|
|
|
|
class Emotion2VecModel(BaseModel):
    """emotion2vec+ slot in the ensemble.

    Native emotion2vec checkpoints are distributed for funasr, so this
    implementation currently loads a compatible wav2vec2-based classifier instead.
    """

    def __init__(self, model_name: str = "iic/emotion2vec_plus_large", weight: float = 0.35,
                 device: str = 'cpu', use_finetuned: bool = True):
        super().__init__(name="emotion2vec", weight=weight, device=device)
        self.model_name = model_name
        self.use_finetuned = use_finetuned

    def load(self):
        """Load the emotion2vec stand-in (wav2vec2-based classifier)."""
        try:
            from pathlib import Path

            # Native emotion2vec weights require funasr and cannot be loaded
            # through transformers, so a wav2vec2-based classifier is used instead.
            logger.warning("⚠️ emotion2vec requires funasr library (not transformers)")
            logger.info("Using a wav2vec2-based PT-BR model as compatible alternative...")

            from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

            # Prefer a locally fine-tuned checkpoint when available
            finetuned_path = Path("models/emotion/emotion2vec_finetuned_ptbr")
            if self.use_finetuned and finetuned_path.exists():
                logger.info(f"Loading fine-tuned model from {finetuned_path}...")
                self.processor = Wav2Vec2Processor.from_pretrained(str(finetuned_path))
                self.model = Wav2Vec2ForSequenceClassification.from_pretrained(str(finetuned_path))
                logger.info("✅ Using FINE-TUNED model (trained on VERBO/emoUERJ)")
            else:
                # Fall back to a public PT-BR speech emotion recognition model
                pt_br_model = "alefiury/wav2vec2-xls-r-300m-pt-br-spontaneous-speech-emotion-recognition"
                logger.info(f"Loading {pt_br_model}...")
                self.processor = Wav2Vec2Processor.from_pretrained(pt_br_model)
                self.model = Wav2Vec2ForSequenceClassification.from_pretrained(pt_br_model)
                if self.use_finetuned:
                    logger.warning("⚠️ Fine-tuned model not found, using pre-trained PT-BR model")
                    logger.info("To fine-tune: python scripts/training/finetune_emotion2vec.py")

            self.model.to(self.device)
            self.model.eval()

            self.is_loaded = True
            logger.info(f"✅ emotion2vec (wav2vec2 compatible) loaded on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load emotion2vec: {e}")
            logger.info("Install: pip install transformers torch")
            logger.info("For native emotion2vec: pip install funasr modelscope")
            raise

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Predict emotion using emotion2vec."""
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")

        try:
            # Resample to the 16 kHz rate expected by the model
            if sample_rate != 16000:
                import librosa
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

            # Extract input features
            inputs = self.processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Run inference
            with torch.no_grad():
                logits = self.model(**inputs).logits
                probs = torch.nn.functional.softmax(logits, dim=-1)

            # Take the top prediction and map it to a label
            predicted_id = torch.argmax(logits, dim=-1).item()
            confidence = probs[0][predicted_id].item()
            emotion = self.model.config.id2label.get(predicted_id, "unknown")

            return {
                "label": emotion,
                "confidence": float(confidence),
                "probabilities": {
                    self.model.config.id2label.get(i, f"class_{i}"): float(probs[0][i])
                    for i in range(len(probs[0]))
                }
            }

        except Exception as e:
            logger.error(f"emotion2vec prediction error: {e}")
            return {"label": "error", "confidence": 0.0}


class SenseVoiceModel(BaseModel):
    """SenseVoice model for emotion and event detection."""

    def __init__(self, model_name: str = "FunAudioLLM/SenseVoiceSmall", weight: float = 0.25, device: str = 'cpu'):
        super().__init__(name="sensevoice", weight=weight, device=device)
        self.model_name = model_name

    def load(self):
        """Load SenseVoice model."""
        try:
            from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

            logger.info(f"Loading {self.model_name}...")

            self.processor = AutoProcessor.from_pretrained(self.model_name)
            self.model = AutoModelForSpeechSeq2Seq.from_pretrained(self.model_name)
            self.model.to(self.device)
            self.model.eval()

            self.is_loaded = True
            logger.info(f"✅ SenseVoice loaded on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load SenseVoice: {e}")
            logger.info("SenseVoice may require specific installation. Attempting alternative...")

            # Fallback: generic wav2vec2 encoder. The base checkpoint has no tokenizer,
            # so only the feature extractor is loaded; the classification head is
            # randomly initialized.
            try:
                from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
                self.processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
                self.model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-large-xlsr-53")
                self.model.to(self.device)
                self.is_loaded = True
                logger.warning("Using wav2vec2 as SenseVoice fallback")
            except Exception:
                raise

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Predict using SenseVoice."""
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")

        try:
            # Resample to 16 kHz if needed
            if sample_rate != 16000:
                import librosa
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

            # Extract input features
            inputs = self.processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Run inference
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs.logits if hasattr(outputs, 'logits') else outputs[0]
                probs = torch.nn.functional.softmax(logits, dim=-1)

            predicted_id = torch.argmax(logits, dim=-1).item()
            confidence = probs[0][predicted_id].item()

            # Generic 7-class emotion mapping
            emotion_map = {
                0: "neutral",
                1: "happy",
                2: "sad",
                3: "angry",
                4: "fearful",
                5: "disgusted",
                6: "surprised"
            }
            emotion = emotion_map.get(predicted_id, "unknown")

            return {
                "label": emotion,
                "confidence": float(confidence)
            }

        except Exception as e:
            logger.error(f"SenseVoice prediction error: {e}")
            return {"label": "error", "confidence": 0.0}


class WhisperEmotionModel(BaseModel):
    """Whisper encoder used as an emotion feature extractor.

    No emotion classification head is attached yet, so ``predict`` returns a
    neutral placeholder along with the pooled embedding shape.
    """

    def __init__(self, model_name: str = "openai/whisper-large-v3", weight: float = 0.20, device: str = 'cpu'):
        super().__init__(name="whisper", weight=weight, device=device)
        self.model_name = model_name

    def load(self):
        """Load Whisper model."""
        try:
            from transformers import WhisperProcessor, WhisperForConditionalGeneration

            logger.info(f"Loading {self.model_name}...")

            self.processor = WhisperProcessor.from_pretrained(self.model_name)
            self.model = WhisperForConditionalGeneration.from_pretrained(self.model_name)
            self.model.to(self.device)
            self.model.eval()

            self.is_loaded = True
            logger.info(f"✅ Whisper loaded on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load Whisper: {e}")
            raise

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Extract embeddings from Whisper for emotion classification."""
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")

        try:
            # Resample to 16 kHz if needed
            if sample_rate != 16000:
                import librosa
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

            # Extract log-mel input features
            inputs = self.processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt"
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Mean-pool the encoder hidden states into a single embedding
            with torch.no_grad():
                encoder_outputs = self.model.get_encoder()(**inputs)
                embeddings = encoder_outputs.last_hidden_state.mean(dim=1)

            # No emotion classifier is attached to these embeddings yet,
            # so return a neutral placeholder prediction.
            return {
                "label": "neutral",
                "confidence": 0.5,
                "embeddings_shape": embeddings.shape
            }

        except Exception as e:
            logger.error(f"Whisper prediction error: {e}")
            return {"label": "error", "confidence": 0.0}


class HuBERTEmotionModel(BaseModel):
    """HuBERT model for emotion recognition."""

    def __init__(self, model_name: str = "facebook/hubert-xlarge-ls960-ft", weight: float = 0.12, device: str = 'cpu'):
        super().__init__(name="hubert", weight=weight, device=device)
        self.model_name = model_name

    def load(self):
        """Load HuBERT model."""
        try:
            from transformers import Wav2Vec2Processor, HubertForSequenceClassification

            logger.info(f"Loading {self.model_name}...")

            # Try a sequence-classification checkpoint first; fall back to the base encoder
            try:
                self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
                self.model = HubertForSequenceClassification.from_pretrained(self.model_name)
            except Exception:
                logger.warning("HuBERT classification model not available, using base HuBERT")
                from transformers import HubertModel, Wav2Vec2FeatureExtractor
                # The base checkpoint ships no tokenizer, so load the feature extractor directly
                self.processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/hubert-base-ls960")
                self.model = HubertModel.from_pretrained(self.model_name)

            self.model.to(self.device)
            self.model.eval()

            self.is_loaded = True
            logger.info(f"✅ HuBERT loaded on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load HuBERT: {e}")
            raise

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Predict using HuBERT."""
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")

        try:
            # Resample to 16 kHz if needed
            if sample_rate != 16000:
                import librosa
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

            # Extract input features
            inputs = self.processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Run inference
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Classification checkpoints expose logits; the base encoder only
            # exposes hidden states, which are mean-pooled as a rough substitute
            if hasattr(outputs, 'logits'):
                logits = outputs.logits
            elif hasattr(outputs, 'last_hidden_state'):
                hidden = outputs.last_hidden_state
                logits = hidden.mean(dim=1)
            else:
                logits = outputs[0]

            # Without a trained emotion head these scores are only a rough guess
            probs = torch.nn.functional.softmax(logits, dim=-1) if logits.dim() > 1 else torch.tensor([0.5])

            emotion_map = {0: "neutral", 1: "happy", 2: "sad", 3: "angry"}
            predicted_id = 0 if logits.dim() == 1 else torch.argmax(logits, dim=-1).item()
            confidence = float(probs[0][predicted_id] if probs.dim() > 1 else 0.5)
            emotion = emotion_map.get(predicted_id, "neutral")

            return {
                "label": emotion,
                "confidence": confidence
            }

        except Exception as e:
            logger.error(f"HuBERT prediction error: {e}")
            return {"label": "error", "confidence": 0.0}


class Wav2Vec2PTBRModel(BaseModel):
    """Wav2Vec2 model fine-tuned for Brazilian Portuguese emotion recognition."""

    def __init__(self, model_name: str = "alefiury/wav2vec2-xls-r-300m-pt-br-spontaneous-speech-emotion-recognition",
                 weight: float = 0.08, device: str = 'cpu'):
        super().__init__(name="wav2vec2_ptbr", weight=weight, device=device)
        self.model_name = model_name

    def load(self):
        """Load Wav2Vec2 PT-BR model."""
        try:
            from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

            logger.info(f"Loading {self.model_name}...")

            self.processor = Wav2Vec2Processor.from_pretrained(self.model_name)
            self.model = Wav2Vec2ForSequenceClassification.from_pretrained(self.model_name)
            self.model.to(self.device)
            self.model.eval()

            self.is_loaded = True
            logger.info(f"✅ Wav2Vec2 PT-BR loaded on {self.device}")

        except Exception as e:
            logger.error(f"Failed to load Wav2Vec2 PT-BR: {e}")
            logger.warning("Using fallback XLSR model")

            # Fallback: multilingual XLSR encoder. The base checkpoint has no tokenizer,
            # so only the feature extractor is loaded; the classification head is
            # randomly initialized.
            from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
            self.processor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")
            self.model = Wav2Vec2ForSequenceClassification.from_pretrained("facebook/wav2vec2-large-xlsr-53")
            self.model.to(self.device)
            self.model.eval()
            self.is_loaded = True

    def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]:
        """Predict using Wav2Vec2 PT-BR."""
        if not self.is_loaded:
            raise RuntimeError("Model not loaded")

        try:
            # Resample to 16 kHz if needed
            if sample_rate != 16000:
                import librosa
                audio = librosa.resample(audio, orig_sr=sample_rate, target_sr=16000)

            # Extract input features
            inputs = self.processor(
                audio,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Run inference
            with torch.no_grad():
                logits = self.model(**inputs).logits
                probs = torch.nn.functional.softmax(logits, dim=-1)

            predicted_id = torch.argmax(logits, dim=-1).item()
            confidence = probs[0][predicted_id].item()

            # Map the class id to a label, preferring the model's own id2label table
            if hasattr(self.model.config, 'id2label'):
                emotion = self.model.config.id2label.get(predicted_id, "unknown")
            else:
                emotion_map = {0: "neutral", 1: "non_neutral_female", 2: "non_neutral_male"}
                emotion = emotion_map.get(predicted_id, "neutral")

            return {
                "label": emotion,
                "confidence": float(confidence)
            }

        except Exception as e:
            logger.error(f"Wav2Vec2 PT-BR prediction error: {e}")
            return {"label": "error", "confidence": 0.0}


class EmotionEnsemble(BaseEnsemble):
    """Ensemble of emotion detection models."""

    def __init__(self,
                 mode: str = 'balanced',
                 device: str = 'cpu',
                 voting_strategy: str = 'weighted'):
        """
        Initialize emotion ensemble.

        Args:
            mode: 'quick' (2 models), 'balanced' (3 models), 'full' (5 models)
            device: 'cpu' or 'cuda'
            voting_strategy: 'majority', 'weighted', 'confidence'
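
        Example (illustrative; shows constructor use only)::

            ensemble = EmotionEnsemble(mode='quick', device='cpu', voting_strategy='weighted')
            ensemble_full = EmotionEnsemble(mode='full', device='cuda', voting_strategy='confidence')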
| """ |
| self.mode = mode |
| self.device = device |
|
|
| |
| models = self._get_models_for_mode(mode, device) |
|
|
| super().__init__(models=models, voting_strategy=voting_strategy) |
| self.voter = get_voting_strategy(voting_strategy) |
|
|
| def _get_models_for_mode(self, mode: str, device: str): |
| """Get models based on mode.""" |
| if mode == 'quick': |
| |
| |
| return [ |
| WhisperEmotionModel(weight=0.6, device=device), |
| SenseVoiceModel(weight=0.4, device=device) |
| ] |
| elif mode == 'balanced': |
| |
| |
| |
| return [ |
| WhisperEmotionModel(weight=0.40, device=device), |
| SenseVoiceModel(weight=0.35, device=device), |
| Wav2Vec2PTBRModel(weight=0.25, device=device) |
| ] |
| elif mode == 'full': |
| |
| return [ |
| WhisperEmotionModel(weight=0.30, device=device), |
| SenseVoiceModel(weight=0.25, device=device), |
| Wav2Vec2PTBRModel(weight=0.20, device=device), |
| HuBERTEmotionModel(weight=0.15, device=device), |
| Emotion2VecModel(weight=0.10, device=device) |
| ] |
| else: |
| logger.warning(f"Unknown mode '{mode}'. Using 'balanced'.") |
| return self._get_models_for_mode('balanced', device) |
|
|
| def predict(self, audio: np.ndarray, sample_rate: int = 16000) -> Dict[str, Any]: |
| """ |
| Predict emotion using ensemble. |
| |
| Args: |
| audio: Audio array |
| sample_rate: Sample rate |
| |
| Returns: |
| Ensemble prediction with votes and confidence |
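
            The exact keys depend on the voting strategy; a typical result is
            sketched (illustrative only) as::

                {
                    "label": "happy",
                    "confidence": 0.72,
                    "predictions": {...},  # raw per-model outputs
                    "agreement": 0.67
                }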
| """ |
| |
| predictions = self.predict_all(audio, sample_rate) |
|
|
| if not predictions: |
| return { |
| "label": "unknown", |
| "confidence": 0.0, |
| "error": "No valid predictions" |
| } |
|
|
| |
| result = self.voter.vote(predictions, key='label') |
|
|
| |
| result['predictions'] = predictions |
|
|
| |
| result['agreement'] = self.calculate_agreement(predictions, 'label') |
|
|
| return result |
|
|