"""
EmotionConditionedFusionModule (ECFM)
=====================================
Core novelty component of the EMOLIPS framework.

Architecture:
    Audio → [Speech Emotion Encoder] → Emotion Embedding (e)
    Audio + Image → [SadTalker Backbone] → 3DMM Expression Coefficients (β)
    (e, β) → [FiLM Conditioning Layer] → Emotion-Modulated Coefficients (β')
    β' → [Face Renderer] → Output Video

The FiLM (Feature-wise Linear Modulation) layers inject emotion information
into the expression coefficient space, enabling emotion-controllable generation
from the same audio input.

Key Contributions:
- Emotion-to-AU prior mapping learned from expression coefficient space
- Continuous intensity control via embedding scaling
- Cross-emotion consistency preservation through phoneme-aware weighting
"""
|
|
| import torch |
| import torch.nn as nn |
| import numpy as np |
| import os |
| import json |
| import warnings |
| from typing import Dict, Tuple, Optional, List |
|
|
| warnings.filterwarnings("ignore") |
|
|
|
|
| |
| |
| |
|
|
| |
| |
| |
# Per-emotion modulation priors used throughout this module.
#
# Each profile contains:
#   expression_delta: 64-d offset added to 3DMM expression coefficients.
#       All-zero for "neutral"; None for the other emotions until
#       _generate_expression_deltas() fills it in at import time.
#   brow_scale / mouth_scale / jaw_scale / cheek_scale: signed region
#       strengths consumed by _generate_expression_deltas() when it
#       synthesizes expression_delta ("neutral" has no cheek_scale).
#   au_targets: FACS Action-Unit activation targets. Not read by any code
#       visible in this file — presumably reference data for downstream
#       evaluation; TODO confirm.
#   description: human-readable summary of the dominant Action Units.
EMOTION_PROFILES = {
    "neutral": {
        "expression_delta": np.zeros(64),
        "brow_scale": 0.0,
        "mouth_scale": 0.0,
        "jaw_scale": 0.0,
        "description": "Baseline - no emotional modulation"
    },
    "happy": {
        "expression_delta": None,  # filled by _generate_expression_deltas()
        "brow_scale": 0.15,
        "mouth_scale": 0.35,
        "jaw_scale": 0.1,
        "cheek_scale": 0.3,
        "au_targets": {"AU6": 0.7, "AU12": 0.8, "AU25": 0.3},
        "description": "Happiness - AU6+AU12 dominant"
    },
    "sad": {
        "expression_delta": None,  # filled by _generate_expression_deltas()
        "brow_scale": -0.2,
        "mouth_scale": -0.25,
        "jaw_scale": -0.05,
        "cheek_scale": -0.1,
        "au_targets": {"AU1": 0.6, "AU4": 0.4, "AU15": 0.7, "AU17": 0.5},
        "description": "Sadness - AU1+AU15+AU17 dominant"
    },
    "angry": {
        "expression_delta": None,  # filled by _generate_expression_deltas()
        "brow_scale": -0.35,
        "mouth_scale": 0.15,
        "jaw_scale": 0.2,
        "cheek_scale": 0.05,
        "au_targets": {"AU4": 0.8, "AU7": 0.6, "AU23": 0.7, "AU24": 0.5},
        "description": "Anger - AU4+AU7+AU23 dominant"
    },
    "fear": {
        "expression_delta": None,  # filled by _generate_expression_deltas()
        "brow_scale": 0.4,
        "mouth_scale": 0.2,
        "jaw_scale": 0.15,
        "cheek_scale": -0.05,
        "au_targets": {"AU1": 0.8, "AU2": 0.7, "AU4": 0.3, "AU20": 0.6},
        "description": "Fear - AU1+AU2+AU20 dominant"
    },
    "surprise": {
        "expression_delta": None,  # filled by _generate_expression_deltas()
        "brow_scale": 0.5,
        "mouth_scale": 0.3,
        "jaw_scale": 0.4,
        "cheek_scale": 0.0,
        "au_targets": {"AU1": 0.9, "AU2": 0.9, "AU25": 0.7, "AU26": 0.8},
        "description": "Surprise - AU1+AU2+AU26 dominant"
    },
    "disgust": {
        "expression_delta": None,  # filled by _generate_expression_deltas()
        "brow_scale": -0.15,
        "mouth_scale": -0.2,
        "jaw_scale": 0.05,
        "cheek_scale": 0.1,
        "au_targets": {"AU9": 0.8, "AU10": 0.7, "AU4": 0.3},
        "description": "Disgust - AU9+AU10 dominant"
    }
}
|
|
|
|
def _generate_expression_deltas():
    """
    Generate 3DMM expression coefficient deltas from AU targets.

    Maps FACS Action Units to expression basis coefficients and writes the
    result into each non-neutral profile's "expression_delta" slot of the
    module-level EMOTION_PROFILES dict. This is the learned
    'emotion-to-AU prior' (Novelty 2 from paper).

    NOTE(review): the coefficient-group layout (0-9 jaw, 10-24 mouth,
    25-34 brow, 35-44 cheek) is assumed from the slicing below — confirm
    against the 3DMM basis actually used by the renderer.
    """
    # Hoisted out of the per-emotion loop (it was re-executed every iteration).
    from scipy.ndimage import gaussian_filter1d

    # Use a private RandomState rather than np.random.seed(42): seeding the
    # global RNG at import time silently resets randomness for every other
    # consumer of np.random. RandomState(42).randn reproduces the exact
    # sequence the global seeding produced, so the deltas are unchanged.
    rng = np.random.RandomState(42)

    for emotion, profile in EMOTION_PROFILES.items():
        if emotion == "neutral":
            # Neutral keeps its all-zero delta.
            continue

        delta = np.zeros(64)

        # Dims 0-9: jaw region (dim 0 carries the full signed scale).
        delta[0:10] = profile["jaw_scale"] * rng.randn(10) * 0.3
        delta[0] = profile["jaw_scale"]

        # Dims 10-24: mouth region (dims 10 and 12 carry the main scale).
        delta[10:25] = profile["mouth_scale"] * rng.randn(15) * 0.3
        delta[10] = profile["mouth_scale"]
        delta[12] = profile["mouth_scale"] * 0.7

        # Dims 25-34: brow region.
        delta[25:35] = profile["brow_scale"] * rng.randn(10) * 0.3
        delta[25] = profile["brow_scale"]

        # Dims 35-44: cheek region (optional per profile).
        if "cheek_scale" in profile:
            delta[35:45] = profile["cheek_scale"] * rng.randn(10) * 0.2

        # Smooth across neighbouring basis dims to avoid isolated spikes.
        delta = gaussian_filter1d(delta, sigma=1.5)

        # Normalise so the largest |component| equals 0.4.
        delta = delta / (np.max(np.abs(delta)) + 1e-8) * 0.4

        profile["expression_delta"] = delta


# Populate EMOTION_PROFILES at import time.
_generate_expression_deltas()
|
|
|
|
| |
| |
| |
|
|
class FiLMLayer(nn.Module):
    """
    Feature-wise Linear Modulation (FiLM) layer.
    Perez et al., "FiLM: Visual Reasoning with a General Conditioning Layer",
    AAAI 2018.

    Given a conditioning vector, predicts a per-feature scale γ and shift β
    and applies FiLM(x | γ, β) = γ * x + β. The scale is bounded to (0, 2)
    via a sigmoid, the shift to (-0.5, 0.5) via a tanh.
    """

    def __init__(self, feature_dim: int, conditioning_dim: int):
        super().__init__()
        # NOTE: submodule creation order is kept stable so RNG-seeded
        # initialisation stays reproducible.
        self.scale_predictor = nn.Sequential(
            nn.Linear(conditioning_dim, feature_dim),
            nn.Sigmoid(),
        )
        self.shift_predictor = nn.Sequential(
            nn.Linear(conditioning_dim, feature_dim),
            nn.Tanh(),
        )

    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
        # sigmoid output in (0, 1) doubled -> scale in (0, 2);
        # tanh output halved            -> shift in (-0.5, 0.5).
        scale = self.scale_predictor(conditioning) * 2
        shift = self.shift_predictor(conditioning) * 0.5
        return scale * x + shift
|
|
|
|
class EmotionEncoder(nn.Module):
    """
    Maps an emotion category index plus a scalar intensity to a dense
    conditioning embedding.

    Pipeline:
        emotion_idx --(nn.Embedding)-----> 64-d category code
        intensity   --(Linear 1 -> 32)---> 32-d intensity code
        concat (96) --(MLP + LayerNorm)--> embedding_dim
    """

    def __init__(self, num_emotions: int = 7, embedding_dim: int = 128):
        super().__init__()
        self.num_emotions = num_emotions
        self.embedding_dim = embedding_dim

        # Creation order kept stable for seeded-init reproducibility.
        self.emotion_embed = nn.Embedding(num_emotions, 64)
        self.intensity_proj = nn.Linear(1, 32)

        self.fusion = nn.Sequential(
            nn.Linear(64 + 32, 128),
            nn.ReLU(),
            nn.Linear(128, embedding_dim),
            nn.LayerNorm(embedding_dim),
        )

    def forward(self, emotion_idx: torch.Tensor, intensity: torch.Tensor) -> torch.Tensor:
        category_code = self.emotion_embed(emotion_idx)
        # [B] -> [B, 1] so the linear projection sees a feature dimension.
        intensity_code = self.intensity_proj(intensity.unsqueeze(-1))
        fused = torch.cat([category_code, intensity_code], dim=-1)
        return self.fusion(fused)
|
|
|
|
class EmotionConditionedFusionModule(nn.Module):
    """
    ECFM - Emotion-Conditioned Fusion Module (Core Architecture)

    Takes expression coefficients from the SadTalker backbone and modulates
    them with emotion information via FiLM conditioning.

    Forward pass:
        1. Encode emotion (category + intensity) -> emotion embedding
        2. Apply coarse FiLM to the expression coefficients
        3. Apply residual refinement
        4. Apply fine FiLM for fine-grained control
        5. Lip-consistency gate blends the modulated coefficients with the
           *original* ones to protect lip sync

    This module sits between SadTalker's audio encoder and the face renderer.
    """

    def __init__(self, coeff_dim: int = 64, emotion_dim: int = 128, num_emotions: int = 7):
        super().__init__()
        # Emotion (category, intensity) -> conditioning embedding.
        self.emotion_encoder = EmotionEncoder(num_emotions, emotion_dim)

        # Two-stage FiLM conditioning: coarse, then fine.
        self.film_coarse = FiLMLayer(coeff_dim, emotion_dim)
        self.film_fine = FiLMLayer(coeff_dim, emotion_dim)

        # Residual refinement MLP between the two FiLM stages.
        self.refine = nn.Sequential(
            nn.Linear(coeff_dim, coeff_dim * 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(coeff_dim * 2, coeff_dim)
        )

        # Per-coefficient gate: values near 1 keep the original
        # (lip-sync-critical) coefficients, values near 0 take the
        # emotion-modulated ones.
        self.lip_gate = nn.Sequential(
            nn.Linear(coeff_dim + emotion_dim, coeff_dim),
            nn.Sigmoid()
        )

    def forward(
        self,
        expression_coeffs: torch.Tensor,
        emotion_idx: torch.Tensor,
        intensity: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            expression_coeffs: [B, T, 64] 3DMM expression basis coefficients
            emotion_idx: [B] emotion category index (0-6)
            intensity: [B] emotion intensity (0.0 - 1.0)

        Returns:
            modulated_coeffs: [B, T, 64] emotion-conditioned coefficients
        """
        B, T, C = expression_coeffs.shape

        emotion_emb = self.emotion_encoder(emotion_idx, intensity)   # [B, E]
        emotion_emb_t = emotion_emb.unsqueeze(1).expand(-1, T, -1)   # [B, T, E]

        # BUG FIX: the previous implementation looped over T and wrote the
        # FiLM output back into `expression_coeffs` in place (`x` aliased the
        # input, it was never cloned). That corrupted the "original" branch
        # of the lip gate below and raises under autograd when the input
        # requires grad. FiLM is a pointwise affine map built from nn.Linear,
        # so it broadcasts over the time dimension — no loop, no aliasing.
        x = self.film_coarse(expression_coeffs, emotion_emb_t)

        # Residual refinement between the two FiLM stages.
        x = x + self.refine(x)

        # Fine-grained FiLM stage.
        x = self.film_fine(x, emotion_emb_t)

        # Gate computed from the UNMODIFIED input coefficients.
        gate_input = torch.cat([expression_coeffs, emotion_emb_t], dim=-1)
        gate = self.lip_gate(gate_input)

        # Convex blend: keep the original where the gate is high.
        modulated_coeffs = gate * expression_coeffs + (1 - gate) * x
        return modulated_coeffs
|
|
|
|
| |
| |
| |
|
|
class PracticalEmotionModifier:
    """
    Practical emotion modifier for SadTalker coefficients — the component
    that actually runs during inference.

    Applies emotion-specific offsets to SadTalker's generated 3DMM
    coefficients using the pre-computed AU priors in EMOTION_PROFILES
    (no training required).
    """

    # Canonical emotion name -> index, with common synonyms aliased.
    EMOTION_MAP = {
        "neutral": 0, "happy": 1, "sad": 2, "angry": 3,
        "fear": 4, "surprise": 5, "disgust": 6,

        "happiness": 1, "sadness": 2, "anger": 3,
        "fearful": 4, "surprised": 5, "disgusted": 6
    }

    def __init__(self):
        # Module-level priors produced by _generate_expression_deltas().
        self.profiles = EMOTION_PROFILES

    def modify_coefficients(
        self,
        coeffs: np.ndarray,
        emotion: str,
        intensity: float = 0.7,
        preserve_lip_sync: bool = True
    ) -> np.ndarray:
        """
        Modify 3DMM expression coefficients with an emotion delta.

        Args:
            coeffs: [T, 64] expression coefficients from SadTalker
            emotion: Target emotion string (case-insensitive)
            intensity: 0.0 (neutral) to 1.0 (full expression)
            preserve_lip_sync: If True, attenuate lip-critical dims

        Returns:
            modified: [T, 64] emotion-modulated coefficients (the input
            array itself when no modification applies)
        """
        emotion = emotion.lower()
        if emotion not in self.profiles:
            print(f" β Unknown emotion '{emotion}', using neutral")
            return coeffs
        if emotion == "neutral":
            return coeffs

        delta = self.profiles[emotion]["expression_delta"]
        if delta is None:
            # Prior was never generated for this emotion — nothing to apply.
            return coeffs

        scaled = delta * intensity

        # Temporal envelope: ramp the emotion in at the start and partially
        # out at the end so it does not pop on/off.
        n_frames = coeffs.shape[0]
        if n_frames > 10:
            envelope = np.ones(n_frames)
            edge = max(3, n_frames // 5)
            envelope[:edge] = np.linspace(0, 1, edge)
            envelope[-edge:] = np.linspace(1, 0.3, edge)
            scaled = scaled[np.newaxis, :] * envelope[:, np.newaxis]
        else:
            # Too few frames for a ramp: apply uniformly.
            scaled = np.tile(scaled, (n_frames, 1))

        modified = coeffs.copy()
        dim = min(coeffs.shape[1], 64)

        if preserve_lip_sync:
            # Damp lip-critical dimensions: mouth (10-24) hardest, jaw
            # (0-9) moderately — keeps the audio-driven articulation intact.
            mask = np.ones(dim)
            mask[10:25] = 0.3
            mask[0:10] = 0.6
            scaled[:, :dim] *= mask

        modified[:, :dim] += scaled[:, :dim]
        return modified

    def get_all_emotion_variants(
        self,
        coeffs: np.ndarray,
        intensity: float = 0.7
    ) -> Dict[str, np.ndarray]:
        """Generate all emotion variants from same base coefficients."""
        emotions = ["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"]
        return {e: self.modify_coefficients(coeffs, e, intensity) for e in emotions}
|
|
|
|
| |
| |
| |
|
|
class AudioEmotionDetector:
    """
    Speech emotion recognition via a pre-trained wav2vec2 classifier
    (ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition).

    Provides the automatic emotion-detection branch of the pipeline; a
    manually specified emotion can always override the detected one.
    """

    def __init__(self, device: str = "cpu"):
        self.device = device
        self.classifier = None  # created lazily by load()
        # Normalizes classifier labels (and common synonyms) onto the
        # canonical emotion names used elsewhere in this module.
        self._label_map = {
            "angry": "angry",
            "disgust": "disgust",
            "fear": "fear",
            "happy": "happy",
            "neutral": "neutral",
            "sad": "sad",
            "surprise": "surprise",

            "happiness": "happy",
            "sadness": "sad",
            "anger": "angry",
            "fearful": "fear",
            "surprised": "surprise",
            "disgusted": "disgust",
            "calm": "neutral",
            "ps": "surprise",
        }

    def load(self):
        """Lazy-load the model."""
        if self.classifier is not None:
            return
        try:
            from transformers import pipeline
            print(" Loading speech emotion recognition model...")
            self.classifier = pipeline(
                "audio-classification",
                model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
                device=0 if self.device == "cuda" else -1,
                top_k=7,
            )
            print(" β Emotion model loaded")
        except Exception as e:
            # Best-effort: fall back to manual emotion specification.
            print(f" β Failed to load emotion model: {e}")
            print(" β Will use manual emotion specification")
            self.classifier = None

    def detect(self, audio_path: str) -> Dict:
        """
        Detect emotion from an audio file.

        Returns:
            {
                "detected_emotion": str,
                "confidence": float,
                "all_scores": {emotion: score, ...}
            }
            (plus an "error" key when detection could not run)
        """
        self.load()

        if self.classifier is None:
            return {
                "detected_emotion": "neutral",
                "confidence": 0.0,
                "all_scores": {},
                "error": "Model not loaded"
            }

        try:
            import librosa
            audio, sr = librosa.load(audio_path, sr=16000)

            predictions = self.classifier(audio)

            # Re-key every prediction onto canonical emotion names.
            scores = {}
            for pred in predictions:
                raw = pred["label"].lower()
                scores[self._label_map.get(raw, raw)] = pred["score"]

            best = max(scores, key=scores.get)

            return {
                "detected_emotion": best,
                "confidence": scores[best],
                "all_scores": scores
            }

        except Exception as e:
            print(f" β Emotion detection failed: {e}")
            return {
                "detected_emotion": "neutral",
                "confidence": 0.0,
                "all_scores": {},
                "error": str(e)
            }
|
|
|
|
| |
| |
| |
|
|
class EmotionIntensityEstimator:
    """
    Heuristic emotion-intensity estimator.

    Combines simple acoustic cues — RMS-energy variability and relative
    pitch (F0) range — into an intensity value on [0.1, 1.0]. Falls back
    to a mid-scale 0.5 whenever the audio cannot be analysed.
    """

    def estimate(self, audio_path: str) -> float:
        """Estimate emotion intensity from audio."""
        try:
            import librosa

            signal, rate = librosa.load(audio_path, sr=16000)

            # Loudness variability: coefficient of variation of RMS energy.
            rms = librosa.feature.rms(y=signal)[0]
            energy_cv = np.std(rms) / (np.mean(rms) + 1e-8)

            # Relative pitch span over voiced frames only.
            f0, _, _ = librosa.pyin(signal, fmin=80, fmax=400, sr=rate)
            voiced = f0[~np.isnan(f0)]
            if len(voiced) > 0:
                pitch_span = (np.max(voiced) - np.min(voiced)) / (np.mean(voiced) + 1e-8)
            else:
                pitch_span = 0.0

            # Weighted combination with a floor/ceiling of [0.1, 1.0].
            combined = np.clip(0.3 * energy_cv + 0.5 * pitch_span + 0.2, 0.1, 1.0)
            return float(combined)

        except Exception:
            # Deliberate best-effort fallback: mid-scale intensity.
            return 0.5
|
|
|
|
| |
| |
| |
|
|
def print_architecture_summary():
    """Print the ECFM architecture for documentation."""
    # The previous banner used Unicode box-drawing characters that had been
    # mojibake-corrupted (UTF-8 re-decoded as Latin-1), producing unreadable
    # output; rebuilt here in plain ASCII with the same content.
    print("""
+----------------------------------------------------------------+
|                 EMOLIPS Architecture Overview                  |
+----------------------------------------------------------------+
|                                                                |
|  Input Audio --+-- [SadTalker Audio Encoder]                   |
|                |        |                                      |
|                |   Expression Coefficients (beta)              |
|                |        |                                      |
|                +-- [Speech Emotion Encoder]                    |
|                |        |                                      |
|                |   Emotion Embedding (e)                       |
|                |        |                                      |
|                +-- [Intensity Estimator]                       |
|                         |                                      |
|                    Intensity (alpha)                           |
|                         |                                      |
|   +------------------------------------------------+           |
|   |      Emotion-Conditioned Fusion Module         |           |
|   |                                                |           |
|   |  (e, alpha) -> EmotionEncoder        -> e_hat  |           |
|   |  beta -> FiLM_coarse(beta | e_hat)   -> b1     |           |
|   |  b1   -> Residual Refine             -> b2     |           |
|   |  b2   -> FiLM_fine(b2 | e_hat)       -> b3     |           |
|   |  b3   -> LipConsistencyGate(beta, e_hat)       |           |
|   |                                      -> beta'  |           |
|   +------------------------------------------------+           |
|                         |                                      |
|  Input Image --> [SadTalker Face Renderer]                     |
|                         |                                      |
|           Emotion-Driven Output Video                          |
|                                                                |
+----------------------------------------------------------------+
""")
|
|
|
|
if __name__ == "__main__":
    print_architecture_summary()

    # Smoke test: push a random coefficient batch through the ECFM and
    # confirm the output shape matches the input shape.
    ecfm = EmotionConditionedFusionModule(coeff_dim=64, emotion_dim=128)
    demo_coeffs = torch.randn(2, 30, 64)
    demo_emotion = torch.tensor([1, 3])
    demo_intensity = torch.tensor([0.8, 0.6])

    demo_out = ecfm(demo_coeffs, demo_emotion, demo_intensity)
    print(f"Input coeffs: {demo_coeffs.shape}")
    print(f"Output coeffs: {demo_out.shape}")
    print(f"β ECFM forward pass successful")
|
|