# emolips/code/emotion_module.py
# Uploaded by primal-sage via huggingface_hub (commit c2d8a02, verified)
"""
EmotionConditionedFusionModule (ECFM)
=====================================
Core novelty component of EMOLIPS framework.
Architecture:
Audio β†’ [Speech Emotion Encoder] β†’ Emotion Embedding (e)
Audio + Image β†’ [SadTalker Backbone] β†’ 3DMM Expression Coefficients (Ξ²)
(e, Ξ²) β†’ [FiLM Conditioning Layer] β†’ Emotion-Modulated Coefficients (Ξ²')
Ξ²' β†’ [Face Renderer] β†’ Output Video
The FiLM (Feature-wise Linear Modulation) layers inject emotion information
into the expression coefficient space, enabling emotion-controllable generation
from the same audio input.
Key Contribution:
- Emotion-to-AU prior mapping learned from expression coefficient space
- Continuous intensity control via embedding scaling
- Cross-emotion consistency preservation through phoneme-aware weighting
"""
import torch
import torch.nn as nn
import numpy as np
import os
import json
import warnings
from typing import Dict, Tuple, Optional, List
warnings.filterwarnings("ignore")
# ============================================================
# EMOTION CONFIGURATION & PRIORS
# ============================================================
# Pre-defined emotion-to-expression coefficient deltas
# These map emotions to 3DMM expression basis adjustments
# Derived from FACS AU activation patterns for each emotion
# Emotion prior table. Each entry maps an emotion to:
#   - expression_delta: a 64-dim 3DMM expression-coefficient offset
#     (filled in by _generate_expression_deltas() below; neutral stays zero)
#   - region scales (brow/mouth/jaw/cheek) used to synthesize that delta
#   - au_targets: the FACS Action Unit activations the delta approximates
# NOTE(review): the region-to-dimension mapping (jaw dims 0-9, lips 10-24,
# brows 25-34, cheeks 35-44) is an assumption of this codebase — confirm it
# matches the actual 3DMM basis used by the SadTalker backbone.
EMOTION_PROFILES = {
    "neutral": {
        "expression_delta": np.zeros(64),  # No modification
        "brow_scale": 0.0,
        "mouth_scale": 0.0,
        "jaw_scale": 0.0,
        "description": "Baseline - no emotional modulation"
    },
    "happy": {
        "expression_delta": None,  # Generated below
        "brow_scale": 0.15,   # Slight brow raise
        "mouth_scale": 0.35,  # Wider mouth (AU12 lip corner pull)
        "jaw_scale": 0.1,     # Slight jaw drop
        "cheek_scale": 0.3,   # AU6 cheek raise
        "au_targets": {"AU6": 0.7, "AU12": 0.8, "AU25": 0.3},
        "description": "Happiness - AU6+AU12 dominant"
    },
    "sad": {
        "expression_delta": None,
        "brow_scale": -0.2,    # Inner brow raise (AU1)
        "mouth_scale": -0.25,  # Lip corner depress (AU15)
        "jaw_scale": -0.05,
        "cheek_scale": -0.1,
        "au_targets": {"AU1": 0.6, "AU4": 0.4, "AU15": 0.7, "AU17": 0.5},
        "description": "Sadness - AU1+AU15+AU17 dominant"
    },
    "angry": {
        "expression_delta": None,
        "brow_scale": -0.35,  # Brow lowerer (AU4)
        "mouth_scale": 0.15,  # Lip tightener (AU23)
        "jaw_scale": 0.2,     # Jaw clench
        "cheek_scale": 0.05,
        "au_targets": {"AU4": 0.8, "AU7": 0.6, "AU23": 0.7, "AU24": 0.5},
        "description": "Anger - AU4+AU7+AU23 dominant"
    },
    "fear": {
        "expression_delta": None,
        "brow_scale": 0.4,    # Brow raise (AU1+AU2)
        "mouth_scale": 0.2,   # Lip stretch (AU20)
        "jaw_scale": 0.15,
        "cheek_scale": -0.05,
        "au_targets": {"AU1": 0.8, "AU2": 0.7, "AU4": 0.3, "AU20": 0.6},
        "description": "Fear - AU1+AU2+AU20 dominant"
    },
    "surprise": {
        "expression_delta": None,
        "brow_scale": 0.5,    # Strong brow raise (AU1+AU2)
        "mouth_scale": 0.3,   # Jaw drop (AU26)
        "jaw_scale": 0.4,     # Wide jaw opening
        "cheek_scale": 0.0,
        "au_targets": {"AU1": 0.9, "AU2": 0.9, "AU25": 0.7, "AU26": 0.8},
        "description": "Surprise - AU1+AU2+AU26 dominant"
    },
    "disgust": {
        "expression_delta": None,
        "brow_scale": -0.15,  # Slight brow lower
        "mouth_scale": -0.2,  # Upper lip raise (AU10)
        "jaw_scale": 0.05,
        "cheek_scale": 0.1,   # Nose wrinkle pushes cheeks
        "au_targets": {"AU9": 0.8, "AU10": 0.7, "AU4": 0.3},
        "description": "Disgust - AU9+AU10 dominant"
    }
}
def _generate_expression_deltas():
    """
    Generate 3DMM expression coefficient deltas from AU targets.

    Fills the "expression_delta" field of every non-neutral entry in
    EMOTION_PROFILES in place. Maps FACS Action Units to expression basis
    coefficients. This is the learned 'emotion-to-AU prior' (Novelty 2
    from paper).
    """
    # Hoisted out of the loop; scipy is only needed at module init time.
    from scipy.ndimage import gaussian_filter1d

    # Use a dedicated legacy RNG instead of np.random.seed(42): same
    # reproducible sequence, but without silently reseeding NumPy's global
    # state for every other np.random user in the process.
    rng = np.random.RandomState(42)
    # 3DMM expression basis has 64 dimensions:
    # first ~10 control jaw, next ~15 control lips, next ~10 brows, rest are subtle
    for emotion, profile in EMOTION_PROFILES.items():
        if emotion == "neutral":
            continue  # neutral keeps its all-zero delta
        delta = np.zeros(64)
        # Jaw region (dims 0-9)
        delta[0:10] = profile["jaw_scale"] * rng.randn(10) * 0.3
        delta[0] = profile["jaw_scale"]  # Primary jaw
        # Lip region (dims 10-24)
        delta[10:25] = profile["mouth_scale"] * rng.randn(15) * 0.3
        delta[10] = profile["mouth_scale"]        # Primary lip width
        delta[12] = profile["mouth_scale"] * 0.7  # Lip corners
        # Brow region (dims 25-34)
        delta[25:35] = profile["brow_scale"] * rng.randn(10) * 0.3
        delta[25] = profile["brow_scale"]  # Primary brow
        # Cheek region (dims 35-44) — only some profiles define it
        if "cheek_scale" in profile:
            delta[35:45] = profile["cheek_scale"] * rng.randn(10) * 0.2
        # Smooth the delta to avoid artifacts
        delta = gaussian_filter1d(delta, sigma=1.5)
        # Normalize to reasonable range (max |delta| == 0.4)
        delta = delta / (np.max(np.abs(delta)) + 1e-8) * 0.4
        profile["expression_delta"] = delta


_generate_expression_deltas()
# ============================================================
# FiLM CONDITIONING LAYER (Feature-wise Linear Modulation)
# ============================================================
class FiLMLayer(nn.Module):
    """
    Feature-wise Linear Modulation (FiLM) layer.

    Perez et al., "FiLM: Visual Reasoning with a General Conditioning Layer",
    AAAI 2018. Modulates input features x with a conditioning signal:

        FiLM(x | gamma, beta) = gamma * x + beta

    where the scale gamma and the shift beta are each predicted from the
    conditioning vector by a small bounded head.
    """

    def __init__(self, feature_dim: int, conditioning_dim: int):
        super().__init__()
        # Sigmoid keeps the raw scale in (0, 1); forward doubles it to (0, 2)
        # so the layer can both attenuate and amplify while staying stable.
        self.scale_predictor = nn.Sequential(
            nn.Linear(conditioning_dim, feature_dim),
            nn.Sigmoid()
        )
        # Tanh keeps the raw shift in (-1, 1); forward halves it to (-0.5, 0.5).
        self.shift_predictor = nn.Sequential(
            nn.Linear(conditioning_dim, feature_dim),
            nn.Tanh()
        )

    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
        """Return gamma(conditioning) * x + beta(conditioning)."""
        scale = 2 * self.scale_predictor(conditioning)    # in (0, 2)
        shift = 0.5 * self.shift_predictor(conditioning)  # in (-0.5, 0.5)
        return scale * x + shift
class EmotionEncoder(nn.Module):
    """
    Emotion Encoder Network.

    Maps an emotion category index plus a scalar intensity to a dense,
    layer-normalized embedding:

        emotion index -> Embedding(64)  ─┐
                                         ├─ concat -> Linear -> ReLU
        intensity     -> Linear(1, 32)  ─┘           -> Linear -> LayerNorm
    """

    def __init__(self, num_emotions: int = 7, embedding_dim: int = 128):
        super().__init__()
        self.num_emotions = num_emotions
        self.embedding_dim = embedding_dim
        self.emotion_embed = nn.Embedding(num_emotions, 64)
        self.intensity_proj = nn.Linear(1, 32)
        self.fusion = nn.Sequential(
            nn.Linear(64 + 32, 128),
            nn.ReLU(),
            nn.Linear(128, embedding_dim),
            nn.LayerNorm(embedding_dim)
        )

    def forward(self, emotion_idx: torch.Tensor, intensity: torch.Tensor) -> torch.Tensor:
        """Fuse the category embedding with the projected intensity."""
        category_vec = self.emotion_embed(emotion_idx)
        intensity_vec = self.intensity_proj(intensity.unsqueeze(-1))
        fused = torch.cat([category_vec, intensity_vec], dim=-1)
        return self.fusion(fused)
class EmotionConditionedFusionModule(nn.Module):
    """
    ECFM - Emotion-Conditioned Fusion Module (Core Architecture)

    Takes expression coefficients from the SadTalker backbone and modulates
    them with emotion information via FiLM conditioning.

    Forward pass:
        1. Encode emotion (category + intensity) -> emotion embedding
        2. Apply coarse FiLM to expression coefficients
        3. Apply residual refinement
        4. Apply fine FiLM for fine-grained control
        5. Lip-consistency gating against the unmodified input coefficients

    This module sits between SadTalker's audio encoder and the face renderer.
    """

    def __init__(self, coeff_dim: int = 64, emotion_dim: int = 128, num_emotions: int = 7):
        super().__init__()
        self.emotion_encoder = EmotionEncoder(num_emotions, emotion_dim)
        # Two-stage FiLM conditioning
        self.film_coarse = FiLMLayer(coeff_dim, emotion_dim)
        self.film_fine = FiLMLayer(coeff_dim, emotion_dim)
        # Residual refinement between FiLM stages
        self.refine = nn.Sequential(
            nn.Linear(coeff_dim, coeff_dim * 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(coeff_dim * 2, coeff_dim)
        )
        # Lip-consistency gate: preserves phoneme-critical lip coefficients
        self.lip_gate = nn.Sequential(
            nn.Linear(coeff_dim + emotion_dim, coeff_dim),
            nn.Sigmoid()
        )

    def forward(
        self,
        expression_coeffs: torch.Tensor,
        emotion_idx: torch.Tensor,
        intensity: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            expression_coeffs: [B, T, 64] 3DMM expression basis coefficients
            emotion_idx: [B] emotion category index (0-6)
            intensity: [B] emotion intensity (0.0 - 1.0)
        Returns:
            modulated_coeffs: [B, T, 64] emotion-conditioned coefficients
        """
        B, T, C = expression_coeffs.shape
        # 1. Encode emotion once per clip and broadcast over time
        emotion_emb = self.emotion_encoder(emotion_idx, intensity)   # [B, emotion_dim]
        emotion_emb_t = emotion_emb.unsqueeze(1).expand(-1, T, -1)   # [B, T, emotion_dim]
        # 2-4. FiLM -> residual refine -> FiLM. The FiLM heads are pointwise
        # Linear layers over the last dim, so they apply to [B, T, C] directly.
        # BUGFIX: the previous implementation wrote FiLM results into
        # `expression_coeffs` in place via an alias (`x = expression_coeffs`;
        # `x[:, t] = ...`), which (a) mutated the caller's tensor and
        # (b) made the gate below blend against the already-modulated
        # coefficients instead of the originals, defeating lip-sync
        # preservation. Here `x` is always a fresh tensor.
        x = self.film_coarse(expression_coeffs, emotion_emb_t)
        x = x + self.refine(x)
        x = self.film_fine(x, emotion_emb_t)
        # 5. Lip-consistency gate (Novelty 6: Cross-Emotion Consistency).
        # gate=1 -> keep original (preserve lip-sync), gate=0 -> use modulated.
        gate_input = torch.cat([expression_coeffs, emotion_emb_t], dim=-1)
        gate = self.lip_gate(gate_input)  # [B, T, coeff_dim]
        return gate * expression_coeffs + (1 - gate) * x
# ============================================================
# PRACTICAL COEFFICIENT MODIFIER (The actual gimmick that works)
# ============================================================
class PracticalEmotionModifier:
    """
    Practical emotion modifier for SadTalker coefficients.

    This is what actually runs during inference. Takes SadTalker's generated
    3DMM coefficients and applies emotion-specific modifications based on
    pre-computed AU priors. Uses the emotion profiles as learned priors
    (no training needed).
    """

    EMOTION_MAP = {
        "neutral": 0, "happy": 1, "sad": 2, "angry": 3,
        "fear": 4, "surprise": 5, "disgust": 6,
        # Aliases
        "happiness": 1, "sadness": 2, "anger": 3,
        "fearful": 4, "surprised": 5, "disgusted": 6
    }

    # Canonical profile key for each EMOTION_MAP index; lets aliases like
    # "happiness" resolve to the "happy" profile (previously EMOTION_MAP was
    # declared but never consulted, so aliases hit the unknown-emotion path).
    _CANONICAL_NAMES = ["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"]

    def __init__(self):
        self.profiles = EMOTION_PROFILES

    def _canonical(self, emotion: str) -> str:
        """Lower-case and map aliases onto canonical profile keys."""
        emotion = emotion.lower()
        idx = self.EMOTION_MAP.get(emotion)
        return self._CANONICAL_NAMES[idx] if idx is not None else emotion

    def modify_coefficients(
        self,
        coeffs: np.ndarray,
        emotion: str,
        intensity: float = 0.7,
        preserve_lip_sync: bool = True
    ) -> np.ndarray:
        """
        Modify 3DMM expression coefficients with an emotion delta.

        Args:
            coeffs: [T, 64] expression coefficients from SadTalker
            emotion: Target emotion string (canonical name or alias)
            intensity: 0.0 (neutral) to 1.0 (full expression)
            preserve_lip_sync: If True, reduce modification on lip-critical dims
        Returns:
            modified: [T, 64] emotion-modulated coefficients (the input array
            is never mutated; neutral/unknown emotions return it unchanged)
        """
        emotion = self._canonical(emotion)
        if emotion not in self.profiles:
            print(f" ⚠ Unknown emotion '{emotion}', using neutral")
            return coeffs
        if emotion == "neutral":
            return coeffs
        profile = self.profiles[emotion]
        delta = profile["expression_delta"]
        if delta is None:
            return coeffs
        # Scale delta by intensity
        scaled_delta = delta * intensity
        # Apply temporal smoothing for natural onset/offset (Novelty 3)
        T = coeffs.shape[0]
        if T > 10:
            # Emotion ramps up in the first ~20% and plateaus
            ramp = np.ones(T)
            ramp_len = max(3, T // 5)
            ramp[:ramp_len] = np.linspace(0, 1, ramp_len)
            ramp[-ramp_len:] = np.linspace(1, 0.3, ramp_len)  # Slight decay, not full
            scaled_delta = scaled_delta[np.newaxis, :] * ramp[:, np.newaxis]
        else:
            # Clip too short for a ramp: apply the delta uniformly
            scaled_delta = np.tile(scaled_delta, (T, 1))
        modified = coeffs.copy()
        coeff_dim = min(coeffs.shape[1], 64)
        if preserve_lip_sync:
            # Lip-sync preservation mask (Novelty 6: Cross-Emotion Consistency)
            # Dims 10-24 are lip-critical -> reduce emotion modification there
            lip_mask = np.ones(coeff_dim)
            lip_mask[10:25] = 0.3  # Only 30% emotion influence on lip region
            lip_mask[0:10] = 0.6   # 60% on jaw (affects both speech and emotion)
            scaled_delta[:, :coeff_dim] *= lip_mask
        modified[:, :coeff_dim] += scaled_delta[:, :coeff_dim]
        return modified

    def get_all_emotion_variants(
        self,
        coeffs: np.ndarray,
        intensity: float = 0.7
    ) -> Dict[str, np.ndarray]:
        """Generate all seven emotion variants from the same base coefficients."""
        variants = {}
        for emotion in ["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"]:
            variants[emotion] = self.modify_coefficients(coeffs, emotion, intensity)
        return variants
# ============================================================
# AUDIO EMOTION DETECTOR (HuggingFace wrapper)
# ============================================================
class AudioEmotionDetector:
    """
    Detects emotion from speech audio using a pre-trained wav2vec2 model.

    Uses: ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition

    This provides the automatic emotion detection branch of the pipeline and
    can be overridden with manual emotion specification. If the model cannot
    be loaded, detect() degrades gracefully to "neutral".
    """

    def __init__(self, device: str = "cpu"):
        self.device = device
        self.classifier = None  # created lazily by load()
        # Normalize the various label spellings emitted by SER checkpoints
        # onto the seven canonical emotion names used elsewhere.
        self._label_map = {
            "angry": "angry",
            "disgust": "disgust",
            "fear": "fear",
            "happy": "happy",
            "neutral": "neutral",
            "sad": "sad",
            "surprise": "surprise",
            # Handle various model output formats
            "happiness": "happy",
            "sadness": "sad",
            "anger": "angry",
            "fearful": "fear",
            "surprised": "surprise",
            "disgusted": "disgust",
            "calm": "neutral",
            "ps": "surprise",  # Some models use abbreviations
        }

    def load(self):
        """Lazy-load the model; failure leaves classifier as None."""
        if self.classifier is not None:
            return
        try:
            from transformers import pipeline
            print(" Loading speech emotion recognition model...")
            self.classifier = pipeline(
                "audio-classification",
                model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
                device=0 if self.device == "cuda" else -1,
                top_k=7
            )
            print(" βœ“ Emotion model loaded")
        except Exception as e:
            print(f" ⚠ Failed to load emotion model: {e}")
            print(" β†’ Will use manual emotion specification")
            self.classifier = None

    def detect(self, audio_path: str) -> Dict:
        """
        Detect emotion from an audio file.

        Returns:
            dict with "detected_emotion" (str), "confidence" (float) and
            "all_scores" ({emotion: score}); an "error" key is added when
            detection could not run.
        """
        self.load()
        if self.classifier is None:
            return {
                "detected_emotion": "neutral",
                "confidence": 0.0,
                "all_scores": {},
                "error": "Model not loaded"
            }
        try:
            import librosa
            waveform, _rate = librosa.load(audio_path, sr=16000)
            scores = {}
            for entry in self.classifier(waveform):
                raw_label = entry["label"].lower()
                scores[self._label_map.get(raw_label, raw_label)] = entry["score"]
            best = max(scores, key=scores.get)
            return {
                "detected_emotion": best,
                "confidence": scores[best],
                "all_scores": scores
            }
        except Exception as e:
            print(f" ⚠ Emotion detection failed: {e}")
            return {
                "detected_emotion": "neutral",
                "confidence": 0.0,
                "all_scores": {},
                "error": str(e)
            }
# ============================================================
# EMOTION INTENSITY ESTIMATOR (Novelty 8)
# ============================================================
class EmotionIntensityEstimator:
    """
    Estimates emotion intensity from audio features.

    Uses simple heuristics:
      - energy envelope variance (more modulation -> more expressive)
      - pitch (F0) range (wider excursions -> more emotional)
    and maps them to an intensity in [0.1, 1.0]. Any failure (missing file,
    missing librosa, decode error) falls back to a moderate 0.5.
    """

    def estimate(self, audio_path: str) -> float:
        """Return an emotion-intensity estimate for the given audio file."""
        try:
            import librosa
            samples, rate = librosa.load(audio_path, sr=16000)
            # Relative energy variance: expressive speech modulates loudness.
            rms_env = librosa.feature.rms(y=samples)[0]
            energy_score = np.std(rms_env) / (np.mean(rms_env) + 1e-8)
            # Normalized F0 range over voiced frames only.
            f0_track, _, _ = librosa.pyin(samples, fmin=80, fmax=400, sr=rate)
            voiced = f0_track[~np.isnan(f0_track)]
            if len(voiced) > 0:
                pitch_score = (np.max(voiced) - np.min(voiced)) / (np.mean(voiced) + 1e-8)
            else:
                pitch_score = 0.0
            # Weighted blend of heuristics, clipped to a sane range.
            blended = 0.3 * energy_score + 0.5 * pitch_score + 0.2
            return float(np.clip(blended, 0.1, 1.0))
        except Exception:
            return 0.5  # Default moderate intensity
# ============================================================
# CONVENIENCE: Print architecture summary
# ============================================================
def print_architecture_summary() -> None:
    """Print a fixed ASCII/box-art overview of the ECFM pipeline.

    Purely cosmetic documentation helper: emits a single multi-line string
    to stdout and returns nothing. Used by the __main__ smoke test.
    """
    print("""
╔══════════════════════════════════════════════════════════════╗
β•‘              EMOLIPS Architecture Overview                   β•‘
╠══════════════════════════════════════════════════════════════╣
β•‘                                                              β•‘
β•‘  Input Audio ──┬──→ [SadTalker Audio Encoder]                β•‘
β•‘                β”‚            ↓                                β•‘
β•‘                β”‚    Expression Coefficients (Ξ²)              β•‘
β•‘                β”‚            ↓                                β•‘
β•‘                β”œβ”€β”€β†’ [Speech Emotion Encoder]                 β•‘
β•‘                β”‚            ↓                                β•‘
β•‘                β”‚    Emotion Embedding (e)                    β•‘
β•‘                β”‚            ↓                                β•‘
β•‘                └──→ [Intensity Estimator]                    β•‘
β•‘                             ↓                                β•‘
β•‘                     Intensity (Ξ±)                            β•‘
β•‘                             ↓                                β•‘
β•‘  β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”                 β•‘
β•‘  β”‚   Emotion-Conditioned Fusion Module      β”‚                 β•‘
β•‘  β”‚                                          β”‚                 β•‘
β•‘  β”‚  (e, Ξ±) β†’ EmotionEncoder β†’ Γͺ             β”‚                 β•‘
β•‘  β”‚  Ξ² β†’ FiLM_coarse(Ξ² | Γͺ) β†’ β₁             β”‚                 β•‘
β•‘  β”‚  β₁ β†’ Residual Refine β†’ Ξ²β‚‚               β”‚                 β•‘
β•‘  β”‚  Ξ²β‚‚ β†’ FiLM_fine(Ξ²β‚‚ | Γͺ) β†’ β₃             β”‚                 β•‘
β•‘  β”‚  β₃ β†’ LipConsistencyGate(Ξ², Γͺ) β†’ Ξ²'      β”‚                 β•‘
β•‘  β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜                 β•‘
β•‘                             ↓                                β•‘
β•‘  Input Image ──→ [SadTalker Face Renderer]                   β•‘
β•‘                             ↓                                β•‘
β•‘              Emotion-Driven Output Video                     β•‘
β•‘                                                              β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
""")
if __name__ == "__main__":
print_architecture_summary()
# Test the module dimensions
model = EmotionConditionedFusionModule(coeff_dim=64, emotion_dim=128)
coeffs = torch.randn(2, 30, 64) # Batch=2, T=30 frames, 64 expression coeffs
emotion = torch.tensor([1, 3]) # happy, angry
intensity = torch.tensor([0.8, 0.6])
out = model(coeffs, emotion, intensity)
print(f"Input coeffs: {coeffs.shape}")
print(f"Output coeffs: {out.shape}")
print(f"βœ“ ECFM forward pass successful")