# emolips/code/emotion_module.py
# Uploaded by primal-sage via huggingface_hub (commit c2d8a02, verified)
"""
EmotionConditionedFusionModule (ECFM)
=====================================
Core novelty component of EMOLIPS framework.
Architecture:
Audio β†’ [Speech Emotion Encoder] β†’ Emotion Embedding (e)
Audio + Image β†’ [SadTalker Backbone] β†’ 3DMM Expression Coefficients (Ξ²)
(e, Ξ²) β†’ [FiLM Conditioning Layer] β†’ Emotion-Modulated Coefficients (Ξ²')
Ξ²' β†’ [Face Renderer] β†’ Output Video
The FiLM (Feature-wise Linear Modulation) layers inject emotion information
into the expression coefficient space, enabling emotion-controllable generation
from the same audio input.
Key Contribution:
- Emotion-to-AU prior mapping learned from expression coefficient space
- Continuous intensity control via embedding scaling
- Cross-emotion consistency preservation through phoneme-aware weighting
"""
import torch
import torch.nn as nn
import numpy as np
import os
import json
import warnings
from typing import Dict, Tuple, Optional, List
warnings.filterwarnings("ignore")
# ============================================================
# EMOTION CONFIGURATION & PRIORS
# ============================================================
# Pre-defined emotion-to-expression coefficient deltas
# These map emotions to 3DMM expression basis adjustments
# Derived from FACS AU activation patterns for each emotion
# Emotion prior table. Each entry maps an emotion to:
#   - expression_delta: a 64-dim 3DMM expression-coefficient offset
#     (filled in by _generate_expression_deltas() below; neutral stays zero)
#   - region scales (brow/mouth/jaw/cheek) used to synthesize that delta
#   - au_targets: the FACS Action Unit activations the delta approximates
# NOTE(review): the region-to-dimension mapping (jaw dims 0-9, lips 10-24,
# brows 25-34, cheeks 35-44) is an assumption of this codebase — confirm it
# matches the actual 3DMM basis used by the SadTalker backbone.
EMOTION_PROFILES = {
    "neutral": {
        "expression_delta": np.zeros(64),  # No modification
        "brow_scale": 0.0,
        "mouth_scale": 0.0,
        "jaw_scale": 0.0,
        "description": "Baseline - no emotional modulation"
    },
    "happy": {
        "expression_delta": None,  # Generated below
        "brow_scale": 0.15,   # Slight brow raise
        "mouth_scale": 0.35,  # Wider mouth (AU12 lip corner pull)
        "jaw_scale": 0.1,     # Slight jaw drop
        "cheek_scale": 0.3,   # AU6 cheek raise
        "au_targets": {"AU6": 0.7, "AU12": 0.8, "AU25": 0.3},
        "description": "Happiness - AU6+AU12 dominant"
    },
    "sad": {
        "expression_delta": None,
        "brow_scale": -0.2,    # Inner brow raise (AU1)
        "mouth_scale": -0.25,  # Lip corner depress (AU15)
        "jaw_scale": -0.05,
        "cheek_scale": -0.1,
        "au_targets": {"AU1": 0.6, "AU4": 0.4, "AU15": 0.7, "AU17": 0.5},
        "description": "Sadness - AU1+AU15+AU17 dominant"
    },
    "angry": {
        "expression_delta": None,
        "brow_scale": -0.35,  # Brow lowerer (AU4)
        "mouth_scale": 0.15,  # Lip tightener (AU23)
        "jaw_scale": 0.2,     # Jaw clench
        "cheek_scale": 0.05,
        "au_targets": {"AU4": 0.8, "AU7": 0.6, "AU23": 0.7, "AU24": 0.5},
        "description": "Anger - AU4+AU7+AU23 dominant"
    },
    "fear": {
        "expression_delta": None,
        "brow_scale": 0.4,    # Brow raise (AU1+AU2)
        "mouth_scale": 0.2,   # Lip stretch (AU20)
        "jaw_scale": 0.15,
        "cheek_scale": -0.05,
        "au_targets": {"AU1": 0.8, "AU2": 0.7, "AU4": 0.3, "AU20": 0.6},
        "description": "Fear - AU1+AU2+AU20 dominant"
    },
    "surprise": {
        "expression_delta": None,
        "brow_scale": 0.5,    # Strong brow raise (AU1+AU2)
        "mouth_scale": 0.3,   # Jaw drop (AU26)
        "jaw_scale": 0.4,     # Wide jaw opening
        "cheek_scale": 0.0,
        "au_targets": {"AU1": 0.9, "AU2": 0.9, "AU25": 0.7, "AU26": 0.8},
        "description": "Surprise - AU1+AU2+AU26 dominant"
    },
    "disgust": {
        "expression_delta": None,
        "brow_scale": -0.15,  # Slight brow lower
        "mouth_scale": -0.2,  # Upper lip raise (AU10)
        "jaw_scale": 0.05,
        "cheek_scale": 0.1,   # Nose wrinkle pushes cheeks
        "au_targets": {"AU9": 0.8, "AU10": 0.7, "AU4": 0.3},
        "description": "Disgust - AU9+AU10 dominant"
    }
}
def _generate_expression_deltas():
    """
    Generate 3DMM expression coefficient deltas from AU targets.

    Fills the "expression_delta" field of every non-neutral entry in
    EMOTION_PROFILES in place. Maps FACS Action Units to expression basis
    coefficients. This is the learned 'emotion-to-AU prior' (Novelty 2
    from paper).
    """
    # Hoisted out of the loop; scipy is only needed at module init time.
    from scipy.ndimage import gaussian_filter1d

    # Use a dedicated legacy RNG instead of np.random.seed(42): same
    # reproducible sequence, but without silently reseeding NumPy's global
    # state for every other np.random user in the process.
    rng = np.random.RandomState(42)
    # 3DMM expression basis has 64 dimensions:
    # first ~10 control jaw, next ~15 control lips, next ~10 brows, rest are subtle
    for emotion, profile in EMOTION_PROFILES.items():
        if emotion == "neutral":
            continue  # neutral keeps its all-zero delta
        delta = np.zeros(64)
        # Jaw region (dims 0-9)
        delta[0:10] = profile["jaw_scale"] * rng.randn(10) * 0.3
        delta[0] = profile["jaw_scale"]  # Primary jaw
        # Lip region (dims 10-24)
        delta[10:25] = profile["mouth_scale"] * rng.randn(15) * 0.3
        delta[10] = profile["mouth_scale"]        # Primary lip width
        delta[12] = profile["mouth_scale"] * 0.7  # Lip corners
        # Brow region (dims 25-34)
        delta[25:35] = profile["brow_scale"] * rng.randn(10) * 0.3
        delta[25] = profile["brow_scale"]  # Primary brow
        # Cheek region (dims 35-44) — only some profiles define it
        if "cheek_scale" in profile:
            delta[35:45] = profile["cheek_scale"] * rng.randn(10) * 0.2
        # Smooth the delta to avoid artifacts
        delta = gaussian_filter1d(delta, sigma=1.5)
        # Normalize to reasonable range (max |delta| == 0.4)
        delta = delta / (np.max(np.abs(delta)) + 1e-8) * 0.4
        profile["expression_delta"] = delta


_generate_expression_deltas()
# ============================================================
# FiLM CONDITIONING LAYER (Feature-wise Linear Modulation)
# ============================================================
class FiLMLayer(nn.Module):
    """
    Feature-wise Linear Modulation (FiLM) layer.

    Perez et al., "FiLM: Visual Reasoning with a General Conditioning Layer",
    AAAI 2018. Modulates input features x with a conditioning signal:

        FiLM(x | gamma, beta) = gamma * x + beta

    where the scale gamma and the shift beta are each predicted from the
    conditioning vector by a small bounded head.
    """

    def __init__(self, feature_dim: int, conditioning_dim: int):
        super().__init__()
        # Sigmoid keeps the raw scale in (0, 1); forward doubles it to (0, 2)
        # so the layer can both attenuate and amplify while staying stable.
        self.scale_predictor = nn.Sequential(
            nn.Linear(conditioning_dim, feature_dim),
            nn.Sigmoid()
        )
        # Tanh keeps the raw shift in (-1, 1); forward halves it to (-0.5, 0.5).
        self.shift_predictor = nn.Sequential(
            nn.Linear(conditioning_dim, feature_dim),
            nn.Tanh()
        )

    def forward(self, x: torch.Tensor, conditioning: torch.Tensor) -> torch.Tensor:
        """Return gamma(conditioning) * x + beta(conditioning)."""
        scale = 2 * self.scale_predictor(conditioning)    # in (0, 2)
        shift = 0.5 * self.shift_predictor(conditioning)  # in (-0.5, 0.5)
        return scale * x + shift
class EmotionEncoder(nn.Module):
    """
    Emotion Encoder Network.

    Maps an emotion category index plus a scalar intensity to a dense,
    layer-normalized embedding:

        emotion index -> Embedding(64)  ─┐
                                         ├─ concat -> Linear -> ReLU
        intensity     -> Linear(1, 32)  ─┘           -> Linear -> LayerNorm
    """

    def __init__(self, num_emotions: int = 7, embedding_dim: int = 128):
        super().__init__()
        self.num_emotions = num_emotions
        self.embedding_dim = embedding_dim
        self.emotion_embed = nn.Embedding(num_emotions, 64)
        self.intensity_proj = nn.Linear(1, 32)
        self.fusion = nn.Sequential(
            nn.Linear(64 + 32, 128),
            nn.ReLU(),
            nn.Linear(128, embedding_dim),
            nn.LayerNorm(embedding_dim)
        )

    def forward(self, emotion_idx: torch.Tensor, intensity: torch.Tensor) -> torch.Tensor:
        """Fuse the category embedding with the projected intensity."""
        category_vec = self.emotion_embed(emotion_idx)
        intensity_vec = self.intensity_proj(intensity.unsqueeze(-1))
        fused = torch.cat([category_vec, intensity_vec], dim=-1)
        return self.fusion(fused)
class EmotionConditionedFusionModule(nn.Module):
    """
    ECFM - Emotion-Conditioned Fusion Module (Core Architecture)

    Takes expression coefficients from the SadTalker backbone and modulates
    them with emotion information via FiLM conditioning.

    Forward pass:
        1. Encode emotion (category + intensity) -> emotion embedding
        2. Apply coarse FiLM to expression coefficients
        3. Apply residual refinement
        4. Apply fine FiLM for fine-grained control
        5. Lip-consistency gating against the unmodified input coefficients

    This module sits between SadTalker's audio encoder and the face renderer.
    """

    def __init__(self, coeff_dim: int = 64, emotion_dim: int = 128, num_emotions: int = 7):
        super().__init__()
        self.emotion_encoder = EmotionEncoder(num_emotions, emotion_dim)
        # Two-stage FiLM conditioning
        self.film_coarse = FiLMLayer(coeff_dim, emotion_dim)
        self.film_fine = FiLMLayer(coeff_dim, emotion_dim)
        # Residual refinement between FiLM stages
        self.refine = nn.Sequential(
            nn.Linear(coeff_dim, coeff_dim * 2),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(coeff_dim * 2, coeff_dim)
        )
        # Lip-consistency gate: preserves phoneme-critical lip coefficients
        self.lip_gate = nn.Sequential(
            nn.Linear(coeff_dim + emotion_dim, coeff_dim),
            nn.Sigmoid()
        )

    def forward(
        self,
        expression_coeffs: torch.Tensor,
        emotion_idx: torch.Tensor,
        intensity: torch.Tensor
    ) -> torch.Tensor:
        """
        Args:
            expression_coeffs: [B, T, 64] 3DMM expression basis coefficients
            emotion_idx: [B] emotion category index (0-6)
            intensity: [B] emotion intensity (0.0 - 1.0)
        Returns:
            modulated_coeffs: [B, T, 64] emotion-conditioned coefficients
        """
        B, T, C = expression_coeffs.shape
        # 1. Encode emotion once per clip and broadcast over time
        emotion_emb = self.emotion_encoder(emotion_idx, intensity)   # [B, emotion_dim]
        emotion_emb_t = emotion_emb.unsqueeze(1).expand(-1, T, -1)   # [B, T, emotion_dim]
        # 2-4. FiLM -> residual refine -> FiLM. The FiLM heads are pointwise
        # Linear layers over the last dim, so they apply to [B, T, C] directly.
        # BUGFIX: the previous implementation wrote FiLM results into
        # `expression_coeffs` in place via an alias (`x = expression_coeffs`;
        # `x[:, t] = ...`), which (a) mutated the caller's tensor and
        # (b) made the gate below blend against the already-modulated
        # coefficients instead of the originals, defeating lip-sync
        # preservation. Here `x` is always a fresh tensor.
        x = self.film_coarse(expression_coeffs, emotion_emb_t)
        x = x + self.refine(x)
        x = self.film_fine(x, emotion_emb_t)
        # 5. Lip-consistency gate (Novelty 6: Cross-Emotion Consistency).
        # gate=1 -> keep original (preserve lip-sync), gate=0 -> use modulated.
        gate_input = torch.cat([expression_coeffs, emotion_emb_t], dim=-1)
        gate = self.lip_gate(gate_input)  # [B, T, coeff_dim]
        return gate * expression_coeffs + (1 - gate) * x
# ============================================================
# PRACTICAL COEFFICIENT MODIFIER (The actual gimmick that works)
# ============================================================
class PracticalEmotionModifier:
    """
    Practical emotion modifier for SadTalker coefficients.

    This is what actually runs during inference. Takes SadTalker's generated
    3DMM coefficients and applies emotion-specific modifications based on
    pre-computed AU priors. Uses the emotion profiles as learned priors
    (no training needed).
    """

    EMOTION_MAP = {
        "neutral": 0, "happy": 1, "sad": 2, "angry": 3,
        "fear": 4, "surprise": 5, "disgust": 6,
        # Aliases
        "happiness": 1, "sadness": 2, "anger": 3,
        "fearful": 4, "surprised": 5, "disgusted": 6
    }

    # Canonical profile key for each EMOTION_MAP index; lets aliases like
    # "happiness" resolve to the "happy" profile (previously EMOTION_MAP was
    # declared but never consulted, so aliases hit the unknown-emotion path).
    _CANONICAL_NAMES = ["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"]

    def __init__(self):
        self.profiles = EMOTION_PROFILES

    def _canonical(self, emotion: str) -> str:
        """Lower-case and map aliases onto canonical profile keys."""
        emotion = emotion.lower()
        idx = self.EMOTION_MAP.get(emotion)
        return self._CANONICAL_NAMES[idx] if idx is not None else emotion

    def modify_coefficients(
        self,
        coeffs: np.ndarray,
        emotion: str,
        intensity: float = 0.7,
        preserve_lip_sync: bool = True
    ) -> np.ndarray:
        """
        Modify 3DMM expression coefficients with an emotion delta.

        Args:
            coeffs: [T, 64] expression coefficients from SadTalker
            emotion: Target emotion string (canonical name or alias)
            intensity: 0.0 (neutral) to 1.0 (full expression)
            preserve_lip_sync: If True, reduce modification on lip-critical dims
        Returns:
            modified: [T, 64] emotion-modulated coefficients (the input array
            is never mutated; neutral/unknown emotions return it unchanged)
        """
        emotion = self._canonical(emotion)
        if emotion not in self.profiles:
            print(f" ⚠ Unknown emotion '{emotion}', using neutral")
            return coeffs
        if emotion == "neutral":
            return coeffs
        profile = self.profiles[emotion]
        delta = profile["expression_delta"]
        if delta is None:
            return coeffs
        # Scale delta by intensity
        scaled_delta = delta * intensity
        # Apply temporal smoothing for natural onset/offset (Novelty 3)
        T = coeffs.shape[0]
        if T > 10:
            # Emotion ramps up in the first ~20% and plateaus
            ramp = np.ones(T)
            ramp_len = max(3, T // 5)
            ramp[:ramp_len] = np.linspace(0, 1, ramp_len)
            ramp[-ramp_len:] = np.linspace(1, 0.3, ramp_len)  # Slight decay, not full
            scaled_delta = scaled_delta[np.newaxis, :] * ramp[:, np.newaxis]
        else:
            # Clip too short for a ramp: apply the delta uniformly
            scaled_delta = np.tile(scaled_delta, (T, 1))
        modified = coeffs.copy()
        coeff_dim = min(coeffs.shape[1], 64)
        if preserve_lip_sync:
            # Lip-sync preservation mask (Novelty 6: Cross-Emotion Consistency)
            # Dims 10-24 are lip-critical -> reduce emotion modification there
            lip_mask = np.ones(coeff_dim)
            lip_mask[10:25] = 0.3  # Only 30% emotion influence on lip region
            lip_mask[0:10] = 0.6   # 60% on jaw (affects both speech and emotion)
            scaled_delta[:, :coeff_dim] *= lip_mask
        modified[:, :coeff_dim] += scaled_delta[:, :coeff_dim]
        return modified

    def get_all_emotion_variants(
        self,
        coeffs: np.ndarray,
        intensity: float = 0.7
    ) -> Dict[str, np.ndarray]:
        """Generate all seven emotion variants from the same base coefficients."""
        variants = {}
        for emotion in ["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"]:
            variants[emotion] = self.modify_coefficients(coeffs, emotion, intensity)
        return variants
# ============================================================
# AUDIO EMOTION DETECTOR (HuggingFace wrapper)
# ============================================================
class AudioEmotionDetector:
    """
    Detects emotion from speech audio using a pre-trained wav2vec2 model.

    Uses: ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition

    This provides the automatic emotion detection branch of the pipeline and
    can be overridden with manual emotion specification. If the model cannot
    be loaded, detect() degrades gracefully to "neutral".
    """

    def __init__(self, device: str = "cpu"):
        self.device = device
        self.classifier = None  # created lazily by load()
        # Normalize the various label spellings emitted by SER checkpoints
        # onto the seven canonical emotion names used elsewhere.
        self._label_map = {
            "angry": "angry",
            "disgust": "disgust",
            "fear": "fear",
            "happy": "happy",
            "neutral": "neutral",
            "sad": "sad",
            "surprise": "surprise",
            # Handle various model output formats
            "happiness": "happy",
            "sadness": "sad",
            "anger": "angry",
            "fearful": "fear",
            "surprised": "surprise",
            "disgusted": "disgust",
            "calm": "neutral",
            "ps": "surprise",  # Some models use abbreviations
        }

    def load(self):
        """Lazy-load the model; failure leaves classifier as None."""
        if self.classifier is not None:
            return
        try:
            from transformers import pipeline
            print(" Loading speech emotion recognition model...")
            self.classifier = pipeline(
                "audio-classification",
                model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
                device=0 if self.device == "cuda" else -1,
                top_k=7
            )
            print(" βœ“ Emotion model loaded")
        except Exception as e:
            print(f" ⚠ Failed to load emotion model: {e}")
            print(" β†’ Will use manual emotion specification")
            self.classifier = None

    def detect(self, audio_path: str) -> Dict:
        """
        Detect emotion from an audio file.

        Returns:
            dict with "detected_emotion" (str), "confidence" (float) and
            "all_scores" ({emotion: score}); an "error" key is added when
            detection could not run.
        """
        self.load()
        if self.classifier is None:
            return {
                "detected_emotion": "neutral",
                "confidence": 0.0,
                "all_scores": {},
                "error": "Model not loaded"
            }
        try:
            import librosa
            waveform, _rate = librosa.load(audio_path, sr=16000)
            scores = {}
            for entry in self.classifier(waveform):
                raw_label = entry["label"].lower()
                scores[self._label_map.get(raw_label, raw_label)] = entry["score"]
            best = max(scores, key=scores.get)
            return {
                "detected_emotion": best,
                "confidence": scores[best],
                "all_scores": scores
            }
        except Exception as e:
            print(f" ⚠ Emotion detection failed: {e}")
            return {
                "detected_emotion": "neutral",
                "confidence": 0.0,
                "all_scores": {},
                "error": str(e)
            }
# ============================================================
# EMOTION INTENSITY ESTIMATOR (Novelty 8)
# ============================================================
class EmotionIntensityEstimator:
    """
    Estimates emotion intensity from audio features.

    Uses simple heuristics:
      - energy envelope variance (more modulation -> more expressive)
      - pitch (F0) range (wider excursions -> more emotional)
    and maps them to an intensity in [0.1, 1.0]. Any failure (missing file,
    missing librosa, decode error) falls back to a moderate 0.5.
    """

    def estimate(self, audio_path: str) -> float:
        """Return an emotion-intensity estimate for the given audio file."""
        try:
            import librosa
            samples, rate = librosa.load(audio_path, sr=16000)
            # Relative energy variance: expressive speech modulates loudness.
            rms_env = librosa.feature.rms(y=samples)[0]
            energy_score = np.std(rms_env) / (np.mean(rms_env) + 1e-8)
            # Normalized F0 range over voiced frames only.
            f0_track, _, _ = librosa.pyin(samples, fmin=80, fmax=400, sr=rate)
            voiced = f0_track[~np.isnan(f0_track)]
            if len(voiced) > 0:
                pitch_score = (np.max(voiced) - np.min(voiced)) / (np.mean(voiced) + 1e-8)
            else:
                pitch_score = 0.0
            # Weighted blend of heuristics, clipped to a sane range.
            blended = 0.3 * energy_score + 0.5 * pitch_score + 0.2
            return float(np.clip(blended, 0.1, 1.0))
        except Exception:
            return 0.5  # Default moderate intensity
# ============================================================
# CONVENIENCE: Print architecture summary
# ============================================================
def print_architecture_summary() -> None:
    """Print a fixed ASCII/box-art overview of the ECFM pipeline.

    Purely cosmetic documentation helper: emits a single multi-line string
    to stdout and returns nothing. Used by the __main__ smoke test.
    """
    print("""
╔══════════════════════════════════════════════════════════════╗
β•‘              EMOLIPS Architecture Overview                   β•‘
╠══════════════════════════════════════════════════════════════╣
β•‘                                                              β•‘
β•‘  Input Audio ──┬──→ [SadTalker Audio Encoder]                β•‘
β•‘                β”‚            ↓                                β•‘
β•‘                β”‚    Expression Coefficients (Ξ²)              β•‘
β•‘                β”‚            ↓                                β•‘
β•‘                β”œβ”€β”€β†’ [Speech Emotion Encoder]                 β•‘
β•‘                β”‚            ↓                                β•‘
β•‘                β”‚    Emotion Embedding (e)                    β•‘
β•‘                β”‚            ↓                                β•‘
β•‘                └──→ [Intensity Estimator]                    β•‘
β•‘                             ↓                                β•‘
β•‘                     Intensity (Ξ±)                            β•‘
β•‘                             ↓                                β•‘
β•‘  β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”                 β•‘
β•‘  β”‚   Emotion-Conditioned Fusion Module      β”‚                 β•‘
β•‘  β”‚                                          β”‚                 β•‘
β•‘  β”‚  (e, Ξ±) β†’ EmotionEncoder β†’ Γͺ             β”‚                 β•‘
β•‘  β”‚  Ξ² β†’ FiLM_coarse(Ξ² | Γͺ) β†’ β₁             β”‚                 β•‘
β•‘  β”‚  β₁ β†’ Residual Refine β†’ Ξ²β‚‚               β”‚                 β•‘
β•‘  β”‚  Ξ²β‚‚ β†’ FiLM_fine(Ξ²β‚‚ | Γͺ) β†’ β₃             β”‚                 β•‘
β•‘  β”‚  β₃ β†’ LipConsistencyGate(Ξ², Γͺ) β†’ Ξ²'      β”‚                 β•‘
β•‘  β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜                 β•‘
β•‘                             ↓                                β•‘
β•‘  Input Image ──→ [SadTalker Face Renderer]                   β•‘
β•‘                             ↓                                β•‘
β•‘              Emotion-Driven Output Video                     β•‘
β•‘                                                              β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
""")
if __name__ == "__main__":
print_architecture_summary()
# Test the module dimensions
model = EmotionConditionedFusionModule(coeff_dim=64, emotion_dim=128)
coeffs = torch.randn(2, 30, 64) # Batch=2, T=30 frames, 64 expression coeffs
emotion = torch.tensor([1, 3]) # happy, angry
intensity = torch.tensor([0.8, 0.6])
out = model(coeffs, emotion, intensity)
print(f"Input coeffs: {coeffs.shape}")
print(f"Output coeffs: {out.shape}")
print(f"βœ“ ECFM forward pass successful")