#!/usr/bin/env python3
"""
EMOLIPS Inference Script
========================
Usage:
# Auto-detect emotion from audio:
python inference.py --audio speech.wav --image face.jpg
# Specify emotion + intensity:
python inference.py --audio speech.wav --image face.jpg --emotion happy --intensity 0.8
# Generate all 7 emotions:
python inference.py --audio speech.wav --image face.jpg --all-emotions
# Standalone mode (no SadTalker needed, just face warping demo):
python inference.py --image face.jpg --standalone
# Quick test (no GPU, no SadTalker, just verify pipeline):
python inference.py --test
"""
import argparse
import os
import sys
import time
import json
def main():
    """CLI entry point: parse arguments, then dispatch to one of three modes
    (--test sanity check, --standalone demo, or the full SadTalker pipeline)."""
    args = _build_parser().parse_args()

    if args.test:
        _run_test()
        return

    if args.standalone:
        _run_standalone(args)
        return

    _run_full_pipeline(args)


def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser (separate helper so the flag set is easy to scan)."""
    parser = argparse.ArgumentParser(description="EMOLIPS: Emotion-Driven Lip-Sync")
    # Required inputs — validated per-mode below, so argparse does not mark them required.
    parser.add_argument("--audio", "-a", type=str, help="Path to audio file (WAV/MP3)")
    parser.add_argument("--image", "-i", type=str, help="Path to source face image")
    # Emotion control
    parser.add_argument("--emotion", "-e", type=str, default=None,
                        choices=["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"],
                        help="Target emotion (auto-detected if not specified)")
    parser.add_argument("--intensity", type=float, default=None,
                        help="Emotion intensity 0.0-1.0 (auto-estimated if not specified)")
    # Generation modes
    parser.add_argument("--all-emotions", action="store_true",
                        help="Generate all 7 emotion variants")
    parser.add_argument("--standalone", action="store_true",
                        help="Standalone mode (no SadTalker, face warping only)")
    parser.add_argument("--test", action="store_true",
                        help="Quick pipeline test")
    # Output
    parser.add_argument("--output", "-o", type=str, default="outputs/emolips_output.mp4",
                        help="Output video path")
    parser.add_argument("--output-dir", type=str, default="outputs",
                        help="Output directory for multi-emotion generation")
    # SadTalker settings
    parser.add_argument("--sadtalker-dir", type=str, default="./SadTalker",
                        help="Path to SadTalker directory")
    parser.add_argument("--expression-scale", type=float, default=1.0,
                        help="SadTalker expression scale")
    parser.add_argument("--still", action="store_true",
                        help="Reduce head motion")
    parser.add_argument("--preprocess", type=str, default="crop",
                        choices=["crop", "resize", "full", "extcrop", "extfull"])
    parser.add_argument("--size", type=int, default=256, choices=[256, 512])
    # Hardware
    parser.add_argument("--device", type=str, default="cuda",
                        help="Device: cuda or cpu")
    return parser


def _run_test() -> None:
    """Quick pipeline sanity check: exercises the emotion module only (no GPU,
    no SadTalker, no audio/image inputs needed)."""
    # BUGFIX: numpy must be imported BEFORE its first use. The original code
    # imported it after `np.random.randn(...)`; a function-local import makes
    # `np` local for the whole function body, so the earlier use raised
    # UnboundLocalError even with a module-level numpy import present.
    import numpy as np
    import torch

    from emotion_module import (
        EmotionConditionedFusionModule,
        PracticalEmotionModifier,
        print_architecture_summary
    )

    print("=" * 50)
    print(" EMOLIPS Pipeline Test")
    print("=" * 50)

    print("\n[1] Testing emotion module...")
    # Forward pass of the PyTorch fusion module on dummy coefficients.
    model = EmotionConditionedFusionModule()
    coeffs = torch.randn(1, 30, 64)  # presumably (batch, frames, coeff_dim) — confirm in emotion_module
    emotion = torch.tensor([1])
    intensity = torch.tensor([0.8])
    out = model(coeffs, emotion, intensity)
    print(f" ✓ ECFM forward pass: {coeffs.shape} → {out.shape}")

    # Rule-based modifier: each emotion should perturb the base coefficients.
    modifier = PracticalEmotionModifier()
    base = np.random.randn(30, 64).astype(np.float32)
    for e in ["happy", "sad", "angry", "fear", "surprise", "disgust"]:
        modified = modifier.modify_coefficients(base, e, 0.7)
        diff = np.mean(np.abs(modified - base))
        print(f" ✓ {e:10s} → mean delta: {diff:.4f}")

    print("\n[2] Testing architecture summary...")
    print_architecture_summary()

    print("\n[3] Pipeline structure OK!")
    print(" ✓ All tests passed")
    print("\nTo run full inference:")
    print(" python inference.py --audio speech.wav --image face.jpg --emotion happy")


def _run_standalone(args) -> None:
    """Standalone demo: face warping only, no SadTalker backend required.

    Requires --image; if --audio is given, the detected emotion is reported.
    Exits with status 1 when --image is missing.
    """
    if not args.image:
        print("Error: --image required for standalone mode")
        sys.exit(1)

    from pipeline import EmolipsStandalone

    print("=" * 50)
    print(" EMOLIPS Standalone Demo")
    print("=" * 50)

    standalone = EmolipsStandalone()

    # If audio is provided, run emotion detection and report scores.
    if args.audio:
        detection = standalone.emotion_detector.detect(args.audio)
        print(f"\nDetected emotion: {detection['detected_emotion']} "
              f"({detection['confidence']:.2f})")
        print(f"All scores: {json.dumps(detection.get('all_scores', {}), indent=2)}")

    # None means "all emotions" downstream; a single named emotion otherwise.
    emotions = [args.emotion] if args.emotion else None
    standalone.save_demo_video(
        image_path=args.image,
        emotions=emotions,
        output_dir=args.output_dir
    )
    print(f"\n✓ Demo videos saved to {args.output_dir}/")


def _run_full_pipeline(args) -> None:
    """Full SadTalker-backed generation: one emotion, or all 7 with --all-emotions.

    Requires both --audio and --image; exits with status 1 when either is missing.
    """
    if not args.audio or not args.image:
        print("Error: --audio and --image required for full pipeline")
        print(" Use --standalone for demo without audio")
        print(" Use --test for quick pipeline test")
        sys.exit(1)

    from pipeline import EmolipsPipeline

    pipeline = EmolipsPipeline(
        sadtalker_dir=args.sadtalker_dir,
        device=args.device
    )
    start_time = time.time()

    if args.all_emotions:
        # Generate all 7 emotion variants into --output-dir.
        results = pipeline.generate_all_emotions(
            audio_path=args.audio,
            image_path=args.image,
            output_dir=args.output_dir,
            # BUGFIX: `args.intensity or 0.7` silently discarded an explicit
            # `--intensity 0.0`; only fall back to 0.7 when unset.
            intensity=0.7 if args.intensity is None else args.intensity,
            expression_scale=args.expression_scale,
            still_mode=args.still,
            preprocess=args.preprocess,
            size=args.size
        )
        success_count = sum(1 for r in results if r.get("success"))
        print(f"\n{'='*50}")
        print(f" Generated {success_count}/7 emotion variants")
        print(f" Output dir: {args.output_dir}")
    else:
        # Single-emotion generation to --output (emotion/intensity may be None,
        # in which case the pipeline auto-detects/estimates them).
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        result = pipeline.generate(
            audio_path=args.audio,
            image_path=args.image,
            emotion=args.emotion,
            intensity=args.intensity,
            output_path=args.output,
            expression_scale=args.expression_scale,
            still_mode=args.still,
            preprocess=args.preprocess,
            size=args.size
        )
        if result.get("success"):
            print(f"\n✓ Output: {result['output']}")
        else:
            print("\n✗ Generation failed")

    elapsed = time.time() - start_time
    print(f" Time: {elapsed:.1f}s")
if __name__ == "__main__":
import numpy as np # Needed for test mode
main()