#!/usr/bin/env python3
"""
EMOLIPS Inference Script
========================

Usage:
    # Auto-detect emotion from audio:
    python inference.py --audio speech.wav --image face.jpg

    # Specify emotion + intensity:
    python inference.py --audio speech.wav --image face.jpg --emotion happy --intensity 0.8

    # Generate all 7 emotions:
    python inference.py --audio speech.wav --image face.jpg --all-emotions

    # Standalone mode (no SadTalker needed, just face warping demo):
    python inference.py --image face.jpg --standalone

    # Quick test (no GPU, no SadTalker, just verify pipeline):
    python inference.py --test
"""

import argparse
import json
import os
import sys
import time


def main():
    parser = argparse.ArgumentParser(description="EMOLIPS: Emotion-Driven Lip-Sync")

    # Required inputs
    parser.add_argument("--audio", "-a", type=str, help="Path to audio file (WAV/MP3)")
    parser.add_argument("--image", "-i", type=str, help="Path to source face image")

    # Emotion control
    parser.add_argument("--emotion", "-e", type=str, default=None,
                        choices=["neutral", "happy", "sad", "angry",
                                 "fear", "surprise", "disgust"],
                        help="Target emotion (auto-detected if not specified)")
    parser.add_argument("--intensity", type=float, default=None,
                        help="Emotion intensity 0.0-1.0 (auto-estimated if not specified)")

    # Generation modes
    parser.add_argument("--all-emotions", action="store_true",
                        help="Generate all 7 emotion variants")
    parser.add_argument("--standalone", action="store_true",
                        help="Standalone mode (no SadTalker, face warping only)")
    parser.add_argument("--test", action="store_true", help="Quick pipeline test")

    # Output
    parser.add_argument("--output", "-o", type=str, default="outputs/emolips_output.mp4",
                        help="Output video path")
    parser.add_argument("--output-dir", type=str, default="outputs",
                        help="Output directory for multi-emotion generation")

    # SadTalker settings
    parser.add_argument("--sadtalker-dir", type=str, default="./SadTalker",
                        help="Path to SadTalker directory")
    parser.add_argument("--expression-scale", type=float, default=1.0,
                        help="SadTalker expression scale")
    parser.add_argument("--still", action="store_true", help="Reduce head motion")
    parser.add_argument("--preprocess", type=str, default="crop",
                        choices=["crop", "resize", "full", "extcrop", "extfull"])
    parser.add_argument("--size", type=int, default=256, choices=[256, 512])

    # Hardware
    parser.add_argument("--device", type=str, default="cuda", help="Device: cuda or cpu")

    args = parser.parse_args()

    # ================================================================
    # MODE: Quick test
    # ================================================================
    if args.test:
        print("=" * 50)
        print(" EMOLIPS Pipeline Test")
        print("=" * 50)

        print("\n[1] Testing emotion module...")
        from emotion_module import (
            EmotionConditionedFusionModule,
            PracticalEmotionModifier,
            print_architecture_summary,
        )
        import numpy as np
        import torch

        # Test the PyTorch module on a dummy batch:
        # (batch=1, frames=30, coeff_dim=64) random coefficients.
        model = EmotionConditionedFusionModule()
        coeffs = torch.randn(1, 30, 64)
        emotion = torch.tensor([1])
        intensity = torch.tensor([0.8])
        out = model(coeffs, emotion, intensity)
        print(f"  ✓ ECFM forward pass: {coeffs.shape} → {out.shape}")

        # Test the practical modifier on each non-neutral emotion
        modifier = PracticalEmotionModifier()
        base = np.random.randn(30, 64).astype(np.float32)
        for e in ["happy", "sad", "angry", "fear", "surprise", "disgust"]:
            modified = modifier.modify_coefficients(base, e, 0.7)
            diff = np.mean(np.abs(modified - base))
            print(f"  ✓ {e:10s} → mean delta: {diff:.4f}")

        print("\n[2] Testing architecture summary...")
        print_architecture_summary()
structure OK!") print(" ✓ All tests passed") print("\nTo run full inference:") print(" python inference.py --audio speech.wav --image face.jpg --emotion happy") return # ================================================================ # MODE: Standalone (no SadTalker) # ================================================================ if args.standalone: if not args.image: print("Error: --image required for standalone mode") sys.exit(1) from pipeline import EmolipsStandalone print("=" * 50) print(" EMOLIPS Standalone Demo") print("=" * 50) standalone = EmolipsStandalone() # If audio provided, detect emotion if args.audio: detection = standalone.emotion_detector.detect(args.audio) print(f"\nDetected emotion: {detection['detected_emotion']} " f"({detection['confidence']:.2f})") print(f"All scores: {json.dumps(detection.get('all_scores', {}), indent=2)}") # Generate demo videos for all emotions emotions = [args.emotion] if args.emotion else None standalone.save_demo_video( image_path=args.image, emotions=emotions, output_dir=args.output_dir ) print(f"\n✓ Demo videos saved to {args.output_dir}/") return # ================================================================ # MODE: Full pipeline # ================================================================ if not args.audio or not args.image: print("Error: --audio and --image required for full pipeline") print(" Use --standalone for demo without audio") print(" Use --test for quick pipeline test") sys.exit(1) from pipeline import EmolipsPipeline pipeline = EmolipsPipeline( sadtalker_dir=args.sadtalker_dir, device=args.device ) start_time = time.time() if args.all_emotions: # Generate all 7 emotions results = pipeline.generate_all_emotions( audio_path=args.audio, image_path=args.image, output_dir=args.output_dir, intensity=args.intensity or 0.7, expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size ) success_count = sum(1 for r in results if r.get("success")) print(f"\n{'='*50}") print(f" Generated {success_count}/7 emotion variants") print(f" Output dir: {args.output_dir}") else: # Single emotion generation os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) result = pipeline.generate( audio_path=args.audio, image_path=args.image, emotion=args.emotion, intensity=args.intensity, output_path=args.output, expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size ) if result.get("success"): print(f"\n✓ Output: {result['output']}") else: print(f"\n✗ Generation failed") elapsed = time.time() - start_time print(f" Time: {elapsed:.1f}s") if __name__ == "__main__": import numpy as np # Needed for test mode main()