"""
EMOLIPS Inference Script
========================
Usage:
    # Auto-detect emotion from audio:
    python inference.py --audio speech.wav --image face.jpg

    # Specify emotion + intensity:
    python inference.py --audio speech.wav --image face.jpg --emotion happy --intensity 0.8

    # Generate all 7 emotions:
    python inference.py --audio speech.wav --image face.jpg --all-emotions

    # Standalone mode (no SadTalker needed, just face warping demo):
    python inference.py --image face.jpg --standalone

    # Quick test (no GPU, no SadTalker, just verify pipeline):
    python inference.py --test
"""
|
|
import argparse
import json
import os
import sys
import time

import numpy as np
|
|
|
|
| def main(): |
| parser = argparse.ArgumentParser(description="EMOLIPS: Emotion-Driven Lip-Sync") |
|
|
| |
| parser.add_argument("--audio", "-a", type=str, help="Path to audio file (WAV/MP3)") |
| parser.add_argument("--image", "-i", type=str, help="Path to source face image") |
|
|
| |
| parser.add_argument("--emotion", "-e", type=str, default=None, |
| choices=["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"], |
| help="Target emotion (auto-detected if not specified)") |
| parser.add_argument("--intensity", type=float, default=None, |
| help="Emotion intensity 0.0-1.0 (auto-estimated if not specified)") |
|
|
| |
| parser.add_argument("--all-emotions", action="store_true", |
| help="Generate all 7 emotion variants") |
| parser.add_argument("--standalone", action="store_true", |
| help="Standalone mode (no SadTalker, face warping only)") |
| parser.add_argument("--test", action="store_true", |
| help="Quick pipeline test") |
|
|
| |
| parser.add_argument("--output", "-o", type=str, default="outputs/emolips_output.mp4", |
| help="Output video path") |
| parser.add_argument("--output-dir", type=str, default="outputs", |
| help="Output directory for multi-emotion generation") |
|
|
| |
| parser.add_argument("--sadtalker-dir", type=str, default="./SadTalker", |
| help="Path to SadTalker directory") |
| parser.add_argument("--expression-scale", type=float, default=1.0, |
| help="SadTalker expression scale") |
| parser.add_argument("--still", action="store_true", |
| help="Reduce head motion") |
| parser.add_argument("--preprocess", type=str, default="crop", |
| choices=["crop", "resize", "full", "extcrop", "extfull"]) |
| parser.add_argument("--size", type=int, default=256, choices=[256, 512]) |
|
|
| |
| parser.add_argument("--device", type=str, default="cuda", |
| help="Device: cuda or cpu") |
|
|
| args = parser.parse_args() |
|
|
| |
| |
| |
| if args.test: |
| print("=" * 50) |
| print(" EMOLIPS Pipeline Test") |
| print("=" * 50) |
|
|
| print("\n[1] Testing emotion module...") |
| from emotion_module import ( |
| EmotionConditionedFusionModule, |
| PracticalEmotionModifier, |
| print_architecture_summary |
| ) |
| import torch |
|
|
| |
| model = EmotionConditionedFusionModule() |
| coeffs = torch.randn(1, 30, 64) |
| emotion = torch.tensor([1]) |
| intensity = torch.tensor([0.8]) |
| out = model(coeffs, emotion, intensity) |
| print(f" β ECFM forward pass: {coeffs.shape} β {out.shape}") |
|
|
| |
| modifier = PracticalEmotionModifier() |
| base = np.random.randn(30, 64).astype(np.float32) |
| for e in ["happy", "sad", "angry", "fear", "surprise", "disgust"]: |
| modified = modifier.modify_coefficients(base, e, 0.7) |
| diff = np.mean(np.abs(modified - base)) |
| print(f" β {e:10s} β mean delta: {diff:.4f}") |
|
|
| import numpy as np |
|
|
| print("\n[2] Testing architecture summary...") |
| print_architecture_summary() |
|
|
| print("\n[3] Pipeline structure OK!") |
| print(" β All tests passed") |
| print("\nTo run full inference:") |
| print(" python inference.py --audio speech.wav --image face.jpg --emotion happy") |
| return |
|
|
| |
| |
| |
| if args.standalone: |
| if not args.image: |
| print("Error: --image required for standalone mode") |
| sys.exit(1) |
|
|
| from pipeline import EmolipsStandalone |
|
|
| print("=" * 50) |
| print(" EMOLIPS Standalone Demo") |
| print("=" * 50) |
|
|
| standalone = EmolipsStandalone() |
|
|
| |
| if args.audio: |
| detection = standalone.emotion_detector.detect(args.audio) |
| print(f"\nDetected emotion: {detection['detected_emotion']} " |
| f"({detection['confidence']:.2f})") |
| print(f"All scores: {json.dumps(detection.get('all_scores', {}), indent=2)}") |
|
|
| |
| emotions = [args.emotion] if args.emotion else None |
| standalone.save_demo_video( |
| image_path=args.image, |
| emotions=emotions, |
| output_dir=args.output_dir |
| ) |
|
|
| print(f"\nβ Demo videos saved to {args.output_dir}/") |
| return |
|
|
| |
| |
| |
| if not args.audio or not args.image: |
| print("Error: --audio and --image required for full pipeline") |
| print(" Use --standalone for demo without audio") |
| print(" Use --test for quick pipeline test") |
| sys.exit(1) |
|
|
| from pipeline import EmolipsPipeline |
|
|
| pipeline = EmolipsPipeline( |
| sadtalker_dir=args.sadtalker_dir, |
| device=args.device |
| ) |
|
|
| start_time = time.time() |
|
|
| if args.all_emotions: |
| |
| results = pipeline.generate_all_emotions( |
| audio_path=args.audio, |
| image_path=args.image, |
| output_dir=args.output_dir, |
| intensity=args.intensity or 0.7, |
| expression_scale=args.expression_scale, |
| still_mode=args.still, |
| preprocess=args.preprocess, |
| size=args.size |
| ) |
| success_count = sum(1 for r in results if r.get("success")) |
| print(f"\n{'='*50}") |
| print(f" Generated {success_count}/7 emotion variants") |
| print(f" Output dir: {args.output_dir}") |
| else: |
| |
| os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True) |
|
|
| result = pipeline.generate( |
| audio_path=args.audio, |
| image_path=args.image, |
| emotion=args.emotion, |
| intensity=args.intensity, |
| output_path=args.output, |
| expression_scale=args.expression_scale, |
| still_mode=args.still, |
| preprocess=args.preprocess, |
| size=args.size |
| ) |
|
|
| if result.get("success"): |
| print(f"\nβ Output: {result['output']}") |
| else: |
| print(f"\nβ Generation failed") |
|
|
| elapsed = time.time() - start_time |
| print(f" Time: {elapsed:.1f}s") |
|
|
|
|
| if __name__ == "__main__": |
| import numpy as np |
| main() |
|
|