#!/usr/bin/env python3
"""
EMOLIPS Inference Script
========================
Usage:
# Auto-detect emotion from audio:
python inference.py --audio speech.wav --image face.jpg
# Specify emotion + intensity:
python inference.py --audio speech.wav --image face.jpg --emotion happy --intensity 0.8
# Generate all 7 emotions:
python inference.py --audio speech.wav --image face.jpg --all-emotions
# Standalone mode (no SadTalker needed, just face warping demo):
python inference.py --image face.jpg --standalone
# Quick test (no GPU, no SadTalker, just verify pipeline):
python inference.py --test
"""
import argparse
import json
import os
import sys
import time

import numpy as np  # used by the --test mode below


def main():
    parser = argparse.ArgumentParser(description="EMOLIPS: Emotion-Driven Lip-Sync")

    # Required inputs
    parser.add_argument("--audio", "-a", type=str, help="Path to audio file (WAV/MP3)")
    parser.add_argument("--image", "-i", type=str, help="Path to source face image")

    # Emotion control
    parser.add_argument("--emotion", "-e", type=str, default=None,
                        choices=["neutral", "happy", "sad", "angry",
                                 "fear", "surprise", "disgust"],
                        help="Target emotion (auto-detected if not specified)")
    parser.add_argument("--intensity", type=float, default=None,
                        help="Emotion intensity 0.0-1.0 (auto-estimated if not specified)")

    # Generation modes
    parser.add_argument("--all-emotions", action="store_true",
                        help="Generate all 7 emotion variants")
    parser.add_argument("--standalone", action="store_true",
                        help="Standalone mode (no SadTalker, face warping only)")
    parser.add_argument("--test", action="store_true",
                        help="Quick pipeline test")

    # Output
    parser.add_argument("--output", "-o", type=str, default="outputs/emolips_output.mp4",
                        help="Output video path")
    parser.add_argument("--output-dir", type=str, default="outputs",
                        help="Output directory for multi-emotion generation")

    # SadTalker settings
    parser.add_argument("--sadtalker-dir", type=str, default="./SadTalker",
                        help="Path to SadTalker directory")
    parser.add_argument("--expression-scale", type=float, default=1.0,
                        help="SadTalker expression scale")
    parser.add_argument("--still", action="store_true",
                        help="Reduce head motion")
    parser.add_argument("--preprocess", type=str, default="crop",
                        choices=["crop", "resize", "full", "extcrop", "extfull"],
                        help="SadTalker face preprocessing mode")
    parser.add_argument("--size", type=int, default=256, choices=[256, 512],
                        help="Face render resolution")

    # Hardware
    parser.add_argument("--device", type=str, default="cuda",
                        help="Device: cuda or cpu")

    args = parser.parse_args()

    # ================================================================
    # MODE: Quick test
    # ================================================================
    if args.test:
        print("=" * 50)
        print(" EMOLIPS Pipeline Test")
        print("=" * 50)

        print("\n[1] Testing emotion module...")
        from emotion_module import (
            EmotionConditionedFusionModule,
            PracticalEmotionModifier,
            print_architecture_summary,
        )
        import torch

        # Forward pass through the PyTorch fusion module with a dummy
        # batch: 1 clip, 30 frames, 64 expression coefficients per frame.
        model = EmotionConditionedFusionModule()
        coeffs = torch.randn(1, 30, 64)
        emotion = torch.tensor([1])
        intensity = torch.tensor([0.8])
        out = model(coeffs, emotion, intensity)
        print(f" ✓ ECFM forward pass: {coeffs.shape} → {out.shape}")

        # Rule-based modifier: check that each emotion actually perturbs
        # the base coefficients.
        modifier = PracticalEmotionModifier()
        base = np.random.randn(30, 64).astype(np.float32)
        for e in ["happy", "sad", "angry", "fear", "surprise", "disgust"]:
            modified = modifier.modify_coefficients(base, e, 0.7)
            diff = np.mean(np.abs(modified - base))
            print(f" ✓ {e:10s} → mean delta: {diff:.4f}")

        print("\n[2] Testing architecture summary...")
        print_architecture_summary()

        print("\n[3] Pipeline structure OK!")
        print(" ✓ All tests passed")
        print("\nTo run full inference:")
        print(" python inference.py --audio speech.wav --image face.jpg --emotion happy")
        return
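
    # The test above relies only on this rough emotion_module interface
    # (a sketch inferred from the calls in this script; the real module
    # may differ):
    #
    #   class EmotionConditionedFusionModule(torch.nn.Module):
    #       def forward(self, coeffs, emotion, intensity):
    #           # (B, T, 64) coeffs + (B,) emotion ids + (B,) intensities
    #           # -> (B, T, 64) emotion-conditioned coefficients
    #
    #   class PracticalEmotionModifier:
    #       def modify_coefficients(self, coeffs, emotion, intensity):
    #           # (T, 64) float32 array + emotion name + intensity in [0, 1]
    #           # -> modified (T, 64) array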

    # ================================================================
    # MODE: Standalone (no SadTalker)
    # ================================================================
    if args.standalone:
        if not args.image:
            print("Error: --image required for standalone mode")
            sys.exit(1)

        from pipeline import EmolipsStandalone

        print("=" * 50)
        print(" EMOLIPS Standalone Demo")
        print("=" * 50)

        standalone = EmolipsStandalone()

        # If audio is provided, detect the emotion first
        if args.audio:
            detection = standalone.emotion_detector.detect(args.audio)
            print(f"\nDetected emotion: {detection['detected_emotion']} "
                  f"({detection['confidence']:.2f})")
            print(f"All scores: {json.dumps(detection.get('all_scores', {}), indent=2)}")

        # Generate demo videos: the requested emotion, or all of them
        emotions = [args.emotion] if args.emotion else None
        standalone.save_demo_video(
            image_path=args.image,
            emotions=emotions,
            output_dir=args.output_dir,
        )
        print(f"\n✓ Demo videos saved to {args.output_dir}/")
        return
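
    # From the usage above, pipeline.EmolipsStandalone is assumed to expose
    # roughly (names inferred from this script, not guaranteed):
    #   .emotion_detector.detect(audio_path) -> dict with keys
    #       "detected_emotion", "confidence", and optionally "all_scores"
    #   .save_demo_video(image_path, emotions, output_dir)
    # where emotions=None means "render every supported emotion".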

    # ================================================================
    # MODE: Full pipeline
    # ================================================================
    if not args.audio or not args.image:
        print("Error: --audio and --image are required for the full pipeline")
        print("  Use --standalone for a demo without audio")
        print("  Use --test for a quick pipeline test")
        sys.exit(1)

    from pipeline import EmolipsPipeline

    pipeline = EmolipsPipeline(
        sadtalker_dir=args.sadtalker_dir,
        device=args.device,
    )

    start_time = time.time()

    if args.all_emotions:
        # Generate all 7 emotion variants of the same clip
        results = pipeline.generate_all_emotions(
            audio_path=args.audio,
            image_path=args.image,
            output_dir=args.output_dir,
            # 0.0 is a valid intensity, so test against None rather than
            # relying on `or`, which would silently override it with 0.7
            intensity=args.intensity if args.intensity is not None else 0.7,
            expression_scale=args.expression_scale,
            still_mode=args.still,
            preprocess=args.preprocess,
            size=args.size,
        )
        success_count = sum(1 for r in results if r.get("success"))
        print(f"\n{'=' * 50}")
        print(f" Generated {success_count}/7 emotion variants")
        print(f" Output dir: {args.output_dir}")
    else:
        # Single-emotion generation (emotion/intensity may be None, in
        # which case the pipeline auto-detects them from the audio)
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        result = pipeline.generate(
            audio_path=args.audio,
            image_path=args.image,
            emotion=args.emotion,
            intensity=args.intensity,
            output_path=args.output,
            expression_scale=args.expression_scale,
            still_mode=args.still,
            preprocess=args.preprocess,
            size=args.size,
        )
        if result.get("success"):
            print(f"\n✓ Output: {result['output']}")
        else:
            print("\n✗ Generation failed")

    elapsed = time.time() - start_time
    print(f" Time: {elapsed:.1f}s")


if __name__ == "__main__":
    main()