File size: 7,739 Bytes
9ec2c53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/usr/bin/env python3
"""
EMOLIPS Inference Script
========================
Usage:
    # Auto-detect emotion from audio:
    python inference.py --audio speech.wav --image face.jpg

    # Specify emotion + intensity:
    python inference.py --audio speech.wav --image face.jpg --emotion happy --intensity 0.8

    # Generate all 7 emotions:
    python inference.py --audio speech.wav --image face.jpg --all-emotions

    # Standalone mode (no SadTalker needed, just face warping demo):
    python inference.py --image face.jpg --standalone

    # Quick test (no GPU, no SadTalker, just verify pipeline):
    python inference.py --test
"""

import argparse
import os
import sys
import time
import json


def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser shared by every EMOLIPS mode."""
    parser = argparse.ArgumentParser(description="EMOLIPS: Emotion-Driven Lip-Sync")

    # Required inputs (validated per-mode, since --test needs neither)
    parser.add_argument("--audio", "-a", type=str, help="Path to audio file (WAV/MP3)")
    parser.add_argument("--image", "-i", type=str, help="Path to source face image")

    # Emotion control
    parser.add_argument("--emotion", "-e", type=str, default=None,
                       choices=["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"],
                       help="Target emotion (auto-detected if not specified)")
    parser.add_argument("--intensity", type=float, default=None,
                       help="Emotion intensity 0.0-1.0 (auto-estimated if not specified)")

    # Generation modes
    parser.add_argument("--all-emotions", action="store_true",
                       help="Generate all 7 emotion variants")
    parser.add_argument("--standalone", action="store_true",
                       help="Standalone mode (no SadTalker, face warping only)")
    parser.add_argument("--test", action="store_true",
                       help="Quick pipeline test")

    # Output
    parser.add_argument("--output", "-o", type=str, default="outputs/emolips_output.mp4",
                       help="Output video path")
    parser.add_argument("--output-dir", type=str, default="outputs",
                       help="Output directory for multi-emotion generation")

    # SadTalker settings
    parser.add_argument("--sadtalker-dir", type=str, default="./SadTalker",
                       help="Path to SadTalker directory")
    parser.add_argument("--expression-scale", type=float, default=1.0,
                       help="SadTalker expression scale")
    parser.add_argument("--still", action="store_true",
                       help="Reduce head motion")
    parser.add_argument("--preprocess", type=str, default="crop",
                       choices=["crop", "resize", "full", "extcrop", "extfull"])
    parser.add_argument("--size", type=int, default=256, choices=[256, 512])

    # Hardware
    parser.add_argument("--device", type=str, default="cuda",
                       help="Device: cuda or cpu")

    return parser


def _run_test() -> None:
    """Quick self-test of the emotion module (no GPU, no SadTalker).

    Imports torch/numpy lazily so the other modes never pay for them.
    """
    print("=" * 50)
    print("  EMOLIPS Pipeline Test")
    print("=" * 50)

    print("\n[1] Testing emotion module...")
    from emotion_module import (
        EmotionConditionedFusionModule,
        PracticalEmotionModifier,
        print_architecture_summary
    )
    import torch
    # BUGFIX: numpy was previously imported *after* its first use; the late
    # function-local import also made `np` local to main(), so test mode
    # crashed with UnboundLocalError regardless of the module-level import.
    import numpy as np

    # Test PyTorch module: one forward pass through the fusion module.
    model = EmotionConditionedFusionModule()
    coeffs = torch.randn(1, 30, 64)
    emotion = torch.tensor([1])
    intensity = torch.tensor([0.8])
    out = model(coeffs, emotion, intensity)
    # BUGFIX: the two shapes were printed with no separator between them.
    print(f"  ✓ ECFM forward pass: {coeffs.shape}{out.shape}")

    # Test practical modifier: every emotion should perturb the coefficients.
    modifier = PracticalEmotionModifier()
    base = np.random.randn(30, 64).astype(np.float32)
    for e in ["happy", "sad", "angry", "fear", "surprise", "disgust"]:
        modified = modifier.modify_coefficients(base, e, 0.7)
        diff = np.mean(np.abs(modified - base))
        print(f"  ✓ {e:10s} → mean delta: {diff:.4f}")

    print("\n[2] Testing architecture summary...")
    print_architecture_summary()

    print("\n[3] Pipeline structure OK!")
    print("  ✓ All tests passed")
    print("\nTo run full inference:")
    print("  python inference.py --audio speech.wav --image face.jpg --emotion happy")


def _run_standalone(args) -> None:
    """Standalone demo: face warping only, no SadTalker required.

    Requires --image; if --audio is also given, the detected emotion is
    reported before the demo videos are generated. Exits with status 1
    when --image is missing.
    """
    if not args.image:
        print("Error: --image required for standalone mode")
        sys.exit(1)

    from pipeline import EmolipsStandalone

    print("=" * 50)
    print("  EMOLIPS Standalone Demo")
    print("=" * 50)

    standalone = EmolipsStandalone()

    # If audio provided, detect and report the emotion (informational only).
    if args.audio:
        detection = standalone.emotion_detector.detect(args.audio)
        print(f"\nDetected emotion: {detection['detected_emotion']} "
              f"({detection['confidence']:.2f})")
        print(f"All scores: {json.dumps(detection.get('all_scores', {}), indent=2)}")

    # Generate demo videos: one per requested emotion, or the full set
    # when --emotion was not given (None lets the pipeline choose all).
    emotions = [args.emotion] if args.emotion else None
    standalone.save_demo_video(
        image_path=args.image,
        emotions=emotions,
        output_dir=args.output_dir
    )

    print(f"\n✓ Demo videos saved to {args.output_dir}/")


def _run_full(args) -> None:
    """Full SadTalker-backed pipeline: single emotion or all 7 variants.

    Requires both --audio and --image; exits with status 1 otherwise.
    """
    if not args.audio or not args.image:
        print("Error: --audio and --image required for full pipeline")
        print("       Use --standalone for demo without audio")
        print("       Use --test for quick pipeline test")
        sys.exit(1)

    from pipeline import EmolipsPipeline

    pipeline = EmolipsPipeline(
        sadtalker_dir=args.sadtalker_dir,
        device=args.device
    )

    start_time = time.time()

    if args.all_emotions:
        # Generate all 7 emotions into --output-dir.
        results = pipeline.generate_all_emotions(
            audio_path=args.audio,
            image_path=args.image,
            output_dir=args.output_dir,
            # BUGFIX: `args.intensity or 0.7` treated an explicit
            # `--intensity 0.0` as unset; test for None instead.
            intensity=0.7 if args.intensity is None else args.intensity,
            expression_scale=args.expression_scale,
            still_mode=args.still,
            preprocess=args.preprocess,
            size=args.size
        )
        success_count = sum(1 for r in results if r.get("success"))
        print(f"\n{'='*50}")
        print(f"  Generated {success_count}/7 emotion variants")
        print(f"  Output dir: {args.output_dir}")
    else:
        # Single emotion generation to --output (create parent dir if needed).
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)

        result = pipeline.generate(
            audio_path=args.audio,
            image_path=args.image,
            emotion=args.emotion,
            intensity=args.intensity,
            output_path=args.output,
            expression_scale=args.expression_scale,
            still_mode=args.still,
            preprocess=args.preprocess,
            size=args.size
        )

        if result.get("success"):
            print(f"\n✓ Output: {result['output']}")
        else:
            print(f"\n✗ Generation failed")

    elapsed = time.time() - start_time
    print(f"  Time: {elapsed:.1f}s")


def main():
    """CLI entry point: parse arguments and dispatch to the selected mode.

    Mode precedence matches the original behavior: --test wins over
    --standalone, which wins over the full pipeline.
    """
    args = _build_parser().parse_args()

    if args.test:
        _run_test()
        return

    if args.standalone:
        _run_standalone(args)
        return

    _run_full(args)


if __name__ == "__main__":
    # NOTE: the previous module-level `import numpy as np` here never helped
    # test mode — main() re-imports numpy in its own scope, which makes `np`
    # a local name there, so this binding was dead weight and was removed.
    main()