#!/usr/bin/env python3
"""
MuseTalk - Audio-Driven Video Generation Space
Self-hosted Gradio interface for MuseTalk
"""

import gradio as gr
import os
import tempfile
import uuid
from pathlib import Path
from inference import MuseTalkInference

# Inference engine, created lazily on first request
inference_engine = None

def initialize_engine():
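    """Create the shared MuseTalkInference engine on first use (lazy init)."""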
    global inference_engine
    if inference_engine is None:
        inference_engine = MuseTalkInference()
    return inference_engine

# Validation functions
def validate_audio(audio_path):
    """Validate audio file."""
    if not audio_path:
        return False, "Please upload an audio file"
    
    if not os.path.exists(audio_path):
        return False, "Audio file not found"
    
    # Check file size (max 100MB)
    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    if file_size > 100:
        return False, f"Audio file too large ({file_size:.1f}MB, max 100MB)"
    
    return True, "Audio file valid"
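
# The "Documentation & Setup" section below advertises a 10-minute audio
# limit, but validate_audio() only checks file size. A stdlib-only sketch of
# a duration probe for WAV uploads (an illustrative assumption; MP3/M4A/OGG
# would need a tool such as ffprobe or librosa instead):
def _wav_duration_seconds(audio_path):
    """Return the duration of a PCM WAV file in seconds, or None on failure."""
    import wave

    try:
        with wave.open(audio_path, "rb") as wf:
            return wf.getnframes() / float(wf.getframerate())
    except (wave.Error, EOFError, OSError):
        return None  # not a readable PCM WAV; skip the duration check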

def validate_video(video_path):
    """Validate video/image file."""
    if not video_path:
        return False, "Please upload a video or image file"
    
    if not os.path.exists(video_path):
        return False, "Video/image file not found"
    
    # Check file size (max 500MB)
    file_size = os.path.getsize(video_path) / (1024 * 1024)
    if file_size > 500:
        return False, f"Video/image file too large ({file_size:.1f}MB, max 500MB)"
    
    return True, "Video/image file valid"
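
# The docs list specific upload formats, while the validators above accept
# any extension. A simple allow-list check could be wired into the two
# validators (a sketch; the extension set mirrors the formats named in the
# "Documentation & Setup" accordion below):
ALLOWED_MEDIA_EXTENSIONS = {".mp4", ".avi", ".mov", ".mkv",
                            ".png", ".jpg", ".jpeg", ".bmp"}

def _has_allowed_extension(path, allowed=ALLOWED_MEDIA_EXTENSIONS):
    """Return True if the file's extension is in the allow-list."""
    return Path(path).suffix.lower() in allowed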

def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate lip-synced video using MuseTalk inference."""
    try:
        # Validate inputs
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"
        
        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"
        
        # Initialize inference engine
        engine = initialize_engine()
        
        # Write to a unique temporary path; a fixed filename would let
        # concurrent requests overwrite each other's results
        output_dir = tempfile.gettempdir()
        output_path = os.path.join(output_dir, f"musetalk_{uuid.uuid4().hex}.mp4")
        
        # Progress callback: reports to the server log only (not streamed
        # back to the Gradio UI)
        def progress_callback(progress, status):
            print(f"[{progress}%] {status}")
        
        # Run inference
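        # NOTE: `quality` is only echoed in the status message below; it is
        # not passed to engine.generate(), whose signature comes from the
        # local inference module.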
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback
        )
        
        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"
        
    except Exception as e:
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg

# Create Gradio interface
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")
    
    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation
        
        MuseTalk generates realistic lip-synced videos from audio input. 
        This is a self-hosted Space running on Hugging Face.
        """
    )
    
    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )
    
    gr.Markdown("### Input Files")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",
                format="wav"
            )
        
        with gr.Column():
            gr.Markdown("#### Video/Image")
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )
    
    gr.Markdown("### Parameters")
    
    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)"
        )
        
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )
    
    gr.Markdown("### Generation")
    
    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")
    
    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )
    
    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3
    )
    
    # Connect generate button to inference function
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )
    
    # Accordion sections
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk
            
            MuseTalk is an AI model for audio-driven video generation that produces 
            realistic lip-synced videos. The model operates in latent space using 
            efficient single-step inpainting, enabling fast inference.
            
            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output
            
            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )
    
    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use
            
            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
               - FPS: Output video frame rate (20-60)
               - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below
            
            ### Supported Formats
            
            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H.264/H.265 codecs)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)
            
            ### Technical Details
            
            - **Device**: CPU-based inference with PyTorch
            - **Memory**: optimized for machines with 4GB+ of RAM
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H.264 codec
            """
        )

if __name__ == "__main__":
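    # 0.0.0.0:7860 is the host/port a Hugging Face Space expects from a
    # self-hosted Gradio app; for long generations, calling demo.queue()
    # before launch is a common pattern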
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)