#!/usr/bin/env python3
"""
MuseTalk - Audio-Driven Video Generation Space
Self-hosted Gradio interface for MuseTalk
"""

import gradio as gr
import os
import tempfile
import uuid
from pathlib import Path
from inference import MuseTalkInference

# Inference engine, created lazily on first request
inference_engine = None

def initialize_engine():
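    """Create the shared MuseTalkInference engine on first use (lazy init)."""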
    global inference_engine
    if inference_engine is None:
        inference_engine = MuseTalkInference()
    return inference_engine

# Validation functions
def validate_audio(audio_path):
    """Validate audio file."""
    if not audio_path:
        return False, "Please upload an audio file"
    
    if not os.path.exists(audio_path):
        return False, "Audio file not found"
    
    # Check file size (max 100MB)
    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    if file_size > 100:
        return False, f"Audio file too large ({file_size:.1f}MB, max 100MB)"
    
    return True, "Audio file valid"
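
# The "Documentation & Setup" section below advertises a 10-minute audio
# limit, but validate_audio() only checks file size. A stdlib-only sketch of
# a duration probe for WAV uploads (an illustrative assumption; MP3/M4A/OGG
# would need a tool such as ffprobe or librosa instead):
def _wav_duration_seconds(audio_path):
    """Return the duration of a PCM WAV file in seconds, or None on failure."""
    import wave

    try:
        with wave.open(audio_path, "rb") as wf:
            return wf.getnframes() / float(wf.getframerate())
    except (wave.Error, EOFError, OSError):
        return None  # not a readable PCM WAV; skip the duration check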

def validate_video(video_path):
    """Validate video/image file."""
    if not video_path:
        return False, "Please upload a video or image file"
    
    if not os.path.exists(video_path):
        return False, "Video/image file not found"
    
    # Check file size (max 500MB)
    file_size = os.path.getsize(video_path) / (1024 * 1024)
    if file_size > 500:
        return False, f"Video/image file too large ({file_size:.1f}MB, max 500MB)"
    
    return True, "Video/image file valid"
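
# The docs list specific upload formats, while the validators above accept
# any extension. A simple allow-list check could be wired into the two
# validators (a sketch; the extension set mirrors the formats named in the
# "Documentation & Setup" accordion below):
ALLOWED_MEDIA_EXTENSIONS = {".mp4", ".avi", ".mov", ".mkv",
                            ".png", ".jpg", ".jpeg", ".bmp"}

def _has_allowed_extension(path, allowed=ALLOWED_MEDIA_EXTENSIONS):
    """Return True if the file's extension is in the allow-list."""
    return Path(path).suffix.lower() in allowed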

def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate lip-synced video using MuseTalk inference."""
    try:
        # Validate inputs
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"
        
        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"
        
        # Initialize inference engine
        engine = initialize_engine()
        
        # Write to a unique temporary path; a fixed filename would let
        # concurrent requests overwrite each other's results
        output_dir = tempfile.gettempdir()
        output_path = os.path.join(output_dir, f"musetalk_{uuid.uuid4().hex}.mp4")
        
        # Progress callback: reports to the server log only (not streamed
        # back to the Gradio UI)
        def progress_callback(progress, status):
            print(f"[{progress}%] {status}")
        
        # Run inference
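        # NOTE: `quality` is only echoed in the status message below; it is
        # not passed to engine.generate(), whose signature comes from the
        # local inference module.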
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback
        )
        
        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"
        
    except Exception as e:
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg

# Create Gradio interface
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")
    
    # Main title and description
    gr.Markdown(
        """
        ## MuseTalk - AI Audio-Driven Video Generation
        
        MuseTalk generates realistic lip-synced videos from audio input. 
        This is a self-hosted Space running on Hugging Face.
        """
    )
    
    with gr.Row():
        gr.Markdown(
            """
            ### Features
            - Audio-driven video generation
            - Realistic lip-sync
            - Customizable video parameters
            """
        )
    
    gr.Markdown("### Input Files")
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",
                format="wav"
            )
        
        with gr.Column():
            gr.Markdown("#### Video/Image")
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"]
            )
    
    gr.Markdown("### Parameters")
    
    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)"
        )
        
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality"
        )
    
    gr.Markdown("### Generation")
    
    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")
    
    output_video = gr.Video(
        label="Generated Video",
        format="mp4"
    )
    
    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3
    )
    
    # Connect generate button to inference function
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text]
    )
    
    # Accordion sections
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
            ### About MuseTalk
            
            MuseTalk is an AI model for audio-driven video generation that produces 
            realistic lip-synced videos. The model operates in latent space using 
            efficient single-step inpainting, enabling fast inference.
            
            **Key Features:**
            - Audio-driven lip-sync generation
            - Supports multiple languages (Chinese, English, Japanese, etc.)
            - Efficient inference on consumer hardware
            - High-quality 30fps+ output
            
            **Model Architecture:**
            - Uses whisper-tiny for audio feature extraction
            - DWPose for face detection and alignment
            - Latent space inpainting (not diffusion-based)
            - Supports 256x256 face region size
            """
        )
    
    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
            ### How to Use
            
            1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
            2. **Upload Video/Image**: Select a reference video or image with a face
            3. **Adjust Parameters**:
               - FPS: Output video frame rate (20-60)
               - Quality: Output quality level (Low/Medium/High)
            4. **Generate**: Click "Generate Lip-Synced Video"
            5. **Download**: Your generated video will appear below
            
            ### Supported Formats
            
            **Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
            **Video**: MP4, AVI, MOV, MKV (H.264/H.265 codecs)
            **Image**: PNG, JPG, JPEG, BMP (with clear face visible)
            
            ### Technical Details
            
            - **Device**: CPU-based inference with PyTorch
            - **Memory**: optimized for machines with 4GB+ of RAM
            - **Speed**: ~1-5 minutes depending on video length and quality
            - **Output**: MP4 format with H.264 codec
            """
        )

if __name__ == "__main__":
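    # 0.0.0.0:7860 is the host/port a Hugging Face Space expects from a
    # self-hosted Gradio app; for long generations, calling demo.queue()
    # before launch is a common pattern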
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)