#!/usr/bin/env python3
"""
MuseTalk - Audio-Driven Video Generation Space
Self-hosted Gradio interface for MuseTalk
"""
import gradio as gr
import os
import tempfile
from pathlib import Path
from inference import MuseTalkInference
# Module-wide inference engine, created lazily on first request so that
# importing this module stays cheap (no model loading at import time).
inference_engine = None


def initialize_engine():
    """Return the shared MuseTalkInference instance, creating it on first use."""
    global inference_engine
    engine = inference_engine
    if engine is None:
        engine = MuseTalkInference()
        inference_engine = engine
    return engine
# Validation functions
def validate_audio(audio_path, max_mb=100):
    """Validate an uploaded audio file.

    Args:
        audio_path: Filesystem path to the audio file (may be None or "").
        max_mb: Maximum allowed file size in megabytes. Defaults to 100,
            matching the Space's original hard-coded limit.

    Returns:
        Tuple of (is_valid, message) where is_valid is a bool and message
        is a human-readable status string for the UI.
    """
    if not audio_path:
        return False, "Please upload an audio file"
    if not os.path.exists(audio_path):
        return False, "Audio file not found"
    # Reject oversized uploads before handing them to the inference engine.
    file_size = os.path.getsize(audio_path) / (1024 * 1024)
    if file_size > max_mb:
        return False, f"Audio file too large ({file_size:.1f}MB, max {max_mb}MB)"
    return True, "Audio file valid"
def validate_video(video_path, max_mb=500):
    """Validate an uploaded reference video or image file.

    Args:
        video_path: Filesystem path to the video/image file (may be None or "").
        max_mb: Maximum allowed file size in megabytes. Defaults to 500,
            matching the Space's original hard-coded limit.

    Returns:
        Tuple of (is_valid, message) where is_valid is a bool and message
        is a human-readable status string for the UI.
    """
    if not video_path:
        return False, "Please upload a video or image file"
    if not os.path.exists(video_path):
        return False, "Video/image file not found"
    # Reject oversized uploads before handing them to the inference engine.
    file_size = os.path.getsize(video_path) / (1024 * 1024)
    if file_size > max_mb:
        return False, f"Video/image file too large ({file_size:.1f}MB, max {max_mb}MB)"
    return True, "Video/image file valid"
def generate_lipsync_video(audio_file, video_file, fps, quality):
    """Generate a lip-synced video from the given audio and reference video/image.

    Args:
        audio_file: Path to the driving audio file.
        video_file: Path to the reference video or image.
        fps: Target output frame rate (coerced to int before inference).
        quality: Quality label echoed in the status message.
            NOTE(review): quality is never forwarded to the engine — confirm
            whether MuseTalkInference.generate accepts a quality setting.

    Returns:
        Tuple of (path to generated video or None, status message).
    """
    try:
        # Validate inputs
        audio_valid, audio_msg = validate_audio(audio_file)
        if not audio_valid:
            return None, f"Audio validation failed: {audio_msg}"
        video_valid, video_msg = validate_video(video_file)
        if not video_valid:
            return None, f"Video validation failed: {video_msg}"

        # Initialize inference engine (lazy singleton)
        engine = initialize_engine()

        # BUGFIX: the original wrote to a fixed tempdir filename
        # ("musetalk_output.mp4"), so concurrent requests overwrote each
        # other's output. Use a unique temp file per request instead.
        fd, output_path = tempfile.mkstemp(prefix="musetalk_", suffix=".mp4")
        os.close(fd)

        # Progress callback: inference progress is logged server-side only.
        def progress_callback(progress, status):
            print(f"[{progress}%] {status}")

        # Run inference
        result_path = engine.generate(
            audio_path=audio_file,
            video_path=video_file,
            output_path=output_path,
            fps=int(fps),
            progress_callback=progress_callback,
        )
        return result_path, f"Successfully generated lip-synced video (Quality: {quality})"
    except Exception as e:
        # Surface the failure as a status message instead of raising into
        # Gradio, so the UI shows the error in the status textbox.
        error_msg = f"Error during generation: {str(e)}"
        print(error_msg)
        return None, error_msg
# Create Gradio interface. All components are declared inside the Blocks
# context; `demo` is the app object launched in the __main__ guard.
with gr.Blocks(title="MuseTalk - Audio-Driven Video Generation") as demo:
    gr.Markdown("# MuseTalk - Audio-Driven Video Generation")
    gr.Markdown("Generate realistic lip-synced videos from audio")

    # Main title and description
    gr.Markdown(
        """
## MuseTalk - AI Audio-Driven Video Generation
MuseTalk generates realistic lip-synced videos from audio input.
This is a self-hosted Space running on Hugging Face.
"""
    )

    with gr.Row():
        gr.Markdown(
            """
### Features
- Audio-driven video generation
- Realistic lip-sync
- Customizable video parameters
"""
        )

    # --- Input widgets -------------------------------------------------
    gr.Markdown("### Input Files")
    with gr.Row():
        with gr.Column():
            gr.Markdown("#### Audio")
            audio_input = gr.Audio(
                label="Upload Audio",
                type="filepath",
                format="wav",
            )
        with gr.Column():
            gr.Markdown("#### Video/Image")
            video_input = gr.File(
                label="Upload Video or Image",
                file_count="single",
                file_types=["video", "image"],
            )

    # --- Generation parameters -----------------------------------------
    gr.Markdown("### Parameters")
    with gr.Row():
        fps_slider = gr.Slider(
            minimum=20,
            maximum=60,
            value=25,
            step=1,
            label="FPS (Frames Per Second)",
        )
        quality_radio = gr.Radio(
            choices=["Low", "Medium", "High"],
            value="Medium",
            label="Quality",
        )

    # --- Outputs and trigger ---------------------------------------------
    gr.Markdown("### Generation")
    generate_button = gr.Button("Generate Lip-Synced Video", variant="primary")
    output_video = gr.Video(
        label="Generated Video",
        format="mp4",
    )
    status_text = gr.Textbox(
        label="Status",
        interactive=False,
        lines=3,
    )

    # Wire the button to the inference entry point.
    generate_button.click(
        fn=generate_lipsync_video,
        inputs=[audio_input, video_input, fps_slider, quality_radio],
        outputs=[output_video, status_text],
    )

    # --- Collapsible info sections ---------------------------------------
    with gr.Accordion("About MuseTalk", open=False):
        gr.Markdown(
            """
### About MuseTalk
MuseTalk is an AI model for audio-driven video generation that produces
realistic lip-synced videos. The model operates in latent space using
efficient single-step inpainting, enabling fast inference.
**Key Features:**
- Audio-driven lip-sync generation
- Supports multiple languages (Chinese, English, Japanese, etc.)
- Efficient inference on consumer hardware
- High-quality 30fps+ output
**Model Architecture:**
- Uses whisper-tiny for audio feature extraction
- DWPose for face detection and alignment
- Latent space inpainting (not diffusion-based)
- Supports 256x256 face region size
"""
        )

    with gr.Accordion("Documentation & Setup", open=False):
        gr.Markdown(
            """
### How to Use
1. **Upload Audio**: Select an audio file (WAV, MP3, M4A, OGG) up to 10 minutes
2. **Upload Video/Image**: Select a reference video or image with a face
3. **Adjust Parameters**:
- FPS: Output video frame rate (20-60)
- Quality: Output quality level (Low/Medium/High)
4. **Generate**: Click "Generate Lip-Synced Video"
5. **Download**: Your generated video will appear below
### Supported Formats
**Audio**: WAV, MP3, M4A, OGG (up to 10 minutes)
**Video**: MP4, AVI, MOV, MKV (H264/H265 codec)
**Image**: PNG, JPG, JPEG, BMP (with clear face visible)
### Technical Details
- **Device**: CPU-based inference with PyTorch
- **Memory**: Optimized for 4GB+ VRAM devices
- **Speed**: ~1-5 minutes depending on video length and quality
- **Output**: MP4 format with H264 codec
"""
        )
if __name__ == "__main__":
    # Bind to all interfaces on the standard HF Spaces port; no public
    # Gradio share link is created.
    demo.launch(share=False, server_name="0.0.0.0", server_port=7860)