# NOTE: removed Hugging Face Spaces page chrome (status lines, file size,
# git-blame commit hashes, and line-number gutter) captured during scraping.
import os
import random
import logging
from typing import Any
import torch
import gradio as gr
from PIL import Image
from utils.model_loader import ModelManager
from utils.gpu_manager import gpu_manager
import wan
from wan.utils.utils import cache_image, cache_video, is_video
from wan.utils.multitalk_utils import save_video_ffmpeg
# =========================
# HOTFIX: Gradio /api_info crash
# =========================
# Works around: TypeError: argument of type 'bool' is not iterable.
# JSON Schema permits bare booleans (`true` = anything, `false` = nothing),
# but gradio_client assumes every schema node is a dict. Wrap the converter
# so boolean nodes degrade gracefully to "Any".
try:
    import gradio_client.utils as gcu

    _original_converter = gcu._json_schema_to_python_type

    def _safe_json_schema_to_python_type(schema: Any, defs=None):
        # Boolean schema nodes have no structure to inspect; treat as "Any".
        return "Any" if isinstance(schema, bool) else _original_converter(schema, defs)

    gcu._json_schema_to_python_type = _safe_json_schema_to_python_type
except Exception as e:
    # Best-effort patch: if gradio_client is absent or its internals moved,
    # just report and continue.
    print("gradio_client patch skipped:", e)
# =========================
# Logging
# =========================
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger, stdlib convention
# =========================
# Globals
# =========================
# Lazily-created manager for model assets; populated by initialize_models().
model_manager: ModelManager | None = None
# One-shot guard so initialize_models() only downloads assets once per process.
models_loaded: bool = False
def initialize_models(progress=gr.Progress()):
    """Download/prepare model assets on first use.

    Idempotent: returns immediately once ``models_loaded`` is set. Only
    pre-downloads weights here — the heavy model loading still happens
    lazily on first inference.

    Args:
        progress: Gradio progress tracker used to surface download status.

    Raises:
        gr.Error: if the model manager cannot be created or any asset
            download fails. The original exception is chained as the cause.
    """
    global model_manager, models_loaded
    if models_loaded:
        return  # already prepared; nothing to do
    try:
        progress(0.1, desc="Initializing model manager...")
        model_manager = ModelManager()
        progress(0.3, desc="Downloading models (first time only)...")
        # Pre-download assets (actual heavy loading happens on first inference)
        model_manager.get_wan_model_path()
        model_manager.get_infinitetalk_weights_path()
        model_manager.get_wav2vec_model_path()
        models_loaded = True
        progress(1.0, desc="Models ready!")
        logger.info("Models initialized successfully")
    except Exception as e:
        logger.exception("Error initializing models")
        # Chain the cause explicitly so the full traceback survives rewrapping.
        raise gr.Error(f"Failed to initialize models: {str(e)}") from e
def _set_seed(seed: int) -> int:
"""Set deterministic seeds and return the final seed used."""
if seed == -1:
seed = random.randint(0, 99_999_999)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed(seed)
return seed
def generate_video(
    image_or_video,
    audio_file,
    resolution="480p",
    steps=40,
    audio_guide_scale=3.0,
    seed=-1,
    progress=gr.Progress(),
):
    """
    Generate a talking video from an image OR dub an existing video.

    Note: This is a simplified pipeline example. Your real pipeline may use
    wan_pipeline + diffusion steps etc. This version just stitches frames + audio.
    `steps` and `audio_guide_scale` are accepted for UI parity but are not
    consumed by this simplified path — presumably the full pipeline uses them.

    Returns:
        Path of the generated MP4 file.

    Raises:
        gr.Error: when no GPU is available or generation fails; the
            original exception is chained as the cause.
    """
    try:
        if not torch.cuda.is_available():
            raise gr.Error("⚠️ GPU not available. This Space requires GPU hardware to generate videos.")
        # Ensure models are prepared
        if not models_loaded:
            initialize_models(progress)
        progress(0.1, desc="Processing audio...")
        progress(0.2, desc="Loading models...")
        # Load models (kept for parity with your structure)
        size = f"infinitetalk-{resolution.replace('p', '')}"
        wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")  # noqa: F841
        progress(0.3, desc="Processing input...")
        # Decide whether the input is a video or image
        if is_video(image_or_video):
            logger.info("Processing video dubbing input...")
            input_frames = cache_video(image_or_video)
        else:
            logger.info("Processing image-to-video input...")
            input_image = Image.open(image_or_video).convert("RGB")
            input_frames = [input_image]
        progress(0.4, desc="Generating video...")
        seed = _set_seed(int(seed))
        output_path = f"/tmp/output_{seed}.mp4"
        # Simplified output save (frames + audio)
        save_video_ffmpeg(
            input_frames,
            output_path,
            audio_file,
            high_quality_save=False,
        )
        progress(1.0, desc="Complete!")
        return output_path
    except gr.Error:
        # Already a user-facing error (e.g. the GPU check above) — don't
        # rewrap it into "Generation failed: ...", just clean up and re-raise.
        gpu_manager.cleanup()
        raise
    except Exception as e:
        logger.exception("Error generating video")
        gpu_manager.cleanup()
        raise gr.Error(f"Generation failed: {str(e)}") from e
def create_interface():
    """Create Gradio UI.

    Builds a two-tab Blocks app — image-to-video and video dubbing — where
    both tabs submit to the same `generate_video` backend and differ only in
    their input widgets.
    """
    with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
        # Page header shown above both tabs.
        gr.Markdown(
            """
            # 🎬 InfiniteTalk - Talking Video Generator
            Generate realistic talking head videos with accurate lip-sync from images or dub existing videos with new audio!
            **Note**: First generation may take a few minutes while models download. Subsequent generations are faster.
            """
        )
        with gr.Tabs():
            # Tab 1: Image-to-Video
            with gr.Tab("📸 Image-to-Video"):
                gr.Markdown("Transform a static portrait into a talking video")
                with gr.Row():
                    # Left column: inputs and settings.
                    with gr.Column():
                        image_input = gr.Image(
                            type="filepath",  # generate_video expects a path, not an array
                            label="Upload Portrait Image (clear face visibility recommended)",
                        )
                        audio_input = gr.Audio(
                            type="filepath",
                            label="Upload Audio (MP3, WAV, or FLAC)",
                        )
                        # Collapsed by default; sensible defaults for casual users.
                        with gr.Accordion("Advanced Settings", open=False):
                            resolution = gr.Radio(
                                choices=["480p", "720p"],
                                value="480p",
                                label="Resolution (480p faster, 720p higher quality)",
                            )
                            steps = gr.Slider(
                                minimum=20,
                                maximum=50,
                                value=40,
                                step=1,
                                label="Diffusion Steps (more = higher quality but slower)",
                            )
                            audio_scale = gr.Slider(
                                minimum=1.0,
                                maximum=5.0,
                                value=3.0,
                                step=0.5,
                                label="Audio Guide Scale (2–4 recommended)",
                            )
                            # -1 means "draw a fresh random seed" (see _set_seed).
                            seed = gr.Number(value=-1, label="Seed (-1 for random)")
                        generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
                    # Right column: output preview.
                    with gr.Column():
                        output_video = gr.Video(label="Generated Video")
                        gr.Markdown("**💡 Tip**: Use a high-quality portrait image with clear facial features.")
                # Wire the button to the shared backend; input order must
                # match generate_video's positional parameters.
                generate_btn.click(
                    fn=generate_video,
                    inputs=[image_input, audio_input, resolution, steps, audio_scale, seed],
                    outputs=output_video,
                )
            # Tab 2: Video Dubbing
            with gr.Tab("🎥 Video Dubbing"):
                gr.Markdown("Dub an existing video with new audio while maintaining natural movements")
                with gr.Row():
                    with gr.Column():
                        video_input = gr.Video(label="Upload Video (with visible face)")
                        audio_input_v2v = gr.Audio(
                            type="filepath",
                            label="Upload New Audio (MP3, WAV, or FLAC)",
                        )
                        # Mirrors the image tab's settings (separate widgets,
                        # same value ranges and defaults).
                        with gr.Accordion("Advanced Settings", open=False):
                            resolution_v2v = gr.Radio(
                                choices=["480p", "720p"],
                                value="480p",
                                label="Resolution",
                            )
                            steps_v2v = gr.Slider(
                                minimum=20,
                                maximum=50,
                                value=40,
                                step=1,
                                label="Diffusion Steps",
                            )
                            audio_scale_v2v = gr.Slider(
                                minimum=1.0,
                                maximum=5.0,
                                value=3.0,
                                step=0.5,
                                label="Audio Guide Scale",
                            )
                            seed_v2v = gr.Number(value=-1, label="Seed")
                        generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
                    with gr.Column():
                        output_video_v2v = gr.Video(label="Dubbed Video")
                        gr.Markdown("**💡 Tip**: Use a video with consistent face visibility.")
                # Same backend as tab 1; generate_video detects video input
                # via is_video() and switches to the dubbing path.
                generate_btn_v2v.click(
                    fn=generate_video,
                    inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
                    outputs=output_video_v2v,
                )
        # Footer shown under both tabs.
        gr.Markdown(
            """
            ---
            ### About
            Powered by InfiniteTalk (Apache 2.0)
            ⚠️ **Note**: This Space requires GPU hardware to generate videos.
            """
        )
    return demo
if __name__ == "__main__":
    # Build the UI and start the Gradio server when run as a script.
    demo = create_interface()
    demo.launch()