import os
import random
import logging
from typing import Any

import torch
import gradio as gr
from PIL import Image

from utils.model_loader import ModelManager
from utils.gpu_manager import gpu_manager

import wan
from wan.utils.utils import cache_image, cache_video, is_video
from wan.utils.multitalk_utils import save_video_ffmpeg

# =========================
# HOTFIX: Gradio /api_info crash
# =========================
# Fixes: TypeError: argument of type 'bool' is not iterable
# Caused by gradio_client trying to interpret JSON Schema nodes that can be booleans
# (JSON Schema permits a schema to be the literal `true`/`false`, which the stock
# implementation does not handle).
try:
    import gradio_client.utils as gcu

    _old_json_schema_to_python_type = gcu._json_schema_to_python_type

    def _json_schema_to_python_type_patched(schema: Any, defs=None):
        """Wrap the original converter, mapping boolean schemas to "Any"."""
        if isinstance(schema, bool):
            return "Any"
        return _old_json_schema_to_python_type(schema, defs)

    gcu._json_schema_to_python_type = _json_schema_to_python_type_patched
except Exception as e:
    # Best-effort patch: if gradio_client's internals change, skip silently
    # rather than preventing app startup.
    print("gradio_client patch skipped:", e)

# =========================
# Logging
# =========================
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# =========================
# Globals
# =========================
# Lazily-created model manager; populated by initialize_models().
model_manager: ModelManager | None = None
# Guard so model download/preparation only happens once per process.
models_loaded = False


def initialize_models(progress=gr.Progress()):
    """Download/prepare model assets on first use.

    Idempotent: returns immediately once models have been prepared.
    Sets the module-level ``model_manager`` and ``models_loaded`` globals.

    Args:
        progress: Gradio progress callback for UI feedback.

    Raises:
        gr.Error: if model download/preparation fails for any reason.
    """
    global model_manager, models_loaded
    if models_loaded:
        return
    try:
        progress(0.1, desc="Initializing model manager...")
        model_manager = ModelManager()

        progress(0.3, desc="Downloading models (first time only)...")
        # Pre-download assets (actual heavy loading happens on first inference).
        model_manager.get_wan_model_path()
        model_manager.get_infinitetalk_weights_path()
        model_manager.get_wav2vec_model_path()

        models_loaded = True
        progress(1.0, desc="Models ready!")
        logger.info("Models initialized successfully")
    except Exception as e:
        logger.exception("Error initializing models")
        # Chain the cause so the original traceback is preserved in logs.
        raise gr.Error(f"Failed to initialize models: {str(e)}") from e


def _set_seed(seed: int) -> int:
    """Set deterministic seeds and return the final seed used.

    A seed of -1 requests a random seed; the chosen value is returned so it
    can be surfaced to the user / embedded in the output filename.
    """
    if seed == -1:
        seed = random.randint(0, 99_999_999)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    return seed


def generate_video(
    image_or_video,
    audio_file,
    resolution="480p",
    steps=40,
    audio_guide_scale=3.0,
    seed=-1,
    progress=gr.Progress(),
):
    """
    Generate a talking video from an image OR dub an existing video.

    Args:
        image_or_video: Filepath to a portrait image or an input video.
        audio_file: Filepath to the driving audio track.
        resolution: "480p" or "720p".
        steps: Diffusion step count (currently unused by this simplified path).
        audio_guide_scale: Audio guidance strength (currently unused here).
        seed: RNG seed; -1 (or a cleared field) picks a random seed.
        progress: Gradio progress callback.

    Returns:
        Path to the generated MP4 file.

    Raises:
        gr.Error: when no GPU is available or generation fails.

    Note: This is a simplified pipeline example. Your real pipeline may use
    wan_pipeline + diffusion steps etc. This version just stitches frames + audio.
    """
    try:
        if not torch.cuda.is_available():
            raise gr.Error("⚠️ GPU not available. This Space requires GPU hardware to generate videos.")

        # Ensure models are prepared
        if not models_loaded:
            initialize_models(progress)

        progress(0.1, desc="Processing audio...")
        progress(0.2, desc="Loading models...")

        # Load models (kept for parity with your structure)
        size = f"infinitetalk-{resolution.replace('p', '')}"
        wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")  # noqa: F841

        progress(0.3, desc="Processing input...")

        # Decide whether the input is a video or image
        if is_video(image_or_video):
            logger.info("Processing video dubbing input...")
            # NOTE(review): assumes cache_video(path) returns a frame sequence
            # compatible with save_video_ffmpeg — confirm against wan.utils.utils.
            input_frames = cache_video(image_or_video)
        else:
            logger.info("Processing image-to-video input...")
            input_image = Image.open(image_or_video).convert("RGB")
            input_frames = [input_image]

        progress(0.4, desc="Generating video...")
        # A cleared gr.Number field arrives as None; treat it as "random seed".
        seed = _set_seed(int(seed) if seed is not None else -1)
        output_path = f"/tmp/output_{seed}.mp4"

        # Simplified output save (frames + audio)
        save_video_ffmpeg(
            input_frames,
            output_path,
            audio_file,
            high_quality_save=False,
        )

        progress(1.0, desc="Complete!")
        return output_path

    except Exception as e:
        logger.exception("Error generating video")
        # Free GPU memory so a failed run doesn't poison subsequent ones.
        gpu_manager.cleanup()
        raise gr.Error(f"Generation failed: {str(e)}") from e


def create_interface():
    """Create Gradio UI.

    Builds a two-tab Blocks app (image-to-video and video dubbing); both tabs
    route through generate_video(), which dispatches on the input type.
    """
    with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
        gr.Markdown(
            """
            # 🎬 InfiniteTalk - Talking Video Generator

            Generate realistic talking head videos with accurate lip-sync from images or dub existing videos with new audio!

            **Note**: First generation may take a few minutes while models download. Subsequent generations are faster.
            """
        )

        with gr.Tabs():
            # Tab 1: Image-to-Video
            with gr.Tab("📸 Image-to-Video"):
                gr.Markdown("Transform a static portrait into a talking video")
                with gr.Row():
                    with gr.Column():
                        image_input = gr.Image(
                            type="filepath",
                            label="Upload Portrait Image (clear face visibility recommended)",
                        )
                        audio_input = gr.Audio(
                            type="filepath",
                            label="Upload Audio (MP3, WAV, or FLAC)",
                        )
                        with gr.Accordion("Advanced Settings", open=False):
                            resolution = gr.Radio(
                                choices=["480p", "720p"],
                                value="480p",
                                label="Resolution (480p faster, 720p higher quality)",
                            )
                            steps = gr.Slider(
                                minimum=20,
                                maximum=50,
                                value=40,
                                step=1,
                                label="Diffusion Steps (more = higher quality but slower)",
                            )
                            audio_scale = gr.Slider(
                                minimum=1.0,
                                maximum=5.0,
                                value=3.0,
                                step=0.5,
                                label="Audio Guide Scale (2–4 recommended)",
                            )
                            seed = gr.Number(value=-1, label="Seed (-1 for random)")
                        generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
                    with gr.Column():
                        output_video = gr.Video(label="Generated Video")
                        gr.Markdown("**💡 Tip**: Use a high-quality portrait image with clear facial features.")

                generate_btn.click(
                    fn=generate_video,
                    inputs=[image_input, audio_input, resolution, steps, audio_scale, seed],
                    outputs=output_video,
                )

            # Tab 2: Video Dubbing
            with gr.Tab("🎥 Video Dubbing"):
                gr.Markdown("Dub an existing video with new audio while maintaining natural movements")
                with gr.Row():
                    with gr.Column():
                        video_input = gr.Video(label="Upload Video (with visible face)")
                        audio_input_v2v = gr.Audio(
                            type="filepath",
                            label="Upload New Audio (MP3, WAV, or FLAC)",
                        )
                        with gr.Accordion("Advanced Settings", open=False):
                            resolution_v2v = gr.Radio(
                                choices=["480p", "720p"],
                                value="480p",
                                label="Resolution",
                            )
                            steps_v2v = gr.Slider(
                                minimum=20,
                                maximum=50,
                                value=40,
                                step=1,
                                label="Diffusion Steps",
                            )
                            audio_scale_v2v = gr.Slider(
                                minimum=1.0,
                                maximum=5.0,
                                value=3.0,
                                step=0.5,
                                label="Audio Guide Scale",
                            )
                            seed_v2v = gr.Number(value=-1, label="Seed")
                        generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
                    with gr.Column():
                        output_video_v2v = gr.Video(label="Dubbed Video")
                        gr.Markdown("**💡 Tip**: Use a video with consistent face visibility.")

                generate_btn_v2v.click(
                    fn=generate_video,
                    inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
                    outputs=output_video_v2v,
                )

        gr.Markdown(
            """
            ---
            ### About
            Powered by InfiniteTalk (Apache 2.0)

            ⚠️ **Note**: This Space requires GPU hardware to generate videos.
            """
        )

    return demo


if __name__ == "__main__":
    demo = create_interface()
    demo.launch()