# HF Spaces status banner (scrape residue): Spaces: Running
| import os | |
| import random | |
| import logging | |
| from typing import Any | |
| import torch | |
| import gradio as gr | |
| from PIL import Image | |
| from utils.model_loader import ModelManager | |
| from utils.gpu_manager import gpu_manager | |
| import wan | |
| from wan.utils.utils import cache_image, cache_video, is_video | |
| from wan.utils.multitalk_utils import save_video_ffmpeg | |
# =========================
# HOTFIX: Gradio /api_info crash
# =========================
# Fixes: TypeError: argument of type 'bool' is not iterable
# Caused by gradio_client trying to interpret JSON Schema nodes that can be booleans
try:
    import gradio_client.utils as gcu

    # Keep a handle on the unpatched converter so non-boolean schemas
    # are still handled by the library's own logic.
    _original_json_schema_converter = gcu._json_schema_to_python_type

    def _safe_json_schema_to_python_type(schema: Any, defs=None):
        # JSON Schema permits `true`/`false` as complete schemas; short-circuit
        # those to "Any" instead of letting the library iterate over a bool.
        return "Any" if isinstance(schema, bool) else _original_json_schema_converter(schema, defs)

    gcu._json_schema_to_python_type = _safe_json_schema_to_python_type
except Exception as e:
    print("gradio_client patch skipped:", e)
# =========================
# Logging
# =========================
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# =========================
# Globals
# =========================
# Lazily-created ModelManager; populated by initialize_models() on first use.
model_manager: ModelManager | None = None
# One-shot guard so model downloads/preparation run only once per process.
models_loaded = False
def initialize_models(progress=gr.Progress()):
    """Download/prepare model assets on first use.

    Idempotent: returns immediately once ``models_loaded`` is set. On any
    failure, logs the traceback and re-raises as a user-facing ``gr.Error``.
    """
    global model_manager, models_loaded
    if models_loaded:
        return
    try:
        progress(0.1, desc="Initializing model manager...")
        model_manager = ModelManager()
        progress(0.3, desc="Downloading models (first time only)...")
        # Pre-download assets (actual heavy loading happens on first inference)
        for fetch_asset in (
            model_manager.get_wan_model_path,
            model_manager.get_infinitetalk_weights_path,
            model_manager.get_wav2vec_model_path,
        ):
            fetch_asset()
        models_loaded = True
        progress(1.0, desc="Models ready!")
        logger.info("Models initialized successfully")
    except Exception as e:
        logger.exception("Error initializing models")
        raise gr.Error(f"Failed to initialize models: {str(e)}")
| def _set_seed(seed: int) -> int: | |
| """Set deterministic seeds and return the final seed used.""" | |
| if seed == -1: | |
| seed = random.randint(0, 99_999_999) | |
| torch.manual_seed(seed) | |
| if torch.cuda.is_available(): | |
| torch.cuda.manual_seed(seed) | |
| return seed | |
def generate_video(
    image_or_video,
    audio_file,
    resolution="480p",
    steps=40,
    audio_guide_scale=3.0,
    seed=-1,
    progress=gr.Progress(),
):
    """
    Generate a talking video from an image OR dub an existing video.

    Args:
        image_or_video: Path to a portrait image, or to a video to dub.
        audio_file: Path to the driving audio track.
        resolution: "480p" or "720p"; selects the InfiniteTalk model size.
        steps: Diffusion steps (kept for parity with the full pipeline; unused here).
        audio_guide_scale: Audio guidance strength (kept for parity; unused here).
        seed: RNG seed; -1 picks a random one.
        progress: Gradio progress reporter.

    Returns:
        Filesystem path of the generated MP4.

    Raises:
        gr.Error: When no GPU is available or generation fails.

    Note: This is a simplified pipeline example. Your real pipeline may use
    wan_pipeline + diffusion steps etc. This version just stitches frames + audio.
    """
    try:
        if not torch.cuda.is_available():
            raise gr.Error("⚠️ GPU not available. This Space requires GPU hardware to generate videos.")
        # Ensure models are prepared
        if not models_loaded:
            initialize_models(progress)
        progress(0.1, desc="Processing audio...")
        progress(0.2, desc="Loading models...")
        # Load models (kept for parity with your structure)
        size = f"infinitetalk-{resolution.replace('p', '')}"
        wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")  # noqa: F841
        progress(0.3, desc="Processing input...")
        # Decide whether the input is a video or image
        if is_video(image_or_video):
            logger.info("Processing video dubbing input...")
            input_frames = cache_video(image_or_video)
        else:
            logger.info("Processing image-to-video input...")
            input_image = Image.open(image_or_video).convert("RGB")
            input_frames = [input_image]
        progress(0.4, desc="Generating video...")
        seed = _set_seed(int(seed))
        output_path = f"/tmp/output_{seed}.mp4"
        # Simplified output save (frames + audio)
        save_video_ffmpeg(
            input_frames,
            output_path,
            audio_file,
            high_quality_save=False,
        )
        progress(1.0, desc="Complete!")
        return output_path
    except gr.Error:
        # User-facing errors (e.g. the GPU check above) are already phrased for
        # the UI: clean up and re-raise WITHOUT double-wrapping the message in
        # "Generation failed: ...".
        gpu_manager.cleanup()
        raise
    except Exception as e:
        logger.exception("Error generating video")
        gpu_manager.cleanup()
        # Chain the original exception so server logs keep the full cause.
        raise gr.Error(f"Generation failed: {str(e)}") from e
def create_interface():
    """Create Gradio UI.

    Builds a two-tab Blocks app (image-to-video and video dubbing); both tabs
    wire their Generate button to the shared ``generate_video`` handler.
    Returns the un-launched ``gr.Blocks`` instance.
    """
    with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
        # Page header shown above both tabs.
        gr.Markdown(
            """
            # 🎬 InfiniteTalk - Talking Video Generator
            Generate realistic talking head videos with accurate lip-sync from images or dub existing videos with new audio!
            **Note**: First generation may take a few minutes while models download. Subsequent generations are faster.
            """
        )
        with gr.Tabs():
            # Tab 1: Image-to-Video
            with gr.Tab("📸 Image-to-Video"):
                gr.Markdown("Transform a static portrait into a talking video")
                with gr.Row():
                    with gr.Column():
                        # type="filepath" so generate_video receives a path it can
                        # pass to PIL / is_video rather than an in-memory array.
                        image_input = gr.Image(
                            type="filepath",
                            label="Upload Portrait Image (clear face visibility recommended)",
                        )
                        audio_input = gr.Audio(
                            type="filepath",
                            label="Upload Audio (MP3, WAV, or FLAC)",
                        )
                        # Collapsed by default; exposes generation knobs.
                        with gr.Accordion("Advanced Settings", open=False):
                            resolution = gr.Radio(
                                choices=["480p", "720p"],
                                value="480p",
                                label="Resolution (480p faster, 720p higher quality)",
                            )
                            steps = gr.Slider(
                                minimum=20,
                                maximum=50,
                                value=40,
                                step=1,
                                label="Diffusion Steps (more = higher quality but slower)",
                            )
                            audio_scale = gr.Slider(
                                minimum=1.0,
                                maximum=5.0,
                                value=3.0,
                                step=0.5,
                                label="Audio Guide Scale (2–4 recommended)",
                            )
                            # -1 is the sentinel _set_seed treats as "random".
                            seed = gr.Number(value=-1, label="Seed (-1 for random)")
                        generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
                    with gr.Column():
                        output_video = gr.Video(label="Generated Video")
                        gr.Markdown("**💡 Tip**: Use a high-quality portrait image with clear facial features.")
                # Input order must match generate_video's positional parameters.
                generate_btn.click(
                    fn=generate_video,
                    inputs=[image_input, audio_input, resolution, steps, audio_scale, seed],
                    outputs=output_video,
                )
            # Tab 2: Video Dubbing — same handler; first input is a video path,
            # which generate_video detects via is_video().
            with gr.Tab("🎥 Video Dubbing"):
                gr.Markdown("Dub an existing video with new audio while maintaining natural movements")
                with gr.Row():
                    with gr.Column():
                        video_input = gr.Video(label="Upload Video (with visible face)")
                        audio_input_v2v = gr.Audio(
                            type="filepath",
                            label="Upload New Audio (MP3, WAV, or FLAC)",
                        )
                        with gr.Accordion("Advanced Settings", open=False):
                            resolution_v2v = gr.Radio(
                                choices=["480p", "720p"],
                                value="480p",
                                label="Resolution",
                            )
                            steps_v2v = gr.Slider(
                                minimum=20,
                                maximum=50,
                                value=40,
                                step=1,
                                label="Diffusion Steps",
                            )
                            audio_scale_v2v = gr.Slider(
                                minimum=1.0,
                                maximum=5.0,
                                value=3.0,
                                step=0.5,
                                label="Audio Guide Scale",
                            )
                            seed_v2v = gr.Number(value=-1, label="Seed")
                        generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
                    with gr.Column():
                        output_video_v2v = gr.Video(label="Dubbed Video")
                        gr.Markdown("**💡 Tip**: Use a video with consistent face visibility.")
                generate_btn_v2v.click(
                    fn=generate_video,
                    inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
                    outputs=output_video_v2v,
                )
        # Footer shown below the tabs.
        gr.Markdown(
            """
            ---
            ### About
            Powered by InfiniteTalk (Apache 2.0)
            ⚠️ **Note**: This Space requires GPU hardware to generate videos.
            """
        )
    return demo
if __name__ == "__main__":
    # Build the UI and start the Gradio server with default settings.
    create_interface().launch()