infinitetalk2

Running

File size: 9,642 Bytes

import os
import random
import logging
from typing import Any

import torch
import gradio as gr
from PIL import Image

from utils.model_loader import ModelManager
from utils.gpu_manager import gpu_manager

import wan
from wan.utils.utils import cache_image, cache_video, is_video
from wan.utils.multitalk_utils import save_video_ffmpeg


# =========================
# HOTFIX: Gradio /api_info crash
# =========================
# Fixes: TypeError: argument of type 'bool' is not iterable
# Caused by gradio_client trying to interpret JSON Schema nodes that can be booleans
try:
    import gradio_client.utils as gcu

    # Keep a handle on the upstream converter so the wrapper can delegate to it.
    _old_json_schema_to_python_type = gcu._json_schema_to_python_type

    def _json_schema_to_python_type_patched(schema: Any, defs=None):
        """Map boolean schema nodes to "Any"; defer every other node upstream."""
        return (
            "Any"
            if isinstance(schema, bool)
            else _old_json_schema_to_python_type(schema, defs)
        )

    # Install the wrapper under the original name on the gradio_client module.
    gcu._json_schema_to_python_type = _json_schema_to_python_type_patched
except Exception as e:
    # Best-effort hotfix: if anything above fails, report it and carry on unpatched.
    print("gradio_client patch skipped:", e)


# =========================
# Logging
# =========================
# Configure the root logger at INFO level, then create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# =========================
# Globals
# =========================
# Shared ModelManager instance; None until initialize_models() creates it.
model_manager: ModelManager | None = None
# Set to True by initialize_models() once all asset paths have been requested.
models_loaded = False


def initialize_models(progress=gr.Progress()):
    """Prepare model assets on first use; later calls return immediately.

    Creates the module-level ``ModelManager``, asks it for the Wan model,
    InfiniteTalk weights and wav2vec paths, and reports each stage through
    ``progress``. ``models_loaded`` is set only after all three succeed.

    Raises:
        gr.Error: If any step fails. The underlying exception is logged first.
    """
    global model_manager, models_loaded

    if not models_loaded:
        try:
            progress(0.1, desc="Initializing model manager...")
            manager = ModelManager()
            model_manager = manager

            progress(0.3, desc="Downloading models (first time only)...")

            # Only the assets are fetched here; the heavy model loading is
            # deferred to the first inference call.
            manager.get_wan_model_path()
            manager.get_infinitetalk_weights_path()
            manager.get_wav2vec_model_path()

            models_loaded = True
            progress(1.0, desc="Models ready!")
            logger.info("Models initialized successfully")

        except Exception as err:
            logger.exception("Error initializing models")
            raise gr.Error(f"Failed to initialize models: {err!s}")


def _set_seed(seed: int) -> int:
    """Seed torch's RNGs and report which seed was applied.

    Passing ``-1`` requests a random seed drawn from ``[0, 99_999_999]``;
    any other value is used exactly as given.
    """
    chosen = random.randint(0, 99_999_999) if seed == -1 else seed

    torch.manual_seed(chosen)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(chosen)

    return chosen


def generate_video(
    image_or_video,
    audio_file,
    resolution="480p",
    steps=40,
    audio_guide_scale=3.0,
    seed=-1,
    progress=gr.Progress(),
):
    """
    Generate a talking video from an image OR dub an existing video.

    Note: This is a simplified pipeline example. Your real pipeline may use
    wan_pipeline + diffusion steps etc. This version just stitches frames + audio.

    Args:
        image_or_video: File path of the uploaded image or video.
        audio_file: File path of the uploaded audio track.
        resolution: Resolution label such as "480p"; with the "p" removed it
            forms the ``size`` string passed to ``load_wan_model``.
        steps: Accepted for UI parity; not used by this simplified pipeline.
        audio_guide_scale: Accepted for UI parity; not used by this
            simplified pipeline.
        seed: RNG seed. ``-1`` (or ``None``, e.g. a cleared number field)
            selects a random seed.
        progress: Gradio progress tracker used to report each stage.

    Returns:
        The output path string, ``/tmp/output_<seed>.mp4``.

    Raises:
        gr.Error: If no GPU is available, an input is missing, or any stage
            fails. Errors that are already ``gr.Error`` are re-raised
            unchanged; other exceptions are wrapped as "Generation failed: ...".
    """
    try:
        if not torch.cuda.is_available():
            raise gr.Error("⚠️ GPU not available. This Space requires GPU hardware to generate videos.")

        # Fail fast with a readable message rather than a low-level error from
        # the file helpers when the user has not uploaded anything yet.
        if not image_or_video:
            raise gr.Error("Please upload an image or video before generating.")
        if not audio_file:
            raise gr.Error("Please upload an audio file before generating.")

        # Ensure models are prepared
        if not models_loaded:
            initialize_models(progress)

        progress(0.1, desc="Processing audio...")

        progress(0.2, desc="Loading models...")
        # Load models (kept for parity with your structure)
        size = f"infinitetalk-{resolution.replace('p', '')}"
        wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")  # noqa: F841

        progress(0.3, desc="Processing input...")

        # Decide whether the input is a video or image
        if is_video(image_or_video):
            logger.info("Processing video dubbing input...")
            input_frames = cache_video(image_or_video)
        else:
            logger.info("Processing image-to-video input...")
            # convert() returns a fully loaded copy, so the source file handle
            # can be released as soon as the conversion is done.
            with Image.open(image_or_video) as source_image:
                input_image = source_image.convert("RGB")
            input_frames = [input_image]

        progress(0.4, desc="Generating video...")

        # A cleared seed field arrives as None; treat it like "random".
        seed = _set_seed(-1 if seed is None else int(seed))
        output_path = f"/tmp/output_{seed}.mp4"

        # Simplified output save (frames + audio)
        # NOTE(review): confirm save_video_ffmpeg writes exactly to output_path
        # and accepts a single audio file path in this position.
        save_video_ffmpeg(
            input_frames,
            output_path,
            audio_file,
            high_quality_save=False,
        )

        progress(1.0, desc="Complete!")
        return output_path

    except Exception as e:
        logger.exception("Error generating video")
        gpu_manager.cleanup()
        if isinstance(e, gr.Error):
            # Already a user-facing message (GPU check, input validation,
            # model initialization); re-raise as-is instead of re-wrapping it.
            raise
        raise gr.Error(f"Generation failed: {e}") from e


def create_interface():
    """Build and return the Gradio Blocks app.

    The layout is an intro banner, two tabs (image-to-video and video
    dubbing) whose buttons both call ``generate_video``, and an "About" footer.
    """
    intro_md = (
        "\n"
        "# 🎬 InfiniteTalk - Talking Video Generator\n"
        "\n"
        "Generate realistic talking head videos with accurate lip-sync from images or dub existing videos with new audio!\n"
        "\n"
        "**Note**: First generation may take a few minutes while models download. Subsequent generations are faster.\n"
    )
    about_md = (
        "\n"
        "---\n"
        "### About\n"
        "Powered by InfiniteTalk (Apache 2.0)\n"
        "\n"
        "⚠️ **Note**: This Space requires GPU hardware to generate videos.\n"
    )

    def advanced_settings(resolution_label, steps_label, scale_label, seed_label):
        """Create the collapsed settings accordion and return its four inputs in order."""
        with gr.Accordion("Advanced Settings", open=False):
            return [
                gr.Radio(choices=["480p", "720p"], value="480p", label=resolution_label),
                gr.Slider(minimum=20, maximum=50, value=40, step=1, label=steps_label),
                gr.Slider(minimum=1.0, maximum=5.0, value=3.0, step=0.5, label=scale_label),
                gr.Number(value=-1, label=seed_label),
            ]

    with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
        gr.Markdown(intro_md)

        with gr.Tabs():
            # Tab 1: still portrait + audio -> talking video
            with gr.Tab("📸 Image-to-Video"):
                gr.Markdown("Transform a static portrait into a talking video")

                with gr.Row():
                    with gr.Column():
                        image_input = gr.Image(
                            type="filepath",
                            label="Upload Portrait Image (clear face visibility recommended)",
                        )
                        audio_input = gr.Audio(
                            type="filepath",
                            label="Upload Audio (MP3, WAV, or FLAC)",
                        )
                        i2v_settings = advanced_settings(
                            "Resolution (480p faster, 720p higher quality)",
                            "Diffusion Steps (more = higher quality but slower)",
                            "Audio Guide Scale (2–4 recommended)",
                            "Seed (-1 for random)",
                        )
                        generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")

                    with gr.Column():
                        output_video = gr.Video(label="Generated Video")
                        gr.Markdown("**💡 Tip**: Use a high-quality portrait image with clear facial features.")

                generate_btn.click(
                    fn=generate_video,
                    inputs=[image_input, audio_input, *i2v_settings],
                    outputs=output_video,
                )

            # Tab 2: existing video + new audio -> dubbed video
            with gr.Tab("🎥 Video Dubbing"):
                gr.Markdown("Dub an existing video with new audio while maintaining natural movements")

                with gr.Row():
                    with gr.Column():
                        video_input = gr.Video(label="Upload Video (with visible face)")
                        audio_input_v2v = gr.Audio(
                            type="filepath",
                            label="Upload New Audio (MP3, WAV, or FLAC)",
                        )
                        v2v_settings = advanced_settings(
                            "Resolution",
                            "Diffusion Steps",
                            "Audio Guide Scale",
                            "Seed",
                        )
                        generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")

                    with gr.Column():
                        output_video_v2v = gr.Video(label="Dubbed Video")
                        gr.Markdown("**💡 Tip**: Use a video with consistent face visibility.")

                generate_btn_v2v.click(
                    fn=generate_video,
                    inputs=[video_input, audio_input_v2v, *v2v_settings],
                    outputs=output_video_v2v,
                )

        gr.Markdown(about_md)

    return demo


# Script entry point: build the UI and start the Gradio server with default launch settings.
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()