infinitetalk2

Sleeping

App Files Files Community

FarmerlineML committed on Jan 14

Commit

e04d126

verified ·

1 Parent(s): 0965356

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -71

app.py CHANGED Viewed

@@ -1,25 +1,56 @@
 import os
 import random
 import logging
 import torch
 import gradio as gr
 from PIL import Image
 from utils.model_loader import ModelManager
 from utils.gpu_manager import gpu_manager
 import wan
 from wan.utils.utils import cache_image, cache_video, is_video
 from wan.utils.multitalk_utils import save_video_ffmpeg
-# Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
-# Global variables
-model_manager = None
 models_loaded = False
 def initialize_models(progress=gr.Progress()):
-    """Initialize models on first use"""
     global model_manager, models_loaded
     if models_loaded:
@@ -29,9 +60,9 @@ def initialize_models(progress=gr.Progress()):
         progress(0.1, desc="Initializing model manager...")
         model_manager = ModelManager()
-        progress(0.3, desc="Downloading models (first time only - may take 2-3 minutes)...")
-        # Download models (lazy loading - they'll be loaded on first inference)
         model_manager.get_wan_model_path()
         model_manager.get_infinitetalk_weights_path()
         model_manager.get_wav2vec_model_path()
@@ -41,9 +72,22 @@ def initialize_models(progress=gr.Progress()):
         logger.info("Models initialized successfully")
     except Exception as e:
-        logger.error(f"Error initializing models: {e}")
         raise gr.Error(f"Failed to initialize models: {str(e)}")
 def generate_video(
     image_or_video,
     audio_file,
@@ -51,76 +95,74 @@ def generate_video(
     steps=40,
     audio_guide_scale=3.0,
     seed=-1,
-    progress=gr.Progress()
 ):
-    """Generate talking video from image or dub existing video"""
     try:
         if not torch.cuda.is_available():
-            raise gr.Error(
-                "⚠️ GPU not available. This Space requires GPU hardware to generate videos."
-            )
-        # Initialize models if needed
         if not models_loaded:
             initialize_models(progress)
         progress(0.1, desc="Processing audio...")
-        # Process audio (add your audio processing function here)
-        # (Skip this step in the simplified version)
         progress(0.2, desc="Loading models...")
-        # Load models
         size = f"infinitetalk-{resolution.replace('p', '')}"
-        wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")
         progress(0.3, desc="Processing input...")
-        # Determine if input is image or video
-        is_input_video = is_video(image_or_video)
-        if is_input_video:
-            logger.info("Processing video dubbing...")
             input_frames = cache_video(image_or_video)
         else:
-            logger.info("Processing image-to-video...")
             input_image = Image.open(image_or_video).convert("RGB")
             input_frames = [input_image]
         progress(0.4, desc="Generating video...")
-        # Set random seed
-        if seed == -1:
-            seed = random.randint(0, 99999999)
-        torch.manual_seed(seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed(seed)
         output_path = f"/tmp/output_{seed}.mp4"
-        # Generate the video (simplified version)
-        save_video_ffmpeg(input_frames, output_path, audio_file, high_quality_save=False)
         progress(1.0, desc="Complete!")
         return output_path
     except Exception as e:
-        logger.error(f"Error generating video: {e}")
         gpu_manager.cleanup()
         raise gr.Error(f"Generation failed: {str(e)}")
-def create_interface():
-    """Create Gradio interface"""
     with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
-        gr.Markdown("""
-        # 🎬 InfiniteTalk - Talking Video Generator
-        Generate realistic talking head videos with accurate lip-sync from images or dub existing videos with new audio!
-        **Note**: First generation may take 2-3 minutes while models download. Subsequent generations are much faster (~40s for 10s video).
-        """)
         with gr.Tabs():
             # Tab 1: Image-to-Video
@@ -131,48 +173,45 @@ def create_interface():
                     with gr.Column():
                         image_input = gr.Image(
                             type="filepath",
-                            label="Upload Portrait Image (clear face visibility recommended)"
                         )
                         audio_input = gr.Audio(
                             type="filepath",
-                            label="Upload Audio (MP3, WAV, or FLAC)"
                         )
                         with gr.Accordion("Advanced Settings", open=False):
                             resolution = gr.Radio(
                                 choices=["480p", "720p"],
                                 value="480p",
-                                label="Resolution (480p faster, 720p higher quality)"
                             )
                             steps = gr.Slider(
                                 minimum=20,
                                 maximum=50,
                                 value=40,
                                 step=1,
-                                label="Diffusion Steps (more = higher quality but slower)"
                             )
                             audio_scale = gr.Slider(
                                 minimum=1.0,
                                 maximum=5.0,
                                 value=3.0,
                                 step=0.5,
-                                label="Audio Guide Scale (2-4 recommended)"
-                            )
-                            seed = gr.Number(
-                                value=-1,
-                                label="Seed (-1 for random)"
                             )
                         generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
                     with gr.Column():
                         output_video = gr.Video(label="Generated Video")
-                        gr.Markdown("**💡 Tip**: Use high-quality portrait images with clear facial features for best results")
                 generate_btn.click(
                     fn=generate_video,
                     inputs=[image_input, audio_input, resolution, steps, audio_scale, seed],
-                    outputs=output_video
                 )
             # Tab 2: Video Dubbing
@@ -184,54 +223,52 @@ def create_interface():
                         video_input = gr.Video(label="Upload Video (with visible face)")
                         audio_input_v2v = gr.Audio(
                             type="filepath",
-                            label="Upload New Audio (MP3, WAV, or FLAC)"
                         )
                         with gr.Accordion("Advanced Settings", open=False):
                             resolution_v2v = gr.Radio(
                                 choices=["480p", "720p"],
                                 value="480p",
-                                label="Resolution"
                             )
                             steps_v2v = gr.Slider(
                                 minimum=20,
                                 maximum=50,
                                 value=40,
                                 step=1,
-                                label="Diffusion Steps"
                             )
                             audio_scale_v2v = gr.Slider(
                                 minimum=1.0,
                                 maximum=5.0,
                                 value=3.0,
                                 step=0.5,
-                                label="Audio Guide Scale"
-                            )
-                            seed_v2v = gr.Number(
-                                value=-1,
-                                label="Seed"
                             )
                         generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
                     with gr.Column():
                         output_video_v2v = gr.Video(label="Dubbed Video")
-                        gr.Markdown("**💡 Tip**: For best results, use videos with consistent face visibility throughout")
                 generate_btn_v2v.click(
                     fn=generate_video,
                     inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
-                    outputs=output_video_v2v
                 )
-        # Footer
-        gr.Markdown("""
-        ---
-        ### About
-        Powered by [InfiniteTalk](https://github.com/MeiGen-AI/InfiniteTalk) - Apache 2.0 License
-        ⚠️ **Note**: This Space requires GPU hardware to generate videos. Apply for a Community GPU Grant in Settings.
-        """)
     return demo

 import os
 import random
 import logging
+from typing import Any
 import torch
 import gradio as gr
 from PIL import Image
 from utils.model_loader import ModelManager
 from utils.gpu_manager import gpu_manager
 import wan
 from wan.utils.utils import cache_image, cache_video, is_video
 from wan.utils.multitalk_utils import save_video_ffmpeg
+# =========================
+# HOTFIX: Gradio /api_info crash
+# =========================
+# Fixes: TypeError: argument of type 'bool' is not iterable
+# Cause: gradio_client tries to interpret JSON Schema nodes that can be plain booleans.
+# The wrapper below maps boolean nodes to "Any" and forwards every other schema
+# to the original converter unchanged.
+# NOTE(review): `_json_schema_to_python_type` is an underscore-prefixed gradio_client
+# name; confirm it still exists with this signature when upgrading gradio.
+try:
+    import gradio_client.utils as gcu
+    # Keep a handle on the original converter so non-boolean schemas still go through it.
+    _old_json_schema_to_python_type = gcu._json_schema_to_python_type
+    def _json_schema_to_python_type_patched(schema: Any, defs=None):
+        """Return "Any" for boolean schema nodes; otherwise defer to the original converter."""
+        if isinstance(schema, bool):
+            return "Any"
+        return _old_json_schema_to_python_type(schema, defs)
+    # Rebind the module attribute so later lookups through `gcu` resolve to the wrapper.
+    gcu._json_schema_to_python_type = _json_schema_to_python_type_patched
+except Exception as e:
+    # Best effort: if the import or attribute lookup fails, report it and start without the patch.
+    print("gradio_client patch skipped:", e)
+# =========================
+# Logging
+# =========================
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# =========================
+# Globals
+# =========================
+model_manager: ModelManager | None = None
 models_loaded = False
 def initialize_models(progress=gr.Progress()):
+    """Download/prepare model assets on first use."""
     global model_manager, models_loaded
     if models_loaded:
         progress(0.1, desc="Initializing model manager...")
         model_manager = ModelManager()
+        progress(0.3, desc="Downloading models (first time only)...")
+        # Pre-download assets (actual heavy loading happens on first inference)
         model_manager.get_wan_model_path()
         model_manager.get_infinitetalk_weights_path()
         model_manager.get_wav2vec_model_path()
         logger.info("Models initialized successfully")
     except Exception as e:
+        logger.exception("Error initializing models")
         raise gr.Error(f"Failed to initialize models: {str(e)}")
+def _set_seed(seed: int) -> int:
+    """Set deterministic seeds and return the final seed used."""
+    if seed == -1:
+        seed = random.randint(0, 99_999_999)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+    return seed
 def generate_video(
     image_or_video,
     audio_file,
     steps=40,
     audio_guide_scale=3.0,
     seed=-1,
+    progress=gr.Progress(),
 ):
+    """
+    Generate a talking video from an image OR dub an existing video.
+    Note: This is a simplified pipeline example. Your real pipeline may use
+    wan_pipeline + diffusion steps etc. This version just stitches frames + audio.
+    """
     try:
         if not torch.cuda.is_available():
+            raise gr.Error("⚠️ GPU not available. This Space requires GPU hardware to generate videos.")
+        # Ensure models are prepared
         if not models_loaded:
             initialize_models(progress)
         progress(0.1, desc="Processing audio...")
         progress(0.2, desc="Loading models...")
+        # Load models (kept for parity with your structure)
         size = f"infinitetalk-{resolution.replace('p', '')}"
+        wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")  # noqa: F841
         progress(0.3, desc="Processing input...")
+        # Decide whether the input is a video or image
+        if is_video(image_or_video):
+            logger.info("Processing video dubbing input...")
             input_frames = cache_video(image_or_video)
         else:
+            logger.info("Processing image-to-video input...")
             input_image = Image.open(image_or_video).convert("RGB")
             input_frames = [input_image]
         progress(0.4, desc="Generating video...")
+        seed = _set_seed(int(seed))
         output_path = f"/tmp/output_{seed}.mp4"
+        # Simplified output save (frames + audio)
+        save_video_ffmpeg(
+            input_frames,
+            output_path,
+            audio_file,
+            high_quality_save=False,
+        )
         progress(1.0, desc="Complete!")
         return output_path
     except Exception as e:
+        logger.exception("Error generating video")
         gpu_manager.cleanup()
         raise gr.Error(f"Generation failed: {str(e)}")
+def create_interface():
+    """Create Gradio UI."""
     with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
+        gr.Markdown(
+            """
+# 🎬 InfiniteTalk - Talking Video Generator
+Generate realistic talking head videos with accurate lip-sync from images or dub existing videos with new audio!
+**Note**: First generation may take a few minutes while models download. Subsequent generations are faster.
+"""
+        )
         with gr.Tabs():
             # Tab 1: Image-to-Video
                     with gr.Column():
                         image_input = gr.Image(
                             type="filepath",
+                            label="Upload Portrait Image (clear face visibility recommended)",
                         )
                         audio_input = gr.Audio(
                             type="filepath",
+                            label="Upload Audio (MP3, WAV, or FLAC)",
                         )
                         with gr.Accordion("Advanced Settings", open=False):
                             resolution = gr.Radio(
                                 choices=["480p", "720p"],
                                 value="480p",
+                                label="Resolution (480p faster, 720p higher quality)",
                             )
                             steps = gr.Slider(
                                 minimum=20,
                                 maximum=50,
                                 value=40,
                                 step=1,
+                                label="Diffusion Steps (more = higher quality but slower)",
                             )
                             audio_scale = gr.Slider(
                                 minimum=1.0,
                                 maximum=5.0,
                                 value=3.0,
                                 step=0.5,
+                                label="Audio Guide Scale (2–4 recommended)",
                             )
+                            seed = gr.Number(value=-1, label="Seed (-1 for random)")
                         generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
                     with gr.Column():
                         output_video = gr.Video(label="Generated Video")
+                        gr.Markdown("**💡 Tip**: Use a high-quality portrait image with clear facial features.")
                 generate_btn.click(
                     fn=generate_video,
                     inputs=[image_input, audio_input, resolution, steps, audio_scale, seed],
+                    outputs=output_video,
                 )
             # Tab 2: Video Dubbing
                         video_input = gr.Video(label="Upload Video (with visible face)")
                         audio_input_v2v = gr.Audio(
                             type="filepath",
+                            label="Upload New Audio (MP3, WAV, or FLAC)",
                         )
                         with gr.Accordion("Advanced Settings", open=False):
                             resolution_v2v = gr.Radio(
                                 choices=["480p", "720p"],
                                 value="480p",
+                                label="Resolution",
                             )
                             steps_v2v = gr.Slider(
                                 minimum=20,
                                 maximum=50,
                                 value=40,
                                 step=1,
+                                label="Diffusion Steps",
                             )
                             audio_scale_v2v = gr.Slider(
                                 minimum=1.0,
                                 maximum=5.0,
                                 value=3.0,
                                 step=0.5,
+                                label="Audio Guide Scale",
                             )
+                            seed_v2v = gr.Number(value=-1, label="Seed")
                         generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
                     with gr.Column():
                         output_video_v2v = gr.Video(label="Dubbed Video")
+                        gr.Markdown("**💡 Tip**: Use a video with consistent face visibility.")
                 generate_btn_v2v.click(
                     fn=generate_video,
                     inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
+                    outputs=output_video_v2v,
                 )
+        gr.Markdown(
+            """
+---
+### About
+Powered by InfiniteTalk (Apache 2.0)
+⚠️ **Note**: This Space requires GPU hardware to generate videos.
+"""
+        )
     return demo