MultiPerson

Running on Zero

App Files Files Community

C4G-HKUST committed 9 days ago

Commit

fd55666

1 Parent(s): 490e55a

feat: time out check

Browse files

Files changed (3) hide show

app.py +127 -6
wan/audio2video_multiID.py +2 -0
wan/utils/infer_utils.py +53 -7

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ import spaces
 warnings.filterwarnings('ignore')
 import random
 import torch
 import torch.distributed as dist
 from PIL import Image
@@ -435,7 +436,7 @@ def run_graio_demo(args):
     logging.info("Model and face processor loaded successfully.")
     def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
-                    sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None):
         # 参考 LivePortrait: 在 worker 进程中直接使用 cuda 设备
         # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
         # @spaces.GPU 装饰器已经初始化了 GPU，这里直接使用即可
@@ -480,7 +481,18 @@ def run_graio_demo(args):
             if audio_paths and len(audio_paths) > 0:
                 # 使用 cfg 中的 fps，如果不可用则使用默认值 24
                 fps = getattr(cfg, 'fps', 24)
-                current_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
                 logging.info(f"Dynamically determined frame number: {current_frame_num} (mode: {audio_mode_selector})")
             else:
                 # 没有音频时使用默认帧数
@@ -519,6 +531,7 @@ def run_graio_demo(args):
             audio_paths=audio_paths,
             task_key="gradio_output",
             mode=audio_mode_selector,
         )
         if isinstance(video, dict):
@@ -610,6 +623,63 @@ def run_graio_demo(args):
     def gpu_wrapped_generate_video_fast(*args, **kwargs):
         # Always use 10 denoising steps, passed through as a keyword argument
         kwargs['fixed_steps'] = 10
         return gpu_wrapped_generate_video_worker(*args, **kwargs)
     # 高质量生成模式：780秒，用户选择去噪步数
@@ -666,7 +736,8 @@ def run_graio_demo(args):
             except Exception as e:
                 logging.warning(f"Failed to move models to GPU: {e}")
-        return generate_video(*args, **kwargs)
@@ -815,22 +886,72 @@ def run_graio_demo(args):
                 )
         # 快速生成按钮：210秒，固定10步
         run_i2v_button_fast.click(
-            fn=gpu_wrapped_generate_video_fast,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
             outputs=[result_gallery],
         )
         # 高质量生成按钮：780秒，用户选择步数
         run_i2v_button_quality.click(
-            fn=gpu_wrapped_generate_video_quality,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
             outputs=[result_gallery],
         )
     # 参考 Meigen-MultiTalk 的成功配置
     # 在 Hugging Face Spaces 上，Gradio 会自动处理端口和服务器配置
-    demo.queue(max_size=4).launch(show_error=True)

 warnings.filterwarnings('ignore')
 import random
+import math
 import torch
 import torch.distributed as dist
 from PIL import Image
     logging.info("Model and face processor loaded successfully.")
     def generate_video(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
+                    sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector, fixed_steps=None, trim_to_6s=False):
         # 参考 LivePortrait: 在 worker 进程中直接使用 cuda 设备
         # 参考: https://huggingface.co/spaces/KlingTeam/LivePortrait/blob/main/src/gradio_pipeline.py
         # @spaces.GPU 装饰器已经初始化了 GPU，这里直接使用即可
             if audio_paths and len(audio_paths) > 0:
                 # 使用 cfg 中的 fps，如果不可用则使用默认值 24
                 fps = getattr(cfg, 'fps', 24)
+                calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                # Fast模式：如果trim_to_6s为True，强制限制为6秒对应的帧数
+                if trim_to_6s:
+                    # 计算6秒对应的帧数（4n+1格式）
+                    max_frames_6s = int(math.ceil(6.0 * fps))
+                    max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
+                    current_frame_num = min(calculated_frame_num, max_frames_6s)
+                    logging.warning(f"Fast mode: Audio duration exceeds 6 seconds. Trimming to 6 seconds ({max_frames_6s} frames). Original: {calculated_frame_num} frames")
+                else:
+                    current_frame_num = calculated_frame_num
                 logging.info(f"Dynamically determined frame number: {current_frame_num} (mode: {audio_mode_selector})")
             else:
                 # 没有音频时使用默认帧数
             audio_paths=audio_paths,
             task_key="gradio_output",
             mode=audio_mode_selector,
+            trim_to_6s=trim_to_6s,
         )
         if isinstance(video, dict):
     def gpu_wrapped_generate_video_fast(*args, **kwargs):
+        """Fast-mode entry point: pin denoising to 10 steps and flag over-long audio.
+
+        Positional arguments are expected in the Gradio input order:
+        img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1,
+        img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale,
+        person_num_selector, audio_mode_selector.
+
+        Always sets kwargs['fixed_steps'] = 10. When at least 11 positional
+        arguments are present, also sets kwargs['trim_to_6s'] to True if the
+        frame count derived from the selected audio exceeds the 6-second cap
+        (rounded down to the 4n+1 frame format), and to False otherwise. With
+        fewer positional arguments the duration check is skipped and
+        'trim_to_6s' is left untouched.
+
+        Returns whatever gpu_wrapped_generate_video_worker returns.
+        """
+        # Always use 10 denoising steps, passed through as a keyword argument
         kwargs['fixed_steps'] = 10
+        if len(args) >= 11:
+            person_num_selector = args[9]
+            audio_mode_selector = args[10]
+            # An N-person selection only considers the first N audio inputs;
+            # an unrecognized selector contributes no audio at all.
+            audio_slot_count = next(
+                (count for label, count in (("1 Person", 1), ("2 Persons", 2), ("3 Persons", 3))
+                 if person_num_selector == label),
+                0,
+            )
+            audio_paths = [path for path in args[3:3 + audio_slot_count] if path]
+            # Default to "no trimming"; only a successful over-limit check flips it.
+            kwargs['trim_to_6s'] = False
+            if audio_paths:
+                fps = getattr(cfg, 'fps', 24)
+                try:
+                    calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                    # Frame count for 6 seconds, rounded down to the 4n+1 format
+                    max_frames_6s = int(math.ceil(6.0 * fps))
+                    max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
+                    if calculated_frame_num > max_frames_6s:
+                        kwargs['trim_to_6s'] = True
+                        calculated_duration = calculated_frame_num / fps
+                        logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 6 seconds limit. Will trim to 6 seconds.")
+                except Exception as e:
+                    # Best-effort check: a failed duration probe must not block generation.
+                    logging.warning(f"Failed to check audio duration: {e}")
+                    kwargs['trim_to_6s'] = False
         return gpu_wrapped_generate_video_worker(*args, **kwargs)
     # 高质量生成模式：780秒，用户选择去噪步数
             except Exception as e:
                 logging.warning(f"Failed to move models to GPU: {e}")
+        result = generate_video(*args, **kwargs)
+        return result
                 )
+        # 包装函数：处理警告信息显示
+        def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
+                                    sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
+            """Click handler for the fast-generation button.
+
+            Before generation starts, checks whether the audio selected for the
+            chosen person count yields more frames than the 6-second cap and, if
+            so, shows a Gradio warning. Then forwards all inputs to
+            gpu_wrapped_generate_video_fast and returns its result. A failure in
+            the duration check is logged and does not stop generation.
+            """
+            # An N-person selection only considers the first N audio inputs.
+            slot_count = next(
+                (count for label, count in (("1 Person", 1), ("2 Persons", 2), ("3 Persons", 3))
+                 if person_num_selector == label),
+                0,
+            )
+            candidate_audios = (img2vid_audio_1, img2vid_audio_2, img2vid_audio_3)
+            audio_paths = [clip for clip in candidate_audios[:slot_count] if clip]
+            if audio_paths:
+                fps = getattr(cfg, 'fps', 24)
+                try:
+                    calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                    # Frame count for 6 seconds, rounded down to the 4n+1 format
+                    max_frames_6s = ((int(math.ceil(6.0 * fps)) - 1) // 4) * 4 + 1
+                    if calculated_frame_num > max_frames_6s:
+                        # Over the limit: warn the user right away
+                        calculated_duration = calculated_frame_num / fps
+                        gr.Warning(
+                            f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 6-second limit for Fast Mode. The audio will be automatically trimmed to 6 seconds to prevent timeout.",
+                            duration=5,
+                        )
+                except Exception as e:
+                    logging.warning(f"Failed to check audio duration: {e}")
+            # Carry on with the actual video generation
+            return gpu_wrapped_generate_video_fast(
+                img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
+                sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
+            )
+        def handle_quality_generation(*args):
+            """Click handler for the quality button: forward all inputs unchanged to gpu_wrapped_generate_video_quality."""
+            return gpu_wrapped_generate_video_quality(*args)
         # 快速生成按钮：210秒，固定10步
         run_i2v_button_fast.click(
+            fn=handle_fast_generation,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
             outputs=[result_gallery],
         )
         # 高质量生成按钮：780秒，用户选择步数
         run_i2v_button_quality.click(
+            fn=handle_quality_generation,
             inputs=[img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3, sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector],
             outputs=[result_gallery],
         )
     # 参考 Meigen-MultiTalk 的成功配置
     # 在 Hugging Face Spaces 上，Gradio 会自动处理端口和服务器配置
+    demo.queue(max_size=10).launch(show_error=True)

wan/audio2video_multiID.py CHANGED Viewed

@@ -199,6 +199,7 @@ class WanAF2V:
         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.
@@ -514,6 +515,7 @@ class WanAF2V:
             half_dtype=self.half_dtype,
             preprocess_audio=preprocess_audio,
             resample_audio=resample_audio,
         )
         # Prepare audio_ref_features - new list mode

         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
+        trim_to_6s=False,  # Fast mode: trim audio to 6 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.
             half_dtype=self.half_dtype,
             preprocess_audio=preprocess_audio,
             resample_audio=resample_audio,
+            trim_to_6s=trim_to_6s,
         )
         # Prepare audio_ref_features - new list mode

wan/utils/infer_utils.py CHANGED Viewed

@@ -118,6 +118,7 @@ def process_audio_features(
     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
 ):
     """
     Process audio files and extract audio features.
@@ -202,6 +203,31 @@ def process_audio_features(
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
             # Ensure total length is in 4n+1 format (model requirement)
             total_length = ((total_length - 1) // 4) * 4 + 1
             print(f"Adjusted total length to 4n+1 format: {total_length} frames")
@@ -257,7 +283,7 @@ def process_audio_features(
                     audio_feat_list.append(zero_audio_feat)
                     print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
         else:
-            # Pad mode: keep existing logic, no changes needed
             for i, audio_path in enumerate(audio_paths):
                 if audio_path and os.path.exists(audio_path):
                     print(f"Processing audio {i}: {audio_path}")
@@ -270,10 +296,19 @@ def process_audio_features(
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
-                        # Use dynamically determined frame number F
                         audio_emb, audio_length = preprocess_audio(
                             wav_path=target_resampled_audio_path,
-                            num_generated_frames_per_clip=F,  # Use dynamically determined frame number
                             fps=fps,
                             wav2vec_model=wav2vec_model,
                             vocal_separator_model=vocal_separator_model,
@@ -284,7 +319,8 @@ def process_audio_features(
                         audio_dtype = half_dtype if use_half else torch.bfloat16
                         audio_emb = audio_emb.to(device, dtype=audio_dtype)
-                    audio_feat = audio_emb[:F]  # Use dynamically determined frame number
                     audio_feat_list.append(audio_feat)
                     print(f"Audio {i} processed, shape: {audio_feat.shape}")
                 else:
@@ -310,10 +346,19 @@ def process_audio_features(
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
-                # Use dynamically determined frame number F
                 audio_emb, audio_length = preprocess_audio(
                     wav_path=audio,
-                    num_generated_frames_per_clip=F,  # Use dynamically determined frame number
                     fps=fps,
                     wav2vec_model=wav2vec_model,
                     vocal_separator_model=vocal_separator_model,
@@ -324,7 +369,8 @@ def process_audio_features(
                 audio_dtype = half_dtype if use_half else torch.bfloat16
                 audio_emb = audio_emb.to(device, dtype=audio_dtype)
-            audio_feat = audio_emb[:F]  # Use dynamically determined frame number
             audio_feat_list.append(audio_feat)
             print(f"Single audio processed, shape: {audio_feat.shape}")
         else:

     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
+    trim_to_6s=False,  # Fast mode: trim audio to 6 seconds
 ):
     """
     Process audio files and extract audio features.
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
+            # Fast mode: trim to 6 seconds if trim_to_6s is True
+            if trim_to_6s:
+                import math
+                # Calculate 6 seconds in frames
+                max_frames_6s = int(math.ceil(6.0 * fps))
+                max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
+                if total_length > max_frames_6s:
+                    print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_6s} frames (6 seconds)")
+                    # Truncate each audio proportionally
+                    scale_factor = max_frames_6s / total_length
+                    cumulative_length = 0
+                    for i, audio_len in enumerate(audio_lengths):
+                        if audio_len > 0:
+                            new_audio_len = int(audio_len * scale_factor)
+                            # Ensure it fits within remaining space
+                            remaining_space = max_frames_6s - cumulative_length
+                            new_audio_len = min(new_audio_len, remaining_space)
+                            audio_lengths[i] = new_audio_len
+                            # Truncate the corresponding raw audio feature
+                            if raw_audio_feat_list[i] is not None:
+                                raw_audio_feat_list[i] = raw_audio_feat_list[i][:new_audio_len]
+                            cumulative_length += new_audio_len
+                    total_length = sum(audio_lengths)
+                    print(f"After trimming: total_length = {total_length} frames")
             # Ensure total length is in 4n+1 format (model requirement)
             total_length = ((total_length - 1) // 4) * 4 + 1
             print(f"Adjusted total length to 4n+1 format: {total_length} frames")
                     audio_feat_list.append(zero_audio_feat)
                     print(f"Audio {i} is missing, created zero features with shape: {zero_audio_feat.shape}")
         else:
+            # Pad mode: keep existing logic, but apply trim_to_6s if needed
             for i, audio_path in enumerate(audio_paths):
                 if audio_path and os.path.exists(audio_path):
                     print(f"Processing audio {i}: {audio_path}")
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
+                        # Fast mode: if trim_to_6s, limit to 6 seconds
+                        target_frames = F
+                        if trim_to_6s:
+                            import math
+                            max_frames_6s = int(math.ceil(6.0 * fps))
+                            max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
+                            target_frames = min(F, max_frames_6s)
+                            if F > max_frames_6s:
+                                print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_6s} frames (6 seconds)")
+                        # Use dynamically determined frame number
                         audio_emb, audio_length = preprocess_audio(
                             wav_path=target_resampled_audio_path,
+                            num_generated_frames_per_clip=target_frames,  # Use target frames (may be trimmed)
                             fps=fps,
                             wav2vec_model=wav2vec_model,
                             vocal_separator_model=vocal_separator_model,
                         audio_dtype = half_dtype if use_half else torch.bfloat16
                         audio_emb = audio_emb.to(device, dtype=audio_dtype)
+                    # Ensure we don't exceed F frames (for consistency with other tensors)
+                    audio_feat = audio_emb[:F]  # Use F to maintain consistency
                     audio_feat_list.append(audio_feat)
                     print(f"Audio {i} processed, shape: {audio_feat.shape}")
                 else:
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
+                # Fast mode: if trim_to_6s, limit to 6 seconds
+                target_frames = F
+                if trim_to_6s:
+                    import math
+                    max_frames_6s = int(math.ceil(6.0 * fps))
+                    max_frames_6s = ((max_frames_6s - 1) // 4) * 4 + 1
+                    target_frames = min(F, max_frames_6s)
+                    if F > max_frames_6s:
+                        print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_6s} frames (6 seconds)")
+                # Use dynamically determined frame number
                 audio_emb, audio_length = preprocess_audio(
                     wav_path=audio,
+                    num_generated_frames_per_clip=target_frames,  # Use target frames (may be trimmed)
                     fps=fps,
                     wav2vec_model=wav2vec_model,
                     vocal_separator_model=vocal_separator_model,
                 audio_dtype = half_dtype if use_half else torch.bfloat16
                 audio_emb = audio_emb.to(device, dtype=audio_dtype)
+            # Ensure we don't exceed F frames (for consistency with other tensors)
+            audio_feat = audio_emb[:F]  # Use F to maintain consistency
             audio_feat_list.append(audio_feat)
             print(f"Single audio processed, shape: {audio_feat.shape}")
         else: