MultiPerson

Running on Zero

App Files Community

C4G-HKUST committed 7 days ago

Commit

1b0ed38

1 Parent(s): f725d0c

docs: update

Browse files

Files changed (3) hide show

app.py +21 -21
wan/audio2video_multiID.py +1 -1
wan/utils/infer_utils.py +20 -20

app.py CHANGED Viewed

@@ -483,12 +483,12 @@ def run_graio_demo(args):
                 fps = getattr(cfg, 'fps', 24)
                 calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                # Fast模式：如果trim_to_6s为True，强制限制为5秒对应的帧数
                 if trim_to_6s:
-                    # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
-                    max_frames_5s = 121
-                    current_frame_num = min(calculated_frame_num, max_frames_5s)
-                    logging.warning(f"Fast mode: Audio duration exceeds 5 seconds. Trimming to 5 seconds ({max_frames_5s} frames). Original: {calculated_frame_num} frames")
                 else:
                     current_frame_num = calculated_frame_num
@@ -684,7 +684,7 @@ def run_graio_demo(args):
         # 固定使用8步去噪，通过关键字参数传递
         kwargs['fixed_steps'] = 8
-        # Fast模式音频长度检测：检查是否超过5秒
         # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
         #          sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
         if len(args) >= 11:
@@ -718,19 +718,19 @@ def run_graio_demo(args):
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
-            # 检测音频长度是否超过5秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                    # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
-                    max_frames_5s = 121
-                    if calculated_frame_num > max_frames_5s:
-                        # 超过5秒，设置trim_to_6s标记
                         kwargs['trim_to_6s'] = True
                         calculated_duration = calculated_frame_num / fps
-                        logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 5 seconds limit. Will trim to 5 seconds.")
                     else:
                         kwargs['trim_to_6s'] = False
                 except Exception as e:
@@ -823,7 +823,7 @@ def run_graio_demo(args):
                             ⚠️ Important Video Duration Limits
                         </div>
                         <div style="font-size: 14px; color: #856404; line-height: 1.6;">
-                            Fast Mode: Maximum video duration should be less than 5 seconds. Audio inputs longer than 5 seconds will be trimmed to 5 seconds.<br>
                         </div>
                     </div>
@@ -910,7 +910,7 @@ def run_graio_demo(args):
                     )
                 gr.Markdown("""
                 **Generation Modes:**
-                - **Fast Mode (120s GPU budget, suitable for any type of users)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 5 seconds.
                 - **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
                 *Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
@@ -951,7 +951,7 @@ def run_graio_demo(args):
         # 包装函数：处理警告信息显示
         def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
                                     sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
-            # 在开始生成前先检测音频长度，如果超过5秒立即显示警告
             # 根据人数收集音频路径
             audio_paths = []
             if person_num_selector == "1 Person":
@@ -970,18 +970,18 @@ def run_graio_demo(args):
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
-            # 检测音频长度是否超过5秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
-                    # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
-                    max_frames_5s = 121
-                    if calculated_frame_num > max_frames_5s:
-                        # 超过5秒，立即显示警告
                         calculated_duration = calculated_frame_num / fps
-                        warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 5-second limit for Fast Mode. The audio will be automatically trimmed to 5 seconds to prevent timeout."
                         gr.Warning(warning_msg, duration=5)
                 except Exception as e:
                     logging.warning(f"Failed to check audio duration: {e}")

                 fps = getattr(cfg, 'fps', 24)
                 calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                # Fast模式：如果trim_to_6s为True，强制限制为4秒对应的帧数
                 if trim_to_6s:
+                    # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
+                    max_frames_4s = 97
+                    current_frame_num = min(calculated_frame_num, max_frames_4s)
+                    logging.warning(f"Fast mode: Audio duration exceeds 4 seconds. Trimming to 4 seconds ({max_frames_4s} frames). Original: {calculated_frame_num} frames")
                 else:
                     current_frame_num = calculated_frame_num
         # 固定使用8步去噪，通过关键字参数传递
         kwargs['fixed_steps'] = 8
+        # Fast模式音频长度检测：检查是否超过4秒
         # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
         #          sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
         if len(args) >= 11:
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
+            # 检测音频长度是否超过4秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                    # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
+                    max_frames_4s = 97
+                    if calculated_frame_num > max_frames_4s:
+                        # 超过4秒，设置trim_to_6s标记
                         kwargs['trim_to_6s'] = True
                         calculated_duration = calculated_frame_num / fps
+                        logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 4 seconds limit. Will trim to 4 seconds.")
                     else:
                         kwargs['trim_to_6s'] = False
                 except Exception as e:
                             ⚠️ Important Video Duration Limits
                         </div>
                         <div style="font-size: 14px; color: #856404; line-height: 1.6;">
+                            Fast Mode: Maximum video duration should be less than 4 seconds. Audio inputs longer than 4 seconds will be trimmed to 4 seconds.<br>
                         </div>
                     </div>
                     )
                 gr.Markdown("""
                 **Generation Modes:**
+                - **Fast Mode (120s GPU budget, suitable for any type of users)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 4 seconds.
                 - **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
                 *Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
         # 包装函数：处理警告信息显示
         def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
                                     sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
+            # 在开始生成前先检测音频长度，如果超过4秒立即显示警告
             # 根据人数收集音频路径
             audio_paths = []
             if person_num_selector == "1 Person":
                 if img2vid_audio_3:
                     audio_paths.append(img2vid_audio_3)
+            # 检测音频长度是否超过4秒
             if audio_paths and len(audio_paths) > 0:
                 fps = getattr(cfg, 'fps', 24)
                 try:
                     calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
+                    # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
+                    max_frames_4s = 97
+                    if calculated_frame_num > max_frames_4s:
+                        # 超过4秒，立即显示警告
                         calculated_duration = calculated_frame_num / fps
+                        warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 4-second limit for Fast Mode. The audio will be automatically trimmed to 4 seconds to prevent timeout."
                         gr.Warning(warning_msg, duration=5)
                 except Exception as e:
                     logging.warning(f"Failed to check audio duration: {e}")

wan/audio2video_multiID.py CHANGED Viewed

@@ -199,7 +199,7 @@ class WanAF2V:
         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
-        trim_to_6s=False,  # Fast mode: trim audio to 5 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.

         audio_paths=None, # New: audio path list, supports multiple audio files
         task_key=None,
         mode="pad",  # Audio processing mode: "pad" or "concat"
+        trim_to_6s=False,  # Fast mode: trim audio to 4 seconds
     ):
         r"""
         Generates video frames from input image and text prompt using diffusion process.

wan/utils/infer_utils.py CHANGED Viewed

@@ -118,7 +118,7 @@ def process_audio_features(
     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
-    trim_to_6s=False,  # Fast mode: trim audio to 5 seconds
 ):
     """
     Process audio files and extract audio features.
@@ -203,20 +203,20 @@ def process_audio_features(
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
-            # Fast mode: trim to 5 seconds if trim_to_6s is True
             if trim_to_6s:
-                # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
-                max_frames_5s = 121
-                if total_length > max_frames_5s:
-                    print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_5s} frames (5 seconds)")
                     # Truncate each audio proportionally
-                    scale_factor = max_frames_5s / total_length
                     cumulative_length = 0
                     for i, audio_len in enumerate(audio_lengths):
                         if audio_len > 0:
                             new_audio_len = int(audio_len * scale_factor)
                             # Ensure it fits within remaining space
-                            remaining_space = max_frames_5s - cumulative_length
                             new_audio_len = min(new_audio_len, remaining_space)
                             audio_lengths[i] = new_audio_len
                             # Truncate the corresponding raw audio feature
@@ -294,14 +294,14 @@ def process_audio_features(
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
-                        # Fast mode: if trim_to_6s, limit to 5 seconds
                         target_frames = F
                         if trim_to_6s:
-                            # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
-                            max_frames_5s = 121
-                            target_frames = min(F, max_frames_5s)
-                            if F > max_frames_5s:
-                                print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_5s} frames (5 seconds)")
                         # Use dynamically determined frame number
                         audio_emb, audio_length = preprocess_audio(
                             wav_path=target_resampled_audio_path,
@@ -343,14 +343,14 @@ def process_audio_features(
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
-                # Fast mode: if trim_to_6s, limit to 5 seconds
                 target_frames = F
                 if trim_to_6s:
-                    # 5秒固定为121帧（4n+1格式：5秒*24fps=120帧，向上取整为121帧）
-                    max_frames_5s = 121
-                    target_frames = min(F, max_frames_5s)
-                    if F > max_frames_5s:
-                        print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_5s} frames (5 seconds)")
                 # Use dynamically determined frame number
                 audio_emb, audio_length = preprocess_audio(
                     wav_path=audio,

     half_dtype=None,
     preprocess_audio=None,
     resample_audio=None,
+    trim_to_6s=False,  # Fast mode: trim audio to 4 seconds
 ):
     """
     Process audio files and extract audio features.
             total_length = sum(audio_lengths)
             print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
+            # Fast mode: trim to 4 seconds if trim_to_6s is True
             if trim_to_6s:
+                # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
+                max_frames_4s = 97
+                if total_length > max_frames_4s:
+                    print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_4s} frames (4 seconds)")
                     # Truncate each audio proportionally
+                    scale_factor = max_frames_4s / total_length
                     cumulative_length = 0
                     for i, audio_len in enumerate(audio_lengths):
                         if audio_len > 0:
                             new_audio_len = int(audio_len * scale_factor)
                             # Ensure it fits within remaining space
+                            remaining_space = max_frames_4s - cumulative_length
                             new_audio_len = min(new_audio_len, remaining_space)
                             audio_lengths[i] = new_audio_len
                             # Truncate the corresponding raw audio feature
                     with torch.no_grad():
                         print(f"wav2vec_model: {wav2vec_model}")
                         print(f"cache_dir:{cache_dir}")
+                        # Fast mode: if trim_to_6s, limit to 4 seconds
                         target_frames = F
                         if trim_to_6s:
+                            # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
+                            max_frames_4s = 97
+                            target_frames = min(F, max_frames_4s)
+                            if F > max_frames_4s:
+                                print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_4s} frames (4 seconds)")
                         # Use dynamically determined frame number
                         audio_emb, audio_length = preprocess_audio(
                             wav_path=target_resampled_audio_path,
                     target_resampled_audio_path,
                 )
             with torch.no_grad():
+                # Fast mode: if trim_to_6s, limit to 4 seconds
                 target_frames = F
                 if trim_to_6s:
+                    # 4秒固定为97帧（4n+1格式：4秒*24fps=96帧，向上取整为97帧）
+                    max_frames_4s = 97
+                    target_frames = min(F, max_frames_4s)
+                    if F > max_frames_4s:
+                        print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_4s} frames (4 seconds)")
                 # Use dynamically determined frame number
                 audio_emb, audio_length = preprocess_audio(
                     wav_path=audio,