C4G-HKUST committed on
Commit
1b0ed38
·
1 Parent(s): f725d0c

docs: update

Browse files
Files changed (3) hide show
  1. app.py +21 -21
  2. wan/audio2video_multiID.py +1 -1
  3. wan/utils/infer_utils.py +20 -20
app.py CHANGED
@@ -483,12 +483,12 @@ def run_graio_demo(args):
483
  fps = getattr(cfg, 'fps', 24)
484
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
485
 
486
- # Fast模式:如果trim_to_6s为True,强制限制为5秒对应的帧数
487
  if trim_to_6s:
488
- # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
489
- max_frames_5s = 121
490
- current_frame_num = min(calculated_frame_num, max_frames_5s)
491
- logging.warning(f"Fast mode: Audio duration exceeds 5 seconds. Trimming to 5 seconds ({max_frames_5s} frames). Original: {calculated_frame_num} frames")
492
  else:
493
  current_frame_num = calculated_frame_num
494
 
@@ -684,7 +684,7 @@ def run_graio_demo(args):
684
  # 固定使用8步去噪,通过关键字参数传递
685
  kwargs['fixed_steps'] = 8
686
 
687
- # Fast模式音频长度检测:检查是否超过5
688
  # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
689
  # sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
690
  if len(args) >= 11:
@@ -718,19 +718,19 @@ def run_graio_demo(args):
718
  if img2vid_audio_3:
719
  audio_paths.append(img2vid_audio_3)
720
 
721
- # 检测音频长度是否超过5
722
  if audio_paths and len(audio_paths) > 0:
723
  fps = getattr(cfg, 'fps', 24)
724
  try:
725
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
726
- # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
727
- max_frames_5s = 121
728
 
729
- if calculated_frame_num > max_frames_5s:
730
- # 超过5秒,设置trim_to_6s标记
731
  kwargs['trim_to_6s'] = True
732
  calculated_duration = calculated_frame_num / fps
733
- logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 5 seconds limit. Will trim to 5 seconds.")
734
  else:
735
  kwargs['trim_to_6s'] = False
736
  except Exception as e:
@@ -823,7 +823,7 @@ def run_graio_demo(args):
823
  ⚠️ Important Video Duration Limits
824
  </div>
825
  <div style="font-size: 14px; color: #856404; line-height: 1.6;">
826
- Fast Mode: Maximum video duration should be less than 5 seconds. Audio inputs longer than 5 seconds will be trimmed to 5 seconds.<br>
827
  </div>
828
  </div>
829
 
@@ -910,7 +910,7 @@ def run_graio_demo(args):
910
  )
911
  gr.Markdown("""
912
  **Generation Modes:**
913
- - **Fast Mode (120s GPU budget, suitable for any type of users)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 5 seconds.
914
  - **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
915
 
916
  *Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
@@ -951,7 +951,7 @@ def run_graio_demo(args):
951
  # 包装函数:处理警告信息显示
952
  def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
953
  sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
954
- # 在开始生成前先检测音频长度,如果超过5秒立即显示警告
955
  # 根据人数收集音频路径
956
  audio_paths = []
957
  if person_num_selector == "1 Person":
@@ -970,18 +970,18 @@ def run_graio_demo(args):
970
  if img2vid_audio_3:
971
  audio_paths.append(img2vid_audio_3)
972
 
973
- # 检测音频长度是否超过5
974
  if audio_paths and len(audio_paths) > 0:
975
  fps = getattr(cfg, 'fps', 24)
976
  try:
977
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
978
- # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
979
- max_frames_5s = 121
980
 
981
- if calculated_frame_num > max_frames_5s:
982
- # 超过5秒,立即显示警告
983
  calculated_duration = calculated_frame_num / fps
984
- warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 5-second limit for Fast Mode. The audio will be automatically trimmed to 5 seconds to prevent timeout."
985
  gr.Warning(warning_msg, duration=5)
986
  except Exception as e:
987
  logging.warning(f"Failed to check audio duration: {e}")
 
483
  fps = getattr(cfg, 'fps', 24)
484
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
485
 
486
+ # Fast模式:如果trim_to_6s为True,强制限制为4秒对应的帧数
487
  if trim_to_6s:
488
+ # 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
489
+ max_frames_4s = 97
490
+ current_frame_num = min(calculated_frame_num, max_frames_4s)
491
+ logging.warning(f"Fast mode: Audio duration exceeds 4 seconds. Trimming to 4 seconds ({max_frames_4s} frames). Original: {calculated_frame_num} frames")
492
  else:
493
  current_frame_num = calculated_frame_num
494
 
 
684
  # 固定使用8步去噪,通过关键字参数传递
685
  kwargs['fixed_steps'] = 8
686
 
687
+ # Fast模式音频长度检测:检查是否超过4
688
  # 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
689
  # sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
690
  if len(args) >= 11:
 
718
  if img2vid_audio_3:
719
  audio_paths.append(img2vid_audio_3)
720
 
721
+ # 检测音频长度是否超过4
722
  if audio_paths and len(audio_paths) > 0:
723
  fps = getattr(cfg, 'fps', 24)
724
  try:
725
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
726
+ # 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
727
+ max_frames_4s = 97
728
 
729
+ if calculated_frame_num > max_frames_4s:
730
+ # 超过4秒,设置trim_to_6s标记
731
  kwargs['trim_to_6s'] = True
732
  calculated_duration = calculated_frame_num / fps
733
+ logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 4 seconds limit. Will trim to 4 seconds.")
734
  else:
735
  kwargs['trim_to_6s'] = False
736
  except Exception as e:
 
823
  ⚠️ Important Video Duration Limits
824
  </div>
825
  <div style="font-size: 14px; color: #856404; line-height: 1.6;">
826
+ Fast Mode: Maximum video duration should be less than 4 seconds. Audio inputs longer than 4 seconds will be trimmed to 4 seconds.<br>
827
  </div>
828
  </div>
829
 
 
910
  )
911
  gr.Markdown("""
912
  **Generation Modes:**
913
+ - **Fast Mode (120s GPU budget, suitable for any type of users)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 4 seconds.
914
  - **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
915
 
916
  *Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
 
951
  # 包装函数:处理警告信息显示
952
  def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
953
  sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
954
+ # 在开始生成前先检测音频长度,如果超过4秒立即显示警告
955
  # 根据人数收集音频路径
956
  audio_paths = []
957
  if person_num_selector == "1 Person":
 
970
  if img2vid_audio_3:
971
  audio_paths.append(img2vid_audio_3)
972
 
973
+ # 检测音频长度是否超过4
974
  if audio_paths and len(audio_paths) > 0:
975
  fps = getattr(cfg, 'fps', 24)
976
  try:
977
  calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
978
+ # 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
979
+ max_frames_4s = 97
980
 
981
+ if calculated_frame_num > max_frames_4s:
982
+ # 超过4秒,立即显示警告
983
  calculated_duration = calculated_frame_num / fps
984
+ warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 4-second limit for Fast Mode. The audio will be automatically trimmed to 4 seconds to prevent timeout."
985
  gr.Warning(warning_msg, duration=5)
986
  except Exception as e:
987
  logging.warning(f"Failed to check audio duration: {e}")
wan/audio2video_multiID.py CHANGED
@@ -199,7 +199,7 @@ class WanAF2V:
199
  audio_paths=None, # New: audio path list, supports multiple audio files
200
  task_key=None,
201
  mode="pad", # Audio processing mode: "pad" or "concat"
202
- trim_to_6s=False, # Fast mode: trim audio to 5 seconds
203
  ):
204
  r"""
205
  Generates video frames from input image and text prompt using diffusion process.
 
199
  audio_paths=None, # New: audio path list, supports multiple audio files
200
  task_key=None,
201
  mode="pad", # Audio processing mode: "pad" or "concat"
202
+ trim_to_6s=False, # Fast mode: trim audio to 4 seconds
203
  ):
204
  r"""
205
  Generates video frames from input image and text prompt using diffusion process.
wan/utils/infer_utils.py CHANGED
@@ -118,7 +118,7 @@ def process_audio_features(
118
  half_dtype=None,
119
  preprocess_audio=None,
120
  resample_audio=None,
121
- trim_to_6s=False, # Fast mode: trim audio to 5 seconds
122
  ):
123
  """
124
  Process audio files and extract audio features.
@@ -203,20 +203,20 @@ def process_audio_features(
203
  total_length = sum(audio_lengths)
204
  print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
205
 
206
- # Fast mode: trim to 5 seconds if trim_to_6s is True
207
  if trim_to_6s:
208
- # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
209
- max_frames_5s = 121
210
- if total_length > max_frames_5s:
211
- print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_5s} frames (5 seconds)")
212
  # Truncate each audio proportionally
213
- scale_factor = max_frames_5s / total_length
214
  cumulative_length = 0
215
  for i, audio_len in enumerate(audio_lengths):
216
  if audio_len > 0:
217
  new_audio_len = int(audio_len * scale_factor)
218
  # Ensure it fits within remaining space
219
- remaining_space = max_frames_5s - cumulative_length
220
  new_audio_len = min(new_audio_len, remaining_space)
221
  audio_lengths[i] = new_audio_len
222
  # Truncate the corresponding raw audio feature
@@ -294,14 +294,14 @@ def process_audio_features(
294
  with torch.no_grad():
295
  print(f"wav2vec_model: {wav2vec_model}")
296
  print(f"cache_dir:{cache_dir}")
297
- # Fast mode: if trim_to_6s, limit to 5 seconds
298
  target_frames = F
299
  if trim_to_6s:
300
- # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
301
- max_frames_5s = 121
302
- target_frames = min(F, max_frames_5s)
303
- if F > max_frames_5s:
304
- print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_5s} frames (5 seconds)")
305
  # Use dynamically determined frame number
306
  audio_emb, audio_length = preprocess_audio(
307
  wav_path=target_resampled_audio_path,
@@ -343,14 +343,14 @@ def process_audio_features(
343
  target_resampled_audio_path,
344
  )
345
  with torch.no_grad():
346
- # Fast mode: if trim_to_6s, limit to 5 seconds
347
  target_frames = F
348
  if trim_to_6s:
349
- # 5秒固定为121帧(4n+1格式:5秒*24fps=120帧,向上取整为121帧)
350
- max_frames_5s = 121
351
- target_frames = min(F, max_frames_5s)
352
- if F > max_frames_5s:
353
- print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_5s} frames (5 seconds)")
354
  # Use dynamically determined frame number
355
  audio_emb, audio_length = preprocess_audio(
356
  wav_path=audio,
 
118
  half_dtype=None,
119
  preprocess_audio=None,
120
  resample_audio=None,
121
+ trim_to_6s=False, # Fast mode: trim audio to 4 seconds
122
  ):
123
  """
124
  Process audio files and extract audio features.
 
203
  total_length = sum(audio_lengths)
204
  print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
205
 
206
+ # Fast mode: trim to 4 seconds if trim_to_6s is True
207
  if trim_to_6s:
208
+ # 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
209
+ max_frames_4s = 97
210
+ if total_length > max_frames_4s:
211
+ print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_4s} frames (4 seconds)")
212
  # Truncate each audio proportionally
213
+ scale_factor = max_frames_4s / total_length
214
  cumulative_length = 0
215
  for i, audio_len in enumerate(audio_lengths):
216
  if audio_len > 0:
217
  new_audio_len = int(audio_len * scale_factor)
218
  # Ensure it fits within remaining space
219
+ remaining_space = max_frames_4s - cumulative_length
220
  new_audio_len = min(new_audio_len, remaining_space)
221
  audio_lengths[i] = new_audio_len
222
  # Truncate the corresponding raw audio feature
 
294
  with torch.no_grad():
295
  print(f"wav2vec_model: {wav2vec_model}")
296
  print(f"cache_dir:{cache_dir}")
297
+ # Fast mode: if trim_to_6s, limit to 4 seconds
298
  target_frames = F
299
  if trim_to_6s:
300
+ # 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
301
+ max_frames_4s = 97
302
+ target_frames = min(F, max_frames_4s)
303
+ if F > max_frames_4s:
304
+ print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_4s} frames (4 seconds)")
305
  # Use dynamically determined frame number
306
  audio_emb, audio_length = preprocess_audio(
307
  wav_path=target_resampled_audio_path,
 
343
  target_resampled_audio_path,
344
  )
345
  with torch.no_grad():
346
+ # Fast mode: if trim_to_6s, limit to 4 seconds
347
  target_frames = F
348
  if trim_to_6s:
349
+ # 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
350
+ max_frames_4s = 97
351
+ target_frames = min(F, max_frames_4s)
352
+ if F > max_frames_4s:
353
+ print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_4s} frames (4 seconds)")
354
  # Use dynamically determined frame number
355
  audio_emb, audio_length = preprocess_audio(
356
  wav_path=audio,