Spaces: Running on Zero
docs: update
Browse files
- app.py +21 -21
- wan/audio2video_multiID.py +1 -1
- wan/utils/infer_utils.py +20 -20
app.py
CHANGED
|
@@ -483,12 +483,12 @@ def run_graio_demo(args):
|
|
| 483 |
fps = getattr(cfg, 'fps', 24)
|
| 484 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 485 |
|
| 486 |
-
# Fast模式:如果trim_to_6s为True,强制限制为
|
| 487 |
if trim_to_6s:
|
| 488 |
-
#
|
| 489 |
-
|
| 490 |
-
current_frame_num = min(calculated_frame_num,
|
| 491 |
-
logging.warning(f"Fast mode: Audio duration exceeds
|
| 492 |
else:
|
| 493 |
current_frame_num = calculated_frame_num
|
| 494 |
|
|
@@ -684,7 +684,7 @@ def run_graio_demo(args):
|
|
| 684 |
# 固定使用8步去噪,通过关键字参数传递
|
| 685 |
kwargs['fixed_steps'] = 8
|
| 686 |
|
| 687 |
-
# Fast模式音频长度检测:检查是否超过
|
| 688 |
# 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 689 |
# sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
|
| 690 |
if len(args) >= 11:
|
|
@@ -718,19 +718,19 @@ def run_graio_demo(args):
|
|
| 718 |
if img2vid_audio_3:
|
| 719 |
audio_paths.append(img2vid_audio_3)
|
| 720 |
|
| 721 |
-
# 检测音频长度是否超过
|
| 722 |
if audio_paths and len(audio_paths) > 0:
|
| 723 |
fps = getattr(cfg, 'fps', 24)
|
| 724 |
try:
|
| 725 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 726 |
-
#
|
| 727 |
-
|
| 728 |
|
| 729 |
-
if calculated_frame_num >
|
| 730 |
-
# 超过
|
| 731 |
kwargs['trim_to_6s'] = True
|
| 732 |
calculated_duration = calculated_frame_num / fps
|
| 733 |
-
logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds
|
| 734 |
else:
|
| 735 |
kwargs['trim_to_6s'] = False
|
| 736 |
except Exception as e:
|
|
@@ -823,7 +823,7 @@ def run_graio_demo(args):
|
|
| 823 |
⚠️ Important Video Duration Limits
|
| 824 |
</div>
|
| 825 |
<div style="font-size: 14px; color: #856404; line-height: 1.6;">
|
| 826 |
-
Fast Mode: Maximum video duration should be less than
|
| 827 |
</div>
|
| 828 |
</div>
|
| 829 |
|
|
@@ -910,7 +910,7 @@ def run_graio_demo(args):
|
|
| 910 |
)
|
| 911 |
gr.Markdown("""
|
| 912 |
**Generation Modes:**
|
| 913 |
-
- **Fast Mode (120s GPU budget, suitable for any type of users)**: Fixed 8 denoising steps for quick generation. Maximum video duration:
|
| 914 |
- **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
|
| 915 |
|
| 916 |
*Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
|
|
@@ -951,7 +951,7 @@ def run_graio_demo(args):
|
|
| 951 |
# 包装函数:处理警告信息显示
|
| 952 |
def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 953 |
sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
|
| 954 |
-
# 在开始生成前先检测音频长度,如果超过
|
| 955 |
# 根据人数收集音频路径
|
| 956 |
audio_paths = []
|
| 957 |
if person_num_selector == "1 Person":
|
|
@@ -970,18 +970,18 @@ def run_graio_demo(args):
|
|
| 970 |
if img2vid_audio_3:
|
| 971 |
audio_paths.append(img2vid_audio_3)
|
| 972 |
|
| 973 |
-
# 检测音频长度是否超过
|
| 974 |
if audio_paths and len(audio_paths) > 0:
|
| 975 |
fps = getattr(cfg, 'fps', 24)
|
| 976 |
try:
|
| 977 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 978 |
-
#
|
| 979 |
-
|
| 980 |
|
| 981 |
-
if calculated_frame_num >
|
| 982 |
-
# 超过
|
| 983 |
calculated_duration = calculated_frame_num / fps
|
| 984 |
-
warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the
|
| 985 |
gr.Warning(warning_msg, duration=5)
|
| 986 |
except Exception as e:
|
| 987 |
logging.warning(f"Failed to check audio duration: {e}")
|
|
|
|
| 483 |
fps = getattr(cfg, 'fps', 24)
|
| 484 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 485 |
|
| 486 |
+
# Fast模式:如果trim_to_6s为True,强制限制为4秒对应的帧数
|
| 487 |
if trim_to_6s:
|
| 488 |
+
# 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
|
| 489 |
+
max_frames_4s = 97
|
| 490 |
+
current_frame_num = min(calculated_frame_num, max_frames_4s)
|
| 491 |
+
logging.warning(f"Fast mode: Audio duration exceeds 4 seconds. Trimming to 4 seconds ({max_frames_4s} frames). Original: {calculated_frame_num} frames")
|
| 492 |
else:
|
| 493 |
current_frame_num = calculated_frame_num
|
| 494 |
|
|
|
|
| 684 |
# 固定使用8步去噪,通过关键字参数传递
|
| 685 |
kwargs['fixed_steps'] = 8
|
| 686 |
|
| 687 |
+
# Fast模式音频长度检测:检查是否超过4秒
|
| 688 |
# 参数顺序: img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 689 |
# sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector
|
| 690 |
if len(args) >= 11:
|
|
|
|
| 718 |
if img2vid_audio_3:
|
| 719 |
audio_paths.append(img2vid_audio_3)
|
| 720 |
|
| 721 |
+
# 检测音频长度是否超过4秒
|
| 722 |
if audio_paths and len(audio_paths) > 0:
|
| 723 |
fps = getattr(cfg, 'fps', 24)
|
| 724 |
try:
|
| 725 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 726 |
+
# 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
|
| 727 |
+
max_frames_4s = 97
|
| 728 |
|
| 729 |
+
if calculated_frame_num > max_frames_4s:
|
| 730 |
+
# 超过4秒,设置trim_to_6s标记
|
| 731 |
kwargs['trim_to_6s'] = True
|
| 732 |
calculated_duration = calculated_frame_num / fps
|
| 733 |
+
logging.warning(f"Fast mode: Audio duration ({calculated_duration:.2f}s) exceeds 4 seconds limit. Will trim to 4 seconds.")
|
| 734 |
else:
|
| 735 |
kwargs['trim_to_6s'] = False
|
| 736 |
except Exception as e:
|
|
|
|
| 823 |
⚠️ Important Video Duration Limits
|
| 824 |
</div>
|
| 825 |
<div style="font-size: 14px; color: #856404; line-height: 1.6;">
|
| 826 |
+
Fast Mode: Maximum video duration should be less than 4 seconds. Audio inputs longer than 4 seconds will be trimmed to 4 seconds.<br>
|
| 827 |
</div>
|
| 828 |
</div>
|
| 829 |
|
|
|
|
| 910 |
)
|
| 911 |
gr.Markdown("""
|
| 912 |
**Generation Modes:**
|
| 913 |
+
- **Fast Mode (120s GPU budget, suitable for any type of users)**: Fixed 8 denoising steps for quick generation. Maximum video duration: 4 seconds.
|
| 914 |
- **Quality Mode (Dynamic GPU budget)**: Custom denoising steps (adjustable via "Diffusion steps" slider, default: 25 steps). GPU duration is dynamically calculated as: video_seconds × steps × 3.5 s.
|
| 915 |
|
| 916 |
*Note: Fast mode has a fixed 120s GPU budget. Quality mode dynamically allocates GPU time based on video length and denoising steps. Multi-person videos generally require longer duration and more Usage Quota for better quality.*
|
|
|
|
| 951 |
# 包装函数:处理警告信息显示
|
| 952 |
def handle_fast_generation(img2vid_image, img2vid_prompt, n_prompt, img2vid_audio_1, img2vid_audio_2, img2vid_audio_3,
|
| 953 |
sd_steps, seed, guide_scale, person_num_selector, audio_mode_selector):
|
| 954 |
+
# 在开始生成前先检测音频长度,如果超过4秒立即显示警告
|
| 955 |
# 根据人数收集音频路径
|
| 956 |
audio_paths = []
|
| 957 |
if person_num_selector == "1 Person":
|
|
|
|
| 970 |
if img2vid_audio_3:
|
| 971 |
audio_paths.append(img2vid_audio_3)
|
| 972 |
|
| 973 |
+
# 检测音频长度是否超过4秒
|
| 974 |
if audio_paths and len(audio_paths) > 0:
|
| 975 |
fps = getattr(cfg, 'fps', 24)
|
| 976 |
try:
|
| 977 |
calculated_frame_num = calculate_frame_num_from_audio(audio_paths, fps, mode=audio_mode_selector)
|
| 978 |
+
# 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
|
| 979 |
+
max_frames_4s = 97
|
| 980 |
|
| 981 |
+
if calculated_frame_num > max_frames_4s:
|
| 982 |
+
# 超过4秒,立即显示警告
|
| 983 |
calculated_duration = calculated_frame_num / fps
|
| 984 |
+
warning_msg = f"⚠️ Warning: Your audio duration ({calculated_duration:.2f}s) exceeds the 4-second limit for Fast Mode. The audio will be automatically trimmed to 4 seconds to prevent timeout."
|
| 985 |
gr.Warning(warning_msg, duration=5)
|
| 986 |
except Exception as e:
|
| 987 |
logging.warning(f"Failed to check audio duration: {e}")
|
wan/audio2video_multiID.py
CHANGED
|
@@ -199,7 +199,7 @@ class WanAF2V:
|
|
| 199 |
audio_paths=None, # New: audio path list, supports multiple audio files
|
| 200 |
task_key=None,
|
| 201 |
mode="pad", # Audio processing mode: "pad" or "concat"
|
| 202 |
-
trim_to_6s=False, # Fast mode: trim audio to
|
| 203 |
):
|
| 204 |
r"""
|
| 205 |
Generates video frames from input image and text prompt using diffusion process.
|
|
|
|
| 199 |
audio_paths=None, # New: audio path list, supports multiple audio files
|
| 200 |
task_key=None,
|
| 201 |
mode="pad", # Audio processing mode: "pad" or "concat"
|
| 202 |
+
trim_to_6s=False, # Fast mode: trim audio to 4 seconds
|
| 203 |
):
|
| 204 |
r"""
|
| 205 |
Generates video frames from input image and text prompt using diffusion process.
|
wan/utils/infer_utils.py
CHANGED
|
@@ -118,7 +118,7 @@ def process_audio_features(
|
|
| 118 |
half_dtype=None,
|
| 119 |
preprocess_audio=None,
|
| 120 |
resample_audio=None,
|
| 121 |
-
trim_to_6s=False, # Fast mode: trim audio to
|
| 122 |
):
|
| 123 |
"""
|
| 124 |
Process audio files and extract audio features.
|
|
@@ -203,20 +203,20 @@ def process_audio_features(
|
|
| 203 |
total_length = sum(audio_lengths)
|
| 204 |
print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
|
| 205 |
|
| 206 |
-
# Fast mode: trim to
|
| 207 |
if trim_to_6s:
|
| 208 |
-
#
|
| 209 |
-
|
| 210 |
-
if total_length >
|
| 211 |
-
print(f"Fast mode: Trimming audio from {total_length} frames to {
|
| 212 |
# Truncate each audio proportionally
|
| 213 |
-
scale_factor =
|
| 214 |
cumulative_length = 0
|
| 215 |
for i, audio_len in enumerate(audio_lengths):
|
| 216 |
if audio_len > 0:
|
| 217 |
new_audio_len = int(audio_len * scale_factor)
|
| 218 |
# Ensure it fits within remaining space
|
| 219 |
-
remaining_space =
|
| 220 |
new_audio_len = min(new_audio_len, remaining_space)
|
| 221 |
audio_lengths[i] = new_audio_len
|
| 222 |
# Truncate the corresponding raw audio feature
|
|
@@ -294,14 +294,14 @@ def process_audio_features(
|
|
| 294 |
with torch.no_grad():
|
| 295 |
print(f"wav2vec_model: {wav2vec_model}")
|
| 296 |
print(f"cache_dir:{cache_dir}")
|
| 297 |
-
# Fast mode: if trim_to_6s, limit to
|
| 298 |
target_frames = F
|
| 299 |
if trim_to_6s:
|
| 300 |
-
#
|
| 301 |
-
|
| 302 |
-
target_frames = min(F,
|
| 303 |
-
if F >
|
| 304 |
-
print(f"Fast mode: Trimming audio {i} from {F} frames to {
|
| 305 |
# Use dynamically determined frame number
|
| 306 |
audio_emb, audio_length = preprocess_audio(
|
| 307 |
wav_path=target_resampled_audio_path,
|
|
@@ -343,14 +343,14 @@ def process_audio_features(
|
|
| 343 |
target_resampled_audio_path,
|
| 344 |
)
|
| 345 |
with torch.no_grad():
|
| 346 |
-
# Fast mode: if trim_to_6s, limit to
|
| 347 |
target_frames = F
|
| 348 |
if trim_to_6s:
|
| 349 |
-
#
|
| 350 |
-
|
| 351 |
-
target_frames = min(F,
|
| 352 |
-
if F >
|
| 353 |
-
print(f"Fast mode: Trimming single audio from {F} frames to {
|
| 354 |
# Use dynamically determined frame number
|
| 355 |
audio_emb, audio_length = preprocess_audio(
|
| 356 |
wav_path=audio,
|
|
|
|
| 118 |
half_dtype=None,
|
| 119 |
preprocess_audio=None,
|
| 120 |
resample_audio=None,
|
| 121 |
+
trim_to_6s=False, # Fast mode: trim audio to 4 seconds
|
| 122 |
):
|
| 123 |
"""
|
| 124 |
Process audio files and extract audio features.
|
|
|
|
| 203 |
total_length = sum(audio_lengths)
|
| 204 |
print(f"Total audio length in concat mode (from processed frames): {total_length} frames")
|
| 205 |
|
| 206 |
+
# Fast mode: trim to 4 seconds if trim_to_6s is True
|
| 207 |
if trim_to_6s:
|
| 208 |
+
# 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
|
| 209 |
+
max_frames_4s = 97
|
| 210 |
+
if total_length > max_frames_4s:
|
| 211 |
+
print(f"Fast mode: Trimming audio from {total_length} frames to {max_frames_4s} frames (4 seconds)")
|
| 212 |
# Truncate each audio proportionally
|
| 213 |
+
scale_factor = max_frames_4s / total_length
|
| 214 |
cumulative_length = 0
|
| 215 |
for i, audio_len in enumerate(audio_lengths):
|
| 216 |
if audio_len > 0:
|
| 217 |
new_audio_len = int(audio_len * scale_factor)
|
| 218 |
# Ensure it fits within remaining space
|
| 219 |
+
remaining_space = max_frames_4s - cumulative_length
|
| 220 |
new_audio_len = min(new_audio_len, remaining_space)
|
| 221 |
audio_lengths[i] = new_audio_len
|
| 222 |
# Truncate the corresponding raw audio feature
|
|
|
|
| 294 |
with torch.no_grad():
|
| 295 |
print(f"wav2vec_model: {wav2vec_model}")
|
| 296 |
print(f"cache_dir:{cache_dir}")
|
| 297 |
+
# Fast mode: if trim_to_6s, limit to 4 seconds
|
| 298 |
target_frames = F
|
| 299 |
if trim_to_6s:
|
| 300 |
+
# 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
|
| 301 |
+
max_frames_4s = 97
|
| 302 |
+
target_frames = min(F, max_frames_4s)
|
| 303 |
+
if F > max_frames_4s:
|
| 304 |
+
print(f"Fast mode: Trimming audio {i} from {F} frames to {max_frames_4s} frames (4 seconds)")
|
| 305 |
# Use dynamically determined frame number
|
| 306 |
audio_emb, audio_length = preprocess_audio(
|
| 307 |
wav_path=target_resampled_audio_path,
|
|
|
|
| 343 |
target_resampled_audio_path,
|
| 344 |
)
|
| 345 |
with torch.no_grad():
|
| 346 |
+
# Fast mode: if trim_to_6s, limit to 4 seconds
|
| 347 |
target_frames = F
|
| 348 |
if trim_to_6s:
|
| 349 |
+
# 4秒固定为97帧(4n+1格式:4秒*24fps=96帧,向上取整为97帧)
|
| 350 |
+
max_frames_4s = 97
|
| 351 |
+
target_frames = min(F, max_frames_4s)
|
| 352 |
+
if F > max_frames_4s:
|
| 353 |
+
print(f"Fast mode: Trimming single audio from {F} frames to {max_frames_4s} frames (4 seconds)")
|
| 354 |
# Use dynamically determined frame number
|
| 355 |
audio_emb, audio_length = preprocess_audio(
|
| 356 |
wav_path=audio,
|