Spaces:
Running on Zero
Running on Zero
Commit ·
9f79da8
1
Parent(s): cc23c05
Add verbose debug logging throughout TARO generation pipeline
Browse files
Add print statements in _taro_gpu_infer (ctx load, segments, results
count), _post_process_samples (per-sample wav/audio/video existence),
generate_taro (every stage), _run_taro (try/except with traceback),
_unpack_outputs (per-slot vid/aud/meta), and _build_waveform_html
(audio path existence check) to pinpoint where generation fails.
app.py
CHANGED
|
@@ -825,14 +825,18 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
|
|
| 825 |
Returns a list of (video_path, audio_path, seg_meta) tuples.
|
| 826 |
"""
|
| 827 |
outputs = []
|
|
|
|
| 828 |
for sample_idx, result in enumerate(results):
|
| 829 |
seg_wavs = result[0]
|
|
|
|
| 830 |
|
| 831 |
full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr, segments)
|
| 832 |
audio_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.wav")
|
| 833 |
_save_wav(audio_path, full_wav, sr)
|
|
|
|
| 834 |
video_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.mp4")
|
| 835 |
mux_video_audio(silent_video, audio_path, video_path, model=model)
|
|
|
|
| 836 |
wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"{model}_{sample_idx}")
|
| 837 |
|
| 838 |
extras = extra_meta_fn(sample_idx, result, tmp_dir) if extra_meta_fn else {}
|
|
@@ -843,6 +847,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
|
|
| 843 |
total_dur_s=total_dur_s, source_video=source_video, **extras,
|
| 844 |
)
|
| 845 |
outputs.append((video_path, audio_path, seg_meta))
|
|
|
|
| 846 |
return outputs
|
| 847 |
|
| 848 |
|
|
@@ -875,10 +880,12 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 875 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 876 |
|
| 877 |
ctx = _ctx_load("taro_gpu_infer")
|
|
|
|
| 878 |
tmp_dir = ctx["tmp_dir"]
|
| 879 |
silent_video = ctx["silent_video"]
|
| 880 |
segments = ctx["segments"]
|
| 881 |
total_dur_s = ctx["total_dur_s"]
|
|
|
|
| 882 |
|
| 883 |
extract_cavp, onset_model = _load_taro_feature_extractors(device)
|
| 884 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
|
@@ -931,6 +938,7 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 931 |
if torch.cuda.is_available():
|
| 932 |
torch.cuda.empty_cache()
|
| 933 |
|
|
|
|
| 934 |
return results
|
| 935 |
|
| 936 |
|
|
@@ -938,6 +946,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 938 |
crossfade_s, crossfade_db, num_samples):
|
| 939 |
"""TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window.
|
| 940 |
CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
|
|
|
|
| 941 |
crossfade_s = float(crossfade_s)
|
| 942 |
crossfade_db = float(crossfade_db)
|
| 943 |
num_samples = int(num_samples)
|
|
@@ -945,6 +954,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 945 |
# ── CPU pre-processing (no GPU needed) ──
|
| 946 |
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 947 |
video_file, TARO_MODEL_DUR, crossfade_s)
|
|
|
|
| 948 |
|
| 949 |
_ctx_store("taro_gpu_infer", {
|
| 950 |
"tmp_dir": tmp_dir, "silent_video": silent_video,
|
|
@@ -972,7 +982,9 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 972 |
_feats_saved = True
|
| 973 |
return (wavs, cavp_feats, onset_feats)
|
| 974 |
|
|
|
|
| 975 |
results = [_upsample_and_save_feats(r) for r in results]
|
|
|
|
| 976 |
|
| 977 |
def _taro_extras(sample_idx, result, td):
|
| 978 |
return {"cavp_path": cavp_path, "onset_path": onset_path}
|
|
@@ -985,7 +997,10 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 985 |
source_video=video_file,
|
| 986 |
extra_meta_fn=_taro_extras,
|
| 987 |
)
|
| 988 |
-
|
|
|
|
|
|
|
|
|
|
| 989 |
|
| 990 |
|
| 991 |
# ================================================================== #
|
|
@@ -2021,7 +2036,9 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
|
|
| 2021 |
The waveform is SILENT. The playhead tracks the Gradio <video> element
|
| 2022 |
in the same slot via its timeupdate event.
|
| 2023 |
"""
|
|
|
|
| 2024 |
if not audio_path or not os.path.exists(audio_path):
|
|
|
|
| 2025 |
return "<p style='color:#888;font-size:12px'>No audio yet.</p>"
|
| 2026 |
|
| 2027 |
# Serve audio via Gradio's file API instead of base64-encoding the entire
|
|
@@ -2338,12 +2355,14 @@ def _unpack_outputs(flat: list, n: int, tab_prefix: str) -> list:
|
|
| 2338 |
can read it when calling the Gradio queue API for regen.
|
| 2339 |
"""
|
| 2340 |
n = int(n)
|
|
|
|
| 2341 |
vid_updates = []
|
| 2342 |
wave_updates = []
|
| 2343 |
for i in range(MAX_SLOTS):
|
| 2344 |
vid_path = flat[i * 3]
|
| 2345 |
aud_path = flat[i * 3 + 1]
|
| 2346 |
meta = flat[i * 3 + 2]
|
|
|
|
| 2347 |
vid_updates.append(gr.update(value=vid_path))
|
| 2348 |
if aud_path and meta:
|
| 2349 |
slot_id = f"{tab_prefix}_{i}"
|
|
@@ -2865,8 +2884,18 @@ with gr.Blocks(title="Generate Audio for Video", css=_SLOT_CSS, js=_GLOBAL_JS) a
|
|
| 2865 |
)
|
| 2866 |
|
| 2867 |
def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
|
| 2868 |
-
|
| 2869 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2870 |
|
| 2871 |
# Split group visibility into a separate .then() to avoid Gradio 5 SSR
|
| 2872 |
# "Too many arguments" caused by including gr.Group in mixed output lists.
|
|
|
|
| 825 |
Returns a list of (video_path, audio_path, seg_meta) tuples.
|
| 826 |
"""
|
| 827 |
outputs = []
|
| 828 |
+
print(f"[_post_process_samples] model={model} num_results={len(results)} tmp_dir={tmp_dir!r}")
|
| 829 |
for sample_idx, result in enumerate(results):
|
| 830 |
seg_wavs = result[0]
|
| 831 |
+
print(f"[_post_process_samples] sample {sample_idx}: seg_wavs count={len(seg_wavs) if seg_wavs else None}")
|
| 832 |
|
| 833 |
full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr, segments)
|
| 834 |
audio_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.wav")
|
| 835 |
_save_wav(audio_path, full_wav, sr)
|
| 836 |
+
print(f"[_post_process_samples] sample {sample_idx}: saved audio={audio_path!r} exists={os.path.exists(audio_path)}")
|
| 837 |
video_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.mp4")
|
| 838 |
mux_video_audio(silent_video, audio_path, video_path, model=model)
|
| 839 |
+
print(f"[_post_process_samples] sample {sample_idx}: muxed video={video_path!r} exists={os.path.exists(video_path)}")
|
| 840 |
wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"{model}_{sample_idx}")
|
| 841 |
|
| 842 |
extras = extra_meta_fn(sample_idx, result, tmp_dir) if extra_meta_fn else {}
|
|
|
|
| 847 |
total_dur_s=total_dur_s, source_video=source_video, **extras,
|
| 848 |
)
|
| 849 |
outputs.append((video_path, audio_path, seg_meta))
|
| 850 |
+
print(f"[_post_process_samples] returning {len(outputs)} output(s)")
|
| 851 |
return outputs
|
| 852 |
|
| 853 |
|
|
|
|
| 880 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 881 |
|
| 882 |
ctx = _ctx_load("taro_gpu_infer")
|
| 883 |
+
print(f"[_taro_gpu_infer] ctx keys={list(ctx.keys())}")
|
| 884 |
tmp_dir = ctx["tmp_dir"]
|
| 885 |
silent_video = ctx["silent_video"]
|
| 886 |
segments = ctx["segments"]
|
| 887 |
total_dur_s = ctx["total_dur_s"]
|
| 888 |
+
print(f"[_taro_gpu_infer] tmp_dir={tmp_dir!r} silent_video={silent_video!r} segments={segments} total_dur_s={total_dur_s}")
|
| 889 |
|
| 890 |
extract_cavp, onset_model = _load_taro_feature_extractors(device)
|
| 891 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
|
|
|
| 938 |
if torch.cuda.is_available():
|
| 939 |
torch.cuda.empty_cache()
|
| 940 |
|
| 941 |
+
print(f"[_taro_gpu_infer] returning {len(results)} results")
|
| 942 |
return results
|
| 943 |
|
| 944 |
|
|
|
|
| 946 |
crossfade_s, crossfade_db, num_samples):
|
| 947 |
"""TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window.
|
| 948 |
CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
|
| 949 |
+
print(f"[generate_taro] START video_file={video_file!r} num_samples={num_samples}")
|
| 950 |
crossfade_s = float(crossfade_s)
|
| 951 |
crossfade_db = float(crossfade_db)
|
| 952 |
num_samples = int(num_samples)
|
|
|
|
| 954 |
# ── CPU pre-processing (no GPU needed) ──
|
| 955 |
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 956 |
video_file, TARO_MODEL_DUR, crossfade_s)
|
| 957 |
+
print(f"[generate_taro] preprocess done: total_dur_s={total_dur_s:.2f} segments={segments} tmp_dir={tmp_dir}")
|
| 958 |
|
| 959 |
_ctx_store("taro_gpu_infer", {
|
| 960 |
"tmp_dir": tmp_dir, "silent_video": silent_video,
|
|
|
|
| 982 |
_feats_saved = True
|
| 983 |
return (wavs, cavp_feats, onset_feats)
|
| 984 |
|
| 985 |
+
print(f"[generate_taro] gpu_infer returned {len(results)} result(s)")
|
| 986 |
results = [_upsample_and_save_feats(r) for r in results]
|
| 987 |
+
print(f"[generate_taro] upsample done, cavp_path={cavp_path} onset_path={onset_path}")
|
| 988 |
|
| 989 |
def _taro_extras(sample_idx, result, td):
|
| 990 |
return {"cavp_path": cavp_path, "onset_path": onset_path}
|
|
|
|
| 997 |
source_video=video_file,
|
| 998 |
extra_meta_fn=_taro_extras,
|
| 999 |
)
|
| 1000 |
+
print(f"[generate_taro] post_process done: {len(outputs)} output(s)")
|
| 1001 |
+
padded = _pad_outputs(outputs)
|
| 1002 |
+
print(f"[generate_taro] padded outputs: {[(type(x).__name__, x is not None) for x in padded[:6]]}")
|
| 1003 |
+
return padded
|
| 1004 |
|
| 1005 |
|
| 1006 |
# ================================================================== #
|
|
|
|
| 2036 |
The waveform is SILENT. The playhead tracks the Gradio <video> element
|
| 2037 |
in the same slot via its timeupdate event.
|
| 2038 |
"""
|
| 2039 |
+
print(f"[_build_waveform_html] audio_path={audio_path!r} exists={os.path.exists(audio_path) if audio_path else False} slot={slot_id}")
|
| 2040 |
if not audio_path or not os.path.exists(audio_path):
|
| 2041 |
+
print(f"[_build_waveform_html] returning placeholder — audio missing")
|
| 2042 |
return "<p style='color:#888;font-size:12px'>No audio yet.</p>"
|
| 2043 |
|
| 2044 |
# Serve audio via Gradio's file API instead of base64-encoding the entire
|
|
|
|
| 2355 |
can read it when calling the Gradio queue API for regen.
|
| 2356 |
"""
|
| 2357 |
n = int(n)
|
| 2358 |
+
print(f"[_unpack_outputs] tab={tab_prefix} n={n} flat_len={len(flat)}")
|
| 2359 |
vid_updates = []
|
| 2360 |
wave_updates = []
|
| 2361 |
for i in range(MAX_SLOTS):
|
| 2362 |
vid_path = flat[i * 3]
|
| 2363 |
aud_path = flat[i * 3 + 1]
|
| 2364 |
meta = flat[i * 3 + 2]
|
| 2365 |
+
print(f"[_unpack_outputs] slot {i}: vid={vid_path is not None} aud={aud_path!r} meta={meta is not None}")
|
| 2366 |
vid_updates.append(gr.update(value=vid_path))
|
| 2367 |
if aud_path and meta:
|
| 2368 |
slot_id = f"{tab_prefix}_{i}"
|
|
|
|
| 2884 |
)
|
| 2885 |
|
| 2886 |
def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
|
| 2887 |
+
print(f"[_run_taro] called video={video!r} n={n}")
|
| 2888 |
+
try:
|
| 2889 |
+
flat = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
|
| 2890 |
+
print(f"[_run_taro] generate_taro returned flat_len={len(flat) if flat else None}")
|
| 2891 |
+
result = _unpack_outputs(flat, n, "taro")
|
| 2892 |
+
print(f"[_run_taro] _unpack_outputs returned {len(result)} updates")
|
| 2893 |
+
return result
|
| 2894 |
+
except Exception as e:
|
| 2895 |
+
import traceback
|
| 2896 |
+
print(f"[_run_taro] EXCEPTION: {e}")
|
| 2897 |
+
traceback.print_exc()
|
| 2898 |
+
raise
|
| 2899 |
|
| 2900 |
# Split group visibility into a separate .then() to avoid Gradio 5 SSR
|
| 2901 |
# "Too many arguments" caused by including gr.Group in mixed output lists.
|