Spaces:
Running on Zero
Running on Zero
Commit ·
9f79da8
1
Parent(s): cc23c05
Add verbose debug logging throughout TARO generation pipeline
Browse files
Add print statements in _taro_gpu_infer (ctx load, segments, results
count), _post_process_samples (per-sample wav/audio/video existence),
generate_taro (every stage), _run_taro (try/except with traceback),
_unpack_outputs (per-slot vid/aud/meta), and _build_waveform_html
(audio path existence check) to pinpoint where generation fails.
app.py
CHANGED
|
@@ -825,14 +825,18 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
|
|
| 825 |
Returns a list of (video_path, audio_path, seg_meta) tuples.
|
| 826 |
"""
|
| 827 |
outputs = []
|
|
|
|
| 828 |
for sample_idx, result in enumerate(results):
|
| 829 |
seg_wavs = result[0]
|
|
|
|
| 830 |
|
| 831 |
full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr, segments)
|
| 832 |
audio_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.wav")
|
| 833 |
_save_wav(audio_path, full_wav, sr)
|
|
|
|
| 834 |
video_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.mp4")
|
| 835 |
mux_video_audio(silent_video, audio_path, video_path, model=model)
|
|
|
|
| 836 |
wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"{model}_{sample_idx}")
|
| 837 |
|
| 838 |
extras = extra_meta_fn(sample_idx, result, tmp_dir) if extra_meta_fn else {}
|
|
@@ -843,6 +847,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
|
|
| 843 |
total_dur_s=total_dur_s, source_video=source_video, **extras,
|
| 844 |
)
|
| 845 |
outputs.append((video_path, audio_path, seg_meta))
|
|
|
|
| 846 |
return outputs
|
| 847 |
|
| 848 |
|
|
@@ -875,10 +880,12 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 875 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 876 |
|
| 877 |
ctx = _ctx_load("taro_gpu_infer")
|
|
|
|
| 878 |
tmp_dir = ctx["tmp_dir"]
|
| 879 |
silent_video = ctx["silent_video"]
|
| 880 |
segments = ctx["segments"]
|
| 881 |
total_dur_s = ctx["total_dur_s"]
|
|
|
|
| 882 |
|
| 883 |
extract_cavp, onset_model = _load_taro_feature_extractors(device)
|
| 884 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
|
@@ -931,6 +938,7 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 931 |
if torch.cuda.is_available():
|
| 932 |
torch.cuda.empty_cache()
|
| 933 |
|
|
|
|
| 934 |
return results
|
| 935 |
|
| 936 |
|
|
@@ -938,6 +946,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 938 |
crossfade_s, crossfade_db, num_samples):
|
| 939 |
"""TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window.
|
| 940 |
CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
|
|
|
|
| 941 |
crossfade_s = float(crossfade_s)
|
| 942 |
crossfade_db = float(crossfade_db)
|
| 943 |
num_samples = int(num_samples)
|
|
@@ -945,6 +954,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 945 |
# ── CPU pre-processing (no GPU needed) ──
|
| 946 |
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 947 |
video_file, TARO_MODEL_DUR, crossfade_s)
|
|
|
|
| 948 |
|
| 949 |
_ctx_store("taro_gpu_infer", {
|
| 950 |
"tmp_dir": tmp_dir, "silent_video": silent_video,
|
|
@@ -972,7 +982,9 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 972 |
_feats_saved = True
|
| 973 |
return (wavs, cavp_feats, onset_feats)
|
| 974 |
|
|
|
|
| 975 |
results = [_upsample_and_save_feats(r) for r in results]
|
|
|
|
| 976 |
|
| 977 |
def _taro_extras(sample_idx, result, td):
|
| 978 |
return {"cavp_path": cavp_path, "onset_path": onset_path}
|
|
@@ -985,7 +997,10 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
|
|
| 985 |
source_video=video_file,
|
| 986 |
extra_meta_fn=_taro_extras,
|
| 987 |
)
|
| 988 |
-
|
|
|
|
|
|
|
|
|
|
| 989 |
|
| 990 |
|
| 991 |
# ================================================================== #
|
|
@@ -2021,7 +2036,9 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
|
|
| 2021 |
The waveform is SILENT. The playhead tracks the Gradio <video> element
|
| 2022 |
in the same slot via its timeupdate event.
|
| 2023 |
"""
|
|
|
|
| 2024 |
if not audio_path or not os.path.exists(audio_path):
|
|
|
|
| 2025 |
return "<p style='color:#888;font-size:12px'>No audio yet.</p>"
|
| 2026 |
|
| 2027 |
# Serve audio via Gradio's file API instead of base64-encoding the entire
|
|
@@ -2338,12 +2355,14 @@ def _unpack_outputs(flat: list, n: int, tab_prefix: str) -> list:
|
|
| 2338 |
can read it when calling the Gradio queue API for regen.
|
| 2339 |
"""
|
| 2340 |
n = int(n)
|
|
|
|
| 2341 |
vid_updates = []
|
| 2342 |
wave_updates = []
|
| 2343 |
for i in range(MAX_SLOTS):
|
| 2344 |
vid_path = flat[i * 3]
|
| 2345 |
aud_path = flat[i * 3 + 1]
|
| 2346 |
meta = flat[i * 3 + 2]
|
|
|
|
| 2347 |
vid_updates.append(gr.update(value=vid_path))
|
| 2348 |
if aud_path and meta:
|
| 2349 |
slot_id = f"{tab_prefix}_{i}"
|
|
@@ -2865,8 +2884,18 @@ with gr.Blocks(title="Generate Audio for Video", css=_SLOT_CSS, js=_GLOBAL_JS) a
|
|
| 2865 |
)
|
| 2866 |
|
| 2867 |
def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
|
| 2868 |
-
|
| 2869 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2870 |
|
| 2871 |
# Split group visibility into a separate .then() to avoid Gradio 5 SSR
|
| 2872 |
# "Too many arguments" caused by including gr.Group in mixed output lists.
|
|
|
|
| 825 |
Returns a list of (video_path, audio_path, seg_meta) tuples.
|
| 826 |
"""
|
| 827 |
outputs = []
|
| 828 |
+
print(f"[_post_process_samples] model={model} num_results={len(results)} tmp_dir={tmp_dir!r}")
|
| 829 |
for sample_idx, result in enumerate(results):
|
| 830 |
seg_wavs = result[0]
|
| 831 |
+
print(f"[_post_process_samples] sample {sample_idx}: seg_wavs count={len(seg_wavs) if seg_wavs else None}")
|
| 832 |
|
| 833 |
full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr, segments)
|
| 834 |
audio_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.wav")
|
| 835 |
_save_wav(audio_path, full_wav, sr)
|
| 836 |
+
print(f"[_post_process_samples] sample {sample_idx}: saved audio={audio_path!r} exists={os.path.exists(audio_path)}")
|
| 837 |
video_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.mp4")
|
| 838 |
mux_video_audio(silent_video, audio_path, video_path, model=model)
|
| 839 |
+
print(f"[_post_process_samples] sample {sample_idx}: muxed video={video_path!r} exists={os.path.exists(video_path)}")
|
| 840 |
wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"{model}_{sample_idx}")
|
| 841 |
|
| 842 |
extras = extra_meta_fn(sample_idx, result, tmp_dir) if extra_meta_fn else {}
|
|
|
|
| 847 |
total_dur_s=total_dur_s, source_video=source_video, **extras,
|
| 848 |
)
|
| 849 |
outputs.append((video_path, audio_path, seg_meta))
|
| 850 |
+
print(f"[_post_process_samples] returning {len(outputs)} output(s)")
|
| 851 |
return outputs
|
| 852 |
|
| 853 |
|
|
|
|
| 880 |
from TARO.samplers import euler_sampler, euler_maruyama_sampler
|
| 881 |
|
| 882 |
ctx = _ctx_load("taro_gpu_infer")
|
| 883 |
+
print(f"[_taro_gpu_infer] ctx keys={list(ctx.keys())}")
|
| 884 |
tmp_dir = ctx["tmp_dir"]
|
| 885 |
silent_video = ctx["silent_video"]
|
| 886 |
segments = ctx["segments"]
|
| 887 |
total_dur_s = ctx["total_dur_s"]
|
| 888 |
+
print(f"[_taro_gpu_infer] tmp_dir={tmp_dir!r} silent_video={silent_video!r} segments={segments} total_dur_s={total_dur_s}")
|
| 889 |
|
| 890 |
extract_cavp, onset_model = _load_taro_feature_extractors(device)
|
| 891 |
cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
|
|
|
|
| 938 |
if torch.cuda.is_available():
|
| 939 |
torch.cuda.empty_cache()
|
| 940 |
|
| 941 |
+
print(f"[_taro_gpu_infer] returning {len(results)} results")
|
| 942 |
return results
|
| 943 |
|
| 944 |
|
|
|
|
| 946 |
crossfade_s, crossfade_db, num_samples):
|
| 947 |
"""TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window.
|
| 948 |
CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
|
| 949 |
+
print(f"[generate_taro] START video_file={video_file!r} num_samples={num_samples}")
|
| 950 |
crossfade_s = float(crossfade_s)
|
| 951 |
crossfade_db = float(crossfade_db)
|
| 952 |
num_samples = int(num_samples)
|
|
|
|
| 954 |
# ── CPU pre-processing (no GPU needed) ──
|
| 955 |
tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
|
| 956 |
video_file, TARO_MODEL_DUR, crossfade_s)
|
| 957 |
+
print(f"[generate_taro] preprocess done: total_dur_s={total_dur_s:.2f} segments={segments} tmp_dir={tmp_dir}")
|
| 958 |
|
| 959 |
_ctx_store("taro_gpu_infer", {
|
| 960 |
"tmp_dir": tmp_dir, "silent_video": silent_video,
|
|
|
|
| 982 |
_feats_saved = True
|
| 983 |
return (wavs, cavp_feats, onset_feats)
|
| 984 |
|
| 985 |
+
print(f"[generate_taro] gpu_infer returned {len(results)} result(s)")
|
| 986 |
results = [_upsample_and_save_feats(r) for r in results]
|
| 987 |
+
print(f"[generate_taro] upsample done, cavp_path={cavp_path} onset_path={onset_path}")
|
| 988 |
|
| 989 |
def _taro_extras(sample_idx, result, td):
|
| 990 |
return {"cavp_path": cavp_path, "onset_path": onset_path}
|
|
|
|
| 997 |
source_video=video_file,
|
| 998 |
extra_meta_fn=_taro_extras,
|
| 999 |
)
|
| 1000 |
+
print(f"[generate_taro] post_process done: {len(outputs)} output(s)")
|
| 1001 |
+
padded = _pad_outputs(outputs)
|
| 1002 |
+
print(f"[generate_taro] padded outputs: {[(type(x).__name__, x is not None) for x in padded[:6]]}")
|
| 1003 |
+
return padded
|
| 1004 |
|
| 1005 |
|
| 1006 |
# ================================================================== #
|
|
|
|
| 2036 |
The waveform is SILENT. The playhead tracks the Gradio <video> element
|
| 2037 |
in the same slot via its timeupdate event.
|
| 2038 |
"""
|
| 2039 |
+
print(f"[_build_waveform_html] audio_path={audio_path!r} exists={os.path.exists(audio_path) if audio_path else False} slot={slot_id}")
|
| 2040 |
if not audio_path or not os.path.exists(audio_path):
|
| 2041 |
+
print(f"[_build_waveform_html] returning placeholder — audio missing")
|
| 2042 |
return "<p style='color:#888;font-size:12px'>No audio yet.</p>"
|
| 2043 |
|
| 2044 |
# Serve audio via Gradio's file API instead of base64-encoding the entire
|
|
|
|
| 2355 |
can read it when calling the Gradio queue API for regen.
|
| 2356 |
"""
|
| 2357 |
n = int(n)
|
| 2358 |
+
print(f"[_unpack_outputs] tab={tab_prefix} n={n} flat_len={len(flat)}")
|
| 2359 |
vid_updates = []
|
| 2360 |
wave_updates = []
|
| 2361 |
for i in range(MAX_SLOTS):
|
| 2362 |
vid_path = flat[i * 3]
|
| 2363 |
aud_path = flat[i * 3 + 1]
|
| 2364 |
meta = flat[i * 3 + 2]
|
| 2365 |
+
print(f"[_unpack_outputs] slot {i}: vid={vid_path is not None} aud={aud_path!r} meta={meta is not None}")
|
| 2366 |
vid_updates.append(gr.update(value=vid_path))
|
| 2367 |
if aud_path and meta:
|
| 2368 |
slot_id = f"{tab_prefix}_{i}"
|
|
|
|
| 2884 |
)
|
| 2885 |
|
| 2886 |
def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
|
| 2887 |
+
print(f"[_run_taro] called video={video!r} n={n}")
|
| 2888 |
+
try:
|
| 2889 |
+
flat = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
|
| 2890 |
+
print(f"[_run_taro] generate_taro returned flat_len={len(flat) if flat else None}")
|
| 2891 |
+
result = _unpack_outputs(flat, n, "taro")
|
| 2892 |
+
print(f"[_run_taro] _unpack_outputs returned {len(result)} updates")
|
| 2893 |
+
return result
|
| 2894 |
+
except Exception as e:
|
| 2895 |
+
import traceback
|
| 2896 |
+
print(f"[_run_taro] EXCEPTION: {e}")
|
| 2897 |
+
traceback.print_exc()
|
| 2898 |
+
raise
|
| 2899 |
|
| 2900 |
# Split group visibility into a separate .then() to avoid Gradio 5 SSR
|
| 2901 |
# "Too many arguments" caused by including gr.Group in mixed output lists.
|