BoxOfColors committed on
Commit
9f79da8
·
1 Parent(s): cc23c05

Add verbose debug logging throughout TARO generation pipeline

Browse files

Add print statements in _taro_gpu_infer (ctx load, segments, results
count), _post_process_samples (per-sample wav/audio/video existence),
generate_taro (every stage), _run_taro (try/except with traceback),
_unpack_outputs (per-slot vid/aud/meta), and _build_waveform_html
(audio path existence check) to pinpoint where generation fails.

Files changed (1) hide show
  1. app.py +32 -3
app.py CHANGED
@@ -825,14 +825,18 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
825
  Returns a list of (video_path, audio_path, seg_meta) tuples.
826
  """
827
  outputs = []
 
828
  for sample_idx, result in enumerate(results):
829
  seg_wavs = result[0]
 
830
 
831
  full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr, segments)
832
  audio_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.wav")
833
  _save_wav(audio_path, full_wav, sr)
 
834
  video_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.mp4")
835
  mux_video_audio(silent_video, audio_path, video_path, model=model)
 
836
  wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"{model}_{sample_idx}")
837
 
838
  extras = extra_meta_fn(sample_idx, result, tmp_dir) if extra_meta_fn else {}
@@ -843,6 +847,7 @@ def _post_process_samples(results: list, *, model: str, tmp_dir: str,
843
  total_dur_s=total_dur_s, source_video=source_video, **extras,
844
  )
845
  outputs.append((video_path, audio_path, seg_meta))
 
846
  return outputs
847
 
848
 
@@ -875,10 +880,12 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
875
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
876
 
877
  ctx = _ctx_load("taro_gpu_infer")
 
878
  tmp_dir = ctx["tmp_dir"]
879
  silent_video = ctx["silent_video"]
880
  segments = ctx["segments"]
881
  total_dur_s = ctx["total_dur_s"]
 
882
 
883
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
884
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
@@ -931,6 +938,7 @@ def _taro_gpu_infer(video_file, seed_val, cfg_scale, num_steps, mode,
931
  if torch.cuda.is_available():
932
  torch.cuda.empty_cache()
933
 
 
934
  return results
935
 
936
 
@@ -938,6 +946,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
938
  crossfade_s, crossfade_db, num_samples):
939
  """TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window.
940
  CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
 
941
  crossfade_s = float(crossfade_s)
942
  crossfade_db = float(crossfade_db)
943
  num_samples = int(num_samples)
@@ -945,6 +954,7 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
945
  # ── CPU pre-processing (no GPU needed) ──
946
  tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
947
  video_file, TARO_MODEL_DUR, crossfade_s)
 
948
 
949
  _ctx_store("taro_gpu_infer", {
950
  "tmp_dir": tmp_dir, "silent_video": silent_video,
@@ -972,7 +982,9 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
972
  _feats_saved = True
973
  return (wavs, cavp_feats, onset_feats)
974
 
 
975
  results = [_upsample_and_save_feats(r) for r in results]
 
976
 
977
  def _taro_extras(sample_idx, result, td):
978
  return {"cavp_path": cavp_path, "onset_path": onset_path}
@@ -985,7 +997,10 @@ def generate_taro(video_file, seed_val, cfg_scale, num_steps, mode,
985
  source_video=video_file,
986
  extra_meta_fn=_taro_extras,
987
  )
988
- return _pad_outputs(outputs)
 
 
 
989
 
990
 
991
  # ================================================================== #
@@ -2021,7 +2036,9 @@ def _build_waveform_html(audio_path: str, segments: list, slot_id: str,
2021
  The waveform is SILENT. The playhead tracks the Gradio <video> element
2022
  in the same slot via its timeupdate event.
2023
  """
 
2024
  if not audio_path or not os.path.exists(audio_path):
 
2025
  return "<p style='color:#888;font-size:12px'>No audio yet.</p>"
2026
 
2027
  # Serve audio via Gradio's file API instead of base64-encoding the entire
@@ -2338,12 +2355,14 @@ def _unpack_outputs(flat: list, n: int, tab_prefix: str) -> list:
2338
  can read it when calling the Gradio queue API for regen.
2339
  """
2340
  n = int(n)
 
2341
  vid_updates = []
2342
  wave_updates = []
2343
  for i in range(MAX_SLOTS):
2344
  vid_path = flat[i * 3]
2345
  aud_path = flat[i * 3 + 1]
2346
  meta = flat[i * 3 + 2]
 
2347
  vid_updates.append(gr.update(value=vid_path))
2348
  if aud_path and meta:
2349
  slot_id = f"{tab_prefix}_{i}"
@@ -2865,8 +2884,18 @@ with gr.Blocks(title="Generate Audio for Video", css=_SLOT_CSS, js=_GLOBAL_JS) a
2865
  )
2866
 
2867
  def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
2868
- flat = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
2869
- return _unpack_outputs(flat, n, "taro")
 
 
 
 
 
 
 
 
 
 
2870
 
2871
  # Split group visibility into a separate .then() to avoid Gradio 5 SSR
2872
  # "Too many arguments" caused by including gr.Group in mixed output lists.
 
825
  Returns a list of (video_path, audio_path, seg_meta) tuples.
826
  """
827
  outputs = []
828
+ print(f"[_post_process_samples] model={model} num_results={len(results)} tmp_dir={tmp_dir!r}")
829
  for sample_idx, result in enumerate(results):
830
  seg_wavs = result[0]
831
+ print(f"[_post_process_samples] sample {sample_idx}: seg_wavs count={len(seg_wavs) if seg_wavs else None}")
832
 
833
  full_wav = _stitch_wavs(seg_wavs, crossfade_s, crossfade_db, total_dur_s, sr, segments)
834
  audio_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.wav")
835
  _save_wav(audio_path, full_wav, sr)
836
+ print(f"[_post_process_samples] sample {sample_idx}: saved audio={audio_path!r} exists={os.path.exists(audio_path)}")
837
  video_path = os.path.join(tmp_dir, f"{model}_{sample_idx}.mp4")
838
  mux_video_audio(silent_video, audio_path, video_path, model=model)
839
+ print(f"[_post_process_samples] sample {sample_idx}: muxed video={video_path!r} exists={os.path.exists(video_path)}")
840
  wav_paths = _save_seg_wavs(seg_wavs, tmp_dir, f"{model}_{sample_idx}")
841
 
842
  extras = extra_meta_fn(sample_idx, result, tmp_dir) if extra_meta_fn else {}
 
847
  total_dur_s=total_dur_s, source_video=source_video, **extras,
848
  )
849
  outputs.append((video_path, audio_path, seg_meta))
850
+ print(f"[_post_process_samples] returning {len(outputs)} output(s)")
851
  return outputs
852
 
853
 
 
880
  from TARO.samplers import euler_sampler, euler_maruyama_sampler
881
 
882
  ctx = _ctx_load("taro_gpu_infer")
883
+ print(f"[_taro_gpu_infer] ctx keys={list(ctx.keys())}")
884
  tmp_dir = ctx["tmp_dir"]
885
  silent_video = ctx["silent_video"]
886
  segments = ctx["segments"]
887
  total_dur_s = ctx["total_dur_s"]
888
+ print(f"[_taro_gpu_infer] tmp_dir={tmp_dir!r} silent_video={silent_video!r} segments={segments} total_dur_s={total_dur_s}")
889
 
890
  extract_cavp, onset_model = _load_taro_feature_extractors(device)
891
  cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
 
938
  if torch.cuda.is_available():
939
  torch.cuda.empty_cache()
940
 
941
+ print(f"[_taro_gpu_infer] returning {len(results)} results")
942
  return results
943
 
944
 
 
946
  crossfade_s, crossfade_db, num_samples):
947
  """TARO: video-conditioned diffusion, 16 kHz, 8.192 s sliding window.
948
  CPU pre/post-processing wraps the GPU-only inference to minimize ZeroGPU cost."""
949
+ print(f"[generate_taro] START video_file={video_file!r} num_samples={num_samples}")
950
  crossfade_s = float(crossfade_s)
951
  crossfade_db = float(crossfade_db)
952
  num_samples = int(num_samples)
 
954
  # ── CPU pre-processing (no GPU needed) ──
955
  tmp_dir, silent_video, total_dur_s, segments = _cpu_preprocess(
956
  video_file, TARO_MODEL_DUR, crossfade_s)
957
+ print(f"[generate_taro] preprocess done: total_dur_s={total_dur_s:.2f} segments={segments} tmp_dir={tmp_dir}")
958
 
959
  _ctx_store("taro_gpu_infer", {
960
  "tmp_dir": tmp_dir, "silent_video": silent_video,
 
982
  _feats_saved = True
983
  return (wavs, cavp_feats, onset_feats)
984
 
985
+ print(f"[generate_taro] gpu_infer returned {len(results)} result(s)")
986
  results = [_upsample_and_save_feats(r) for r in results]
987
+ print(f"[generate_taro] upsample done, cavp_path={cavp_path} onset_path={onset_path}")
988
 
989
  def _taro_extras(sample_idx, result, td):
990
  return {"cavp_path": cavp_path, "onset_path": onset_path}
 
997
  source_video=video_file,
998
  extra_meta_fn=_taro_extras,
999
  )
1000
+ print(f"[generate_taro] post_process done: {len(outputs)} output(s)")
1001
+ padded = _pad_outputs(outputs)
1002
+ print(f"[generate_taro] padded outputs: {[(type(x).__name__, x is not None) for x in padded[:6]]}")
1003
+ return padded
1004
 
1005
 
1006
  # ================================================================== #
 
2036
  The waveform is SILENT. The playhead tracks the Gradio <video> element
2037
  in the same slot via its timeupdate event.
2038
  """
2039
+ print(f"[_build_waveform_html] audio_path={audio_path!r} exists={os.path.exists(audio_path) if audio_path else False} slot={slot_id}")
2040
  if not audio_path or not os.path.exists(audio_path):
2041
+ print(f"[_build_waveform_html] returning placeholder — audio missing")
2042
  return "<p style='color:#888;font-size:12px'>No audio yet.</p>"
2043
 
2044
  # Serve audio via Gradio's file API instead of base64-encoding the entire
 
2355
  can read it when calling the Gradio queue API for regen.
2356
  """
2357
  n = int(n)
2358
+ print(f"[_unpack_outputs] tab={tab_prefix} n={n} flat_len={len(flat)}")
2359
  vid_updates = []
2360
  wave_updates = []
2361
  for i in range(MAX_SLOTS):
2362
  vid_path = flat[i * 3]
2363
  aud_path = flat[i * 3 + 1]
2364
  meta = flat[i * 3 + 2]
2365
+ print(f"[_unpack_outputs] slot {i}: vid={vid_path is not None} aud={aud_path!r} meta={meta is not None}")
2366
  vid_updates.append(gr.update(value=vid_path))
2367
  if aud_path and meta:
2368
  slot_id = f"{tab_prefix}_{i}"
 
2884
  )
2885
 
2886
  def _run_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n):
2887
+ print(f"[_run_taro] called video={video!r} n={n}")
2888
+ try:
2889
+ flat = generate_taro(video, seed, cfg, steps, mode, cf_dur, cf_db, n)
2890
+ print(f"[_run_taro] generate_taro returned flat_len={len(flat) if flat else None}")
2891
+ result = _unpack_outputs(flat, n, "taro")
2892
+ print(f"[_run_taro] _unpack_outputs returned {len(result)} updates")
2893
+ return result
2894
+ except Exception as e:
2895
+ import traceback
2896
+ print(f"[_run_taro] EXCEPTION: {e}")
2897
+ traceback.print_exc()
2898
+ raise
2899
 
2900
  # Split group visibility into a separate .then() to avoid Gradio 5 SSR
2901
  # "Too many arguments" caused by including gr.Group in mixed output lists.