Spaces:

FunAudioLLM
/

PrismAudio

Running on Zero

App Files Files Community

prismaudio-project committed on 3 days ago

Commit

537397a

1 Parent(s): bdc92e0

fix

Browse files

Files changed (1) hide show

app.py +27 -38

app.py CHANGED Viewed

@@ -358,22 +358,15 @@ def generate_audio_core(video_file, caption):
     DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
     _MODELS["feature_extractor"].to(DEVICE)
     _MODELS["diffusion"].to(DEVICE)
-    start_time =time.time()
-    """
-    Gradio generator function (yields status + result progressively).
-    Yields:
-        (status_str, combined_video_path_or_None)
-    """
-    # ---- Basic validation ----
     if video_file is None:
         return "❌ Please upload a video file first.", None
     if not caption or caption.strip() == "":
         caption=""
     caption = caption.strip()
     logs    = []
@@ -382,11 +375,11 @@ def generate_audio_core(video_file, caption):
         logs.append(msg)
         return "\n".join(logs)
-    # ---- Working directory (auto-cleaned on exit) ----
     work_dir = tempfile.mkdtemp(prefix="PrismAudio_")
     try:
         # ---- Step 1: Convert / copy to mp4 ----
         status = log_step("📹 Step 1: Preparing video...")
         src_ext  = os.path.splitext(video_file)[1].lower()
@@ -399,50 +392,46 @@ def generate_audio_core(video_file, caption):
                 return log_step(f"❌ Video conversion failed:\n{err}"), None
         else:
             shutil.copy(video_file, mp4_path)
-        log_step("   Video ready.")
         # ---- Step 2: Validate duration ----
         status = log_step("📹 Step 2: Checking video duration...")
         duration = get_video_duration(mp4_path)
-        log_step(f"   Duration: {duration:.2f}s")
         # ---- Step 3: Extract video frames ----
-        status = log_step("🎞️  Step 3: Extracting video frames (clip & sync)...")
         clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
-        log_step(f"   clip_chunk : {tuple(clip_chunk.shape)}")
-        log_step(f"   sync_chunk : {tuple(sync_chunk.shape)}")
         # ---- Step 4: Extract model features ----
-        status = log_step("🧠 Step 4: Extracting text / video / sync features...")
-        #yield status, None
         info = extract_features(clip_chunk, sync_chunk, caption)
-        log_step(f"   text_features         : {tuple(info['text_features'].shape)}")
-        log_step(f"   global_video_features : {tuple(info['global_video_features'].shape)}")
-        log_step(f"   video_features        : {tuple(info['video_features'].shape)}")
-        log_step(f"   global_text_features  : {tuple(info['global_text_features'].shape)}")
-        log_step(f"   sync_features         : {tuple(info['sync_features'].shape)}")
         # ---- Step 5: Build inference batch ----
-        #status = log_step("📦 Step 5: Building inference batch...")
-        #yield status, None
         audio_latent, meta = build_meta(info, duration, caption)
-        log_step(f"   audio_latent : {tuple(audio_latent.shape)}")
         # ---- Step 6: Diffusion sampling ----
         status = log_step("🎵 Step 6: Running diffusion sampling...")
-        #yield status, None
         generated_audio = run_diffusion(audio_latent, meta, duration)
-        log_step(f"   Generated audio shape : {tuple(generated_audio.shape)}")
         # ---- Step 7: Save generated audio (temp) ----
         status = log_step("💾 Step 7: Saving generated audio...")
-        #yield status, None
         audio_path = os.path.join(work_dir, "generated_audio.wav")
         torchaudio.save(
@@ -450,38 +439,37 @@ def generate_audio_core(video_file, caption):
             generated_audio[0],  # (1, T)
             SAMPLE_RATE,
         )
-        log_step(f"   Audio saved: {audio_path}")
         # ---- Step 8: Mux audio into original video ----
         status = log_step("🎬 Step 8: Merging audio into video...")
-        #yield status, None
         combined_path = os.path.join(work_dir, "output_with_audio.mp4")
         ok, err = combine_audio_video(mp4_path, audio_path, combined_path)
         if not ok:
             return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
-        log_step("✅ Done! Audio and video merged successfully.")
         return "\n".join(logs), combined_path
     except Exception as e:
         log_step(f"❌ Unexpected error: {str(e)}")
         log.exception(e)
         return "\n".join(logs), None
-    end_time =time.time()
-    print("cost: ",end_time-start_time)
-    # Note: work_dir is NOT deleted here so Gradio can serve the output file.
-    # Gradio manages its own GRADIO_TEMP_DIR cleanup on restart.
 def generate_audio(video_file, caption):
-    # 先yield状态
     yield "⏳ Waiting for GPU...", None
     result_logs, result_video = generate_audio_core(video_file, caption)
     yield result_logs, result_video
 # ==================== Gradio UI ====================
 def build_ui() -> gr.Blocks:
@@ -556,6 +544,7 @@ def build_ui() -> gr.Blocks:
 <Spatial> Natural sound distribution across the stereo field, suggesting birds are around the listener. Food interaction sounds can be localized.
 """],
                 ["demos/Rail transport_3_479.mp4", "Generate ambient countryside sounds with a gentle breeze rustling the leaves of a large tree. From the right, introduce a faint rumble of wheels on a track and a steam engine chugging. Allow the sounds to grow louder and pan from right to left as the steam train travels across the landscape. Include the powerful chugging and clattering of carriages in the soundscape, then gradually recede the sounds to the left. Ensure no additional background noise or music is present."],
                 ["demos/Cat_2_438.mp4", "A cat perched in a tree, letting out loud and sweet meows."],
             ],
             inputs=[video_input, caption_input],

     DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
     _MODELS["feature_extractor"].to(DEVICE)
     _MODELS["diffusion"].to(DEVICE)
+    total_start_time = time.time()
     if video_file is None:
         return "❌ Please upload a video file first.", None
     if not caption or caption.strip() == "":
         caption=""
     caption = caption.strip()
     logs    = []
         logs.append(msg)
         return "\n".join(logs)
     work_dir = tempfile.mkdtemp(prefix="PrismAudio_")
     try:
         # ---- Step 1: Convert / copy to mp4 ----
+        step_start = time.time()
         status = log_step("📹 Step 1: Preparing video...")
         src_ext  = os.path.splitext(video_file)[1].lower()
                 return log_step(f"❌ Video conversion failed:\n{err}"), None
         else:
             shutil.copy(video_file, mp4_path)
+        log_step(f"   Video ready. ⏱️ Step 1 cost: {time.time() - step_start:.2f}s")
         # ---- Step 2: Validate duration ----
+        step_start = time.time()
         status = log_step("📹 Step 2: Checking video duration...")
         duration = get_video_duration(mp4_path)
+        log_step(f"   Duration: {duration:.2f}s ⏱️ Step 2 cost: {time.time() - step_start:.2f}s")
         # ---- Step 3: Extract video frames ----
+        step_start = time.time()
+        status = log_step("🎞️  Step 3: Extracting video frames...")
         clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
+        log_step(f"   Frames extracted. ⏱️ Step 3 cost: {time.time() - step_start:.2f}s")
         # ---- Step 4: Extract model features ----
+        step_start = time.time()
+        status = log_step("🧠 Step 4: Extracting text / video features...")
         info = extract_features(clip_chunk, sync_chunk, caption)
+        log_step(f"   Features extracted. ⏱️ Step 4 cost: {time.time() - step_start:.2f}s")
         # ---- Step 5: Build inference batch ----
+        step_start = time.time()
+        status = log_step("📦 Step 5: Building inference batch...")
         audio_latent, meta = build_meta(info, duration, caption)
+        log_step(f"   audio_latent : {tuple(audio_latent.shape)} ⏱️ Step 5 cost: {time.time() - step_start:.2f}s")
         # ---- Step 6: Diffusion sampling ----
+        step_start = time.time()
         status = log_step("🎵 Step 6: Running diffusion sampling...")
         generated_audio = run_diffusion(audio_latent, meta, duration)
+        log_step(f"   Diffusion sampling done. ⏱️ Step 6 cost: {time.time() - step_start:.2f}s")
         # ---- Step 7: Save generated audio (temp) ----
+        step_start = time.time()
         status = log_step("💾 Step 7: Saving generated audio...")
         audio_path = os.path.join(work_dir, "generated_audio.wav")
         torchaudio.save(
             generated_audio[0],  # (1, T)
             SAMPLE_RATE,
         )
+        log_step(f"   Audio saved: {audio_path} ⏱️ Step 7 cost: {time.time() - step_start:.2f}s")
         # ---- Step 8: Mux audio into original video ----
+        step_start = time.time()
         status = log_step("🎬 Step 8: Merging audio into video...")
         combined_path = os.path.join(work_dir, "output_with_audio.mp4")
         ok, err = combine_audio_video(mp4_path, audio_path, combined_path)
         if not ok:
             return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
+        log_step(f"   Audio and video merged. ⏱️ Step 8 cost: {time.time() - step_start:.2f}s")
+        total_cost = time.time() - total_start_time
+        log_step(f"✅ Done! Audio and video merged successfully. ⏱️ Total cost: {total_cost:.2f}s")
         return "\n".join(logs), combined_path
     except Exception as e:
         log_step(f"❌ Unexpected error: {str(e)}")
         log.exception(e)
         return "\n".join(logs), None
 def generate_audio(video_file, caption):
     """Yield a waiting status first, then the final generation result.
     Yields:
         ("⏳ Waiting for GPU...", None) immediately, followed by the
         (logs, video) pair returned by generate_audio_core.
     """
     # Emit a placeholder before the core call so the caller gets a status right away.
     yield "⏳ Waiting for GPU...", None
     logs_text, video_path = generate_audio_core(video_file, caption)
     yield logs_text, video_path
 # ==================== Gradio UI ====================
 def build_ui() -> gr.Blocks:
 <Spatial> Natural sound distribution across the stereo field, suggesting birds are around the listener. Food interaction sounds can be localized.
 """],
                 ["demos/Rail transport_3_479.mp4", "Generate ambient countryside sounds with a gentle breeze rustling the leaves of a large tree. From the right, introduce a faint rumble of wheels on a track and a steam engine chugging. Allow the sounds to grow louder and pan from right to left as the steam train travels across the landscape. Include the powerful chugging and clattering of carriages in the soundscape, then gradually recede the sounds to the left. Ensure no additional background noise or music is present."],
+                ["demos/3ClbaJYWVO4_000030.mp4", "Produce delicate and melodious guitar strumming that gracefully flows and dances with the musical rhythm."],
                 ["demos/Cat_2_438.mp4", "A cat perched in a tree, letting out loud and sweet meows."],
             ],
             inputs=[video_input, caption_input],