Spaces:

FunAudioLLM
/

PrismAudio

Running on Zero

App Files Files Community

prismaudio-project committed on Mar 23

Commit

72b3eff

1 Parent(s): be7e0ef

add

Browse files

Files changed (1) hide show

app.py +34 -29

app.py CHANGED Viewed

@@ -345,7 +345,11 @@ def run_diffusion(audio_latent: torch.Tensor, meta: dict, duration: float) -> to
 # ==================== Full Inference Pipeline ====================
-def generate_audio(video_file, caption: str):
     start_time =time.time()
     """
@@ -356,11 +360,11 @@ def generate_audio(video_file, caption: str):
     """
     # ---- Basic validation ----
     if video_file is None:
-        yield "❌ Please upload a video file first.", None
-        return
     if not caption or caption.strip() == "":
-        yield "❌ Please enter a caption / prompt.", None
-        return
     caption = caption.strip()
     logs    = []
@@ -375,9 +379,7 @@ def generate_audio(video_file, caption: str):
     try:
         # ---- Step 1: Convert / copy to mp4 ----
-        status = log_step("📹 Step 1: Preparing video...")
-        yield status, None
         src_ext  = os.path.splitext(video_file)[1].lower()
         mp4_path = os.path.join(work_dir, "input.mp4")
@@ -386,30 +388,28 @@ def generate_audio(video_file, caption: str):
             log_step("   Converting to mp4...")
             ok, err = convert_to_mp4(video_file, mp4_path)
             if not ok:
-                yield log_step(f"❌ Video conversion failed:\n{err}"), None
-                return
         else:
             shutil.copy(video_file, mp4_path)
         log_step("   Video ready.")
         # ---- Step 2: Validate duration ----
-        status = log_step("📹 Step 2: Checking video duration...")
-        yield status, None
         duration = get_video_duration(mp4_path)
         log_step(f"   Duration: {duration:.2f}s")
         # ---- Step 3: Extract video frames ----
-        status = log_step("🎞️  Step 3: Extracting video frames (clip & sync)...")
-        yield status, None
         clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
         log_step(f"   clip_chunk : {tuple(clip_chunk.shape)}")
         log_step(f"   sync_chunk : {tuple(sync_chunk.shape)}")
         # ---- Step 4: Extract model features ----
-        status = log_step("🧠 Step 4: Extracting text / video / sync features...")
-        yield status, None
         info = extract_features(clip_chunk, sync_chunk, caption)
         log_step(f"   text_features         : {tuple(info['text_features'].shape)}")
@@ -419,22 +419,22 @@ def generate_audio(video_file, caption: str):
         log_step(f"   sync_features         : {tuple(info['sync_features'].shape)}")
         # ---- Step 5: Build inference batch ----
-        status = log_step("📦 Step 5: Building inference batch...")
-        yield status, None
         audio_latent, meta = build_meta(info, duration, caption)
         log_step(f"   audio_latent : {tuple(audio_latent.shape)}")
         # ---- Step 6: Diffusion sampling ----
-        status = log_step("🎵 Step 6: Running diffusion sampling...")
-        yield status, None
         generated_audio = run_diffusion(audio_latent, meta, duration)
         log_step(f"   Generated audio shape : {tuple(generated_audio.shape)}")
         # ---- Step 7: Save generated audio (temp) ----
-        status = log_step("💾 Step 7: Saving generated audio...")
-        yield status, None
         audio_path = os.path.join(work_dir, "generated_audio.wav")
         torchaudio.save(
@@ -445,22 +445,22 @@ def generate_audio(video_file, caption: str):
         log_step(f"   Audio saved: {audio_path}")
         # ---- Step 8: Mux audio into original video ----
-        status = log_step("🎬 Step 8: Merging audio into video...")
-        yield status, None
         combined_path = os.path.join(work_dir, "output_with_audio.mp4")
         ok, err = combine_audio_video(mp4_path, audio_path, combined_path)
         if not ok:
-            yield log_step(f"❌ Failed to combine audio and video:\n{err}"), None
-            return
         log_step("✅ Done! Audio and video merged successfully.")
-        yield "\n".join(logs), combined_path
     except Exception as e:
         log_step(f"❌ Unexpected error: {str(e)}")
         log.exception(e)
-        yield "\n".join(logs), None
     end_time =time.time()
     print("cost: ",end_time-start_time)
@@ -468,6 +468,11 @@ def generate_audio(video_file, caption: str):
     # Note: work_dir is NOT deleted here so Gradio can serve the output file.
     # Gradio manages its own GRADIO_TEMP_DIR cleanup on restart.
 # ==================== Gradio UI ====================
@@ -622,7 +627,7 @@ if __name__ == "__main__":
         log.info("✅ All model files found.")
     # ⭐ Load all models once at startup
-    load_all_models()
     demo = build_ui()
     demo.queue(max_size=3)

 # ==================== Full Inference Pipeline ====================
+@spaces.GPU
+def generate_audio_core(video_file, caption):
+    if _MODELS["diffusion"] is None:
+        load_all_models()
     start_time =time.time()
     """
     """
     # ---- Basic validation ----
     if video_file is None:
+        return "❌ Please upload a video file first.", None
     if not caption or caption.strip() == "":
+        caption=""
     caption = caption.strip()
     logs    = []
     try:
         # ---- Step 1: Convert / copy to mp4 ----
+        #status = log_step("📹 Step 1: Preparing video...")
         src_ext  = os.path.splitext(video_file)[1].lower()
         mp4_path = os.path.join(work_dir, "input.mp4")
             log_step("   Converting to mp4...")
             ok, err = convert_to_mp4(video_file, mp4_path)
             if not ok:
+                return log_step(f"❌ Video conversion failed:\n{err}"), None
         else:
             shutil.copy(video_file, mp4_path)
         log_step("   Video ready.")
         # ---- Step 2: Validate duration ----
+        #status = log_step("📹 Step 2: Checking video duration...")
         duration = get_video_duration(mp4_path)
         log_step(f"   Duration: {duration:.2f}s")
         # ---- Step 3: Extract video frames ----
+        #status = log_step("🎞️  Step 3: Extracting video frames (clip & sync)...")
         clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
         log_step(f"   clip_chunk : {tuple(clip_chunk.shape)}")
         log_step(f"   sync_chunk : {tuple(sync_chunk.shape)}")
         # ---- Step 4: Extract model features ----
+        #status = log_step("🧠 Step 4: Extracting text / video / sync features...")
+        #yield status, None
         info = extract_features(clip_chunk, sync_chunk, caption)
         log_step(f"   text_features         : {tuple(info['text_features'].shape)}")
         log_step(f"   sync_features         : {tuple(info['sync_features'].shape)}")
         # ---- Step 5: Build inference batch ----
+        #status = log_step("📦 Step 5: Building inference batch...")
+        #yield status, None
         audio_latent, meta = build_meta(info, duration, caption)
         log_step(f"   audio_latent : {tuple(audio_latent.shape)}")
         # ---- Step 6: Diffusion sampling ----
+        #status = log_step("🎵 Step 6: Running diffusion sampling...")
+        #yield status, None
         generated_audio = run_diffusion(audio_latent, meta, duration)
         log_step(f"   Generated audio shape : {tuple(generated_audio.shape)}")
         # ---- Step 7: Save generated audio (temp) ----
+        #status = log_step("💾 Step 7: Saving generated audio...")
+        #yield status, None
         audio_path = os.path.join(work_dir, "generated_audio.wav")
         torchaudio.save(
         log_step(f"   Audio saved: {audio_path}")
         # ---- Step 8: Mux audio into original video ----
+        #status = log_step("🎬 Step 8: Merging audio into video...")
+        #yield status, None
         combined_path = os.path.join(work_dir, "output_with_audio.mp4")
         ok, err = combine_audio_video(mp4_path, audio_path, combined_path)
         if not ok:
+            return log_step(f"❌ Failed to combine audio and video:\n{err}"), None
         log_step("✅ Done! Audio and video merged successfully.")
+        return "\n".join(logs), combined_path
     except Exception as e:
         log_step(f"❌ Unexpected error: {str(e)}")
         log.exception(e)
+        return "\n".join(logs), None
     end_time =time.time()
     print("cost: ",end_time-start_time)
     # Note: work_dir is NOT deleted here so Gradio can serve the output file.
     # Gradio manages its own GRADIO_TEMP_DIR cleanup on restart.
+def generate_audio(video_file, caption):
+    # Yield a status message first
+    yield "⏳ Waiting for GPU...", None
+    result_logs, result_video = generate_audio_core(video_file, caption)
+    yield result_logs, result_video
 # ==================== Gradio UI ====================
         log.info("✅ All model files found.")
     # ⭐ Load all models once at startup
+    #load_all_models()
     demo = build_ui()
     demo.queue(max_size=3)