Spaces:

FunAudioLLM
/

PrismAudio

Running on Zero

App Files Files Community

prismaudio-project committed 2 days ago

Commit

c367d8b

1 Parent(s): 6cba118

fix

Browse files

Files changed (1) hide show

app.py +10 -23

app.py CHANGED Viewed

@@ -352,8 +352,7 @@ def run_diffusion(audio_latent: torch.Tensor, meta: dict, duration: float) -> to
 @spaces.GPU
 def generate_audio_core(video_file, caption):
-    if _MODELS["diffusion"] is None:
-        load_all_models()
     start_time =time.time()
@@ -383,8 +382,10 @@ def generate_audio_core(video_file, caption):
     work_dir = tempfile.mkdtemp(dir=os.environ["GRADIO_TEMP_DIR"], prefix="PrismAudio_")
     try:
         # ---- Step 1: Convert / copy to mp4 ----
-        #status = log_step("📹 Step 1: Preparing video...")
         src_ext  = os.path.splitext(video_file)[1].lower()
         mp4_path = os.path.join(work_dir, "input.mp4")
@@ -399,21 +400,21 @@ def generate_audio_core(video_file, caption):
         log_step("   Video ready.")
         # ---- Step 2: Validate duration ----
-        #status = log_step("📹 Step 2: Checking video duration...")
         duration = get_video_duration(mp4_path)
         log_step(f"   Duration: {duration:.2f}s")
         # ---- Step 3: Extract video frames ----
-        #status = log_step("🎞️  Step 3: Extracting video frames (clip & sync)...")
         clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
         log_step(f"   clip_chunk : {tuple(clip_chunk.shape)}")
         log_step(f"   sync_chunk : {tuple(sync_chunk.shape)}")
         # ---- Step 4: Extract model features ----
-        #status = log_step("🧠 Step 4: Extracting text / video / sync features...")
         #yield status, None
         info = extract_features(clip_chunk, sync_chunk, caption)
@@ -431,14 +432,14 @@ def generate_audio_core(video_file, caption):
         log_step(f"   audio_latent : {tuple(audio_latent.shape)}")
         # ---- Step 6: Diffusion sampling ----
-        #status = log_step("🎵 Step 6: Running diffusion sampling...")
         #yield status, None
         generated_audio = run_diffusion(audio_latent, meta, duration)
         log_step(f"   Generated audio shape : {tuple(generated_audio.shape)}")
         # ---- Step 7: Save generated audio (temp) ----
-        #status = log_step("💾 Step 7: Saving generated audio...")
         #yield status, None
         audio_path = os.path.join(work_dir, "generated_audio.wav")
@@ -450,7 +451,7 @@ def generate_audio_core(video_file, caption):
         log_step(f"   Audio saved: {audio_path}")
         # ---- Step 8: Mux audio into original video ----
-        #status = log_step("🎬 Step 8: Merging audio into video...")
         #yield status, None
         combined_path = os.path.join(work_dir, "output_with_audio.mp4")
@@ -561,20 +562,6 @@ def build_ui() -> gr.Blocks:
 2. Enter a text prompt describing the desired audio content.
 3. Click **🚀 Generate Audio** and watch the log on the right for progress.
 4. The output video (original visuals + generated audio) appears below when done.
-**Notes**
-- All models are pre-loaded at startup — no warm-up delay on the first request.
-- Everything stays in memory; only the final wav and merged mp4 are written to disk.
-- A CUDA GPU is strongly recommended; CPU inference will be very slow.
-- Queue depth is limited to 3 concurrent requests to avoid OOM.
-**Current model paths**
-```
-MODEL_CONFIG_PATH     = {MODEL_CONFIG_PATH}
-CKPT_PATH             = {CKPT_PATH}
-VAE_CKPT_PATH         = {VAE_CKPT_PATH}
-SYNCHFORMER_CKPT_PATH = {SYNCHFORMER_CKPT_PATH}
-```
             """)
         # ======================================================

 @spaces.GPU
 def generate_audio_core(video_file, caption):
     start_time =time.time()
     work_dir = tempfile.mkdtemp(dir=os.environ["GRADIO_TEMP_DIR"], prefix="PrismAudio_")
     try:
+        if _MODELS["diffusion"] is None:
+            load_all_models()
         # ---- Step 1: Convert / copy to mp4 ----
+        status = log_step("📹 Step 1: Preparing video...")
         src_ext  = os.path.splitext(video_file)[1].lower()
         mp4_path = os.path.join(work_dir, "input.mp4")
         log_step("   Video ready.")
         # ---- Step 2: Validate duration ----
+        status = log_step("📹 Step 2: Checking video duration...")
         duration = get_video_duration(mp4_path)
         log_step(f"   Duration: {duration:.2f}s")
         # ---- Step 3: Extract video frames ----
+        status = log_step("🎞️  Step 3: Extracting video frames (clip & sync)...")
         clip_chunk, sync_chunk, duration = extract_video_frames(mp4_path)
         log_step(f"   clip_chunk : {tuple(clip_chunk.shape)}")
         log_step(f"   sync_chunk : {tuple(sync_chunk.shape)}")
         # ---- Step 4: Extract model features ----
+        status = log_step("🧠 Step 4: Extracting text / video / sync features...")
         #yield status, None
         info = extract_features(clip_chunk, sync_chunk, caption)
         log_step(f"   audio_latent : {tuple(audio_latent.shape)}")
         # ---- Step 6: Diffusion sampling ----
+        status = log_step("🎵 Step 6: Running diffusion sampling...")
         #yield status, None
         generated_audio = run_diffusion(audio_latent, meta, duration)
         log_step(f"   Generated audio shape : {tuple(generated_audio.shape)}")
         # ---- Step 7: Save generated audio (temp) ----
+        status = log_step("💾 Step 7: Saving generated audio...")
         #yield status, None
         audio_path = os.path.join(work_dir, "generated_audio.wav")
         log_step(f"   Audio saved: {audio_path}")
         # ---- Step 8: Mux audio into original video ----
+        status = log_step("🎬 Step 8: Merging audio into video...")
         #yield status, None
         combined_path = os.path.join(work_dir, "output_with_audio.mp4")
 2. Enter a text prompt describing the desired audio content.
 3. Click **🚀 Generate Audio** and watch the log on the right for progress.
 4. The output video (original visuals + generated audio) appears below when done.
             """)
         # ======================================================