JackIsNotInTheBox committed on
Commit
211f7b6
·
1 Parent(s): 47a6cd2

Strip original audio from uploaded videos before processing and final output

Browse files
Files changed (1) hide show
  1. app.py +117 -44
app.py CHANGED
@@ -3,14 +3,14 @@ import subprocess
3
  import sys
4
 
5
  try:
6
- import mmcv
7
- print("mmcv already installed")
8
  except ImportError:
9
- print("Installing mmcv with --no-build-isolation...")
10
- subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-build-isolation", "mmcv>=2.0.0"])
11
- print("mmcv installed successfully")
12
 
13
- import torch
14
  import numpy as np
15
  import random
16
  import soundfile as sf
@@ -31,76 +31,149 @@ taro_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="taro_ckpt.pt", cache
31
  print("Checkpoints downloaded.")
32
 
33
  def set_global_seed(seed):
34
- np.random.seed(seed % (2**32))
35
- random.seed(seed)
36
- torch.manual_seed(seed)
37
- torch.cuda.manual_seed(seed)
38
- torch.backends.cudnn.deterministic = True
39
 
40
- @spaces.GPU(duration=300)
 
 
 
 
 
 
 
 
 
41
  def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
42
- set_global_seed(int(seed_val))
43
- torch.set_grad_enabled(False)
44
- device = "cuda" if torch.cuda.is_available() else "cpu"
45
- weight_dtype = torch.bfloat16
46
- from cavp_util import Extract_CAVP_Features
47
- from onset_util import VideoOnsetNet, extract_onset
48
- from models import MMDiT
49
- from samplers import euler_sampler, euler_maruyama_sampler
50
- from diffusers import AudioLDM2Pipeline
51
- extract_cavp = Extract_CAVP_Features(device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path)
52
- state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
53
- new_state_dict = {}
54
- for key, value in state_dict.items():
55
- if "model.net.model" in key:
56
- new_key = key.replace("model.net.model", "net.model")
57
- elif "model.fc." in key:
58
- new_key = key.replace("model.fc", "fc")
59
- else:
60
- new_key = key
61
- new_state_dict[new_key] = value
62
- onset_model = VideoOnsetNet(False).to(device)
 
 
 
 
 
63
  onset_model.load_state_dict(new_state_dict)
64
  onset_model.eval()
 
65
  model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
66
  ckpt = torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"]
67
  model.load_state_dict(ckpt)
68
  model.eval()
69
  model.to(weight_dtype)
 
70
  model_audioldm = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
71
  vae = model_audioldm.vae.to(device)
72
  vae.eval()
73
  vocoder = model_audioldm.vocoder.to(device)
 
74
  tmp_dir = tempfile.mkdtemp()
75
- cavp_feats = extract_cavp(video_file, tmp_path=tmp_dir)
76
- onset_feats = extract_onset(video_file, onset_model, tmp_path=tmp_dir, device=device)
 
 
 
 
 
 
77
  sr = 16000
78
  truncate = 131072
79
  fps = 4
80
  truncate_frame = int(fps * truncate / sr)
81
  truncate_onset = 120
 
82
  latents_scale = torch.tensor([0.18215]*8).view(1, 8, 1, 1).to(device)
 
83
  video_feats = torch.from_numpy(cavp_feats[:truncate_frame]).unsqueeze(0).to(device).to(weight_dtype)
84
  onset_feats_t = torch.from_numpy(onset_feats[:truncate_onset]).unsqueeze(0).to(device).to(weight_dtype)
 
85
  z = torch.randn(len(video_feats), model.in_channels, 204, 16, device=device).to(weight_dtype)
86
- sampling_kwargs = dict(model=model, latents=z, y=onset_feats_t, context=video_feats, num_steps=int(num_steps), heun=False, cfg_scale=float(cfg_scale), guidance_low=0.0, guidance_high=0.7, path_type="linear")
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  with torch.no_grad():
88
- if mode == "sde":
89
- samples = euler_maruyama_sampler(**sampling_kwargs)
90
- else:
91
- samples = euler_sampler(**sampling_kwargs)
92
- samples = vae.decode(samples / latents_scale).sample
 
93
  wav_samples = vocoder(samples.squeeze()).detach().cpu().numpy()
 
94
  audio_path = os.path.join(tmp_dir, "output.wav")
95
  sf.write(audio_path, wav_samples, sr)
 
96
  duration = truncate / sr
 
 
97
  trimmed_video = os.path.join(tmp_dir, "trimmed.mp4")
98
  output_video = os.path.join(tmp_dir, "output.mp4")
99
- ffmpeg.input(video_file, ss=0, t=duration).output(trimmed_video, vcodec="libx264", an=None).run(overwrite_output=True, quiet=True)
 
 
 
 
 
 
 
 
100
  input_v = ffmpeg.input(trimmed_video)
101
  input_a = ffmpeg.input(audio_path)
102
- ffmpeg.output(input_v, input_a, output_video, vcodec="libx264", acodec="aac", strict="experimental").run(overwrite_output=True, quiet=True)
 
 
 
 
 
103
  return output_video, audio_path
104
 
105
- demo = gr.Interface(fn=generate_audio, inputs=[gr.Video(label="Input Video"), gr.Number(label="Seed", value=0, precision=0), gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5), gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1), gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")], outputs=[gr.Video(label="Output Video with Audio"), gr.Audio(label="Generated Audio")], title="TARO: Video-to-Audio Synthesis (ICCV 2025)", description="Upload a video and generate synchronized audio using TARO. Optimal duration is as close to 8 seconds as possible.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  demo.queue().launch()
 
import sys

# mmcv ships C extensions that can fail to build under pip's build isolation
# on Spaces, so install it lazily with --no-build-isolation if it is missing.
try:
    import mmcv
    print("mmcv already installed")
except ImportError:
    print("Installing mmcv with --no-build-isolation...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-build-isolation", "mmcv>=2.0.0"])
    print("mmcv installed successfully")

import torch
import numpy as np
import random
import soundfile as sf
 
31
  print("Checkpoints downloaded.")
32
 
33
def set_global_seed(seed):
    """Seed every RNG source (NumPy, stdlib random, torch CPU/CUDA) for reproducibility.

    Args:
        seed: Integer seed. NumPy only accepts seeds in [0, 2**32), so the
            value is reduced mod 2**32 for it; torch/random take it as-is.
    """
    np.random.seed(seed % (2**32))
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not just the current one (safe no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    # Force deterministic cuDNN kernels so repeated runs match.
    torch.backends.cudnn.deterministic = True
39
 
40
def strip_audio_from_video(video_path, output_path):
    """Strip any existing audio from a video file, outputting a silent video.

    Args:
        video_path: Path to the source video (may contain an audio track).
        output_path: Destination for the re-encoded, audio-free video.
    """
    # NOTE: original docstring had stray trailing quotes ("""...""""") that
    # parsed as an adjacent empty string literal — removed.
    (
        ffmpeg
        .input(video_path)
        # an=None emits ffmpeg's `-an` flag, dropping all audio streams.
        .output(output_path, vcodec="libx264", an=None)
        .run(overwrite_output=True, quiet=True)
    )
48
+
49
@spaces.GPU(duration=300)
def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
    """Generate audio synchronized to the input video and mux it back in.

    Args:
        video_file: Path to the uploaded video.
        seed_val: RNG seed (cast to int; see set_global_seed).
        cfg_scale: Classifier-free-guidance scale passed to the sampler.
        num_steps: Number of sampler integration steps.
        mode: "sde" selects the Euler-Maruyama sampler; any other value
            selects the deterministic Euler sampler.

    Returns:
        Tuple of (output_video_path, generated_audio_wav_path), both inside
        a fresh temporary directory (kept alive so Gradio can serve them).
    """
    set_global_seed(int(seed_val))
    torch.set_grad_enabled(False)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    weight_dtype = torch.bfloat16

    # Deferred imports: these pull in heavy CUDA/mmcv dependencies, so they
    # run inside the @spaces.GPU context rather than at module import time.
    from cavp_util import Extract_CAVP_Features
    from onset_util import VideoOnsetNet, extract_onset
    from models import MMDiT
    from samplers import euler_sampler, euler_maruyama_sampler
    from diffusers import AudioLDM2Pipeline

    extract_cavp = Extract_CAVP_Features(device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path)

    # Remap Lightning-style checkpoint keys ("model.net.model.*", "model.fc.*")
    # to the bare module names VideoOnsetNet expects.
    state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
    new_state_dict = {}
    for key, value in state_dict.items():
        if "model.net.model" in key:
            new_key = key.replace("model.net.model", "net.model")
        elif "model.fc." in key:
            new_key = key.replace("model.fc", "fc")
        else:
            new_key = key
        new_state_dict[new_key] = value

    onset_model = VideoOnsetNet(False).to(device)
    onset_model.load_state_dict(new_state_dict)
    onset_model.eval()

    model = MMDiT(adm_in_channels=120, z_dims=[768], encoder_depth=4).to(device)
    ckpt = torch.load(taro_ckpt_path, map_location=device, weights_only=False)["ema"]
    model.load_state_dict(ckpt)
    model.eval()
    model.to(weight_dtype)

    # AudioLDM2 supplies the latent VAE decoder and the vocoder.
    model_audioldm = AudioLDM2Pipeline.from_pretrained("cvssp/audioldm2")
    vae = model_audioldm.vae.to(device)
    vae.eval()
    vocoder = model_audioldm.vocoder.to(device)

    tmp_dir = tempfile.mkdtemp()

    # Strip any existing audio from the input video before feature extraction
    silent_video = os.path.join(tmp_dir, "silent_input.mp4")
    strip_audio_from_video(video_file, silent_video)

    cavp_feats = extract_cavp(silent_video, tmp_path=tmp_dir)
    onset_feats = extract_onset(silent_video, onset_model, tmp_path=tmp_dir, device=device)

    sr = 16000            # output audio sample rate
    truncate = 131072     # audio samples kept (~8.2 s at 16 kHz)
    fps = 4               # assumed CAVP feature rate — TODO confirm vs cavp_util
    truncate_frame = int(fps * truncate / sr)
    truncate_onset = 120

    # AudioLDM-style VAE latent scaling factor, one per latent channel.
    latents_scale = torch.tensor([0.18215]*8).view(1, 8, 1, 1).to(device)

    video_feats = torch.from_numpy(cavp_feats[:truncate_frame]).unsqueeze(0).to(device).to(weight_dtype)
    onset_feats_t = torch.from_numpy(onset_feats[:truncate_onset]).unsqueeze(0).to(device).to(weight_dtype)

    # Initial noise latents; 204x16 is the model's fixed latent spatial size.
    z = torch.randn(len(video_feats), model.in_channels, 204, 16, device=device).to(weight_dtype)

    sampling_kwargs = dict(
        model=model,
        latents=z,
        y=onset_feats_t,
        context=video_feats,
        num_steps=int(num_steps),
        heun=False,
        cfg_scale=float(cfg_scale),
        guidance_low=0.0,
        guidance_high=0.7,
        path_type="linear"
    )

    with torch.no_grad():
        if mode == "sde":
            samples = euler_maruyama_sampler(**sampling_kwargs)
        else:
            samples = euler_sampler(**sampling_kwargs)

        samples = vae.decode(samples / latents_scale).sample
    wav_samples = vocoder(samples.squeeze()).detach().cpu().numpy()

    audio_path = os.path.join(tmp_dir, "output.wav")
    sf.write(audio_path, wav_samples, sr)

    duration = truncate / sr

    # Trim the silent input video to the target duration (no audio)
    trimmed_video = os.path.join(tmp_dir, "trimmed.mp4")
    output_video = os.path.join(tmp_dir, "output.mp4")

    (
        ffmpeg
        .input(silent_video, ss=0, t=duration)
        .output(trimmed_video, vcodec="libx264", an=None)
        .run(overwrite_output=True, quiet=True)
    )

    # Combine the trimmed silent video with the generated audio
    input_v = ffmpeg.input(trimmed_video)
    input_a = ffmpeg.input(audio_path)
    (
        ffmpeg
        .output(input_v, input_a, output_video, vcodec="libx264", acodec="aac", strict="experimental")
        .run(overwrite_output=True, quiet=True)
    )

    return output_video, audio_path
161
 
162
+
163
# Gradio UI: single-function interface mapping the upload + sampler controls
# onto generate_audio; queue() serializes requests for the GPU worker.
demo = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Number(label="Seed", value=0, precision=0),
        gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5),
        gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1),
        gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
    ],
    outputs=[
        gr.Video(label="Output Video with Audio"),
        gr.Audio(label="Generated Audio")
    ],
    title="TARO: Video-to-Audio Synthesis (ICCV 2025)",
    description="Upload a video and generate synchronized audio using TARO. Optimal duration is as close to 8 seconds as possible."
)
demo.queue().launch()