Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Running on Zero

App Files Community

JackIsNotInTheBox committed on 7 days ago

Commit

ff95229

1 Parent(s): 98a45f3

Fix indentation errors in app.py

Browse files

Files changed (1) hide show

app.py +84 -84

app.py CHANGED Viewed

@@ -3,12 +3,12 @@ import subprocess
 import sys
 try:
-        import mmcv
-        print("mmcv already installed")
 except ImportError:
-        print("Installing mmcv with --no-build-isolation...")
-        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-build-isolation", "mmcv>=2.0.0"])
-        print("mmcv installed successfully")
 import torch
 import numpy as np
@@ -31,49 +31,49 @@ taro_ckpt_path = hf_hub_download(repo_id=REPO_ID, filename="taro_ckpt.pt", cache
 print("Checkpoints downloaded.")
 def set_global_seed(seed):
-        np.random.seed(seed % (2**32))
-        random.seed(seed)
-        torch.manual_seed(seed)
-        torch.cuda.manual_seed(seed)
-        torch.backends.cudnn.deterministic = True
-    def strip_audio_from_video(video_path, output_path):
-            """Strip any existing audio from a video file, outputting a silent video."""""
-            (
-                        ffmpeg
-                        .input(video_path)
-                        .output(output_path, vcodec="libx264", an=None)
-                        .run(overwrite_output=True, quiet=True)
-            )
-        @spaces.GPU(duration=300)
 def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
-        set_global_seed(int(seed_val))
-        torch.set_grad_enabled(False)
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        weight_dtype = torch.bfloat16
-        from cavp_util import Extract_CAVP_Features
-        from onset_util import VideoOnsetNet, extract_onset
-        from models import MMDiT
-        from samplers import euler_sampler, euler_maruyama_sampler
-        from diffusers import AudioLDM2Pipeline
-        extract_cavp = Extract_CAVP_Features(device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path)
-        state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
-        new_state_dict = {}
-        for key, value in state_dict.items():
-                    if "model.net.model" in key:
-                                    new_key = key.replace("model.net.model", "net.model")
-                    elif "model.fc." in key:
-                                    new_key = key.replace("model.fc", "fc")
-                    else:
-                                    new_key = key
-                                new_state_dict[new_key] = value
-                onset_model = VideoOnsetNet(False).to(device)
     onset_model.load_state_dict(new_state_dict)
     onset_model.eval()
@@ -111,25 +111,25 @@ def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
     z = torch.randn(len(video_feats), model.in_channels, 204, 16, device=device).to(weight_dtype)
     sampling_kwargs = dict(
-                model=model,
-                latents=z,
-                y=onset_feats_t,
-                context=video_feats,
-                num_steps=int(num_steps),
-                heun=False,
-                cfg_scale=float(cfg_scale),
-                guidance_low=0.0,
-                guidance_high=0.7,
-                path_type="linear"
     )
     with torch.no_grad():
-                if mode == "sde":
-                                samples = euler_maruyama_sampler(**sampling_kwargs)
-                else:
-                                samples = euler_sampler(**sampling_kwargs)
-                        samples = vae.decode(samples / latents_scale).sample
     wav_samples = vocoder(samples.squeeze()).detach().cpu().numpy()
     audio_path = os.path.join(tmp_dir, "output.wav")
@@ -142,38 +142,38 @@ def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
     output_video = os.path.join(tmp_dir, "output.mp4")
     (
-                ffmpeg
-                .input(silent_video, ss=0, t=duration)
-                .output(trimmed_video, vcodec="libx264", an=None)
-                .run(overwrite_output=True, quiet=True)
     )
     # Combine the trimmed silent video with the generated audio
     input_v = ffmpeg.input(trimmed_video)
     input_a = ffmpeg.input(audio_path)
     (
-                ffmpeg
-                .output(input_v, input_a, output_video, vcodec="libx264", acodec="aac", strict="experimental")
-                .run(overwrite_output=True, quiet=True)
     )
     return output_video, audio_path
 demo = gr.Interface(
-        fn=generate_audio,
-        inputs=[
-                    gr.Video(label="Input Video"),
-                    gr.Number(label="Seed", value=0, precision=0),
-                    gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5),
-                    gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1),
-                    gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
-        ],
-        outputs=[
-                    gr.Video(label="Output Video with Audio"),
-                    gr.Audio(label="Generated Audio")
-        ],
-        title="TARO: Video-to-Audio Synthesis (ICCV 2025)",
-        description="Upload a video and generate synchronized audio using TARO. Optimal duration is as close to 8 seconds as possible."
 )
-demo.queue().launch()

 import sys
 try:
+    import mmcv
+    print("mmcv already installed")
 except ImportError:
+    print("Installing mmcv with --no-build-isolation...")
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-build-isolation", "mmcv>=2.0.0"])
+    print("mmcv installed successfully")
 import torch
 import numpy as np
 print("Checkpoints downloaded.")
 def set_global_seed(seed):
+    np.random.seed(seed % (2**32))
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+def strip_audio_from_video(video_path, output_path):
+    """Strip any existing audio from a video file, outputting a silent video."""
+    (
+        ffmpeg
+        .input(video_path)
+        .output(output_path, vcodec="libx264", an=None)
+        .run(overwrite_output=True, quiet=True)
+    )
+@spaces.GPU(duration=300)
 def generate_audio(video_file, seed_val, cfg_scale, num_steps, mode):
+    set_global_seed(int(seed_val))
+    torch.set_grad_enabled(False)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    weight_dtype = torch.bfloat16
+    from cavp_util import Extract_CAVP_Features
+    from onset_util import VideoOnsetNet, extract_onset
+    from models import MMDiT
+    from samplers import euler_sampler, euler_maruyama_sampler
+    from diffusers import AudioLDM2Pipeline
+    extract_cavp = Extract_CAVP_Features(device=device, config_path="./cavp/cavp.yaml", ckpt_path=cavp_ckpt_path)
+    state_dict = torch.load(onset_ckpt_path, map_location=device, weights_only=False)["state_dict"]
+    new_state_dict = {}
+    for key, value in state_dict.items():
+        if "model.net.model" in key:
+            new_key = key.replace("model.net.model", "net.model")
+        elif "model.fc." in key:
+            new_key = key.replace("model.fc", "fc")
+        else:
+            new_key = key
+        new_state_dict[new_key] = value
+    onset_model = VideoOnsetNet(False).to(device)
     onset_model.load_state_dict(new_state_dict)
     onset_model.eval()
     z = torch.randn(len(video_feats), model.in_channels, 204, 16, device=device).to(weight_dtype)
     sampling_kwargs = dict(
+        model=model,
+        latents=z,
+        y=onset_feats_t,
+        context=video_feats,
+        num_steps=int(num_steps),
+        heun=False,
+        cfg_scale=float(cfg_scale),
+        guidance_low=0.0,
+        guidance_high=0.7,
+        path_type="linear"
     )
     with torch.no_grad():
+        if mode == "sde":
+            samples = euler_maruyama_sampler(**sampling_kwargs)
+        else:
+            samples = euler_sampler(**sampling_kwargs)
+    samples = vae.decode(samples / latents_scale).sample
     wav_samples = vocoder(samples.squeeze()).detach().cpu().numpy()
     audio_path = os.path.join(tmp_dir, "output.wav")
     output_video = os.path.join(tmp_dir, "output.mp4")
     (
+        ffmpeg
+        .input(silent_video, ss=0, t=duration)
+        .output(trimmed_video, vcodec="libx264", an=None)
+        .run(overwrite_output=True, quiet=True)
     )
     # Combine the trimmed silent video with the generated audio
     input_v = ffmpeg.input(trimmed_video)
     input_a = ffmpeg.input(audio_path)
     (
+        ffmpeg
+        .output(input_v, input_a, output_video, vcodec="libx264", acodec="aac", strict="experimental")
+        .run(overwrite_output=True, quiet=True)
     )
     return output_video, audio_path
 demo = gr.Interface(
+    fn=generate_audio,
+    inputs=[
+        gr.Video(label="Input Video"),
+        gr.Number(label="Seed", value=0, precision=0),
+        gr.Slider(label="CFG Scale", minimum=1, maximum=15, value=8, step=0.5),
+        gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=25, step=1),
+        gr.Radio(label="Sampling Mode", choices=["sde", "ode"], value="sde")
+    ],
+    outputs=[
+        gr.Video(label="Output Video with Audio"),
+        gr.Audio(label="Generated Audio")
+    ],
+    title="TARO: Video-to-Audio Synthesis (ICCV 2025)",
+    description="Upload a video and generate synchronized audio using TARO. Optimal duration is as close to 8 seconds as possible."
 )
+demo.queue().launch()