Update app.py
app.py
CHANGED

@@ -38,18 +38,32 @@ setup_eval_logging()
 
 
 def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
+    """
+    Load and initialize the MMAudio model and its associated utilities.
+    This function constructs the MMAudio neural network, loads pretrained
+    weights, initializes feature extraction utilities, and prepares the
+    sequence configuration needed for inference.
+
+    Returns:
+        tuple:
+            - net (MMAudio): The loaded MMAudio neural network in evaluation mode.
+            - feature_utils (FeaturesUtils): Utility object for audio and video feature extraction.
+            - seq_cfg (SequenceConfig): Configuration object defining sequence lengths and duration.
+    """
    seq_cfg = model.seq_cfg
 
    net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
    net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
    log.info(f'Loaded weights from {model.model_path}')
 
-    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
-                                  synchformer_ckpt=model.synchformer_ckpt,
-                                  enable_conditions=True,
-                                  mode=model.mode,
-                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
-                                  need_vae_encoder=False)
+    feature_utils = FeaturesUtils(
+        tod_vae_ckpt=model.vae_path,
+        synchformer_ckpt=model.synchformer_ckpt,
+        enable_conditions=True,
+        mode=model.mode,
+        bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
+        need_vae_encoder=False
+    )
    feature_utils = feature_utils.to(device, dtype).eval()
 
    return net, feature_utils, seq_cfg
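
The docstring added above states that get_model() hands back modules that are already in evaluation mode. A minimal smoke test, as a sketch only (not part of this commit; it assumes app.py's module-level `model`, `device`, and `dtype` globals are initialized as usual before the call):

    # Hypothetical sanity check for get_model(); not in the commit.
    net, feature_utils, seq_cfg = get_model()
    assert not net.training and not feature_utils.training  # both were .eval()'d
    log.info(f'Model ready; output sampling rate: {seq_cfg.sampling_rate} Hz')

seq_cfg.sampling_rate is the same field the generation functions below pass to make_video and torchaudio.save.
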
@@ -60,222 +74,134 @@ net, feature_utils, seq_cfg = get_model()
 
@spaces.GPU(duration=120)
@torch.inference_mode()
-def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str,
-                   seed: int, num_steps: int, cfg_strength: float,
-                   duration: float):
+def video_to_audio(
+    video: gr.Video,
+    prompt: str,
+    negative_prompt: str,
+    seed: int,
+    num_steps: int,
+    cfg_strength: float,
+    duration: float,
+):
+    """
+    Generate audio conditioned on a video and a text prompt.
+    This function extracts visual features from a video, combines them
+    with text conditioning, and synthesizes synchronized audio using
+    the MMAudio model. The output is a video file with generated audio.
+
+    Args:
+        video (gr.Video): Input video used for visual and temporal conditioning.
+        prompt (str): Text prompt describing the desired audio content.
+        negative_prompt (str): Text describing audio characteristics to avoid.
+        seed (int): Random seed for reproducibility (-1 for random).
+        num_steps (int): Number of flow-matching inference steps.
+        cfg_strength (float): Classifier-free guidance strength.
+        duration (float): Duration of the generated audio in seconds.
+
+    Returns:
+        str: File path to the generated video containing the synthesized audio.
+    """
    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
+
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
 
    video_info = load_video(video, duration)
    clip_frames = video_info.clip_frames
    sync_frames = video_info.sync_frames
    duration = video_info.duration_sec
+
    clip_frames = clip_frames.unsqueeze(0)
    sync_frames = sync_frames.unsqueeze(0)
+
    seq_cfg.duration = duration
-    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-    audios = generate(clip_frames,
-                      sync_frames, [prompt],
-                      negative_text=[negative_prompt],
-                      feature_utils=feature_utils,
-                      net=net,
-                      fm=fm,
-                      rng=rng,
-                      cfg_strength=cfg_strength)
+    net.update_seq_lengths(
+        seq_cfg.latent_seq_len,
+        seq_cfg.clip_seq_len,
+        seq_cfg.sync_seq_len
+    )
+
+    audios = generate(
+        clip_frames,
+        sync_frames,
+        [prompt],
+        negative_text=[negative_prompt],
+        feature_utils=feature_utils,
+        net=net,
+        fm=fm,
+        rng=rng,
+        cfg_strength=cfg_strength,
+    )
    audio = audios.float().cpu()[0]
 
-    # current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
-    # output_dir.mkdir(exist_ok=True, parents=True)
-    # video_save_path = output_dir / f'{current_time_string}.mp4'
    make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
    log.info(f'Saved video to {video_save_path}')
+
    return video_save_path
 
 
@spaces.GPU(duration=120)
@torch.inference_mode()
-def text_to_audio(prompt: str, negative_prompt: str, seed: int,
-                  num_steps: int, cfg_strength: float,
-                  duration: float):
+def text_to_audio(
+    prompt: str,
+    negative_prompt: str,
+    seed: int,
+    num_steps: int,
+    cfg_strength: float,
+    duration: float,
+):
+    """
+    Generate audio purely from text prompts.
+    This function synthesizes standalone audio using the MMAudio model
+    without any video conditioning, relying solely on textual prompts
+    and flow-matching-based generation.
+
+    Args:
+        prompt (str): Text prompt describing the desired audio content.
+        negative_prompt (str): Text describing audio characteristics to avoid.
+        seed (int): Random seed for reproducibility (-1 for random).
+        num_steps (int): Number of flow-matching inference steps.
+        cfg_strength (float): Classifier-free guidance strength.
+        duration (float): Duration of the generated audio in seconds.
+
+    Returns:
+        str: File path to the generated audio file.
+    """
    rng = torch.Generator(device=device)
    if seed >= 0:
        rng.manual_seed(seed)
    else:
        rng.seed()
+
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
 
    clip_frames = sync_frames = None
    seq_cfg.duration = duration
-    net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-    audios = generate(clip_frames,
-                      sync_frames, [prompt],
-                      negative_text=[negative_prompt],
-                      feature_utils=feature_utils,
-                      net=net,
-                      fm=fm,
-                      rng=rng,
-                      cfg_strength=cfg_strength)
+    net.update_seq_lengths(
+        seq_cfg.latent_seq_len,
+        seq_cfg.clip_seq_len,
+        seq_cfg.sync_seq_len
+    )
+
+    audios = generate(
+        clip_frames,
+        sync_frames,
+        [prompt],
+        negative_text=[negative_prompt],
+        feature_utils=feature_utils,
+        net=net,
+        fm=fm,
+        rng=rng,
+        cfg_strength=cfg_strength,
+    )
    audio = audios.float().cpu()[0]
 
    audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
    torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
    log.info(f'Saved audio to {audio_save_path}')
-    return audio_save_path
 
+    return audio_save_path
-
-video_to_audio_tab = gr.Interface(
-    fn=video_to_audio,
-    description="""
-    Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
-    Code: <a href="https://github.com/hkchengrex/MMAudio">https://github.com/hkchengrex/MMAudio</a><br>
-
-    Ho Kei Cheng, Masato Ishii, Akio Hayakawa, Takashi Shibuya, Alexander Schwing, Yuki Mitsufuji
-
-    University of Illinois Urbana-Champaign, Sony AI, and Sony Group Corporation
-
-    CVPR 2025
-
-    NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
-    Doing so does not improve results.
-
-    The model has been trained on 8-second videos. Using much longer or shorter videos will degrade performance. Around 5s~12s should be fine.
-    """,
-    inputs=[
-        gr.Video(),
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt', value='music'),
-        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
-    ],
-    outputs='playable_video',
-    cache_examples=False,
-    title='MMAudio — Video-to-Audio Synthesis',
-    examples=[
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4',
-            'waves, seagulls',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4',
-            '',
-            'music',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_seahorse.mp4',
-            'bubbles',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_india.mp4',
-            'Indian holy music',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_galloping.mp4',
-            'galloping',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_kraken.mp4',
-            'waves, storm',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_nyc.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/mochi_storm.mp4',
-            'storm',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_spring.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_typing.mp4',
-            'typing',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-        [
-            'https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/hunyuan_wake_up.mp4',
-            '',
-            '',
-            0,
-            25,
-            4.5,
-            10,
-        ],
-    ])
-
-text_to_audio_tab = gr.Interface(
-    fn=text_to_audio,
-    inputs=[
-        gr.Text(label='Prompt'),
-        gr.Text(label='Negative prompt'),
-        gr.Number(label='Seed (-1: random)', value=-1, precision=0, minimum=-1),
-        gr.Number(label='Num steps', value=25, precision=0, minimum=1),
-        gr.Number(label='Guidance Strength', value=4.5, minimum=1),
-        gr.Number(label='Duration (sec)', value=8, minimum=1),
-    ],
-    outputs='audio',
-    cache_examples=False,
-    title='MMAudio — Text-to-Audio Synthesis',
-)
-
-if __name__ == "__main__":
-    gr.TabbedInterface([video_to_audio_tab, text_to_audio_tab],
-                       ['Video-to-Audio', 'Text-to-Audio']).launch(allowed_paths=[output_dir])
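
Both entry points in the hunk above repeat the same seeding idiom that their docstrings describe: a non-negative seed gives reproducible sampling, while -1 requests a fresh random seed. A hypothetical helper (the name `make_rng` is mine; nothing like it exists in this commit) could factor the pattern out:

    import torch

    def make_rng(seed: int, device: str = 'cpu') -> torch.Generator:
        # Non-negative seeds give reproducible sampling; any negative value
        # (the UI passes -1) draws a fresh nondeterministic seed instead.
        rng = torch.Generator(device=device)
        if seed >= 0:
            rng.manual_seed(seed)
        else:
            rng.seed()
        return rng

app.py would then call `rng = make_rng(seed, device)` in both video_to_audio and text_to_audio.
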
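
For completeness, one way to exercise the two refactored entry points outside Gradio. This is a sketch under assumptions: the Space's dependencies and checkpoints are available locally, the `spaces.GPU` decorator falls back to a no-op when run outside Spaces, and `example.mp4` stands in for any local clip (at call time Gradio passes the function a file path, despite the `gr.Video` annotation):

    # Hypothetical local driver for the functions defined in app.py.
    import app

    # Text-to-audio: fixed seed for reproducibility, 25 steps, 8 s of audio.
    flac_path = app.text_to_audio('typing on a keyboard', '', 0, 25, 4.5, 8)
    print('audio written to', flac_path)

    # Video-to-audio: same knobs, plus a conditioning video.
    mp4_path = app.video_to_audio('example.mp4', 'waves, seagulls', '', 0, 25, 4.5, 10)
    print('video with audio written to', mp4_path)
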