Spaces:
Runtime error
Runtime error
multimodalart HF Staff
Fix ghost loader: bind output visibility to mode, not to generate
aab0d80 verified | import spaces # noqa: F401 must precede torch / diffusers | |
| import subprocess | |
| import tempfile | |
| from pathlib import Path | |
| import gradio as gr | |
| import imageio.v3 as iio | |
| import numpy as np | |
| import torch | |
| from diffusers import Cosmos3OmniPipeline | |
| from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler | |
# Hugging Face Hub repository of the checkpoint served by this demo.
MODEL_ID = "nvidia/Cosmos3-Nano"

# The pipeline is built once at import time and shared by every request.
pipe = Cosmos3OmniPipeline.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, enable_safety_checker=True
)

# Swap in a UniPC multistep scheduler derived from the pipeline's own scheduler
# config, overriding only `flow_shift`.
pipe.scheduler = UniPCMultistepScheduler.from_config(
    pipe.scheduler.config,
    flow_shift=10.0,
)
pipe.to("cuda")

# Sampling rate reported by the sound tokenizer's config; used as the sample
# rate when generated audio is written to disk.
AUDIO_SR = int(pipe.sound_tokenizer.config.sampling_rate)
# Values of the "Mode" radio button; compared by equality throughout the file.
MODE_IMAGE = "Image"
MODE_VIDEO = "Video"

# Dropdown label -> (width, height) in pixels.
# Insertion order is the order in which the choices appear in the UI.
RESOLUTIONS = {
    "720p (1280x720, slow)": (1280, 720),
    "480p (832x480, fast)": (832, 480),
    "360p (640x352, fastest)": (640, 352),
}
def _duration(mode, prompt, image, resolution, num_frames, steps, *_):
    """Estimate a time budget, in seconds, for one generation request.

    The leading parameters mirror those of ``generate``; any further
    positional arguments are accepted and ignored. ``prompt`` and ``image``
    do not influence the estimate.

    The estimate is 60 seconds plus one second per 8,000,000 units of
    ``width * height * frames * steps`` (an image counts as a single frame),
    truncated to an integer and capped at 1200 seconds.
    """
    width, height = RESOLUTIONS[resolution]
    if mode == MODE_IMAGE:
        frame_count = 1
    else:
        frame_count = num_frames
    workload = width * height * frame_count * steps
    estimate = int(60 + workload / 8_000_000)
    return min(1200, estimate)
def _resolve_seed(seed, randomize_seed):
    """Return the integer seed to use for this request.

    A fresh seed in ``[0, 2**31 - 1)`` is drawn when randomization is requested
    or when no seed was supplied (the Seed box can be cleared, yielding
    ``None``); otherwise the supplied value is converted to ``int``.
    """
    if randomize_seed or seed is None:
        return int(torch.randint(0, 2**31 - 1, (1,)).item())
    return int(seed)


def _mux_audio(silent_path, sound, out_dir):
    """Write ``sound`` to a WAV file and mux it into the silent video.

    Returns the path of the new MP4 (video stream copied, audio encoded as
    AAC, trimmed to the shorter of the two streams).

    Raises:
        gr.Error: if ffmpeg is missing or exits with a non-zero status.
    """
    import soundfile as sf  # kept function-local: only needed when audio is requested

    wav = sound.detach().to(torch.float32).cpu().numpy()
    if wav.ndim == 2:
        # presumably (channels, samples) -> (samples, channels) for soundfile — TODO confirm
        wav = wav.T
    wav_path = out_dir / "audio.wav"
    sf.write(wav_path, wav, AUDIO_SR)

    muxed_path = out_dir / "video_with_audio.mp4"
    command = [
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", str(silent_path),
        "-i", str(wav_path),
        "-c:v", "copy", "-c:a", "aac", "-shortest",
        str(muxed_path),
    ]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError as err:
        raise gr.Error("ffmpeg is not available, so audio could not be added to the video.") from err
    except subprocess.CalledProcessError as err:
        raise gr.Error(f"ffmpeg failed while adding audio (exit code {err.returncode}).") from err
    return muxed_path


# Request a ZeroGPU allocation per call, with the time budget estimated from
# the request by `_duration`. Per the Spaces documentation the decorator has no
# effect on non-ZeroGPU hardware.
@spaces.GPU(duration=_duration)
def generate(
    mode,
    prompt,
    image,
    resolution,
    num_frames,
    steps,
    guidance,
    enable_sound,
    negative_prompt,
    seed,
    randomize_seed,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the pipeline once and save the result to a fresh temp directory.

    Args:
        mode: ``MODE_IMAGE`` for a single image, anything else for video.
        prompt: Text prompt; must contain non-whitespace characters.
        image: Optional conditioning image, forwarded to the pipeline as-is.
        resolution: Key into ``RESOLUTIONS`` selecting (width, height).
        num_frames: Frame count for video; ignored (forced to 1) for images.
        steps: Number of inference steps.
        guidance: Guidance scale.
        enable_sound: Whether to request audio; ignored in image mode.
        negative_prompt: Optional negative prompt; empty becomes ``None``.
        seed: Seed to use when ``randomize_seed`` is false. ``None`` falls
            back to a random seed.
        randomize_seed: Draw a random seed instead of using ``seed``.
        progress: Gradio progress tracker (mirrors tqdm bars); not used
            directly in the body.

    Returns:
        ``(image_path, video_path, seed)`` where exactly one of the two paths
        is a string and the other is ``None``; ``seed`` is the integer seed
        actually used.

    Raises:
        gr.Error: if the prompt is empty or muxing audio with ffmpeg fails.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")

    width, height = RESOLUTIONS[resolution]
    seed = _resolve_seed(seed, randomize_seed)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    fps = 24
    is_image = mode == MODE_IMAGE
    # Audio is only ever requested for video output.
    sound = bool(enable_sound) and not is_image

    result = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt or None,
        image=image,
        num_frames=1 if is_image else int(num_frames),
        height=height,
        width=width,
        fps=fps,
        num_inference_steps=int(steps),
        guidance_scale=float(guidance),
        enable_sound=sound,
        generator=generator,
        output_type="pil",
    )

    # NOTE: the directory is not removed here; the returned file must still
    # exist after this function returns.
    out_dir = Path(tempfile.mkdtemp(prefix="cosmos3_"))

    if is_image:
        # Image mode: the single generated frame is the image.
        img_path = out_dir / "image.png"
        result.video[0].save(img_path)
        return str(img_path), None, seed

    frames = np.stack([np.asarray(frame) for frame in result.video], axis=0)
    video_path = out_dir / "video.mp4"
    iio.imwrite(video_path, frames, fps=fps, codec="libx264")

    if sound and result.sound is not None:
        video_path = _mux_audio(video_path, result.sound, out_dir)

    return None, str(video_path), seed
# Example prompt for text-to-image.
EXAMPLE_T2I = (
    "A close-up portrait of an astronaut in a jungle, "
    "cold color palette, muted colors, detailed, 8k."
)

# Example prompt for text-to-video; also the prompt box's initial value.
EXAMPLE_T2V = (
    "A cinematic wide shot of a vintage red convertible driving along a coastal highway at sunset. "
    "Waves crash against rocks below as the car speeds toward the horizon. "
    "Golden hour lighting, lens flare, smooth tracking shot."
)

# Example prompt for text-to-video with generated audio.
EXAMPLE_T2VS = (
    "A close-up of rain hitting a window at night, neon city lights blurred in the background. "
    "Soft ambient sound of rain and distant traffic."
)
def _on_mode_change(mode):
    """Return UI updates for a change of the Mode radio.

    Returns four updates, in the order the event is wired:
    ``(enable_sound, num_frames, out_image, out_video)``.

    * Image mode hides the audio checkbox (forcing it to ``False``) and the
      frame slider, and shows the image output.
    * Video mode shows the audio checkbox and frame slider, and shows the
      video output.

    Both outputs are cleared on every switch.
    """
    is_image = mode == MODE_IMAGE
    if is_image:
        sound_update = gr.update(visible=False, value=False)
    else:
        # Deliberately no `value` here: `gr.update(value=None)` is not
        # "unchanged" — it sets the checkbox value to None, which would clobber
        # a value that was just set together with the mode (e.g. by an example).
        sound_update = gr.update(visible=True)
    return (
        sound_update,                                # sound
        gr.update(visible=not is_image),             # num_frames
        gr.update(visible=is_image, value=None),     # out_image
        gr.update(visible=not is_image, value=None), # out_video
    )
# NOTE(review): the nesting below (what sits inside the accordion, the columns
# and the row) follows the usual Gradio layout — confirm against the deployed
# file.
with gr.Blocks(title="Cosmos3-Nano") as demo:
    gr.Markdown(
        """
        # NVIDIA Cosmos3-Nano — omnimodal world model
        Generate images or videos (with optional conditioning image and audio) using
        [`nvidia/Cosmos3-Nano`](https://huggingface.co/nvidia/Cosmos3-Nano)
        (16B params) via the Diffusers `Cosmos3OmniPipeline`.
        """
    )

    with gr.Row():
        # Left column: everything the user can set for a request.
        with gr.Column(scale=1):
            mode = gr.Radio(
                label="Mode",
                choices=[MODE_IMAGE, MODE_VIDEO],
                value=MODE_VIDEO,
            )
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=EXAMPLE_T2V,
                placeholder="Describe what to generate...",
            )
            image = gr.Image(
                label="Conditioning image (optional)",
                type="pil",
                height=240,
            )
            enable_sound = gr.Checkbox(label="Generate audio", value=False)

            with gr.Accordion("Advanced settings", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative prompt",
                    lines=2,
                    value="",
                )
                resolution = gr.Dropdown(
                    label="Resolution",
                    choices=list(RESOLUTIONS),
                    value="480p (832x480, fast)",
                )
                num_frames = gr.Slider(
                    label="Frames (24 fps)",
                    minimum=33,
                    maximum=189,
                    value=65,
                    step=4,
                )
                steps = gr.Slider(
                    label="Inference steps",
                    minimum=15,
                    maximum=50,
                    value=25,
                    step=1,
                )
                guidance = gr.Slider(
                    label="Guidance scale",
                    minimum=1.0,
                    maximum=10.0,
                    value=6.0,
                    step=0.5,
                )
                seed = gr.Number(label="Seed", value=0, precision=0)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            run = gr.Button("Generate", variant="primary")

        # Right column: results. The initial visibility matches the default
        # mode (video shown, image hidden).
        with gr.Column(scale=1):
            out_image = gr.Image(label="Generated image", visible=False)
            out_video = gr.Video(label="Generated video", autoplay=True, visible=True)
            used_seed = gr.Number(label="Seed used", interactive=False)

    # Each example row fills (mode, prompt, enable_sound).
    gr.Examples(
        examples=[
            [MODE_IMAGE, EXAMPLE_T2I, False],
            [MODE_VIDEO, EXAMPLE_T2V, False],
            [MODE_VIDEO, EXAMPLE_T2VS, True],
        ],
        inputs=[mode, prompt, enable_sound],
        label="Examples",
    )

    # Switching mode toggles which controls and which output are visible.
    mode.change(
        fn=_on_mode_change,
        inputs=[mode],
        outputs=[enable_sound, num_frames, out_image, out_video],
    )

    # The input order must match the positional parameters of `generate`.
    run.click(
        fn=generate,
        inputs=[
            mode,
            prompt,
            image,
            resolution,
            num_frames,
            steps,
            guidance,
            enable_sound,
            negative_prompt,
            seed,
            randomize_seed,
        ],
        outputs=[out_image, out_video, used_seed],
    )
if __name__ == "__main__":
    # Enable the request queue with max_size=10, then start the app with the
    # Citrus theme.
    demo.queue(max_size=10)
    demo.launch(theme=gr.themes.Citrus())