"""Gradio demo for ``nvidia/Cosmos3-Nano``.

Generates a single image or a video (optionally with an audio track) from a
text prompt and an optional conditioning image, using the Diffusers
``Cosmos3OmniPipeline``.
"""

import spaces  # noqa: F401  must precede torch / diffusers

import subprocess
import tempfile
from pathlib import Path

import gradio as gr
import imageio.v3 as iio
import numpy as np
import torch
from diffusers import Cosmos3OmniPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

MODEL_ID = "nvidia/Cosmos3-Nano"

# The pipeline is built once at import time and shared by every request.
pipe = Cosmos3OmniPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    enable_safety_checker=True,
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=10.0)
pipe.to("cuda")

# Sample rate used when writing the generated audio to a WAV file.
AUDIO_SR = int(pipe.sound_tokenizer.config.sampling_rate)

MODE_IMAGE = "Image"
MODE_VIDEO = "Video"

# Dropdown label -> (width, height) in pixels.
RESOLUTIONS = {
    "720p (1280x720, slow)": (1280, 720),
    "480p (832x480, fast)": (832, 480),
    "360p (640x352, fastest)": (640, 352),
}


def _duration(mode, prompt, image, resolution, num_frames, steps, *_):
    """Return the GPU duration to request for one ``generate`` call.

    Passed as ``duration=`` to ``spaces.GPU`` and declared with the same
    leading positional parameters as ``generate``. Only ``mode``,
    ``resolution``, ``num_frames`` and ``steps`` influence the result.

    The estimate is ``60 + width * height * frames * steps / 8_000_000``,
    truncated to an int and capped at 1200. Image mode counts as one frame.
    """
    width, height = RESOLUTIONS[resolution]
    frames = 1 if mode == MODE_IMAGE else num_frames
    pixel_steps = width * height * frames * steps
    return min(1200, int(60 + pixel_steps / 8_000_000))


@spaces.GPU(duration=_duration)
def generate(
    mode,
    prompt,
    image,
    resolution,
    num_frames,
    steps,
    guidance,
    enable_sound,
    negative_prompt,
    seed,
    randomize_seed,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the pipeline and return ``(image_path, video_path, seed)``.

    Exactly one of ``image_path`` / ``video_path`` is a string; the other is
    ``None`` so the unused output component is cleared.

    Args:
        mode: ``MODE_IMAGE`` or ``MODE_VIDEO``.
        prompt: Text prompt; must contain non-whitespace characters.
        image: Optional conditioning image forwarded to the pipeline.
        resolution: A key of ``RESOLUTIONS``.
        num_frames: Frame count for video mode (ignored in image mode).
        steps: Number of inference steps.
        guidance: Guidance scale.
        enable_sound: Whether to request audio (ignored in image mode).
        negative_prompt: Optional negative prompt; empty means none.
        seed: Seed to use when ``randomize_seed`` is false. ``None`` (e.g. a
            cleared number field) is treated as a request for a random seed.
        randomize_seed: Draw a fresh seed instead of using ``seed``.
        progress: Gradio progress tracker (mirrors tqdm progress).

    Raises:
        gr.Error: If the prompt is empty or whitespace-only.
        subprocess.CalledProcessError: If ffmpeg fails while muxing audio.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")

    width, height = RESOLUTIONS[resolution]

    # A cleared seed field arrives as None; fall back to a random seed rather
    # than failing on int(None).
    if randomize_seed or seed is None:
        seed = torch.randint(0, 2**31 - 1, (1,)).item()
    seed = int(seed)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    fps = 24
    is_image = mode == MODE_IMAGE
    # Audio is only requested for video output.
    sound = bool(enable_sound) and not is_image

    result = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt or None,
        image=image,
        num_frames=1 if is_image else int(num_frames),
        height=height,
        width=width,
        fps=fps,
        num_inference_steps=int(steps),
        guidance_scale=float(guidance),
        enable_sound=sound,
        generator=generator,
        output_type="pil",
    )

    # The directory is intentionally left on disk: Gradio serves the returned
    # file path after this function has returned.
    out_dir = Path(tempfile.mkdtemp(prefix="cosmos3_"))

    if is_image:
        img = result.video[0]
        img_path = out_dir / "image.png"
        img.save(img_path)
        return str(img_path), None, seed

    frames = np.stack([np.asarray(f) for f in result.video], axis=0)
    silent_path = out_dir / "video.mp4"
    iio.imwrite(silent_path, frames, fps=fps, codec="libx264")

    video_path = silent_path
    if sound and result.sound is not None:
        wav = result.sound.detach().to(torch.float32).cpu().numpy()
        if wav.ndim == 2:
            # Transposed before writing; presumably (channels, samples) ->
            # (samples, channels) as soundfile expects -- TODO confirm.
            wav = wav.T
        wav_path = out_dir / "audio.wav"

        import soundfile as sf  # imported lazily; only needed for audio output

        sf.write(wav_path, wav, AUDIO_SR)

        video_path = out_dir / "video_with_audio.mp4"
        # Copy the video stream untouched, encode audio as AAC, and stop at
        # the shorter of the two streams.
        subprocess.run(
            [
                "ffmpeg",
                "-y",
                "-loglevel",
                "error",
                "-i",
                str(silent_path),
                "-i",
                str(wav_path),
                "-c:v",
                "copy",
                "-c:a",
                "aac",
                "-shortest",
                str(video_path),
            ],
            check=True,
        )

    return None, str(video_path), seed


EXAMPLE_T2I = (
    "A close-up portrait of an astronaut in a jungle, cold color palette, "
    "muted colors, detailed, 8k."
)
EXAMPLE_T2V = (
    "A cinematic wide shot of a vintage red convertible driving along a "
    "coastal highway at sunset. Waves crash against rocks below as the car "
    "speeds toward the horizon. Golden hour lighting, lens flare, smooth "
    "tracking shot."
)
EXAMPLE_T2VS = (
    "A close-up of rain hitting a window at night, neon city lights blurred "
    "in the background. Soft ambient sound of rain and distant traffic."
)


def _on_mode_change(mode):
    """Return component updates for a change of the mode radio.

    The returned tuple matches the outputs
    ``(enable_sound, num_frames, out_image, out_video)``:

    * the audio checkbox and frame slider are shown only in video mode;
    * the audio checkbox is forced to ``False`` in image mode and left
      untouched in video mode;
    * both output components are cleared and only the one matching the mode
      is shown.
    """
    is_image = mode == MODE_IMAGE

    if is_image:
        sound_update = gr.update(visible=False, value=False)
    else:
        # No ``value`` key here: an explicit ``value=None`` would overwrite
        # the checkbox (e.g. a value just loaded from an example).
        sound_update = gr.update(visible=True)

    return (
        sound_update,  # sound
        gr.update(visible=not is_image),  # num_frames
        gr.update(visible=is_image, value=None),  # out_image
        gr.update(visible=not is_image, value=None),  # out_video
    )


with gr.Blocks(title="Cosmos3-Nano") as demo:
    gr.Markdown(
        """
        # NVIDIA Cosmos3-Nano — omnimodal world model

        Generate images or videos (with optional conditioning image and audio) using
        [`nvidia/Cosmos3-Nano`](https://huggingface.co/nvidia/Cosmos3-Nano) (16B params) via
        the Diffusers `Cosmos3OmniPipeline`.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            mode = gr.Radio(
                label="Mode",
                choices=[MODE_IMAGE, MODE_VIDEO],
                value=MODE_VIDEO,
            )
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=EXAMPLE_T2V,
                placeholder="Describe what to generate...",
            )
            image = gr.Image(
                label="Conditioning image (optional)",
                type="pil",
                height=240,
            )
            enable_sound = gr.Checkbox(label="Generate audio", value=False)

            with gr.Accordion("Advanced settings", open=False):
                negative_prompt = gr.Textbox(label="Negative prompt", lines=2, value="")
                resolution = gr.Dropdown(
                    label="Resolution",
                    choices=list(RESOLUTIONS.keys()),
                    value="480p (832x480, fast)",
                )
                num_frames = gr.Slider(
                    label="Frames (24 fps)", minimum=33, maximum=189, value=65, step=4
                )
                steps = gr.Slider(
                    label="Inference steps", minimum=15, maximum=50, value=25, step=1
                )
                guidance = gr.Slider(
                    label="Guidance scale", minimum=1.0, maximum=10.0, value=6.0, step=0.5
                )
                seed = gr.Number(label="Seed", value=0, precision=0)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            run = gr.Button("Generate", variant="primary")

        with gr.Column(scale=1):
            # Initial visibility matches the default mode (video).
            out_image = gr.Image(label="Generated image", visible=False)
            out_video = gr.Video(label="Generated video", autoplay=True, visible=True)
            used_seed = gr.Number(label="Seed used", interactive=False)

    gr.Examples(
        examples=[
            [MODE_IMAGE, EXAMPLE_T2I, False],
            [MODE_VIDEO, EXAMPLE_T2V, False],
            [MODE_VIDEO, EXAMPLE_T2VS, True],
        ],
        inputs=[mode, prompt, enable_sound],
        label="Examples",
    )

    mode.change(
        fn=_on_mode_change,
        inputs=[mode],
        outputs=[enable_sound, num_frames, out_image, out_video],
    )

    run.click(
        fn=generate,
        inputs=[
            mode,
            prompt,
            image,
            resolution,
            num_frames,
            steps,
            guidance,
            enable_sound,
            negative_prompt,
            seed,
            randomize_seed,
        ],
        outputs=[out_image, out_video, used_seed],
    )


if __name__ == "__main__":
    demo.queue(max_size=10).launch(theme=gr.themes.Citrus())