# Cosmos3-Nano / app.py
# Hugging Face Space by multimodalart (HF Staff).
# Last commit aab0d80 (verified):
#   "Fix ghost loader: bind output visibility to mode, not to generate"
import spaces # noqa: F401 must precede torch / diffusers
import subprocess
import tempfile
from pathlib import Path
import gradio as gr
import imageio.v3 as iio
import numpy as np
import torch
from diffusers import Cosmos3OmniPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
# Hub repository the pipeline weights are loaded from.
MODEL_ID = "nvidia/Cosmos3-Nano"

# Built once at import time: bfloat16 weights, safety checker enabled.
pipe = Cosmos3OmniPipeline.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    enable_safety_checker=True,
)

# Swap in a UniPC multistep scheduler derived from the pipeline's own
# scheduler config, overriding only flow_shift.
pipe.scheduler = UniPCMultistepScheduler.from_config(
    pipe.scheduler.config,
    flow_shift=10.0,
)
pipe.to("cuda")

# Sampling rate read from the sound tokenizer's config, coerced to int.
AUDIO_SR = int(pipe.sound_tokenizer.config.sampling_rate)
# The two values the "Mode" selector can take; compared by equality elsewhere.
MODE_IMAGE, MODE_VIDEO = "Image", "Video"

# Resolution label -> (width, height) in pixels. Insertion order is the order
# in which the labels are listed.
RESOLUTIONS = {
    "720p (1280x720, slow)": (1280, 720),
    "480p (832x480, fast)": (832, 480),
    "360p (640x352, fastest)": (640, 352),
}
def _duration(mode, prompt, image, resolution, num_frames, steps, *_):
    """Estimate how many seconds of GPU time one generation call should get.

    The signature mirrors the leading parameters of ``generate``; any further
    positional arguments are accepted and ignored. Only ``mode``,
    ``resolution``, ``num_frames`` and ``steps`` influence the result.

    Returns:
        60 plus one second per 8,000,000 units of
        ``width * height * frames * steps``, truncated to an int and capped
        at 1200. Image mode counts as a single frame.
    """
    width, height = RESOLUTIONS[resolution]
    frame_count = num_frames if mode != MODE_IMAGE else 1
    workload = width * height * frame_count * steps
    estimate = int(60 + workload / 8_000_000)
    if estimate > 1200:
        return 1200
    return estimate
@spaces.GPU(duration=_duration)
def generate(
    mode,
    prompt,
    image,
    resolution,
    num_frames,
    steps,
    guidance,
    enable_sound,
    negative_prompt,
    seed,
    randomize_seed,
    progress=gr.Progress(track_tqdm=True),
):
    """Run the pipeline once and write the result to a fresh temp directory.

    Args:
        mode: ``MODE_IMAGE`` yields a single frame saved as PNG; any other
            value is handled as video.
        prompt: Text prompt; must contain at least one non-whitespace char.
        image: Optional conditioning image, forwarded to the pipeline as-is.
        resolution: Key into ``RESOLUTIONS`` selecting ``(width, height)``.
        num_frames: Number of frames in video mode (image mode always uses 1).
        steps: Number of inference steps.
        guidance: Guidance scale.
        enable_sound: Whether to request audio; ignored in image mode.
        negative_prompt: Negative prompt; an empty value is passed as ``None``.
        seed: Seed used when ``randomize_seed`` is false. ``None`` (for
            example a cleared number field) is treated like a request for a
            random seed instead of crashing on ``int(None)``.
        randomize_seed: When true, ``seed`` is ignored and a random one drawn.
        progress: Gradio progress object created with ``track_tqdm=True``;
            it is not referenced in the body.

    Returns:
        ``(image_path, video_path, seed)`` where exactly one of the two paths
        is a string and the other is ``None``; ``seed`` is the integer seed
        actually used.

    Raises:
        gr.Error: If ``prompt`` is empty or whitespace only.
        KeyError: If ``resolution`` is not a key of ``RESOLUTIONS``.
        subprocess.CalledProcessError: If the ffmpeg mux step exits non-zero.
    """
    if not prompt or not prompt.strip():
        raise gr.Error("Please enter a prompt.")

    width, height = RESOLUTIONS[resolution]

    # A missing seed cannot be converted with int(); fall back to a random one
    # rather than surfacing a TypeError to the user.
    if randomize_seed or seed is None:
        seed = int(torch.randint(0, 2**31 - 1, (1,)).item())
    seed = int(seed)
    generator = torch.Generator(device="cuda").manual_seed(seed)

    fps = 24
    is_image = mode == MODE_IMAGE
    # Audio is only ever requested for video output.
    sound = bool(enable_sound) and not is_image

    result = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt or None,
        image=image,
        num_frames=1 if is_image else int(num_frames),
        height=height,
        width=width,
        fps=fps,
        num_inference_steps=int(steps),
        guidance_scale=float(guidance),
        enable_sound=sound,
        generator=generator,
        output_type="pil",
    )

    # The directory is not removed here; the returned path must outlive this
    # call so the caller can read the file.
    out_dir = Path(tempfile.mkdtemp(prefix="cosmos3_"))

    if is_image:
        # NOTE(review): result.video is indexed/iterated as a sequence of
        # PIL-like frames (output_type="pil") — confirm against the pipeline.
        img = result.video[0]
        img_path = out_dir / "image.png"
        img.save(img_path)
        return str(img_path), None, seed

    frames = np.stack([np.asarray(f) for f in result.video], axis=0)
    silent_path = out_dir / "video.mp4"
    iio.imwrite(silent_path, frames, fps=fps, codec="libx264")
    video_path = silent_path

    if sound and result.sound is not None:
        wav = result.sound.detach().to(torch.float32).cpu().numpy()
        if wav.ndim == 2:
            # NOTE(review): presumably (channels, samples) -> (samples,
            # channels) for the writer — confirm the pipeline's layout.
            wav = wav.T
        wav_path = out_dir / "audio.wav"

        # Imported lazily so the dependency is only needed when audio is made.
        import soundfile as sf

        sf.write(wav_path, wav, AUDIO_SR)
        video_path = out_dir / "video_with_audio.mp4"
        # Copy the video stream untouched, encode audio as AAC, and stop at
        # the shorter of the two streams.
        subprocess.run(
            [
                "ffmpeg", "-y", "-loglevel", "error",
                "-i", str(silent_path),
                "-i", str(wav_path),
                "-c:v", "copy", "-c:a", "aac", "-shortest",
                str(video_path),
            ],
            check=True,
        )

    return None, str(video_path), seed
# Example prompt for image mode.
EXAMPLE_T2I = (
    "A close-up portrait of an astronaut in a jungle, "
    "cold color palette, muted colors, detailed, 8k."
)

# Example prompt for silent video mode (also the prompt box's initial value).
EXAMPLE_T2V = (
    "A cinematic wide shot of a vintage red convertible driving along a coastal highway at sunset. "
    "Waves crash against rocks below as the car speeds toward the horizon. "
    "Golden hour lighting, lens flare, smooth tracking shot."
)

# Example prompt for video mode with audio enabled.
EXAMPLE_T2VS = (
    "A close-up of rain hitting a window at night, neon city lights blurred in the background. "
    "Soft ambient sound of rain and distant traffic."
)
def _on_mode_change(mode):
    """Build the component updates to apply when the mode selector changes.

    Args:
        mode: The newly selected mode; ``MODE_IMAGE`` selects image mode, any
            other value is handled as video mode.

    Returns:
        A 4-tuple of ``gr.update`` payloads, in this order: audio checkbox,
        frame-count slider, image output, video output.
    """
    is_image = mode == MODE_IMAGE
    is_video = not is_image

    # Force the audio checkbox off only when entering image mode. For video
    # mode the "value" key is left out entirely: an explicit value=None is
    # itself a value update and would reset the checkbox, clobbering a value
    # set at the same time as the mode (e.g. by an example row).
    sound_update = {"visible": is_video}
    if is_image:
        sound_update["value"] = False

    return (
        gr.update(**sound_update),  # sound
        gr.update(visible=is_video),  # num_frames
        # Both outputs are cleared on purpose so no stale result lingers.
        gr.update(visible=is_image, value=None),  # out_image
        gr.update(visible=is_video, value=None),  # out_video
    )
# NOTE(review): the nesting below was reconstructed from a flattened source.
# Confirm that the accordion ends before the Generate button and that the
# examples and event wiring sit directly under the Blocks context.
with gr.Blocks(title="Cosmos3-Nano") as demo:
    gr.Markdown(
        """
# NVIDIA Cosmos3-Nano — omnimodal world model
Generate images or videos (with optional conditioning image and audio) using
[`nvidia/Cosmos3-Nano`](https://huggingface.co/nvidia/Cosmos3-Nano)
(16B params) via the Diffusers `Cosmos3OmniPipeline`.
"""
    )

    with gr.Row():
        # Left column: every input control plus the trigger button.
        with gr.Column(scale=1):
            mode = gr.Radio(
                label="Mode",
                choices=[MODE_IMAGE, MODE_VIDEO],
                value=MODE_VIDEO,
            )
            prompt = gr.Textbox(
                label="Prompt",
                lines=4,
                value=EXAMPLE_T2V,
                placeholder="Describe what to generate...",
            )
            image = gr.Image(
                label="Conditioning image (optional)",
                type="pil",
                height=240,
            )
            enable_sound = gr.Checkbox(label="Generate audio", value=False)

            with gr.Accordion("Advanced settings", open=False):
                negative_prompt = gr.Textbox(
                    label="Negative prompt",
                    lines=2,
                    value="",
                )
                resolution = gr.Dropdown(
                    label="Resolution",
                    choices=list(RESOLUTIONS),
                    value="480p (832x480, fast)",
                )
                num_frames = gr.Slider(
                    label="Frames (24 fps)",
                    minimum=33,
                    maximum=189,
                    value=65,
                    step=4,
                )
                steps = gr.Slider(
                    label="Inference steps",
                    minimum=15,
                    maximum=50,
                    value=25,
                    step=1,
                )
                guidance = gr.Slider(
                    label="Guidance scale",
                    minimum=1.0,
                    maximum=10.0,
                    value=6.0,
                    step=0.5,
                )
                seed = gr.Number(label="Seed", value=0, precision=0)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            run = gr.Button("Generate", variant="primary")

        # Right column: outputs. Video is the default mode, so the image
        # output starts hidden and the video output starts visible.
        with gr.Column(scale=1):
            out_image = gr.Image(label="Generated image", visible=False)
            out_video = gr.Video(
                label="Generated video",
                autoplay=True,
                visible=True,
            )
            used_seed = gr.Number(label="Seed used", interactive=False)

    # Each example row fills mode, prompt and the audio checkbox.
    gr.Examples(
        examples=[
            [MODE_IMAGE, EXAMPLE_T2I, False],
            [MODE_VIDEO, EXAMPLE_T2V, False],
            [MODE_VIDEO, EXAMPLE_T2VS, True],
        ],
        inputs=[mode, prompt, enable_sound],
        label="Examples",
    )

    # Visibility of mode-dependent widgets follows the mode selector.
    mode.change(
        fn=_on_mode_change,
        inputs=[mode],
        outputs=[enable_sound, num_frames, out_image, out_video],
    )

    # Input order must match the positional parameters of generate().
    run.click(
        fn=generate,
        inputs=[
            mode,
            prompt,
            image,
            resolution,
            num_frames,
            steps,
            guidance,
            enable_sound,
            negative_prompt,
            seed,
            randomize_seed,
        ],
        outputs=[out_image, out_video, used_seed],
    )
if __name__ == "__main__":
    # Enable the request queue (at most 10 waiting jobs), then start the app
    # with the Citrus theme.
    queued_app = demo.queue(max_size=10)
    queued_app.launch(theme=gr.themes.Citrus())