Cosmos3-Nano

Runtime error

App Files Files Community

Cosmos3-Nano / app.py

multimodalart HF Staff

Fix ghost loader: bind output visibility to mode, not to generate

aab0d80 verified 8 days ago

raw

history blame contribute delete

7.17 kB

	import spaces # noqa: F401 must precede torch / diffusers

	import subprocess
	import tempfile
	from pathlib import Path

	import gradio as gr
	import imageio.v3 as iio
	import numpy as np
	import torch
	from diffusers import Cosmos3OmniPipeline
	from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler

	# Hub repository id of the checkpoint loaded below.
	MODEL_ID = "nvidia/Cosmos3-Nano"

	# Load the pipeline once at import time (bfloat16 weights, safety checker
	# enabled); `generate` reuses this module-level instance on every request.
	pipe = Cosmos3OmniPipeline.from_pretrained(
	MODEL_ID,
	torch_dtype=torch.bfloat16,
	enable_safety_checker=True,
	)
	# Swap in a UniPC multistep scheduler built from the pipeline's existing
	# scheduler config, overriding flow_shift to 10.0.
	pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=10.0)
	pipe.to("cuda")

	# Sampling rate reported by the pipeline's sound tokenizer config; used as
	# the sample rate when `generate` writes the audio track to a WAV file.
	AUDIO_SR = int(pipe.sound_tokenizer.config.sampling_rate)

	# Mode labels shared by the UI radio button and the generation logic.
	MODE_IMAGE = "Image"
	MODE_VIDEO = "Video"

	# Resolution dropdown label -> (width, height) tuple.
	RESOLUTIONS = {
	"720p (1280x720, slow)": (1280, 720),
	"480p (832x480, fast)": (832, 480),
	"360p (640x352, fastest)": (640, 352),
	}


	def _duration(mode, prompt, image, resolution, num_frames, steps, *_):
	    """Estimate how many seconds of GPU time a request should reserve.

	    Passed as the ``duration`` callable to ``spaces.GPU`` on ``generate``,
	    so it takes the same leading arguments. Only ``mode``, ``resolution``,
	    ``num_frames`` and ``steps`` influence the result; ``prompt``,
	    ``image`` and any trailing arguments are ignored.

	    Returns:
	        An int: 60 plus one unit per 8,000,000 "pixel-steps"
	        (width * height * frames * steps), truncated, and capped at 1200.
	        Image mode counts as a single frame.
	    """
	    width, height = RESOLUTIONS[resolution]
	    frame_count = num_frames if mode != MODE_IMAGE else 1
	    workload = width * height * frame_count * steps
	    estimate = int(60 + workload / 8_000_000)
	    return estimate if estimate < 1200 else 1200


	@spaces.GPU(duration=_duration)
	def generate(
	    mode,
	    prompt,
	    image,
	    resolution,
	    num_frames,
	    steps,
	    guidance,
	    enable_sound,
	    negative_prompt,
	    seed,
	    randomize_seed,
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Run the pipeline once and write the result into a fresh temp directory.

	    Args:
	        mode: ``MODE_IMAGE`` for a single frame, anything else for video.
	        prompt: Text prompt; must contain non-whitespace characters.
	        image: Optional conditioning image, forwarded to the pipeline as-is.
	        resolution: Key into ``RESOLUTIONS`` selecting (width, height).
	        num_frames: Frame count for video mode (ignored in image mode).
	        steps: Number of inference steps.
	        guidance: Guidance scale.
	        enable_sound: Request an audio track (ignored in image mode).
	        negative_prompt: Optional negative prompt; empty is sent as ``None``.
	        seed: Seed to use when ``randomize_seed`` is false. ``None`` (a
	            cleared Seed field) is treated like ``randomize_seed=True``.
	        randomize_seed: Draw a fresh random seed instead of using ``seed``.
	        progress: Not read directly; declared so Gradio tracks tqdm progress.

	    Returns:
	        ``(image_path, video_path, seed)``. Exactly one of the two paths is
	        a string and the other is ``None``; ``seed`` is the int actually used.

	    Raises:
	        gr.Error: If ``prompt`` is empty or whitespace only.
	        subprocess.CalledProcessError: If ffmpeg fails while muxing audio.
	    """
	    if not prompt or not prompt.strip():
	        raise gr.Error("Please enter a prompt.")

	    width, height = RESOLUTIONS[resolution]

	    # A cleared Seed field arrives as None; int(None) would raise a bare
	    # TypeError, so fall back to a random seed in that case as well.
	    if randomize_seed or seed is None:
	        seed = int(torch.randint(0, 2**31 - 1, (1,)).item())
	    seed = int(seed)
	    generator = torch.Generator(device="cuda").manual_seed(seed)

	    fps = 24
	    is_image = mode == MODE_IMAGE
	    # Audio is only requested for video output.
	    sound = bool(enable_sound) and not is_image

	    result = pipe(
	        prompt=prompt,
	        negative_prompt=negative_prompt or None,
	        image=image,
	        num_frames=1 if is_image else int(num_frames),
	        height=height,
	        width=width,
	        fps=fps,
	        num_inference_steps=int(steps),
	        guidance_scale=float(guidance),
	        enable_sound=sound,
	        generator=generator,
	        output_type="pil",
	    )

	    # The directory is intentionally not removed here: the returned paths
	    # must still exist after this function returns.
	    out_dir = Path(tempfile.mkdtemp(prefix="cosmos3_"))

	    if is_image:
	        # Image mode asked for one frame; save the first entry as PNG.
	        img = result.video[0]
	        img_path = out_dir / "image.png"
	        img.save(img_path)
	        return str(img_path), None, seed

	    # NOTE(review): assumes result.video iterates over same-sized PIL
	    # frames so they stack into one array -- confirm against the pipeline.
	    frames = np.stack([np.asarray(f) for f in result.video], axis=0)
	    silent_path = out_dir / "video.mp4"
	    iio.imwrite(silent_path, frames, fps=fps, codec="libx264")

	    video_path = silent_path
	    if sound and result.sound is not None:
	        wav = result.sound.detach().to(torch.float32).cpu().numpy()
	        if wav.ndim == 2:
	            # Presumably (channels, samples) -> (samples, channels), the
	            # layout soundfile writes; verify the pipeline's audio layout.
	            wav = wav.T
	        wav_path = out_dir / "audio.wav"
	        import soundfile as sf

	        sf.write(wav_path, wav, AUDIO_SR)

	        # Mux audio onto the already-encoded video: copy the video stream,
	        # encode audio as AAC, and stop at the shorter of the two streams.
	        video_path = out_dir / "video_with_audio.mp4"
	        subprocess.run(
	            [
	                "ffmpeg", "-y", "-loglevel", "error",
	                "-i", str(silent_path),
	                "-i", str(wav_path),
	                "-c:v", "copy", "-c:a", "aac", "-shortest",
	                str(video_path),
	            ],
	            check=True,
	        )

	    return None, str(video_path), seed


	# Example prompts offered in the UI: text-to-image, text-to-video, and
	# text-to-video with sound. Each is a single string assembled from
	# adjacent literals.
	EXAMPLE_T2I = (
	    "A close-up portrait of an astronaut in a jungle, "
	    "cold color palette, muted colors, detailed, 8k."
	)
	EXAMPLE_T2V = (
	    "A cinematic wide shot of a vintage red convertible "
	    "driving along a coastal highway at sunset. "
	    "Waves crash against rocks below as the car speeds toward the horizon. "
	    "Golden hour lighting, lens flare, smooth tracking shot."
	)
	EXAMPLE_T2VS = (
	    "A close-up of rain hitting a window at night, "
	    "neon city lights blurred in the background. "
	    "Soft ambient sound of rain and distant traffic."
	)


	def _on_mode_change(mode):
	    """Build the component updates applied when the mode radio changes.

	    Args:
	        mode: The newly selected mode label (``MODE_IMAGE`` or other).

	    Returns:
	        A 4-tuple of ``gr.update`` dicts, in the order the ``mode.change``
	        handler lists its outputs: sound checkbox, frame-count slider,
	        image output, video output. Image mode hides the video-only
	        controls and forces sound off; either mode shows only the matching
	        output widget and clears both outputs.
	    """
	    is_image = mode == MODE_IMAGE
	    if is_image:
	        # Audio only applies to video: hide the checkbox and force it off.
	        sound_update = gr.update(visible=False, value=False)
	    else:
	        # Leave `value` out entirely. An explicit value=None is itself a
	        # value update and would wipe the checkbox, including a value that
	        # an example row has just filled in.
	        sound_update = gr.update(visible=True)
	    return (
	        sound_update,  # sound
	        gr.update(visible=not is_image),  # num_frames
	        gr.update(visible=is_image, value=None),  # out_image
	        gr.update(visible=not is_image, value=None),  # out_video
	    )


	# Build the Gradio UI: input controls, output widgets, example prompts, and
	# the two event handlers that connect them to the functions in this module.
	with gr.Blocks(title="Cosmos3-Nano") as demo:
	gr.Markdown(
	"""
	# NVIDIA Cosmos3-Nano — omnimodal world model
	Generate images or videos (with optional conditioning image and audio) using
	[`nvidia/Cosmos3-Nano`](https://huggingface.co/nvidia/Cosmos3-Nano)
	(16B params) via the Diffusers `Cosmos3OmniPipeline`.
	"""
	)

	with gr.Row():
	with gr.Column(scale=1):
	# Mode selector; the app starts in video mode.
	mode = gr.Radio(
	label="Mode",
	choices=[MODE_IMAGE, MODE_VIDEO],
	value=MODE_VIDEO,
	)
	# Prompt box, pre-filled with the text-to-video example.
	prompt = gr.Textbox(
	label="Prompt", lines=4, value=EXAMPLE_T2V,
	placeholder="Describe what to generate...",
	)
	# Optional conditioning image, delivered to `generate` as a PIL image.
	image = gr.Image(
	label="Conditioning image (optional)", type="pil", height=240,
	)
	enable_sound = gr.Checkbox(label="Generate audio", value=False)
	with gr.Accordion("Advanced settings", open=False):
	negative_prompt = gr.Textbox(label="Negative prompt", lines=2, value="")
	# Choices are the RESOLUTIONS keys, so the selected label can be used
	# directly as a lookup key.
	resolution = gr.Dropdown(
	label="Resolution",
	choices=list(RESOLUTIONS.keys()),
	value="480p (832x480, fast)",
	)
	num_frames = gr.Slider(
	label="Frames (24 fps)", minimum=33, maximum=189, value=65, step=4
	)
	steps = gr.Slider(
	label="Inference steps", minimum=15, maximum=50, value=25, step=1
	)
	guidance = gr.Slider(
	label="Guidance scale", minimum=1.0, maximum=10.0, value=6.0, step=0.5
	)
	seed = gr.Number(label="Seed", value=0, precision=0)
	randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
	run = gr.Button("Generate", variant="primary")

	with gr.Column(scale=1):
	# Initial visibility matches the default video mode: the image output
	# starts hidden and the video output starts visible.
	out_image = gr.Image(label="Generated image", visible=False)
	out_video = gr.Video(label="Generated video", autoplay=True, visible=True)
	used_seed = gr.Number(label="Seed used", interactive=False)

	# Each example row supplies values for (mode, prompt, enable_sound).
	gr.Examples(
	examples=[
	[MODE_IMAGE, EXAMPLE_T2I, False],
	[MODE_VIDEO, EXAMPLE_T2V, False],
	[MODE_VIDEO, EXAMPLE_T2VS, True],
	],
	inputs=[mode, prompt, enable_sound],
	label="Examples",
	)

	# Changing the mode updates the four mode-dependent components; the output
	# order must match the tuple returned by `_on_mode_change`.
	mode.change(
	fn=_on_mode_change, inputs=[mode],
	outputs=[enable_sound, num_frames, out_image, out_video],
	)

	# Inputs are listed in the positional order of `generate`'s parameters;
	# outputs match its (image_path, video_path, seed) return tuple.
	run.click(
	fn=generate,
	inputs=[
	mode, prompt, image, resolution, num_frames, steps, guidance,
	enable_sound, negative_prompt, seed, randomize_seed,
	],
	outputs=[out_image, out_video, used_seed],
	)


	if __name__ == "__main__":
	    # Enable the request queue with max_size=10, then start the server with
	    # the Citrus theme.
	    queued_app = demo.queue(max_size=10)
	    queued_app.launch(theme=gr.themes.Citrus())