Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content='{"title":"Cosmos 3","local":"cosmos-3","sections":[{"title":"What’s new in Cosmos 3","local":"whats-new-in-cosmos-3","sections":[],"depth":2},{"title":"Available checkpoints","local":"available-checkpoints","sections":[],"depth":2},{"title":"Text-to-image","local":"text-to-image","sections":[],"depth":2},{"title":"Text-to-video","local":"text-to-video","sections":[],"depth":2},{"title":"Image-to-video","local":"image-to-video","sections":[],"depth":2},{"title":"Text-to-video with sound","local":"text-to-video-with-sound","sections":[],"depth":2},{"title":"Metadata templates","local":"metadata-templates","sections":[],"depth":2},{"title":"Safety checker","local":"safety-checker","sections":[],"depth":2},{"title":"Cosmos3OmniPipeline","local":"diffusers.Cosmos3OmniPipeline","sections":[],"depth":2},{"title":"Cosmos3OmniPipelineOutput","local":"diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput","sections":[],"depth":2}],"depth":1}'> | |
| <link href="/docs/diffusers/pr_12968/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/entry/start.9eac431c.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/scheduler.53228c21.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/singletons.2c6306c6.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/index.e93d0901.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/paths.23504870.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/entry/app.d5430b76.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/preload-helper.a47feefc.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/index.cac5d66a.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/nodes/0.eb16ba39.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/nodes/143.27b2c1ea.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/Docstring.87335afe.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/globals.7f7f1b26.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.c07a61ec.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/CodeBlock.606cbaf4.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/IconCopy.56f02e4d.js"> | |
| <link rel="modulepreload" href="/docs/diffusers/pr_12968/en/_app/immutable/chunks/HfOption.6b51ddef.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content='{"title":"Cosmos 3","local":"cosmos-3","sections":[{"title":"What’s new in Cosmos 3","local":"whats-new-in-cosmos-3","sections":[],"depth":2},{"title":"Available checkpoints","local":"available-checkpoints","sections":[],"depth":2},{"title":"Text-to-image","local":"text-to-image","sections":[],"depth":2},{"title":"Text-to-video","local":"text-to-video","sections":[],"depth":2},{"title":"Image-to-video","local":"image-to-video","sections":[],"depth":2},{"title":"Text-to-video with sound","local":"text-to-video-with-sound","sections":[],"depth":2},{"title":"Metadata templates","local":"metadata-templates","sections":[],"depth":2},{"title":"Safety checker","local":"safety-checker","sections":[],"depth":2},{"title":"Cosmos3OmniPipeline","local":"diffusers.Cosmos3OmniPipeline","sections":[],"depth":2},{"title":"Cosmos3OmniPipelineOutput","local":"diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput","sections":[],"depth":2}],"depth":1}'><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="cosmos-3" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cosmos-3"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 
1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Cosmos 3</span></h1> <p data-svelte-h="svelte-1bfg7u9">NVIDIA Cosmos 3 is a unified world foundation model (WFM) for Physical AI — a single omni-model that combines world generation, physical reasoning, and action generation. It replaces the separate Predict, Reason, and Transfer models from earlier Cosmos releases: whether you’re building for robotics, autonomous vehicles, or smart spaces, Cosmos 3 gives you one foundation to simulate and understand the physical world.</p> <p data-svelte-h="svelte-16acpvv">What’s shipping with this release:</p> <ul data-svelte-h="svelte-l7xz5s"><li>Models on the Hugging Face Hub with model cards and licensing</li> <li>Cosmos 3 Diffusers integration for generation pipelines (this page)</li> <li>Post-training scripts for fine-tuning Cosmos 3 on your own data</li> <li>Open synthetic data generation (SDG) datasets for Physical AI</li></ul> <h2 class="relative group"><a id="whats-new-in-cosmos-3" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#whats-new-in-cosmos-3"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 
11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What’s new in Cosmos 3</span></h2> <p data-svelte-h="svelte-t3k2lx">The biggest change from previous Cosmos releases is that Cosmos 3 is an <em>omni-model</em>, built on a Mixture-of-Transformers (MoT) architecture. Previously, developers worked with separate models for world generation (Predict), controlled generation (Transfer), scene understanding (Reason), and action-policy generation. Cosmos 3 unifies all of these in one model that reasons and generates across modalities in a single forward pass.</p> <p data-svelte-h="svelte-1qyj0gu">From one model you can:</p> <ul data-svelte-h="svelte-v2tptm"><li>Generate physically plausible video worlds from text, images, or action inputs (image-to-video, text-to-video, action-conditioned video generation).</li> <li>Reason about physical properties like motion, causality, and spatial relationships.</li> <li>Predict future video and action sequences from the current state.</li> <li>Transfer scenes across viewpoints and conditions with structural control <em>(coming soon)</em>.</li></ul> <p data-svelte-h="svelte-1xm4gj6">Under the hood, a single <code>Cosmos3OmniTransformer</code> runs a Qwen-style language model in parallel with a diffusion generation pathway: text tokens flow through a causal “understanding” stream while video and sound latents flow through a bi-directionally-attended “generation” stream, joined by a 3D multimodal RoPE. 
See the <a href="https://huggingface.co/papers/2501.03575" rel="nofollow">Cosmos World Foundation Model Platform paper</a> for the architectural background.</p> <h2 class="relative group"><a id="available-checkpoints" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#available-checkpoints"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Available checkpoints</span></h2> <p data-svelte-h="svelte-1jbbrtc">Two checkpoints are released on the Hub — <a href="https://huggingface.co/nvidia/Cosmos3-Nano" rel="nofollow"><code>nvidia/Cosmos3-Nano</code></a> (smaller, faster) and <a href="https://huggingface.co/nvidia/Cosmos3-Super" rel="nofollow"><code>nvidia/Cosmos3-Super</code></a> (larger, higher quality). 
The same pipeline class supports text-to-image, text-to-video, image-to-video, and (with a sound-capable checkpoint) text+image-to-video-with-sound — pick a repo and use the per-model tab in each workflow below.</p> <blockquote class="tip" data-svelte-h="svelte-r1jcqf"><p>Make sure to check out the Schedulers <a href="../../using-diffusers/schedulers">guide</a> to learn how to explore the tradeoff between scheduler speed and quality, and see the <a href="../../using-diffusers/loading#reuse-a-pipeline">reuse components across pipelines</a> section to learn how to efficiently load the same components into multiple pipelines.</p></blockquote> <h2 class="relative group"><a id="text-to-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-to-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text-to-image</span></h2> <p data-svelte-h="svelte-1tzoljf">Single-frame generation. 
The model is conditioned only on the text prompt; pass <code>num_frames=1</code>.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Nano </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Super </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> Cosmos3OmniPipeline | |
| pipe = Cosmos3OmniPipeline.from_pretrained( | |
| <span class="hljs-string">"nvidia/Cosmos3-Nano"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| prompt = ( | |
| <span class="hljs-string">"A medium shot of a modern robotics research laboratory with white walls and a gray floor. "</span> | |
| <span class="hljs-string">"A robotic arm with a metallic finish is mounted on a clean white workbench, its gripper positioned "</span> | |
| <span class="hljs-string">"above a row of small colored objects. A laptop and neatly arranged tools sit beside the robot. "</span> | |
| <span class="hljs-string">"A large monitor on the wall behind displays a software interface. The scene is brightly lit by "</span> | |
| <span class="hljs-string">"overhead fluorescent lights."</span> | |
| ) | |
| result = pipe(prompt=prompt, num_frames=<span class="hljs-number">1</span>, height=<span class="hljs-number">720</span>, width=<span class="hljs-number">1280</span>) | |
| result.video[<span class="hljs-number">0</span>].save(<span class="hljs-string">"cosmos3_t2i.jpg"</span>, <span class="hljs-built_in">format</span>=<span class="hljs-string">"JPEG"</span>, quality=<span class="hljs-number">85</span>)<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="text-to-video" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-to-video"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text-to-video</span></h2> <p data-svelte-h="svelte-85sm2e">Multi-frame generation conditioned on text alone. 
Pick <code>num_frames</code> based on the target duration — the default <code>num_frames=189</code> produces ≈ 7.9 s at 24 FPS.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Nano </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Super </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> Cosmos3OmniPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video | |
| pipe = Cosmos3OmniPipeline.from_pretrained( | |
| <span class="hljs-string">"nvidia/Cosmos3-Nano"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| prompt = ( | |
| <span class="hljs-string">"The video opens with a view of a well-lit indoor space featuring a wooden display case with "</span> | |
| <span class="hljs-string">"compartments filled with various fruits, including bananas, apples, pears, oranges, and carambolas. "</span> | |
| <span class="hljs-string">"The bananas are neatly arranged in the middle compartment, while apples are in the left and a mix "</span> | |
| <span class="hljs-string">"of pears, oranges, and carambolas are in the right. Two robotic arms with grippers are positioned "</span> | |
| <span class="hljs-string">"at the bottom of the frame, with the one on the left remaining stationary, partially obscuring the "</span> | |
| <span class="hljs-string">"apples. The robotic arm on the right begins its action, extending towards the right side of the "</span> | |
| <span class="hljs-string">"display case. It carefully picks up a pear from the fruit section, placing it into a plastic bag "</span> | |
| <span class="hljs-string">"in the shopping cart nearby, which has red handles. After securing the pear, the arm retracts back "</span> | |
| <span class="hljs-string">"to its original position. The process repeats as the robotic arm picks up an orange and places it "</span> | |
| <span class="hljs-string">"in the bag, followed by a carambola. The final frame captures the robotic arm returning to its "</span> | |
| <span class="hljs-string">"initial position, leaving the display case and surrounding area unchanged. The video showcases a "</span> | |
| <span class="hljs-string">"seamless and efficient automated fruit-picking process, highlighting the precision and efficiency "</span> | |
| <span class="hljs-string">"of modern robotics in a retail setting."</span> | |
| ) | |
| <span class="hljs-comment"># Recommended quality-control negative prompt for text-to-video.</span> | |
| negative_prompt = ( | |
| <span class="hljs-string">"The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "</span> | |
| <span class="hljs-string">"over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "</span> | |
| <span class="hljs-string">"underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky "</span> | |
| <span class="hljs-string">"movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, "</span> | |
| <span class="hljs-string">"fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. "</span> | |
| <span class="hljs-string">"Overall, the video is of poor quality."</span> | |
| ) | |
| result = pipe( | |
| prompt=prompt, | |
| negative_prompt=negative_prompt, | |
| num_frames=<span class="hljs-number">189</span>, | |
| height=<span class="hljs-number">720</span>, | |
| width=<span class="hljs-number">1280</span>, | |
| fps=<span class="hljs-number">24.0</span>, | |
| ) | |
| <span class="hljs-comment"># macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).</span> | |
| export_to_video(result.video, <span class="hljs-string">"cosmos3_t2v.mp4"</span>, fps=<span class="hljs-number">24</span>, macro_block_size=<span class="hljs-number">1</span>)<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="image-to-video" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-to-video"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image-to-video</span></h2> <p data-svelte-h="svelte-1gj03yu">Pass a conditioning image via <code>image=</code>. 
The pipeline anchors frame 0 to the supplied image and denoises the rest.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Nano </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Super </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> Cosmos3OmniPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> export_to_video, load_image | |
| pipe = Cosmos3OmniPipeline.from_pretrained( | |
| <span class="hljs-string">"nvidia/Cosmos3-Nano"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| image = load_image( | |
| <span class="hljs-string">"https://github.com/nvidia-cosmos/cosmos-dependencies/releases/download/assets/robot_153.jpg"</span> | |
| ) | |
| prompt = ( | |
| <span class="hljs-string">"The video opens with a view of a testing environment, characterized by a large wooden table at the "</span> | |
| <span class="hljs-string">"center. On this table, two robot arms are positioned at opposite ends, with the left arm closer to "</span> | |
| <span class="hljs-string">"the camera and the right arm further away. Between the hands lies a dark wooden shelf with a red "</span> | |
| <span class="hljs-string">"spherical object on its top rack, likely serving as a platform or obstacle. In the background, "</span> | |
| <span class="hljs-string">"various pieces of equipment, including a tripod, a chair, are visible. A person wearing a blue "</span> | |
| <span class="hljs-string">"jacket and black pants stands near the center of the room, observing the experiment, with a static "</span> | |
| <span class="hljs-string">"hand position throughout. The floor is tiled with a patterned design, and additional items like a "</span> | |
| <span class="hljs-string">"small robot figure and some cables can be seen scattered around the space. As the video progresses, "</span> | |
| <span class="hljs-string">"the right robotic hand extends outward, moving from its initial position towards the red spherical "</span> | |
| <span class="hljs-string">"object on the shelf. The hand then picks up the object and places it on the lowest rack of the "</span> | |
| <span class="hljs-string">"shelf, completing a smooth, deliberate manipulation. The left robotic hand remains stationary "</span> | |
| <span class="hljs-string">"throughout the sequence. No new objects appear in the video; all existing elements maintain their "</span> | |
| <span class="hljs-string">"positions except for the movement of the right robotic hand. The scene concludes with the right "</span> | |
| <span class="hljs-string">"robotic hand returning to its initial position, while the left hand continues to rest on the table. "</span> | |
| <span class="hljs-string">"The overall environment remains unchanged, with the focus remaining on the interaction between the "</span> | |
| <span class="hljs-string">"robotic hands and the wooden block, highlighting precise control during the demonstration."</span> | |
| ) | |
| <span class="hljs-comment"># Recommended quality-control negative prompt for image-to-video.</span> | |
| negative_prompt = ( | |
| <span class="hljs-string">"The video captures a series of frames showing macroblocking artifacts, chromatic aberration, "</span> | |
| <span class="hljs-string">"high-frequency noise, and rolling shutter distortion. It includes static with no motion, motion blur, "</span> | |
| <span class="hljs-string">"over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "</span> | |
| <span class="hljs-string">"underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky "</span> | |
| <span class="hljs-string">"movements, low frame rate, bit-depth compression artifacts, color banding, unnatural transitions, "</span> | |
| <span class="hljs-string">"outdated special effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual "</span> | |
| <span class="hljs-string">"noise, and flickering. Avoid moiré patterns, edge halos, and temporal aliasing. Furthermore, the content "</span> | |
| <span class="hljs-string">"defies common sense, generating illogical scenarios, nonsensical entities, absurd character behaviors, "</span> | |
| <span class="hljs-string">"and conceptual paradoxes that violate basic human reasoning and everyday reality. The video looks like a "</span> | |
| <span class="hljs-string">"surreal or glitchy hallucination. Overall, the video is of poor quality."</span> | |
| ) | |
| result = pipe( | |
| prompt=prompt, | |
| negative_prompt=negative_prompt, | |
| image=image, | |
| num_frames=<span class="hljs-number">189</span>, | |
| height=<span class="hljs-number">720</span>, | |
| width=<span class="hljs-number">1280</span>, | |
| fps=<span class="hljs-number">24.0</span>, | |
| ) | |
| <span class="hljs-comment"># macro_block_size=1 allows arbitrary frame sizes (Cosmos3 outputs are not always divisible by 16).</span> | |
| export_to_video(result.video, <span class="hljs-string">"cosmos3_i2v.mp4"</span>, fps=<span class="hljs-number">24</span>, macro_block_size=<span class="hljs-number">1</span>)<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="text-to-video-with-sound" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-to-video-with-sound"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text-to-video with sound</span></h2> <p data-svelte-h="svelte-33u9t5">When the checkpoint carries a <code>sound_tokenizer</code>, pass <code>enable_sound=True</code> to jointly generate a synchronized audio track. 
The waveform is returned alongside the video and can be muxed into the MP4 with <a href="/docs/diffusers/pr_12968/en/api/utilities#diffusers.utils.encode_video">encode_video()</a>.</p> <p data-svelte-h="svelte-4uszwp">This is the same call as the text-to-video example above with <code>enable_sound=True</code> added:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Nano </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Super </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python 
"><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> Cosmos3OmniPipeline | |
| <span class="hljs-keyword">from</span> diffusers.utils <span class="hljs-keyword">import</span> encode_video | |
| pipe = Cosmos3OmniPipeline.from_pretrained( | |
| <span class="hljs-string">"nvidia/Cosmos3-Nano"</span>, torch_dtype=torch.bfloat16, device_map=<span class="hljs-string">"cuda"</span> | |
| ) | |
| prompt = ( | |
| <span class="hljs-string">"The video opens with a view of a well-lit indoor space featuring a wooden display case with "</span> | |
| <span class="hljs-string">"compartments filled with various fruits, including bananas, apples, pears, oranges, and carambolas. "</span> | |
| <span class="hljs-string">"The bananas are neatly arranged in the middle compartment, while apples are in the left and a mix "</span> | |
| <span class="hljs-string">"of pears, oranges, and carambolas are in the right. Two robotic arms with grippers are positioned "</span> | |
| <span class="hljs-string">"at the bottom of the frame, with the one on the left remaining stationary, partially obscuring the "</span> | |
| <span class="hljs-string">"apples. The robotic arm on the right begins its action, extending towards the right side of the "</span> | |
| <span class="hljs-string">"display case. It carefully picks up a pear from the fruit section, placing it into a plastic bag "</span> | |
| <span class="hljs-string">"in the shopping cart nearby, which has red handles. After securing the pear, the arm retracts back "</span> | |
| <span class="hljs-string">"to its original position. The process repeats as the robotic arm picks up an orange and places it "</span> | |
| <span class="hljs-string">"in the bag, followed by a carambola. The final frame captures the robotic arm returning to its "</span> | |
| <span class="hljs-string">"initial position, leaving the display case and surrounding area unchanged. The video showcases a "</span> | |
| <span class="hljs-string">"seamless and efficient automated fruit-picking process, highlighting the precision and efficiency "</span> | |
| <span class="hljs-string">"of modern robotics in a retail setting. Audio description: the soft whir of servo motors, gentle "</span> | |
| <span class="hljs-string">"thuds as fruits land in the plastic bag, the rustle of the bag settling in the shopping cart, and "</span> | |
| <span class="hljs-string">"a faint refrigeration hum in the background."</span> | |
| ) | |
| <span class="hljs-comment"># Recommended quality-control negative prompt (same as text-to-video).</span> | |
| negative_prompt = ( | |
| <span class="hljs-string">"The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "</span> | |
| <span class="hljs-string">"over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "</span> | |
| <span class="hljs-string">"underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, jerky "</span> | |
| <span class="hljs-string">"movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special effects, "</span> | |
| <span class="hljs-string">"fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and flickering. "</span> | |
| <span class="hljs-string">"Overall, the video is of poor quality."</span> | |
| ) | |
| result = pipe( | |
| prompt=prompt, | |
| negative_prompt=negative_prompt, | |
| num_frames=<span class="hljs-number">189</span>, | |
| height=<span class="hljs-number">720</span>, | |
| width=<span class="hljs-number">1280</span>, | |
| fps=<span class="hljs-number">24.0</span>, | |
| enable_sound=<span class="hljs-literal">True</span>, | |
| ) | |
| encode_video( | |
| result.video, | |
| fps=<span class="hljs-number">24</span>, | |
| audio=result.sound, | |
| audio_sample_rate=pipe.sound_tokenizer.config.sampling_rate, | |
| output_path=<span class="hljs-string">"cosmos3_with_sound.mp4"</span>, | |
| )<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="metadata-templates" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#metadata-templates"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Metadata templates</span></h2> <p data-svelte-h="svelte-13gle33"><code>tokenize_prompt</code> appends short metadata sentences inside the user message so the LLM sees the conditioning the model was trained with. The positive prompt gets sentences like <em>“The video is 7.9 seconds long and is of 24 FPS.”</em> and <em>“This video is of 720x1280 resolution.”</em>; the negative prompt gets the inverse (<em>“… is not …”</em>).</p> <p data-svelte-h="svelte-2xtold">Both are on by default. 
Each template can be switched off independently through <code>__call__</code>; doing so drops its sentence from both the positive and the negative prompt:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->result = pipe( | |
| prompt=prompt, | |
| negative_prompt=negative_prompt, | |
| num_frames=<span class="hljs-number">189</span>, | |
| height=<span class="hljs-number">720</span>, | |
| width=<span class="hljs-number">1280</span>, | |
| fps=<span class="hljs-number">24.0</span>, | |
| add_duration_template=<span class="hljs-literal">False</span>, <span class="hljs-comment"># skip the duration sentence on both prompts</span> | |
| add_resolution_template=<span class="hljs-literal">False</span>, <span class="hljs-comment"># skip the resolution sentence on both prompts</span> | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1326i2u"><code>add_duration_template</code> has no effect when <code>num_frames == 1</code> (image mode); only the resolution sentence is appended in that case.</p> <h2 class="relative group"><a id="safety-checker" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#safety-checker"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Safety checker</span></h2> <p data-svelte-h="svelte-2cm2bd">Cosmos3 wires up the <a href="https://pypi.org/project/cosmos-guardrail/" rel="nofollow"><code>cosmos_guardrail</code></a> <code>CosmosSafetyChecker</code> and runs it <strong>by default</strong>. The text guardrail rejects unsafe prompts before generation (<code>ValueError</code>); the video guardrail runs on the decoded frames and either pixelates detected faces or rejects the output. 
Audio output is not guardrailed.</p> <p data-svelte-h="svelte-10ubvxy">Install the optional dependency to enable the default checker:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=" "><!-- HTML_TAG_START -->pip <span class="hljs-keyword">install</span> cosmos_guardrail<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fkq18s">The checker is mandatory under the NVIDIA Open Model License Agreement. 
The two flags below exist for tests and development workflows where the guardrail would be redundant (e.g., the input has already been cleared, or you are intentionally exercising the pipeline on edge inputs).</p> <p data-svelte-h="svelte-jq9awm"><strong>Disable at construction</strong> (no checker is instantiated, so no guardrail models are downloaded or loaded into memory):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> diffusers <span class="hljs-keyword">import</span> Cosmos3OmniPipeline | |
| pipe = Cosmos3OmniPipeline.from_pretrained( | |
| <span class="hljs-string">"nvidia/Cosmos3-Nano"</span>, | |
| torch_dtype=torch.bfloat16, | |
| device_map=<span class="hljs-string">"cuda"</span>, | |
| enable_safety_checker=<span class="hljs-literal">False</span>, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ze8d61"><strong>Disable for a single call</strong> (checker stays loaded — useful for one-off bypass while keeping the default on for subsequent calls):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->result = pipe( | |
| prompt=prompt, | |
| num_frames=<span class="hljs-number">189</span>, | |
| height=<span class="hljs-number">720</span>, | |
| width=<span class="hljs-number">1280</span>, | |
| fps=<span class="hljs-number">24.0</span>, | |
| enable_safety_check=<span class="hljs-literal">False</span>, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10arwsi">To supply a custom checker (e.g., a no-op subclass for fast tests), pass it as <code>safety_checker=</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START -->pipe = Cosmos3OmniPipeline.from_pretrained( | |
| <span class="hljs-string">"nvidia/Cosmos3-Nano"</span>, | |
| torch_dtype=torch.bfloat16, | |
| device_map=<span class="hljs-string">"cuda"</span>, | |
| safety_checker=MyCustomSafetyChecker(), | |
| )<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="diffusers.Cosmos3OmniPipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#diffusers.Cosmos3OmniPipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Cosmos3OmniPipeline</span></h2> <div class="docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"> <div><span class="group flex space-x-1.5 items-center text-gray-800 bg-gradient-to-r rounded-tr-lg -mt-4 -ml-4 pt-3 px-2.5" id="diffusers.Cosmos3OmniPipeline"><!-- HTML_TAG_START --><h3 class="!m-0"><span class="flex-1 break-all md:text-lg bg-gradient-to-r px-2.5 py-1.5 rounded-xl from-indigo-50/70 to-white dark:from-gray-900 dark:to-gray-950 dark:text-indigo-300 text-indigo-700"><svg class="mr-1.5 text-indigo-500 inline-block -mt-0.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width=".8em" height=".8em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-quaternary" d="M20.23 7.24L12 12L3.77 7.24a1.98 1.98 0 0 1 .7-.71L11 2.76c.62-.35 1.38-.35 2 0l6.53 
3.77c.29.173.531.418.7.71z" opacity=".25" fill="currentColor"></path><path class="uim-tertiary" d="M12 12v9.5a2.09 2.09 0 0 1-.91-.21L4.5 17.48a2.003 2.003 0 0 1-1-1.73v-7.5a2.06 2.06 0 0 1 .27-1.01L12 12z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M20.5 8.25v7.5a2.003 2.003 0 0 1-1 1.73l-6.62 3.82c-.275.13-.576.198-.88.2V12l8.23-4.76c.175.308.268.656.27 1.01z" fill="currentColor"></path></svg><span class="font-light">class</span> <span class="font-medium">diffusers.</span><span class="font-semibold">Cosmos3OmniPipeline</span></span></h3><!-- HTML_TAG_END --> <a id="diffusers.Cosmos3OmniPipeline" class="header-link invisible with-hover:group-hover:visible pr-2" href="#diffusers.Cosmos3OmniPipeline"><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></a> <a class="!ml-auto !text-gray-400 !no-underline text-sm flex items-center" href="https://github.com/huggingface/diffusers/blob/vr_12968/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py#L165" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span class="hidden md:block mx-0.5 hover:!underline" data-svelte-h="svelte-122apf4">source</span> <span data-svelte-h="svelte-x0xyl0">></span></a></span> <p class="font-mono text-xs md:text-sm !leading-relaxed !my-6"><span 
data-svelte-h="svelte-8mvn6a">(</span> <span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">transformer<span class="opacity-60">: Cosmos3OmniTransformer</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">text_tokenizer<span class="opacity-60">: AutoTokenizer</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">vae<span class="opacity-60">: AutoencoderKLWan</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">scheduler<span class="opacity-60">: UniPCMultistepScheduler</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">sound_tokenizer<span class="opacity-60">: diffusers.models.autoencoders.autoencoder_cosmos3_audio.Cosmos3AVAEAudioTokenizer | None = None</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">safety_checker<span class="opacity-60">: diffusers.pipelines.cosmos.pipeline_cosmos3_omni.CosmosSafetyChecker | None = None</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">enable_safety_checker<span class="opacity-60">: bool = True</span></span> </span> <span data-svelte-h="svelte-1jq0pl7">)</span> </p> <div class="!mb-10 relative docstring-details "> </div></div> <div class="docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"> <div><span class="group flex space-x-1.5 items-center text-gray-800 bg-gradient-to-r rounded-tr-lg -mt-4 -ml-4 pt-3 px-2.5" 
id="diffusers.Cosmos3OmniPipeline.decode_sound"><!-- HTML_TAG_START --><h4 class="!m-0"><span class="flex-1 rounded-xl py-0.5 break-all bg-gradient-to-r from-blue-50/60 to-white dark:from-gray-900 dark:to-gray-950 text-blue-700 dark:text-blue-300 font-medium px-2"><svg width="1em" height="1em" viewBox="0 0 32 33" class="mr-1 inline-block -mt-0.5" xmlns="http://www.w3.org/2000/svg"><path d="M5.80566 18.3545C4.90766 17.4565 4.90766 16.0005 5.80566 15.1025L14.3768 6.53142C15.2748 5.63342 16.7307 5.63342 17.6287 6.53142L26.1999 15.1025C27.0979 16.0005 27.0979 17.4565 26.1999 18.3545L17.6287 26.9256C16.7307 27.8236 15.2748 27.8236 14.3768 26.9256L5.80566 18.3545Z" fill="currentColor" fill-opacity="0.25"/><path fill-rule="evenodd" clip-rule="evenodd" d="M16.4801 13.9619C16.4801 12.9761 16.7467 12.5436 16.9443 12.3296C17.1764 12.078 17.5731 11.8517 18.2275 11.707C18.8821 11.5623 19.638 11.5342 20.4038 11.5582C20.7804 11.57 21.1341 11.5932 21.4719 11.6156L21.5263 11.6193C21.8195 11.6389 22.1626 11.6618 22.4429 11.6618V7.40825C22.3209 7.40825 22.1219 7.39596 21.7544 7.37149C21.4202 7.34925 20.9976 7.32115 20.5371 7.30672C19.6286 7.27824 18.4672 7.29779 17.3093 7.55377C16.1512 7.8098 14.8404 8.33724 13.8181 9.4452C12.7612 10.5907 12.2266 12.1236 12.2266 13.9619V15.0127H10.6836V19.2662H12.2266V26.6332H16.4801V19.2662H20.3394V15.0127H16.4801V13.9619Z" fill="currentColor"/></svg>decode_sound</span></h4><!-- HTML_TAG_END --> <a id="diffusers.Cosmos3OmniPipeline.decode_sound" class="header-link invisible with-hover:group-hover:visible pr-2" href="#diffusers.Cosmos3OmniPipeline.decode_sound"><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 
1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></a> <a class="!ml-auto !text-gray-400 !no-underline text-sm flex items-center" href="https://github.com/huggingface/diffusers/blob/vr_12968/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py#L259" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span class="hidden md:block mx-0.5 hover:!underline" data-svelte-h="svelte-122apf4">source</span> <span data-svelte-h="svelte-x0xyl0">></span></a></span> <p class="font-mono text-xs md:text-sm !leading-relaxed !my-6"><span data-svelte-h="svelte-8mvn6a">(</span> <span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">latent<span class="opacity-60">: Tensor</span></span> </span> <span data-svelte-h="svelte-1jq0pl7">)</span> </p> <div class="!mb-10 relative docstring-details "> </div></div> <p data-svelte-h="svelte-1bfskfo">Decode a sound latent <code>[C, T]</code> to a waveform <code>[audio_ch, N]</code>.</p> <p data-svelte-h="svelte-13hldjp">Adds/removes the batch dimension expected by the sound tokenizer decoder.</p></div> <div class="docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"> <div><span class="group flex space-x-1.5 items-center text-gray-800 bg-gradient-to-r rounded-tr-lg -mt-4 -ml-4 pt-3 px-2.5" id="diffusers.Cosmos3OmniPipeline.prepare_latents"><!-- HTML_TAG_START --><h4 class="!m-0"><span class="flex-1 rounded-xl py-0.5 break-all bg-gradient-to-r from-blue-50/60 to-white dark:from-gray-900 dark:to-gray-950 text-blue-700 dark:text-blue-300 font-medium px-2"><svg width="1em" height="1em" viewBox="0 0 32 33" class="mr-1 
inline-block -mt-0.5" xmlns="http://www.w3.org/2000/svg"><path d="M5.80566 18.3545C4.90766 17.4565 4.90766 16.0005 5.80566 15.1025L14.3768 6.53142C15.2748 5.63342 16.7307 5.63342 17.6287 6.53142L26.1999 15.1025C27.0979 16.0005 27.0979 17.4565 26.1999 18.3545L17.6287 26.9256C16.7307 27.8236 15.2748 27.8236 14.3768 26.9256L5.80566 18.3545Z" fill="currentColor" fill-opacity="0.25"/><path fill-rule="evenodd" clip-rule="evenodd" d="M16.4801 13.9619C16.4801 12.9761 16.7467 12.5436 16.9443 12.3296C17.1764 12.078 17.5731 11.8517 18.2275 11.707C18.8821 11.5623 19.638 11.5342 20.4038 11.5582C20.7804 11.57 21.1341 11.5932 21.4719 11.6156L21.5263 11.6193C21.8195 11.6389 22.1626 11.6618 22.4429 11.6618V7.40825C22.3209 7.40825 22.1219 7.39596 21.7544 7.37149C21.4202 7.34925 20.9976 7.32115 20.5371 7.30672C19.6286 7.27824 18.4672 7.29779 17.3093 7.55377C16.1512 7.8098 14.8404 8.33724 13.8181 9.4452C12.7612 10.5907 12.2266 12.1236 12.2266 13.9619V15.0127H10.6836V19.2662H12.2266V26.6332H16.4801V19.2662H20.3394V15.0127H16.4801V13.9619Z" fill="currentColor"/></svg>prepare_latents</span></h4><!-- HTML_TAG_END --> <a id="diffusers.Cosmos3OmniPipeline.prepare_latents" class="header-link invisible with-hover:group-hover:visible pr-2" href="#diffusers.Cosmos3OmniPipeline.prepare_latents"><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 
0-79.196z" fill="currentColor"></path></svg></a> <a class="!ml-auto !text-gray-400 !no-underline text-sm flex items-center" href="https://github.com/huggingface/diffusers/blob/vr_12968/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py#L399" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span class="hidden md:block mx-0.5 hover:!underline" data-svelte-h="svelte-122apf4">source</span> <span data-svelte-h="svelte-x0xyl0">></span></a></span> <p class="font-mono text-xs md:text-sm !leading-relaxed !my-6"><span data-svelte-h="svelte-8mvn6a">(</span> <span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">image<span class="opacity-60">: torch.Tensor | None = None</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">num_frames<span class="opacity-60">: int = 189</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">height<span class="opacity-60">: int = 720</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">width<span class="opacity-60">: int = 1280</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">fps<span class="opacity-60">: float = 24.0</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">latents<span class="opacity-60">: torch.Tensor | None = None</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">sound_latents<span class="opacity-60">: torch.Tensor | None = None</span></span> 
</span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">generator<span class="opacity-60">: torch._C.Generator | None = None</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">device<span class="opacity-60">: str = 'cuda'</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">dtype<span class="opacity-60">: dtype = torch.bfloat16</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">enable_sound<span class="opacity-60">: bool = False</span></span> </span> <span data-svelte-h="svelte-1jq0pl7">)</span> </p> <div class="!mb-10 relative docstring-details "> </div></div> <p data-svelte-h="svelte-1r3tkns">Build conditioning + initial noise for a single sample.</p></div> <div class="docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"> <div><span class="group flex space-x-1.5 items-center text-gray-800 bg-gradient-to-r rounded-tr-lg -mt-4 -ml-4 pt-3 px-2.5" id="diffusers.Cosmos3OmniPipeline.tokenize_prompt"><!-- HTML_TAG_START --><h4 class="!m-0"><span class="flex-1 rounded-xl py-0.5 break-all bg-gradient-to-r from-blue-50/60 to-white dark:from-gray-900 dark:to-gray-950 text-blue-700 dark:text-blue-300 font-medium px-2"><svg width="1em" height="1em" viewBox="0 0 32 33" class="mr-1 inline-block -mt-0.5" xmlns="http://www.w3.org/2000/svg"><path d="M5.80566 18.3545C4.90766 17.4565 4.90766 16.0005 5.80566 15.1025L14.3768 6.53142C15.2748 5.63342 16.7307 5.63342 17.6287 6.53142L26.1999 15.1025C27.0979 16.0005 27.0979 17.4565 26.1999 18.3545L17.6287 26.9256C16.7307 27.8236 15.2748 27.8236 14.3768 26.9256L5.80566 18.3545Z" fill="currentColor" fill-opacity="0.25"/><path 
fill-rule="evenodd" clip-rule="evenodd" d="M16.4801 13.9619C16.4801 12.9761 16.7467 12.5436 16.9443 12.3296C17.1764 12.078 17.5731 11.8517 18.2275 11.707C18.8821 11.5623 19.638 11.5342 20.4038 11.5582C20.7804 11.57 21.1341 11.5932 21.4719 11.6156L21.5263 11.6193C21.8195 11.6389 22.1626 11.6618 22.4429 11.6618V7.40825C22.3209 7.40825 22.1219 7.39596 21.7544 7.37149C21.4202 7.34925 20.9976 7.32115 20.5371 7.30672C19.6286 7.27824 18.4672 7.29779 17.3093 7.55377C16.1512 7.8098 14.8404 8.33724 13.8181 9.4452C12.7612 10.5907 12.2266 12.1236 12.2266 13.9619V15.0127H10.6836V19.2662H12.2266V26.6332H16.4801V19.2662H20.3394V15.0127H16.4801V13.9619Z" fill="currentColor"/></svg>tokenize_prompt</span></h4><!-- HTML_TAG_END --> <a id="diffusers.Cosmos3OmniPipeline.tokenize_prompt" class="header-link invisible with-hover:group-hover:visible pr-2" href="#diffusers.Cosmos3OmniPipeline.tokenize_prompt"><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></a> <a class="!ml-auto !text-gray-400 !no-underline text-sm flex items-center" href="https://github.com/huggingface/diffusers/blob/vr_12968/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py#L530" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span class="hidden md:block mx-0.5 hover:!underline" 
data-svelte-h="svelte-122apf4">source</span> <span data-svelte-h="svelte-x0xyl0">></span></a></span> <p class="font-mono text-xs md:text-sm !leading-relaxed !my-6"><span data-svelte-h="svelte-8mvn6a">(</span> <span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">prompt<span class="opacity-60">: str</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">negative_prompt<span class="opacity-60">: str | None = None</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">num_frames<span class="opacity-60">: int = 189</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">height<span class="opacity-60">: int = 720</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">width<span class="opacity-60">: int = 1280</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">fps<span class="opacity-60">: float = 24.0</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">use_system_prompt<span class="opacity-60">: bool = True</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">add_resolution_template<span class="opacity-60">: bool = True</span></span> </span><span class="comma cursor-default"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">add_duration_template<span class="opacity-60">: bool 
= True</span></span> </span> <span data-svelte-h="svelte-1jq0pl7">)</span> </p> <div class="!mb-10 relative docstring-details "> </div></div> <p data-svelte-h="svelte-1g5f9fd">Apply prompt-augmentation templates and tokenize cond/uncond prompts via the Qwen2 chat template.</p> <p data-svelte-h="svelte-1nulle2">This pipeline does not run a separate text encoder: the joint Cosmos3 transformer consumes raw Qwen2 token IDs
alongside vision (and optionally sound) tokens.</p> <p data-svelte-h="svelte-1p5xsfp">When <code>negative_prompt</code> is <code>None</code>, an empty string is used; the Cosmos3 docs page documents recommended
quality-control negative prompts to pass explicitly for text2video / image2video. The duration and resolution
templates are appended to the prompt, and inverse templates are appended to the negative prompt, when enabled.</p></div></div> <ul data-svelte-h="svelte-1p6h59i"><li>all</li> <li><strong>call</strong></li></ul> <h2 class="relative group"><a id="diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Cosmos3OmniPipelineOutput</span></h2> <div class="docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8"> <div><span class="group flex space-x-1.5 items-center text-gray-800 bg-gradient-to-r rounded-tr-lg -mt-4 -ml-4 pt-3 px-2.5" id="diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput"><!-- HTML_TAG_START --><h3 class="!m-0"><span class="flex-1 break-all md:text-lg bg-gradient-to-r px-2.5 py-1.5 rounded-xl from-indigo-50/70 to-white dark:from-gray-900 dark:to-gray-950 dark:text-indigo-300 text-indigo-700"><svg class="mr-1.5 text-indigo-500 inline-block -mt-0.5" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width=".8em" height=".8em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 24 24"><path class="uim-quaternary" d="M20.23 7.24L12 12L3.77 7.24a1.98 1.98 0 0 1 .7-.71L11 2.76c.62-.35 1.38-.35 2 0l6.53 3.77c.29.173.531.418.7.71z" opacity=".25" fill="currentColor"></path><path class="uim-tertiary" d="M12 12v9.5a2.09 2.09 0 0 1-.91-.21L4.5 17.48a2.003 2.003 0 0 1-1-1.73v-7.5a2.06 2.06 0 0 1 .27-1.01L12 12z" opacity=".5" fill="currentColor"></path><path class="uim-primary" d="M20.5 8.25v7.5a2.003 2.003 0 0 1-1 1.73l-6.62 3.82c-.275.13-.576.198-.88.2V12l8.23-4.76c.175.308.268.656.27 1.01z" fill="currentColor"></path></svg><span class="font-light">class</span> <span class="font-medium">diffusers.pipelines.cosmos.pipeline_cosmos3_omni.</span><span class="font-semibold">Cosmos3OmniPipelineOutput</span></span></h3><!-- HTML_TAG_END --> <a id="diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput" class="header-link invisible with-hover:group-hover:visible pr-2" href="#diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput"><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></a> <a class="!ml-auto !text-gray-400 !no-underline 
text-sm flex items-center" href="https://github.com/huggingface/diffusers/blob/vr_12968/src/diffusers/pipelines/cosmos/pipeline_cosmos3_omni.py#L135" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span class="hidden md:block mx-0.5 hover:!underline" data-svelte-h="svelte-122apf4">source</span> <span data-svelte-h="svelte-x0xyl0">></span></a></span> <p class="font-mono text-xs md:text-sm !leading-relaxed !my-6"><span data-svelte-h="svelte-8mvn6a">(</span> <span class="comma cursor-pointer"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">video<span class="opacity-60">: typing.Any</span></span> </span><span class="comma cursor-pointer"><span class="rounded hover:bg-black hover:text-white dark:hover:bg-white dark:hover:text-black">sound<span class="opacity-60">: torch.Tensor | None = None</span></span> </span> <span data-svelte-h="svelte-1jq0pl7">)</span> </p> <div class="!mb-10 relative docstring-details "> <p class="flex items-center font-semibold !mt-2 !mb-2 text-gray-800" data-svelte-h="svelte-lt6pb6">Parameters <span class="flex-auto border-t-2 border-gray-100 dark:border-gray-700 ml-3"></span></p> <ul class="px-2"><li class="text-base !pl-4 my-3 rounded "><span class="group flex space-x-1.5 items-start"><a id="diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput.video" class="header-link block pr-0.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput.video"><span><svg class="text-smd" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 
84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span><!-- HTML_TAG_START --><strong>video</strong> — The generated video. The exact type depends on <code>output_type</code>
passed to the pipeline: a list of PIL frames for <code>"pil"</code> (default), an <code>np.ndarray</code> of shape <code>[T, H, W, C]</code> for <code>"np"</code>, a <code>torch.Tensor</code> of shape <code>[T, C, H, W]</code> for <code>"pt"</code>, or a raw latent tensor
when <code>output_type="latent"</code>.<!-- HTML_TAG_END --> </span></span> </li><li class="text-base !pl-4 my-3 rounded "><span class="group flex space-x-1.5 items-start"><a id="diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput.sound" class="header-link block pr-0.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#diffusers.pipelines.cosmos.pipeline_cosmos3_omni.Cosmos3OmniPipelineOutput.sound"><span><svg class="text-smd" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span><!-- HTML_TAG_START --><strong>sound</strong> — Decoded audio waveform of shape <code>[C, N]</code>. <code>None</code> when
<code>enable_sound=False</code>.<!-- HTML_TAG_END --> </span></span> </li></ul> </div></div> <p data-svelte-h="svelte-1b7skfg">Output dataclass for <a href="/docs/diffusers/pr_12968/en/api/pipelines/cosmos3#diffusers.Cosmos3OmniPipeline">Cosmos3OmniPipeline</a>.</p></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/api/pipelines/cosmos3.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
	// Page bootstrap: publishes the asset/base paths on a global, then loads the
	// two entry modules and hands control to `kit.start`.
	//
	// This must remain a classic script (no type="module"):
	// `document.currentScript` is null inside module scripts, and the lookup
	// below depends on it. Dynamic `import()` is still available here.
	{
		// Assigned without a declaration on purpose: in a classic, non-strict
		// script this creates a property on the global object, so code loaded
		// later can read it.
		__sveltekit_1xrfl8c = {
			assets: "/docs/diffusers/pr_12968/en",
			base: "/docs/diffusers/pr_12968/en",
			env: {}
		};

		// The element that contains this <script> tag; passed to `kit.start`
		// as the target element (presumably the hydration root — confirm
		// against the SvelteKit version in use).
		const element = document.currentScript.parentElement;

		// Two null entries, matching the two ids in `node_ids` below
		// (presumably "no server data for either node" — TODO confirm).
		const data = [null,null];

		// Fetch both entry modules in parallel; start only once both resolved.
		Promise.all([
			import("/docs/diffusers/pr_12968/en/_app/immutable/entry/start.9eac431c.js"),
			import("/docs/diffusers/pr_12968/en/_app/immutable/entry/app.d5430b76.js")
		]).then(([kit, app]) => {
			kit.start(app, element, {
				node_ids: [0, 143],
				data,
				form: null,
				error: null
			});
		});
	}
</script>
Xet Storage Details
- Size:
- 81.5 kB
- Xet hash:
- 66a22e31801ce615c6676422c4b26a2a0e8556a566b6cfb900d16e3cb5411930
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.