Buckets:

hf-doc-build/doc / diffusers /main /en /training /nemo_automodel.html
HuggingFaceDocBuilder's picture
download
raw
60 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;NeMo Automodel&quot;,&quot;local&quot;:&quot;nemo-automodel&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Supported models&quot;,&quot;local&quot;:&quot;supported-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Installation&quot;,&quot;local&quot;:&quot;installation&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Hardware requirements&quot;,&quot;local&quot;:&quot;hardware-requirements&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Data preparation&quot;,&quot;local&quot;:&quot;data-preparation&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Output format&quot;,&quot;local&quot;:&quot;output-format&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Training configuration&quot;,&quot;local&quot;:&quot;training-configuration&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Config field reference&quot;,&quot;local&quot;:&quot;config-field-reference&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Launch training&quot;,&quot;local&quot;:&quot;launch-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Generation&quot;,&quot;local&quot;:&quot;generation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Diffusers integration&quot;,&quot;local&quot;:&quot;diffusers-integration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;NVIDIA Team&quot;,&quot;local&quot;:&quot;nvidia-team&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Resources&quot;,&quot;local&quot;:&quot;resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/diffusers/main/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/entry/start.0c5ebd6d.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/scheduler.53228c21.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/singletons.b50c3a69.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/index.e93d0901.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/paths.1f6b9aa5.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/entry/app.8edb864d.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/preload-helper.5c375679.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/index.100fac89.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/nodes/0.6eef4be0.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/nodes/311.8d6d178e.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/CopyLLMTxtMenu.37908ba4.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/globals.7f7f1b26.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/IconCopy.38cf8f56.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.0a9663c0.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/CodeBlock.0adb3827.js">
<link rel="modulepreload" href="/docs/diffusers/main/en/_app/immutable/chunks/HfOption.fad27e59.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;NeMo Automodel&quot;,&quot;local&quot;:&quot;nemo-automodel&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Supported models&quot;,&quot;local&quot;:&quot;supported-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Installation&quot;,&quot;local&quot;:&quot;installation&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Hardware requirements&quot;,&quot;local&quot;:&quot;hardware-requirements&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Data preparation&quot;,&quot;local&quot;:&quot;data-preparation&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Output format&quot;,&quot;local&quot;:&quot;output-format&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Training configuration&quot;,&quot;local&quot;:&quot;training-configuration&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Config field reference&quot;,&quot;local&quot;:&quot;config-field-reference&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Launch training&quot;,&quot;local&quot;:&quot;launch-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Generation&quot;,&quot;local&quot;:&quot;generation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Diffusers integration&quot;,&quot;local&quot;:&quot;diffusers-integration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;NVIDIA Team&quot;,&quot;local&quot;:&quot;nvidia-team&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Resources&quot;,&quot;local&quot;:&quot;resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- 
HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="nemo-automodel" 
class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nemo-automodel"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>NeMo Automodel</span></h1> <p data-svelte-h="svelte-vwsori"><a href="https://github.com/NVIDIA-NeMo/Automodel" rel="nofollow">NeMo Automodel</a> is a PyTorch DTensor-native training library from NVIDIA for fine-tuning and pretraining diffusion models at scale. It is Hugging Face native — train any Diffusers-format model from the Hub with no checkpoint conversion. The same YAML recipe and hackable training script run at any scale from 1 GPU to hundreds of nodes, with <a href="https://pytorch.org/docs/stable/fsdp.html" rel="nofollow">FSDP2</a> distributed training, multiresolution bucketed dataloading, and pre-encoded latent space training for maximum GPU utilization. It uses <a href="https://huggingface.co/papers/2210.02747" rel="nofollow">flow matching</a> for training and is fully open source (Apache 2.0), NVIDIA-supported, and actively maintained.</p> <p data-svelte-h="svelte-1g9mvbe">NeMo Automodel integrates directly with Diffusers. 
It loads pretrained models from the Hugging Face Hub using Diffusers model classes and generates outputs with the <a href="/docs/diffusers/main/en/api/pipelines/overview#diffusers.DiffusionPipeline">DiffusionPipeline</a>.</p> <p data-svelte-h="svelte-1bjj3o0">The typical workflow is to install NeMo Automodel (pip or Docker), prepare your data by encoding it into <code>.meta</code> files, configure a YAML recipe, launch training with <code>torchrun</code>, and run inference with the resulting checkpoint.</p> <h2 class="relative group"><a id="supported-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#supported-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Supported models</span></h2> <table data-svelte-h="svelte-db0jgo"><thead><tr><th>Model</th> <th>Hugging Face ID</th> <th>Task</th> <th>Parameters</th> <th>Use case</th></tr></thead> <tbody><tr><td>Wan 2.1 T2V 1.3B</td> <td><a href="https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers" rel="nofollow">Wan-AI/Wan2.1-T2V-1.3B-Diffusers</a></td> <td>Text-to-Video</td> <td>1.3B</td> <td>video generation on limited hardware (fits on single 40GB A100)</td></tr> 
<tr><td>FLUX.1-dev</td> <td><a href="https://huggingface.co/black-forest-labs/FLUX.1-dev" rel="nofollow">black-forest-labs/FLUX.1-dev</a></td> <td>Text-to-Image</td> <td>12B</td> <td>high-quality image generation</td></tr> <tr><td>HunyuanVideo 1.5</td> <td><a href="https://huggingface.co/hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v" rel="nofollow">hunyuanvideo-community/HunyuanVideo-1.5-Diffusers-720p_t2v</a></td> <td>Text-to-Video</td> <td>13B</td> <td>high-quality video generation</td></tr></tbody></table> <h2 class="relative group"><a id="installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Installation</span></h2> <h3 class="relative group"><a id="hardware-requirements" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#hardware-requirements"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Hardware requirements</span></h3> <table data-svelte-h="svelte-1yhsr3i"><thead><tr><th>Component</th> <th>Minimum</th> <th>Recommended</th></tr></thead> <tbody><tr><td>GPU</td> <td>A100 40GB</td> <td>A100 80GB / H100</td></tr> <tr><td>GPUs</td> <td>4</td> <td>8+</td></tr> <tr><td>RAM</td> <td>128 GB</td> <td>256 GB+</td></tr> <tr><td>Storage</td> <td>500 GB SSD</td> <td>2 TB NVMe</td></tr></tbody></table> <p data-svelte-h="svelte-1n9l4qf">Install NeMo Automodel with pip. 
For the full set of installation methods (including from source), see the <a href="https://docs.nvidia.com/nemo/automodel/latest/guides/installation.html" rel="nofollow">NeMo Automodel installation guide</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->pip3 install nemo-automodel<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-bpga3n">Alternatively, use the pre-built Docker container which includes all dependencies.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->docker pull nvcr.io/nvidia/nemo-automodel:26.02.00
docker run --gpus all -it --<span class="hljs-built_in">rm</span> --shm-size=8g nvcr.io/nvidia/nemo-automodel:26.02.00<!-- HTML_TAG_END --></pre></div> <blockquote class="warning" data-svelte-h="svelte-846bqe"><p>Checkpoints are lost when the container exits unless you bind-mount the checkpoint directory to the host. For example, add <code>-v /host/path/checkpoints:/workspace/checkpoints</code> to the <code>docker run</code> command.</p></blockquote> <h2 class="relative group"><a id="data-preparation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#data-preparation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Data preparation</span></h2> <p data-svelte-h="svelte-1qi0b7c">NeMo Automodel trains diffusion models in latent space. Raw images or videos must be preprocessed into <code>.meta</code> files containing VAE latents and text embeddings before training. This avoids re-encoding on every training step.</p> <p data-svelte-h="svelte-11qkjw5">Use the built-in preprocessing tool to encode your data. 
The tool automatically distributes work across all available GPUs.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">video preprocessing </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">image preprocessing </div></div> <div class="language-select"><p data-svelte-h="svelte-o4c46c">The video preprocessing command is the same for both Wan 2.1 and HunyuanVideo, but the flags differ. Wan 2.1 uses <code>--processor wan</code> with <code>--resolution_preset</code> and <code>--caption_format sidecar</code>, while HunyuanVideo uses <code>--processor hunyuan</code> with <code>--target_frames</code> to set the frame count and <code>--caption_format meta_json</code>.</p> <p data-svelte-h="svelte-s4duoi"><strong>Wan 2.1:</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 
translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->python -m tools.diffusion.preprocessing_multiprocess video \
--video_dir /data/videos \
--output_dir /cache \
--processor wan \
--resolution_preset 512p \
--caption_format sidecar<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1vhq00g"><strong>HunyuanVideo:</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->python -m tools.diffusion.preprocessing_multiprocess video \
--video_dir /data/videos \
--output_dir /cache \
--processor hunyuan \
--target_frames 121 \
--caption_format meta_json<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="output-format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#output-format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Output format</span></h3> <p data-svelte-h="svelte-1rr885g">Preprocessing produces a cache directory organized by resolution bucket. NeMo Automodel supports multi-resolution training through bucketed sampling. 
Samples are grouped by spatial resolution so each batch contains same-size samples, avoiding padding waste.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=" "><!-- HTML_TAG_START --><span class="hljs-string">/cache/</span>
├── 512x512/ <span class="hljs-comment"># Resolution bucket</span>
│ ├── &lt;hash1&gt;<span class="hljs-string">.meta</span> <span class="hljs-comment"># VAE latents + text embeddings</span>
│ ├── &lt;hash2&gt;<span class="hljs-string">.meta</span>
│ └── <span class="hljs-string">...</span>
├── 832x480/ <span class="hljs-comment"># Another resolution bucket</span>
│ └── <span class="hljs-string">...</span>
├── metadata.json <span class="hljs-comment"># Global config (processor, model, total items)</span>
└── metadata_shard_0000.json <span class="hljs-comment"># Per-sample metadata (paths, resolutions, captions)</span><!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-6bij95"><p>See the <a href="https://docs.nvidia.com/nemo/automodel/latest/guides/diffusion/dataset.html" rel="nofollow">Diffusion Dataset Preparation</a> guide for caption formats, input data requirements, and all available preprocessing arguments.</p></blockquote> <h2 class="relative group"><a id="training-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training configuration</span></h2> <p data-svelte-h="svelte-1vt7e3n">Fine-tuning is driven by two components:</p> <ol data-svelte-h="svelte-1q8zoje"><li>A recipe script (<a href="https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/diffusion/finetune/finetune.py" rel="nofollow">finetune.py</a>) is a Python entry point that contains the training loop: loading the model, building the dataloader, running forward/backward passes, computing the flow matching loss, checkpointing, and logging.</li> 
<li>A YAML configuration file specifies all settings the recipe uses: which model to fine-tune, where the data lives, optimizer hyperparameters, parallelism strategy, and more. You customize training by editing this file rather than modifying code, allowing you to scale from 1 to hundreds of GPUs.</li></ol> <p data-svelte-h="svelte-1ctsq8o">Any YAML field can also be overridden from the CLI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->torchrun --nproc-per-node=8 examples/diffusion/finetune/finetune.py \
-c examples/diffusion/finetune/wan2_1_t2v_flow.yaml \
--optim.learning_rate 1e-5 \
--step_scheduler.num_epochs 50<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-erc81m">Below is the annotated config for fine-tuning Wan 2.1 T2V 1.3B, with each section explained.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-yaml "><!-- HTML_TAG_START --><span class="hljs-attr">seed:</span> <span class="hljs-number">42</span>
<span class="hljs-comment"># ── Experiment tracking (optional) ──────────────────────────────────────────</span>
<span class="hljs-comment"># Weights &amp; Biases integration for logging metrics, losses, and learning rates.</span>
<span class="hljs-comment"># Set mode: &quot;disabled&quot; to turn off.</span>
<span class="hljs-attr">wandb:</span>
  <span class="hljs-attr">project:</span> <span class="hljs-string">wan-t2v-flow-matching</span>
  <span class="hljs-attr">mode:</span> <span class="hljs-string">online</span>
  <span class="hljs-attr">name:</span> <span class="hljs-string">wan2_1_t2v_fm</span>
<span class="hljs-comment"># ── Model ───────────────────────────────────────────────────────────────────</span>
<span class="hljs-comment"># pretrained_model_name_or_path: any Hugging Face model ID or local path.</span>
<span class="hljs-comment"># mode: &quot;finetune&quot; loads pretrained weights; &quot;pretrain&quot; trains from scratch.</span>
<span class="hljs-attr">model:</span>
  <span class="hljs-attr">pretrained_model_name_or_path:</span> <span class="hljs-string">Wan-AI/Wan2.1-T2V-1.3B-Diffusers</span>
  <span class="hljs-attr">mode:</span> <span class="hljs-string">finetune</span>
<span class="hljs-comment"># ── Training schedule ───────────────────────────────────────────────────────</span>
<span class="hljs-comment"># global_batch_size: effective batch across all GPUs.</span>
<span class="hljs-comment"># Gradient accumulation is computed automatically: global / (local × num_gpus).</span>
<span class="hljs-attr">step_scheduler:</span>
  <span class="hljs-attr">global_batch_size:</span> <span class="hljs-number">8</span>
  <span class="hljs-attr">local_batch_size:</span> <span class="hljs-number">1</span>
  <span class="hljs-attr">ckpt_every_steps:</span> <span class="hljs-number">1000</span> <span class="hljs-comment"># Save a checkpoint every N steps</span>
  <span class="hljs-attr">num_epochs:</span> <span class="hljs-number">100</span>
  <span class="hljs-attr">log_every:</span> <span class="hljs-number">2</span> <span class="hljs-comment"># Log metrics every N steps</span>
<span class="hljs-comment"># ── Data ────────────────────────────────────────────────────────────────────</span>
<span class="hljs-comment"># _target_: the dataloader factory function.</span>
<span class="hljs-comment"># Use build_video_multiresolution_dataloader for video models (Wan, HunyuanVideo).</span>
<span class="hljs-comment"># Use build_text_to_image_multiresolution_dataloader for image models (FLUX).</span>
<span class="hljs-comment"># model_type: &quot;wan&quot; or &quot;hunyuan&quot; (selects the correct latent format).</span>
<span class="hljs-comment"># base_resolution: target resolution for multiresolution bucketing.</span>
<span class="hljs-attr">data:</span>
  <span class="hljs-attr">dataloader:</span>
    <span class="hljs-attr">_target_:</span> <span class="hljs-string">nemo_automodel.components.datasets.diffusion.build_video_multiresolution_dataloader</span>
    <span class="hljs-attr">cache_dir:</span> <span class="hljs-string">PATH_TO_YOUR_DATA</span>
    <span class="hljs-attr">model_type:</span> <span class="hljs-string">wan</span>
    <span class="hljs-attr">base_resolution:</span> [<span class="hljs-number">512</span>, <span class="hljs-number">512</span>]
    <span class="hljs-attr">dynamic_batch_size:</span> <span class="hljs-literal">false</span> <span class="hljs-comment"># When true, adjusts batch per bucket to maintain constant memory</span>
    <span class="hljs-attr">shuffle:</span> <span class="hljs-literal">true</span>
    <span class="hljs-attr">drop_last:</span> <span class="hljs-literal">false</span>
    <span class="hljs-attr">num_workers:</span> <span class="hljs-number">0</span>
<span class="hljs-comment"># ── Optimizer ───────────────────────────────────────────────────────────────</span>
<span class="hljs-comment"># learning_rate: 5e-6 is a good starting point for fine-tuning.</span>
<span class="hljs-comment"># Adjust weight_decay and betas for your dataset.</span>
<span class="hljs-attr">optim:</span>
  <span class="hljs-attr">learning_rate:</span> <span class="hljs-number">5e-6</span>
  <span class="hljs-attr">optimizer:</span>
    <span class="hljs-attr">weight_decay:</span> <span class="hljs-number">0.01</span>
    <span class="hljs-attr">betas:</span> [<span class="hljs-number">0.9</span>, <span class="hljs-number">0.999</span>]
<span class="hljs-comment"># ── Learning rate scheduler ─────────────────────────────────────────────────</span>
<span class="hljs-comment"># Supports cosine, linear, and constant schedules.</span>
<span class="hljs-attr">lr_scheduler:</span>
  <span class="hljs-attr">lr_decay_style:</span> <span class="hljs-string">cosine</span>
  <span class="hljs-attr">lr_warmup_steps:</span> <span class="hljs-number">0</span>
  <span class="hljs-attr">min_lr:</span> <span class="hljs-number">1e-6</span>
<span class="hljs-comment"># ── Flow matching ───────────────────────────────────────────────────────────</span>
<span class="hljs-comment"># adapter_type: model-specific adapter — must match the model:</span>
<span class="hljs-comment"># &quot;simple&quot; for Wan 2.1, &quot;flux&quot; for FLUX.1-dev, &quot;hunyuan&quot; for HunyuanVideo.</span>
<span class="hljs-comment"># timestep_sampling: &quot;uniform&quot; for Wan, &quot;logit_normal&quot; for FLUX and HunyuanVideo.</span>
<span class="hljs-comment"># flow_shift: shifts the flow schedule (model-dependent).</span>
<span class="hljs-comment"># i2v_prob: probability of image-to-video conditioning during training (video models).</span>
<span class="hljs-attr">flow_matching:</span>
  <span class="hljs-attr">adapter_type:</span> <span class="hljs-string">&quot;simple&quot;</span>
  <span class="hljs-attr">adapter_kwargs:</span> {}
  <span class="hljs-attr">timestep_sampling:</span> <span class="hljs-string">&quot;uniform&quot;</span>
  <span class="hljs-attr">logit_mean:</span> <span class="hljs-number">0.0</span>
  <span class="hljs-attr">logit_std:</span> <span class="hljs-number">1.0</span>
  <span class="hljs-attr">flow_shift:</span> <span class="hljs-number">3.0</span>
  <span class="hljs-attr">num_train_timesteps:</span> <span class="hljs-number">1000</span>
  <span class="hljs-attr">i2v_prob:</span> <span class="hljs-number">0.3</span>
  <span class="hljs-attr">use_loss_weighting:</span> <span class="hljs-literal">true</span>
<span class="hljs-comment"># ── FSDP2 distributed training ──────────────────────────────────────────────</span>
<span class="hljs-comment"># dp_size: number of GPUs for data parallelism (typically = total GPUs on node).</span>
<span class="hljs-comment"># tp_size, cp_size, pp_size: tensor, context, and pipeline parallelism.</span>
<span class="hljs-comment"># For most fine-tuning, dp_size is all you need; leave others at 1.</span>
<span class="hljs-attr">fsdp:</span>
  <span class="hljs-attr">tp_size:</span> <span class="hljs-number">1</span>
  <span class="hljs-attr">cp_size:</span> <span class="hljs-number">1</span>
  <span class="hljs-attr">pp_size:</span> <span class="hljs-number">1</span>
  <span class="hljs-attr">dp_replicate_size:</span> <span class="hljs-number">1</span>
  <span class="hljs-attr">dp_size:</span> <span class="hljs-number">8</span>
<span class="hljs-comment"># ── Checkpointing ──────────────────────────────────────────────────────────</span>
<span class="hljs-comment"># checkpoint_dir: where to save checkpoints (use a persistent path with Docker).</span>
<span class="hljs-comment"># restore_from: path to resume training from a previous checkpoint.</span>
<span class="hljs-attr">checkpoint:</span>
  <span class="hljs-attr">enabled:</span> <span class="hljs-literal">true</span>
  <span class="hljs-attr">checkpoint_dir:</span> <span class="hljs-string">PATH_TO_YOUR_CKPT_DIR</span>
  <span class="hljs-attr">model_save_format:</span> <span class="hljs-string">torch_save</span>
  <span class="hljs-attr">save_consolidated:</span> <span class="hljs-literal">false</span>
  <span class="hljs-attr">restore_from:</span> <span class="hljs-literal">null</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="config-field-reference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#config-field-reference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Config field reference</span></h3> <p data-svelte-h="svelte-8zrcpw">The table below lists the minimal required configs. See the <a href="https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/diffusion/finetune" rel="nofollow">NeMo Automodel examples</a> for full example configs for all models.</p> <table data-svelte-h="svelte-kcyat9"><thead><tr><th>Section</th> <th>Required?</th> <th>What to Change</th></tr></thead> <tbody><tr><td><code>model</code></td> <td>Yes</td> <td>Set <code>pretrained_model_name_or_path</code> to the Hugging Face model ID. Set <code>mode: finetune</code> or <code>mode: pretrain</code>.</td></tr> <tr><td><code>step_scheduler</code></td> <td>Yes</td> <td><code>global_batch_size</code> is the effective batch size across all GPUs. <code>ckpt_every_steps</code> controls checkpoint frequency. 
Gradient accumulation is computed automatically.</td></tr> <tr><td><code>data</code></td> <td>Yes</td> <td>Set <code>cache_dir</code> to the path containing your preprocessed <code>.meta</code> files. Change <code>_target_</code> and <code>model_type</code> for different models.</td></tr> <tr><td><code>optim</code></td> <td>Yes</td> <td><code>learning_rate: 5e-6</code> is a good default for fine-tuning. Adjust for your dataset and model.</td></tr> <tr><td><code>lr_scheduler</code></td> <td>Yes</td> <td>Choose <code>cosine</code>, <code>linear</code>, or <code>constant</code> for <code>lr_decay_style</code>. Set <code>lr_warmup_steps</code> for gradual warmup.</td></tr> <tr><td><code>flow_matching</code></td> <td>Yes</td> <td><code>adapter_type</code> must match the model (<code>simple</code> for Wan, <code>flux</code> for FLUX, <code>hunyuan</code> for HunyuanVideo). See model-specific configs for <code>adapter_kwargs</code>.</td></tr> <tr><td><code>fsdp</code></td> <td>Yes</td> <td>Set <code>dp_size</code> to the number of GPUs. For multi-node, set to total GPUs across all nodes.</td></tr> <tr><td><code>checkpoint</code></td> <td>Recommended</td> <td>Set <code>checkpoint_dir</code> to a persistent path, especially in Docker. Use <code>restore_from</code> to resume from a previous checkpoint.</td></tr> <tr><td><code>wandb</code></td> <td>Optional</td> <td>Configure to enable Weights &amp; Biases experiment tracking. 
Set <code>mode: disabled</code> to turn off.</td></tr></tbody></table> <h2 class="relative group"><a id="launch-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#launch-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Launch training</span></h2> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">single-node </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">multi-node </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->torchrun --nproc-per-node=8 \
examples/diffusion/finetune/finetune.py \
-c examples/diffusion/finetune/wan2_1_t2v_flow.yaml<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Generation</span></h2> <p data-svelte-h="svelte-1i39ww6">After training, generate videos or images from text prompts using the fine-tuned checkpoint.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">Wan 2.1 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">FLUX </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">HunyuanVideo </div></div> <div class="language-select"><div class="code-block relative "><div 
class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->python examples/diffusion/generate/generate.py \
-c examples/diffusion/generate/configs/generate_wan.yaml<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-yt8iqi">With a fine-tuned checkpoint:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->python examples/diffusion/generate/generate.py \
-c examples/diffusion/generate/configs/generate_wan.yaml \
--model.checkpoint ./checkpoints/step_1000 \
--inference.prompts <span class="hljs-string">&#x27;[&quot;A dog running on a beach&quot;]&#x27;</span><!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="diffusers-integration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#diffusers-integration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Diffusers integration</span></h2> <p data-svelte-h="svelte-1e4bt5y">NeMo Automodel is built on top of Diffusers and uses it as the backbone for model loading and inference. 
It loads models directly from the Hugging Face Hub using Diffusers model classes such as <a href="/docs/diffusers/main/en/api/models/wan_transformer_3d#diffusers.WanTransformer3DModel">WanTransformer3DModel</a>, <a href="/docs/diffusers/main/en/api/models/flux_transformer#diffusers.FluxTransformer2DModel">FluxTransformer2DModel</a>, and <a href="/docs/diffusers/main/en/api/models/hunyuan_video_transformer_3d#diffusers.HunyuanVideoTransformer3DModel">HunyuanVideoTransformer3DModel</a>, and generates outputs via Diffusers pipelines like <a href="/docs/diffusers/main/en/api/pipelines/wan#diffusers.WanPipeline">WanPipeline</a> and <a href="/docs/diffusers/main/en/api/pipelines/flux#diffusers.FluxPipeline">FluxPipeline</a>.</p> <p data-svelte-h="svelte-4t1ef4">This integration provides several benefits for Diffusers users:</p> <ul data-svelte-h="svelte-bv1z1p"><li><strong>No checkpoint conversion</strong>: pretrained weights from the Hub work out of the box. Point <code>pretrained_model_name_or_path</code> at any Diffusers-format model ID and start training immediately.</li> <li><strong>Day-0 model support</strong>: when a new diffusion model is added to Diffusers and uploaded to the Hub, it can be fine-tuned with NeMo Automodel without waiting for a dedicated training script.</li> <li><strong>Pipeline-compatible outputs</strong>: fine-tuned checkpoints are saved in a format that can be loaded directly back into Diffusers pipelines for inference, sharing on the Hub, or further optimization with tools like quantization and compilation.</li> <li><strong>Scalable training for Diffusers models</strong>: NeMo Automodel adds distributed training capabilities (FSDP2, multi-node, multiresolution bucketing) that go beyond what the built-in Diffusers training scripts provide, while keeping the same model and pipeline interfaces.</li> <li><strong>Shared ecosystem</strong>: any model, LoRA adapter, or pipeline component from the Diffusers ecosystem remains compatible throughout the 
training and inference workflow.</li></ul> <h2 class="relative group"><a id="nvidia-team" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nvidia-team"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>NVIDIA Team</span></h2> <ul data-svelte-h="svelte-jce5f"><li>Pranav Prashant Thombre, <a href="mailto:pthombre@nvidia.com">pthombre@nvidia.com</a></li> <li>Linnan Wang, <a href="mailto:linnanw@nvidia.com">linnanw@nvidia.com</a></li> <li>Alexandros Koumparoulis, <a href="mailto:akoumparouli@nvidia.com">akoumparouli@nvidia.com</a></li></ul> <h2 class="relative group"><a id="resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 
28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Resources</span></h2> <ul data-svelte-h="svelte-1hzqipv"><li><a href="https://github.com/NVIDIA-NeMo/Automodel" rel="nofollow">NeMo Automodel GitHub</a></li> <li><a href="https://docs.nvidia.com/nemo/automodel/latest/guides/diffusion/finetune.html" rel="nofollow">Diffusion Fine-Tuning Guide</a></li> <li><a href="https://docs.nvidia.com/nemo/automodel/latest/guides/diffusion/dataset.html" rel="nofollow">Diffusion Dataset Preparation</a></li> <li><a href="https://docs.nvidia.com/nemo/automodel/latest/model-coverage/diffusion.html" rel="nofollow">Diffusion Model Coverage</a></li> <li><a href="https://huggingface.co/docs/transformers/en/community_integrations/nemo_automodel_finetuning" rel="nofollow">NeMo Automodel for Transformers (LLM/VLM fine-tuning)</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/diffusers/blob/main/docs/source/en/training/nemo_automodel.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
  // Page-level settings object, assigned to a global before either entry
  // module is requested. Presumably read by the imported client bundles
  // (their source is not visible here) — keep the name and keys as-is.
  __sveltekit_aglnsu = {
    assets: "/docs/diffusers/main/en",
    base: "/docs/diffusers/main/en",
    env: {}
  };

  // Grab the element that contains this <script> synchronously:
  // document.currentScript is no longer available inside the promise callback.
  const mountTarget = document.currentScript.parentElement;

  // One placeholder entry per id listed in node_ids below.
  const pageData = [null, null];

  // Options handed to kit.start() once both modules have loaded.
  const startOptions = {
    node_ids: [0, 311],
    data: pageData,
    form: null,
    error: null
  };

  // Request both client entry modules in parallel.
  const entryModules = [
    import("/docs/diffusers/main/en/_app/immutable/entry/start.0c5ebd6d.js"),
    import("/docs/diffusers/main/en/_app/immutable/entry/app.8edb864d.js")
  ];

  Promise.all(entryModules).then((loaded) => {
    const kit = loaded[0];
    const app = loaded[1];
    kit.start(app, mountTarget, startOptions);
  });
}
</script>

Xet Storage Details

Size:
60 kB
·
Xet hash:
f016c5d042a4fc7281315a2ad14c045f787aadfbf7c75fcc21c5f7160c449f53

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.