Buckets:

hf-doc-build/doc / optimum-neuron /main /en /guides /export_model.html
rtrm's picture
download
raw
60 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Export a model to Neuron&quot;,&quot;local&quot;:&quot;export-a-model-to-neuron&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Summary&quot;,&quot;local&quot;:&quot;summary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Why compile to Neuron model?&quot;,&quot;local&quot;:&quot;why-compile-to-neuron-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Exporting a model to Neuron using the CLI&quot;,&quot;local&quot;:&quot;exporting-a-model-to-neuron-using-the-cli&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Exporting standard (non-LLM) models&quot;,&quot;local&quot;:&quot;exporting-standard-non-llm-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Exporting Stable Diffusion to Neuron&quot;,&quot;local&quot;:&quot;exporting-stable-diffusion-to-neuron&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Exporting Stable Diffusion XL to Neuron&quot;,&quot;local&quot;:&quot;exporting-stable-diffusion-xl-to-neuron&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Exporting LLMs to Neuron&quot;,&quot;local&quot;:&quot;exporting-llms-to-neuron&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Exporting neuron models using NeuronX docker images&quot;,&quot;local&quot;:&quot;exporting-neuron-models-using-neuronx-docker-images&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Exporting options and Docker / SageMaker environment variables&quot;,&quot;local&quot;:&quot;exporting-options-and-docker--sagemaker-environment-variables&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/optimum.neuron/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/entry/start.e7cdb183.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/scheduler.56725da7.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/singletons.635e76a3.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/paths.ed3a4dd8.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/entry/app.c5810efa.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/preload-helper.ec99a452.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/index.18a26576.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/nodes/0.f24306d7.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/nodes/12.9c19b708.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/Tip.5b941656.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/CopyLLMTxtMenu.18367b53.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/globals.7f7f1b26.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.47599cff.js">
<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/CodeBlock.bd8b9965.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Export a model to Neuron&quot;,&quot;local&quot;:&quot;export-a-model-to-neuron&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Summary&quot;,&quot;local&quot;:&quot;summary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Why compile to Neuron model?&quot;,&quot;local&quot;:&quot;why-compile-to-neuron-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Exporting a model to Neuron using the CLI&quot;,&quot;local&quot;:&quot;exporting-a-model-to-neuron-using-the-cli&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Exporting standard (non-LLM) models&quot;,&quot;local&quot;:&quot;exporting-standard-non-llm-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Exporting Stable Diffusion to Neuron&quot;,&quot;local&quot;:&quot;exporting-stable-diffusion-to-neuron&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Exporting Stable Diffusion XL to Neuron&quot;,&quot;local&quot;:&quot;exporting-stable-diffusion-xl-to-neuron&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Exporting LLMs to Neuron&quot;,&quot;local&quot;:&quot;exporting-llms-to-neuron&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Exporting neuron models using NeuronX docker images&quot;,&quot;local&quot;:&quot;exporting-neuron-models-using-neuronx-docker-images&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Exporting options and Docker / SageMaker environment variables&quot;,&quot;local&quot;:&quot;exporting-options-and-docker--sagemaker-environment-variables&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="export-a-model-to-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#export-a-model-to-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Export a model to Neuron</span></h1> <h2 class="relative group"><a id="summary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#summary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Summary</span></h2> <p data-svelte-h="svelte-166ewzk">Exporting a PyTorch model to Neuron model is as simple as</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron \
--model bert-base-uncased \
--sequence_length 128 \
--batch_size 1 \
bert_neuron/<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-b35baa">Check out the help for more options:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron --<span class="hljs-built_in">help</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="why-compile-to-neuron-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-compile-to-neuron-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why compile to Neuron model?</span></h2> <p data-svelte-h="svelte-1esvin7">AWS provides two generations of the Trainium/Inferentia accelerator built for machine learning inference with higher throughput, lower latency but lower cost: <a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html" rel="nofollow">inf2 (NeuronCore-v2)</a>.</p> <p data-svelte-h="svelte-1und41a">In production environments, to deploy 🤗 <a href="https://huggingface.co/docs/transformers/index" rel="nofollow">Transformers</a> models on Neuron devices, you need to compile your models and export them to a serialized format before inference. Through Ahead-Of-Time (AOT) compilation with Neuron Compiler( <a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/compiler/neuronx-cc/index.html" rel="nofollow">neuronx-cc</a> or <a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/compiler/neuron-cc/neuron-cc.html" rel="nofollow">neuron-cc</a> ), your models will be converted to serialized and optimized <a href="https://pytorch.org/docs/stable/generated/torch.jit.ScriptModule.html" rel="nofollow">TorchScript modules</a>.</p> <p data-svelte-h="svelte-2ri5fz">Although pre-compilation avoids overhead during the inference, a compiled Neuron model has some limitations:</p> <ul data-svelte-h="svelte-9ojx5"><li>The input shapes and data types used during the compilation cannot be changed.</li> <li>Neuron models are specialized for each hardware and SDK version, which means:
<ul><li>Models compiled with Neuron can no longer be executed in non-Neuron environment.</li> <li>Models compiled for trn2 (NeuronCore-v3) are not compatible with inf2 (NeuronCore-v2), and vice versa.</li> <li>Models compiled for an SDK version are (generally) not compatible with another SDK version.</li></ul></li></ul> <p data-svelte-h="svelte-189k6c5">In this guide, we’ll show you how to export your models to serialized models optimized for Neuron devices.</p> <blockquote class="tip"><p data-svelte-h="svelte-1u9pown">🤗 Optimum provides support for the Neuron export by leveraging configuration objects.
These configuration objects come ready made for a number of model architectures, and are designed to be easily extendable to other architectures.</p> <p data-svelte-h="svelte-430w7w"><strong>To check the supported architectures, go to the <a href="../package_reference/configuration">configuration reference page</a>.</strong></p></blockquote> <h2 class="relative group"><a id="exporting-a-model-to-neuron-using-the-cli" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exporting-a-model-to-neuron-using-the-cli"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exporting a model to Neuron using the CLI</span></h2> <p data-svelte-h="svelte-jz73et">To export a 🤗 Transformers model to Neuron, you’ll first need to install some extra dependencies:</p> <p data-svelte-h="svelte-1qm5x97"><strong>For Inf2</strong></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install optimum-neuron[neuronx]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cb2pjp">The Optimum Neuron export can be used through Optimum command-line:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron --<span class="hljs-built_in">help</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="exporting-standard-non-llm-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exporting-standard-non-llm-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exporting standard (non-LLM) models</span></h3> <p data-svelte-h="svelte-p91rke">Most models present on the Hugging Face hub can be straightforwardly exported using torch trace, then converted to serialized and optimized TorchScript modules.</p> <blockquote class="tip"><img title="Compilation flow" alt="Compilation flow" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/optimum/neuron/inf_compile_flow.png"> <p data-svelte-h="svelte-di565y"><strong>NEFF</strong>: Neuron Executable File Format which is a binary executable on Neuron devices.</p></blockquote> <p data-svelte-h="svelte-9ahyv2">When exporting a model, two sets of export arguments must be passed:</p> <ul data-svelte-h="svelte-uw9cro"><li><code>compiler_args</code> are optional arguments for the compiler, these arguments usually control how the compiler makes tradeoff between the inference performance (latency and throughput) and the accuracy,</li> <li><code>input_shapes</code> are mandatory static shape information that you need to send to the neuron compiler.</li></ul> <p data-svelte-h="svelte-1ldr9ls">Please type the following command to see all export parameters:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron -h<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-eag7fc">Exporting a standard NLP model can be done as follows:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron --model distilbert-base-uncased-distilled-squad \
--batch_size 1 --sequence_length 16 \
--auto_cast matmul --auto_cast_type fp16 \
distilbert_base_uncased_squad_neuron/<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1k4xv5r">Here the model was exported with a static input shape of <code>(1, 16)</code>, and with compiler arguments specifying
that matmul operation must be performed with <code>float16</code> precision for faster inference.</p> <blockquote class="tip"><p data-svelte-h="svelte-18qxt3n">You can also compile the model on a CPU-only instance. In this case, specify the target instance type by passing <code>--instance_type</code> from <code>{inf2, trn1, trn1n, trn2}</code>.</p> <p data-svelte-h="svelte-184kmao">If you are using a <code>NeuronModelForXXX</code> class to export the model on a CPU-only instance, you must define an environment variable <code>NEURON_PLATFORM_TARGET_OVERRIDE</code> before importing anything from the <code>neuronx_distributed</code> library, and specify the target instance type. For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> os
os.environ[<span class="hljs-string">&quot;NEURON_PLATFORM_TARGET_OVERRIDE&quot;</span>] = <span class="hljs-string">&quot;inf2&quot;</span><!-- HTML_TAG_END --></pre></div></blockquote> <p data-svelte-h="svelte-9jbb9y">After export, you should see the following logs which validate the model on Neuron devices by comparing with PyTorch model on CPU:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Validating Neuron model...
-[✓] Neuron model output names match reference model (last_hidden_state)
- Validating Neuron Model output <span class="hljs-string">&quot;last_hidden_state&quot;</span>:
-[✓] (1, 16, 32) matches (1, 16, 32)
-[✓] all values close (atol: 0.0001)
The Neuronx <span class="hljs-built_in">export</span> succeeded and the exported model was saved at: distilbert_base_uncased_squad_neuron/<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-v1u856">This exports a neuron-compiled TorchScript module of the checkpoint defined by the <code>--model</code> argument.</p> <p data-svelte-h="svelte-vql48n">As you can see, the task was automatically detected. This was possible because the model was on the Hub. For local models, providing the <code>--task</code> argument is needed or it will default to the model architecture without any task specific head:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron --model local_path --task question-answering --batch_size 1 --sequence_length 16 --dynamic-batch-size distilbert_base_uncased_squad_neuron/<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hgclei">Note that providing the <code>--task</code> argument for a model on the Hub will disable the automatic task detection. The resulting <code>model.neuron</code> file, can then be loaded and run on Neuron devices.</p> <p data-svelte-h="svelte-yhg472">For each model architecture, you can find the list of supported tasks via the <code>~exporters.tasks.TasksManager</code>. For example, for DistilBERT, for the Neuron export, we have:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> optimum.exporters.tasks <span class="hljs-keyword">import</span> TasksManager
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> optimum.exporters.neuron.model_configs <span class="hljs-keyword">import</span> * <span class="hljs-comment"># Register neuron specific configs to the TasksManager</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>distilbert_tasks = <span class="hljs-built_in">list</span>(TasksManager.get_supported_tasks_for_model_type(<span class="hljs-string">&quot;distilbert&quot;</span>, <span class="hljs-string">&quot;neuron&quot;</span>).keys())
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(distilbert_tasks)
[<span class="hljs-string">&#x27;feature-extraction&#x27;</span>, <span class="hljs-string">&#x27;fill-mask&#x27;</span>, <span class="hljs-string">&#x27;multiple-choice&#x27;</span>, <span class="hljs-string">&#x27;question-answering&#x27;</span>, <span class="hljs-string">&#x27;text-classification&#x27;</span>, <span class="hljs-string">&#x27;token-classification&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ab1ldn">You can then pass one of these tasks to the <code>--task</code> argument in the <code>optimum-cli export neuron</code> command, as mentioned above.</p> <p data-svelte-h="svelte-3rh7j4">Once exported, the neuron model can be used for inference directly with the <code>NeuronModelForXXX</code> class:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronModelForSequenceClassification
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;./distilbert-base-uncased-finetuned-sst-2-english_neuron/&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = NeuronModelForSequenceClassification.from_pretrained(<span class="hljs-string">&quot;./distilbert-base-uncased-finetuned-sst-2-english_neuron/&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>inputs = tokenizer(<span class="hljs-string">&quot;Hamilton is considered to be the best musical of human history.&quot;</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>logits = model(**inputs).logits
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(model.config.id2label[logits.argmax().item()])
<span class="hljs-string">&#x27;POSITIVE&#x27;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ugid4e">As you see, there is no need to pass the neuron arguments used during the export as they are
saved in a <code>config.json</code> file, and will be restored automatically by <code>NeuronModelForXXX</code> class.</p> <blockquote class="tip">Be careful, inputs are always padded to the shapes used for the compilation, and the padding brings computation overhead.
Adjust the static shapes to be higher than the shape of the inputs that you will feed into the model during the inference, but not much more.</blockquote> <h3 class="relative group"><a id="exporting-stable-diffusion-to-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exporting-stable-diffusion-to-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exporting Stable Diffusion to Neuron</span></h3> <p data-svelte-h="svelte-vfmt4g">With the Optimum CLI you can compile components in the Stable Diffusion pipeline to gain acceleration on neuron devices during the inference.</p> <p data-svelte-h="svelte-1lozjup">So far, we support the export of following components in the pipeline:</p> <ul data-svelte-h="svelte-1bxicg6"><li>CLIP text encoder</li> <li>U-Net</li> <li>VAE encoder</li> <li>VAE decoder</li></ul> <blockquote class="tip"><p data-svelte-h="svelte-9etaml">“These blocks are chosen because they represent the bulk of the compute in the pipeline, and performance benchmarking has shown that running them on Neuron yields significant performance benefit.”</p> <p data-svelte-h="svelte-1dh7n4s">Besides, don’t hesitate to tweak the compilation configuration to find the best tradeoff between performance v.s accuracy in your use case. By default, we suggest casting FP32 matrix multiplication operations to BF16 which offers good performance with moderate sacrifice of the accuracy. Check out the guide from <a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/appnotes/neuronx-cc/neuronx-cc-training-mixed-precision.html#neuronx-cc-training-mixed-precision" rel="nofollow">AWS Neuron documentation</a> to better understand the options for your compilation.</p></blockquote> <p data-svelte-h="svelte-1s0jvqd">Exporting a stable diffusion checkpoint can be done using the CLI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron --model stabilityai/stable-diffusion-2-1-base \
--task stable-diffusion \
--batch_size 1 \
--height 512 `<span class="hljs-comment"># height in pixels of generated image, eg. 512, 768` \</span>
--width 512 `<span class="hljs-comment"># width in pixels of generated image, eg. 512, 768` \</span>
--num_images_per_prompt 4 `<span class="hljs-comment"># number of images to generate per prompt, defaults to 1` \</span>
--auto_cast matmul `<span class="hljs-comment"># cast only matrix multiplication operations` \</span>
--auto_cast_type bf16 `<span class="hljs-comment"># cast operations from FP32 to BF16` \</span>
sd_neuron/<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="exporting-stable-diffusion-xl-to-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exporting-stable-diffusion-xl-to-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exporting Stable Diffusion XL to Neuron</span></h3> <p data-svelte-h="svelte-12l5vu1">Similar to Stable Diffusion, you will be able to use Optimum CLI to compile components in the SDXL pipeline for inference on neuron devices.</p> <p data-svelte-h="svelte-1cwhii6">We support the export of following components in the pipeline to boost the speed:</p> <ul data-svelte-h="svelte-17e9k9s"><li>Text encoder</li> <li>Second text encoder</li> <li>U-Net (a three times larger UNet than the one in Stable Diffusion pipeline)</li> <li>VAE encoder</li> <li>VAE decoder</li></ul> <blockquote class="tip"><p data-svelte-h="svelte-r3q4ut">“Stable Diffusion XL works especially well with images between 768 and 1024.”</p></blockquote> <p data-svelte-h="svelte-112a1oc">Exporting a SDXL checkpoint can be done using the CLI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron --model stabilityai/stable-diffusion-xl-base-1.0 \
--task stable-diffusion-xl \
--batch_size 1 \
--height 1024 `<span class="hljs-comment"># height in pixels of generated image, eg. 768, 1024` \</span>
--width 1024 `<span class="hljs-comment"># width in pixels of generated image, eg. 768, 1024` \</span>
--num_images_per_prompt 4 `<span class="hljs-comment"># number of images to generate per prompt, defaults to 1` \</span>
--auto_cast matmul `<span class="hljs-comment"># cast only matrix multiplication operations` \</span>
--auto_cast_type bf16 `<span class="hljs-comment"># cast operations from FP32 to BF16` \</span>
sd_neuron/<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="exporting-llms-to-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exporting-llms-to-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exporting LLMs to Neuron</span></h3> <p data-svelte-h="svelte-1fq0lez">Just like the standard NLP models, you need to specify static parameters when exporting an LLM model:</p> <ul data-svelte-h="svelte-1tmw6po"><li><code>batch_size</code> is the number of input sequences that the model will accept. Defaults to 1,</li> <li><code>sequence_length</code> is the maximum number of tokens in an input sequence. Defaults to <code>max_position_embeddings</code> (<code>n_positions</code> for older models).</li> <li><code>tensor_parallel_size</code> is the number of neuron cores used when instantiating the model. Each neuron core has 16 Gb of memory, which means that
bigger models need to be split on multiple cores. Defaults to 1,</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->optimum-cli <span class="hljs-built_in">export</span> neuron --model meta-llama/Llama-3.2-1B \
--batch_size 1 \
--sequence_length 4096 \
--tensor_parallel_size 2 \
llama3_neuron/<!-- HTML_TAG_END --></pre></div> <blockquote class="tip">The export of LLM models can take much longer than standard models (sometimes more than one hour).</blockquote> <p data-svelte-h="svelte-1jrgmyt">As explained before, the neuron model parameters are static.
This means in particular that during inference:</p> <ul data-svelte-h="svelte-1y4z1sb"><li>the <code>batch_size</code> of the inputs should be lower to the <code>batch_size</code> used during export,</li> <li>the <code>length</code> of the input sequences should be lower than the <code>sequence_length</code> used during export,</li> <li>the maximum number of tokens (input + generated) cannot exceed the <code>sequence_length</code> used during export.</li></ul> <p data-svelte-h="svelte-154my7d">Once exported, neuron llm models can simply be reloaded using the <code>NeuronModelForCausalLM</code> class.
As with the original transformers models, use <code>generate()</code> instead of <code>forward()</code> to generate text sequences.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->from transformers import AutoTokenizer
<span class="hljs-deletion">-from transformers import AutoModelForCausalLM</span>
<span class="hljs-addition">+from optimum.neuron import NeuronModelForCausalLM</span>
# Instantiate and convert to Neuron a PyTorch checkpoint
<span class="hljs-deletion">-model = AutoModelForCausalLM.from_pretrained(&quot;meta-llama/Llama-3.2-1B&quot;)</span>
<span class="hljs-addition">+model = NeuronModelForCausalLM.from_pretrained(&quot;./llama3-neuron&quot;)</span>
tokenizer = AutoTokenizer.from_pretrained(&quot;meta-llama/Llama-3.2-1B&quot;)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokens = tokenizer(&quot;I really wish &quot;, return_tensors=&quot;pt&quot;)
with torch.inference_mode():
sample_output = model.generate(
**tokens,
do_sample=True,
max_new_tokens=256,
temperature=0.7,
)
outputs = [tokenizer.decode(tok) for tok in sample_output]
print(outputs)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1266wn0">The generation is highly configurable. Please refer to <a href="https://huggingface.co/docs/transformers/generation_strategies" rel="nofollow">https://huggingface.co/docs/transformers/generation_strategies</a> for details.</p> <p data-svelte-h="svelte-1g2jw6w">Please be aware that:</p> <ul data-svelte-h="svelte-14ajc9a"><li>for each model architecture, default values are provided for all parameters, but values passed to the <code>generate</code> method will take precedence,</li> <li>the generation parameters can be stored in a <code>generation_config.json</code> file. When such a file is present in model directory,
it will be parsed to set the default parameters (the values passed to the <code>generate</code> method still take precedence).</li></ul> <h2 class="relative group"><a id="exporting-neuron-models-using-neuronx-docker-images" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exporting-neuron-models-using-neuronx-docker-images"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exporting neuron models using NeuronX docker images</span></h2> <p data-svelte-h="svelte-14ws1ng">The NeuronX vLLM image includes not only NeuronX runtime, but also all packages and tools required to export Neuron models.</p> <p data-svelte-h="svelte-imsem4">Use the following command to export a model to Neuron using a vLLM image:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-title">docker</span> run <span class="hljs-comment">--entrypoint optimum-cli \</span>
-v $(pwd)/<span class="hljs-class"><span class="hljs-keyword">data</span>:/<span class="hljs-keyword">data</span> \</span>
<span class="hljs-comment">--privileged \</span>
ghcr.io/huggingface/optimum-neuron-vllm:latest \
<span class="hljs-keyword">export</span> neuron \
<span class="hljs-comment">--model &lt;organization&gt;/&lt;model&gt; \</span>
<span class="hljs-comment">--batch_size 1 \</span>
<span class="hljs-comment">--sequence_length 4096 \</span>
<span class="hljs-comment">--tensor_parallel_size 2 \</span>
/<span class="hljs-class"><span class="hljs-keyword">data</span>/&lt;neuron_model_path&gt;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ezhare">The exported model will be saved under <code>./data/&lt;neuron_model_path&gt;</code>.</p> <h2 class="relative group"><a id="exporting-options-and-docker--sagemaker-environment-variables" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exporting-options-and-docker--sagemaker-environment-variables"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exporting options and Docker / SageMaker environment variables</span></h2> <p data-svelte-h="svelte-11sp8vf">You must make sure that the options used for compilation match the options used for deployment.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attr">SM_ON_MODEL</span> = model
<span class="hljs-attr">SM_ON_BATCH_SIZE</span> = batch_size
<span class="hljs-attr">SM_ON_SEQUENCE_LENGTH</span> = sequence_length
<span class="hljs-attr">SM_ON_TENSOR_PARALLEL_SIZE</span> = tensor_parallel_size<!-- HTML_TAG_END --></pre></div> <p></p>
<script>
{
__sveltekit_89oqon = {
assets: "/docs/optimum.neuron/main/en",
base: "/docs/optimum.neuron/main/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/optimum.neuron/main/en/_app/immutable/entry/start.e7cdb183.js"),
import("/docs/optimum.neuron/main/en/_app/immutable/entry/app.c5810efa.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 12],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
60 kB
·
Xet hash:
d5d9ab8d3c3cc4c7ddb093e630a4ef3359041b1b3c95c9c894a4bb713f0c894b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.