Buckets:

rtrm's picture
download
raw
45.7 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Deploy Mixtral 8x7B on AWS Inferentia2&quot;,&quot;local&quot;:&quot;deploy-mixtral-8x7b-on-aws-inferentia2&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;1. Setup development environment&quot;,&quot;local&quot;:&quot;1-setup-development-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. Retrieve the latest Hugging Face vLLM Neuron DLC&quot;,&quot;local&quot;:&quot;2-retrieve-the-latest-hugging-face-vllm-neuron-dlc&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. Deploy Mixtral 8x7B to Inferentia2&quot;,&quot;local&quot;:&quot;3-deploy-mixtral-8x7b-to-inferentia2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;4. Clean up&quot;,&quot;local&quot;:&quot;4-clean-up&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/optimum.neuron/pr_1097/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/start.82e2ff17.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/scheduler.56725da7.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/singletons.2080b4fc.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/paths.90dabf70.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/app.9b0c0103.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/preload-helper.9dba61fb.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/index.18a26576.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/nodes/0.912aab06.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/nodes/20.b88c61e9.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/CopyLLMTxtMenu.fb3856d8.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/globals.7f7f1b26.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.a16844e0.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/CodeBlock.2d00672f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Deploy Mixtral 8x7B on AWS Inferentia2&quot;,&quot;local&quot;:&quot;deploy-mixtral-8x7b-on-aws-inferentia2&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;1. Setup development environment&quot;,&quot;local&quot;:&quot;1-setup-development-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. Retrieve the latest Hugging Face vLLM Neuron DLC&quot;,&quot;local&quot;:&quot;2-retrieve-the-latest-hugging-face-vllm-neuron-dlc&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. Deploy Mixtral 8x7B to Inferentia2&quot;,&quot;local&quot;:&quot;3-deploy-mixtral-8x7b-to-inferentia2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;4. Clean up&quot;,&quot;local&quot;:&quot;4-clean-up&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="deploy-mixtral-8x7b-on-aws-inferentia2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deploy-mixtral-8x7b-on-aws-inferentia2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deploy Mixtral 8x7B on AWS Inferentia2</span></h1> <p data-svelte-h="svelte-17okc9y">Mixtral 8x7B is an open-source LLM from Mistral AI. It is a Sparse Mixture of Experts and has a similar architecture to Mistral 7B, but comes with a twist: it’s actually 8 “expert” models in one. If you want to learn more about MoEs check out <a href="https://huggingface.co/blog/moe" rel="nofollow">Mixture of Experts Explained</a>.</p> <p data-svelte-h="svelte-pam5v6">In this tutorial you will learn how to deploy <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1" rel="nofollow">mistralai/Mixtral-8x7B-Instruct-v0.1</a> model on AWS Inferentia2 with Hugging Face Optimum Neuron on Amazon SageMaker. We are going to use the Hugging Face vLLM Neuron Container, a purpose-built Inference Container to easily deploy LLMs on AWS Inferentia2 powered by <a href="https://github.com/vllm-project/vllm.git" rel="nofollow">vLLM</a> and <a href="https://huggingface.co/docs/optimum-neuron/index" rel="nofollow">Optimum Neuron</a>.</p> <p data-svelte-h="svelte-df2280">We will cover how to:</p> <ol data-svelte-h="svelte-iuabs3"><li><a href="#1-setup-development-environment">Setup a development environment</a></li> <li><a href="#2-retrieve-the-latest-hugging-face-vllm-neuron-dlc">Retrieve the latest Hugging Face vLLM Neuron DLC</a></li> <li><a href="#3-deploy-Mixtral-8x7B-to-inferentia2">Deploy Mixtral 8x7B to Inferentia2</a></li> <li><a href="#4-clean-up">Clean up</a></li></ol> <p data-svelte-h="svelte-fedw35">Lets get started! 🚀</p> <p data-svelte-h="svelte-1q2zsrn"><a href="https://aws.amazon.com/ec2/instance-types/inf2/" rel="nofollow">AWS inferentia (Inf2)</a> are purpose-built EC2 for deep learning (DL) inference workloads. Here are the different instances of the Inferentia2 family.</p> <table data-svelte-h="svelte-1tmwmqe"><thead><tr><th>instance size</th> <th>accelerators</th> <th>Neuron Cores</th> <th>accelerator memory</th> <th>vCPU</th> <th>CPU Memory</th> <th>on-demand price ($/h)</th></tr></thead> <tbody><tr><td>inf2.xlarge</td> <td>1</td> <td>2</td> <td>32</td> <td>4</td> <td>16</td> <td>0.76</td></tr> <tr><td>inf2.8xlarge</td> <td>1</td> <td>2</td> <td>32</td> <td>32</td> <td>128</td> <td>1.97</td></tr> <tr><td>inf2.24xlarge</td> <td>6</td> <td>12</td> <td>192</td> <td>96</td> <td>384</td> <td>6.49</td></tr> <tr><td>inf2.48xlarge</td> <td>12</td> <td>24</td> <td>384</td> <td>192</td> <td>768</td> <td>12.98</td></tr></tbody></table> <h2 class="relative group"><a id="1-setup-development-environment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-setup-development-environment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. Setup development environment</span></h2> <p data-svelte-h="svelte-3gjk7m">For this tutorial, we are going to use a Notebook Instance in Amazon SageMaker with the Python 3 (ipykernel) and the <code>sagemaker</code> python SDK to deploy Mixtral 8x7B to a SageMaker inference endpoint.</p> <p data-svelte-h="svelte-gxxxnf">Make sur you have the latest version of the SageMaker SDK installed.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install sagemaker --upgrade --quiet<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-k2b9z7">Then, instantiate the sagemaker role and session.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> boto3
<span class="hljs-keyword">from</span> sagemaker.core.helper.session_helper <span class="hljs-keyword">import</span> get_execution_role
<span class="hljs-keyword">try</span>:
role = get_execution_role()
<span class="hljs-keyword">except</span> ValueError:
iam = boto3.client(<span class="hljs-string">&quot;iam&quot;</span>)
role = iam.get_role(RoleName=<span class="hljs-string">&quot;sagemaker_execution_role&quot;</span>)[<span class="hljs-string">&quot;Role&quot;</span>][<span class="hljs-string">&quot;Arn&quot;</span>]
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;sagemaker role arn: <span class="hljs-subst">{role}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="2-retrieve-the-latest-hugging-face-vllm-neuron-dlc" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-retrieve-the-latest-hugging-face-vllm-neuron-dlc"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Retrieve the latest Hugging Face vLLM Neuron DLC</span></h2> <p data-svelte-h="svelte-1awnn15">The latest Hugging Face vLLM Neuron DLCs can be used to run inference on AWS Inferentia2. To retrieve it you can use the method <code>image_uris.retrieve</code> of the Sagemaker SDK. However, if you have the Optimum Neuron package installed, you can use the <code>ecr.image_uri</code> function to retrieve the appropriate Hugging Face vLLM Neuron DLC URI based on your desired <code>region</code> and <code>version</code>. Default values can be deduced by your AWS credentials. For more details see the <a href="https://huggingface.co/docs/optimum-neuron/containers" rel="nofollow">containers</a> documentation.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install optimum-neuron[neuronx]
<span class="hljs-keyword">from</span> optimum.neuron.utils <span class="hljs-keyword">import</span> ecr
REGION = <span class="hljs-string">&quot;us-east-1&quot;</span>
llm_image = ecr.image_uri(<span class="hljs-string">&quot;vllm&quot;</span>, region=REGION)
<span class="hljs-comment"># print image uri</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;llm image uri: <span class="hljs-subst">{llm_image}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="3-deploy-mixtral-8x7b-to-inferentia2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-deploy-mixtral-8x7b-to-inferentia2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. Deploy Mixtral 8x7B to Inferentia2</span></h2> <p data-svelte-h="svelte-4yozv7">At the time of writing, <a href="https://awsdocs-neuron.readthedocs-hosted.com/en/v2.6.0/general/arch/neuron-features/dynamic-shapes.html#neuron-dynamic-shapes" rel="nofollow">AWS Inferentia2 does not support dynamic shapes for inference</a>, which means that we need to specify our sequence length and batch size ahead of time.
To make it easier for customers to utilize the full power of Inferentia2, we created a <a href="https://huggingface.co/docs/optimum-neuron/guides/cache_system" rel="nofollow">neuron model cache</a>, which contains pre-compiled configurations for the most popular LLMs, including Mixtral 8x7B.</p> <p data-svelte-h="svelte-6rv5nu">This means we don’t need to compile the model ourselves, but we can use the pre-compiled model from the cache. You can find compiled/cached configurations on the
<a href="https://huggingface.co/aws-neuron/optimum-neuron-cache/tree/main/inference-cache-config" rel="nofollow">Hugging Face Hub</a>. If your desired configuration is not yet cached, you can compile it yourself using the <a href="https://huggingface.co/docs/optimum-neuron/guides/export_model" rel="nofollow">Optimum CLI</a> or open a request at the <a href="https://huggingface.co/aws-neuron/optimum-neuron-cache/discussions" rel="nofollow">Cache repository</a>.</p> <p data-svelte-h="svelte-1quxa5j">Let’s check the different configurations that are in the cache. For that you first need to log in the Hugging Face Hub, using a <a href="https://huggingface.co/docs/hub/en/security-tokens" rel="nofollow">User Access Token</a> with read access.</p> <p data-svelte-h="svelte-bda3ge">Make sure you have the necessary permissions to access the model. You can request access to the model <a href="https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1" rel="nofollow">here</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gvpj30">Then, we need to install the latest version of Optimum Neuron.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install optimum-neuron --upgrade --quiet<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9yxrws">Finally, we can query the cache and retrieve the existing set of configurations for which we maintained a compiled version of the model.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!optimum-cli neuron cache lookup <span class="hljs-string">&quot;mistralai/Mixtral-8x7B-Instruct-v0.1&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mgox28">You should retrieve two entries in the cache:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">***</span> <span class="hljs-number">2</span> <span class="hljs-string">entrie(s)</span> <span class="hljs-string">found</span> <span class="hljs-string">in</span> <span class="hljs-string">cache</span> <span class="hljs-string">for</span> <span class="hljs-string">mistralai/Mixtral-8x7B-Instruct-v0.1</span> <span class="hljs-string">for</span> <span class="hljs-string">inference.***</span>
<span class="hljs-attr">auto_cast_type:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">batch_size:</span> <span class="hljs-number">1</span>
<span class="hljs-attr">checkpoint_id:</span> <span class="hljs-string">mistralai/Mixtral-8x7B-Instruct-v0.1</span>
<span class="hljs-attr">checkpoint_revision:</span> <span class="hljs-string">41bd4c9e7e4fb318ca40e721131d4933966c2cc1</span>
<span class="hljs-attr">compiler_type:</span> <span class="hljs-string">neuronx-cc</span>
<span class="hljs-attr">compiler_version:</span> <span class="hljs-number">2.16</span><span class="hljs-number">.372</span><span class="hljs-number">.0</span><span class="hljs-string">+4a9b2326</span>
<span class="hljs-attr">num_cores:</span> <span class="hljs-number">24</span>
<span class="hljs-attr">sequence_length:</span> <span class="hljs-number">4096</span>
<span class="hljs-attr">task:</span> <span class="hljs-string">text-generation</span>
<span class="hljs-attr">auto_cast_type:</span> <span class="hljs-string">bf16</span>
<span class="hljs-attr">batch_size:</span> <span class="hljs-number">4</span>
<span class="hljs-attr">checkpoint_id:</span> <span class="hljs-string">mistralai/Mixtral-8x7B-Instruct-v0.1</span>
<span class="hljs-attr">checkpoint_revision:</span> <span class="hljs-string">41bd4c9e7e4fb318ca40e721131d4933966c2cc1</span>
<span class="hljs-attr">compiler_type:</span> <span class="hljs-string">neuronx-cc</span>
<span class="hljs-attr">compiler_version:</span> <span class="hljs-number">2.16</span><span class="hljs-number">.372</span><span class="hljs-number">.0</span><span class="hljs-string">+4a9b2326</span>
<span class="hljs-attr">num_cores:</span> <span class="hljs-number">24</span>
<span class="hljs-attr">sequence_length:</span> <span class="hljs-number">4096</span>
<span class="hljs-attr">task:</span> <span class="hljs-string">text-generation</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ay2eoj"><strong>Deploying Mixtral 8x7B to a SageMaker Endpoint</strong></p> <p data-svelte-h="svelte-qkir6d">All we need when deploying the model to Amazon SageMaker, is to set the Hugging Face model id and token.</p> <ul data-svelte-h="svelte-1f783f3"><li><code>SM_ON_MODEL</code>: The Hugging Face model ID.</li> <li><code>HF_TOKEN</code>: The Hugging Face API token to access gated models.</li></ul> <p data-svelte-h="svelte-1uoxmdt">Note: even if your model is not gated, we recommend setting your Hugging Face token to avoid rate limitations when fetching weights or pre-compiled neuron artifacts.</p> <p data-svelte-h="svelte-rcd9mj">Optionally, you can specify some deployment parameters to select a specific cached configuration (otherwise a default one will be selected).</p> <ul data-svelte-h="svelte-sq03vv"><li><code>SM_ON_TENSOR_PARALLEL_SIZE</code>: Number of Neuron Cores used for the compilation.</li> <li><code>SM_ON_BATCH_SIZE</code>: The batch size that was used to compile the model.</li> <li><code>SM_ON_SEQUENCE_LENGTH</code>: The sequence length that was used to compile the model.</li></ul> <p data-svelte-h="svelte-1qiwbk5"><strong>Select the right instance type</strong></p> <p data-svelte-h="svelte-jofx65">Mixtral 8x7B is a large model and requires a lot of memory. We are going to use the <code>inf2.48xlarge</code> instance type, which has 192 vCPUs and 384 GB of accelerator memory. The <code>inf2.48xlarge</code> instance comes with 12 Inferentia2 accelerators that include 24 Neuron Cores. In our case we will use a batch size of 4 and a sequence length of 4096.</p> <p data-svelte-h="svelte-ttsgj0">After that we can create our endpoint configuration and deploy the model to Amazon SageMaker. It will be fully compatible with the OpenAI Chat Completion API.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> sagemaker.core.resources <span class="hljs-keyword">import</span> Model, ContainerDefinition
<span class="hljs-comment"># Define Model and Endpoint configuration parameter</span>
environment = {
<span class="hljs-string">&quot;SM_ON_MODEL&quot;</span>: <span class="hljs-string">&quot;mistralai/Mixtral-8x7B-Instruct-v0.1&quot;</span>,
<span class="hljs-string">&quot;SM_ON_BATCH_SIZE&quot;</span>: <span class="hljs-string">&quot;1&quot;</span>, <span class="hljs-comment"># Select the configuration with batch size 1</span>
<span class="hljs-string">&quot;HF_TOKEN&quot;</span>: <span class="hljs-string">&quot;&lt;REPLACE WITH YOUR TOKEN&gt;&quot;</span>,
}
<span class="hljs-keyword">assert</span> environment[<span class="hljs-string">&quot;HF_TOKEN&quot;</span>] != <span class="hljs-string">&quot;&lt;REPLACE WITH YOUR TOKEN&gt;&quot;</span>, (
<span class="hljs-string">&quot;Please replace &#x27;&lt;REPLACE WITH YOUR TOKEN&gt;&#x27; with your Hugging Face Hub API token&quot;</span>
)
container = ContainerDefinition(image=llm_image, environment=environment)
<span class="hljs-comment"># create Model with the container definition</span>
model = Model.create(
model_name=<span class="hljs-string">&quot;mixtral-8x7b-neuronx-model&quot;</span>, primary_container=container, execution_role_arn=role, region=REGION
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4gejcg">After we have created the <code>Model</code> we need to define a deployment configuration. We will deploy the model with the <code>ml.inf2.48xlarge</code> instance type. vLLM will automatically distribute and shard the model across all Inferentia devices.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> sagemaker.core.resources <span class="hljs-keyword">import</span> EndpointConfig, ProductionVariant
<span class="hljs-comment"># sagemaker config</span>
instance_type = <span class="hljs-string">&quot;ml.inf2.48xlarge&quot;</span>
health_check_timeout = <span class="hljs-number">3600</span> <span class="hljs-comment"># additional time to load the model</span>
volume_size = <span class="hljs-number">512</span> <span class="hljs-comment"># size in GB of the EBS volume</span>
<span class="hljs-comment"># create EndpointConfig</span>
endpoint_config = EndpointConfig(
endpoint_config_name=<span class="hljs-string">&quot;mixtral-8x7b-endpoint-config&quot;</span>,
production_variants=[
ProductionVariant(
model_name=model.model_name,
instance_type=instance_type,
initial_instance_count=<span class="hljs-number">1</span>,
container_startup_health_check_timeout=health_check_timeout,
volume_size=volume_size,
environment=config,
image_uri=llm_image,
)
],
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-aha0vi">We can now deploy the <code>Model</code> to an <code>Endpoint</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> sagemaker.core.resources <span class="hljs-keyword">import</span> Endpoint
endpoint = Endpoint.create(
endpoint_name=<span class="hljs-string">&quot;mixtral-8x7b-neuronx-endpoint&quot;</span>,
endpoint_config_name=endpoint_config.endpoint_config_name,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-qvkvmm">SageMaker will now create our endpoint and deploy the model to it. It takes around 15 minutes for deployment.</p> <p data-svelte-h="svelte-1nsplg8">After our endpoint is deployed we can run inference on it. We will use the <code>invoke</code> method from the endpoint to run inference on our endpoint.</p> <p data-svelte-h="svelte-7vzs06">The endpoint supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. The Messages API allows us to interact with the model in a conversational way. We can define the role of the message and the content. The role can be either <code>system</code>,<code>assistant</code> or <code>user</code>. The <code>system</code> role is used to provide context to the model and the <code>user</code> role is used to ask questions or provide input to the model.</p> <p data-svelte-h="svelte-iif644">Parameters can be defined as separate attributes of the payload. Check out the chat completion <a href="https://platform.openai.com/docs/api-reference/chat/create" rel="nofollow">documentation</a> to find supported parameters.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Prompt to generate</span>
messages = [
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;What is deep learning in one sentence?&quot;</span>},
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pq3qhh">Okay lets test it.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> json
<span class="hljs-comment"># Generation arguments https://platform.openai.com/docs/api-reference/chat/create</span>
result = endpoint.invoke(
body=json.dumps(
{
<span class="hljs-string">&quot;messages&quot;</span>: messages,
<span class="hljs-string">&quot;max_tokens&quot;</span>: <span class="hljs-number">50</span>,
<span class="hljs-string">&quot;top_k&quot;</span>: <span class="hljs-number">50</span>,
<span class="hljs-string">&quot;top_p&quot;</span>: <span class="hljs-number">0.9</span>,
<span class="hljs-string">&quot;temperature&quot;</span>: <span class="hljs-number">0.7</span>,
}
),
content_type=<span class="hljs-string">&quot;application/json&quot;</span>,
)
output = json.loads(result.body.read().decode(<span class="hljs-string">&quot;utf-8&quot;</span>))
message = output[<span class="hljs-string">&quot;choices&quot;</span>][<span class="hljs-number">0</span>][<span class="hljs-string">&quot;message&quot;</span>]
<span class="hljs-keyword">assert</span> message[<span class="hljs-string">&quot;role&quot;</span>] == <span class="hljs-string">&quot;assistant&quot;</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Generated response:&quot;</span>, message[<span class="hljs-string">&quot;content&quot;</span>])<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="4-clean-up" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-clean-up"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. Clean up</span></h2> <p data-svelte-h="svelte-100mxno">To clean up, we can delete the model and endpoint.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.delete()
endpoint_config.delete()
endpoint.delete()<!-- HTML_TAG_END --></pre></div> <p></p>
<script>
{
__sveltekit_19pvy0q = {
assets: "/docs/optimum.neuron/pr_1097/en",
base: "/docs/optimum.neuron/pr_1097/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/start.82e2ff17.js"),
import("/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/app.9b0c0103.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 20],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
45.7 kB
·
Xet hash:
1f55481c0cc56574f5ed796d69ce1e26366013867ad9ae7dcec89014af1775f1

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.