Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Deploy Llama 3.3 70B on AWS Inferentia2&quot;,&quot;local&quot;:&quot;deploy-llama-33-70b-on-aws-inferentia2&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;1. Setup development environment&quot;,&quot;local&quot;:&quot;1-setup-development-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. Retrieve the latest Hugging Face vLLM Neuron DLC&quot;,&quot;local&quot;:&quot;2-retrieve-the-latest-hugging-face-vllm-neuron-dlc&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. Deploy Llama 3.3 70B to Inferentia2&quot;,&quot;local&quot;:&quot;3-deploy-llama-33-70b-to-inferentia2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;4. Clean up&quot;,&quot;local&quot;:&quot;4-clean-up&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |
| <link href="/docs/optimum.neuron/pr_1097/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/start.82e2ff17.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/scheduler.56725da7.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/singletons.2080b4fc.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/paths.90dabf70.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/app.9b0c0103.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/preload-helper.9dba61fb.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/index.18a26576.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/nodes/0.912aab06.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/nodes/19.8ce0677c.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/CopyLLMTxtMenu.fb3856d8.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/globals.7f7f1b26.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.a16844e0.js"> | |
| <link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/CodeBlock.2d00672f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Deploy Llama 3.3 70B on AWS Inferentia2","local":"deploy-llama-33-70b-on-aws-inferentia2","sections":[{"title":"1. Setup development environment","local":"1-setup-development-environment","sections":[],"depth":2},{"title":"2. Retrieve the latest Hugging Face vLLM Neuron DLC","local":"2-retrieve-the-latest-hugging-face-vllm-neuron-dlc","sections":[],"depth":2},{"title":"3. Deploy Llama 3.3 70B to Inferentia2","local":"3-deploy-llama-33-70b-to-inferentia2","sections":[],"depth":2},{"title":"4. Clean up","local":"4-clean-up","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> 
<button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="deploy-llama-33-70b-on-aws-inferentia2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deploy-llama-33-70b-on-aws-inferentia2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deploy Llama 3.3 70B on AWS Inferentia2</span></h1> <p data-svelte-h="svelte-70qgmy">In this tutorial you will learn how to deploy <a href="https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct" 
rel="nofollow">/meta-llama/Llama-3.3-70B-Instruct</a> model on AWS Inferentia2 with Hugging Face Optimum on Amazon SageMaker. We are going to use the Hugging Face vLLM Neuron Container, a purpose-built Inference Container to easily deploy LLMs on AWS Inferentia2 powered by <a href="https://github.com/vllm-project/vllm.git" rel="nofollow">vLLM</a> and <a href="https://huggingface.co/docs/optimum-neuron/index" rel="nofollow">Optimum Neuron</a>.</p> <p data-svelte-h="svelte-df2280">We will cover how to:</p> <ol data-svelte-h="svelte-gvdc6b"><li><a href="#1-setup-development-environment">Setup development environment</a></li> <li><a href="#2-retrieve-the-latest-hugging-face-vllm-neuron-dlc">Retrieve the latest Hugging Face vLLM Neuron DLC</a></li> <li><a href="#3-deploy-llama-33-70b-to-inferentia2">Deploy Llama 3.3 70B to Inferentia2</a></li> <li><a href="#4-clean-up">Clean up</a></li></ol> <p data-svelte-h="svelte-fedw35">Let's get started! 🚀</p> <p data-svelte-h="svelte-1q2zsrn"><a href="https://aws.amazon.com/ec2/instance-types/inf2/" rel="nofollow">AWS Inferentia2 (Inf2)</a> instances are purpose-built EC2 instances for deep learning (DL) inference workloads. 
Here are the different instances of the Inferentia2 family.</p> <table data-svelte-h="svelte-1tmwmqe"><thead><tr><th>instance size</th> <th>accelerators</th> <th>Neuron Cores</th> <th>accelerator memory</th> <th>vCPU</th> <th>CPU Memory</th> <th>on-demand price ($/h)</th></tr></thead> <tbody><tr><td>inf2.xlarge</td> <td>1</td> <td>2</td> <td>32</td> <td>4</td> <td>16</td> <td>0.76</td></tr> <tr><td>inf2.8xlarge</td> <td>1</td> <td>2</td> <td>32</td> <td>32</td> <td>128</td> <td>1.97</td></tr> <tr><td>inf2.24xlarge</td> <td>6</td> <td>12</td> <td>192</td> <td>96</td> <td>384</td> <td>6.49</td></tr> <tr><td>inf2.48xlarge</td> <td>12</td> <td>24</td> <td>384</td> <td>192</td> <td>768</td> <td>12.98</td></tr></tbody></table> <h2 class="relative group"><a id="1-setup-development-environment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-setup-development-environment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. 
Setup development environment</span></h2> <p data-svelte-h="svelte-1e6mm58">For this tutorial, we are going to use a Notebook Instance in Amazon SageMaker with the Python 3 (ipykernel) kernel and the <code>sagemaker</code> Python SDK to deploy Llama 3.3 70B to a SageMaker inference endpoint.</p> <p data-svelte-h="svelte-gxxxnf">Make sure you have the latest version of the SageMaker SDK installed.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install sagemaker --upgrade --quiet<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-k2b9z7">Then, instantiate the sagemaker role and session.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 
mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> boto3 | |
| <span class="hljs-keyword">from</span> sagemaker.core.helper.session_helper <span class="hljs-keyword">import</span> get_execution_role | |
| <span class="hljs-keyword">try</span>: | |
| role = get_execution_role() | |
| <span class="hljs-keyword">except</span> ValueError: | |
| iam = boto3.client(<span class="hljs-string">"iam"</span>) | |
| role = iam.get_role(RoleName=<span class="hljs-string">"sagemaker_execution_role"</span>)[<span class="hljs-string">"Role"</span>][<span class="hljs-string">"Arn"</span>] | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"sagemaker role arn: <span class="hljs-subst">{role}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="2-retrieve-the-latest-hugging-face-vllm-neuron-dlc" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-retrieve-the-latest-hugging-face-vllm-neuron-dlc"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Retrieve the latest Hugging Face vLLM Neuron DLC</span></h2> <p data-svelte-h="svelte-1awnn15">The latest Hugging Face vLLM Neuron DLCs can be used to run inference on AWS Inferentia2. To retrieve it you can use the method <code>image_uris.retrieve</code> of the Sagemaker SDK. However, if you have the Optimum Neuron package installed, you can use the <code>ecr.image_uri</code> function to retrieve the appropriate Hugging Face vLLM Neuron DLC URI based on your desired <code>region</code> and <code>version</code>. Default values can be deduced by your AWS credentials. 
For more details see the <a href="https://huggingface.co/docs/optimum-neuron/containers" rel="nofollow">containers</a> documentation.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install optimum-neuron[neuronx] | |
| <span class="hljs-keyword">from</span> optimum.neuron.utils <span class="hljs-keyword">import</span> ecr | |
| REGION = <span class="hljs-string">"us-east-1"</span> | |
| llm_image = ecr.image_uri(<span class="hljs-string">"vllm"</span>, region=REGION) | |
| <span class="hljs-comment"># print image uri</span> | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"llm image uri: <span class="hljs-subst">{llm_image}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="3-deploy-llama-33-70b-to-inferentia2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-deploy-llama-33-70b-to-inferentia2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. Deploy Llama 3.3 70B to Inferentia2</span></h2> <p data-svelte-h="svelte-w7lp5p">At the time of writing, <a href="https://awsdocs-neuron.readthedocs-hosted.com/en/v2.6.0/general/arch/neuron-features/dynamic-shapes.html#neuron-dynamic-shapes" rel="nofollow">AWS Inferentia2 does not support dynamic shapes for inference</a>, which means that we need to specify our sequence length and batch size ahead of time. | |
| To make it easier for customers to utilize the full power of Inferentia2, we created a <a href="https://huggingface.co/docs/optimum-neuron/guides/cache_system" rel="nofollow">neuron model cache</a>, which contains pre-compiled configurations for the most popular LLMs, including Llama 3.3 70B.</p> <p data-svelte-h="svelte-1kijx5w">This means we don’t need to compile the model ourselves, but we can use the pre-compiled model from the cache. You can find compiled/cached configurations on the <a href="https://huggingface.co/aws-neuron/optimum-neuron-cache/tree/main/inference-cache-config" rel="nofollow">Hugging Face Hub</a>. If your desired configuration is not yet cached, you can compile it yourself using the <a href="https://huggingface.co/docs/optimum-neuron/guides/export_model" rel="nofollow">Optimum CLI</a> or open a request at the <a href="https://huggingface.co/aws-neuron/optimum-neuron-cache/discussions" rel="nofollow">Cache repository</a>.</p> <p data-svelte-h="svelte-prnhqf"><strong>Deploying Llama 3.3 70B to a SageMaker Endpoint</strong></p> <p data-svelte-h="svelte-qkir6d">All we need when deploying the model to Amazon SageMaker is to set the Hugging Face model id and token.</p> <ul data-svelte-h="svelte-1f783f3"><li><code>SM_ON_MODEL</code>: The Hugging Face model ID.</li> <li><code>HF_TOKEN</code>: The Hugging Face API token to access gated models.</li></ul> <p data-svelte-h="svelte-1fl683x">Note: even if your model is not gated, we recommend setting your Hugging Face token to avoid rate limitations when fetching weights or pre-compiled neuron artifacts.</p> <p data-svelte-h="svelte-rcd9mj">Optionally, you can specify some deployment parameters to select a specific cached configuration (otherwise a default one will be selected).</p> <ul data-svelte-h="svelte-sq03vv"><li><code>SM_ON_TENSOR_PARALLEL_SIZE</code>: Number of Neuron Cores used for the compilation.</li> <li><code>SM_ON_BATCH_SIZE</code>: The batch size that was used to compile the model.</li> 
<li><code>SM_ON_SEQUENCE_LENGTH</code>: The sequence length that was used to compile the model.</li></ul> <p data-svelte-h="svelte-1qiwbk5"><strong>Select the right instance type</strong></p> <p data-svelte-h="svelte-1h03olo">Llama 3.3 70B is a large model and requires a lot of memory. We are going to use the <code>inf2.48xlarge</code> instance type, which has 192 vCPUs and 384 GB of accelerator memory. The <code>inf2.48xlarge</code> instance comes with 12 Inferentia2 accelerators that include 24 Neuron Cores. If you want to find the cached configurations for Llama 3.3 70B, you can find them <a href="https://huggingface.co/aws-neuron/optimum-neuron-cache/blob/main/inference-cache-config/llama3-70b.json" rel="nofollow">here</a>. Here we will let the framework select one of them automatically, so we won’t specify any specific deployment parameter.</p> <p data-svelte-h="svelte-1mh0xpb">Before we can deploy Llama 3.3 70B to Inferentia2, we need to make sure we have the necessary permissions to access the model. 
You can request access to the model <a href="https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct" rel="nofollow">here</a> and create a User access token following this <a href="https://huggingface.co/docs/hub/en/security-tokens" rel="nofollow">guide</a>.</p> <p data-svelte-h="svelte-10wr0tf">After that we can create our endpoint configuration and deploy the model to Amazon SageMaker.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> sagemaker.core.resources <span class="hljs-keyword">import</span> Model, ContainerDefinition | |
| <span class="hljs-comment"># Define Model and Endpoint configuration parameter</span> | |
| environment = { | |
| <span class="hljs-string">"SM_ON_MODEL"</span>: <span class="hljs-string">"meta-llama/Llama-3.3-70B-Instruct"</span>, | |
| <span class="hljs-string">"HF_TOKEN"</span>: <span class="hljs-string">"&lt;REPLACE WITH YOUR TOKEN&gt;"</span>, | |
| } | |
| <span class="hljs-keyword">assert</span> environment[<span class="hljs-string">"HF_TOKEN"</span>] != <span class="hljs-string">"&lt;REPLACE WITH YOUR TOKEN&gt;"</span>, ( | |
| <span class="hljs-string">"Please replace '&lt;REPLACE WITH YOUR TOKEN&gt;' with your Hugging Face Hub API token"</span> | |
| ) | |
| container = ContainerDefinition(image=llm_image, environment=environment) | |
| <span class="hljs-comment"># create Model with the container definition</span> | |
| model = Model.create( | |
| model_name=<span class="hljs-string">"llama-3-3-70b-neuronx-model"</span>, primary_container=container, execution_role_arn=role, region=REGION | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4gejcg">After we have created the <code>Model</code> we need to define a deployment configuration. We will deploy the model with the <code>ml.inf2.48xlarge</code> instance type. vLLM will automatically distribute and shard the model across all Inferentia devices.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> sagemaker.core.resources <span class="hljs-keyword">import</span> EndpointConfig, ProductionVariant | |
| <span class="hljs-comment"># sagemaker config</span> | |
| instance_type = <span class="hljs-string">"ml.inf2.48xlarge"</span> | |
| health_check_timeout = <span class="hljs-number">3600</span> <span class="hljs-comment"># additional time to load the model</span> | |
| volume_size = <span class="hljs-number">512</span> <span class="hljs-comment"># size in GB of the EBS volume</span> | |
| endpoint_config = EndpointConfig.create( | |
| endpoint_config_name=<span class="hljs-string">"llama-3-3-70b-neuronx-endpoint-config"</span>, | |
| production_variants=[ | |
| ProductionVariant( | |
| variant_name=<span class="hljs-string">"AllTraffic"</span>, | |
| model_name=model.model_name, | |
| initial_instance_count=<span class="hljs-number">1</span>, | |
| instance_type=instance_type, | |
| container_startup_health_check_timeout_in_seconds=health_check_timeout, | |
| volume_size_in_gb=volume_size, | |
| inference_ami_version=<span class="hljs-string">"al2-ami-sagemaker-inference-neuron-2"</span>, | |
| ) | |
| ], | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-aha0vi">We can now deploy the <code>Model</code> to an <code>Endpoint</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> sagemaker.core.resources <span class="hljs-keyword">import</span> Endpoint | |
| endpoint = Endpoint.create( | |
| endpoint_name=<span class="hljs-string">"llama-3-3-70b-neuronx-endpoint"</span>, | |
| endpoint_config_name=endpoint_config.endpoint_config_name, | |
| ) | |
| endpoint.wait_for_status(target_status=<span class="hljs-string">"InService"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hs5wbn">SageMaker will now create our endpoint and deploy the model to it. It takes around 30 minutes for deployment.</p> <p data-svelte-h="svelte-pojg70">After our endpoint is deployed, we can run inference on it. We will use the <code>invoke</code> method to run inference on our endpoint.</p> <p data-svelte-h="svelte-7vzs06">The endpoint supports the Messages API, which is fully compatible with the OpenAI Chat Completion API. The Messages API allows us to interact with the model in a conversational way. We can define the role of the message and the content. The role can be <code>system</code>, <code>assistant</code>, or <code>user</code>. The <code>system</code> role is used to provide context to the model and the <code>user</code> role is used to ask questions or provide input to the model.</p> <p data-svelte-h="svelte-iif644">Parameters can be defined as separate attributes of the payload. 
Check out the chat completion <a href="https://platform.openai.com/docs/api-reference/chat/create" rel="nofollow">documentation</a> to find supported parameters.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Prompt to generate</span> | |
| messages = [ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"What is deep learning in one sentence?"</span>}, | |
| ]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pq3qhh">Okay, let's test it.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> json | |
| <span class="hljs-comment"># Generation arguments https://platform.openai.com/docs/api-reference/chat/create</span> | |
| result = endpoint.invoke( | |
| body=json.dumps( | |
| { | |
| <span class="hljs-string">"messages"</span>: messages, | |
| <span class="hljs-string">"max_tokens"</span>: <span class="hljs-number">50</span>, | |
| <span class="hljs-string">"top_k"</span>: <span class="hljs-number">50</span>, | |
| <span class="hljs-string">"top_p"</span>: <span class="hljs-number">0.9</span>, | |
| <span class="hljs-string">"temperature"</span>: <span class="hljs-number">0.7</span>, | |
| } | |
| ), | |
| content_type=<span class="hljs-string">"application/json"</span>, | |
| ) | |
| output = json.loads(result.body.read().decode(<span class="hljs-string">"utf-8"</span>)) | |
| message = output[<span class="hljs-string">"choices"</span>][<span class="hljs-number">0</span>][<span class="hljs-string">"message"</span>] | |
| <span class="hljs-keyword">assert</span> message[<span class="hljs-string">"role"</span>] == <span class="hljs-string">"assistant"</span> | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">"Generated response:"</span>, message[<span class="hljs-string">"content"</span>])<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="4-clean-up" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-clean-up"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. 
Clean up</span></h2> <p data-svelte-h="svelte-100mxno">To clean up, we can delete the model and endpoint.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.delete() | |
| endpoint_config.delete() | |
| endpoint.delete()<!-- HTML_TAG_END --></pre></div> <p></p> | |
| <script> | |
| { /* Bare block: keeps the const bindings declared below scoped to this block. */ | |
| __sveltekit_19pvy0q = { /* Assigned with no declaration keyword in this block; presumably read by the imported runtime modules - TODO confirm. */ | |
| assets: "/docs/optimum.neuron/pr_1097/en", | |
| base: "/docs/optimum.neuron/pr_1097/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; /* Parent element of the script tag currently executing; handed to kit.start below. */ | |
| const data = [null,null]; /* Two null entries; presumably one per entry in node_ids - TODO confirm. */ | |
| Promise.all([ /* Load both entry modules with dynamic import() and wait for both to resolve. */ | |
| import("/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/start.82e2ff17.js"), | |
| import("/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/app.9b0c0103.js") | |
| ]).then(([kit, app]) => { /* kit is the start module, app is the app module (same order as the array above). */ | |
| kit.start(app, element, { /* Called only after both imports have resolved. */ | |
| node_ids: [0, 19], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 37.5 kB
- Xet hash:
- 2b17fcfc65273db52f43c919497e9f05ed5b47430e15215c2ddc4dc928164999
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.