Buckets:

rtrm's picture
download
raw
52.4 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Create your own chatbot with llama-2-13B on AWS Inferentia&quot;,&quot;local&quot;:&quot;create-your-own-chatbot-with-llama-2-13b-on-aws-inferentia&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Prerequisite: Setup AWS environment&quot;,&quot;local&quot;:&quot;prerequisite-setup-aws-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;1. Export the Llama 2 model to Neuron&quot;,&quot;local&quot;:&quot;1-export-the-llama-2-model-to-neuron&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;A few more words about export parameters.&quot;,&quot;local&quot;:&quot;a-few-more-words-about-export-parameters&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. Generate text using Llama 2 on AWS Inferentia2&quot;,&quot;local&quot;:&quot;2-generate-text-using-llama-2-on-aws-inferentia2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. Create a chat application using llama on AWS Inferentia2&quot;,&quot;local&quot;:&quot;3-create-a-chat-application-using-llama-on-aws-inferentia2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/optimum.neuron/pr_1097/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/start.82e2ff17.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/scheduler.56725da7.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/singletons.2080b4fc.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/paths.90dabf70.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/app.9b0c0103.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/preload-helper.9dba61fb.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/index.18a26576.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/nodes/0.912aab06.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/nodes/21.ecd4ebcd.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/CopyLLMTxtMenu.fb3856d8.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/globals.7f7f1b26.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.a16844e0.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/CodeBlock.2d00672f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Create your own chatbot with llama-2-13B on AWS Inferentia&quot;,&quot;local&quot;:&quot;create-your-own-chatbot-with-llama-2-13b-on-aws-inferentia&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Prerequisite: Setup AWS environment&quot;,&quot;local&quot;:&quot;prerequisite-setup-aws-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;1. Export the Llama 2 model to Neuron&quot;,&quot;local&quot;:&quot;1-export-the-llama-2-model-to-neuron&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;A few more words about export parameters.&quot;,&quot;local&quot;:&quot;a-few-more-words-about-export-parameters&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. Generate text using Llama 2 on AWS Inferentia2&quot;,&quot;local&quot;:&quot;2-generate-text-using-llama-2-on-aws-inferentia2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. 
Create a chat application using llama on AWS Inferentia2&quot;,&quot;local&quot;:&quot;3-create-a-chat-application-using-llama-on-aws-inferentia2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" 
height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="create-your-own-chatbot-with-llama-2-13b-on-aws-inferentia" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#create-your-own-chatbot-with-llama-2-13b-on-aws-inferentia"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Create your own chatbot with llama-2-13B on AWS Inferentia</span></h1> <p data-svelte-h="svelte-1skq0wn">This guide will detail how to export, deploy and run a <strong>Llama-2 13B</strong> chat model on AWS Inferentia.</p> <p data-svelte-h="svelte-1hahfn0">You will learn how to:</p> <ul data-svelte-h="svelte-1ph7922"><li>set up your AWS instance,</li> <li>export the Llama-2 model to the Neuron format,</li> <li>push the exported model to the Hugging Face Hub,</li> <li>deploy the model and use it in a chat application.</li></ul> <p data-svelte-h="svelte-1jogm5f">Note: This tutorial was created on an inf2.48xlarge AWS EC2 instance.</p> <h2 class="relative group"><a id="prerequisite-setup-aws-environment" class="header-link 
block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#prerequisite-setup-aws-environment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Prerequisite: Setup AWS environment</span></h2> <p data-svelte-h="svelte-1h04ij0"><em>You can skip this section if you are already running this notebook on your instance.</em></p> <p data-svelte-h="svelte-gte1ah">In this example, we will use the <em>inf2.48xlarge</em> instance with 12 Neuron devices, corresponding to 24 Neuron Cores and the <a href="https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2" rel="nofollow">Hugging Face Neuron Deep Learning AMI</a>.</p> <p data-svelte-h="svelte-10g2ajd">This guide doesn’t cover how to create the instance in detail. You can refer to the <a href="https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/EC2_GetStarted.html" rel="nofollow">official documentation</a>. At step 4. you will select the
<a href="https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2" rel="nofollow">Hugging Face Neuron Deep Learning AMI</a> and at step 5. you will select an <em>inf2</em> instance type.</p> <p data-svelte-h="svelte-fcgwlz">Once the instance is up and running, you can ssh into it. But instead of developing inside a terminal you need to launch a Jupyter server to run this notebook.</p> <p data-svelte-h="svelte-1t7xa18">For this, you need first to add a port for forwarding in the ssh command, which will tunnel our localhost traffic to the AWS instance.</p> <p data-svelte-h="svelte-h9klj4">From a local terminal, type the following commands:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->HOSTNAME=&quot;&quot; # IP address, e.g. ec2-3-80-....
KEY_NAME=&quot;&quot; # local path to key without the .pem extension, e.g. ssh/trn
ssh -L 8080:localhost:8080 -i ${KEY_NAME}.pem ubuntu@$HOSTNAME<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sov39d">On the instance, you can now start the jupyter server.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">python</span> -m notebook --<span class="hljs-literal">allow</span>-root --port=<span class="hljs-number">8080</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-r6ohpb">You should see a familiar jupyter output with a URL.</p> <p data-svelte-h="svelte-x30pyw">You can click on it, and a jupyter environment will open in your local browser.</p> <p data-svelte-h="svelte-1w15e9h">You can then browse to this notebook (<code>notebooks/text-generation/llama2-13-chatbot</code>) to continue with the guide.</p> <div class="code-block relative "><div class="absolute 
top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Special widgets are required for a nicer display</span>
!{sys.executable} -m pip install ipywidgets<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="1-export-the-llama-2-model-to-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-export-the-llama-2-model-to-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. Export the Llama 2 model to Neuron</span></h2> <p data-svelte-h="svelte-1hepwaf">For this guide, we will use the non-gated <a href="https://huggingface.co/NousResearch/Llama-2-13b-chat-hf" rel="nofollow">NousResearch/Llama-2-13b-chat-hf</a> model, which is functionally equivalent to the original <a href="https://huggingface.co/meta-llama/Llama-2-13b-chat-hf" rel="nofollow">meta-llama/Llama-2-13b-chat-hf</a>.</p> <p data-svelte-h="svelte-1s9dc42">This model is part of the <strong>Llama 2</strong> family of models, and has been tuned to recognize chat interactions
between a <em>user</em> and an <em>assistant</em> (more on that later).</p> <p data-svelte-h="svelte-mznum8">As explained in the <a href="https://huggingface.co/docs/optimum-neuron/guides/export_model#why-compile-to-neuron-model" rel="nofollow">optimum-neuron documentation</a>
, models need to be compiled and exported to a serialized format before running them on Neuron devices.</p> <p data-svelte-h="svelte-1infapy">Fortunately, 🤗 <strong>optimum-neuron</strong> offers a <a href="https://huggingface.co/docs/optimum-neuron/guides/models#configuring-the-export-of-a-generative-model" rel="nofollow">very simple API</a>
to export standard 🤗 <a href="https://huggingface.co/docs/transformers/index" rel="nofollow">transformers models</a> to the Neuron format.</p> <p data-svelte-h="svelte-vz83yd">When exporting the model, we will specify two sets of parameters:</p> <ul data-svelte-h="svelte-1z4h5r"><li>using <em>compiler_args</em>, we specify on how many cores we want the model to be deployed (each Neuron device has two cores), and with which precision (here <em>float16</em>),</li> <li>using <em>input_shapes</em>, we set the static input and output dimensions of the model. All model compilers require static shapes, and Neuron is no exception. Note that the
<em>sequence_length</em> not only constrains the length of the input context, but also the length of the Key/Value cache, and thus, the output length.</li></ul> <p data-svelte-h="svelte-i8skpg">Depending on your choice of parameters and inferentia host, this may take from a few minutes to more than an hour.</p> <p data-svelte-h="svelte-121tgcd">For your convenience, we host a pre-compiled version of that model on the Hugging Face hub, so you can skip the export and start using the model immediately in section 2.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronModelForCausalLM
compiler_args = {<span class="hljs-string">&quot;num_cores&quot;</span>: <span class="hljs-number">24</span>, <span class="hljs-string">&quot;auto_cast_type&quot;</span>: <span class="hljs-string">&quot;fp16&quot;</span>}
input_shapes = {<span class="hljs-string">&quot;batch_size&quot;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&quot;sequence_length&quot;</span>: <span class="hljs-number">2048</span>}
model = NeuronModelForCausalLM.from_pretrained(
<span class="hljs-string">&quot;NousResearch/Llama-2-13b-chat-hf&quot;</span>, export=<span class="hljs-literal">True</span>, **compiler_args, **input_shapes
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12a4oka">This probably took a while.</p> <p data-svelte-h="svelte-wxy07q">Fortunately, you will need to do this only once because you can save your model and reload it later.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->model.save_pretrained(<span class="hljs-string">&quot;llama-2-13b-chat-neuron&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16smhm8">Even better, you can push it to the <a href="https://huggingface.co/models" rel="nofollow">Hugging Face hub</a>.</p> <p data-svelte-h="svelte-1jkprhy">For that, you need to be logged in to a <a href="https://huggingface.co/join" rel="nofollow">HuggingFace account</a>.</p> <p data-svelte-h="svelte-1ifk3wl">If you are not connected already on your instance, you will now be prompted for an 
access token.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login(new_session=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8c3of4">By default, the model will be uploaded to your account (organization equal to your user name).</p> <p data-svelte-h="svelte-dzb13l">Feel free to edit the cell below if you want to upload the model to a specific <a href="https://huggingface.co/docs/hub/organizations" rel="nofollow">Hugging Face organization</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> whoami
org = whoami()[<span class="hljs-string">&quot;name&quot;</span>]
repo_id = <span class="hljs-string">f&quot;<span class="hljs-subst">{org}</span>/llama-2-13b-chat-neuron&quot;</span>
model.push_to_hub(<span class="hljs-string">&quot;llama-2-13b-chat-neuron&quot;</span>, repository_id=repo_id)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="a-few-more-words-about-export-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#a-few-more-words-about-export-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>A few more words about export parameters.</span></h3> <p data-svelte-h="svelte-mi975y">The minimum memory required to load a model can be computed with:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> memory = bytes per <span class="hljs-keyword">parameter</span> * <span class="hljs-keyword">number</span> of parameters<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zmj8oy">The <strong>Llama 2 13B</strong> model uses <em>float16</em> weights (stored on 2 bytes) and has 13 billion parameters, which means it requires at least 2 * 13B or ~26GB of memory to store its weights.</p> <p data-svelte-h="svelte-1hs4joy">Each NeuronCore has 16GB of memory which means that a 26GB model cannot fit on a single NeuronCore.</p> <p data-svelte-h="svelte-13xxwf3">In reality, the total space required is much greater than just the number of parameters due to caching attention layer projections (KV caching).
This caching mechanism grows memory allocations linearly with sequence length and batch size.</p> <p data-svelte-h="svelte-13q3i6n">Here we set the <em>batch_size</em> to 1, meaning that we can only process one input prompt at a time. We set the <em>sequence_length</em> to 2048, which corresponds to half the model's maximum capacity (4096).</p> <p data-svelte-h="svelte-mdqsxr">The formula to evaluate the size of the KV cache is more involved as it also depends on parameters related to the model architecture, such as the width of the embeddings and the number of decoder blocks.</p> <p data-svelte-h="svelte-12eo3v5">The bottom line is that, to get very large language models to fit, tensor parallelism is used to split weights, data, and compute across multiple NeuronCores, keeping in mind that the memory on each core cannot exceed 16GB.</p> <p data-svelte-h="svelte-qqihmw">Note that increasing the number of cores beyond the minimum requirement almost always results in a faster model.
Increasing the tensor parallelism degree improves memory bandwidth which improves model performance.</p> <p data-svelte-h="svelte-13dz4vh">To optimize performance it’s recommended to use all cores available on the instance.</p> <p data-svelte-h="svelte-exk3d9">In this guide we use all the 24 cores of the <em>inf2.48xlarge</em>, but this should be changed to 12 if you are
using an <em>inf2.24xlarge</em> instance.</p> <h2 class="relative group"><a id="2-generate-text-using-llama-2-on-aws-inferentia2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-generate-text-using-llama-2-on-aws-inferentia2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. 
Generate text using Llama 2 on AWS Inferentia2</span></h2> <p data-svelte-h="svelte-a6bs6o">Once your model has been exported, you can generate text using the transformers library, as it has been described in <a href="https://huggingface.co/blog/how-to-generate" rel="nofollow">detail in this post</a>.</p> <p data-svelte-h="svelte-zz3frh">If as suggested you skipped the first section, don’t worry: we will use a precompiled model already present on the hub instead.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronModelForCausalLM
<span class="hljs-keyword">try</span>:
model
<span class="hljs-keyword">except</span> NameError:
<span class="hljs-comment"># Edit this to use another base model</span>
model = NeuronModelForCausalLM.from_pretrained(<span class="hljs-string">&quot;aws-neuron/Llama-2-13b-chat-hf-neuron-latency&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ps6mip">We will need a <em>Llama 2</em> tokenizer to convert the prompt strings to text tokens.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;NousResearch/Llama-2-13b-chat-hf&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y7mxpx">The following generation strategies are supported:</p> <ul data-svelte-h="svelte-a4n5id"><li>greedy search,</li> <li>multinomial sampling with top-k and top-p (with temperature).</li></ul> <p data-svelte-h="svelte-k217aq">Most logits pre-processing/filters (such as repetition penalty) are supported.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->inputs = tokenizer(<span class="hljs-string">&quot;What is deep-learning ?&quot;</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
outputs = model.generate(**inputs, max_new_tokens=<span class="hljs-number">128</span>, do_sample=<span class="hljs-literal">True</span>, temperature=<span class="hljs-number">0.9</span>, top_k=<span class="hljs-number">50</span>, top_p=<span class="hljs-number">0.9</span>)
tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="3-create-a-chat-application-using-llama-on-aws-inferentia2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-create-a-chat-application-using-llama-on-aws-inferentia2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. 
Create a chat application using llama on AWS Inferentia2</span></h2> <p data-svelte-h="svelte-1ezk4g0">We specifically selected a <strong>Llama 2</strong> chat variant to illustrate the excellent behaviour of the exported model when the length of the encoding context grows.</p> <p data-svelte-h="svelte-1e1yopm">The model expects the prompts to be formatted following a specific template corresponding to the interactions between a <em>user</em> role and an <em>assistant</em> role.</p> <p data-svelte-h="svelte-1xcz5ok">Each chat model has its own convention for encoding such contents, and we will not go into too much detail in this guide, because we will directly use the <a href="https://huggingface.co/blog/chat-templates" rel="nofollow">Hugging Face chat templates</a> corresponding to our model.</p> <p data-svelte-h="svelte-1v5xgsn">The utility function below converts a list of exchanges between the user and the model into a well-formatted chat prompt.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 
border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">format_chat_prompt</span>(<span class="hljs-params">message, history, max_tokens</span>):
<span class="hljs-string">&quot;&quot;&quot;Convert a history of messages to a chat prompt
Args:
message (str): the new user message.
history (List[List[str]]): the list of [user message, assistant response] pairs.
max_tokens (int): the maximum number of input tokens accepted by the model.
Returns:
a `str` prompt.
&quot;&quot;&quot;</span>
chat = []
<span class="hljs-comment"># Convert all messages in history to chat interactions</span>
<span class="hljs-keyword">for</span> interaction <span class="hljs-keyword">in</span> history:
chat.append({<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: interaction[<span class="hljs-number">0</span>]})
chat.append({<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;assistant&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: interaction[<span class="hljs-number">1</span>]})
<span class="hljs-comment"># Add the new message</span>
chat.append({<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: message})
<span class="hljs-comment"># Generate the prompt, verifying that we don&#x27;t go beyond the maximum number of tokens</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(chat), <span class="hljs-number">2</span>):
<span class="hljs-comment"># Generate candidate prompt with the last n-i entries</span>
prompt = tokenizer.apply_chat_template(chat[i:], tokenize=<span class="hljs-literal">False</span>)
<span class="hljs-comment"># Tokenize to check if we&#x27;re over the limit</span>
tokens = tokenizer(prompt)
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(tokens.input_ids) &lt;= max_tokens:
<span class="hljs-comment"># We&#x27;re good, stop here</span>
<span class="hljs-keyword">return</span> prompt
<span class="hljs-comment"># Reached only when the new message alone exceeds max_tokens</span>
<span class="hljs-keyword">raise</span> SystemError<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hfx6dq">We are now equipped to build a simplistic chat application.</p> <p data-svelte-h="svelte-ffg1bu">We simply store the interactions between the user and the assistant in a list that we use to generate
the input prompt.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->history = []
max_tokens = <span class="hljs-number">1024</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">chat</span>(<span class="hljs-params">message, history, max_tokens</span>):
prompt = format_chat_prompt(message, history, max_tokens)
<span class="hljs-comment"># Uncomment the line below to see what the formatted prompt looks like</span>
<span class="hljs-comment"># print(prompt)</span>
inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
outputs = model.generate(
**inputs, max_length=<span class="hljs-number">2048</span>, do_sample=<span class="hljs-literal">True</span>, temperature=<span class="hljs-number">0.9</span>, top_k=<span class="hljs-number">50</span>, repetition_penalty=<span class="hljs-number">1.2</span>
)
<span class="hljs-comment"># Do not include the input tokens</span>
outputs = outputs[<span class="hljs-number">0</span>, inputs.input_ids.size(-<span class="hljs-number">1</span>) :]
response = tokenizer.decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)
history.append([message, response])
<span class="hljs-keyword">return</span> response<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(chat(<span class="hljs-string">&quot;My favorite color is blue. 
My favorite fruit is strawberry.&quot;</span>, history, max_tokens))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(chat(<span class="hljs-string">&quot;Name a fruit that is on my favorite colour.&quot;</span>, history, max_tokens))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(chat(<span class="hljs-string">&quot;What is the colour of my favorite fruit ?&quot;</span>, history, max_tokens))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1b0u28j"><strong>Warning</strong>: While very powerful, large language models can sometimes <em>hallucinate</em>. We call <em>hallucinations</em> generated content that is irrelevant or made-up but presented by the model as if it were accurate. 
This is a flaw of LLMs and is not a side effect of using them on Trainium / Inferentia.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><!-- HTML_TAG_END --></pre></div> <p></p>
<script>
{
// Page bootstrap: publishes the path configuration object, then loads the two
// entry modules and starts the client app on the element containing this script.

// Assigned without var/let/const, so in this classic (non-module) script it
// becomes a global. Presumably the imported bundle looks it up under this exact
// hashed name — verify against the build output before renaming.
__sveltekit_19pvy0q = {
assets: "/docs/optimum.neuron/pr_1097/en",
base: "/docs/optimum.neuron/pr_1097/en",
env: {}
};
// Must be read synchronously: document.currentScript is only set while this
// script is executing and would be null inside the .then() callback below.
const element = document.currentScript.parentElement;
// Two null placeholders passed to kit.start as `data`; presumably one slot per
// entry in node_ids — TODO confirm against the runtime.
const data = [null,null];
// Fetch both entry modules in parallel. There is no .catch(), so a failed
// import surfaces as an unhandled rejection and kit.start is never called.
Promise.all([
import("/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/start.82e2ff17.js"),
import("/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/app.9b0c0103.js")
]).then(([kit, app]) => {
// NOTE(review): the meaning of node_ids / form / error is defined by the
// imported start module and is not visible in this file.
kit.start(app, element, {
node_ids: [0, 21],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
52.4 kB
·
Xet hash:
f7e9da0271f1cc249d34ddf8250d211e851436ad56a3e5eb622baf1086fc52fa

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.