Buckets:

hf-doc-build
/

doc

Files

xet

hf-doc-build/doc / optimum-neuron /main /en /tutorials /fine_tune_llama_7b.html

rtrm

4 months ago

download

raw

65.7 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Fine-tune and Test Llama 2 7B on AWS Trainium","local":"fine-tune-and-test-llama-2-7b-on-aws-trainium","sections":[{"title":"Quick intro: AWS Trainium","local":"quick-intro-aws-trainium","sections":[],"depth":2},{"title":"1. Setup AWS environment","local":"1-setup-aws-environment","sections":[],"depth":2},{"title":"2. Load and prepare the dataset","local":"2-load-and-prepare-the-dataset","sections":[],"depth":2},{"title":"3. Fine-tune Llama on AWS Trainium using the NeuronTrainer","local":"3-fine-tune-llama-on-aws-trainium-using-the-neurontrainer","sections":[],"depth":2},{"title":"4. Evaluate and test fine-tuned Llama model","local":"4-evaluate-and-test-fine-tuned-llama-model","sections":[],"depth":2}],"depth":1}">
	<link href="/docs/optimum.neuron/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/entry/start.ae7452d0.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/scheduler.a2b4ca8e.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/singletons.afcc50b4.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/paths.c3d1ecd8.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/entry/app.f4665957.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/index.d2f673cc.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/nodes/0.35a7fce3.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/nodes/26.4c15ecf5.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/CodeBlock.792343a6.js">
	<link rel="modulepreload" href="/docs/optimum.neuron/main/en/_app/immutable/chunks/Heading.675d4c1e.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Fine-tune and Test Llama 2 7B on AWS Trainium","local":"fine-tune-and-test-llama-2-7b-on-aws-trainium","sections":[{"title":"Quick intro: AWS Trainium","local":"quick-intro-aws-trainium","sections":[],"depth":2},{"title":"1. Setup AWS environment","local":"1-setup-aws-environment","sections":[],"depth":2},{"title":"2. Load and prepare the dataset","local":"2-load-and-prepare-the-dataset","sections":[],"depth":2},{"title":"3. Fine-tune Llama on AWS Trainium using the NeuronTrainer","local":"3-fine-tune-llama-on-aws-trainium-using-the-neurontrainer","sections":[],"depth":2},{"title":"4. Evaluate and test fine-tuned Llama model","local":"4-evaluate-and-test-fine-tuned-llama-model","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="fine-tune-and-test-llama-2-7b-on-aws-trainium" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fine-tune-and-test-llama-2-7b-on-aws-trainium"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fine-tune and Test Llama 2 7B on AWS Trainium</span></h1> <p data-svelte-h="svelte-1u6blc9"><em>There is a notebook version of that tutorial <a href="https://github.com/huggingface/optimum-neuron/blob/main/notebooks/text-generation/llama2-7b-fine-tuning.ipynb" rel="nofollow">here</a></em>.</p> <p data-svelte-h="svelte-bteys1">This tutorial will teach you how to fine-tune open LLMs like <a href="https://huggingface.co/meta-llama/Llama-2-7b-hf" rel="nofollow">Llama 2</a> on AWS Trainium. In our example, we are going to leverage Hugging Face <a href="https://huggingface.co/docs/optimum-neuron/index" rel="nofollow">https://huggingface.co/docs/optimum-neuron/index</a>, <a href="https://huggingface.co/docs/transformers/index" rel="nofollow">Transformers</a> and <a href="https://huggingface.co/docs/datasets/index" rel="nofollow">https://huggingface.co/docs/datasets/index</a>.</p> <p data-svelte-h="svelte-1hahfn0">You will learn how to:</p> <ol data-svelte-h="svelte-1baq49t"><li><a href="#1-setup-aws-environment">Setup AWS environment</a></li> <li><a href="#2-load-and-prepare-the-dataset">Load and process the dataset</a></li> <li><a href="#3-fine-tune-llama-on-aws-trainium-using-the-neurontrainer">Fine-tune Llama on AWS Trainium using the <code>NeuronTrainer</code></a></li> <li><a href="#4-evaluate-and-test-fine-tuned-llama-model">Evaluate and test fine-tuned Llama model</a></li></ol> <h2 class="relative group"><a id="quick-intro-aws-trainium" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quick-intro-aws-trainium"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quick intro: AWS Trainium</span></h2> <p data-svelte-h="svelte-1gu40it"><a href="https://aws.amazon.com/de/ec2/instance-types/trn1/" rel="nofollow">AWS Trainium (Trn1)</a> is a purpose-built EC2 for deep learning (DL) training workloads. Trainium is the successor of <a href="https://aws.amazon.com/ec2/instance-types/inf1/?nc1=h_ls" rel="nofollow">AWS Inferentia</a> focused on high-performance training workloads. Trainium has been optimized for training natural language processing, computer vision, and recommender models. The accelerator supports a wide range of data types, including FP32, TF32, BF16, FP16, UINT8, and configurable FP8.</p> <p data-svelte-h="svelte-f3jgz1">The biggest Trainium instance, the <code>trn1.32xlarge</code> comes with over 500GB of memory, making it easy to fine-tune ~10B parameter models on a single instance. Below you will find an overview of the available instance types. More details <a href="https://aws.amazon.com/de/ec2/instance-types/trn1/#Product_details" rel="nofollow">here</a>:</p> <table data-svelte-h="svelte-48yzfd"><thead><tr><th>instance size</th> <th>accelerators</th> <th>accelerator memory</th> <th>vCPU</th> <th>CPU Memory</th> <th>price per hour</th></tr></thead> <tbody><tr><td>trn1.2xlarge</td> <td>1</td> <td>32</td> <td>8</td> <td>32</td> <td>\$1.34</td></tr> <tr><td>trn1.32xlarge</td> <td>16</td> <td>512</td> <td>128</td> <td>512</td> <td>\$21.50</td></tr> <tr><td>trn1n.32xlarge (2x bandwidth)</td> <td>16</td> <td>512</td> <td>128</td> <td>512</td> <td>\$24.78</td></tr></tbody></table> <p data-svelte-h="svelte-1qrp4mv"><em>Note: This tutorial was created on a trn1.32xlarge AWS EC2 Instance.</em></p> <h2 class="relative group"><a id="1-setup-aws-environment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-setup-aws-environment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. Setup AWS environment</span></h2> <p data-svelte-h="svelte-jip0oh">In this example, we will use the <code>trn1.32xlarge</code> instance on AWS with 16 Accelerator, including 32 Neuron Cores and the <a href="https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2" rel="nofollow">Hugging Face Neuron Deep Learning AMI</a>. The Hugging Face AMI comes with all important libraries, like Transformers, Datasets, Optimum and Neuron packages pre-installed. This makes it super easy to get started, since there is no need for environment management.</p> <p data-svelte-h="svelte-s32fl1">This tutorial doesn’t cover how to create the instance in detail. You can check out the dedicated tutorial about <a href="https://huggingface.co/docs/optimum-neuron/guides/setup_aws_instance" rel="nofollow">“Setting up AWS Trainium for Hugging Face Transformers”</a>, which includes a step-by-step guide on setting up the environment.</p> <p data-svelte-h="svelte-1w4fiht">Once the instance is up and running, we can ssh into it. But instead of developing inside a terminal we want to use a <code>Jupyter</code> environment, which we can use for preparing our dataset and launching the training. For this, we need to add a port for forwarding in the <code>ssh</code> command, which will tunnel our localhost traffic to the Trainium instance.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->PUBLIC_DNS=<span class="hljs-string">""</span> <span class="hljs-comment"># IP address, e.g. ec2-3-80-....</span>
	KEY_PATH=<span class="hljs-string">""</span> <span class="hljs-comment"># local path to key, e.g. ssh/trn.pem</span>

	ssh -L 8080:localhost:8080 -i <span class="hljs-variable">${KEY_NAME}</span>.pem ubuntu@<span class="hljs-variable">$PUBLIC_DNS</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-p2qobr">Let’s now pull the optimum repository with the <a href="https://github.com/huggingface/optimum-neuron/tree/main/notebooks/text-generation" rel="nofollow">example notebook and scripts</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->git <span class="hljs-built_in">clone</span> https://github.com/huggingface/optimum-neuron.git<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hi5xni">Next we can change our directory to <code>notbooks/text-generation</code> and launch the <code>jupyter</code> environment.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># change directory</span>
	<span class="hljs-built_in">cd</span> optimum-neuron/notebooks/text-generation
	<span class="hljs-comment"># launch jupyter</span>
	python -m notebook --allow-root --port=8080<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-si97e0">You should see a familiar <strong><code>jupyter</code></strong> output with a URL to the notebook.</p> <p data-svelte-h="svelte-7s5jat"><strong><code>http://localhost:8080/?token=8c1739aff1755bd7958c4cfccc8d08cb5da5234f61f129a9</code></strong></p> <p data-svelte-h="svelte-wrj5fl">We can click on it, and a <strong><code>jupyter</code></strong> environment opens in our local browser. Open the notebook <strong><code>llama2-7b-fine-tuning.ipynb</code></strong> and lets get started.</p> <p data-svelte-h="svelte-k7cbiv"><em>Note: We are going to use the Jupyter environment only for preparing the dataset and then <code>torchrun</code> for launching our training script for distributed training.</em></p> <p data-svelte-h="svelte-1o2i2ph">If you are going to use official Llama 2 checkpoint you need to login into our hugging face account, which has access to the model, to use your token for accessing the gated repository. We can do this by running the following command:</p> <p data-svelte-h="svelte-rga01y"><em>Note: We also provide an ungated checkpoint.</em></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!huggingface-cli login --token YOUR_TOKEN<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="2-load-and-prepare-the-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-load-and-prepare-the-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Load and prepare the dataset</span></h2> <p data-svelte-h="svelte-170rdxe">We will use <a href="https://huggingface.co/datasets/databricks/databricks-dolly-15k" rel="nofollow">Dolly</a> an open source dataset of instruction-following records on categories outlined in the <a href="https://arxiv.org/abs/2203.02155" rel="nofollow">InstructGPT paper</a>, including brainstorming, classification, closed QA, generation, information extraction, open QA, and summarization.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-string">"instruction"</span>: <span class="hljs-string">"What is world of warcraft"</span>,
	<span class="hljs-string">"context"</span>: <span class="hljs-string">""</span>,
	<span class="hljs-string">"response"</span>: <span class="hljs-string">"World of warcraft is a massive online multi player role playing game. It was released in 2004 by bizarre entertainment"</span>
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-nx074l">To load the <code>dolly</code> dataset, we use the <code>load_dataset()</code> method from the 🤗 Datasets library.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
	<span class="hljs-keyword">from</span> random <span class="hljs-keyword">import</span> randrange

	<span class="hljs-comment"># Load dataset from the hub</span>
	dataset = load_dataset(<span class="hljs-string">"databricks/databricks-dolly-15k"</span>, split=<span class="hljs-string">"train"</span>)

	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"dataset size: <span class="hljs-subst">{<span class="hljs-built_in">len</span>(dataset)}</span>"</span>)
	<span class="hljs-built_in">print</span>(dataset[randrange(<span class="hljs-built_in">len</span>(dataset))])
	<span class="hljs-comment"># dataset size: 15011</span>
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jgf9sc">To instruct tune our model we need to convert our structured examples into a collection of tasks described via instructions. We define a <code>formatting_function</code> that takes a sample and returns a string with our format instruction.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">format_dolly</span>(<span class="hljs-params">sample</span>):
	instruction = <span class="hljs-string">f"### Instruction\n<span class="hljs-subst">{sample[<span class="hljs-string">'instruction'</span>]}</span>"</span>
	context = <span class="hljs-string">f"### Context\n<span class="hljs-subst">{sample[<span class="hljs-string">'context'</span>]}</span>"</span> <span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(sample[<span class="hljs-string">"context"</span>]) > <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span>
	response = <span class="hljs-string">f"### Answer\n<span class="hljs-subst">{sample[<span class="hljs-string">'response'</span>]}</span>"</span>
	<span class="hljs-comment"># join all the parts together</span>
	prompt = <span class="hljs-string">"\n\n"</span>.join([i <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> [instruction, context, response] <span class="hljs-keyword">if</span> i <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>])
	<span class="hljs-keyword">return</span> prompt
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-123z1dj">let’s test our formatting function on a random example.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> random <span class="hljs-keyword">import</span> randrange

	<span class="hljs-built_in">print</span>(format_dolly(dataset[randrange(<span class="hljs-built_in">len</span>(dataset))]))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ya9sie">In addition, to formatting our samples we also want to pack multiple samples to one sequence to have a more efficient training. This means that we are stacking multiple samples to one sequence and split them with an EOS Token. This makes the training more efficient. Packing/stacking samples can be done during training or before. We will do it before training to save time. We created a utility method <a href="https://github.com/huggingface/optimum-neuron/tree/main/notebooks/text-generation/scripts/utils/pack_dataset.py" rel="nofollow">pack_dataset</a> that takes a dataset and a packing function and returns a packed dataset.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer

	<span class="hljs-comment"># Hugging Face model id</span>
	model_id = <span class="hljs-string">"philschmid/Llama-2-7b-hf"</span> <span class="hljs-comment"># ungated</span>
	<span class="hljs-comment"># model_id = "meta-llama/Llama-2-7b-hf" # gated</span>

	tokenizer = AutoTokenizer.from_pretrained(model_id)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1v8dcxr">To pack/stack our dataset we need to first tokenize it and then we can pack it with the <code>pack_dataset</code> method. To prepare our dataset we will now:</p> <ol data-svelte-h="svelte-v6jbub"><li>Format our samples using the template method and add an EOS token at the end of each sample</li> <li>Tokenize our dataset to convert it from text to tokens</li> <li>Pack our dataset to 2048 tokens</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> random <span class="hljs-keyword">import</span> randint
	<span class="hljs-comment"># add utils method to path for loading dataset</span>
	<span class="hljs-keyword">import</span> sys
	sys.path.append(<span class="hljs-string">"./scripts/utils"</span>) <span class="hljs-comment"># make sure you change this to the correct path</span>
	<span class="hljs-keyword">from</span> pack_dataset <span class="hljs-keyword">import</span> pack_dataset


	<span class="hljs-comment"># template dataset to add prompt to each sample</span>
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">template_dataset</span>(<span class="hljs-params">sample</span>):
	sample[<span class="hljs-string">"text"</span>] = <span class="hljs-string">f"<span class="hljs-subst">{format_dolly(sample)}</span><span class="hljs-subst">{tokenizer.eos_token}</span>"</span>
	<span class="hljs-keyword">return</span> sample

	<span class="hljs-comment"># apply prompt template per sample</span>
	dataset = dataset.<span class="hljs-built_in">map</span>(template_dataset, remove_columns=<span class="hljs-built_in">list</span>(dataset.features))
	<span class="hljs-comment"># print random sample</span>
	<span class="hljs-built_in">print</span>(dataset[randint(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(dataset))][<span class="hljs-string">"text"</span>])

	<span class="hljs-comment"># tokenize dataset</span>
	dataset = dataset.<span class="hljs-built_in">map</span>(
	<span class="hljs-keyword">lambda</span> sample: tokenizer(sample[<span class="hljs-string">"text"</span>]), batched=<span class="hljs-literal">True</span>, remove_columns=<span class="hljs-built_in">list</span>(dataset.features)
	)

	<span class="hljs-comment"># chunk dataset</span>
	lm_dataset = pack_dataset(dataset, chunk_length=<span class="hljs-number">2048</span>) <span class="hljs-comment"># We use 2048 as the maximum length for packing</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16jyes1">After we processed the datasets we are going save it to disk. You could also save it to S3 or the Hugging Face Hub for later use.</p> <p data-svelte-h="svelte-1qdlstl"><em>Note: Packing and preprocessing your dataset can be run outside of the Trainium instance.</em></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># save train_dataset to disk</span>
	dataset_path = <span class="hljs-string">"tokenized_dolly"</span>
	lm_dataset.save_to_disk(dataset_path)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="3-fine-tune-llama-on-aws-trainium-using-the-neurontrainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-fine-tune-llama-on-aws-trainium-using-the-neurontrainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. Fine-tune Llama on AWS Trainium using the NeuronTrainer</span></h2> <p data-svelte-h="svelte-3kvj29">Normally you would use the <strong><a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer" rel="nofollow">Trainer</a></strong> and <strong><a href="https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments" rel="nofollow">TrainingArguments</a></strong> to fine-tune PyTorch-based transformer models.</p> <p data-svelte-h="svelte-16nqarb">But together with AWS, we have developed a <code>NeuronTrainer</code> to improve performance, robustness, and safety when training on Trainium instances. The <code>NeuronTrainer</code> is part of the <code>optimum-neuron</code> library and can be used as a 1-to-1 replacement for the <code>Trainer</code>.</p> <p data-svelte-h="svelte-d5p0c0">When it comes to distributed training on AWS Trainium there are a few things we need to take care of. Since Llama is a big model it might not fit on a single accelerator, thats why we added support for different distributed training strategies to the <code>NeuronTrainer</code> including:</p> <ul data-svelte-h="svelte-1cjxveo"><li><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/tutorials/training/zero1_gpt2.html" rel="nofollow">ZeRO-1</a>: shards the optimizer state over multiple devices.</li> <li><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/tensor_parallelism_overview.html" rel="nofollow">Tensor Parallelism</a>: shards the model parameters along a given dimension on multiple devices, defined with <code>tensor_parallel_size</code></li> <li><a href="https://arxiv.org/pdf/2205.05198.pdf" rel="nofollow">Sequence parallelism</a> shards the activations on the sequence axis outside of the tensor parallel regions. It is useful because it saves memory by sharding the activations.</li> <li><a href="https://awsdocs-neuron.readthedocs-hosted.com/en/latest/libraries/neuronx-distributed/pipeline_parallelism_overview.html" rel="nofollow">Pipeline Parallelism</a>: <em>coming soon</em></li></ul> <p data-svelte-h="svelte-6yisi">We prepared a <a href="https://github.com/huggingface/optimum-neuron/blob/main/notebooks/text-generation/scripts/run_clm.py" rel="nofollow">run_clm.py</a>, which implements those distributed training strategies for you already. If you want to know more about the details you can take a look at the <a href="https://huggingface.co/docs/optimum-neuron/guides/distributed_training" rel="nofollow">documentation</a>. When training models on AWS Accelerators we first need to compile our model with our training arguments.</p> <p data-svelte-h="svelte-1gx5zns">To overcome this we added a <a href="https://huggingface.co/docs/optimum-neuron/guides/cache_system" rel="nofollow">model cache</a>, which allows us to use precompiled models and configuration from Hugging Face Hub to skip the compilation step. But every change in the config, will lead to a new compilation, which could result in some cache misses.</p> <p data-svelte-h="svelte-1efdq7y"><em>Note: If your configuration is not cached please open an issue on <a href="https://github.com/huggingface/optimum-neuron/issues" rel="nofollow">Github</a>, we are happy to include it.</em></p> <p data-svelte-h="svelte-1brtt7t">We pre-compiled the config for our training already meaning you can either skip the cell below or rerun it will only take a few minutes since it reuses the cached configuration.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># precompilation command</span>
	!MALLOC_ARENA_MAX=<span class="hljs-number">64</span> neuron_parallel_compile torchrun --nproc_per_node=<span class="hljs-number">32</span> scripts/run_clm.py \
	--model_id {model_id} \
	--dataset_path {dataset_path} \
	--bf16 <span class="hljs-literal">True</span> \
	--learning_rate <span class="hljs-number">5e-5</span> \
	--output_dir dolly_llama \
	--overwrite_output_dir <span class="hljs-literal">True</span> \
	--per_device_train_batch_size <span class="hljs-number">1</span> \
	--gradient_checkpointing <span class="hljs-literal">True</span> \
	--tensor_parallel_size <span class="hljs-number">8</span> \
	--max_steps <span class="hljs-number">10</span> \
	--logging_steps <span class="hljs-number">10</span> \
	--gradient_accumulation_steps <span class="hljs-number">16</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ro2mv7"><em>Note: Compiling without a cache can take ~40 minutes. It will also create dummy files in the <code>dolly_llama_sharded</code> during compilation you we have to remove them afterwards. We also need to add <code>MALLOC_ARENA_MAX=64</code> to limit the CPU allocation to avoid potential crashes, don’t remove it for now.</em></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># remove dummy artifacts which are created by the precompilation command</span>
	!rm -rf dolly_llama<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lieax4">After the compilation is done we can start our training with a similar command, we just need to remove the <code>neuron_parallel_compile</code>. We will use <code>torchrun</code> to launch our training script. <code>torchrun</code> is a tool that automatically distributes a PyTorch model across multiple accelerators. We can pass the number of accelerators as <code>nproc_per_node</code> arguments alongside our hyperparameters.
	The difference to the compilation command is that we changed from <code>max_steps=10</code> to <code>num_train_epochs=3</code>.</p> <p data-svelte-h="svelte-17lv8z9">Launch the training, with the following command.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!MALLOC_ARENA_MAX=<span class="hljs-number">64</span> torchrun --nproc_per_node=<span class="hljs-number">32</span> scripts/run_clm.py \
	--model_id {model_id} \
	--dataset_path {dataset_path} \
	--bf16 <span class="hljs-literal">True</span> \
	--learning_rate <span class="hljs-number">5e-5</span> \
	--output_dir dolly_llama \
	--overwrite_output_dir <span class="hljs-literal">True</span> \
	--skip_cache_push <span class="hljs-literal">True</span> \
	--per_device_train_batch_size <span class="hljs-number">1</span> \
	--gradient_checkpointing <span class="hljs-literal">True</span> \
	--tensor_parallel_size <span class="hljs-number">8</span> \
	--num_train_epochs <span class="hljs-number">3</span> \
	--logging_steps <span class="hljs-number">10</span> \
	--gradient_accumulation_steps <span class="hljs-number">16</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-159xyop">Thats it, we successfully trained Llama 7B on AWS Trainium. The training took for 3 epochs on dolly (15k samples) took 43:24 minutes where the raw training time was only 31:46 minutes. This leads to a cost of ~$15.5 for the e2e training on the trn1.32xlarge instance. Not Bad!</p> <p data-svelte-h="svelte-1nzjqkj">But before we can share and test our model we need to consolidate our model. Since we used Tensor Parallelism during training, we need to consolidate the model weights before we can use it. Tensor Parallelism shards the model weights accross different workers, only sharded checkpoints will be saved during training.</p> <p data-svelte-h="svelte-szrdkh">The Optimum CLI provides a way of doing that very easily via the `optimum neuron consolidate“ command:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!optimum-cli neuron consolidate dolly_llama/tensor_parallel_shards dolly_llama<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15qqwtu">Lets remove our “sharded” checkpoints as we have consolidated them already to safetensors.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!rm -rf dolly_llama/tensor_parallel_shards<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="4-evaluate-and-test-fine-tuned-llama-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-evaluate-and-test-fine-tuned-llama-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. Evaluate and test fine-tuned Llama model</span></h2> <p data-svelte-h="svelte-8ad54z">Similar to training to be able to run inferece on AWS Trainium or AWS Inferentia2 we need to compile our model for the correct use. We will use our Trainium instance for the inference test, but we recommend customer to switch to Inferentia2 for inference.</p> <p data-svelte-h="svelte-1pgrxrm">Optimum Neuron implements similar to Transformers AutoModel classes for easy inference use. We will use the <code>NeuronModelForCausalLM</code> class to load our vanilla transformers checkpoint and convert it to neuron.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronModelForCausalLM
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer

	compiler_args = {<span class="hljs-string">"num_cores"</span>: <span class="hljs-number">2</span>, <span class="hljs-string">"auto_cast_type"</span>: <span class="hljs-string">'fp16'</span>}
	input_shapes = {<span class="hljs-string">"batch_size"</span>: <span class="hljs-number">1</span>, <span class="hljs-string">"sequence_length"</span>: <span class="hljs-number">2048</span>}

	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"dolly_llama"</span>)
	model = NeuronModelForCausalLM.from_pretrained(
	<span class="hljs-string">"dolly_llama"</span>,
	export=<span class="hljs-literal">True</span>,
	**compiler_args,
	**input_shapes)
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sinvz7"><em>Note: Inference compilation can take ~25minutes. Luckily, you need to only run this onces. Since you can save the model afterwards. If you are going to run on Inferentia2 you need to recompile again. The compilation is parameter and hardware specific.</em></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># COMMENT IN if you want to save the compiled model</span>
	<span class="hljs-comment"># model.save_pretrained("compiled_dolly_llama")</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jx2yqv">We can now test inference, but have to make sure we format our input to our prompt format we used for fine-tuning. Therefore we created a helper method, which accepts a <code>dict</code> with our <code>instruction</code> and optionally a <code>context</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">format_dolly_inference</span>(<span class="hljs-params">sample</span>):
	instruction = <span class="hljs-string">f"### Instruction\n<span class="hljs-subst">{sample[<span class="hljs-string">'instruction'</span>]}</span>"</span>
	context = <span class="hljs-string">f"### Context\n<span class="hljs-subst">{sample[<span class="hljs-string">'context'</span>]}</span>"</span> <span class="hljs-keyword">if</span> <span class="hljs-string">"context"</span> <span class="hljs-keyword">in</span> sample <span class="hljs-keyword">else</span> <span class="hljs-literal">None</span>
	response = <span class="hljs-string">f"### Answer\n"</span>
	<span class="hljs-comment"># join all the parts together</span>
	prompt = <span class="hljs-string">"\n\n"</span>.join([i <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> [instruction, context, response] <span class="hljs-keyword">if</span> i <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>])
	<span class="hljs-keyword">return</span> prompt


	<span class="hljs-keyword">def</span> <span class="hljs-title function_">generate</span>(<span class="hljs-params">sample</span>):
	prompt = format_dolly_inference(sample)
	inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)
	outputs = model.generate(**inputs,
	max_new_tokens=<span class="hljs-number">512</span>,
	do_sample=<span class="hljs-literal">True</span>,
	temperature=<span class="hljs-number">0.9</span>,
	top_k=<span class="hljs-number">50</span>,
	top_p=<span class="hljs-number">0.9</span>)
	<span class="hljs-keyword">return</span> tokenizer.decode(outputs[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">False</span>)[<span class="hljs-built_in">len</span>(prompt):]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-uuj81b">Lets test inference. First we test without a context.</p> <p data-svelte-h="svelte-1j8w2o0"><em>Note: Inference is not expected to be super fast on AWS Trainium using 2 cores. For Inference we recommend using Inferentia2.</em></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prompt = {
	<span class="hljs-string">"instruction"</span>: <span class="hljs-string">"Can you tell me something about AWS?"</span>
	}
	res = generate(prompt)

	<span class="hljs-built_in">print</span>(res)<!-- HTML_TAG_END --></pre></div> <blockquote data-svelte-h="svelte-6l4k0q"><p>AWS stands for Amazon Web Services. AWS is a suite of remote computing services offered by Amazon. The most widely used of these include Amazon Elastic Compute Cloud (Amazon EC2), which provides resizable compute capacity in the cloud; Amazon Simple Storage Service (Amazon S3), which is an object storage service; and Amazon Elastic Block Store (Amazon EBS), which is designed to provide high performance, durable block storage volumes for use with AWS instances. AWS also provides other services, such as AWS Identity and Access Management (IAM), a service that enables organizations to control access to their AWS resources, and AWS Key Management Service (AWS KMS), which helps customers create and control the use of encryption keys.</p></blockquote> <p data-svelte-h="svelte-u3q76o">That looks correct. Now, lets add some context, e.g. as you would do for RAG applications</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->prompt = {
	<span class="hljs-string">"instruction"</span>: <span class="hljs-string">"How can I train models on AWS Trainium?"</span>,
	<span class="hljs-string">"context"</span>: <span class="hljs-string">"🤗 Optimum Neuron is the interface between the 🤗 Transformers library and AWS Accelerators including [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/?nc1=h_ls) and [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/?nc1=h_ls). It provides a set of tools enabling easy model loading, training and inference on single- and multi-Accelerator settings for different downstream tasks."</span>
	}
	res = generate(prompt)

	<span class="hljs-built_in">print</span>(res)<!-- HTML_TAG_END --></pre></div> <blockquote data-svelte-h="svelte-15xrtpx"><p>You can use the Optimum Neuron interface to train models on AWS Trainium.</p></blockquote> <p data-svelte-h="svelte-q9f4rf">Awesome, our model also correctly uses the provided context. We are done. Congrats on fine-tuning Llama on AWS Trainium.</p> <p></p>

	<script>
	{
	__sveltekit_w1gmpk = {
	assets: "/docs/optimum.neuron/main/en",
	base: "/docs/optimum.neuron/main/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/optimum.neuron/main/en/_app/immutable/entry/start.ae7452d0.js"),
	import("/docs/optimum.neuron/main/en/_app/immutable/entry/app.f4665957.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 26],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 65.7 kB
Xet hash:: 931148662b8fdbba4368a4ed8ef37a3100ca9e3d8c06e84d2190aa5d832c8874

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.