Buckets:

rtrm's picture
download
raw
25.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Qwen3 Embedding on AWS Trainium with Optimum Neuron&quot;,&quot;local&quot;:&quot;qwen3-embedding-on-aws-trainium-with-optimum-neuron&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Prerequisite: Setup Environment&quot;,&quot;local&quot;:&quot;prerequisite-setup-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Compile Qwen Embedding Models for AWS Trainium&quot;,&quot;local&quot;:&quot;compile-qwen-embedding-models-for-aws-trainium&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Option A: Compile using the NeuronModelForEmbedding class&quot;,&quot;local&quot;:&quot;option-a-compile-using-the-neuronmodelforembedding-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Option B: Compile using the optimum-cli tool&quot;,&quot;local&quot;:&quot;option-b-compile-using-the-optimum-cli-tool&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Load compiled Qwen3 Embedding model and run inference&quot;,&quot;local&quot;:&quot;load-compiled-qwen3-embedding-model-and-run-inference&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/optimum.neuron/pr_1097/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/start.82e2ff17.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/scheduler.56725da7.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/singletons.2080b4fc.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/paths.90dabf70.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/app.9b0c0103.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/preload-helper.9dba61fb.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/index.18a26576.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/nodes/0.912aab06.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/nodes/22.78042b39.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/CopyLLMTxtMenu.fb3856d8.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/globals.7f7f1b26.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.a16844e0.js">
<link rel="modulepreload" href="/docs/optimum.neuron/pr_1097/en/_app/immutable/chunks/CodeBlock.2d00672f.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Qwen3 Embedding on AWS Trainium with Optimum Neuron&quot;,&quot;local&quot;:&quot;qwen3-embedding-on-aws-trainium-with-optimum-neuron&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Prerequisite: Setup Environment&quot;,&quot;local&quot;:&quot;prerequisite-setup-environment&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Compile Qwen Embedding Models for AWS Trainium&quot;,&quot;local&quot;:&quot;compile-qwen-embedding-models-for-aws-trainium&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Option A: Compile using the NeuronModelForEmbedding class&quot;,&quot;local&quot;:&quot;option-a-compile-using-the-neuronmodelforembedding-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Option B: Compile using the optimum-cli tool&quot;,&quot;local&quot;:&quot;option-b-compile-using-the-optimum-cli-tool&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Load compiled Qwen3 Embedding model and run inference&quot;,&quot;local&quot;:&quot;load-compiled-qwen3-embedding-model-and-run-inference&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span 
class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="qwen3-embedding-on-aws-trainium-with-optimum-neuron" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#qwen3-embedding-on-aws-trainium-with-optimum-neuron"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 
0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Qwen3 Embedding on AWS Trainium with Optimum Neuron</span></h1> <p data-svelte-h="svelte-1wd3ojk">This guide explains how to convert, load, and use Qwen Embedding models on AWS Trainium and Inferentia2 using Optimum Neuron. The Qwen3 Embedding model series is the latest proprietary model of the Qwen family, specifically designed for text embedding and ranking tasks. Building upon the dense foundational models of the Qwen3 series, it provides a comprehensive range of text embeddings and reranking models in various sizes (0.6B, 4B, and 8B). The Qwen3 Embedding series offers support for over 100 languages, thanks to the multilingual capabilities of Qwen3 models.</p> <h2 class="relative group"><a id="prerequisite-setup-environment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#prerequisite-setup-environment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 
0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Prerequisite: Setup Environment</span></h2> <p data-svelte-h="svelte-1yrpmjk">You can run this notebook on an AWS EC2 instance with the HF DLAMI. To create an instance with the DLAMI, you can follow the <a href="https://huggingface.co/docs/optimum-neuron/en/ec2-setup" rel="nofollow">EC2 Setup guide</a>. Alternatively, if you are on an AWS Trainium or Inferentia instance, you can manually install <code>optimum-neuron</code> using the steps in the <a href="https://huggingface.co/docs/optimum-neuron/en/ec2-setup#alternative-manual-installation" rel="nofollow">manual installation guide</a>.</p> <p data-svelte-h="svelte-hpbov5">This guide is written using a <code>trn2.3xlarge</code> AWS Trainium2 instance. But you can use the same code to run the model using an AWS Inferentia2 instance like <code>inf2.48xlarge</code>.</p> <h2 class="relative group"><a id="compile-qwen-embedding-models-for-aws-trainium" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#compile-qwen-embedding-models-for-aws-trainium"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 
0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Compile Qwen Embedding Models for AWS Trainium</span></h2> <p data-svelte-h="svelte-1g2aewj">First, you need to convert the model to a format compatible with AWS Trainium and Inferentia2. You can compile Qwen3 Embedding models with Optimum Neuron using the <code>optimum-cli</code> or <code>NeuronModelForEmbedding</code> class. Below you will find an example for both approaches.</p> <p data-svelte-h="svelte-eh7wv6">In the below example, we illustrate this using <a href="https://huggingface.co/Qwen/Qwen3-Embedding-8B" rel="nofollow">Qwen3 Embedding 8B</a> but you can follow the same steps for the 0.6B and the 4B versions of the embedding models.</p> <h3 class="relative group"><a id="option-a-compile-using-the-neuronmodelforembedding-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#option-a-compile-using-the-neuronmodelforembedding-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Option A: Compile using the NeuronModelForEmbedding class</span></h3> <p data-svelte-h="svelte-1t2eef9">Here we will use the <code>NeuronModelForEmbedding</code> class, which 
can convert Qwen3 Embedding models to a format compatible with AWS Trainium and Inferentia2 or load already converted models. When exporting models with <code>NeuronModelForEmbedding</code> you need to define the <code>sequence_length</code> and <code>batch_size</code> in the neuron config.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronModelForEmbedding
model_id = <span class="hljs-string">&quot;Qwen/Qwen3-Embedding-8B&quot;</span>
neuron_model_dir = <span class="hljs-string">&quot;qwen_embedding_8B_tp4&quot;</span>
<span class="hljs-comment"># If you are using an AWS Inferentia2 instance and use &#x27;tensor_parallel_size=4&#x27;, you should set the following environment variable as well.</span>
<span class="hljs-comment"># import os</span>
<span class="hljs-comment"># os.environ[&quot;LOCAL_WORLD_SIZE&quot;] = &#x27;4&#x27;</span>
neuron_config = NeuronModelForEmbedding.get_neuron_config(
model_id, batch_size=<span class="hljs-number">2</span>, sequence_length=<span class="hljs-number">1024</span>, tensor_parallel_size=<span class="hljs-number">4</span>
)
neuron_model = NeuronModelForEmbedding.export(model_id=model_id, neuron_config=neuron_config, load_weights=<span class="hljs-literal">False</span>)
<span class="hljs-comment"># Save model to disk</span>
neuron_model.save_pretrained(neuron_model_dir)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="option-b-compile-using-the-optimum-cli-tool" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#option-b-compile-using-the-optimum-cli-tool"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Option B: Compile using the optimum-cli tool</span></h3> <p data-svelte-h="svelte-18r2uoc">Here we will use the <code>optimum-cli</code> tool to convert the model. Similar to the <code>NeuronModelForEmbedding</code> we need to define our sequence length and batch size. 
The <code>optimum-cli</code> will automatically convert the model to a format compatible with AWS Trainium and Inferentia2 and save it to the specified output directory.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->! 
optimum-cli export neuron --model Qwen/Qwen3-Embedding-8B --batch_size <span class="hljs-number">2</span> --sequence_length <span class="hljs-number">1024</span> --auto_cast matmul --instance_type trn2 --tensor_parallel_size <span class="hljs-number">4</span> qwen_embedding_8B_tp4/<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="load-compiled-qwen3-embedding-model-and-run-inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#load-compiled-qwen3-embedding-model-and-run-inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Load compiled Qwen3 Embedding model and run inference</span></h2> <p data-svelte-h="svelte-1uu32wm">Once we have compiled the model, we can use the <code>NeuronModelForEmbedding</code> class to load it and run inference.
In the below example, we first compute embeddings for two queries and documents; and then compute the similarity score for the queries and documents.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> torch <span class="hljs-keyword">import</span> Tensor
<span class="hljs-keyword">import</span> torch.nn.functional <span class="hljs-keyword">as</span> F
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
<span class="hljs-keyword">from</span> optimum.neuron <span class="hljs-keyword">import</span> NeuronModelForEmbedding
<span class="hljs-comment"># set this to the path that has the compiled model from above</span>
model_id_or_path = neuron_model_dir
<span class="hljs-keyword">def</span> <span class="hljs-title function_">last_token_pool</span>(<span class="hljs-params">last_hidden_states: Tensor, attention_mask: Tensor</span>) -&gt; Tensor:
    left_padding = attention_mask[:, -<span class="hljs-number">1</span>].<span class="hljs-built_in">sum</span>() == attention_mask.shape[<span class="hljs-number">0</span>]
    <span class="hljs-keyword">if</span> left_padding:
        <span class="hljs-keyword">return</span> last_hidden_states[:, -<span class="hljs-number">1</span>]
    <span class="hljs-keyword">else</span>:
        sequence_lengths = attention_mask.<span class="hljs-built_in">sum</span>(dim=<span class="hljs-number">1</span>) - <span class="hljs-number">1</span>
        batch_size = last_hidden_states.shape[<span class="hljs-number">0</span>]
        <span class="hljs-keyword">return</span> last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
<span class="hljs-comment"># Load model and tokenizer</span>
model = NeuronModelForEmbedding.from_pretrained(model_id_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side=<span class="hljs-string">&quot;right&quot;</span>)
<span class="hljs-comment"># Input text to embed</span>
queries = [<span class="hljs-string">&quot;What is the capital of China?&quot;</span>, <span class="hljs-string">&quot;Explain gravity&quot;</span>]
<span class="hljs-comment"># No need to add instruction for retrieval documents</span>
documents = [
<span class="hljs-string">&quot;The capital of China is Beijing.&quot;</span>,
<span class="hljs-string">&quot;Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.&quot;</span>,
]
<span class="hljs-comment"># Tokenize the input texts</span>
queries_tokens = tokenizer(
queries,
padding=<span class="hljs-literal">True</span>,
truncation=<span class="hljs-literal">True</span>,
max_length=<span class="hljs-number">8192</span>,
return_tensors=<span class="hljs-string">&quot;pt&quot;</span>,
)
documents_tokens = tokenizer(
documents,
padding=<span class="hljs-literal">True</span>,
truncation=<span class="hljs-literal">True</span>,
max_length=<span class="hljs-number">8192</span>,
return_tensors=<span class="hljs-string">&quot;pt&quot;</span>,
)
outputs = model(**queries_tokens)
queries_embeddings = last_token_pool(outputs, queries_tokens[<span class="hljs-string">&quot;attention_mask&quot;</span>])
outputs = model(**documents_tokens)
documents_embeddings = last_token_pool(outputs, documents_tokens[<span class="hljs-string">&quot;attention_mask&quot;</span>])
<span class="hljs-comment"># normalize embeddings and compute similarity scores</span>
queries_embeddings = F.normalize(queries_embeddings, p=<span class="hljs-number">2</span>, dim=<span class="hljs-number">1</span>)
documents_embeddings = F.normalize(documents_embeddings, p=<span class="hljs-number">2</span>, dim=<span class="hljs-number">1</span>)
scores = queries_embeddings @ documents_embeddings.T
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Similarity Scores --&gt;&quot;</span>, scores.tolist())<!-- HTML_TAG_END --></pre></div> <p></p>
<script>
{
// Client-side bootstrap for this server-rendered page. The surrounding braces
// form a block so the `const` bindings below stay local to it.

// Assigned without a declaration, so in this classic (non-module) script it
// becomes a property of the global object.
// NOTE(review): presumably read by the runtime modules imported below to
// resolve asset and base paths — confirm against the build output.
__sveltekit_19pvy0q = {
assets: "/docs/optimum.neuron/pr_1097/en",
base: "/docs/optimum.neuron/pr_1097/en",
env: {}
};
// The element that contains this <script> tag; handed to kit.start() below.
// Read here, synchronously, because document.currentScript is only set while
// the script is executing (it is null inside later callbacks).
const element = document.currentScript.parentElement;
// Two null placeholders.
// NOTE(review): presumably one entry per id in node_ids below — confirm.
const data = [null,null];
// Fetch both entry modules in parallel with dynamic import(); start-up runs
// only after both have resolved.
Promise.all([
import("/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/start.82e2ff17.js"),
import("/docs/optimum.neuron/pr_1097/en/_app/immutable/entry/app.9b0c0103.js")
]).then(([kit, app]) => {
// Call the start module's start() with the app module, the container
// element, and the initial options for this page.
kit.start(app, element, {
node_ids: [0, 22],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
25.8 kB
·
Xet hash:
d8e49a8959e4cdefcb280d17698012d2f1f7431e76007c400b42e3eb3c00df65

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.