Buckets:

hf-doc-build/doc-dev / transformers /main /ja /perf_infer_gpu_many.html
rtrm's picture
download
raw
25.4 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Efficient Inference on a Multiple GPUs&quot;,&quot;local&quot;:&quot;efficient-inference-on-a-multiple-gpus&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Flash Attention 2&quot;,&quot;local&quot;:&quot;flash-attention-2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;BetterTransformer&quot;,&quot;local&quot;:&quot;bettertransformer&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Decoder models&quot;,&quot;local&quot;:&quot;decoder-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Encoder Models&quot;,&quot;local&quot;:&quot;encoder-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Advanced usage: mixing FP4 (or Int8) and BetterTransformer&quot;,&quot;local&quot;:&quot;advanced-usage-mixing-fp4-or-int8-and-bettertransformer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/ja/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/entry/start.1486e459.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/scheduler.9bc65507.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/singletons.eee55cbf.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/index.3b203c72.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/paths.59da1547.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/entry/app.d9ae818f.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/index.707bf1b6.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/nodes/0.c06aa070.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/nodes/113.7f3ec144.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/Tip.c2ecdbf4.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/CodeBlock.54a9f38d.js">
<link rel="modulepreload" href="/docs/transformers/main/ja/_app/immutable/chunks/EditOnGithub.922df6ba.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Efficient Inference on a Multiple GPUs&quot;,&quot;local&quot;:&quot;efficient-inference-on-a-multiple-gpus&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Flash Attention 2&quot;,&quot;local&quot;:&quot;flash-attention-2&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;BetterTransformer&quot;,&quot;local&quot;:&quot;bettertransformer&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Decoder models&quot;,&quot;local&quot;:&quot;decoder-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Encoder Models&quot;,&quot;local&quot;:&quot;encoder-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Advanced usage: mixing FP4 (or Int8) and BetterTransformer&quot;,&quot;local&quot;:&quot;advanced-usage-mixing-fp4-or-int8-and-bettertransformer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="efficient-inference-on-a-multiple-gpus" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#efficient-inference-on-a-multiple-gpus"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Efficient Inference on a Multiple GPUs</span></h1> <p data-svelte-h="svelte-10hzl65">この文書には、複数のGPUで効率的に推論を行う方法に関する情報が含まれています。</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-612xsh">注意: 複数のGPUセットアップは、<a href="./perf_infer_gpu_one">単一のGPUセクション</a>で説明されているほとんどの戦略を使用できます。ただし、より良い使用法のために使用できる簡単なテクニックについても認識しておく必要があります。</p></div> <h2 class="relative group"><a id="flash-attention-2" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#flash-attention-2"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Flash Attention 2</span></h2> <p data-svelte-h="svelte-127b4hj">Flash Attention 2の統合は、複数のGPUセットアップでも機能します。詳細については、<a href="./perf_infer_gpu_one#Flash-Attention-2">単一のGPUセクション</a>の適切なセクションをご覧ください。</p> <h2 class="relative group"><a id="bettertransformer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#bettertransformer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>BetterTransformer</span></h2> <p data-svelte-h="svelte-1iuxtx2"><a href="https://huggingface.co/docs/optimum/bettertransformer/overview" rel="nofollow">BetterTransformer</a>は、🤗 TransformersモデルをPyTorchネイティブの高速実行パスを使用するように変換し、その下でFlash Attentionなどの最適化されたカーネルを呼び出します。</p> <p data-svelte-h="svelte-xfb372">BetterTransformerは、テキスト、画像、音声モデルの単一GPUおよび複数GPUでの高速推論もサポートしています。</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1kooxcx">Flash Attentionは、fp16またはbf16 dtypeを使用しているモデルにのみ使用できます。BetterTransformerを使用する前に、モデルを適切なdtypeにキャストしてください。</p></div> <h3 class="relative group"><a id="decoder-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#decoder-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Decoder models</span></h3> <p data-svelte-h="svelte-1gojc9b">テキストモデル、特にデコーダーベースのモデル(GPT、T5、Llamaなど)の場合、BetterTransformer APIはすべての注意操作を<a href="https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention" rel="nofollow"><code>torch.nn.functional.scaled_dot_product_attention</code>オペレーター</a>(SDPA)を使用するように変換します。これはPyTorch 2.0以降でのみ使用可能です。</p> <p data-svelte-h="svelte-1mme928">モデルをBetterTransformerに変換するには:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">&quot;facebook/opt-350m&quot;</span>)
<span class="hljs-comment"># convert the model to BetterTransformer</span>
model.to_bettertransformer()
<span class="hljs-comment"># Use it for training or inference</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-188y3vx">SDPAは、ハードウェアや問題のサイズなどの特定の設定で<a href="https://arxiv.org/abs/2205.14135" rel="nofollow">Flash Attention</a>カーネルを呼び出すこともできます。Flash Attentionを有効にするか、特定の設定(ハードウェア、問題のサイズ)で利用可能かを確認するには、<a href="https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel" rel="nofollow"><code>torch.backends.cuda.sdp_kernel</code></a>をコンテキストマネージャとして使用します。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(&quot;facebook/opt-350m&quot;)
model = AutoModelForCausalLM.from_pretrained(&quot;facebook/opt-350m&quot;).to(&quot;cuda&quot;)
# convert the model to BetterTransformer
model.to_bettertransformer()
input_text = &quot;Hello my dog is cute and&quot;
inputs = tokenizer(input_text, return_tensors=&quot;pt&quot;).to(&quot;cuda&quot;)
<span class="hljs-addition">+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):</span>
outputs = model.generate(**inputs)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-198zu50">もしトレースバックで次のようなエラーメッセージが表示された場合:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->RuntimeError: No available kernel. Aborting execution.<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9agg8u">当日、Flash Attentionのカバレッジが広範囲である可能性があるPyTorch Nightlyバージョンを試すようにお勧めします。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gwrgf0"><a href="https://pytorch.org/blog/out-of-the-box-acceleration/" rel="nofollow">このブログ投稿</a>をチェックして、BetterTransformer + SDPA APIで可能なことについて詳しく学びましょう。</p> <h3 class="relative group"><a id="encoder-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#encoder-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Encoder Models</span></h3> <p data-svelte-h="svelte-nvioyf">推論中のエンコーダーモデルでは、BetterTransformerはエンコーダーレイヤーのforward呼び出しを、エンコーダーレイヤーの<a href="https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html" rel="nofollow"><code>torch.nn.TransformerEncoderLayer</code></a>の相当するものにディスパッチします。これにより、エンコーダーレイヤーの高速実装が実行されます。</p> <p data-svelte-h="svelte-10v3lf3"><code>torch.nn.TransformerEncoderLayer</code>の高速実装はトレーニングをサポートしていないため、代わりに<code>torch.nn.functional.scaled_dot_product_attention</code>にディスパッチされます。これにより、ネストされたテンソルを活用しないFlash AttentionまたはMemory-Efficient Attentionの融合カーネルを使用できます。</p> <p data-svelte-h="svelte-1bfu1uk">BetterTransformerのパフォーマンスの詳細については、この<a href="https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2" rel="nofollow">ブログ投稿</a>をご覧いただけます。また、エンコーダーモデル用のBetterTransformerについては、この<a href="https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/" rel="nofollow">ブログ</a>で詳しく学ぶことができます。</p> <h2 class="relative group"><a id="advanced-usage-mixing-fp4-or-int8-and-bettertransformer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#advanced-usage-mixing-fp4-or-int8-and-bettertransformer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Advanced usage: mixing FP4 (or Int8) and BetterTransformer</span></h2> <p data-svelte-h="svelte-18z05f5">モデルの最良のパフォーマンスを得るために、上記で説明した異なる方法を組み合わせることができます。例えば、FP4ミックスプレシジョン推論+Flash Attentionを使用したBetterTransformerを組み合わせることができます。</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_4bit=<span class="hljs-literal">True</span>,
bnb_4bit_compute_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;facebook/opt-350m&quot;</span>)
model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">&quot;facebook/opt-350m&quot;</span>, quantization_config=quantization_config)
input_text = <span class="hljs-string">&quot;Hello my dog is cute and&quot;</span>
inputs = tokenizer(input_text, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(<span class="hljs-string">&quot;cuda&quot;</span>)
<span class="hljs-keyword">with</span> torch.backends.cuda.sdp_kernel(enable_flash=<span class="hljs-literal">True</span>, enable_math=<span class="hljs-literal">False</span>, enable_mem_efficient=<span class="hljs-literal">False</span>):
outputs = model.generate(**inputs)
<span class="hljs-built_in">print</span>(tokenizer.decode(outputs[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>))<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/ja/perf_infer_gpu_many.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_jement = {
assets: "/docs/transformers/main/ja",
base: "/docs/transformers/main/ja",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/transformers/main/ja/_app/immutable/entry/start.1486e459.js"),
import("/docs/transformers/main/ja/_app/immutable/entry/app.d9ae818f.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 113],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
25.4 kB
·
Xet hash:
3c8c23328ae874c2b1d195f0267d27994d99e37ec17bf21ce69045d4c502ee77

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.