Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /main /en /perf_train_cpu_many.html

rtrm

about 1 month ago

download

raw

53.7 kB

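	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Efficient Training on Multiple CPUs&quot;,&quot;local&quot;:&quot;efficient-training-on-multiple-cpus&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Intel® oneCCL Bindings for PyTorch&quot;,&quot;local&quot;:&quot;intel-oneccl-bindings-for-pytorch&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Intel® oneCCL Bindings for PyTorch installation&quot;,&quot;local&quot;:&quot;intel-oneccl-bindings-for-pytorch-installation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Intel® MPI library&quot;,&quot;local&quot;:&quot;intel-mpi-library&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Intel® Extension for PyTorch installation&quot;,&quot;local&quot;:&quot;intel-extension-for-pytorch-installation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Usage in Trainer&quot;,&quot;local&quot;:&quot;usage-in-trainer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Usage with Kubernetes&quot;,&quot;local&quot;:&quot;usage-with-kubernetes&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Setup&quot;,&quot;local&quot;:&quot;setup&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;PyTorchJob Specification File&quot;,&quot;local&quot;:&quot;pytorchjob-specification-file&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Deploy&quot;,&quot;local&quot;:&quot;deploy&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Summary&quot;,&quot;local&quot;:&quot;summary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">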
	<link rel="stylesheet" href="/docs/transformers/main/en/_app/immutable/assets/0.e3b0c442.css">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/scheduler.25b97de1.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/singletons.0f2b7d5f.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.e188933d.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/paths.3d04d2c6.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.d9030fc9.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/0.026d2fdd.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/370.7a067e9a.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/Tip.baa67368.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Efficient Training on Multiple CPUs","local":"efficient-training-on-multiple-cpus","sections":[{"title":"Intel® oneCCL Bindings for PyTorch","local":"intel-oneccl-bindings-for-pytorch","sections":[{"title":"Intel® oneCCL Bindings for PyTorch installation","local":"intel-oneccl-bindings-for-pytorch-installation","sections":[],"depth":3}],"depth":2},{"title":"Intel® MPI library","local":"intel-mpi-library","sections":[{"title":"Intel® Extension for PyTorch installation","local":"intel-extension-for-pytorch-installation","sections":[],"depth":4}],"depth":2},{"title":"Usage in Trainer","local":"usage-in-trainer","sections":[],"depth":2},{"title":"Usage with Kubernetes","local":"usage-with-kubernetes","sections":[{"title":"Setup","local":"setup","sections":[],"depth":3},{"title":"PyTorchJob Specification File","local":"pytorchjob-specification-file","sections":[],"depth":3},{"title":"Deploy","local":"deploy","sections":[],"depth":3}],"depth":2},{"title":"Summary","local":"summary","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="efficient-training-on-multiple-cpus" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#efficient-training-on-multiple-cpus"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 
56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Efficient Training on Multiple CPUs</span></h1> <p data-svelte-h="svelte-k7boah">When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling
	distributed CPU training efficiently on <a href="#usage-in-trainer">bare metal</a> and <a href="#usage-with-kubernetes">Kubernetes</a>.</p> <h2 class="relative group"><a id="intel-oneccl-bindings-for-pytorch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#intel-oneccl-bindings-for-pytorch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Intel® oneCCL Bindings for PyTorch</span></h2> <p data-svelte-h="svelte-bivv3w"><a href="https://github.com/oneapi-src/oneCCL" rel="nofollow">Intel® oneCCL</a> (collective communications library) is a library for efficient distributed deep learning training implementing such collectives like allreduce, allgather, alltoall. 
For more information on oneCCL, please refer to the <a href="https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html" rel="nofollow">oneCCL documentation</a> and <a href="https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html" rel="nofollow">oneCCL specification</a>.</p> <p data-svelte-h="svelte-5o21pv">Module <code>oneccl_bindings_for_pytorch</code> (<code>torch_ccl</code> before version 1.12) implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now</p> <p data-svelte-h="svelte-1qrzsug">Check more detailed information for <a href="https://github.com/intel/torch-ccl" rel="nofollow">oneccl_bind_pt</a>.</p> <h3 class="relative group"><a id="intel-oneccl-bindings-for-pytorch-installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#intel-oneccl-bindings-for-pytorch-installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Intel® oneCCL Bindings for PyTorch installation</span></h3> <p data-svelte-h="svelte-7vxiqb">Wheel files are available for the following Python versions:</p> <table 
data-svelte-h="svelte-19gn9co"><thead><tr><th align="center">Extension Version</th> <th align="center">Python 3.6</th> <th align="center">Python 3.7</th> <th align="center">Python 3.8</th> <th align="center">Python 3.9</th> <th align="center">Python 3.10</th></tr></thead> <tbody><tr><td align="center">2.1.0</td> <td align="center"></td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td></tr> <tr><td align="center">2.0.0</td> <td align="center"></td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td></tr> <tr><td align="center">1.13.0</td> <td align="center"></td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td></tr> <tr><td align="center">1.12.100</td> <td align="center"></td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td></tr> <tr><td align="center">1.12.0</td> <td align="center"></td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td> <td align="center">√</td></tr></tbody></table> <p data-svelte-h="svelte-21jzz8">Please run <code>pip list | grep torch</code> to get your <code>pytorch_version</code>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" 
height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-oc1h4j">where <code>{pytorch_version}</code> should be your PyTorch version, for instance 2.1.0.
	See the <a href="https://github.com/intel/torch-ccl" rel="nofollow">oneccl_bind_pt installation</a> guide for other installation approaches.
	The versions of oneCCL and PyTorch must match.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-is6c7w">The oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is built for PyTorch 1.12.0).
	PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100</p></div> <h2 class="relative group"><a id="intel-mpi-library" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#intel-mpi-library"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Intel® MPI library</span></h2> <p data-svelte-h="svelte-1hu54a3">Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. This component is part of the Intel® oneAPI HPC Toolkit.</p> <p data-svelte-h="svelte-1nteuhh">oneccl_bindings_for_pytorch is installed along with the MPI tool set. 
Need to source the environment before using it.</p> <p data-svelte-h="svelte-324hiy">for Intel® oneCCL >= 1.12.0</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->oneccl_bindings_for_pytorch_path=$(python -c <span class="hljs-string">"from oneccl_bindings_for_pytorch import cwd; print(cwd)"</span>)
	<span class="hljs-built_in">source</span> <span class="hljs-variable">$oneccl_bindings_for_pytorch_path</span>/env/setvars.sh<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1x3l803">for Intel® oneCCL whose version < 1.12.0</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torch_ccl_path=$(python -c <span class="hljs-string">"import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))"</span>)
	<span class="hljs-built_in">source</span> <span class="hljs-variable">$torch_ccl_path</span>/env/setvars.sh<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="intel-extension-for-pytorch-installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#intel-extension-for-pytorch-installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Intel® Extension for PyTorch installation</span></h4> <p data-svelte-h="svelte-12dw1vq">Intel Extension for PyTorch (IPEX) provides performance optimizations for CPU training with both Float32 and BFloat16 (refer to the <a href="./perf_train_cpu">single CPU section</a> to learn more).</p> <p data-svelte-h="svelte-1w8k00">The following “Usage in Trainer” takes mpirun in Intel® MPI library as an example.</p> <h2 class="relative group"><a id="usage-in-trainer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#usage-in-trainer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" 
width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Usage in Trainer</span></h2> <p data-svelte-h="svelte-k782b4">To enable multi CPU distributed training in the Trainer with the ccl backend, users should add <strong><code>--ddp_backend ccl</code></strong> in the command arguments.</p> <p data-svelte-h="svelte-kvcz06">Let’s see an example with the <a href="https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering" rel="nofollow">question-answering example</a></p> <p data-svelte-h="svelte-m6wzyw">The following command enables training with 2 processes on one Xeon node, with one process running per one socket. 
The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> export CCL_WORKER_COUNT=1
	export MASTER_ADDR=127.0.0.1
	mpirun -n 2 -genv OMP_NUM_THREADS=23 \
	python3 run_qa.py \
	--model_name_or_path google-bert/bert-large-uncased \
	--dataset_name squad \
	--do_train \
	--do_eval \
	--per_device_train_batch_size 12 \
	--learning_rate 3e-5 \
	--num_train_epochs 2 \
	--max_seq_length 384 \
	--doc_stride 128 \
	--output_dir /tmp/debug_squad/ \
	--no_cuda \
	--ddp_backend ccl \
	--use_ipex<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cyuir">The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.</p> <p data-svelte-h="svelte-dlvz8o">In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> cat hostfile
	xxx.xxx.xxx.xxx #node0 ip
	xxx.xxx.xxx.xxx #node1 ip<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1vsu09v">Now, run the following command in node0 and <strong>4DDP</strong> will be enabled in node0 and node1 with BF16 auto mixed precision:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> export CCL_WORKER_COUNT=1
	export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
	mpirun -f hostfile -n 4 -ppn 2 \
	-genv OMP_NUM_THREADS=23 \
	python3 run_qa.py \
	--model_name_or_path google-bert/bert-large-uncased \
	--dataset_name squad \
	--do_train \
	--do_eval \
	--per_device_train_batch_size 12 \
	--learning_rate 3e-5 \
	--num_train_epochs 2 \
	--max_seq_length 384 \
	--doc_stride 128 \
	--output_dir /tmp/debug_squad/ \
	--no_cuda \
	--ddp_backend ccl \
	--use_ipex \
	--bf16<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="usage-with-kubernetes" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#usage-with-kubernetes"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Usage with Kubernetes</span></h2> <p data-svelte-h="svelte-lwry78">The same distributed training job from the previous section can be deployed to a Kubernetes cluster using the
	<a href="https://www.kubeflow.org/docs/components/training/pytorch/" rel="nofollow">Kubeflow PyTorchJob training operator</a>.</p> <h3 class="relative group"><a id="setup" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#setup"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Setup</span></h3> <p data-svelte-h="svelte-1p9mop9">This example assumes that you have:</p> <ul data-svelte-h="svelte-4w2oti"><li>Access to a Kubernetes cluster with <a href="https://www.kubeflow.org/docs/started/installing-kubeflow/" rel="nofollow">Kubeflow installed</a></li> <li><a href="https://kubernetes.io/docs/tasks/tools/" rel="nofollow"><code>kubectl</code></a> installed and configured to access the Kubernetes cluster</li> <li>A <a href="https://kubernetes.io/docs/concepts/storage/persistent-volumes/" rel="nofollow">Persistent Volume Claim (PVC)</a> that can be used
	to store datasets and model files. There are multiple options for setting up the PVC including using an NFS
	<a href="https://kubernetes.io/docs/concepts/storage/storage-classes/" rel="nofollow">storage class</a> or a cloud storage bucket.</li> <li>A Docker container that includes your model training script and all the dependencies needed to run the script. For
	distributed CPU training jobs, this typically includes PyTorch, Transformers, Intel Extension for PyTorch, Intel
	oneCCL Bindings for PyTorch, and OpenSSH to communicate between the containers.</li></ul> <p data-svelte-h="svelte-vaz613">The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then
	extracts a Transformers release to the <code>/workspace</code> directory, so that the example scripts are included in the image:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">FROM</span> intel/intel-optimized-pytorch:<span class="hljs-number">2.3</span>.<span class="hljs-number">0</span>-pip-multinode

	<span class="hljs-keyword">RUN</span><span class="language-bash"> apt-get update -y && \
	apt-get install -y --no-install-recommends --fix-missing \
	google-perftools \
	libomp-dev</span>

	<span class="hljs-keyword">WORKDIR</span><span class="language-bash"> /workspace</span>

	<span class="hljs-comment"># Download and extract the transformers code</span>
	<span class="hljs-keyword">ARG</span> HF_TRANSFORMERS_VER=<span class="hljs-string">"4.44.0"</span>
	<span class="hljs-keyword">RUN</span><span class="language-bash"> pip install --no-cache-dir \
	transformers==<span class="hljs-variable">${HF_TRANSFORMERS_VER}</span> && \
	<span class="hljs-built_in">mkdir</span> transformers && \
	curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v<span class="hljs-variable">${HF_TRANSFORMERS_VER}</span>.tar.gz | tar -C transformers --strip-components=1 -xzf -</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ra1dus">The image needs to be built and copied to the cluster’s nodes or pushed to a container registry prior to deploying the
	PyTorchJob to the cluster.</p> <h3 class="relative group"><a id="pytorchjob-specification-file" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pytorchjob-specification-file"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>PyTorchJob Specification File</span></h3> <p data-svelte-h="svelte-ceey14">The <a href="https://www.kubeflow.org/docs/components/training/pytorch/" rel="nofollow">Kubeflow PyTorchJob</a> is used to run the distributed
	training job on the cluster. The yaml file for the PyTorchJob defines parameters such as:</p> <ul data-svelte-h="svelte-1w74v7u"><li>The name of the PyTorchJob</li> <li>The number of replicas (workers)</li> <li>The Python script and its parameters that will be used to run the training job</li> <li>The types of resources (node selector, memory, and CPU) needed for each worker</li> <li>The image/tag for the Docker container to use</li> <li>Environment variables</li> <li>A volume mount for the PVC</li></ul> <p data-svelte-h="svelte-1rtua9k">The volume mount defines a path where the PVC will be mounted in the container for each worker pod. This location can be
	used for the dataset, checkpoint files, and the saved model after training completes.</p> <p data-svelte-h="svelte-1sen8je">The snippet below is an example of a yaml file for a PyTorchJob with 4 workers running the
	<a href="https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering" rel="nofollow">question-answering example</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attr">apiVersion:</span> <span class="hljs-string">"kubeflow.org/v1"</span>
	<span class="hljs-attr">kind:</span> <span class="hljs-string">PyTorchJob</span>
	<span class="hljs-attr">metadata:</span>
	<span class="hljs-attr">name:</span> <span class="hljs-string">transformers-pytorchjob</span>
	<span class="hljs-attr">spec:</span>
	<span class="hljs-attr">elasticPolicy:</span>
	<span class="hljs-attr">rdzvBackend:</span> <span class="hljs-string">c10d</span>
	<span class="hljs-attr">minReplicas:</span> <span class="hljs-number">1</span>
	<span class="hljs-attr">maxReplicas:</span> <span class="hljs-number">4</span>
	<span class="hljs-attr">maxRestarts:</span> <span class="hljs-number">10</span>
	<span class="hljs-attr">pytorchReplicaSpecs:</span>
	<span class="hljs-attr">Worker:</span>
	<span class="hljs-attr">replicas:</span> <span class="hljs-number">4</span> <span class="hljs-comment"># The number of worker pods</span>
	<span class="hljs-attr">restartPolicy:</span> <span class="hljs-string">OnFailure</span>
	<span class="hljs-attr">template:</span>
	<span class="hljs-attr">spec:</span>
	<span class="hljs-attr">containers:</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">pytorch</span>
	<span class="hljs-attr">image:</span> <span class="hljs-string">&lt;image</span> <span class="hljs-string">name&gt;:&lt;tag&gt;</span> <span class="hljs-comment"># Specify the docker image to use for the worker pods</span>
	<span class="hljs-attr">imagePullPolicy:</span> <span class="hljs-string">IfNotPresent</span>
	<span class="hljs-attr">command:</span> [<span class="hljs-string">"/bin/bash"</span>, <span class="hljs-string">"-c"</span>]
	<span class="hljs-attr">args:</span>
	<span class="hljs-bullet">-</span> <span class="hljs-string">>-
	cd /workspace/transformers;
	pip install -r /workspace/transformers/examples/pytorch/question-answering/requirements.txt;
	source /usr/local/lib/python3.10/dist-packages/oneccl_bindings_for_pytorch/env/setvars.sh;
	torchrun /workspace/transformers/examples/pytorch/question-answering/run_qa.py \
	--model_name_or_path distilbert/distilbert-base-uncased \
	--dataset_name squad \
	--do_train \
	--do_eval \
	--per_device_train_batch_size 12 \
	--learning_rate 3e-5 \
	--num_train_epochs 2 \
	--max_seq_length 384 \
	--doc_stride 128 \
	--output_dir /tmp/pvc-mount/output_$(date +%Y%m%d_%H%M%S) \
	--no_cuda \
	--ddp_backend ccl \
	--bf16 \
	--use_ipex;
	</span> <span class="hljs-attr">env:</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">LD_PRELOAD</span>
	<span class="hljs-attr">value:</span> <span class="hljs-string">"/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so"</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">TRANSFORMERS_CACHE</span>
	<span class="hljs-attr">value:</span> <span class="hljs-string">"/tmp/pvc-mount/transformers_cache"</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">HF_DATASETS_CACHE</span>
	<span class="hljs-attr">value:</span> <span class="hljs-string">"/tmp/pvc-mount/hf_datasets_cache"</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">LOGLEVEL</span>
	<span class="hljs-attr">value:</span> <span class="hljs-string">"INFO"</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">CCL_WORKER_COUNT</span>
	<span class="hljs-attr">value:</span> <span class="hljs-string">"1"</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">OMP_NUM_THREADS</span> <span class="hljs-comment"># Can be tuned for optimal performance</span>
	<span class="hljs-attr">value:</span> <span class="hljs-string">"240"</span>
	<span class="hljs-attr">resources:</span>
	<span class="hljs-attr">limits:</span>
	<span class="hljs-attr">cpu:</span> <span class="hljs-number">240</span> <span class="hljs-comment"># Update the CPU and memory limit values based on your nodes</span>
	<span class="hljs-attr">memory:</span> <span class="hljs-string">128Gi</span>
	<span class="hljs-attr">requests:</span>
	<span class="hljs-attr">cpu:</span> <span class="hljs-number">240</span> <span class="hljs-comment"># Update the CPU and memory request values based on your nodes</span>
	<span class="hljs-attr">memory:</span> <span class="hljs-string">128Gi</span>
	<span class="hljs-attr">volumeMounts:</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">pvc-volume</span>
	<span class="hljs-attr">mountPath:</span> <span class="hljs-string">/tmp/pvc-mount</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">mountPath:</span> <span class="hljs-string">/dev/shm</span>
	<span class="hljs-attr">name:</span> <span class="hljs-string">dshm</span>
	<span class="hljs-attr">restartPolicy:</span> <span class="hljs-string">Never</span>
	<span class="hljs-attr">nodeSelector:</span> <span class="hljs-comment"># Optionally use nodeSelector to match a certain node label for the worker pods</span>
	<span class="hljs-attr">node-type:</span> <span class="hljs-string">gnr</span>
	<span class="hljs-attr">volumes:</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">pvc-volume</span>
	<span class="hljs-attr">persistentVolumeClaim:</span>
	<span class="hljs-attr">claimName:</span> <span class="hljs-string">transformers-pvc</span>
	<span class="hljs-bullet">-</span> <span class="hljs-attr">name:</span> <span class="hljs-string">dshm</span>
	<span class="hljs-attr">emptyDir:</span>
	<span class="hljs-attr">medium:</span> <span class="hljs-string">Memory</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-l1jsdc">To run this example, update the yaml based on your training script and the nodes in your cluster.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1nu8lil">The CPU resource limits/requests in the yaml are defined in <a href="https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu" rel="nofollow">cpu units</a>
	where 1 CPU unit is equivalent to 1 physical CPU core or 1 virtual core (depending on whether the node is a physical
	host or a VM). The amount of CPU and memory limits/requests defined in the yaml should be less than the amount of
	available CPU/memory capacity on a single machine. It is usually a good idea to not use the entire machine’s capacity in
	order to leave some resources for the kubelet and OS. In order to get <a href="https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/#guaranteed" rel="nofollow">“guaranteed”</a> <a href="https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/" rel="nofollow">quality of service</a> for the worker pods,
	set the same CPU and memory amounts for both the resource limits and requests.</p></div> <h3 class="relative group"><a id="deploy" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deploy"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deploy</span></h3> <p data-svelte-h="svelte-w0q5hc">After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed
	to the cluster using:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">export</span> NAMESPACE=&lt;specify your namespace&gt;

	kubectl create -f pytorchjob.yaml -n <span class="hljs-variable">${NAMESPACE}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rrjl11">The <code>kubectl get pods -n ${NAMESPACE}</code> command can then be used to list the pods in your namespace. You should see
	the worker pods for the PyTorchJob that was just deployed. At first, they will probably have a status of “Pending” as
	the containers get pulled and created, then the status should change to “Running”.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->NAME READY STATUS RESTARTS AGE
	<span class="hljs-meta prompt_">...</span>
	transformers-pytorchjob-worker-0 1/1 Running 0 7m37s
	transformers-pytorchjob-worker-1 1/1 Running 0 7m37s
	transformers-pytorchjob-worker-2 1/1 Running 0 7m37s
	transformers-pytorchjob-worker-3 1/1 Running 0 7m37s
	<span class="hljs-meta prompt_">...</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6jo9na">The logs for a worker can be viewed using <code>kubectl logs &lt;pod name&gt; -n ${NAMESPACE}</code>. Add <code>-f</code> to stream the logs, for example:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->kubectl logs transformers-pytorchjob-worker-0 -n <span class="hljs-variable">${NAMESPACE}</span> -f<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hi6y76">After the training job completes, the trained model can be copied from the PVC or storage location. When you are done
	with the job, the PyTorchJob resource can be deleted from the cluster using <code>kubectl delete -f pytorchjob.yaml -n ${NAMESPACE}</code>.</p> <h2 class="relative group"><a id="summary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#summary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Summary</span></h2> <p data-svelte-h="svelte-prt8yq">This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes
	cluster. Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training
	performance, and can be used as a template to run your own workload on multiple nodes.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/perf_train_cpu_many.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Global settings object for this page: asset root, base path and an
		// (empty) env map. Presumably read by the client entry modules loaded
		// below — confirm against the SvelteKit build output.
		__sveltekit_1xexzbk = {
			assets: "/docs/transformers/main/en",
			base: "/docs/transformers/main/en",
			env: {}
		};

		// document.currentScript is only set while this script is executing, so
		// the mount target has to be captured before any asynchronous work.
		const mountTarget = document.currentScript.parentElement;

		// Options handed to the client's start(): two node ids with no
		// server-provided data, and no form result or error.
		const startOptions = {
			node_ids: [0, 370],
			data: [null, null],
			form: null,
			error: null
		};

		// Load both entry modules in parallel and start the client once both
		// have resolved.
		const pendingModules = [
			import("/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js"),
			import("/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js")
		];

		Promise.all(pendingModules).then((loaded) => {
			const kitModule = loaded[0];
			const appModule = loaded[1];
			kitModule.start(appModule, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size: 53.7 kB
Xet hash: a7efbe97c1fb7cde720928b6093bad70779e1baf0a5500cfcf51d67d463df07b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.