Buckets:

rtrm's picture
download
raw
18.9 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Distributed training&quot;,&quot;local&quot;:&quot;distributed-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}">
<link href="/docs/optimum.intel/pr_1513/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/entry/start.c8023c00.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/chunks/scheduler.c90a44b2.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/chunks/singletons.8201eae3.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/chunks/paths.abb0872e.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/entry/app.c5c46727.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/chunks/preload-helper.03c4211b.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/chunks/index.66c3f415.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/nodes/0.e1ac4b17.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/nodes/7.be4cd0d9.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.edbd65e4.js">
<link rel="modulepreload" href="/docs/optimum.intel/pr_1513/en/_app/immutable/chunks/CodeBlock.02f36bea.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Distributed training&quot;,&quot;local&quot;:&quot;distributed-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="distributed-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#distributed-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Distributed training</span></h1> <p data-svelte-h="svelte-v5mdvi">When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling distributed CPU training efficiently.</p> <p data-svelte-h="svelte-3ncfkb">Distributed training on multiple CPUs is launched by mpirun which supports both Gloo and oneCCL as collective communication backends. And for performance seek, Intel recommends to use oneCCL backend.</p> <p data-svelte-h="svelte-bivv3w"><a href="https://github.com/oneapi-src/oneCCL" rel="nofollow">Intel® oneCCL</a> (collective communications library) is a library for efficient distributed deep learning training implementing such collectives like allreduce, allgather, alltoall. For more information on oneCCL, please refer to the <a href="https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html" rel="nofollow">oneCCL documentation</a> and <a href="https://spec.oneapi.com/versions/latest/elements/oneCCL/source/index.html" rel="nofollow">oneCCL specification</a>.</p> <p data-svelte-h="svelte-5o21pv">Module <code>oneccl_bindings_for_pytorch</code> (<code>torch_ccl</code> before version 1.12) implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now</p> <p data-svelte-h="svelte-1qrzsug">Check more detailed information for <a href="https://github.com/intel/torch-ccl" rel="nofollow">oneccl_bind_pt</a>.</p> <p data-svelte-h="svelte-8y61gw">We will show how to use oneCCL backend-ed distributed training as below steps.</p> <p data-svelte-h="svelte-1db4fuo">Intel® oneCCL Bindings for PyTorch installation:</p> <p data-svelte-h="svelte-7vxiqb">Wheel files are available for the following Python versions:</p> <table data-svelte-h="svelte-1m9r82a"><thead><tr><th align="center">Extension Version</th> <th align="center">Python 3.6</th> <th align="center">Python 3.7</th> <th align="center">Python 3.8</th> <th align="center">Python 3.9</th> <th align="center">Python 3.10</th></tr></thead> <tbody><tr><td align="center">1.12.1</td> <td align="center"></td> <td align="center"></td> <td align="center"></td> <td align="center"></td> <td align="center"></td></tr> <tr><td align="center">1.12.0</td> <td align="center"></td> <td align="center"></td> <td align="center"></td> <td align="center"></td> <td align="center"></td></tr> <tr><td align="center">1.11.0</td> <td align="center"></td> <td align="center"></td> <td align="center"></td> <td align="center"></td> <td align="center"></td></tr> <tr><td align="center">1.10.0</td> <td align="center"></td> <td align="center"></td> <td align="center"></td> <td align="center"></td> <td align="center"></td></tr></tbody></table> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install oneccl_bind_pt=={<span class="hljs-attribute">pytorch_version} -f https</span>://software<span class="hljs-variable">.intel</span><span class="hljs-variable">.com</span>/ipex-whl-stable<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10l1ev2">where <code>{pytorch_version}</code> should be your PyTorch version, for instance 1.12.0.
Versions of oneCCL and PyTorch must match.
<code>oneccl_bindings_for_pytorch</code> 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0)
PyTorch 1.12.1 should work with <code>oneccl_bindings_for_pytorch</code> 1.12.1</p> <p data-svelte-h="svelte-1dmp0t">MPI tool set for Intel® oneCCL 1.12.0</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->oneccl_bindings_for_pytorch_path=$(<span class="hljs-keyword">python</span> -c <span class="hljs-string">&quot;from oneccl_bindings_for_pytorch import cwd; print(cwd)&quot;</span>)
<span class="hljs-keyword">source</span> $oneccl_bindings_for_pytorch_path/<span class="hljs-keyword">env</span>/setvars.sh<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1x3l803">for Intel® oneCCL whose version &lt; 1.12.0</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torch_ccl_path=$(<span class="hljs-keyword">python</span> -c <span class="hljs-string">&quot;import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))&quot;</span>)
<span class="hljs-keyword">source</span> $torch_ccl_path/<span class="hljs-keyword">env</span>/setvars.sh<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jvrym0">The following command enables training with 2 processes on one node, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->export CCL_WORKER_COUNT=1
export MASTER_ADDR=127.0.0.1
mpirun -n 2 -genv OMP_NUM_THREADS=23 \
python3 run_qa.py \
--model_name_or_path distilbert-base-uncased-distilled-squad \
--dataset_name squad \
--apply_quantization \
--quantization_approach static \
--do_train \
--do_eval \
--verify_loading \
--output_dir /tmp/squad_output \
--no_cuda \
--xpu_backend ccl<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1w8dpvp">The following command enables training with a total of four processes on two nodes (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance.</p> <p data-svelte-h="svelte-dlvz8o">In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> cat hostfile
xxx.xxx.xxx.xxx #node0 ip
xxx.xxx.xxx.xxx #node1 ip<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fnu20u">Now, run the following command in node0 and <strong>4DDP</strong> will be enabled in node0 and node1:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->export CCL_WORKER_COUNT=1
export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip
mpirun -f hostfile -n 4 -ppn 2 \
-genv OMP_NUM_THREADS=23 \
python3 run_qa.py \
--model_name_or_path distilbert-base-uncased-distilled-squad \
--dataset_name squad \
--apply_quantization \
--quantization_approach static \
--do_train \
--do_eval \
--verify_loading \
--output_dir /tmp/squad_output \
--no_cuda \
--xpu_backend ccl<!-- HTML_TAG_END --></pre></div> <p></p>
<script>
{
__sveltekit_xffndc = {
assets: "/docs/optimum.intel/pr_1513/en",
base: "/docs/optimum.intel/pr_1513/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/optimum.intel/pr_1513/en/_app/immutable/entry/start.c8023c00.js"),
import("/docs/optimum.intel/pr_1513/en/_app/immutable/entry/app.c5c46727.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 7],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
18.9 kB
·
Xet hash:
15b592f36b6f181d77347c3285b6b8ad6cddc14aa95ff9d3fa7360d888218e80

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.