Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"DeepSpeed","local":"deepspeed","sections":[{"title":"설치","local":"installation","sections":[],"depth":2},{"title":"메모리 요구량","local":"memory-requirements","sections":[],"depth":2},{"title":"ZeRO 단계 설정하기","local":"select-a-zero-stage","sections":[],"depth":2},{"title":"DeepSpeed 구성 파일","local":"deepspeed-configuration-file","sections":[{"title":"DeepSpeed와 Trainer 매개변수","local":"deepspeed-and-trainer-parameters","sections":[],"depth":3},{"title":"ZeRO 구성","local":"zero-configuration","sections":[],"depth":3},{"title":"NVMe 설정","local":"nvme-configuration","sections":[],"depth":3}],"depth":2},{"title":"DeepSpeed 구성","local":"deepspeed-features","sections":[{"title":"활성화/그레이디언트 체크포인팅","local":"activationgradient-checkpointing","sections":[],"depth":3},{"title":"옵티마이저와 스케줄러","local":"optimizer-and-scheduler","sections":[],"depth":3},{"title":"정밀도","local":"precision","sections":[],"depth":3},{"title":"배치 크기","local":"batch-size","sections":[],"depth":3},{"title":"그레이디언트 누적","local":"gradient-accumulation","sections":[],"depth":3},{"title":"그레이디언트 클리핑","local":"gradient-clipping","sections":[],"depth":3},{"title":"통신 데이터 유형(Communication data type)","local":"communication-data-type","sections":[],"depth":3}],"depth":2},{"title":"모델 배포","local":"deployment","sections":[{"title":"다중 노드 환경에서의 모델 배포","local":"multi-node-deployment","sections":[],"depth":3},{"title":"SLURM","local":"slurm","sections":[],"depth":3},{"title":"노트북","local":"notebook","sections":[],"depth":3}],"depth":2},{"title":"모델 가중치 저장하기","local":"save-model-weights","sections":[{"title":"온라인 환경","local":"online","sections":[],"depth":3},{"title":"오프라인 환경","local":"offline","sections":[],"depth":3}],"depth":2},{"title":"ZeRO Inference","local":"zero-inference","sections":[],"depth":2},{"title":"Trainer 없이 DeepSpeed 사용하기","local":"non-trainer-deepspeed-integration","sections":[{"title":"Trainer 없이 ZeRO Inference 
사용하기","local":"non-trainer-zero-inference","sections":[],"depth":3},{"title":"생성","local":"generate","sections":[],"depth":3}],"depth":2},{"title":"트러블슈팅","local":"troubleshoot","sections":[{"title":"DeepSpeed 프로세스가 시작 단계에서 종료되었을 경우","local":"deepspeed-process-killed-at-startup","sections":[],"depth":3},{"title":"NaN 손실","local":"nan-loss","sections":[],"depth":3}],"depth":2},{"title":"리소스","local":"resources","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/transformers/main/ko/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/entry/start.9aa88961.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/scheduler.9bc65507.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/singletons.9eec45c3.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/index.3b203c72.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/paths.566078f7.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/entry/app.84fb67c3.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/index.707bf1b6.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/nodes/0.1c99376b.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/nodes/16.e4ab6f92.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/Tip.c2ecdbf4.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/CodeBlock.54a9f38d.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/EditOnGithub.922df6ba.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/HfOption.6d864328.js"> | |
| <link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/stores.c16bc1a5.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"DeepSpeed","local":"deepspeed","sections":[{"title":"설치","local":"installation","sections":[],"depth":2},{"title":"메모리 요구량","local":"memory-requirements","sections":[],"depth":2},{"title":"ZeRO 단계 설정하기","local":"select-a-zero-stage","sections":[],"depth":2},{"title":"DeepSpeed 구성 파일","local":"deepspeed-configuration-file","sections":[{"title":"DeepSpeed와 Trainer 매개변수","local":"deepspeed-and-trainer-parameters","sections":[],"depth":3},{"title":"ZeRO 구성","local":"zero-configuration","sections":[],"depth":3},{"title":"NVMe 설정","local":"nvme-configuration","sections":[],"depth":3}],"depth":2},{"title":"DeepSpeed 구성","local":"deepspeed-features","sections":[{"title":"활성화/그레이디언트 체크포인팅","local":"activationgradient-checkpointing","sections":[],"depth":3},{"title":"옵티마이저와 스케줄러","local":"optimizer-and-scheduler","sections":[],"depth":3},{"title":"정밀도","local":"precision","sections":[],"depth":3},{"title":"배치 크기","local":"batch-size","sections":[],"depth":3},{"title":"그레이디언트 누적","local":"gradient-accumulation","sections":[],"depth":3},{"title":"그레이디언트 클리핑","local":"gradient-clipping","sections":[],"depth":3},{"title":"통신 데이터 유형(Communication data type)","local":"communication-data-type","sections":[],"depth":3}],"depth":2},{"title":"모델 배포","local":"deployment","sections":[{"title":"다중 노드 환경에서의 모델 배포","local":"multi-node-deployment","sections":[],"depth":3},{"title":"SLURM","local":"slurm","sections":[],"depth":3},{"title":"노트북","local":"notebook","sections":[],"depth":3}],"depth":2},{"title":"모델 가중치 저장하기","local":"save-model-weights","sections":[{"title":"온라인 환경","local":"online","sections":[],"depth":3},{"title":"오프라인 환경","local":"offline","sections":[],"depth":3}],"depth":2},{"title":"ZeRO Inference","local":"zero-inference","sections":[],"depth":2},{"title":"Trainer 없이 DeepSpeed 
사용하기","local":"non-trainer-deepspeed-integration","sections":[{"title":"Trainer 없이 ZeRO Inference 사용하기","local":"non-trainer-zero-inference","sections":[],"depth":3},{"title":"생성","local":"generate","sections":[],"depth":3}],"depth":2},{"title":"트러블슈팅","local":"troubleshoot","sections":[{"title":"DeepSpeed 프로세스가 시작 단계에서 종료되었을 경우","local":"deepspeed-process-killed-at-startup","sections":[],"depth":3},{"title":"NaN 손실","local":"nan-loss","sections":[],"depth":3}],"depth":2},{"title":"리소스","local":"resources","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="deepspeed" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed</span></h1> <p data-svelte-h="svelte-eaoepn"><a href="https://www.deepspeed.ai/" rel="nofollow">DeepSpeed</a>는 분산 학습 메모리를 효율적이고 빠르게 만드는 PyTorch 최적화 라이브러리입니다. 그 핵심은 대규모 모델을 규모에 맞게 훈련할 수 있는 <a href="https://hf.co/papers/1910.02054" rel="nofollow">Zero Redundancy Optimizer(ZeRO)</a>입니다. 
ZeRO는 여러 단계로 작동합니다:</p> <ul data-svelte-h="svelte-naazos"><li>ZeRO-1, GPU 간 최적화 상태 분할</li> <li>ZeRO-2, GPU 간 그레이디언트 분할</li> <li>ZeRO-3, GPU 간 매개변수 분할</li></ul> <p data-svelte-h="svelte-17q94pv">GPU가 제한된 환경에서 ZeRO는 최적화 메모리와 계산을 GPU에서 CPU로 오프로드하여 단일 GPU에 대규모 모델을 장착하고 훈련할 수 있습니다. DeepSpeed는 모든 ZeRO 단계 및 오프로딩을 위해 Transformers <code>Trainer</code> 클래스와 통합되어 있습니다. 구성 파일을 제공하거나 제공된 템플릿을 사용하기만 하면 됩니다. 추론의 경우, Transformers는 대용량 모델을 가져올 수 있으므로 ZeRO-3 및 오프로딩을 지원합니다.</p> <p data-svelte-h="svelte-j7debo">이 가이드에서는 DeepSpeed 트레이닝을 배포하는 방법, 활성화할 수 있는 기능, 다양한 ZeRO 단계에 대한 구성 파일 설정 방법, 오프로딩, 추론 및 <code>Trainer</code> 없이 DeepSpeed를 사용하는 방법을 안내해 드립니다.</p> <h2 class="relative group"><a id="installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>설치</span></h2> <p data-svelte-h="svelte-nq7j86">DeepSpeed는 PyPI 또는 Transformers에서 설치할 수 있습니다(자세한 설치 옵션은 DeepSpeed <a href="https://www.deepspeed.ai/tutorials/advanced-install/" rel="nofollow">설치 상세사항</a> 또는 GitHub <a href="https://github.com/microsoft/deepspeed#installation" rel="nofollow">README</a>를 참조하세요).</p> <div class="course-tip 
bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1rxp0gi">DeepSpeed를 설치하는 데 문제가 있는 경우 <a href="../debugging#deepspeed-cuda-installation">DeepSpeed CUDA 설치</a> 가이드를 확인하세요. DeepSpeed는 pip로 설치할 수 있는 PyPI 패키지를 제공하지만, 하드웨어에 가장 잘 맞추고 PyPI 배포판에서는 제공되지 않는 1비트 Adam과 같은 특정 기능을 지원하려면 <a href="https://www.deepspeed.ai/tutorials/advanced-install/#install-deepspeed-from-source" rel="nofollow">소스에서 설치하기</a>를 적극 권장합니다.</p></div> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">PyPI </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">Transformers </div></div> <div class="language-select"><div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform 
-translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install deepspeed<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="memory-requirements" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-requirements"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>메모리 요구량</span></h2> <p data-svelte-h="svelte-3o8idg">시작하기 전에 모델에 맞는 충분한 GPU 및 CPU 메모리가 있는지 확인하는 것이 좋습니다. DeepSpeed는 필요한 CPU/GPU 메모리를 추정할 수 있는 도구를 제공합니다. 
예를 들어, 단일 GPU에서 <a href="https://huggingface.co/bigscience/T0_3B" rel="nofollow">bigscience/T0_3B</a> 모델의 메모리 요구 사항을 추정할 수 있습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->$ python -c <span class="hljs-string">'from transformers import AutoModel; \ | |
| from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \ | |
| model = AutoModel.from_pretrained("bigscience/T0_3B"); \ | |
| estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)'</span> | |
| [...] | |
| Estimated memory needed <span class="hljs-keyword">for</span> params, optim states and gradients <span class="hljs-keyword">for</span> a: | |
| HW: Setup with 1 node, 1 GPU per node. | |
| SW: Model with 2783M total params, 65M largest layer params. | |
| per CPU | per GPU | Options | |
| 70.00GB | 0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1 | |
| 70.00GB | 0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0 | |
| 62.23GB | 5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=1 | |
| 62.23GB | 5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=0 | |
| 0.37GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=1 | |
| 15.56GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=0<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8jkjyv">즉, CPU 오프로드 없이 단일 80GB GPU를 사용하거나, 8GB GPU와 오프로드할 약 60GB의 CPU 메모리가 필요합니다 (이는 매개변수, 최적화 상태 및 그레이디언트에 대한 메모리 요구 사항일 뿐이며 CUDA 커널 및 활성화에는 조금 더 필요합니다). 또한 더 작은 GPU를 대여하거나 구입하는 것이 더 저렴하지만 모델을 훈련하는 데 시간이 더 오래 걸리므로 비용과 속도 간의 균형을 고려해야 합니다.</p> <p data-svelte-h="svelte-1qfjn0l">GPU 메모리가 충분하다면 CPU/NVMe 오프로드를 비활성화하여 모든 작업을 더 빠르게 처리하세요.</p> <h2 class="relative group"><a id="select-a-zero-stage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#select-a-zero-stage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ZeRO 단계 설정하기</span></h2> <p data-svelte-h="svelte-3ef2n9">DeepSpeed를 설치하고 메모리 요구 사항을 더 잘 파악했다면 다음 단계는 사용할 ZeRO 스테이지를 선택하는 것입니다. 
가장 빠르고 메모리 효율이 높은 순서대로 정렬하면 다음과 같습니다:</p> <table data-svelte-h="svelte-1ukjioh"><thead><tr><th>속도</th> <th>메모리 효율</th></tr></thead> <tbody><tr><td>ZeRO-1</td> <td>ZeRO-3 + offload</td></tr> <tr><td>ZeRO-2</td> <td>ZeRO-3</td></tr> <tr><td>ZeRO-2 + offload</td> <td>ZeRO-2 + offload</td></tr> <tr><td>ZeRO-3</td> <td>ZeRO-2</td></tr> <tr><td>ZeRO-3 + offload</td> <td>ZeRO-1</td></tr></tbody></table> <p data-svelte-h="svelte-4ugzxq">자신에게 가장 적합한 방법을 찾으려면 가장 빠른 방법부터 시작하고 메모리가 부족하면 더 느리지만 메모리 효율이 높은 다음 단계를 시도하세요. 속도와 메모리 사용량 사이의 적절한 균형을 찾기 위해 (가장 메모리 효율적이거나 가장 빠른 것부터 시작하여) 원하는 방향으로 자유롭게 작업하세요.</p> <p data-svelte-h="svelte-jb44pv">일반적으로 사용할 수 있는 프로세스는 다음과 같습니다(배치 크기 1로 시작):</p> <ol data-svelte-h="svelte-9ezr99"><li>그레이디언트 체크포인팅 활성화</li> <li>ZeRO-2 시도</li> <li>ZeRO-2와 옵티마이저 오프로드 시도</li> <li>ZeRO-3 시도</li> <li>ZeRO-3과 매개변수 CPU 오프로드 시도</li> <li>ZeRO-3, 매개변수와 옵티마이저 CPU 오프로드 시도</li> <li><code>generate()</code> 메소드를 사용하는 경우 더 좁은 빔 서치 검색 범위와 같은 다양한 기본값을 낮춰보기</li> <li>전체 정밀도 가중치보다 반정밀도(구형 GPU 구조의 경우 fp16, 암페어 이후 GPU의 경우 bf16)를 혼합해보기</li> <li>가능하면 하드웨어를 더 추가하거나 Infinity가 매개변수와 옵티마이저를 NVMe로 오프로드하도록 활성화</li> <li>메모리가 부족하지 않으면 유효 처리량을 측정한 다음 배치 크기를 최대한 크게 늘려 GPU 효율성을 극대화</li> <li>마지막으로 일부 오프로드 기능을 비활성화하거나 더 빠른 ZeRO 스테이지를 사용하고 배치 크기를 늘리거나 줄여 속도와 메모리 사용량 간의 최적의 균형을 찾아 트레이닝 설정을 최적화</li></ol> <h2 class="relative group"><a id="deepspeed-configuration-file" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-configuration-file"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 
0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed 구성 파일</span></h2> <p data-svelte-h="svelte-xjp9mk">DeepSpeed는 트레이닝 실행 방법을 구성하는 모든 매개변수가 포함된 구성 파일을 통해 <code>Trainer</code> 클래스와 함께 작동합니다. 트레이닝 스크립트를 실행하면 DeepSpeed는 <code>Trainer</code>로부터 받은 구성을 콘솔에 기록하므로 어떤 구성이 사용되었는지 정확히 확인할 수 있습니다.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1v5fsnd">DeepSpeed 구성 옵션의 전체 목록은 <a href="https://www.deepspeed.ai/docs/config-json/" rel="nofollow">DeepSpeed Configuration JSON</a>에서 확인할 수 있습니다. 또한 <a href="https://github.com/microsoft/DeepSpeedExamples" rel="nofollow">DeepSpeedExamples</a> 리포지토리 또는 기본 <a href="https://github.com/microsoft/DeepSpeed" rel="nofollow">DeepSpeed</a> 리포지토리에서 다양한 DeepSpeed 구성 예제에 대한 보다 실용적인 예제를 찾을 수 있습니다. 
구체적인 예제를 빠르게 찾으려면 다음과 같이 하세요:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->git <span class="hljs-built_in">clone</span> https://github.com/microsoft/DeepSpeedExamples | |
| <span class="hljs-built_in">cd</span> DeepSpeedExamples | |
| find . -name <span class="hljs-string">'*json'</span> | |
| <span class="hljs-comment"># Lamb 옵티마이저 샘플 찾기</span> | |
| grep -i Lamb $(find . -name <span class="hljs-string">'*json'</span>)<!-- HTML_TAG_END --></pre></div></div> <p data-svelte-h="svelte-1j9y7bq">명령줄 인터페이스에서 트레이닝하는 경우 DeepSpeed 구성 파일은 JSON 파일의 경로로 전달되거나 노트북 설정에서 <code>Trainer</code>를 사용하는 경우 중첩된 <code>dict</code> 객체로 전달됩니다.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">path to file </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">nested dict </div></div> <div class="language-select"><div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START 
-->TrainingArguments(..., deepspeed=<span class="hljs-string">"path/to/deepspeed_config.json"</span>)<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="deepspeed-and-trainer-parameters" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-and-trainer-parameters"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed와 Trainer 매개변수</span></h3> <p data-svelte-h="svelte-g3sr9r">구성 매개변수에는 세 가지 유형이 있습니다:</p> <ol data-svelte-h="svelte-11w97px"><li><p>일부 구성 매개변수는 <code>Trainer</code>와 DeepSpeed가 공유하며, 정의가 충돌하는 경우 오류를 식별하기 어려울 수 있습니다. 이러한 공유 구성 매개변수는 <code>Trainer</code> 명령줄 인수에서 쉽게 설정할 수 있습니다.</p></li> <li><p>모델 설정에서 자동으로 도출되는 일부 설정 매개변수는 수동으로 값을 조정할 필요가 없습니다. <code>Trainer</code>는 구성 값 <code>auto</code>를 사용하여 가장 정확하거나 효율적인 값을 설정합니다. 직접 구성 매개변수를 명시적으로 설정할 수도 있지만, <code>Trainer</code> 인수와 DeepSpeed 설정 매개변수가 일치하도록 주의해야 합니다. 
일치하지 않으면 감지하기 매우 어려운 방식으로 훈련이 실패할 수 있습니다!</p></li> <li><p>훈련 요구 사항에 따라 수동으로 설정해야 하는 일부 설정 매개변수는 DeepSpeed에만 해당됩니다.</p></li></ol> <p data-svelte-h="svelte-1yskfjg">DeepSpeed 구성을 수정하고 <code>TrainingArguments</code>를 편집할 수도 있습니다:</p> <ol data-svelte-h="svelte-1bp1jzc"><li>기본 구성으로 사용할 DeepSpeed 구성 파일을 생성하거나 로드합니다.</li> <li>이 DeepSpeed 구성 값을 기반으로 <code>TrainingArguments</code> 객체를 생성합니다.</li></ol> <p data-svelte-h="svelte-a7t0kb"><code>scheduler.params.total_num_steps</code>와 같은 일부 값은 트레이닝 중 <code>Trainer</code>에 의해 계산됩니다.</p> <h3 class="relative group"><a id="zero-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#zero-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ZeRO 구성</span></h3> <p data-svelte-h="svelte-1q4ds8e">세 가지 구성이 있으며, 각 구성은 서로 다른 ZeRO 단계에 해당합니다. 1단계는 확장성 측면에서 그다지 눈여겨볼만하지 않으므로 이 가이드에서는 2단계와 3단계에 중점을 둡니다. <code>zero_optimization</code> 구성에는 활성화할 항목과 구성 방법에 대한 모든 옵션이 포함되어 있습니다. 
각 매개변수에 대한 자세한 설명은 <a href="https://www.deepspeed.ai/docs/config-json/" rel="nofollow">DeepSpeed 구성 JSON</a> 레퍼런스를 참조하세요.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400">DeepSpeed는 매개변수 이름의 유효성을 검사하지 않으며 오타가 있으면 매개변수의 기본 설정으로 대체합니다. DeepSpeed 엔진 시작 로그 메시지를 보고 어떤 값을 사용할지 확인할 수 있습니다.</div> <p data-svelte-h="svelte-7cq9wt"><code>Trainer</code>는 동등한 명령줄 인수를 제공하지 않으므로 다음 구성은 DeepSpeed로 설정해야 합니다.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">ZeRO-1 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">ZeRO-2 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">ZeRO-3 </div></div> <div class="language-select"><p data-svelte-h="svelte-1orz5x9">ZeRO-1은 옵티마이저 상태를 GPU에 분할하여 약간의 속도 향상을 기대할 수 있습니다. 
ZeRO-1 구성은 다음과 같이 설정할 수 있습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"zero_optimization":</span> { | |
| <span class="hljs-attr">"stage":</span> <span class="hljs-number">1</span> | |
| } | |
| }<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="nvme-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nvme-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>NVMe 설정</span></h3> <p data-svelte-h="svelte-1i6wucc"><a href="https://hf.co/papers/2104.07857" rel="nofollow">ZeRO-Infinity</a>를 사용하면 모델 상태를 CPU 및/또는 NVMe로 오프로드하여 더 많은 메모리를 절약할 수 있습니다. 스마트 파티셔닝 및 타일링 알고리즘을 통해 각 GPU는 오프로딩 중에 매우 적은 양의 데이터를 주고받을 수 있으므로 최신 NVMe는 훈련 프로세스에 사용할 수 있는 것보다 훨씬 더 큰 총 메모리 풀에 맞출 수 있습니다. ZeRO-Infinity에는 ZeRO-3가 필요합니다.</p> <p data-svelte-h="svelte-nde1sh">사용 가능한 CPU 및/또는 NVMe 메모리에 따라 <a href="https://www.deepspeed.ai/docs/config-json/#optimizer-offloading" rel="nofollow">옵티마이저</a>와 <a href="https://www.deepspeed.ai/docs/config-json/#parameter-offloading" rel="nofollow">매개변수</a> 중 하나만 오프로드하거나 아무것도 오프로드하지 않을 수 있습니다. 또한 일반 하드 드라이브나 솔리드 스테이트 드라이브에서도 작동하지만 속도가 현저히 느려지므로 <code>nvme_path</code>가 NVMe 장치를 가리키고 있는지 확인해야 합니다. 최신 NVMe를 사용하면 읽기 작업의 경우 최대 3.5GB/s, 쓰기 작업의 경우 최대 3GB/s의 전송 속도를 기대할 수 있습니다. 
마지막으로, 트레이닝 설정에서 <a href="https://github.com/microsoft/DeepSpeed/issues/998" rel="nofollow">벤치마크 실행하기</a>를 통해 최적의 ‘aio’ 구성을 결정합니다.</p> <p data-svelte-h="svelte-pi99rv">아래 예제 ZeRO-3/Infinity 구성 파일은 대부분의 매개변수 값을 <code>auto</code>로 설정하고 있지만, 수동으로 값을 추가할 수도 있습니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"fp16":</span> { | |
| <span class="hljs-attr">"enabled":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"loss_scale":</span> <span class="hljs-number">0</span>, | |
| <span class="hljs-attr">"loss_scale_window":</span> <span class="hljs-number">1000</span>, | |
| <span class="hljs-attr">"initial_scale_power":</span> <span class="hljs-number">16</span>, | |
| <span class="hljs-attr">"hysteresis":</span> <span class="hljs-number">2</span>, | |
| <span class="hljs-attr">"min_loss_scale":</span> <span class="hljs-number">1</span> | |
| }, | |
| <span class="hljs-attr">"optimizer":</span> { | |
| <span class="hljs-attr">"type":</span> <span class="hljs-string">"AdamW"</span>, | |
| <span class="hljs-attr">"params":</span> { | |
| <span class="hljs-attr">"lr":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"betas":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"eps":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"weight_decay":</span> <span class="hljs-string">"auto"</span> | |
| } | |
| }, | |
| <span class="hljs-attr">"scheduler":</span> { | |
| <span class="hljs-attr">"type":</span> <span class="hljs-string">"WarmupLR"</span>, | |
| <span class="hljs-attr">"params":</span> { | |
| <span class="hljs-attr">"warmup_min_lr":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"warmup_max_lr":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"warmup_num_steps":</span> <span class="hljs-string">"auto"</span> | |
| } | |
| }, | |
| <span class="hljs-attr">"zero_optimization":</span> { | |
| <span class="hljs-attr">"stage":</span> <span class="hljs-number">3</span>, | |
| <span class="hljs-attr">"offload_optimizer":</span> { | |
| <span class="hljs-attr">"device":</span> <span class="hljs-string">"nvme"</span>, | |
| <span class="hljs-attr">"nvme_path":</span> <span class="hljs-string">"/local_nvme"</span>, | |
| <span class="hljs-attr">"pin_memory":</span> <span class="hljs-literal">true</span>, | |
| <span class="hljs-attr">"buffer_count":</span> <span class="hljs-number">4</span>, | |
| <span class="hljs-attr">"fast_init":</span> <span class="hljs-literal">false</span> | |
| }, | |
| <span class="hljs-attr">"offload_param":</span> { | |
| <span class="hljs-attr">"device":</span> <span class="hljs-string">"nvme"</span>, | |
| <span class="hljs-attr">"nvme_path":</span> <span class="hljs-string">"/local_nvme"</span>, | |
| <span class="hljs-attr">"pin_memory":</span> <span class="hljs-literal">true</span>, | |
| <span class="hljs-attr">"buffer_count":</span> <span class="hljs-number">5</span>, | |
| <span class="hljs-attr">"buffer_size":</span> <span class="hljs-number">1e8</span>, | |
| <span class="hljs-attr">"max_in_cpu":</span> <span class="hljs-number">1e9</span> | |
| }, | |
| <span class="hljs-attr">"aio":</span> { | |
| <span class="hljs-attr">"block_size":</span> <span class="hljs-number">262144</span>, | |
| <span class="hljs-attr">"queue_depth":</span> <span class="hljs-number">32</span>, | |
| <span class="hljs-attr">"thread_count":</span> <span class="hljs-number">1</span>, | |
| <span class="hljs-attr">"single_submit":</span> <span class="hljs-literal">false</span>, | |
| <span class="hljs-attr">"overlap_events":</span> <span class="hljs-literal">true</span> | |
| }, | |
| <span class="hljs-attr">"overlap_comm":</span> <span class="hljs-literal">true</span>, | |
| <span class="hljs-attr">"contiguous_gradients":</span> <span class="hljs-literal">true</span>, | |
| <span class="hljs-attr">"sub_group_size":</span> <span class="hljs-number">1e9</span>, | |
| <span class="hljs-attr">"reduce_bucket_size":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"stage3_prefetch_bucket_size":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"stage3_param_persistence_threshold":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"stage3_max_live_parameters":</span> <span class="hljs-number">1e9</span>, | |
| <span class="hljs-attr">"stage3_max_reuse_distance":</span> <span class="hljs-number">1e9</span>, | |
| <span class="hljs-attr">"stage3_gather_16bit_weights_on_model_save":</span> <span class="hljs-literal">true</span> | |
| }, | |
| <span class="hljs-attr">"gradient_accumulation_steps":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"gradient_clipping":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"steps_per_print":</span> <span class="hljs-number">2000</span>, | |
| <span class="hljs-attr">"train_batch_size":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"train_micro_batch_size_per_gpu":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"wall_clock_breakdown":</span> <span class="hljs-literal">false</span> | |
| }<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="deepspeed-features" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-features"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed 구성</span></h2> <p data-svelte-h="svelte-v7y32z">이 섹션에서 간략하게 설명하는 몇 가지 중요한 매개변수를 DeepSpeed 구성 파일에 지정할 수 있습니다.</p> <h3 class="relative group"><a id="activationgradient-checkpointing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#activationgradient-checkpointing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 
43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>활성화/그레이디언트 체크포인팅</span></h3> <p data-svelte-h="svelte-oxu3it">활성화 및 그레이디언트 체크포인팅은 속도를 더 많은 GPU 메모리와 교환하여 GPU 메모리가 부족한 상황을 극복하거나 배치 크기를 늘려 성능을 향상시킬 수 있습니다. 이 기능을 활성화하려면 다음과 같이 하세요:</p> <ol data-svelte-h="svelte-1nb1u65"><li>허깅 페이스 모델의 경우, <code>Trainer</code>에서 <code>model.gradient_checkpointing_enable()</code> 또는 <code>--gradient_checkpointing</code>을 설정합니다.</li> <li>허깅 페이스가 아닌 모델의 경우, 딥스피드 <a href="https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html" rel="nofollow">Activation Checkpointing API</a>를 사용합니다. 트랜스포머 모델링 코드를 대체하고 <code>torch.utils.checkpoint</code>를 DeepSpeed API로 대체할 수도 있습니다. 이 접근 방식은 순방향 활성화를 다시 계산하는 대신 CPU 메모리로 오프로드할 수 있으므로 더 유연합니다.</li></ol> <h3 class="relative group"><a id="optimizer-and-scheduler" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimizer-and-scheduler"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>옵티마이저와 스케줄러</span></h3> <p 
data-svelte-h="svelte-1435kea"><code>offload_optimizer</code>를 활성화하지 않는 한 DeepSpeed와 트랜스포머 옵티마이저 및 스케줄러를 혼합하여 사용할 수 있습니다. <code>offload_optimizer</code>를 활성화하면 CPU와 GPU 구현이 모두 있는 경우 DeepSpeed가 아닌 옵티마이저(LAMB 제외)를 사용할 수 있습니다.</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-1foqjf4">찾기 어려운 오류를 방지하기 위해 구성 파일의 옵티마이저 및 스케줄러 매개변수를 명령줄에서 설정할 수 있습니다. 예를 들어 학습률이 다른 곳에서 다른 값으로 설정된 경우 명령줄에서 이를 재정의할 수 있습니다. 옵티마이저 및 스케줄러 매개변수 외에도 <code>Trainer</code> 명령줄 인수가 DeepSpeed 구성과 일치하는지 확인해야 합니다.</p></div> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">optimizer </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">scheduler </div></div> <div class="language-select"><p data-svelte-h="svelte-1fl66hc">DeepSpeed는 여러 <a href="https://www.deepspeed.ai/docs/config-json/#optimizer-parameters" rel="nofollow">옵티마이저</a>를 제공하지만(Adam, AdamW, OneBitAdam 및 LAMB) PyTorch에서 다른 옵티마이저를 가져올 수도 있습니다. 
설정에서 옵티마이저를 구성하지 않으면 <code>Trainer</code>가 자동으로 AdamW를 선택하고 명령줄에서 제공된 값 또는 기본값을 사용합니다: <code>lr</code>, <code>adam_beta1</code>, <code>adam_beta2</code>, <code>adam_epsilon</code>, <code>weight_decay</code>.</p> <p data-svelte-h="svelte-10dgirt">매개변수를 <code>"auto"</code>로 설정하거나 원하는 값을 직접 수동으로 입력할 수 있습니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"optimizer":</span> { | |
| <span class="hljs-attr">"type":</span> <span class="hljs-string">"AdamW"</span>, | |
| <span class="hljs-attr">"params":</span> { | |
| <span class="hljs-attr">"lr":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"betas":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"eps":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"weight_decay":</span> <span class="hljs-string">"auto"</span> | |
| } | |
| } | |
| }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jv0ylj">최상위 구성에 다음을 추가하여 지원되지 않는 옵티마이저를 사용할 수도 있습니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"zero_allow_untested_optimizer":</span> <span class="hljs-literal">true</span> | |
| }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1oc48yb">DeepSpeed==0.8.3부터 오프로드를 사용하려면 오프로드가 DeepSpeed의 CPU Adam 옵티마이저에서 가장 잘 작동하므로 최상위 수준 구성에 다음 사항을 추가해야 합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"zero_force_ds_cpu_optimizer":</span> <span class="hljs-literal">false</span> | |
| }<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="precision" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#precision"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>정밀도</span></h3> <p data-svelte-h="svelte-j8ngkc">DeepSpeed는 fp32, fp16 및 bf16 혼합 정밀도를 지원합니다.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">fp32 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">fp16 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">bf16 </div></div> <div class="language-select"><p data-svelte-h="svelte-vb21iv">모델이 혼합 정밀도로 사전 학습되지 않은 경우와 같이 혼합 정밀도로 잘 작동하지 않는 경우 NaN 손실을 유발할 수 있는 오버플로 또는 언더플로 문제가 발생할 수 있습니다. 
이러한 경우에는 기본 fp16 모드를 명시적으로 비활성화하여 전체 fp32 정밀도를 사용해야 합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"fp16":</span> { | |
| <span class="hljs-attr">"enabled":</span> <span class="hljs-literal">false</span> | |
| } | |
| }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ii7v2c">Ampere GPU 및 PyTorch 1.7 이상의 경우 일부 연산에 대해 더 효율적인 <a href="https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" rel="nofollow">tf32</a> 형식으로 자동 전환되지만 결과는 여전히 fp32로 표시됩니다. <code>Trainer</code>에서 <code>--tf32</code>를 설정하여 활성화하거나 <code>--tf32 0</code> 또는 <code>--no_tf32</code>를 설정하여 비활성화하는 방식으로 제어할 수 있습니다.</p> </div> <h3 class="relative group"><a id="batch-size" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#batch-size"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>배치 크기</span></h3> <p data-svelte-h="svelte-b5qvau">배치 크기는 자동으로 구성하거나 명시적으로 설정할 수 있습니다. 
<code>"auto"</code> 옵션을 사용하도록 선택하면 <code>Trainer</code>는 <code>train_micro_batch_size_per_gpu</code>를 args.<code>per_device_train_batch_size</code>의 값으로, <code>train_batch_size</code>를 <code>args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps</code>로 설정합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"train_micro_batch_size_per_gpu":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"train_batch_size":</span> <span class="hljs-string">"auto"</span> | |
| }<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="gradient-accumulation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-accumulation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>그레이디언트 누적</span></h3> <p data-svelte-h="svelte-l6vz6d">그레이디언트 누적을 자동으로 구성하거나 명시적으로 설정할 수 있습니다. 
<code>"auto"</code> 옵션을 사용하도록 선택하면 <code>Trainer</code>가 <code>args.gradient_accumulation_steps</code>의 값으로 설정합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"gradient_accumulation_steps":</span> <span class="hljs-string">"auto"</span> | |
| } | |
| <!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="gradient-clipping" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gradient-clipping"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>그레이디언트 클리핑</span></h3> <p data-svelte-h="svelte-azqlxb">그레이디언트 클리핑은 자동으로 구성하거나 명시적으로 설정할 수 있습니다. 
<code>"auto"</code> 옵션을 사용하도록 선택하면 <code>Trainer</code>가 <code>args.max_grad_norm</code>의 값으로 설정합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"gradient_clipping":</span> <span class="hljs-string">"auto"</span> | |
| }<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="communication-data-type" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#communication-data-type"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>통신 데이터 유형(Communication data type)</span></h3> <p data-svelte-h="svelte-dpabvn">감소(reduction), 수집(gather) 및 분산(scatter) 연산과 같은 집합 통신(collective)의 경우 별도의 데이터 유형이 사용됩니다.</p> <p data-svelte-h="svelte-1686adi">모든 수집 및 분산 연산은 데이터와 동일한 데이터 유형으로 수행됩니다. 예를 들어 bf16으로 훈련하는 경우, 수집은 비손실 연산이므로 데이터도 bf16으로 수집됩니다.</p> <p data-svelte-h="svelte-1adyxw4">예를 들어 그레이디언트가 여러 GPU에 걸쳐 평균화되는 경우와 같이 감소 연산은 손실이 발생합니다. 통신이 fp16 또는 bf16으로 수행되는 경우, 낮은 정밀도로 여러 숫자를 더하면 정확하지 않기 때문에 손실이 발생할 가능성이 더 높습니다. 특히 fp16보다 정밀도가 낮은 bf16의 경우 더욱 그렇습니다. 이러한 이유로 그레이디언트를 평균화할 때 손실이 최소화되므로 감소 연산에는 fp16이 기본값으로 사용됩니다.</p> <p data-svelte-h="svelte-1rowry7">통신 데이터 유형은 설정 파일에서 <code>communication_data_type</code> 매개변수를 설정하여 선택할 수 있습니다. 
예를 들어, fp32를 선택하면 약간의 오버헤드가 추가되지만 감소 연산이 fp32에 누적되고 준비가 되면 훈련 중인 반정밀 dtype으로 다운캐스트됩니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"communication_data_type":</span> <span class="hljs-string">"fp32"</span> | |
| }<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="deployment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deployment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>모델 배포</span></h2> <p data-svelte-h="svelte-144yyml"><a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a>, <code>deepspeed</code> 런처 또는 <a href="https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch" rel="nofollow">Accelerate</a> 등 다양한 런처를 통해 DeepSpeed를 배포할 수 있습니다. 배포하려면 <code>Trainer</code> 명령줄에 <code>--deepspeed ds_config.json</code>을 추가합니다. 필요한 명령줄 인수를 코드에 추가하려면 DeepSpeed의 <a href="https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing" rel="nofollow"><code>add_config_arguments</code></a> 유틸리티를 사용하는 것이 좋습니다.</p> <p data-svelte-h="svelte-ircvm2">이 가이드에서는 다양한 트레이닝 설정에 대해 <code>deepspeed</code> 런처로 DeepSpeed를 배포하는 방법을 보여드립니다. 
보다 실용적인 사용 예제는 이 <a href="https://github.com/huggingface/transformers/issues/8771#issuecomment-759248400" rel="nofollow">post</a>에서 확인할 수 있습니다.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">multi-GPU </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">single-GPU </div></div> <div class="language-select"><p data-svelte-h="svelte-1lsqf1i">여러 GPU에 DeepSpeed를 배포하려면 <code>--num_gpus</code> 매개변수를 추가하세요. 사용 가능한 모든 GPU를 사용하려는 경우 <code>--num_gpus</code>를 추가할 필요가 없습니다. 아래 예제에서는 2개의 GPU를 사용합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre 
class=""><!-- HTML_TAG_START -->deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \ | |
| --deepspeed tests/deepspeed/ds_config_zero3.json \ | |
| --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ | |
| --output_dir output_dir --overwrite_output_dir --fp16 \ | |
| --do_train --max_train_samples 500 --num_train_epochs 1 \ | |
| --dataset_name wmt16 --dataset_config <span class="hljs-string">"ro-en"</span> \ | |
| --source_lang en --target_lang ro<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="multi-node-deployment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multi-node-deployment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>다중 노드 환경에서의 모델 배포</span></h3> <p data-svelte-h="svelte-1bhljkl">노드는 워크로드를 실행하기 위한 하나 이상의 GPU입니다. 더 강력한 설정은 멀티 노드 설정으로, <code>deepspeed</code> 런처로 실행할 수 있습니다. 이 가이드에서는 각각 8개의 GPU가 있는 두 개의 노드가 있다고 가정해 보겠습니다. 첫 번째 노드는 <code>ssh hostname1</code>로, 두 번째 노드는 <code>ssh hostname2</code>로 접속할 수 있습니다. 두 노드 모두 비밀번호 없이 ssh를 통해 로컬로 서로 통신할 수 있어야 합니다.</p> <p data-svelte-h="svelte-19uu04y">기본적으로 DeepSpeed는 멀티노드 환경에서 공유 저장소를 사용할 것으로 예상합니다. 
그렇지 않고 각 노드가 로컬 파일 시스템만 볼 수 있는 경우, 공유 파일 시스템에 대한 액세스 없이 로딩할 수 있도록 <a href="https://www.deepspeed.ai/docs/config-json/#checkpoint-options" rel="nofollow"><code>checkpoint</code></a>를 포함하도록 구성 파일을 조정해야 합니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"checkpoint":</span> { | |
| <span class="hljs-attr">"use_node_local_storage":</span> <span class="hljs-literal">true</span> | |
| } | |
| }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-trc0q4"><code>Trainer</code>의 <code>--save_on_each_node</code> 인수를 사용하여 위의 <code>checkpoint</code>를 구성에 자동으로 추가할 수도 있습니다.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">torchrun </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">deepspeed </div></div> <div class="language-select"><p data-svelte-h="svelte-iooldj"><a href="https://pytorch.org/docs/stable/elastic/run.html" rel="nofollow">torchrun</a>의 경우, 각 노드에 ssh로 접속한 후 두 노드 모두에서 다음 명령을 실행해야 합니다. 런처는 두 노드가 동기화될 때까지 기다렸다가 트레이닝을 시작합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; 
border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->torchrun --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 \ | |
| --master_port=9901 your_program.py <normal cl args> --deepspeed ds_config.json<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="slurm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#slurm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>SLURM</span></h3> <p data-svelte-h="svelte-rlqn6u">SLURM 환경에서는 특정 SLURM 환경에 맞게 SLURM 스크립트를 조정해야 합니다. SLURM 스크립트 예시는 다음과 같습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> 
<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment">#SBATCH --job-name=test-nodes # 작업 이름</span> | |
| <span class="hljs-comment">#SBATCH --nodes=2 # 노드 수</span> | |
| <span class="hljs-comment">#SBATCH --ntasks-per-node=1 # 중요 - 노드당 분산 작업 1개!</span> | |
| <span class="hljs-comment">#SBATCH --cpus-per-task=10 # 작업당 CPU 코어 수</span> | |
| <span class="hljs-comment">#SBATCH --gres=gpu:8 # gpu 수</span> | |
| <span class="hljs-comment">#SBATCH --time 20:00:00 # 최대 실행 시간 (HH:MM:SS)</span> | |
| <span class="hljs-comment">#SBATCH --output=%x-%j.out # 출력 파일 이름</span> | |
| <span class="hljs-built_in">export</span> GPUS_PER_NODE=8 | |
| <span class="hljs-built_in">export</span> MASTER_ADDR=$(scontrol show hostnames <span class="hljs-variable">$SLURM_JOB_NODELIST</span> | <span class="hljs-built_in">head</span> -n 1) | |
| <span class="hljs-built_in">export</span> MASTER_PORT=9901 | |
| srun --jobid <span class="hljs-variable">$SLURM_JOBID</span> bash -c <span class="hljs-string">'python -m torch.distributed.run \ | |
| --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \ | |
| --master_addr $MASTER_ADDR --master_port $MASTER_PORT \ | |
| your_program.py <normal cl args> --deepspeed ds_config.json'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16exqvu">그런 다음 모든 노드에서 동시에 학습을 시작하는 다음 명령을 사용하여 다중 노드 배포를 예약할 수 있습니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sbatch launch.slurm<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="notebook" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#notebook"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>노트북</span></h3> <p data-svelte-h="svelte-1xsk9mv"><code>deepspeed</code> 런처는 노트북에서의 배포를 지원하지 않으므로 분산 환경을 에뮬레이션해야 합니다. 하지만 이는 1개의 GPU에서만 작동합니다. 2개 이상의 GPU를 사용하려면 딥스피드가 작동할 수 있는 다중 프로세스 환경을 사용해야 합니다. 즉, 여기에 표시된 것처럼 에뮬레이션할 수 없는 <code>deepspeed</code> 런처를 사용해야 합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># DeepSpeed는 
단일 프로세스만 사용하더라도 분산 환경을 필요로 합니다.</span> | |
| <span class="hljs-comment"># 이 코드로 분산 환경을 모방합니다.</span> | |
| <span class="hljs-keyword">import</span> os | |
| os.environ[<span class="hljs-string">"MASTER_ADDR"</span>] = <span class="hljs-string">"localhost"</span> | |
| os.environ[<span class="hljs-string">"MASTER_PORT"</span>] = <span class="hljs-string">"9994"</span> <span class="hljs-comment"># RuntimeError: Address already in use 오류 발생 시 수정</span> | |
| os.environ[<span class="hljs-string">"RANK"</span>] = <span class="hljs-string">"0"</span> | |
| os.environ[<span class="hljs-string">"LOCAL_RANK"</span>] = <span class="hljs-string">"0"</span> | |
| os.environ[<span class="hljs-string">"WORLD_SIZE"</span>] = <span class="hljs-string">"1"</span> | |
| <span class="hljs-comment"># 이제 평소와 같이 진행하되, DeepSpeed 설정 파일을 전달합니다.</span> | |
| training_args = TrainingArguments(..., deepspeed=<span class="hljs-string">"ds_config_zero3.json"</span>) | |
| trainer = Trainer(...) | |
| trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14qdmtn">현재 디렉터리의 노트북에 구성 파일을 즉석에서 만들고 싶다면 전용 셀을 만들 수 있습니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->%%bash | |
| cat <<<span class="hljs-string">'EOT'</span> > ds_config_zero3.json | |
| { | |
| <span class="hljs-string">"fp16"</span>: { | |
| <span class="hljs-string">"enabled"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"loss_scale"</span>: <span class="hljs-number">0</span>, | |
| <span class="hljs-string">"loss_scale_window"</span>: <span class="hljs-number">1000</span>, | |
| <span class="hljs-string">"initial_scale_power"</span>: <span class="hljs-number">16</span>, | |
| <span class="hljs-string">"hysteresis"</span>: <span class="hljs-number">2</span>, | |
| <span class="hljs-string">"min_loss_scale"</span>: <span class="hljs-number">1</span> | |
| }, | |
| <span class="hljs-string">"optimizer"</span>: { | |
| <span class="hljs-string">"type"</span>: <span class="hljs-string">"AdamW"</span>, | |
| <span class="hljs-string">"params"</span>: { | |
| <span class="hljs-string">"lr"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"betas"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"eps"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"weight_decay"</span>: <span class="hljs-string">"auto"</span> | |
| } | |
| }, | |
| <span class="hljs-string">"scheduler"</span>: { | |
| <span class="hljs-string">"type"</span>: <span class="hljs-string">"WarmupLR"</span>, | |
| <span class="hljs-string">"params"</span>: { | |
| <span class="hljs-string">"warmup_min_lr"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"warmup_max_lr"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"warmup_num_steps"</span>: <span class="hljs-string">"auto"</span> | |
| } | |
| }, | |
| <span class="hljs-string">"zero_optimization"</span>: { | |
| <span class="hljs-string">"stage"</span>: <span class="hljs-number">3</span>, | |
| <span class="hljs-string">"offload_optimizer"</span>: { | |
| <span class="hljs-string">"device"</span>: <span class="hljs-string">"cpu"</span>, | |
| <span class="hljs-string">"pin_memory"</span>: true | |
| }, | |
| <span class="hljs-string">"offload_param"</span>: { | |
| <span class="hljs-string">"device"</span>: <span class="hljs-string">"cpu"</span>, | |
| <span class="hljs-string">"pin_memory"</span>: true | |
| }, | |
| <span class="hljs-string">"overlap_comm"</span>: true, | |
| <span class="hljs-string">"contiguous_gradients"</span>: true, | |
| <span class="hljs-string">"sub_group_size"</span>: <span class="hljs-number">1e9</span>, | |
| <span class="hljs-string">"reduce_bucket_size"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"stage3_prefetch_bucket_size"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"stage3_param_persistence_threshold"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"stage3_max_live_parameters"</span>: <span class="hljs-number">1e9</span>, | |
| <span class="hljs-string">"stage3_max_reuse_distance"</span>: <span class="hljs-number">1e9</span>, | |
| <span class="hljs-string">"stage3_gather_16bit_weights_on_model_save"</span>: true | |
| }, | |
| <span class="hljs-string">"gradient_accumulation_steps"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"gradient_clipping"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"steps_per_print"</span>: <span class="hljs-number">2000</span>, | |
| <span class="hljs-string">"train_batch_size"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"train_micro_batch_size_per_gpu"</span>: <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-string">"wall_clock_breakdown"</span>: false | |
| } | |
| EOT<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-a5v6xj">트레이닝 스크립트가 노트북 셀이 아닌 파일에 있는 경우, 노트북 셀의 셸에서 <code>deepspeed</code>를 정상적으로 실행할 수 있습니다. 예를 들어 <code>run_translation.py</code>를 시작하려면 다음과 같이 하세요:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!git clone https://github.com/huggingface/transformers | |
| !cd transformers; deepspeed examples/pytorch/translation/run_translation.py ...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-qykc5c">또한 <code>%%bash</code> 매직을 사용하여 여러 줄의 코드를 작성하여 셸 프로그램을 실행할 수도 있지만 훈련이 완료될 때까지 로그를 볼 수 없습니다. <code>%%bash</code> 매직으로 분산 환경을 에뮬레이션할 필요는 없습니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->%%bash | |
| git clone https://github.com/huggingface/transformers | |
| cd transformers | |
| deepspeed examples/pytorch/translation/run_translation.py ...<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="save-model-weights" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#save-model-weights"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>모델 가중치 저장하기</span></h2> <p data-svelte-h="svelte-3x8880">딥스피드는 기본 고정밀 fp32 가중치를 사용자 지정 체크포인트 최적화 파일(glob 패턴은 <code>global_step*/*optim_states.pt</code>처럼 보입니다)에 저장하고 일반 체크포인트 아래에 저장합니다.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">fp16 </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">fp32 </div></div> <div class="language-select"><p data-svelte-h="svelte-1u8kmrr">ZeRO-2로 훈련된 모델은 pytorch_model.bin 가중치를 fp16에 저장합니다. 
ZeRO-3으로 훈련된 모델의 모델 가중치를 fp16에 저장하려면 모델 가중치가 여러 GPU에 분할되어 있으므로 <code>"stage3_gather_16bit_weights_on_model_save": true</code>를 설정해야 합니다. 그렇지 않으면 <code>Trainer</code>가 가중치를 fp16에 저장하지 않고 pytorch_model.bin 파일을 생성하지 않습니다. 이는 DeepSpeed의 state_dict에 실제 가중치 대신 플레이스홀더가 포함되어 있어 이를 로드할 수 없기 때문입니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"zero_optimization":</span> { | |
| <span class="hljs-attr">"stage3_gather_16bit_weights_on_model_save":</span> <span class="hljs-literal">true</span> | |
| } | |
| }<!-- HTML_TAG_END --></pre></div> </div> <h2 class="relative group"><a id="zero-inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#zero-inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ZeRO Inference</span></h2> <p data-svelte-h="svelte-1tv6ab4"><a href="https://www.deepspeed.ai/2022/09/09/zero-inference.html" rel="nofollow">ZeRO Inference</a>는 모델 가중치를 CPU 또는 NVMe 메모리에 배치하여 GPU에 부담을 주지 않으므로 GPU에서 대규모 모델을 사용하여 추론을 실행할 수 있습니다. 
추론은 최적화 상태 및 그레이디언트에 많은 양의 메모리를 추가로 필요로 하지 않으므로 동일한 하드웨어에 훨씬 더 큰 배치 및/또는 시퀀스 길이를 맞출 수 있습니다.</p> <p data-svelte-h="svelte-1jb73uu">ZeRO Inference는 <a href="#zero-configuration">ZeRO-3</a>와 동일한 구성 파일을 공유하며, ZeRO-2 및 ZeRO-1 구성은 추론에 아무런 이점을 제공하지 않으므로 작동하지 않습니다.</p> <p data-svelte-h="svelte-i4o3my">ZeRO Inference를 실행하려면 일반적인 훈련 인수를 <code>TrainingArguments</code> 클래스에 전달하고 <code>--do_eval</code> 인수를 추가합니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->deepspeed --num_gpus=2 your_program.py <normal cl args> --do_eval --deepspeed ds_config.json<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="non-trainer-deepspeed-integration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" 
href="#non-trainer-deepspeed-integration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Trainer 없이 DeepSpeed 사용하기</span></h2> <p data-svelte-h="svelte-1y01xkj">DeepSpeed는 <code>Trainer</code> 클래스가 없는 트랜스포머에서도 작동합니다. 이는 <code>from_pretrained()</code>를 호출할 때 ZeRO-3 매개변수를 수집하고 모델을 여러 GPU에 분할하는 작업만 처리하는 <code>HfDeepSpeedConfig</code>가 처리합니다.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-kevebx">모든 것이 자동으로 처리되기를 원한다면, <code>Trainer</code>와 함께 DeepSpeed를 사용해 보세요! 
<a href="https://www.deepspeed.ai/" rel="nofollow">DeepSpeed 문서</a>를 참조하여 설정 파일에서 매개변수 값을 수동으로 구성해야 합니다(<code>"auto"</code> 값은 사용할 수 없음).</p></div> <p data-svelte-h="svelte-cd2s2l">ZeRO-3를 효율적으로 배포하려면 모델 앞에 <code>HfDeepSpeedConfig</code> 객체를 인스턴스화하고 해당 객체를 유지해야 합니다:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">pretrained model </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">non-pretrained model </div></div> <div class="language-select"><div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span 
class="hljs-keyword">from</span> transformers.integrations <span class="hljs-keyword">import</span> HfDeepSpeedConfig | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModel | |
| <span class="hljs-keyword">import</span> deepspeed | |
| ds_config = {...} <span class="hljs-comment"># deepspeed 설정 객체 또는 파일 경로</span> | |
| <span class="hljs-comment"># Zero 3를 감지하기 위해 모델을 인스턴스화하기 전에 반드시 실행해야 합니다</span> | |
| dschf = HfDeepSpeedConfig(ds_config) <span class="hljs-comment"># 이 객체를 유지하세요.</span> | |
| model = AutoModel.from_pretrained(<span class="hljs-string">"openai-community/gpt2"</span>) | |
| engine = deepspeed.initialize(model=model, config_params=ds_config, ...)<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="non-trainer-zero-inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#non-trainer-zero-inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Trainer 없이 ZeRO Inference 사용하기</span></h3> <p data-svelte-h="svelte-10hxzc">단일 GPU에 모델을 맞출 수 없는 경우 <code>Trainer</code>없이 ZeRO 추론을 실행하려면 추가 GPU를 사용하거나 CPU 메모리로 오프로드를 시도하세요. 여기서 이해해야 할 중요한 뉘앙스는 ZeRO가 설계된 방식에 따라 서로 다른 GPU에서 서로 다른 입력을 병렬로 처리할 수 있다는 것입니다.</p> <p data-svelte-h="svelte-4ddn1l">반드시 확인하세요:</p> <ul data-svelte-h="svelte-1vk101s"><li>GPU 메모리가 충분한 경우 CPU 오프로드를 비활성화합니다(속도가 느려지므로).</li> <li>Ampere 이상의 GPU를 사용하는 경우 bf16을 활성화하면 속도가 빨라집니다. 
이러한 GPU가 없는 경우 오버플로 오류가 발생할 수 있으므로 bf16으로 사전 학습된 모델(T5 모델)을 사용하지 않는 한 fp16을 활성화할 수 있습니다.</li></ul> <p data-svelte-h="svelte-wbpuhy">단일 GPU에 맞지 않는 모델에서 <code>Trainer</code> 없이 ZeRO 추론을 실행하는 방법에 대한 더 나은 아이디어를 얻으려면 다음 스크립트를 살펴보시기 바랍니다.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment">#!/usr/bin/env python</span> | |
| <span class="hljs-comment"># 이 스크립트는 단일 GPU에 모델을 맞출 수 없을 때 추론 모드에서 Deepspeed ZeRO를 사용하는 방법을 보여줍니다.</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># 1. CPU 오프로드와 함께 1개의 GPU 사용</span> | |
| <span class="hljs-comment"># 2. 또는 여러 GPU 사용</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># 먼저 deepspeed를 설치해야 합니다: pip install deepspeed</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># 여기서는 약 15GB의 GPU RAM이 필요한 3B "bigscience/T0_3B" 모델을 사용합니다 - 따라서 1개의 큰 GPU나 2개의</span> | |
| <span class="hljs-comment"># 작은 GPU로 처리할 수 있습니다. 또는 1개의 작은 GPU와 많은 CPU 메모리로도 가능합니다.</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># 약 50GB가 필요한 "bigscience/T0"와 같은 더 큰 모델을 사용하려면, 80GB GPU가 없는 한</span> | |
| <span class="hljs-comment"># 2-4개의 GPU가 필요할 것입니다. 그리고 여러 입력을 한 번에 처리하고 싶다면</span> | |
| <span class="hljs-comment"># 스크립트를 수정하여 더 많은 GPU를 처리할 수 있습니다.</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># 제공된 deepspeed 설정은 CPU 메모리 오프로딩도 활성화하므로, 사용 가능한 CPU 메모리가 많고</span> | |
| <span class="hljs-comment"># 속도 저하를 감수할 수 있다면 일반적으로 단일 GPU에 맞지 않는 모델을 로드할 수 있을 것입니다.</span> | |
| <span class="hljs-comment"># GPU 메모리가 충분하다면 CPU로 오프로드하지 않는 편이 프로그램이 더 빠르게 실행됩니다 - 그럴 때는 오프로드 섹션을 비활성화하세요.</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># 1개의 GPU에 배포하려면:</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># deepspeed --num_gpus 1 t0.py</span> | |
| <span class="hljs-comment"># 또는:</span> | |
| <span class="hljs-comment"># python -m torch.distributed.run --nproc_per_node=1 t0.py</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># 2개의 GPU에 배포하려면:</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># deepspeed --num_gpus 2 t0.py</span> | |
| <span class="hljs-comment"># 또는:</span> | |
| <span class="hljs-comment"># python -m torch.distributed.run --nproc_per_node=2 t0.py</span> | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM | |
| <span class="hljs-keyword">from</span> transformers.integrations <span class="hljs-keyword">import</span> HfDeepSpeedConfig | |
| <span class="hljs-keyword">import</span> deepspeed | |
| <span class="hljs-keyword">import</span> os | |
| <span class="hljs-keyword">import</span> torch | |
| os.environ[<span class="hljs-string">"TOKENIZERS_PARALLELISM"</span>] = <span class="hljs-string">"false"</span> <span class="hljs-comment"># 토크나이저의 병렬 처리에 관한 경고를 피하기 위함입니다.</span> | |
| <span class="hljs-comment"># 분산 환경 설정</span> | |
| local_rank = <span class="hljs-built_in">int</span>(os.getenv(<span class="hljs-string">"LOCAL_RANK"</span>, <span class="hljs-string">"0"</span>)) | |
| world_size = <span class="hljs-built_in">int</span>(os.getenv(<span class="hljs-string">"WORLD_SIZE"</span>, <span class="hljs-string">"1"</span>)) | |
| torch.cuda.set_device(local_rank) | |
| deepspeed.init_distributed() | |
| model_name = <span class="hljs-string">"bigscience/T0_3B"</span> | |
| config = AutoConfig.from_pretrained(model_name) | |
| model_hidden_size = config.d_model | |
| <span class="hljs-comment"># 배치 크기는 world_size로 나누어 떨어져야 하지만, world_size보다 클 수 있습니다</span> | |
| train_batch_size = <span class="hljs-number">1</span> * world_size | |
| <span class="hljs-comment"># ds_config 참고사항</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># - Ampere 이상의 GPU를 사용하는 경우 bf16을 활성화하세요 - 이는 혼합 정밀도로 실행되어</span> | |
| <span class="hljs-comment"># 더 빠를 것입니다.</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># - 오래된 GPU의 경우 fp16을 활성화할 수 있지만, bf16으로 사전 훈련되지 않은 모델에서만 작동합니다 - 예를 들어</span> | |
| <span class="hljs-comment"># 모든 공식 t5 모델은 bf16으로 사전 훈련되었습니다</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># - CPU 오프로드를 원하지 않는다면 offload_param.device를 "none"으로 설정하거나 `offload_param` 섹션을</span> | |
| <span class="hljs-comment"># 완전히 제거하세요</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># - `offload_param`을 사용하는 경우, stage3_param_persistence_threshold를 수동으로 미세 조정하여</span> | |
| <span class="hljs-comment"># 어떤 매개변수가 GPU에 남아있어야 하는지 제어할 수 있습니다 - 값이 클수록 오프로드 크기가 작아집니다</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># Deepspeed 설정에 대한 자세한 정보는 다음을 참조하세요</span> | |
| <span class="hljs-comment"># https://huggingface.co/docs/transformers/main/main_classes/deepspeed</span> | |
| <span class="hljs-comment"># 일관성을 위해 json과 동일한 형식을 유지하되, true/false는 파이썬 표기인 True/False를 사용합니다</span> | |
| <span class="hljs-comment"># fmt: off</span> | |
| ds_config = { | |
| <span class="hljs-string">"fp16"</span>: { | |
| <span class="hljs-string">"enabled"</span>: <span class="hljs-literal">False</span> | |
| }, | |
| <span class="hljs-string">"bf16"</span>: { | |
| <span class="hljs-string">"enabled"</span>: <span class="hljs-literal">False</span> | |
| }, | |
| <span class="hljs-string">"zero_optimization"</span>: { | |
| <span class="hljs-string">"stage"</span>: <span class="hljs-number">3</span>, | |
| <span class="hljs-string">"offload_param"</span>: { | |
| <span class="hljs-string">"device"</span>: <span class="hljs-string">"cpu"</span>, | |
| <span class="hljs-string">"pin_memory"</span>: <span class="hljs-literal">True</span> | |
| }, | |
| <span class="hljs-string">"overlap_comm"</span>: <span class="hljs-literal">True</span>, | |
| <span class="hljs-string">"contiguous_gradients"</span>: <span class="hljs-literal">True</span>, | |
| <span class="hljs-string">"reduce_bucket_size"</span>: model_hidden_size * model_hidden_size, | |
| <span class="hljs-string">"stage3_prefetch_bucket_size"</span>: <span class="hljs-number">0.9</span> * model_hidden_size * model_hidden_size, | |
| <span class="hljs-string">"stage3_param_persistence_threshold"</span>: <span class="hljs-number">10</span> * model_hidden_size | |
| }, | |
| <span class="hljs-string">"steps_per_print"</span>: <span class="hljs-number">2000</span>, | |
| <span class="hljs-string">"train_batch_size"</span>: train_batch_size, | |
| <span class="hljs-string">"train_micro_batch_size_per_gpu"</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">"wall_clock_breakdown"</span>: <span class="hljs-literal">False</span> | |
| } | |
| <span class="hljs-comment"># fmt: on</span> | |
| <span class="hljs-comment"># 다음 줄은 모델의 `from_pretrained` 메소드가 호출될 때</span> | |
| <span class="hljs-comment"># deepspeed.zero.Init를 사용하여 모델을 여러 GPU에 직접 분할하도록 transformers에 지시합니다.</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># **이는 AutoModelForSeq2SeqLM.from_pretrained(model_name)로 모델을 로드하기 전에 실행되어야 합니다**</span> | |
| <span class="hljs-comment">#</span> | |
| <span class="hljs-comment"># 그렇지 않으면 모델이 먼저 정상적으로 로드된 후 포워드 시에만 분할되는데, 이는</span> | |
| <span class="hljs-comment"># 덜 효율적이며 CPU RAM이 부족할 경우 실패할 수 있습니다</span> | |
| dschf = HfDeepSpeedConfig(ds_config) <span class="hljs-comment"># 이 객체를 유지하세요</span> | |
| <span class="hljs-comment"># 이제 모델을 로드할 수 있습니다.</span> | |
| model = AutoModelForSeq2SeqLM.from_pretrained(model_name) | |
| <span class="hljs-comment"># Deepspeed ZeRO를 초기화하고 엔진 객체만 저장</span> | |
| ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[<span class="hljs-number">0</span>] | |
| ds_engine.module.<span class="hljs-built_in">eval</span>() <span class="hljs-comment"># inference</span> | |
| <span class="hljs-comment"># Deepspeed ZeRO는 각 GPU에서 서로 관련 없는 입력을 처리할 수 있습니다. 따라서 2개의 GPU를 사용하면 한 번에 2개의 입력을 처리할 수 있습니다.</span> | |
| <span class="hljs-comment"># GPU를 더 많이 사용하는 경우 그에 맞게 조정하세요.</span> | |
| <span class="hljs-comment"># 물론 처리할 입력이 하나뿐이라면 두 GPU에 동일한 문자열을 전달해야 합니다.</span> | |
| <span class="hljs-comment"># GPU를 하나만 사용하는 경우에는 rank 0만 갖게 됩니다.</span> | |
| rank = torch.distributed.get_rank() | |
| <span class="hljs-keyword">if</span> rank == <span class="hljs-number">0</span>: | |
| text_in = <span class="hljs-string">"Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"</span> | |
| <span class="hljs-keyword">elif</span> rank == <span class="hljs-number">1</span>: | |
| text_in = <span class="hljs-string">"Is this review positive or negative? Review: this is the worst restaurant ever"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| inputs = tokenizer.encode(text_in, return_tensors=<span class="hljs-string">"pt"</span>).to(device=local_rank) | |
| <span class="hljs-keyword">with</span> torch.no_grad(): | |
| outputs = ds_engine.module.generate(inputs, synced_gpus=<span class="hljs-literal">True</span>) | |
| text_out = tokenizer.decode(outputs[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"rank<span class="hljs-subst">{rank}</span>:\n in=<span class="hljs-subst">{text_in}</span>\n out=<span class="hljs-subst">{text_out}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lqs975">스크립트를 t0.py로 저장하고 실행합니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->$ deepspeed --num_gpus 2 t0.py | |
| rank0: | |
| <span class="hljs-keyword">in</span>=Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy | |
| out=Positive | |
| rank1: | |
| <span class="hljs-keyword">in</span>=Is this review positive or negative? Review: this is the worst restaurant ever | |
| out=negative<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kkj86z">이것은 매우 기본적인 예시이므로 사용 사례에 맞게 조정할 수 있습니다.</p> <h3 class="relative group"><a id="generate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#generate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>생성</span></h3> <p data-svelte-h="svelte-1a4l76t">생성에 ZeRO-3와 함께 여러 개의 GPU를 사용하려면 <code>generate()</code> 메서드에서 <code>synced_gpus=True</code>를 설정하여 GPU를 동기화해야 합니다. 
그렇지 않으면 한 GPU가 다른 GPU보다 먼저 생성을 완료하면 나머지 GPU가 먼저 완료한 GPU로부터 가중치 샤드를 받지 못하여 전체 시스템이 중단됩니다.</p> <p data-svelte-h="svelte-pc4k6q">트랜스포머>=4.28의 경우, 생성 중에 여러 개의 GPU가 감지되면 <code>synced_gpus</code>가 자동으로 <code>True</code>로 설정됩니다.</p> <h2 class="relative group"><a id="troubleshoot" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#troubleshoot"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>트러블슈팅</span></h2> <p data-svelte-h="svelte-13tr4mq">문제가 발생하면 DeepSpeed가 문제의 원인이 아닌 경우가 많으므로(아주 명백하고 예외적으로 DeepSpeed 모듈을 볼 수 있는 경우가 아니라면) DeepSpeed가 문제의 원인인지 고려해야 합니다! 첫 번째 단계는 DeepSpeed 없이 설정을 다시 시도하고 문제가 지속되면 문제를 신고하는 것입니다. 
문제가 핵심적인 DeepSpeed 문제이고 transformers와 관련이 없는 경우, <a href="https://github.com/microsoft/DeepSpeed" rel="nofollow">DeepSpeed 리포지토리</a>에서 이슈를 개설하세요.</p> <p data-svelte-h="svelte-1k5hfn8">transformers와 관련된 이슈를 개설할 때에는 다음 정보를 제공해 주세요:</p> <ul data-svelte-h="svelte-ar78mb"><li>전체 DeepSpeed 구성 파일</li></ul> <ul data-svelte-h="svelte-1mjmuca"><li><code>Trainer</code>의 명령줄 인수, 또는 <code>Trainer</code> 설정을 직접 작성하는 경우 <code>TrainingArguments</code> 인수(관련 없는 항목이 수십 개 있는 <code>TrainingArguments</code>는 덤프하지 마세요).</li></ul> <ul data-svelte-h="svelte-k10cds"><li>다음 코드의 출력 결과:</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python -c <span class="hljs-string">'import torch; print(f"torch: {torch.__version__}")'</span> | |
| python -c <span class="hljs-string">'import transformers; print(f"transformers: {transformers.__version__}")'</span> | |
| python -c <span class="hljs-string">'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'</span><!-- HTML_TAG_END --></pre></div> <ul data-svelte-h="svelte-1v612"><li><p>문제를 재현할 수 있는 Google Colab 노트북 링크</p></li> <li><p>노트북 제공이 불가능할 경우, 표준(사용자 지정이 아닌) 데이터 세트와 가능하면 기존 예제를 사용하여 문제를 재현할 수 있는 방법을 알려 주세요.</p></li></ul> <p data-svelte-h="svelte-mhpkv9">다음 섹션에서는 가장 일반적인 두 가지 문제를 해결하기 위한 가이드를 제공합니다.</p> <h3 class="relative group"><a id="deepspeed-process-killed-at-startup" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deepspeed-process-killed-at-startup"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DeepSpeed 프로세스가 시작 단계에서 종료되었을 경우</span></h3> <p data-svelte-h="svelte-1dp6ea1">시작 단계에서 트레이스백 없이 DeepSpeed 프로세스가 종료되면 일반적으로 프로그램이 시스템에 있는 것보다 많은 CPU 메모리를 할당하려고 시도했거나 프로세스가 허용된 것보다 많은 CPU 메모리를 할당하려고 시도하여 OS 커널이 프로세스를 종료했음을 의미합니다. 
이 경우 구성 파일에 <code>offload_optimizer</code>, <code>offload_param</code> 또는 둘 다 CPU로 오프로드하도록 구성되어 있는지 확인하세요.</p> <p data-svelte-h="svelte-1tcm046">NVMe 및 ZeRO-3를 설정한 경우 NVMe로 오프로드를 실험해 보세요(모델의 메모리 요구 사항을 <a href="https://deepspeed.readthedocs.io/en/latest/memory.html" rel="nofollow">확인</a>하세요).</p> <h3 class="relative group"><a id="nan-loss" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nan-loss"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>NaN 손실</span></h3> <p data-svelte-h="svelte-1yf4ua2">모델을 bf16으로 사전 훈련한 다음 fp16으로 사용하려고 할 때 NaN 손실이 발생하는 경우가 많습니다(특히 TPU 훈련 모델에 해당). 이 문제를 해결하려면 하드웨어가 이를 지원하는 경우(TPU, Ampere GPU 이상) fp32 또는 bf16을 사용하세요.</p> <p data-svelte-h="svelte-1njbvfu">다른 문제는 fp16 사용과 관련이 있을 수 있습니다. 
예를 들어 이것이 fp16 구성인 경우입니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{ | |
| <span class="hljs-attr">"fp16":</span> { | |
| <span class="hljs-attr">"enabled":</span> <span class="hljs-string">"auto"</span>, | |
| <span class="hljs-attr">"loss_scale":</span> <span class="hljs-number">0</span>, | |
| <span class="hljs-attr">"loss_scale_window":</span> <span class="hljs-number">1000</span>, | |
| <span class="hljs-attr">"initial_scale_power":</span> <span class="hljs-number">16</span>, | |
| <span class="hljs-attr">"hysteresis":</span> <span class="hljs-number">2</span>, | |
| <span class="hljs-attr">"min_loss_scale":</span> <span class="hljs-number">1</span> | |
| } | |
| }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ywaimq">로그에 다음과 같은 <code>OVERFLOW!</code> 메시지가 표시될 수 있습니다:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->0%| | 0/189 [00:00&lt;?, ?it/s] | |
| [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 262144 | |
| 1%|▌ | 1/189 [00:00&lt;01:26, 2.17it/s] | |
| [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 131072.0 | |
| 1%|█▏ | |
| [...] | |
| [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1 | |
| 14%|████████████████▌ | 27/189 [00:14&lt;01:13, 2.21it/s] | |
| [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1 | |
| 15%|█████████████████▏ | 28/189 [00:14&lt;01:13, 2.18it/s] | |
| [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1 | |
| 15%|█████████████████▊ | 29/189 [00:15&lt;01:13, 2.18it/s] | |
| [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1 | |
| [...]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-nno03d">이는 DeepSpeed 손실 스케일러가 손실 오버플로를 극복할 수 있는 스케일링 계수를 찾을 수 없음을 의미합니다. 이 문제를 해결하려면 <code>initial_scale_power</code> 값을 더 높게 설정하세요(일반적으로 32가 적절합니다).</p> <h2 class="relative group"><a id="resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>리소스</span></h2> <p data-svelte-h="svelte-zgt8pc">DeepSpeed ZeRO는 제한된 GPU 리소스로 추론을 위해 매우 큰 모델을 훈련하고 로드하는 강력한 기술로, 누구나 쉽게 사용할 수 있습니다. 
DeepSpeed에 대해 자세히 알아보려면 <a href="https://www.microsoft.com/en-us/research/search/?q=deepspeed" rel="nofollow">블로그 포스트</a>, <a href="https://www.deepspeed.ai/getting-started/" rel="nofollow">공식 문서</a>, <a href="https://github.com/microsoft/deepspeed" rel="nofollow">깃허브 리포지토리</a>를 참조하세요.</p> <p data-svelte-h="svelte-3iitxe">다음 논문도 ZeRO에 대해 자세히 알아볼 수 있는 훌륭한 자료입니다:</p> <ul data-svelte-h="svelte-o0yfva"><li><a href="https://hf.co/papers/1910.02054" rel="nofollow">ZeRO: Memory Optimizations Toward Training Trillion Parameter Models</a></li> <li><a href="https://hf.co/papers/2101.06840" rel="nofollow">ZeRO-Offload: Democratizing Billion-Scale Model Training</a></li> <li><a href="https://hf.co/papers/2104.07857" rel="nofollow">ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/ko/deepspeed.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_1hrx8 = { | |
| assets: "/docs/transformers/main/ko", | |
| base: "/docs/transformers/main/ko", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/transformers/main/ko/_app/immutable/entry/start.9aa88961.js"), | |
| import("/docs/transformers/main/ko/_app/immutable/entry/app.84fb67c3.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 16], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 152 kB
- Xet hash:
- df9c8187b4b3d53b434f285cbaf73fd3446411af1c6171710b1e39aaa2fe99cc
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.