Buckets:

hf-doc-build/doc / transformers /main /ko /optimizers.html
HuggingFaceDocBuilder's picture
download
raw
38.3 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;옵티마이저&quot;,&quot;local&quot;:&quot;optimizers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;APOLLO&quot;,&quot;local&quot;:&quot;apollo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;GrokAdamW&quot;,&quot;local&quot;:&quot;grokadamw&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;LOMO&quot;,&quot;local&quot;:&quot;lomo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Schedule Free&quot;,&quot;local&quot;:&quot;schedule-free&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;StableAdamW&quot;,&quot;local&quot;:&quot;stableadamw&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/ko/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/entry/start.5e38b416.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/scheduler.53228c21.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/singletons.0f395d7b.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/index.e93d0901.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/paths.6e1a5eaf.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/entry/app.240e8e3b.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/preload-helper.cb103237.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/index.3db2ce32.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/nodes/0.8e23ad79.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/nodes/127.e56abe0f.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/CopyLLMTxtMenu.1327b590.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.49b88d99.js">
<link rel="modulepreload" href="/docs/transformers/main/ko/_app/immutable/chunks/CodeBlock.ada04ea6.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;옵티마이저&quot;,&quot;local&quot;:&quot;optimizers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;APOLLO&quot;,&quot;local&quot;:&quot;apollo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;GrokAdamW&quot;,&quot;local&quot;:&quot;grokadamw&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;LOMO&quot;,&quot;local&quot;:&quot;lomo&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Schedule Free&quot;,&quot;local&quot;:&quot;schedule-free&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;StableAdamW&quot;,&quot;local&quot;:&quot;stableadamw&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="optimizers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimizers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>옵티마이저</span></h1> <p data-svelte-h="svelte-gu6sa1">Transformers는 AdamW 및 AdaFactor와 같은 두 가지 기본 옵티마이저를 제공합니다. 또한, 보다 특화된 옵티마이저와의 통합도 지원합니다. 원하는 옵티마이저를 제공하는 라이브러리를 설치한 후, <a href="/docs/transformers/main/ko/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a><code>optim</code> 파라미터에 해당 옵티마이저명을 지정하시면 됩니다.</p> <p data-svelte-h="svelte-zjc1mp">이 가이드에서는 아래에 제시된 <a href="/docs/transformers/main/ko/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>와 함께 <a href="/docs/transformers/main/ko/main_classes/trainer#transformers.Trainer">Trainer</a>에서 이러한 옵티마이저를 사용하는 방법을 안내합니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TrainingArguments, AutoTokenizer, AutoModelForCausalLM, Trainer
args = TrainingArguments(
output_dir=<span class="hljs-string">&quot;./test-optimizer&quot;</span>,
max_steps=<span class="hljs-number">1000</span>,
per_device_train_batch_size=<span class="hljs-number">4</span>,
logging_strategy=<span class="hljs-string">&quot;steps&quot;</span>,
logging_steps=<span class="hljs-number">1</span>,
learning_rate=<span class="hljs-number">2e-5</span>,
save_strategy=<span class="hljs-string">&quot;no&quot;</span>,
run_name=<span class="hljs-string">&quot;optimizer-name&quot;</span>,
)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="apollo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#apollo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>APOLLO</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->pip install apollo-torch<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6dncrv"><a href="https://github.com/zhuhanqing/APOLLO" rel="nofollow">Approximated Gradient Scaling for Memory Efficient LLM Optimization (APOLLO)</a> 는 사전 학습과 미세 조정 모두에 대해 전체 파라미터 학습을 지원하는, 메모리 효율적인 옵티마이저입니다. 이 옵티마이저는 SGD와 유사한 메모리 효율성으로 AdamW 수준의 성능을 유지합니다. 극한의 메모리 효율성이 필요하다면 APOLLO의 rank 1 변형인 APOLLO-Mini를 사용할 수 있습니다. APOLLO 옵티마이저는 다음과 같은 특징을 지원합니다.</p> <ul data-svelte-h="svelte-1diubiz"><li>초저랭크(rank) 효율성. <a href="./trainer#galore">GaLoRE</a>보다 훨씬 낮은 랭크를 사용할 수 있으며, 랭크 1로도 충분합니다.</li> <li>고비용 SVD 연산 회피. APOLLO는 학습 중단(training stalls)을 피하기 위해 무작위 투영(random projections)을 활용합니다.</li></ul> <p data-svelte-h="svelte-14hh5ec">학습할 레이어를 지정하려면 <code>optim_target_modules</code> 파라미터를 사용하세요.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-diff "><!-- HTML_TAG_START -->import torch
from transformers import TrainingArguments
args = TrainingArguments(
output_dir=&quot;./test-apollo&quot;,
max_steps=100,
per_device_train_batch_size=2,
<span class="hljs-addition">+ optim=&quot;apollo_adamw&quot;,</span>
<span class="hljs-addition">+ optim_target_modules=[r&quot;.*.attn.*&quot;, r&quot;.*.mlp.*&quot;],</span>
logging_strategy=&quot;steps&quot;,
logging_steps=1,
learning_rate=2e-5,
save_strategy=&quot;no&quot;,
run_name=&quot;apollo_adamw&quot;,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dtzstx">추가적인 학습 옵션이 필요하다면, <code>optim_args</code>를 사용하여 <code>rank</code>, <code>scale</code> 등과 같은 하이퍼파라미터를 설정할 수 있습니다. 사용 가능한 하이퍼파라미터 목록은 아래 표를 참고하세요.</p> <blockquote class="tip" data-svelte-h="svelte-wqadcx"><p><code>scale</code> 파라미터는 <code>n/r</code>으로 설정할 수 있습니다. 이때, <code>n</code>은 원본 공간 차원이고 <code>r</code>은 저랭크(low-rank) 공간 차원입니다. <code>scale</code>을 기본값으로 유지하면서 학습률만 조정해도 비슷한 효과를 얻을 수 있습니다.</p></blockquote> <table data-svelte-h="svelte-10hlhe4"><thead><tr><th>매개 변수</th> <th>설명</th> <th>APOLLO</th> <th>APOLLO-Mini</th></tr></thead> <tbody><tr><td>rank</td> <td>그래디언트 스케일링을 위한 보조 부분 공간(sub-space)의 랭크</td> <td>256</td> <td>1</td></tr> <tr><td>scale_type</td> <td>스케일링 인자(factor)를 적용하는 방법</td> <td><code>channel</code> (채널별 스케일링)</td> <td><code>tensor</code> (텐서별 스케일링)</td></tr> <tr><td>scale</td> <td>그래디언트 업데이트를 조정하여 학습을 안정화</td> <td>1.0</td> <td>128</td></tr> <tr><td>update_proj_gap</td> <td>투영 행렬(projection matrices)을 업데이트하기 전 단계(step) 수</td> <td>200</td> <td>200</td></tr> <tr><td>proj</td> <td>투영(projection) 유형</td> <td><code>random</code></td> <td><code>random</code></td></tr></tbody></table> <p data-svelte-h="svelte-1ovcbg4">아래 예시는 APOLLO-Mini 옵티마이저를 활성화하는 방법입니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-py "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TrainingArguments
args = TrainingArguments(
output_dir=<span class="hljs-string">&quot;./test-apollo_mini&quot;</span>,
max_steps=<span class="hljs-number">100</span>,
per_device_train_batch_size=<span class="hljs-number">2</span>,
optim=<span class="hljs-string">&quot;apollo_adamw&quot;</span>,
optim_target_modules=[<span class="hljs-string">r&quot;.*.attn.*&quot;</span>, <span class="hljs-string">r&quot;.*.mlp.*&quot;</span>],
optim_args=<span class="hljs-string">&quot;proj=random,rank=1,scale=128.0,scale_type=tensor,update_proj_gap=200&quot;</span>,
)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="grokadamw" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#grokadamw"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>GrokAdamW</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->pip install grokadamw<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-j5zj55"><a href="https://github.com/cognitivecomputations/grokadamw" rel="nofollow">GrokAdamW</a><em>grokking</em> 현상(기울기가 천천히 변화해 일반화가 지연되는 현상)에서 성능이 향상되는 모델들에게 적합하도록 설계된 옵티마이저입니다. GrokAdamW는 더 뛰어난 성능과 안정성을 위해 고급 최적화 기술이 필요한 모델에 특히 유용합니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-diff "><!-- HTML_TAG_START -->import torch
from transformers import TrainingArguments
args = TrainingArguments(
output_dir=&quot;./test-grokadamw&quot;,
max_steps=1000,
per_device_train_batch_size=4,
<span class="hljs-addition">+ optim=&quot;grokadamw&quot;,</span>
logging_strategy=&quot;steps&quot;,
logging_steps=1,
learning_rate=2e-5,
save_strategy=&quot;no&quot;,
run_name=&quot;grokadamw&quot;,
)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="lomo" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lomo"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LOMO</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->pip install lomo-optim<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ih0938"><a href="https://github.com/OpenLMLab/LOMO" rel="nofollow">Low-Memory Optimization (LOMO)</a>는 LLM의 전체 파라미터를 메모리 효율적으로 미세 조정하기 위해 설계된 옵티마이저 제품군이며, <a href="https://huggingface.co/papers/2306.09782" rel="nofollow">LOMO</a><a href="https://hf.co/papers/2310.10195" rel="nofollow">AdaLomo</a> 두 가지 버전이 있습니다. 두 LOMO 옵티마이저는 모두 메모리 사용량을 줄이기 위해 그래디언트 계산과 매개변수 업데이트를 한 단계로 통합합니다. AdaLomo는 LOMO를 기반으로, Adam 옵티마이저처럼 각 매개변수에 대해 적응형 학습률을 적용하는 기능이 추가되었습니다.</p> <blockquote class="tip" data-svelte-h="svelte-1m5ijnx"><p>더 나은 성능과 높은 처리량을 위해서는 <code>grad_norm</code> 없이 AdaLomo를 사용하는 것을 권장합니다.</p></blockquote> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-diff "><!-- HTML_TAG_START -->args = TrainingArguments(
output_dir=&quot;./test-lomo&quot;,
max_steps=1000,
per_device_train_batch_size=4,
<span class="hljs-addition">+ optim=&quot;adalomo&quot;,</span>
gradient_checkpointing=True,
logging_strategy=&quot;steps&quot;,
logging_steps=1,
learning_rate=2e-6,
save_strategy=&quot;no&quot;,
run_name=&quot;adalomo&quot;,
)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="schedule-free" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#schedule-free"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Schedule Free</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->pip install schedulefree<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1517aof"><a href="https://hf.co/papers/2405.15682" rel="nofollow">Schedule Free optimizer (SFO)</a>는 기본 옵티마이저의 모멘텀 대신 평균화(averaging)와 보간(interpolation)을 조합하여 사용합니다. 덕분에 기존의 학습률 스케줄러와 달리, SFO는 학습률을 점진적으로 낮추는 절차가 아예 필요 없습니다.</p> <p data-svelte-h="svelte-1f431bt">SFO는 RAdam(<code>schedule_free_radam</code>), AdamW(<code>schedule_free_adamw</code>), SGD(<code>schedule_free_sgd</code>) 옵티마이저를 지원합니다. RAdam 스케줄러는 <code>warmup_steps</code>.</p> <p data-svelte-h="svelte-1da57q5">기본적으로 <code>lr_scheduler_type=&quot;constant&quot;</code>로 설정하는 것을 권장합니다. 다른 <code>lr_scheduler_type</code> 값도 동작할 순 있으나, SFO 옵티마이저와 다른 학습률 스케줄을 함께 사용하면 SFO의 의도된 동작과 성능에 영향을 줄 수 있습니다.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-diff "><!-- HTML_TAG_START -->args = TrainingArguments(
output_dir=&quot;./test-schedulefree&quot;,
max_steps=1000,
per_device_train_batch_size=4,
<span class="hljs-addition">+ optim=&quot;schedule_free_radamw&quot;,</span>
<span class="hljs-addition">+ lr_scheduler_type=&quot;constant&quot;,</span>
gradient_checkpointing=True,
logging_strategy=&quot;steps&quot;,
logging_steps=1,
learning_rate=2e-6,
save_strategy=&quot;no&quot;,
run_name=&quot;sfo&quot;,
)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="stableadamw" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#stableadamw"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>StableAdamW</span></h2> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->pip install torch-optimi<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1s83iy5"><a href="https://huggingface.co/papers/2304.13013" rel="nofollow">StableAdamW</a>는 AdamW와 AdaFactor를 결합한 하이브리드 옵티마이저입니다. AdaFactor의 업데이트 클리핑(update clipping)이 AdamW에 도입되어 별도의 그래디언트 클리핑(gradient clipping)이 필요 없습니다. 그 외의 동작에서는 AdamW와 완벽히 호환되는 대체제로 사용할 수 있습니다.</p> <blockquote class="tip" data-svelte-h="svelte-11tti9v"><p>배치(batch) 크기가 크거나 훈련 손실(training loss)이 계속해서 급격하게 변동한다면, beta_2 값을 [0.95, 0.99] 사이로 줄여보세요.</p></blockquote> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-diff "><!-- HTML_TAG_START -->args = TrainingArguments(
output_dir=&quot;./test-stable-adamw&quot;,
max_steps=1000,
per_device_train_batch_size=4,
<span class="hljs-addition">+ optim=&quot;stable_adamw&quot;,</span>
gradient_checkpointing=True,
logging_strategy=&quot;steps&quot;,
logging_steps=1,
learning_rate=2e-6,
save_strategy=&quot;no&quot;,
run_name=&quot;stable-adamw&quot;,
)<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/ko/optimizers.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_igaaa = {
assets: "/docs/transformers/main/ko",
base: "/docs/transformers/main/ko",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/transformers/main/ko/_app/immutable/entry/start.5e38b416.js"),
import("/docs/transformers/main/ko/_app/immutable/entry/app.240e8e3b.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 127],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
38.3 kB
·
Xet hash:
ca840f408376b39d8c5d100ac7aef46b44be793f42075a75da284f5886f47ecd

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.