Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / course /pr_1095 /my /chapter12 /4.html

rtrm

about 1 month ago

download

raw

61.7 kB

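	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;TRL တွင် GRPO ကို အကောင်အထည်ဖော်ခြင်း&quot;,&quot;local&quot;:&quot;implementing-grpo-in-trl&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;အဓိက အစိတ်အပိုင်းများ&quot;,&quot;local&quot;:&quot;အဓက-အစတအပငမ&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;၁။ Dataset Format&quot;,&quot;local&quot;:&quot;၁-dataset-format&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၂။ Reward Function&quot;,&quot;local&quot;:&quot;၂-reward-function&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၃။ Training Configuration&quot;,&quot;local&quot;:&quot;၃-training-configuration&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အောင်မြင်မှုအတွက် အကြံပြုချက်များ&quot;,&quot;local&quot;:&quot;အငမငမအတက-အကပခကမ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Reward Function ဒီဇိုင်း&quot;,&quot;local&quot;:&quot;reward-function-ဒဇင&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;၁။ Length-Based Rewards&quot;,&quot;local&quot;:&quot;၁-length-based-rewards&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;၂။ Verifiable Tasks အတွက် Rule-Based Rewards&quot;,&quot;local&quot;:&quot;၂-verifiable-tasks-အတက-rule-based-rewards&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;၃။ Format-Based Rewards&quot;,&quot;local&quot;:&quot;၃-format-based-rewards&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ဒါပါပဲ!&quot;,&quot;local&quot;:&quot;ဒပပ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">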
	<link href="/docs/course/pr_1095/my/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/entry/start.8e25cab6.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/scheduler.893fe8c9.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/singletons.ba455c5c.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/index.bce52c8a.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/paths.9a7be869.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/entry/app.b12ce275.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/preload-helper.b5ee8f74.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/index.b1df2166.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/nodes/0.77c840e7.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/nodes/32.38cf2d36.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.e6d31e72.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/CodeBlock.abb4f40e.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"TRL တွင် GRPO ကို အကောင်အထည်ဖော်ခြင်း","local":"implementing-grpo-in-trl","sections":[{"title":"အဓိက အစိတ်အပိုင်းများ","local":"အဓက-အစတအပငမ","sections":[{"title":"၁။ Dataset Format","local":"၁-dataset-format","sections":[],"depth":3},{"title":"၂။ Reward Function","local":"၂-reward-function","sections":[],"depth":3},{"title":"၃။ Training Configuration","local":"၃-training-configuration","sections":[],"depth":3}],"depth":2},{"title":"အောင်မြင်မှုအတွက် အကြံပြုချက်များ","local":"အငမငမအတက-အကပခကမ","sections":[],"depth":2},{"title":"Reward Function ဒီဇိုင်း","local":"reward-function-ဒဇင","sections":[{"title":"၁။ Length-Based Rewards","local":"၁-length-based-rewards","sections":[],"depth":3}],"depth":2},{"title":"၂။ Verifiable Tasks အတွက် Rule-Based Rewards","local":"၂-verifiable-tasks-အတက-rule-based-rewards","sections":[],"depth":2},{"title":"၃။ Format-Based Rewards","local":"၃-format-based-rewards","sections":[],"depth":2},{"title":"ဒါပါပဲ!","local":"ဒပပ","sections":[],"depth":2},{"title":"ဝေါဟာရ ရှင်းလင်းချက် (Glossary)","local":"ဝဟရ-ရငလငခက-glossary","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg 
class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="implementing-grpo-in-trl" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#implementing-grpo-in-trl"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 
79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>TRL တွင် GRPO ကို အကောင်အထည်ဖော်ခြင်း</span></h1> <p data-svelte-h="svelte-ut8yz8">ဒီစာမျက်နှာမှာ၊ Transformer Reinforcement Learning (TRL) library ကို အသုံးပြုပြီး Group Relative Policy Optimization (GRPO) ကို ဘယ်လိုအကောင်အထည်ဖော်ရမလဲဆိုတာ လေ့လာသွားမှာပါ။ ကျွန်တော်တို့ဟာ code ကို အနည်းဆုံးနဲ့ လက်တွေ့အကောင်အထည်ဖော်ခြင်းကို အဓိကထားမှာပါ။</p> <p data-svelte-h="svelte-1v6b3yr">GRPO ရဲ့ အဓိကသဘောတရားတွေကို TRL ရဲ့ GRPOTrainer မှာ ဘယ်လိုပါဝင်နေလဲဆိုတာကို လေ့လာသွားမှာဖြစ်ပြီး၊ တရားဝင် TRL documentation က snippets တွေကို လမ်းညွှန်အဖြစ် အသုံးပြုပါမယ်။</p> <blockquote class="tip" data-svelte-h="svelte-1vg1vtz"><p>ဒီအခန်းက TRL စတင်လေ့လာသူတွေအတွက် ရည်ရွယ်ပါတယ်။ သင် TRL ကို ကျွမ်းကျင်ပြီးသားဆိုရင်၊ GRPO ရဲ့ <a href="https://github.com/huggingface/open-r1/blob/main/src/open_r1/grpo.py" rel="nofollow">Open R1 implementation</a> ကိုလည်း လေ့လာကြည့်နိုင်ပါတယ်။</p></blockquote> <p data-svelte-h="svelte-j0e1y5">ပထမဆုံးအနေနဲ့၊ GRPO algorithm ရဲ့ အရေးကြီးတဲ့ သဘောတရားအချို့ကို ပြန်လည်သတိရကြရအောင်။</p> <ul data-svelte-h="svelte-1htq713"><li><strong>Group Formation</strong>: model က prompt တစ်ခုစီအတွက် completions များစွာကို ထုတ်လုပ်ပါတယ်။</li> <li><strong>Preference Learning</strong>: model က completions အုပ်စုတွေကို နှိုင်းယှဉ်တဲ့ reward function ကနေ သင်ယူပါတယ်။</li> <li><strong>Training Configuration</strong>: model က training process ကို ထိန်းချုပ်ဖို့ configuration တစ်ခုကို အသုံးပြုပါတယ်။</li></ul> <p data-svelte-h="svelte-1s61muj">GRPO ကို အကောင်အထည်ဖော်ဖို့ ကျွန်တော်တို့ ဘာတွေလုပ်ဖို့ လိုအပ်မလဲ။</p> <ul data-svelte-h="svelte-4xzpam"><li>prompts များ၏ dataset တစ်ခုကို သတ်မှတ်ပါ။</li> <li>completions စာရင်းကို ယူပြီး rewards စာရင်းကို ပြန်ပေးမယ့် reward function တစ်ခုကို သတ်မှတ်ပါ။</li> 
<li>training process ကို GRPOConfig တစ်ခုဖြင့် configure လုပ်ပါ။</li> <li>GRPOTrainer ကို အသုံးပြုပြီး model ကို train လုပ်ပါ။</li></ul> <p data-svelte-h="svelte-sx3u1v">GRPO training ကို စတင်ဖို့အတွက် အနိမ့်ဆုံး ဥပမာတစ်ခုကတော့ အောက်ပါအတိုင်းပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer, GRPOConfig
	<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset

	<span class="hljs-comment"># 1. သင့် dataset ကို load လုပ်ပါ</span>
	dataset = load_dataset(<span class="hljs-string">"your_dataset"</span>, split=<span class="hljs-string">"train"</span>)


	<span class="hljs-comment"># 2. ရိုးရှင်းသော reward function တစ်ခုကို သတ်မှတ်ပါ</span>
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_func</span>(<span class="hljs-params">completions, **kwargs</span>):
	    <span class="hljs-string">"""ဥပမာ- ပိုရှည်သော completions များကို ဆုချပါ"""</span>
	    <span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(<span class="hljs-built_in">len</span>(completion)) <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]


	<span class="hljs-comment"># 3. Training ကို Configure လုပ်ပါ</span>
	training_args = GRPOConfig(
	    output_dir=<span class="hljs-string">"output"</span>,
	    num_train_epochs=<span class="hljs-number">3</span>,
	    per_device_train_batch_size=<span class="hljs-number">4</span>,
	    gradient_accumulation_steps=<span class="hljs-number">2</span>,
	    logging_steps=<span class="hljs-number">10</span>,
	)

	<span class="hljs-comment"># 4. စတင်ပြီး train လုပ်ပါ</span>
	trainer = GRPOTrainer(
	    model=<span class="hljs-string">"your_model"</span>, <span class="hljs-comment"># ဥပမာ- "Qwen/Qwen2-0.5B-Instruct"</span>
	    args=training_args,
	    train_dataset=dataset,
	    reward_funcs=reward_func,
	)
	trainer.train()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="အဓက-အစတအပငမ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#အဓက-အစတအပငမ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အဓိက အစိတ်အပိုင်းများ</span></h2> <h3 class="relative group"><a id="၁-dataset-format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၁-dataset-format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 
28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၁။ Dataset Format</span></h3> <p data-svelte-h="svelte-yzpay0">သင့် dataset တွင် model က တုံ့ပြန်မည့် prompts များ ပါဝင်သင့်ပါတယ်။ GRPO trainer က prompt တစ်ခုစီအတွက် completions များစွာကို ထုတ်လုပ်ပြီး ၎င်းတို့ကို နှိုင်းယှဉ်ဖို့ reward function ကို အသုံးပြုပါလိမ့်မယ်။</p> <h3 class="relative group"><a id="၂-reward-function" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၂-reward-function"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၂။ Reward Function</span></h3> <p data-svelte-h="svelte-19sjwwj">reward function ဟာ အရေးကြီးပါတယ်။ ဒါက model က ဘယ်လိုသင်ယူတယ်ဆိုတာကို ဆုံးဖြတ်ပါတယ်။ လက်တွေ့ဥပမာ နှစ်ခုကတော့ အောက်ပါအတိုင်းပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># ဥပမာ ၁- completion အရှည်ပေါ်မူတည်သော reward</span>
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_length</span>(<span class="hljs-params">completions, **kwargs</span>):
	    <span class="hljs-keyword">return</span> [<span class="hljs-built_in">float</span>(<span class="hljs-built_in">len</span>(completion)) <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]


	<span class="hljs-comment"># ဥပမာ ၂- pattern ကို ကိုက်ညီမှုအပေါ်မူတည်သော reward</span>
	<span class="hljs-keyword">import</span> re


	<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_format</span>(<span class="hljs-params">completions, **kwargs</span>):
	    pattern = <span class="hljs-string">r"^&lt;think&gt;.*?&lt;/think&gt;&lt;answer&gt;.*?&lt;/answer&gt;$"</span>
	    <span class="hljs-keyword">return</span> [<span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> re.<span class="hljs-keyword">match</span>(pattern, c) <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span> <span class="hljs-keyword">for</span> c <span class="hljs-keyword">in</span> completions]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="၃-training-configuration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၃-training-configuration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၃။ Training Configuration</span></h3> <p data-svelte-h="svelte-1mit6md"><code>GRPOConfig</code> တွင် ထည့်သွင်းစဉ်းစားရမည့် အဓိက parameters များ -</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_args = GRPOConfig(
	    <span class="hljs-comment"># မရှိမဖြစ်လိုအပ်သော parameters များ</span>
	    output_dir=<span class="hljs-string">"output"</span>,
	    num_train_epochs=<span class="hljs-number">3</span>,
	    num_generations=<span class="hljs-number">4</span>, <span class="hljs-comment"># prompt တစ်ခုစီအတွက် ထုတ်လုပ်မည့် completions အရေအတွက်</span>
	    per_device_train_batch_size=<span class="hljs-number">4</span>, <span class="hljs-comment"># generations အားလုံးကို device batch တစ်ခုတည်းမှာ ရယူလိုပါသည်။</span>
	    <span class="hljs-comment"># ရွေးချယ်နိုင်သော သို့သော် အသုံးဝင်သော</span>
	    gradient_accumulation_steps=<span class="hljs-number">2</span>,
	    learning_rate=<span class="hljs-number">1e-5</span>,
	    logging_steps=<span class="hljs-number">10</span>,
	    <span class="hljs-comment"># GRPO အတွက် သီးခြား (ရွေးချယ်နိုင်သော)</span>
	    use_vllm=<span class="hljs-literal">True</span>, <span class="hljs-comment"># generation ကို အရှိန်မြှင့်ရန်</span>
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cie4om"><code>num_generations</code> parameter က GRPO အတွက် အထူးအရေးကြီးပါတယ်။ ဒါက group size ကို သတ်မှတ်ပါတယ်။ ဆိုလိုတာက model က prompt တစ်ခုစီအတွက် မတူညီတဲ့ completions ဘယ်နှစ်ခု ထုတ်လုပ်မလဲဆိုတာပါပဲ။ ဒါက အခြား RL methods တွေနဲ့ ကွာခြားတဲ့ အဓိကအချက်ပါ။</p> <ul data-svelte-h="svelte-1hqrea5"><li><strong>အလွန်နည်းပါးလွန်းခြင်း (ဥပမာ- ၂-၃ ခု)</strong>- အဓိပ္ပာယ်ရှိသော နှိုင်းယှဉ်မှုများအတွက် လုံလောက်သော ကွဲပြားမှု (diversity) ကို မပေးနိုင်ပါ။</li> <li><strong>အကြံပြုထားသော (၄-၁၆ ခု)</strong>- ကွဲပြားမှုနှင့် တွက်ချက်မှု ထိရောက်မှု (computational efficiency) အကြား ကောင်းမွန်သော ဟန်ချက်ကို ပေးပါသည်။</li> <li><strong>ပိုမိုကြီးမားသော တန်ဖိုးများ</strong>- သင်ယူမှုကို ပိုမိုကောင်းမွန်စေနိုင်သော်လည်း တွက်ချက်မှု ကုန်ကျစရိတ်ကို သိသိသာသာ တိုးမြှင့်စေသည်။</li></ul> <p data-svelte-h="svelte-b2ktzd">group size ကို သင့်ရဲ့ computational resources တွေနဲ့ task ရဲ့ ရှုပ်ထွေးမှုပေါ် မူတည်ပြီး ရွေးချယ်သင့်ပါတယ်။ ရိုးရှင်းတဲ့ tasks တွေအတွက်၊ သေးငယ်တဲ့ groups တွေ (၄-၈) က လုံလောက်နိုင်ပြီး၊ ပိုမိုရှုပ်ထွေးတဲ့ reasoning tasks တွေကတော့ ပိုမိုကြီးမားတဲ့ groups တွေ (၈-၁၆) ကနေ အကျိုးအမြတ်ရနိုင်ပါတယ်။</p> <h2 class="relative group"><a id="အငမငမအတက-အကပခကမ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#အငမငမအတက-အကပခကမ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 
11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အောင်မြင်မှုအတွက် အကြံပြုချက်များ</span></h2> <p data-svelte-h="svelte-1r9sxzi">၁။ <strong>Memory Management</strong>: သင်၏ GPU memory အပေါ်မူတည်၍ <code>per_device_train_batch_size</code> နှင့် <code>gradient_accumulation_steps</code> ကို ချိန်ညှိပါ။
	၂။ <strong>Speed</strong>: သင်၏ model ကို ထောက်ပံ့ပါက ပိုမိုမြန်ဆန်သော generation အတွက် <code>use_vllm=True</code> ကို ဖွင့်ပါ။
	၃။ <strong>Monitoring</strong>: training လုပ်နေစဉ်အတွင်း log လုပ်ထားသော metrics များကို စောင့်ကြည့်ပါ။</p> <ul data-svelte-h="svelte-11ecc13"><li><code>reward</code>: completions များ၏ ပျမ်းမျှ reward။</li> <li><code>reward_std</code>: reward groups များအတွင်းရှိ standard deviation။</li> <li><code>kl</code>: reference model မှ KL divergence။</li></ul> <h2 class="relative group"><a id="reward-function-ဒဇင" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#reward-function-ဒဇင"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Reward Function ဒီဇိုင်း</span></h2> <p data-svelte-h="svelte-5oefb3">DeepSeek R1 paper က သင်၏ GRPO implementation အတွက် လိုက်လျောညီထွေဖြစ်အောင် လုပ်ဆောင်နိုင်သော reward function ဒီဇိုင်းချခြင်း နည်းလမ်းများစွာကို ပြသထားပါတယ်။</p> <h3 class="relative group"><a id="၁-length-based-rewards" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၁-length-based-rewards"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" 
role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၁။ Length-Based Rewards</span></h3> <p data-svelte-h="svelte-1lzl5io">အကောင်အထည်ဖော်ရအလွယ်ဆုံး rewards တွေထဲက တစ်ခုကတော့ length-based reward ပါပဲ။ ပိုရှည်တဲ့ completions တွေကို ဆုချနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_len</span>(<span class="hljs-params">completions, **kwargs</span>):
	    ideal_length = <span class="hljs-number">20</span>
	    <span class="hljs-keyword">return</span> [-<span class="hljs-built_in">abs</span>(ideal_length - <span class="hljs-built_in">len</span>(completion)) <span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ifkzyn">ဒီ reward function က အလွန်တိုတောင်းလွန်းတဲ့ ဒါမှမဟုတ် အလွန်ရှည်လျားလွန်းတဲ့ completions တွေကို အပြစ်ပေးပါတယ်။ ဒါက model ကို ideal length ၂၀ tokens နဲ့ နီးစပ်တဲ့ completions တွေ ထုတ်လုပ်ဖို့ တိုက်တွန်းပါတယ်။</p> <iframe src="https://marimo.app/gh/huggingface/notebooks/main/e?entrypoint=course%2Fen%2Fchapter13%2Fgrpo_length.py&embed=true&show-chrome=false" title="Marimo Notebook" width="100%" height="800px" frameborder="0" allow="clipboard-write"></iframe> <h2 class="relative group"><a id="၂-verifiable-tasks-အတက-rule-based-rewards" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၂-verifiable-tasks-အတက-rule-based-rewards"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၂။ Verifiable Tasks အတွက် Rule-Based Rewards</span></h2> <p data-svelte-h="svelte-1sk31pc">သင်္ချာ သို့မဟုတ် coding ကဲ့သို့ တိကျမှန်ကန်သော 
အဖြေများရှိသည့် tasks များအတွက်၊ rule-based reward functions များကို အကောင်အထည်ဖော်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">problem_reward</span>(<span class="hljs-params">completions, answers, **kwargs</span>):
	<span class="hljs-string">"""Verifiable အဖြေများပါသော သင်္ချာပြဿနာများအတွက် reward function
	completions: အကဲဖြတ်ရန် completions စာရင်း
	answers: dataset မှ ပြဿနာများအတွက် အဖြေများစာရင်း
	"""</span>

	rewards = []
	<span class="hljs-keyword">for</span> completion, correct_answer <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(completions, answers):
	<span class="hljs-comment"># completion မှ အဖြေကို ထုတ်ယူပါ</span>
	<span class="hljs-keyword">try</span>:
	<span class="hljs-comment"># ဒါက ရိုးရှင်းတဲ့ ဥပမာတစ်ခုပါ - သင့်လျော်တဲ့ parsing လိုအပ်ပါလိမ့်မယ်</span>
	answer = extract_final_answer(completion)
	<span class="hljs-comment"># Binary reward: မှန်ရင် 1၊ မှားရင် 0</span>
	reward = <span class="hljs-number">1.0</span> <span class="hljs-keyword">if</span> answer == correct_answer <span class="hljs-keyword">else</span> <span class="hljs-number">0.0</span>
	rewards.append(reward)
	<span class="hljs-keyword">except</span>:
	<span class="hljs-comment"># အဖြေကို parse မလုပ်နိုင်ရင်၊ reward မပေးပါ (0.0)</span>
	rewards.append(<span class="hljs-number">0.0</span>)

	<span class="hljs-keyword">return</span> rewards<!-- HTML_TAG_END --></pre></div> <iframe src="https://marimo.app/gh/huggingface/notebooks/main/e?entrypoint=course%2Fen%2Fchapter13%2Fgrpo_math.py&embed=true&show-chrome=false" title="Marimo Notebook" width="100%" height="800px" frameborder="0" allow="clipboard-write"></iframe> <h2 class="relative group"><a id="၃-format-based-rewards" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၃-format-based-rewards"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၃။ Format-Based Rewards</span></h2> <p data-svelte-h="svelte-1eji1ml">DeepSeek R1 training မှာ အရေးကြီးခဲ့တဲ့ သင့်လျော်တဲ့ formatting ကိုလည်း ဆုချနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">format_reward</span>(<span class="hljs-params">completions, **kwargs</span>):
	<span class="hljs-string">"""လိုချင်သော format ကို လိုက်နာသော completions များကို ဆုချပါ"""</span>
	<span class="hljs-comment"># ဥပမာ- completion က think-then-answer format ကို လိုက်နာခြင်းရှိမရှိ စစ်ဆေးပါ</span>
	pattern = <span class="hljs-string">r"<think>(.*?)</think>\s*<answer>(.*?)</answer>"</span>

	rewards = []
	<span class="hljs-keyword">for</span> completion <span class="hljs-keyword">in</span> completions:
	<span class="hljs-keyword">match</span> = re.search(pattern, completion, re.DOTALL)
	<span class="hljs-keyword">if</span> <span class="hljs-keyword">match</span>:
	<span class="hljs-comment"># sections နှစ်ခုလုံးမှာ အဓိက အကြောင်းအရာများ ရှိမရှိ စစ်ဆေးပါ</span>
	think_content = <span class="hljs-keyword">match</span>.group(<span class="hljs-number">1</span>).strip()
	answer_content = <span class="hljs-keyword">match</span>.group(<span class="hljs-number">2</span>).strip()

	<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(think_content) > <span class="hljs-number">20</span> <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(answer_content) > <span class="hljs-number">0</span>:
	rewards.append(<span class="hljs-number">1.0</span>)
	<span class="hljs-keyword">else</span>:
	rewards.append(
	<span class="hljs-number">0.5</span>
	) <span class="hljs-comment"># မှန်ကန်သော format ဖြစ်သော်လည်း အကြောင်းအရာ နည်းပါးပါက partial reward</span>
	<span class="hljs-keyword">else</span>:
	rewards.append(<span class="hljs-number">0.0</span>) <span class="hljs-comment"># format မမှန်က reward မပေးပါ</span>

	<span class="hljs-keyword">return</span> rewards<!-- HTML_TAG_END --></pre></div> <iframe src="https://marimo.app/gh/huggingface/notebooks/main/e?entrypoint=course%2Fen%2Fchapter13%2Fgrpo_format.py&embed=true&show-chrome=false" title="Marimo Notebook" width="100%" height="800px" frameborder="0" allow="clipboard-write"></iframe> <p data-svelte-h="svelte-f2nm1d">ဒီဥပမာတွေက DeepSeek R1 training process ကနေ လှုံ့ဆော်မှုရယူပြီး မှန်ကန်မှု၊ formatting နဲ့ ပေါင်းစပ်ထားသော signals တွေကို အဓိကထားတဲ့ reward functions တွေကို ဘယ်လိုအကောင်အထည်ဖော်ရမယ်ဆိုတာကို ပြသထားပါတယ်။</p> <h2 class="relative group"><a id="ဒပပ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဒပပ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဒါပါပဲ!</span></h2> <p data-svelte-h="svelte-hovu4t">နောက်အပိုင်းမှာ၊ TRL မှာ GRPO ကို အကောင်အထည်ဖော်ဖို့ လေ့ကျင့်ခန်းတစ်ခုကို သင်လိုက်လုပ်ရပါလိမ့်မယ်။</p> <hr> <h2 class="relative group"><a id="ဝဟရ-ရငလငခက-glossary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဝဟရ-ရငလငခက-glossary"><span><svg 
class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဝေါဟာရ ရှင်းလင်းချက် (Glossary)</span></h2> <ul data-svelte-h="svelte-7vazon"><li><strong>GRPO (Group Relative Policy Optimization)</strong>: Reinforcement Learning (RL) algorithm တစ်ခုဖြစ်ပြီး model က ထုတ်လုပ်လိုက်တဲ့ completions အုပ်စုတွေကို နှိုင်းယှဉ်ပြီး သင်ယူကာ model ရဲ့ policy ကို optimize လုပ်ပါတယ်။</li> <li><strong>TRL (Transformer Reinforcement Learning) Library</strong>: Hugging Face မှ ထုတ်လုပ်ထားသော library တစ်ခုဖြစ်ပြီး Transformer models များကို Reinforcement Learning techniques ဖြင့် fine-tune လုပ်ရန် ရည်ရွယ်သည်။</li> <li><strong>Implementation</strong>: သီအိုရီ သို့မဟုတ် algorithm တစ်ခုကို code အဖြစ် အကောင်အထည်ဖော်ခြင်း။</li> <li><strong>GRPOTrainer</strong>: TRL library မှ GRPO algorithm ကို အကောင်အထည်ဖော်သော Trainer class။</li> <li><strong>TRL Documentation</strong>: TRL library ၏ တရားဝင်မှတ်တမ်းများ။</li> <li><strong>Open R1 Implementation</strong>: GRPO algorithm ၏ open-source အကောင်အထည်ဖော်မှု။</li> <li><strong>Group Formation</strong>: model က prompt တစ်ခုစီအတွက် completions များစွာကို ထုတ်လုပ်ပြီး အုပ်စုဖွဲ့ခြင်း။</li> <li><strong>Completions</strong>: model က prompt တစ်ခုကို တုံ့ပြန်တဲ့အနေနဲ့ ထုတ်လုပ်ပေးတဲ့ စာသား သို့မဟုတ် sequence များ။</li> <li><strong>Preference Learning</strong>: reward 
function မှတဆင့် completions အုပ်စုများကို နှိုင်းယှဉ်ခြင်းဖြင့် model က သင်ယူသော လုပ်ငန်းစဉ်။</li> <li><strong>Reward Function</strong>: model ၏ output (completions) များကို အကဲဖြတ်ပြီး ဂဏန်းတန်ဖိုး (reward) တစ်ခုကို ပြန်ပေးသော function။ ၎င်းသည် model ကို သင်ယူရာတွင် လမ်းညွှန်ပေးသည်။</li> <li><strong>Training Configuration</strong>: training process အတွက် parameters များနှင့် settings များကို သတ်မှတ်ခြင်း။</li> <li><strong>GRPOConfig</strong>: TRL library မှ GRPO training အတွက် configuration များကို ထိန်းချုပ်သော class။</li> <li><strong>Dataset of Prompts</strong>: model က တုံ့ပြန်ရန်အတွက် အသုံးပြုမည့် prompts များပါဝင်သော dataset။</li> <li><strong><code>trl</code></strong>: Transformer Reinforcement Learning library။</li> <li><strong><code>GRPOTrainer</code></strong>: TRL မှ GRPO algorithm အတွက် Trainer class။</li> <li><strong><code>GRPOConfig</code></strong>: TRL မှ GRPO training အတွက် configuration class။</li> <li><strong><code>load_dataset</code></strong>: Hugging Face Datasets library မှ dataset များကို load လုပ်ရန် function။</li> <li><strong><code>output_dir</code></strong>: trained model နှင့် logs များကို သိမ်းဆည်းမည့် directory။</li> <li><strong><code>num_train_epochs</code></strong>: training လုပ်မည့် epochs အရေအတွက်။</li> <li><strong><code>per_device_train_batch_size</code></strong>: device တစ်ခုစီ (ဥပမာ- GPU) အတွက် batch size။</li> <li><strong><code>gradient_accumulation_steps</code></strong>: gradients များကို update မလုပ်မီ batches မည်မျှစုဆောင်းမည်ကို သတ်မှတ်ခြင်း။</li> <li><strong><code>logging_steps</code></strong>: training log များကို မည်သည့် step အရေအတွက်တိုင်းတွင် မှတ်တမ်းတင်မည်ကို သတ်မှတ်ခြင်း။</li> <li><strong><code>model</code> (argument in <code>GRPOTrainer</code>)</strong>: အသုံးပြုမည့် base model ၏ identifier သို့မဟုတ် instance။</li> <li><strong><code>args</code> (argument in <code>GRPOTrainer</code>)</strong>: training configuration arguments များ။</li> <li><strong><code>train_dataset</code></strong>: training အတွက် အသုံးပြုမည့် 
dataset။</li> <li><strong><code>reward_funcs</code></strong>: reward function (များ)။</li> <li><strong><code>trainer.train()</code></strong>: training process ကို စတင်ရန် method။</li> <li><strong>Prompts</strong>: model ကို တုံ့ပြန်စေလိုသော စာသား input များ။</li> <li><strong><code>re</code> Module</strong>: Python ၏ regular expression module။</li> <li><strong><code>re.match()</code></strong>: string ၏ အစမှ pattern ကို ကိုက်ညီမှုရှိမရှိ စစ်ဆေးရန် function။</li> <li><strong><code>num_generation</code></strong>: prompt တစ်ခုစီအတွက် model က ထုတ်လုပ်မည့် completions အရေအတွက်။ ၎င်းသည် GRPO ၏ group size ဖြစ်သည်။</li> <li><strong>RL Methods (Reinforcement Learning Methods)</strong>: trial-and-error မှတစ်ဆင့် သင်ယူပြီး reward အများဆုံးရရှိရန် ကြိုးစားသော Machine Learning algorithms များ။</li> <li><strong>Diversity</strong>: ထုတ်လုပ်လိုက်သော completions များ၏ ကွဲပြားမှု။</li> <li><strong>Computational Efficiency</strong>: တွက်ချက်မှုအရင်းအမြစ်များကို မည်မျှထိရောက်စွာ အသုံးပြုသည်ကို ဆိုလိုသည်။</li> <li><strong>Computational Cost</strong>: တွက်ချက်မှု လုပ်ဆောင်ရန် လိုအပ်သော အချိန်နှင့် အရင်းအမြစ်များ။</li> <li><strong>Reasoning Tasks</strong>: အကြောင်းပြချက်၊ ဆင်ခြင်တုံတရား လိုအပ်သော လုပ်ငန်းများ။</li> <li><strong>Memory Management</strong>: ကွန်ပျူတာ၏ မှတ်ဉာဏ် (memory) အသုံးပြုမှုကို ထိန်းချုပ်ခြင်း။</li> <li><strong>GPU Memory</strong>: Graphics Processing Unit (GPU) တွင်ရှိသော မှတ်ဉာဏ်။</li> <li><strong><code>use_vllm=True</code></strong>: vLLM (a high-throughput inference engine) ကို အသုံးပြု၍ generation ကို အရှိန်မြှင့်ရန်။</li> <li><strong>Logged Metrics</strong>: training လုပ်နေစဉ်အတွင်း မှတ်တမ်းတင်ထားသော တိုင်းတာမှုများ။</li> <li><strong><code>reward</code> (metric)</strong>: completions များ၏ ပျမ်းမျှ reward တန်ဖိုး။</li> <li><strong><code>reward_std</code> (metric)</strong>: reward groups များအတွင်းရှိ rewards များ၏ standard deviation။</li> <li><strong><code>kl</code> (metric)</strong>: KL divergence (Kullback-Leibler divergence) ကို ရည်ညွှန်းပြီး reference model မှ 
policy က မည်မျှကွာခြားသည်ကို တိုင်းတာသည်။</li> <li><strong>DeepSeek R1 Paper</strong>: DeepSeek R1 model နှင့် ၎င်း၏ training method များကို ဖော်ပြထားသော research paper။</li> <li><strong>Length-Based Reward</strong>: completion ၏ အရှည်ပေါ်မူတည်၍ ပေးသော reward။</li> <li><strong><code>ideal_length</code></strong>: completion အတွက် လိုချင်သော အရှည်။</li> <li><strong><code>abs()</code></strong>: ဂဏန်းတစ်ခု၏ absolute value (အနုတ်လက္ခဏာမပါသော တန်ဖိုး)။</li> <li><strong>Verifiable Tasks</strong>: အဖြေကို တိကျစွာ စစ်ဆေးအတည်ပြုနိုင်သော လုပ်ငန်းများ။</li> <li><strong>Rule-Based Reward Functions</strong>: သတ်မှတ်ထားသော စည်းမျဉ်းများ သို့မဟုတ် အခြေအနေများအပေါ် အခြေခံ၍ reward ပေးသော function များ။</li> <li><strong><code>extract_final_answer()</code></strong>: completion မှ နောက်ဆုံးအဖြေကို ထုတ်ယူရန် ဒီဇိုင်းထုတ်ထားသော function (ဥပမာတွင် ရိုးရှင်းထားသည်)။</li> <li><strong>Binary Reward</strong>: 0 သို့မဟုတ် 1 ကဲ့သို့သော တန်ဖိုးနှစ်ခုသာ ရှိသော reward (မှန်/မှား)။</li> <li><strong>Parsing</strong>: စာသားကို ခွဲခြမ်းစိတ်ဖြာပြီး အဓိပ္ပာယ်ဖော်ခြင်း။</li> <li><strong>Format-Based Rewards</strong>: completion ၏ formatting (ပုံစံချထားမှု) အပေါ်မူတည်၍ ပေးသော reward။</li> <li><strong><code>re.search()</code></strong>: string တစ်ခုအတွင်း pattern ကို ရှာဖွေရန် function။</li> <li><strong><code>re.DOTALL</code></strong>: regular expression flags တစ်ခုဖြစ်ပြီး <code>.</code> (dot) သည် newline character (<code>\n</code>) အပါအဝင် မည်သည့် character ကိုမဆို ကိုက်ညီစေသည်။</li> <li><strong><code>match.group(1)</code> / <code>match.group(2)</code></strong>: regular expression match object မှ သက်ဆိုင်ရာ capture group ၏ contents များကို ထုတ်ယူခြင်း။</li> <li><strong><code>strip()</code></strong>: string တစ်ခု၏ အစ သို့မဟုတ် အဆုံးရှိ whitespace များကို ဖယ်ရှားခြင်း။</li> <li><strong>Partial Reward</strong>: အပြည့်အဝ reward မဟုတ်ဘဲ တစ်စိတ်တစ်ပိုင်း reward။</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" 
href="https://github.com/huggingface/course/blob/main/chapters/my/chapter12/4.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	// Inline client bootstrap: publishes path configuration on a global, then
	// loads the two entry modules and hands control to `kit.start`.
	// NOTE(review): judging by the global's name and the `_app/immutable` paths,
	// this appears to be SvelteKit's generated hydration snippet - confirm before
	// hand-editing, since a rebuild would presumably overwrite it.
	// The bare block keeps `element` and `data` (both `const`) block-scoped so
	// they do not leak into the page's global scope.
	{
	// Assigned without a declaration, so this becomes a global that the loaded
	// modules can read. `assets` and `base` carry the same path prefix here.
	__sveltekit_5q47hu = {
	assets: "/docs/course/pr_1095/my",
	base: "/docs/course/pr_1095/my",
	env: {}
	};

	// Parent of this <script> tag; passed to `kit.start` below.
	// `document.currentScript` is only set while the script runs synchronously,
	// so it must be captured here, before the asynchronous imports resolve.
	const element = document.currentScript.parentElement;

	// Two null placeholders - presumably one entry per id in `node_ids` below
	// (TODO confirm against the framework's expected payload shape).
	const data = [null,null];

	// Fetch both entry modules in parallel; the same two URLs are listed as
	// `modulepreload` links in the document head.
	Promise.all([
	import("/docs/course/pr_1095/my/_app/immutable/entry/start.8e25cab6.js"),
	import("/docs/course/pr_1095/my/_app/immutable/entry/app.b12ce275.js")
	]).then(([kit, app]) => {
	// Start the client app against `element`, with no form result and no error.
	kit.start(app, element, {
	// presumably the route nodes (layout and page) to hydrate - verify
	node_ids: [0, 32],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 61.7 kB
Xet hash:: acbe6567adadf5dad42ccc8c6960d77b6765f37931b5102dc8f483bc4970fce6

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.