Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / course /pr_1095 /my /chapter11 /5.html

rtrm

about 1 month ago

download

raw

94.2 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content='{"title":"အကဲဖြတ်ခြင်း (Evaluation)","local":"အကဖတခင-evaluation","sections":[{"title":"အလိုအလျောက် Benchmarks များ","local":"အလအလက-benchmarks-မ","sections":[],"depth":2},{"title":"Automatic Benchmarks တွေကို နားလည်ခြင်း","local":"automatic-benchmarks-တက-နလညခင","sections":[],"depth":2},{"title":"အထွေထွေဗဟုသုတ Benchmarks များ","local":"အထထဗဟသတ-benchmarks-မ","sections":[],"depth":2},{"title":"Reasoning Benchmarks များ","local":"reasoning-benchmarks-မ","sections":[],"depth":2},{"title":"ဘာသာစကား နားလည်မှု","local":"ဘသစက-နလညမ","sections":[],"depth":2},{"title":"Domain-Specific Benchmarks များ","local":"domain-specific-benchmarks-မ","sections":[],"depth":2},{"title":"အခြားသော Evaluation ချဉ်းကပ်မှုများ","local":"အခသ-evaluation-ခဉကပမမ","sections":[{"title":"LLM-as-Judge","local":"llm-as-judge","sections":[],"depth":3},{"title":"Evaluation Arenas","local":"evaluation-arenas","sections":[],"depth":3},{"title":"Custom Benchmark Suites","local":"custom-benchmark-suites","sections":[],"depth":3}],"depth":2},{"title":"Custom Evaluation","local":"custom-evaluation","sections":[],"depth":2},{"title":"Custom Evaluations တွေကို အကောင်အထည်ဖော်ခြင်း","local":"custom-evaluations-တက-အကငအထညဖခင","sections":[],"depth":2},{"title":"Example Evaluation Pipeline","local":"example-evaluation-pipeline","sections":[],"depth":2}],"depth":1}'>
	<link href="/docs/course/pr_1095/my/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/entry/start.8e25cab6.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/scheduler.893fe8c9.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/singletons.ba455c5c.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/index.bce52c8a.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/paths.9a7be869.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/entry/app.b12ce275.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/preload-helper.b5ee8f74.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/index.b1df2166.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/nodes/0.77c840e7.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/nodes/25.56b0d5c0.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.e6d31e72.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/CodeBlock.abb4f40e.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/CourseFloatingBanner.c1c08878.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/Question.ea6d4cb0.js">
	<link rel="modulepreload" href="/docs/course/pr_1095/my/_app/immutable/chunks/stores.db603902.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content='{"title":"အကဲဖြတ်ခြင်း (Evaluation)","local":"အကဖတခင-evaluation","sections":[{"title":"အလိုအလျောက် Benchmarks များ","local":"အလအလက-benchmarks-မ","sections":[],"depth":2},{"title":"Automatic Benchmarks တွေကို နားလည်ခြင်း","local":"automatic-benchmarks-တက-နလညခင","sections":[],"depth":2},{"title":"အထွေထွေဗဟုသုတ Benchmarks များ","local":"အထထဗဟသတ-benchmarks-မ","sections":[],"depth":2},{"title":"Reasoning Benchmarks များ","local":"reasoning-benchmarks-မ","sections":[],"depth":2},{"title":"ဘာသာစကား နားလည်မှု","local":"ဘသစက-နလညမ","sections":[],"depth":2},{"title":"Domain-Specific Benchmarks များ","local":"domain-specific-benchmarks-မ","sections":[],"depth":2},{"title":"အခြားသော Evaluation ချဉ်းကပ်မှုများ","local":"အခသ-evaluation-ခဉကပမမ","sections":[{"title":"LLM-as-Judge","local":"llm-as-judge","sections":[],"depth":3},{"title":"Evaluation Arenas","local":"evaluation-arenas","sections":[],"depth":3},{"title":"Custom Benchmark Suites","local":"custom-benchmark-suites","sections":[],"depth":3}],"depth":2},{"title":"Custom Evaluation","local":"custom-evaluation","sections":[],"depth":2},{"title":"Custom Evaluations တွေကို အကောင်အထည်ဖော်ခြင်း","local":"custom-evaluations-တက-အကငအထညဖခင","sections":[],"depth":2},{"title":"Example Evaluation Pipeline","local":"example-evaluation-pipeline","sections":[],"depth":2}],"depth":1}'><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm 
border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="အကဖတခင-evaluation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#အကဖတခင-evaluation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 
67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အကဲဖြတ်ခြင်း (Evaluation)</span></h1> <p data-svelte-h="svelte-ffhlr1">SFT (Supervised Fine-Tuning) သို့မဟုတ် LoRA SFT နည်းလမ်းတွေနဲ့ fine-tune လုပ်ထားတဲ့ model တစ်ခုကို standard benchmarks တွေပေါ်မှာ ကျွန်တော်တို့ အကဲဖြတ်သင့်ပါတယ်။ Machine Learning Engineers တွေအနေနဲ့၊ သင်စိတ်ဝင်စားတဲ့ domain အတွက် သက်ဆိုင်ရာ evaluation တွေ စုစည်းထားသင့်ပါတယ်။ ဒီစာမျက်နှာမှာ၊ အသုံးအများဆုံး benchmarks တွေနဲ့ သင့် model ကို အကဲဖြတ်ဖို့ ဘယ်လိုအသုံးပြုရမလဲဆိုတာ ကြည့်သွားပါမယ်။ သင့်ရဲ့ သီးခြား use case အတွက် custom benchmarks တွေကို ဘယ်လိုဖန်တီးရမလဲဆိုတာကိုလည်း ကြည့်သွားပါမယ်။</p> <h2 class="relative group"><a id="အလအလက-benchmarks-မ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#အလအလက-benchmarks-မ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 
11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အလိုအလျောက် Benchmarks များ</span></h2> <p data-svelte-h="svelte-1pn1rkf">Automatic benchmarks တွေဟာ မတူညီတဲ့ tasks တွေနဲ့ capabilities တွေပေါ်မှာ language models တွေကို အကဲဖြတ်ဖို့အတွက် စံပြုကိရိယာတွေအဖြစ် လုပ်ဆောင်ပါတယ်။ Model စွမ်းဆောင်ရည်ကို နားလည်ဖို့အတွက် အသုံးဝင်တဲ့ အစမှတ်တစ်ခုကို ပံ့ပိုးပေးပေမယ့်၊ ၎င်းတို့ဟာ ပြည့်စုံတဲ့ evaluation strategy ရဲ့ အစိတ်အပိုင်းတစ်ခုသာ ဖြစ်တယ်ဆိုတာကို နားလည်ဖို့ အရေးကြီးပါတယ်။</p> <h2 class="relative group"><a id="automatic-benchmarks-တက-နလညခင" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#automatic-benchmarks-တက-နလညခင"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Automatic Benchmarks တွေကို နားလည်ခြင်း</span></h2> <p data-svelte-h="svelte-1doapzx">Automatic benchmarks တွေမှာ ပုံမှန်အားဖြင့် ကြိုတင်သတ်မှတ်ထားတဲ့ tasks တွေနဲ့ evaluation metrics တွေပါတဲ့ curated datasets တွေ ပါဝင်ပါတယ်။ ဒီ benchmarks တွေက အခြေခံဘာသာစကား နားလည်မှုကနေ ရှုပ်ထွေးတဲ့ reasoning အထိ model ရဲ့ capability အမျိုးမျိုးကို အကဲဖြတ်ဖို့ ရည်ရွယ်ပါတယ်။ Automatic benchmarks တွေကို အသုံးပြုခြင်းရဲ့ အဓိကအားသာချက်ကတော့ 
၎င်းတို့ရဲ့ Standardization ပါပဲ၊ ၎င်းတို့က မတူညီတဲ့ models တွေကြား တသမတ်တည်း နှိုင်းယှဉ်နိုင်စေပြီး reproducible results တွေကို ပံ့ပိုးပေးပါတယ်။</p> <p data-svelte-h="svelte-1ve69kb">ဒါပေမယ့်၊ benchmark စွမ်းဆောင်ရည်ဟာ တကယ့်လက်တွေ့ကမ္ဘာမှာ ထိရောက်မှုနဲ့ အမြဲတမ်း တိုက်ရိုက်ဆက်စပ်မှု မရှိဘူးဆိုတာကို နားလည်ဖို့ အရေးကြီးပါတယ်။ Academic benchmarks တွေမှာ ထူးချွန်တဲ့ model တစ်ခုဟာ သီးခြား domain applications တွေ ဒါမှမဟုတ် လက်တွေ့ use cases တွေနဲ့ ရုန်းကန်နေရဆဲ ဖြစ်နိုင်ပါတယ်။</p> <h2 class="relative group"><a id="အထထဗဟသတ-benchmarks-မ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#အထထဗဟသတ-benchmarks-မ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အထွေထွေဗဟုသုတ Benchmarks များ</span></h2> <p data-svelte-h="svelte-x505tx"><a href="https://huggingface.co/datasets/cais/mmlu" rel="nofollow">MMLU</a> (Massive Multitask Language Understanding) က သိပ္ပံပညာကနေ လူမှုရေးဘာသာရပ်တွေအထိ ဘာသာရပ် ၅၇ ခုမှာ ဗဟုသုတကို စစ်ဆေးပါတယ်။ ဒါက ပြည့်စုံပေမယ့်၊ သီးခြား domains တွေအတွက် လိုအပ်တဲ့ ကျွမ်းကျင်မှုအတိမ်အနက်ကိုတော့ မဖော်ပြနိုင်ပါဘူး။ TruthfulQA က model တစ်ခုရဲ့ အသုံးများတဲ့ အယူအဆမှားတွေကို ပြန်လည်ထုတ်လုပ်နိုင်တဲ့ လမ်းကြောင်းကို အကဲဖြတ်ပါတယ်၊ 
ဒါပေမယ့် သတင်းမှားအမျိုးအစားအားလုံးကိုတော့ ဖမ်းယူနိုင်ခြင်း မရှိပါဘူး။</p> <h2 class="relative group"><a id="reasoning-benchmarks-မ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#reasoning-benchmarks-မ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Reasoning Benchmarks များ</span></h2> <p data-svelte-h="svelte-1ajtppv"><a href="https://huggingface.co/datasets/lukaemon/bbh" rel="nofollow">BBH</a> (Big Bench Hard) နဲ့ <a href="https://huggingface.co/datasets/openai/gsm8k" rel="nofollow">GSM8K</a> တို့က ရှုပ်ထွေးတဲ့ reasoning tasks တွေကို အာရုံစိုက်ပါတယ်။ BBH က logical thinking နဲ့ planning ကို စစ်ဆေးပြီး၊ GSM8K ကတော့ သင်္ချာပြဿနာဖြေရှင်းခြင်းကို အထူးပစ်မှတ်ထားပါတယ်။ ဒီ benchmarks တွေက analytical capabilities တွေကို အကဲဖြတ်ဖို့ ကူညီပေးပေမယ့် တကယ့်လက်တွေ့ကမ္ဘာ အခြေအနေတွေမှာ လိုအပ်တဲ့ နက်နဲသိမ်မွေ့တဲ့ reasoning တွေကိုတော့ မဖမ်းယူနိုင်ပါဘူး။</p> <h2 class="relative group"><a id="ဘသစက-နလညမ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဘသစက-နလညမ"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဘာသာစကား နားလည်မှု</span></h2> <p data-svelte-h="svelte-1tucb6x"><a href="https://github.com/stanford-crfm/helm" rel="nofollow">HELM</a> က ပြည့်စုံတဲ့ evaluation framework တစ်ခုကို ပံ့ပိုးပေးပါတယ်။ HELM လို benchmarks တွေက commonsense, world knowledge, နဲ့ reasoning လို ကဏ္ဍတွေမှာ ဘာသာစကားလုပ်ဆောင်နိုင်စွမ်းအပေါ် ထိုးထွင်းသိမြင်မှုတွေကို ပေးပါတယ်။ ဒါပေမယ့် သဘာဝအတိုင်း စကားပြောဆိုမှုရဲ့ ရှုပ်ထွေးမှု ဒါမှမဟုတ် domain-specific terminology တွေကို အပြည့်အဝ ကိုယ်စားပြုနိုင်ခြင်း မရှိပါဘူး။</p> <h2 class="relative group"><a id="domain-specific-benchmarks-မ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#domain-specific-benchmarks-မ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 
79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Domain-Specific Benchmarks များ</span></h2> <p data-svelte-h="svelte-ub5x2f">သင်္ချာ၊ coding နဲ့ chat လို သီးခြား domains တွေကို အာရုံစိုက်ထားတဲ့ benchmarks အချို့ကို ကြည့်ကြရအောင်။</p> <p data-svelte-h="svelte-7z2cn9"><a href="https://huggingface.co/papers/2103.03874" rel="nofollow">MATH benchmark</a> ဟာ သင်္ချာဆိုင်ရာ reasoning အတွက် အရေးကြီးတဲ့ evaluation tool တစ်ခုလည်း ဖြစ်ပါတယ်။ ဒါက algebra, geometry, number theory, counting, probability နဲ့ အခြားအရာတွေ အပါအဝင် သင်္ချာပြိုင်ပွဲတွေက ပြဿနာ ၁၂,၅၀၀ ပါဝင်ပါတယ်။ MATH ကို အထူးစိန်ခေါ်မှုဖြစ်စေတာက multi-step reasoning, formal mathematical notation ကို နားလည်မှုနဲ့ step-by-step solutions တွေကို ထုတ်လုပ်နိုင်စွမ်းတို့ လိုအပ်တာပဲ ဖြစ်ပါတယ်။ ရိုးရှင်းတဲ့ arithmetic tasks တွေနဲ့ မတူဘဲ၊ MATH ပြဿနာတွေက ရှုပ်ထွေးတဲ့ ပြဿနာဖြေရှင်းနည်း ဗျူဟာတွေနဲ့ သင်္ချာသဘောတရားတွေကို အသုံးချဖို့ တောင်းဆိုလေ့ရှိပါတယ်။</p> <p data-svelte-h="svelte-sgrz1y"><a href="https://github.com/openai/human-eval" rel="nofollow">HumanEval Benchmark</a> ကတော့ coding ကို အာရုံစိုက်ထားတဲ့ evaluation dataset တစ်ခုဖြစ်ပြီး programming ပြဿနာ ၁၆၄ ခု ပါဝင်ပါတယ်။ ဒီ benchmark က model တစ်ခုရဲ့ ပေးထားတဲ့ programming tasks တွေကို ဖြေရှင်းနိုင်မယ့် functionally correct Python code ကို ထုတ်လုပ်နိုင်စွမ်းကို စစ်ဆေးပါတယ်။ HumanEval ကို အထူးတန်ဖိုးရှိစေတာက code generation capabilities နဲ့ functional correctness နှစ်ခုလုံးကို တကယ့် test case execution မှတစ်ဆင့် အကဲဖြတ်တာပဲ ဖြစ်ပါတယ်။ ရည်ညွှန်း solutions တွေနဲ့ အပေါ်ယံဆင်တူမှုကိုပဲ မကြည့်ပါဘူး။ ပြဿနာတွေက အခြေခံ string manipulation ကနေ ပိုရှုပ်ထွေးတဲ့ algorithms နဲ့ data structures တွေအထိ ပါဝင်ပါတယ်။</p> <p data-svelte-h="svelte-161uhri"><a href="https://tatsu-lab.github.io/alpaca_eval/" rel="nofollow">Alpaca Eval</a> က 
instruction-following language models တွေရဲ့ အရည်အသွေးကို အကဲဖြတ်ဖို့ ဒီဇိုင်းထုတ်ထားတဲ့ automated evaluation framework တစ်ခု ဖြစ်ပါတယ်။ ဒါက GPT-4 ကို judge အဖြစ် အသုံးပြုပြီး helpfulness, honesty, နဲ့ harmlessness အပါအဝင် မတူညီတဲ့ dimensions တွေပေါ်မှာ model outputs တွေကို အကဲဖြတ်ပါတယ်။ framework မှာ ဂရုတစိုက် curated လုပ်ထားတဲ့ prompts ၈၀၅ ခုပါတဲ့ dataset တစ်ခု ပါဝင်ပြီး Claude, GPT-4 နဲ့ အခြား reference models များစွာနဲ့ responses တွေကို အကဲဖြတ်နိုင်ပါတယ်။ Alpaca Eval ကို အထူးအသုံးဝင်စေတာက လူသား annotators တွေ မလိုအပ်ဘဲ တသမတ်တည်းဖြစ်တဲ့၊ scalable evaluations တွေကို ပံ့ပိုးပေးနိုင်ပြီး၊ traditional metrics တွေက လွဲချော်နိုင်တဲ့ model စွမ်းဆောင်ရည်ရဲ့ နက်နဲသိမ်မွေ့တဲ့ ကဏ္ဍတွေကို ဖမ်းယူနိုင်တာပဲ ဖြစ်ပါတယ်။</p> <h2 class="relative group"><a id="အခသ-evaluation-ခဉကပမမ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#အခသ-evaluation-ခဉကပမမ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အခြားသော Evaluation ချဉ်းကပ်မှုများ</span></h2> <p data-svelte-h="svelte-ss9jtc">အဖွဲ့အစည်းများစွာက standard benchmarks တွေရဲ့ ကန့်သတ်ချက်တွေကို ဖြေရှင်းဖို့ အခြား evaluation နည်းလမ်းတွေကို ဖန်တီးခဲ့ကြပါတယ်။</p> <h3 class="relative group"><a 
id="llm-as-judge" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#llm-as-judge"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>LLM-as-Judge</span></h3> <p data-svelte-h="svelte-1o3v4ip">language model တစ်ခုရဲ့ outputs တွေကို အခြား language model တစ်ခုကို အသုံးပြုပြီး အကဲဖြတ်တာက ပိုပြီး ရေပန်းစားလာပါတယ်။ ဒီချဉ်းကပ်မှုက traditional metrics တွေထက် ပိုမိုနက်နဲတဲ့ feedback တွေကို ပေးနိုင်ပေမယ့်၊ သူ့မှာလည်း ဘက်လိုက်မှုတွေနဲ့ ကန့်သတ်ချက်တွေ ရှိပါတယ်။</p> <h3 class="relative group"><a id="evaluation-arenas" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evaluation-arenas"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 
0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evaluation Arenas</span></h3> <p data-svelte-h="svelte-fynya0"><a href="https://lmarena.ai/" rel="nofollow">Chatbot Arena</a> လို Evaluation arenas တွေက crowdsourced feedback မှတစ်ဆင့် LLM အကဲဖြတ်ခြင်းအတွက် ထူးခြားတဲ့ ချဉ်းကပ်မှုတစ်ခုကို ပေးပါတယ်။ ဒီ platform တွေမှာ၊ အသုံးပြုသူတွေဟာ LLMs နှစ်ခုကြား အမည်မဖော်ဘဲ “battles” တွေမှာ ပါဝင်ပြီး မေးခွန်းတွေမေးကာ ဘယ် model က ပိုကောင်းတဲ့ response တွေ ပေးသလဲဆိုတာကို မဲပေးကြပါတယ်။ ဒီချဉ်းကပ်မှုက မတူညီတဲ့၊ စိန်ခေါ်မှုရှိတဲ့ မေးခွန်းတွေမှတစ်ဆင့် တကယ့်လက်တွေ့ကမ္ဘာ အသုံးပြုမှုပုံစံတွေနဲ့ နှစ်သက်မှုတွေကို ဖမ်းယူပါတယ်။ Crowdsourced votes တွေနဲ့ expert evaluations တွေကြား ခိုင်မာတဲ့ သဘောတူညီမှုရှိတယ်လို့ လေ့လာမှုတွေက ပြသထားပါတယ်။ ဒါပေမယ့်၊ ဒီ platform တွေမှာ user base bias, skewed prompt distributions နဲ့ safety considerations တွေထက် helpfulness ကို အဓိကထားတဲ့ ကန့်သတ်ချက်တွေ ပါဝင်ပါတယ်။</p> <h3 class="relative group"><a id="custom-benchmark-suites" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#custom-benchmark-suites"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 
0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Custom Benchmark Suites</span></h3> <p data-svelte-h="svelte-vzfyuv">အဖွဲ့အစည်းတွေက သူတို့ရဲ့ သီးခြားလိုအပ်ချက်တွေနဲ့ use cases တွေအတွက် စိတ်ကြိုက် benchmark suites တွေကို မကြာခဏ ဖန်တီးကြပါတယ်။ ဒါတွေမှာ domain-specific knowledge tests တွေ ဒါမှမဟုတ် တကယ့် deployment အခြေအနေတွေကို ထင်ဟပ်စေတဲ့ evaluation scenarios တွေ ပါဝင်နိုင်ပါတယ်။</p> <h2 class="relative group"><a id="custom-evaluation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#custom-evaluation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Custom Evaluation</span></h2> <p data-svelte-h="svelte-tu0pq2">Standard benchmarks တွေက အသုံးဝင်တဲ့ baseline တစ်ခုကို ပံ့ပိုးပေးပေမယ့်၊ ဒါတွေက သင့်ရဲ့ တစ်ခုတည်းသော evaluation method မဖြစ်သင့်ပါဘူး။ ပိုပြီး ပြည့်စုံတဲ့ ချဉ်းကပ်မှုတစ်ခုကို ဘယ်လိုတည်ဆောက်ရမလဲဆိုတာ ဒီမှာပါ။</p> <p data-svelte-h="svelte-qwwxg">၁။ baseline တစ်ခုကို တည်ဆောက်ပြီး အခြား models တွေနဲ့ နှိုင်းယှဉ်နိုင်ဖို့ သက်ဆိုင်ရာ standard 
benchmarks တွေနဲ့ စတင်ပါ။</p> <p data-svelte-h="svelte-1hgzaa5">၂။ သင့်ရဲ့ use case ရဲ့ သီးခြားလိုအပ်ချက်တွေနဲ့ စိန်ခေါ်မှုတွေကို ဖော်ထုတ်ပါ။ သင့် model က တကယ်တမ်း ဘယ် tasks တွေကို လုပ်ဆောင်ရမလဲ။ ဘယ်လိုအမှားမျိုးတွေက အပြဿနာအရှိဆုံးလဲ။</p> <p data-svelte-h="svelte-zhc1k8">၃။ သင့်ရဲ့ တကယ့် use case ကို ထင်ဟပ်စေမယ့် custom evaluation datasets တွေကို ဖန်တီးပါ။ ဒါတွေမှာ အောက်ပါတို့ ပါဝင်နိုင်ပါတယ်-</p> <ul data-svelte-h="svelte-1b7d7d2"><li>သင့် domain ကနေ ရရှိတဲ့ တကယ့် user queries များ</li> <li>သင်ကြုံတွေ့ခဲ့ရတဲ့ common edge cases များ</li> <li>အထူးစိန်ခေါ်မှုရှိတဲ့ အခြေအနေများရဲ့ ဥပမာများ</li></ul> <p data-svelte-h="svelte-1n6r14w">၄။ multi-layered evaluation strategy ကို အကောင်အထည်ဖော်တာကို ထည့်သွင်းစဉ်းစားပါ။</p> <ul data-svelte-h="svelte-1kzd7qm"><li>လျင်မြန်တဲ့ feedback အတွက် Automated metrics များ</li> <li>နက်နဲသိမ်မွေ့တဲ့ နားလည်မှုအတွက် Human evaluation များ</li> <li>သီးခြား applications တွေအတွက် Domain expert review များ</li> <li>ထိန်းချုပ်ထားတဲ့ environments တွေမှာ A/B testing များ</li></ul> <h2 class="relative group"><a id="custom-evaluations-တက-အကငအထညဖခင" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#custom-evaluations-တက-အကငအထညဖခင"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 
0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Custom Evaluations တွေကို အကောင်အထည်ဖော်ခြင်း</span></h2> <p data-svelte-h="svelte-1akcthw">ဒီအပိုင်းမှာ၊ ကျွန်တော်တို့ရဲ့ finetune လုပ်ထားတဲ့ model အတွက် evaluation ကို အကောင်အထည်ဖော်ပါမယ်။ Hugging Face library ထဲမှာ တည်ဆောက်ထားတဲ့ tasks များစွာပါဝင်တဲ့ standard benchmarks တွေပေါ်မှာ ကျွန်တော်တို့ရဲ့ finetune လုပ်ထားတဲ့ model ကို အကဲဖြတ်ဖို့ <a href="https://github.com/huggingface/lighteval" rel="nofollow"><code>lighteval</code></a> ကို အသုံးပြုနိုင်ပါတယ်။ ကျွန်တော်တို့ အကဲဖြတ်ချင်တဲ့ tasks တွေနဲ့ evaluation အတွက် parameters တွေကို သတ်မှတ်ဖို့ပဲ လိုအပ်ပါတယ်။</p> <p data-svelte-h="svelte-hofti5">LightEval tasks တွေကို သီးခြား format တစ်ခုကို အသုံးပြုပြီး သတ်မှတ်ထားပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span 
class="hljs-template-variable">{suite}</span><span class="language-xml">|</span><span class="hljs-template-variable">{task}</span><span class="language-xml">|</span><span class="hljs-template-variable">{num_few_shot}</span><span class="language-xml">|</span><span class="hljs-template-variable">{auto_reduce}</span><!-- HTML_TAG_END --></pre></div> <table data-svelte-h="svelte-giwvb4"><thead><tr><th>Parameter</th> <th>Description</th></tr></thead> <tbody><tr><td><code>suite</code></td> <td>benchmark suite (ဥပမာ- ‘mmlu’, ‘truthfulqa’)</td></tr> <tr><td><code>task</code></td> <td>suite အတွင်းရှိ သီးခြား task (ဥပမာ- ‘abstract_algebra’)</td></tr> <tr><td><code>num_few_shot</code></td> <td>prompt တွင် ထည့်သွင်းရန် ဥပမာအရေအတွက် (zero-shot အတွက် 0)</td></tr> <tr><td><code>auto_reduce</code></td> <td>prompt အလွန်ရှည်လျားပါက few-shot examples များကို အလိုအလျောက်လျှော့ချမလား (0 သို့မဟုတ် 1)</td></tr></tbody></table> <p data-svelte-h="svelte-1lpdezq">ဥပမာ - <code>"mmlu|abstract_algebra|0|0"</code> က MMLU ရဲ့ abstract algebra task ကို zero-shot inference ဖြင့် အကဲဖြတ်ပါတယ်။</p> <h2 class="relative group"><a id="example-evaluation-pipeline" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#example-evaluation-pipeline"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Example Evaluation Pipeline</span></h2> <p data-svelte-h="svelte-5210l8">ကျွန်တော်တို့ရဲ့ finetune လုပ်ထားတဲ့ model အတွက် evaluation pipeline တစ်ခုကို တည်ဆောက်ကြရအောင်။ ဆေးဘက်ဆိုင်ရာ domain နဲ့ သက်ဆိုင်တဲ့ sub tasks တွေပေါ်မှာ model ကို အကဲဖြတ်သွားပါမယ်။</p> <p data-svelte-h="svelte-1q8zk31">VLLM backend ကို အသုံးပြုပြီး Lighteval ဖြင့် သီးခြား domain တစ်ခုနဲ့ သက်ဆိုင်တဲ့ automatic benchmarks တွေကို အကဲဖြတ်တဲ့ ပြည့်စုံတဲ့ ဥပမာတစ်ခုကတော့ အောက်ပါအတိုင်းပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->lighteval accelerate \
	<span class="hljs-string">"pretrained=your-model-name"</span> \
	<span class="hljs-string">"mmlu|anatomy|0|0"</span> \
	<span class="hljs-string">"mmlu|high_school_biology|0|0"</span> \
	<span class="hljs-string">"mmlu|high_school_chemistry|0|0"</span> \
	<span class="hljs-string">"mmlu|professional_medicine|0|0"</span> \
	--max_samples 40 \
	--batch_size 1 \
	--output_path <span class="hljs-string">"./results"</span> \
	--save_generations <span class="hljs-literal">true</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ouuxkk">ရလဒ်တွေကို tabular format နဲ့ ပြသပေးပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->|<span class="hljs-string"> Task </span>|<span class="hljs-string">Version</span>|<span class="hljs-string">Metric</span>|<span class="hljs-string">Value </span>|<span class="hljs-string"> </span>|<span class="hljs-string">Stderr</span>|
	|<span class="hljs-string">----------------------------------------</span>|<span class="hljs-string">------:</span>|<span class="hljs-string">------</span>|<span class="hljs-string">-----:</span>|<span class="hljs-string">---</span>|<span class="hljs-string">-----:</span>|
	|<span class="hljs-string">all </span>|<span class="hljs-string"> </span>|<span class="hljs-string">acc </span>|<span class="hljs-string">0.3333</span>|<span class="hljs-string">± </span>|<span class="hljs-string">0.1169</span>|
	|<span class="hljs-string">leaderboard:mmlu:_average:5 </span>|<span class="hljs-string"> </span>|<span class="hljs-string">acc </span>|<span class="hljs-string">0.3400</span>|<span class="hljs-string">± </span>|<span class="hljs-string">0.1121</span>|
	|<span class="hljs-string">leaderboard:mmlu:anatomy:5 </span>|<span class="hljs-string"> 0</span>|<span class="hljs-string">acc </span>|<span class="hljs-string">0.4500</span>|<span class="hljs-string">± </span>|<span class="hljs-string">0.1141</span>|
	|<span class="hljs-string">leaderboard:mmlu:high_school_biology:5 </span>|<span class="hljs-string"> 0</span>|<span class="hljs-string">acc </span>|<span class="hljs-string">0.1500</span>|<span class="hljs-string">± </span>|<span class="hljs-string">0.0819</span>|<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-pj0h6s">Lighteval မှာ Python API လည်း ပါဝင်ပြီး ပိုမိုအသေးစိတ်တဲ့ evaluation tasks တွေအတွက် အသုံးဝင်ပါတယ်။ ဒါက results တွေကို ပိုမိုပြောင်းလွယ်ပြင်လွယ် ကိုင်တွယ်နိုင်စေပါတယ်။ အသေးစိတ်အချက်အလက်တွေအတွက် <a href="https://huggingface.co/docs/lighteval/using-the-python-api" rel="nofollow">Lighteval documentation</a> ကို ကြည့်ရှုပါ။</p> <blockquote class="tip" data-svelte-h="svelte-r4ki8e"><p>✏️ <strong>စမ်းသပ်ကြည့်ပါ!</strong> သင့်ရဲ့ finetune လုပ်ထားတဲ့ model ကို lighteval မှာ သီးခြား task တစ်ခုပေါ်မှာ အကဲဖြတ်ကြည့်ပါ။</p></blockquote> <h1 class="relative group"><a id="end-of-chapter-quiz" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#end-of-chapter-quiz"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အခန်း (၁၁) ဆိုင်ရာ မေးခွန်းများ</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0" 
style=""><a href="https://discuss.huggingface.co/t/chapter-11-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> </div> <h3 class="relative group"><a id="၁-model-evaluation-အတက-automatic-benchmarks-တက-အသပခငရ-အဓကအသခကတက-ဘတလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၁-model-evaluation-အတက-automatic-benchmarks-တက-အသပခငရ-အဓကအသခကတက-ဘတလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" 
aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၁။ model evaluation အတွက် automatic benchmarks တွေကို အသုံးပြုခြင်းရဲ့ အဓိကအားသာချက်တွေက ဘာတွေလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->၎င်းတို့သည် ပြီးပြည့်စုံသော တကယ့်လက်တွေ့ကမ္ဘာ စွမ်းဆောင်ရည် metrics များကို ပံ့ပိုးပေးသည်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->၎င်းတို့သည် models များကြား စံပြုနှိုင်းယှဉ်မှုကို ခွင့်ပြုပြီး reproducible results များကို ပံ့ပိုးပေးသည်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->၎င်းတို့သည် အခြားမည်သည့် evaluation ပုံစံကိုမဆို မလိုအပ်အောင် ဖယ်ရှားပေးသည်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၂-ဘသရပ-၅၇-ခမ-ဗဟသတက-သခစစဆတ-benchmark-က-ဘလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" 
href="#၂-ဘသရပ-၅၇-ခမ-ဗဟသတက-သခစစဆတ-benchmark-က-ဘလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၂။ ဘာသာရပ် ၅၇ ခုမှာ ဗဟုသုတကို သီးခြားစစ်ဆေးတဲ့ benchmark က ဘာလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->BBH (Big Bench Hard)<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->GSM8K<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->MMLU<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၃-llm-as-judge-ဆတ-ဘလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၃-llm-as-judge-ဆတ-ဘလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၃။ LLM-as-Judge ဆိုတာ ဘာလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->language model တစ်ခုရဲ့ outputs တွေကို အခြား language model တစ်ခုကို အသုံးပြုပြီး အကဲဖြတ်ခြင်း။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->တရားရေးရာ reasoning ကို စစ်ဆေးတဲ့ benchmark တစ်ခု။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->ဥပဒေရေးရာ datasets တွေပေါ်မှာ models တွေကို train လုပ်တဲ့ နည်းလမ်းတစ်ခု။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၄-ပညစတ-evaluation-strategy-မ-ဘတပဝငသငလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၄-ပညစတ-evaluation-strategy-မ-ဘတပဝငသငလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၄။ ပြည့်စုံတဲ့ evaluation strategy မှာ ဘာတွေပါဝင်သင့်လဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->standard benchmarks တွေချည်းသာ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->standard benchmarks တွေ၊ custom evaluation datasets တွေနဲ့ domain-specific testing တွေ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->သင့်ရဲ့ use case နဲ့ သီးခြားသက်ဆိုင်တဲ့ custom datasets တွေချည်းသာ။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၅-automatic-benchmarks-တရ-ကနသတခကက-ဘလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၅-automatic-benchmarks-တရ-ကနသတခကက-ဘလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၅။ automatic benchmarks တွေရဲ့ ကန့်သတ်ချက်က ဘာလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->၎င်းတို့သည် run ရန် အလွန်စျေးကြီးသည်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->benchmark စွမ်းဆောင်ရည်ဟာ တကယ့်လက်တွေ့ကမ္ဘာ ထိရောက်မှုနဲ့ အမြဲတမ်း တိုက်ရိုက်ဆက်စပ်မှု မရှိဘူး။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->၎င်းတို့သည် models အသေးစားများကိုသာ အကဲဖြတ်နိုင်သည်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၆-custom-evaluation-datasets-တ-ဖနတခငရ-ရညရယခကက-ဘလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၆-custom-evaluation-datasets-တ-ဖနတခငရ-ရညရယခကက-ဘလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၆။ custom evaluation datasets တွေ ဖန်တီးခြင်းရဲ့ ရည်ရွယ်ချက်က ဘာလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->သင့်ရဲ့ သီးခြား use case ကို ထင်ဟပ်စေပြီး သင့် domain ကနေ တကယ့် user queries တွေ ပါဝင်စေဖို့။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->standard benchmarks တွေကို လုံးဝအစားထိုးဖို့။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->evaluation ကို ပိုမိုလွယ်ကူစေဖို့။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h2 class="relative group"><a id="ဝဟရ-ရငလငခက-glossary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဝဟရ-ရငလငခက-glossary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 
256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဝေါဟာရ ရှင်းလင်းချက် (Glossary)</span></h2> <ul data-svelte-h="svelte-jcmev5"><li><strong>Fine-tuned Model</strong>: ကြိုတင်လေ့ကျင့်ထားပြီးသား (pre-trained) မော်ဒယ်တစ်ခုကို သီးခြားလုပ်ငန်းတစ်ခု (specific task) အတွက် အနည်းငယ်သော ဒေတာနဲ့ ထပ်မံလေ့ကျင့်ပေးထားသော မော်ဒယ်။</li> <li><strong>SFT (Supervised Fine-Tuning)</strong>: ကြိုတင်လေ့ကျင့်ထားပြီးသား (pre-trained) မော်ဒယ်တစ်ခုကို တိကျသောလုပ်ငန်းဆောင်တာများ (specific tasks) အတွက် label ပါသော ဒေတာများကို အသုံးပြု၍ ထပ်မံလေ့ကျင့်ခြင်းနည်းလမ်း။</li> <li><strong>LoRA SFT (Low-Rank Adaptation Supervised Fine-Tuning)</strong>: LoRA နည်းပညာကို အသုံးပြု၍ SFT လုပ်ဆောင်ခြင်း။</li> <li><strong>Standard Benchmarks</strong>: မော်ဒယ်များ၏ စွမ်းဆောင်ရည်ကို နှိုင်းယှဉ်တိုင်းတာရန်အတွက် အများသိ၊ စံပြုထားသော datasets များ သို့မဟုတ် လုပ်ငန်းများ။</li> <li><strong>Machine Learning Engineers</strong>: Machine Learning စနစ်များကို ဒီဇိုင်းဆွဲ၊ တည်ဆောက်ပြီး အကောင်အထည်ဖော်သူများ။</li> <li><strong>Suite of Evaluations</strong>: သီးခြား domain တစ်ခုအတွက် သက်ဆိုင်ရာ evaluation နည်းလမ်းများ သို့မဟုတ် ကိရိယာများ စုစည်းမှု။</li> <li><strong>Targeted Domain of Interest</strong>: စိတ်ဝင်စားသော သို့မဟုတ် ပစ်မှတ်ထားသော သီးခြားနယ်ပယ် (ဥပမာ- ဆေးပညာ၊ ဘဏ္ဍာရေး)။</li> <li><strong>Automatic Benchmarks</strong>: လူသားရဲ့ ကြားဝင်ဆောင်ရွက်မှု အနည်းဆုံးဖြင့် မော်ဒယ်များကို အလိုအလျောက် အကဲဖြတ်နိုင်သော စံပြုကိရိယာများ။</li> <li><strong>Language Models</strong>: လူသားဘာသာစကား၏ ဖြန့်ဝေမှုကို နားလည်ရန် 
လေ့ကျင့်ထားသော AI မော်ဒယ်တစ်ခု။ ၎င်းသည် စာသားထုတ်လုပ်ခြင်း၊ ဘာသာပြန်ခြင်း စသည့်လုပ်ငန်းများတွင် အသုံးပြုနိုင်သည်။</li> <li><strong>Tasks</strong>: Artificial Intelligence (AI) သို့မဟုတ် Machine Learning (ML) မော်ဒယ်တစ်ခုက လုပ်ဆောင်ရန် ဒီဇိုင်းထုတ်ထားသော သီးခြားအလုပ်။</li> <li><strong>Capabilities</strong>: မော်ဒယ်တစ်ခု၏ လုပ်ဆောင်နိုင်စွမ်းများ။</li> <li><strong>Comprehensive Evaluation Strategy</strong>: မော်ဒယ်၏ စွမ်းဆောင်ရည်ကို အကဲဖြတ်ရန်အတွက် နည်းလမ်းမျိုးစုံ (benchmarks, human evaluation, custom datasets) ကို ပေါင်းစပ်အသုံးပြုသော ဗျူဟာ။</li> <li><strong>Curated Datasets</strong>: သီးခြားရည်ရွယ်ချက်အတွက် ဂရုတစိုက် ရွေးချယ်၊ စုစည်းပြီး ပြင်ဆင်ထားသော datasets များ။</li> <li><strong>Predefined Tasks</strong>: ကြိုတင်သတ်မှတ်ထားသော လုပ်ငန်းများ။</li> <li><strong>Evaluation Metrics</strong>: မော်ဒယ်၏ စွမ်းဆောင်ရည်ကို တိုင်းတာရန် အသုံးပြုသော တန်ဖိုးများ (ဥပမာ- accuracy, F1 score, BLEU)။</li> <li><strong>Language Understanding</strong>: မော်ဒယ်တစ်ခုက လူသားဘာသာစကားကို မည်မျှနားလည်နိုင်ခြင်း။</li> <li><strong>Complex Reasoning</strong>: ရှုပ်ထွေးသော ပြဿနာများကို ဖြေရှင်းရန်အတွက် ဆင်ခြင်တွေးခေါ်နိုင်စွမ်း။</li> <li><strong>Standardization</strong>: မတူညီသော entities များကြား နှိုင်းယှဉ်နိုင်စေရန်အတွက် တသမတ်တည်းသော စည်းမျဉ်းများ သို့မဟုတ် နည်းလမ်းများကို ချမှတ်ခြင်း။</li> <li><strong>Reproducible Results</strong>: တူညီသော input များဖြင့် တူညီသောလုပ်ငန်းစဉ်ကို ပြန်လည်လုပ်ဆောင်သောအခါ တူညီသောရလဒ်များကို ပြန်လည်ရရှိနိုင်ခြင်း။</li> <li><strong>Real-world Effectiveness</strong>: လက်တွေ့အခြေအနေများတွင် model တစ်ခု၏ အသုံးဝင်မှုနှင့် စွမ်းဆောင်ရည်။</li> <li><strong>Academic Benchmarks</strong>: ပညာရပ်ဆိုင်ရာ သုတေသနများတွင် အသုံးပြုသော စံပြု benchmarks များ။</li> <li><strong>Domain Applications</strong>: သီးခြားနယ်ပယ်တစ်ခု (ဥပမာ- ဆေးဘက်ဆိုင်ရာ၊ ဘဏ္ဍာရေး) တွင် အသုံးပြုသော application များ။</li> <li><strong>Practical Use Cases</strong>: လက်တွေ့အသုံးချနိုင်သော အခြေအနေများ။</li> <li><strong>MMLU (Massive Multitask Language Understanding)</strong>: ဘာသာရပ် ၅၇ ခုတွင် 
ဘာသာစကားနားလည်မှုနှင့် ဗဟုသုတကို စစ်ဆေးသည့် benchmark။</li> <li><strong>TruthfulQA</strong>: မော်ဒယ်တစ်ခု၏ အမှန်တရားကို ပြောဆိုနိုင်စွမ်းနှင့် အယူအဆမှားများကို ရှောင်ရှားနိုင်စွမ်းကို အကဲဖြတ်သည့် benchmark။</li> <li><strong>Common Misconceptions</strong>: အများအားဖြင့် မှားယွင်းစွာ နားလည်ထားသော အယူအဆများ။</li> <li><strong>Misinformation</strong>: မှားယွင်းသော သို့မဟုတ် မမှန်ကန်သော အချက်အလက်များ။</li> <li><strong>BBH (Big Bench Hard)</strong>: ရှုပ်ထွေးသော reasoning tasks များကို အကဲဖြတ်သည့် benchmark။</li> <li><strong>GSM8K</strong>: သင်္ချာပြဿနာဖြေရှင်းခြင်းစွမ်းရည်ကို အထူးပစ်မှတ်ထားသည့် benchmark (Grade School Math 8K)။</li> <li><strong>Logical Thinking</strong>: အကြောင်းအကျိုးဆင်ခြင်ခြင်း။</li> <li><strong>Planning</strong>: အလုပ်တစ်ခုကို လုပ်ဆောင်ရန်အတွက် အဆင့်ဆင့် စီမံခြင်း။</li> <li><strong>Analytical Capabilities</strong>: အချက်အလက်များကို ခွဲခြမ်းစိတ်ဖြာပြီး နားလည်နိုင်စွမ်း။</li> <li><strong>Nuanced Reasoning</strong>: နက်နဲသိမ်မွေ့ပြီး အသေးစိတ်ဆင်ခြင်နိုင်စွမ်း။</li> <li><strong>HELM (Holistic Evaluation of Language Models)</strong>: မော်ဒယ်များ၏ စွမ်းဆောင်ရည်ကို ကဏ္ဍစုံမှ အကဲဖြတ်သည့် ဘက်စုံသုံး evaluation framework။</li> <li><strong>Commonsense</strong>: လူအများစု သိရှိနားလည်ထားသော သာမန်အသိပညာ။</li> <li><strong>World Knowledge</strong>: ကမ္ဘာကြီးအကြောင်း အထွေထွေဗဟုသုတ။</li> <li><strong>Natural Conversation</strong>: လူသားများ ပုံမှန်အတိုင်း ပြောဆိုဆက်ဆံခြင်း။</li> <li><strong>Domain-Specific Terminology</strong>: သီးခြားနယ်ပယ်တစ်ခုတွင် အသုံးပြုသော အသုံးအနှုန်းများ။</li> <li><strong>MATH Benchmark</strong>: သင်္ချာပြဿနာဖြေရှင်းခြင်းနှင့် reasoning စွမ်းရည်ကို အကဲဖြတ်သည့် benchmark။</li> <li><strong>Multi-step Reasoning</strong>: ပြဿနာတစ်ခုကို ဖြေရှင်းရန်အတွက် အဆင့်များစွာ ဆင်ခြင်တွေးခေါ်ခြင်း။</li> <li><strong>Formal Mathematical Notation</strong>: သင်္ချာဆိုင်ရာ သင်္ကေတများနှင့် ပုံစံများ။</li> <li><strong>Step-by-step Solutions</strong>: ပြဿနာတစ်ခု၏ ဖြေရှင်းနည်းအဆင့်ဆင့်။</li> <li><strong>Sophisticated Problem-solving Strategies</strong>: ရှုပ်ထွေးသော 
ပြဿနာဖြေရှင်းနည်း ဗျူဟာများ။</li> <li><strong>Mathematical Concept Applications</strong>: သင်္ချာသဘောတရားများကို လက်တွေ့အသုံးချခြင်း။</li> <li><strong>HumanEval Benchmark</strong>: programming ပြဿနာများကို ဖြေရှင်းရန် Python code ထုတ်လုပ်နိုင်စွမ်းကို အကဲဖြတ်သည့် benchmark။</li> <li><strong>Functionally Correct Python Code</strong>: ပေးထားသော task ကို မှန်ကန်စွာ လုပ်ဆောင်နိုင်သော Python code။</li> <li><strong>Code Generation Capabilities</strong>: code များကို ဖန်တီးထုတ်လုပ်နိုင်စွမ်း။</li> <li><strong>Functional Correctness</strong>: ဆော့ဖ်ဝဲလ်တစ်ခု၏ လုပ်ဆောင်ချက်များသည် မျှော်လင့်ထားသည့်အတိုင်း မှန်ကန်စွာ အလုပ်လုပ်ခြင်း။</li> <li><strong>Test Case Execution</strong>: ပရိုဂရမ်တစ်ခု၏ လုပ်ဆောင်ချက်များကို စမ်းသပ်ရန်အတွက် သီးခြား input များကို အသုံးပြုခြင်း။</li> <li><strong>Superficial Similarity</strong>: အပေါ်ယံဆင်တူမှု။</li> <li><strong>Reference Solutions</strong>: မှန်ကန်သည်ဟု သတ်မှတ်ထားသော အဖြေများ။</li> <li><strong>String Manipulation</strong>: စာသားကြိုးများကို ပြောင်းလဲခြင်း သို့မဟုတ် စီမံဆောင်ရွက်ခြင်း။</li> <li><strong>Algorithms</strong>: ပြဿနာတစ်ခုကို ဖြေရှင်းရန်အတွက် အဆင့်ဆင့် ညွှန်ကြားချက်များ။</li> <li><strong>Data Structures</strong>: ကွန်ပျူတာထဲတွင် ဒေတာများကို စုစည်းပြီး သိမ်းဆည်းရန် နည်းလမ်းများ။</li> <li><strong>Alpaca Eval</strong>: instruction-following language models များ၏ အရည်အသွေးကို အကဲဖြတ်ရန် ဒီဇိုင်းထုတ်ထားသော automated evaluation framework။</li> <li><strong>Instruction-following Language Models</strong>: ပေးထားသော ညွှန်ကြားချက်များကို လိုက်နာ၍ တုံ့ပြန်မှုများ ထုတ်လုပ်နိုင်သော language models များ။</li> <li><strong>GPT-4</strong>: OpenAI မှ ထုတ်လုပ်ထားသော အဆင့်မြင့် Large Language Model။</li> <li><strong>Judge</strong>: Evaluation လုပ်ရာတွင် model outputs များကို အကဲဖြတ်သူ။</li> <li><strong>Helpfulness</strong>: model ၏ အဖြေများသည် အထောက်အကူဖြစ်ခြင်း။</li> <li><strong>Honesty</strong>: model ၏ အဖြေများသည် မှန်ကန်ခြင်း။</li> <li><strong>Harmlessness</strong>: model ၏ အဖြေများသည် အန္တရာယ်မရှိခြင်း။</li> <li><strong>Curated 
Prompts</strong>: ဂရုတစိုက် ရွေးချယ်ပြီး ပြင်ဆင်ထားသော prompts များ။</li> <li><strong>Reference Models</strong>: နှိုင်းယှဉ်ရန်အတွက် အသုံးပြုသော အခြား model များ။</li> <li><strong>Claude</strong>: Anthropic မှ ထုတ်လုပ်ထားသော AI assistant model။</li> <li><strong>Scalable Evaluations</strong>: ပိုမိုများပြားသော models များ သို့မဟုတ် datasets များကို ထိရောက်စွာ အကဲဖြတ်နိုင်ခြင်း။</li> <li><strong>Human Annotators</strong>: ဒေတာများကို labels များ ထည့်သွင်းပေးရန် ငှားရမ်းထားသော လူများ။</li> <li><strong>Nuanced Aspects</strong>: နက်နဲသိမ်မွေ့ပြီး အသေးစိတ်ကျသော ကဏ္ဍများ။</li> <li><strong>LLM-as-Judge</strong>: language model တစ်ခု၏ output များကို အခြား language model တစ်ခုကို အသုံးပြု၍ အကဲဖြတ်ခြင်းနည်းလမ်း။</li> <li><strong>Nuanced Feedback</strong>: နက်နဲသိမ်မွေ့ပြီး အသေးစိတ်ကျသော တုံ့ပြန်ချက်များ။</li> <li><strong>Evaluation Arenas</strong>: LLM များကို crowdsourced feedback မှတစ်ဆင့် အကဲဖြတ်ရန်အတွက် အွန်လိုင်း platform များ (ဥပမာ- Chatbot Arena)။</li> <li><strong>Chatbot Arena</strong>: LLM များကို crowdsourced feedback မှတစ်ဆင့် အကဲဖြတ်ရန် အသုံးပြုသော platform တစ်ခု။</li> <li><strong>Crowdsourced Feedback</strong>: အွန်လိုင်းလူအဖွဲ့အစည်းမှ လူအများအပြားထံမှ စုဆောင်းရရှိသော တုံ့ပြန်ချက်များ။</li> <li><strong>Anonymous “Battles”</strong>: အမည်မဖော်ဘဲ LLM နှစ်ခုကြား နှိုင်းယှဉ်စစ်ဆေးခြင်း။</li> <li><strong>User Base Bias</strong>: အသုံးပြုသူအဖွဲ့အစည်း၏ ဝိသေသလက္ခဏာများကြောင့် ဖြစ်ပေါ်လာနိုင်သော ဘက်လိုက်မှု။</li> <li><strong>Skewed Prompt Distributions</strong>: prompts များ၏ ဖြန့်ဝေမှုသည် မမျှတခြင်း။</li> <li><strong>Safety Considerations</strong>: AI စနစ်များ၏ အန္တရာယ်ကင်းရှင်းမှုနှင့် သက်ဆိုင်သော အချက်များ။</li> <li><strong>Custom Benchmark Suites</strong>: အဖွဲ့အစည်းတစ်ခု၏ သီးခြားလိုအပ်ချက်များနှင့် use cases များကို ဖြည့်ဆည်းရန် ဒီဇိုင်းထုတ်ထားသော benchmarks များ။</li> <li><strong>Domain-Specific Knowledge Tests</strong>: သီးခြားနယ်ပယ်တစ်ခုရှိ ဗဟုသုတကို စစ်ဆေးသည့် စမ်းသပ်မှုများ။</li> <li><strong>Deployment Conditions</strong>: model တစ်ခုကို လက်တွေ့ပတ်ဝန်းကျင်တွင် 
အသုံးပြုသည့် အခြေအနေများ။</li> <li><strong>Baseline</strong>: နှိုင်းယှဉ်မှုအတွက် အသုံးပြုသော စတင်မှတ် သို့မဟုတ် ရည်ညွှန်းချက်။</li> <li><strong>Real User Queries</strong>: တကယ့်အသုံးပြုသူများ၏ မေးမြန်းချက်များ။</li> <li><strong>Edge Cases</strong>: ပုံမှန်မဟုတ်သော သို့မဟုတ် ရှားပါးသော အခြေအနေများ။</li> <li><strong>Multi-layered Evaluation Strategy</strong>: မတူညီသော evaluation နည်းလမ်းများစွာကို ပေါင်းစပ်အသုံးပြုသော ဗျူဟာ။</li> <li><strong>Automated Metrics</strong>: ကွန်ပျူတာပရိုဂရမ်များဖြင့် အလိုအလျောက် တွက်ချက်နိုင်သော metrics များ။</li> <li><strong>Human Evaluation</strong>: လူသားများက model ၏ output များကို အကဲဖြတ်ခြင်း။</li> <li><strong>Domain Expert Review</strong>: သီးခြားနယ်ပယ်တစ်ခုရှိ ကျွမ်းကျင်သူများက model ကို ပြန်လည်စစ်ဆေးခြင်း။</li> <li><strong>A/B Testing</strong>: မတူညီသော models သို့မဟုတ် features နှစ်ခု၏ စွမ်းဆောင်ရည်ကို နှိုင်းယှဉ်ရန် အသုံးပြုသော စမ်းသပ်မှု။</li> <li><strong>Controlled Environments</strong>: ပြောင်းလဲနိုင်သော အကြောင်းအရာများကို ဂရုတစိုက် ထိန်းချုပ်ထားသော ပတ်ဝန်းကျင်။</li> <li><strong><code>lighteval</code></strong>: Hugging Face မှ ထုတ်လုပ်ထားသော library တစ်ခုဖြစ်ပြီး LLM များကို standard benchmarks များပေါ်တွင် အကဲဖြတ်ရန် အသုံးပြုသည်။</li> <li><strong>VLLM Backend</strong>: vLLM library ကို အသုံးပြု၍ LLM inference ကို အရှိန်မြှင့်တင်ရန်အတွက် backend။</li> <li><strong><code>pretrained=your-model-name</code></strong>: evaluate လုပ်မည့် pretrained model ၏ နာမည်ကို သတ်မှတ်သည်။</li> <li><strong><code>mmlu|anatomy|0|0</code></strong>: MMLU benchmark အတွင်းရှိ ‘anatomy’ task ကို zero-shot inference ဖြင့် အကဲဖြတ်ရန် LightEval task format။</li> <li><strong><code>--max_samples</code></strong>: evaluation အတွက် အများဆုံး samples အရေအတွက်။</li> <li><strong><code>--batch_size</code></strong>: evaluation လုပ်ငန်းစဉ်အတွင်း တစ်ပြိုင်နက်တည်း လုပ်ဆောင်မည့် samples အရေအတွက်။</li> <li><strong><code>--output_path</code></strong>: evaluation results များကို သိမ်းဆည်းမည့် လမ်းကြောင်း။</li> <li><strong><code>--save_generations</code></strong>: 
model မှ ထုတ်လုပ်သော generations များကို သိမ်းဆည်းမလား။</li> <li><strong>Tabular Format</strong>: ဇယားပုံစံဖြင့် ပြသထားသော အချက်အလက်များ။</li> <li><strong>Metric</strong>: Model ၏ စွမ်းဆောင်ရည်ကို တိုင်းတာရန် အသုံးပြုသော တန်ဖိုး (ဥပမာ- <code>acc</code> for accuracy)။</li> <li><strong>Value</strong>: Metric ၏ တန်ဖိုး။</li> <li><strong>Stderr (Standard Error)</strong>: ခန့်မှန်းထားသော တန်ဖိုး၏ မမှန်ကန်မှုပမာဏကို တိုင်းတာခြင်း။</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/my/chapter11/5.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Client bootstrap for this page. The global's name and the
		// `_app/immutable/entry` layout suggest SvelteKit output
		// (NOTE(review): confirm against the build tooling).
		// It publishes the page's path configuration on a global, loads the two
		// entry modules, and hands control to the start module's `start()`.
		const sitePrefix = "/docs/course/pr_1095/my";

		// Deliberately assigned without a declaration: in this classic
		// (non-module, non-strict) script that creates a global property instead
		// of a binding scoped to this block.
		__sveltekit_5q47hu = { assets: sitePrefix, base: sitePrefix, env: {} };

		// Captured synchronously: document.currentScript is only set while this
		// script is executing, so it cannot be read inside the async callback.
		const mountNode = document.currentScript.parentElement;

		// Options passed through unchanged to `start()`.
		const startOptions = {
			node_ids: [0, 25],
			data: [null, null],
			form: null,
			error: null
		};

		// Both imports are kicked off immediately, in the same order as before
		// (start entry first, app entry second).
		const entryModules = [
			import("/docs/course/pr_1095/my/_app/immutable/entry/start.8e25cab6.js"),
			import("/docs/course/pr_1095/my/_app/immutable/entry/app.b12ce275.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, mountNode, startOptions);
		});
	}
	</script>

Xet Storage Details

Size:: 94.2 kB
Xet hash:: 66bd4cc85e180c579366103bd77b7f0090bc5a2496afd71b61c62df2c5f42c3e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.