Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / course /pr_1114 /my /chapter6 /10.html

rtrm

about 1 month ago

download

raw

67.8 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;အခန်း (၆) ဆိုင်ရာ မေးခွန်းများ&quot;,&quot;local&quot;:&quot;end-of-chapter-quiz&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;၁။ Tokenizer အသစ်တစ်ခုကို ဘယ်အချိန်မှာ train လုပ်သင့်သလဲ။&quot;,&quot;local&quot;:&quot;၁-tokenizer-အသစတစခက-ဘယအခနမ-train-လပသငသလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၂။ train_new_from_iterator() ကို အသုံးပြုတဲ့အခါ list of lists of texts တွေနဲ့ နှိုင်းယှဉ်ရင် generator of lists of texts တွေကို အသုံးပြုခြင်းရဲ့ အကျိုးကျေးဇူးက ဘာလဲ။&quot;,&quot;local&quot;:&quot;၂-trainnewfromiterator-က-အသပတအခ-list-of-lists-of-texts-တန-နငယဉရင-generator-of-lists-of-texts-တက-အသပခငရ-အကကဇက-ဘလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၃။ “Fast” tokenizer တစ်ခုကို အသုံးပြုခြင်းရဲ့ အကျိုးကျေးဇူးတွေက ဘာတွေလဲ။&quot;,&quot;local&quot;:&quot;၃-fast-tokenizer-တစခက-အသပခငရ-အကကဇတက-ဘတလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၄။ token-classification pipeline က tokens အများအပြားကို ဖြန့်ကျက်ထားတဲ့ entities တွေကို ဘယ်လို ကိုင်တွယ်ဖြေရှင်းလဲ။&quot;,&quot;local&quot;:&quot;၄-token-classification-pipeline-က-tokens-အမအပက-ဖနကကထတ-entities-တက-ဘယလ-ကငတယဖရငလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၅။ question-answering pipeline က ရှည်လျားတဲ့ contexts တွေကို ဘယ်လို ကိုင်တွယ်ဖြေရှင်းလဲ။&quot;,&quot;local&quot;:&quot;၅-question-answering-pipeline-က-ရညလတ-contexts-တက-ဘယလ-ကငတယဖရငလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၆။ Normalization ဆိုတာ ဘာလဲ။&quot;,&quot;local&quot;:&quot;၆-normalization-ဆတ-ဘလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၇။ Subword tokenizer အတွက် pre-tokenization ဆိုတာဘာလဲ။&quot;,&quot;local&quot;:&quot;၇-subword-tokenizer-အတက-pre-tokenization-ဆတဘလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၈။ BPE tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။&quot;,&quot;local&quot;:&quot;၈-bpe-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၉။ WordPiece tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။&quot;,&quot;local&quot;:&quot;၉-wordpiece-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၁၀။ Unigram tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။&quot;,&quot;local&quot;:&quot;၁၀-unigram-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
	<link href="/docs/course/pr_1114/my/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/scheduler.893fe8c9.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/singletons.10fda3ce.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.bce52c8a.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/paths.89c82153.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/preload-helper.b1a719fd.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.b1df2166.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/0.510afdc1.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/45.2956b37c.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.762ed9cc.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CourseFloatingBanner.c1c08878.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/Question.ea6d4cb0.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/stores.db603902.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;အခန်း (၆) ဆိုင်ရာ မေးခွန်းများ&quot;,&quot;local&quot;:&quot;end-of-chapter-quiz&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;၁။ Tokenizer အသစ်တစ်ခုကို ဘယ်အချိန်မှာ train လုပ်သင့်သလဲ။&quot;,&quot;local&quot;:&quot;၁-tokenizer-အသစတစခက-ဘယအခနမ-train-လပသငသလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၂။ train_new_from_iterator() ကို အသုံးပြုတဲ့အခါ list of lists of texts တွေနဲ့ နှိုင်းယှဉ်ရင် generator of lists of texts တွေကို အသုံးပြုခြင်းရဲ့ အကျိုးကျေးဇူးက ဘာလဲ။&quot;,&quot;local&quot;:&quot;၂-trainnewfromiterator-က-အသပတအခ-list-of-lists-of-texts-တန-နငယဉရင-generator-of-lists-of-texts-တက-အသပခငရ-အကကဇက-ဘလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၃။ “Fast” tokenizer တစ်ခုကို အသုံးပြုခြင်းရဲ့ အကျိုးကျေးဇူးတွေက ဘာတွေလဲ။&quot;,&quot;local&quot;:&quot;၃-fast-tokenizer-တစခက-အသပခငရ-အကကဇတက-ဘတလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၄။ token-classification pipeline က tokens အများအပြားကို ဖြန့်ကျက်ထားတဲ့ entities တွေကို ဘယ်လို ကိုင်တွယ်ဖြေရှင်းလဲ။&quot;,&quot;local&quot;:&quot;၄-token-classification-pipeline-က-tokens-အမအပက-ဖနကကထတ-entities-တက-ဘယလ-ကငတယဖရငလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၅။ question-answering pipeline က ရှည်လျားတဲ့ contexts တွေကို ဘယ်လို ကိုင်တွယ်ဖြေရှင်းလဲ။&quot;,&quot;local&quot;:&quot;၅-question-answering-pipeline-က-ရညလတ-contexts-တက-ဘယလ-ကငတယဖရငလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၆။ Normalization ဆိုတာ ဘာလဲ။&quot;,&quot;local&quot;:&quot;၆-normalization-ဆတ-ဘလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၇။ Subword tokenizer အတွက် pre-tokenization ဆိုတာဘာလဲ။&quot;,&quot;local&quot;:&quot;၇-subword-tokenizer-အတက-pre-tokenization-ဆတဘလ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၈။ BPE tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။&quot;,&quot;local&quot;:&quot;၈-bpe-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၉။ WordPiece tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။&quot;,&quot;local&quot;:&quot;၉-wordpiece-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;၁၀။ Unigram tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။&quot;,&quot;local&quot;:&quot;၁၀-unigram-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" 
width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="end-of-chapter-quiz" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#end-of-chapter-quiz"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အခန်း (၆) ဆိုင်ရာ မေးခွန်းများ</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0" style=""><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" 
src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> </div> <p data-svelte-h="svelte-f3za2x">ဒီအခန်းမှာ သင်ယူခဲ့တာတွေကို စစ်ဆေးကြည့်ရအောင်။</p> <h3 class="relative group"><a id="၁-tokenizer-အသစတစခက-ဘယအခနမ-train-လပသငသလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၁-tokenizer-အသစတစခက-ဘယအခနမ-train-လပသငသလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 
256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၁။ Tokenizer အသစ်တစ်ခုကို ဘယ်အချိန်မှာ train လုပ်သင့်သလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->သင့် dataset က လက်ရှိ pretrained model တစ်ခုက အသုံးပြုတဲ့ dataset နဲ့ ဆင်တူပြီး၊ model အသစ်တစ်ခုကို pretrain လုပ်ချင်တဲ့အခါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->သင့် dataset က လက်ရှိ pretrained model တစ်ခုက အသုံးပြုတဲ့ dataset နဲ့ ဆင်တူပြီး၊ ဒီ pretrained model ကို အသုံးပြုပြီး model အသစ်တစ်ခုကို fine-tune လုပ်ချင်တဲ့အခါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->သင့် dataset က လက်ရှိ pretrained model တစ်ခုက အသုံးပြုတဲ့ dataset နဲ့ ကွဲပြားပြီး၊ model အသစ်တစ်ခုကို pretrain လုပ်ချင်တဲ့အခါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->သင့် dataset က လက်ရှိ pretrained model တစ်ခုက အသုံးပြုတဲ့ dataset နဲ့ ကွဲပြားပြီး၊ ဒီ pretrained model ကို အသုံးပြုပြီး model အသစ်တစ်ခုကို fine-tune လုပ်ချင်တဲ့အခါ။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" 
type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၂-trainnewfromiterator-က-အသပတအခ-list-of-lists-of-texts-တန-နငယဉရင-generator-of-lists-of-texts-တက-အသပခငရ-အကကဇက-ဘလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၂-trainnewfromiterator-က-အသပတအခ-list-of-lists-of-texts-တန-နငယဉရင-generator-of-lists-of-texts-တက-အသပခငရ-အကကဇက-ဘလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၂။ train_new_from_iterator() ကို အသုံးပြုတဲ့အခါ list of lists of texts တွေနဲ့ နှိုင်းယှဉ်ရင် generator of lists of texts တွေကို အသုံးပြုခြင်းရဲ့ အကျိုးကျေးဇူးက ဘာလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->ဒါက <code>train_new_from_iterator()</code> method က လက်ခံတဲ့ တစ်ခုတည်းသော အမျိုးအစားပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->dataset တစ်ခုလုံးကို memory ထဲကို တစ်ပြိုင်နက်တည်း loading လုပ်တာကို ရှောင်ရှားပါလိမ့်မယ်။<!-- HTML_TAG_END --></label> <label 
class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->ဒါက 🤗 Tokenizers library ကို multiprocessing အသုံးပြုနိုင်စေပါလိမ့်မယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->သင် train လုပ်တဲ့ tokenizer က ပိုကောင်းတဲ့ texts တွေကို ထုတ်လုပ်ပါလိမ့်မယ်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၃-fast-tokenizer-တစခက-အသပခငရ-အကကဇတက-ဘတလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၃-fast-tokenizer-တစခက-အသပခငရ-အကကဇတက-ဘတလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၃။ “Fast” tokenizer တစ်ခုကို အသုံးပြုခြင်းရဲ့ အကျိုးကျေးဇူးတွေက ဘာတွေလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->inputs အများအပြားကို batch အလိုက် စုစည်းပြီး လုပ်ဆောင်တဲ့အခါ slow 
tokenizer ထက် ပိုမြန်မြန် လုပ်ဆောင်နိုင်ပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Fast tokenizers တွေက ၎င်းတို့ရဲ့ slow counterparts တွေထက် အမြဲတမ်း ပိုမြန်မြန် tokenize လုပ်ပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->ဒါက padding နဲ့ truncation ကို အသုံးပြုနိုင်ပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->ဒါက tokens တွေကို ၎င်းတို့ကို ဖန်တီးခဲ့တဲ့ text span နဲ့ map လုပ်နိုင်စေတဲ့ ထပ်ဆောင်း features တွေ ရှိပါတယ်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၄-token-classification-pipeline-က-tokens-အမအပက-ဖနကကထတ-entities-တက-ဘယလ-ကငတယဖရငလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၄-token-classification-pipeline-က-tokens-အမအပက-ဖနကကထတ-entities-တက-ဘယလ-ကငတယဖရငလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၄။ token-classification pipeline က tokens အများအပြားကို ဖြန့်ကျက်ထားတဲ့ entities တွေကို ဘယ်လို ကိုင်တွယ်ဖြေရှင်းလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->တူညီတဲ့ label ရှိတဲ့ entities တွေကို entity တစ်ခုတည်းအဖြစ် ပေါင်းစပ်ပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->entity ရဲ့ အစအတွက် label တစ်ခုနဲ့ entity ရဲ့ ဆက်လက်ဖြစ်ပေါ်မှုအတွက် label တစ်ခု ရှိပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->ပေးထားတဲ့ word တစ်ခုမှာ၊ ပထမဆုံး token မှာ entity ရဲ့ label ရှိနေသရွေ့၊ word တစ်ခုလုံးကို အဲဒီ entity နဲ့ label လုပ်ထားတယ်လို့ သတ်မှတ်ပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->token တစ်ခုမှာ ပေးထားတဲ့ entity ရဲ့ label ရှိနေတဲ့အခါ၊ အဲဒီနောက်မှာ လိုက်လာတဲ့ တူညီတဲ့ label ရှိတဲ့ တခြား token မှန်သမျှကို entity အသစ်တစ်ခုရဲ့ အစအဖြစ် label မလုပ်ထားသရွေ့ တူညီတဲ့ entity ရဲ့ အစိတ်အပိုင်းအဖြစ် သတ်မှတ်ပါတယ်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၅-question-answering-pipeline-က-ရညလတ-contexts-တက-ဘယလ-ကငတယဖရငလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၅-question-answering-pipeline-က-ရညလတ-contexts-တက-ဘယလ-ကငတယဖရငလ"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၅။ question-answering pipeline က ရှည်လျားတဲ့ contexts တွေကို ဘယ်လို ကိုင်တွယ်ဖြေရှင်းလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->ဒါက တကယ်တမ်း ကိုင်တွယ်ဖြေရှင်းတာ မဟုတ်ပါဘူး၊ ဘာလို့လဲဆိုတော့ model က လက်ခံတဲ့ အမြင့်ဆုံးအရှည်မှာ ရှည်လျားတဲ့ context ကို truncate လုပ်လို့ပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->ဒါက context ကို အစိတ်အပိုင်းများစွာအဖြစ် ပိုင်းခြားပြီး ရရှိတဲ့ ရလဒ်တွေကို ပျမ်းမျှယူပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->ဒါက context ကို အစိတ်အပိုင်းများစွာအဖြစ် ပိုင်းခြားပြီး (overlap ပါဝင်ပြီး) အစိတ်အပိုင်းတစ်ခုစီမှာ အဖြေအတွက် အမြင့်ဆုံး score ကို ရှာဖွေပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->ဒါက context ကို အစိတ်အပိုင်းများစွာအဖြစ် ပိုင်းခြားပြီး (ထိရောက်မှုအတွက် overlap မပါဘဲ) 
အစိတ်အပိုင်းတစ်ခုစီမှာ အဖြေအတွက် အမြင့်ဆုံး score ကို ရှာဖွေပါတယ်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၆-normalization-ဆတ-ဘလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၆-normalization-ဆတ-ဘလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၆။ Normalization ဆိုတာ ဘာလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->ဒါက tokenizer က ကနဦးအဆင့်တွေမှာ texts တွေပေါ်မှာ လုပ်ဆောင်တဲ့ မည်သည့် သန့်ရှင်းရေးမဆို။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->ဒါက data augmentation technique တစ်ခုဖြစ်ပြီး ရှားပါးတဲ့ words တွေကို ဖယ်ရှားခြင်းဖြင့် text ကို ပိုမို normal ဖြစ်အောင် လုပ်ဆောင်တာပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" 
value="2"> <!-- HTML_TAG_START -->ဒါက tokenizer က special tokens တွေကို ထည့်သွင်းတဲ့ နောက်ဆုံး post-processing အဆင့်ပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->ဒါက embeddings တွေကို mean 0 နဲ့ standard deviation 1 ဖြစ်အောင်၊ mean ကို နှုတ်ပြီး std နဲ့ ပိုင်းခြင်းဖြင့် လုပ်ဆောင်တာပါ။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၇-subword-tokenizer-အတက-pre-tokenization-ဆတဘလ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၇-subword-tokenizer-အတက-pre-tokenization-ဆတဘလ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၇။ Subword tokenizer အတွက် pre-tokenization ဆိုတာဘာလဲ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->ဒါက tokenization မတိုင်မီ အဆင့်ဖြစ်ပြီး data augmentation (random masking လိုမျိုး) ကို အသုံးပြုတာပါ။<!-- HTML_TAG_END 
--></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->ဒါက tokenization မတိုင်မီ အဆင့်ဖြစ်ပြီး လိုအပ်တဲ့ သန့်ရှင်းရေးလုပ်ငန်းတွေကို text ပေါ်မှာ အသုံးပြုတာပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->ဒါက tokenizer model ကို အသုံးမပြုမီ အဆင့်ဖြစ်ပြီး input ကို words တွေအဖြစ် ပိုင်းခြားဖို့ပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->ဒါက tokenizer model ကို အသုံးမပြုမီ အဆင့်ဖြစ်ပြီး input ကို tokens တွေအဖြစ် ပိုင်းခြားဖို့ပါ။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၈-bpe-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၈-bpe-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>၈။ BPE tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->BPE ဟာ small vocabulary ကနေ စတင်ပြီး merge rules တွေကို သင်ယူတဲ့ subword tokenization algorithm တစ်ခုပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->BPE ဟာ big vocabulary ကနေ စတင်ပြီး ၎င်းကနေ tokens တွေကို တဖြည်းဖြည်း ဖယ်ရှားတဲ့ subword tokenization algorithm တစ်ခုပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->BPE tokenizers တွေက အကြိမ်အများဆုံး ဖြစ်ပေါ်တဲ့ tokens တွဲကို merge လုပ်ခြင်းဖြင့် merge rules တွေကို သင်ယူပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->BPE tokenizer က merge rule တစ်ခုကို အကြိမ်များစွာ ဖြစ်ပေါ်ပြီး တစ်ဦးချင်းစီ အစိတ်အပိုင်းတွေက နည်းနည်းပဲ ဖြစ်ပေါ်တဲ့ pairs တွေကို အလေးပေးတဲ့ score တစ်ခုကို အမြင့်ဆုံးဖြစ်စေမယ့် tokens တွဲကို merge လုပ်ခြင်းဖြင့် သင်ယူပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="4"> <!-- HTML_TAG_START -->BPE က words တွေကို characters တွေအဖြစ် ပိုင်းခြားပြီး merge rules တွေကို အသုံးပြုခြင်းဖြင့် subwords တွေအဖြစ် tokenize လုပ်ပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="5"> <!-- HTML_TAG_START -->BPE က words တွေကို vocabulary ထဲမှာရှိတဲ့ word ရဲ့ အစကနေ စတင်တဲ့ အရှည်ဆုံး subword ကို ရှာဖွေပြီး၊ ကျန်တဲ့ text အတွက် လုပ်ငန်းစဉ်ကို ထပ်ခါတလဲလဲ 
လုပ်ဆောင်ခြင်းဖြင့် subwords တွေအဖြစ် tokenize လုပ်ပါတယ်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၉-wordpiece-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၉-wordpiece-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၉။ WordPiece tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->WordPiece ဟာ small vocabulary ကနေ စတင်ပြီး merge rules တွေကို သင်ယူတဲ့ subword tokenization algorithm တစ်ခုပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->WordPiece ဟာ big vocabulary ကနေ စတင်ပြီး ၎င်းကနေ tokens တွေကို တဖြည်းဖြည်း ဖယ်ရှားတဲ့ subword tokenization algorithm တစ်ခုပါ။<!-- HTML_TAG_END --></label> <label 
class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->WordPiece tokenizers တွေက အကြိမ်အများဆုံး ဖြစ်ပေါ်တဲ့ tokens တွဲကို merge လုပ်ခြင်းဖြင့် merge rules တွေကို သင်ယူပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->WordPiece tokenizer က merge rule တစ်ခုကို အကြိမ်များစွာ ဖြစ်ပေါ်ပြီး တစ်ဦးချင်းစီ အစိတ်အပိုင်းတွေက နည်းနည်းပဲ ဖြစ်ပေါ်တဲ့ pairs တွေကို အလေးပေးတဲ့ score တစ်ခုကို အမြင့်ဆုံးဖြစ်စေမယ့် tokens တွဲကို merge လုပ်ခြင်းဖြင့် သင်ယူပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="4"> <!-- HTML_TAG_START -->WordPiece က words တွေကို model အရ tokens တွေအဖြစ် အများဆုံး ဖြစ်နိုင်ခြေရှိတဲ့ segmentation ကို ရှာဖွေခြင်းဖြင့် subwords တွေအဖြစ် tokenize လုပ်ပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="5"> <!-- HTML_TAG_START -->WordPiece က words တွေကို vocabulary ထဲမှာရှိတဲ့ word ရဲ့ အစကနေ စတင်တဲ့ အရှည်ဆုံး subword ကို ရှာဖွေပြီး၊ ကျန်တဲ့ text အတွက် လုပ်ငန်းစဉ်ကို ထပ်ခါတလဲလဲ လုပ်ဆောင်ခြင်းဖြင့် subwords တွေအဖြစ် tokenize လုပ်ပါတယ်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h3 class="relative group"><a id="၁၀-unigram-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#၁၀-unigram-tokenization-model-န-သကဆငတ-စကငတက-ရခယပ"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>၁၀။ Unigram tokenization model နဲ့ သက်ဆိုင်တဲ့ စာကြောင်းတွေကို ရွေးချယ်ပါ။</span></h3> <div><form><label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="0"> <!-- HTML_TAG_START -->Unigram ဟာ small vocabulary ကနေ စတင်ပြီး merge rules တွေကို သင်ယူတဲ့ subword tokenization algorithm တစ်ခုပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="1"> <!-- HTML_TAG_START -->Unigram ဟာ big vocabulary ကနေ စတင်ပြီး ၎င်းကနေ tokens တွေကို တဖြည်းဖြည်း ဖယ်ရှားတဲ့ subword tokenization algorithm တစ်ခုပါ။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="2"> <!-- HTML_TAG_START -->Unigram က vocabulary ကို whole corpus ပေါ်မှာ တွက်ချက်ထားတဲ့ loss ကို အနည်းဆုံးဖြစ်အောင် လုပ်ဆောင်ခြင်းဖြင့် ၎င်းရဲ့ vocabulary ကို လိုက်လျောညီထွေဖြစ်အောင် ပြောင်းလဲပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="3"> <!-- HTML_TAG_START -->Unigram က အကြိမ်အများဆုံး ဖြစ်ပေါ်တဲ့ subwords တွေကို ထိန်းသိမ်းထားခြင်းဖြင့် ၎င်းရဲ့ vocabulary ကို လိုက်လျောညီထွေဖြစ်အောင် ပြောင်းလဲပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" 
class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="4"> <!-- HTML_TAG_START -->Unigram က words တွေကို model အရ tokens တွေအဖြစ် အများဆုံး ဖြစ်နိုင်ခြေရှိတဲ့ segmentation ကို ရှာဖွေခြင်းဖြင့် subwords တွေအဖြစ် tokenize လုပ်ပါတယ်။<!-- HTML_TAG_END --></label> <label class="block"><input autocomplete="off" class="form-input -mt-1.5 mr-2" name="choice" type="checkbox" value="5"> <!-- HTML_TAG_START -->Unigram က words တွေကို characters တွေအဖြစ် ပိုင်းခြားပြီး၊ merge rules တွေကို အသုံးပြုခြင်းဖြင့် subwords တွေအဖြစ် tokenize လုပ်ပါတယ်။<!-- HTML_TAG_END --></label> <div class="flex flex-row items-center mt-3"><button class="btn px-4 mr-4" type="submit" disabled>Submit</button> </div></form></div> <h2 class="relative group"><a id="ဝဟရ-ရငလငခက-glossary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဝဟရ-ရငလငခက-glossary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဝေါဟာရ ရှင်းလင်းချက် (Glossary)</span></h2> <ul data-svelte-h="svelte-6xhemi"><li><strong>Tokenizer</strong>: စာသား (သို့မဟုတ် အခြားဒေတာ) ကို AI မော်ဒယ်များ စီမံဆောင်ရွက်နိုင်ရန် tokens တွေအဖြစ် ပိုင်းခြားပေးသည့် ကိရိယာ သို့မဟုတ် လုပ်ငန်းစဉ်။</li> 
<li><strong>Pretrain</strong>: Model တစ်ခုကို အကြီးစားဒေတာများဖြင့် အစောပိုင်းကတည်းက လေ့ကျင့်ထားခြင်း။</li> <li><strong>Dataset</strong>: AI မော်ဒယ်တွေ လေ့ကျင့်ဖို့အတွက် အသုံးပြုတဲ့ ဒေတာအစုအဝေးတစ်ခုပါ။</li> <li><strong>Fine-tune</strong>: ကြိုတင်လေ့ကျင့်ထားပြီးသား (pre-trained) မော်ဒယ်တစ်ခုကို သီးခြားလုပ်ငန်းတစ်ခု (specific task) အတွက် အနည်းငယ်သော ဒေတာနဲ့ ထပ်မံလေ့ကျင့်ပေးခြင်းကို ဆိုလိုပါတယ်။</li> <li><strong>Compute Resources</strong>: ကွန်ပျူတာ၏ တွက်ချက်နိုင်စွမ်း (CPU, GPU, RAM)။</li> <li><strong>Generator</strong>: Python တွင် iteration လုပ်နိုင်သော object တစ်ခုဖြစ်ပြီး ၎င်းသည် အရာအားလုံးကို memory ထဲသို့ တစ်ပြိုင်နက်တည်း သိမ်းဆည်းမထားဘဲ လိုအပ်သလို တန်ဖိုးများကို ထုတ်ပေးသည်။</li> <li><strong><code>train_new_from_iterator()</code></strong>: 🤗 Tokenizers library မှ tokenizer အသစ်တစ်ခုကို iterator (ဥပမာ- generator) မှ data ကို အသုံးပြု၍ လေ့ကျင့်သော method။</li> <li><strong>Memory</strong>: ကွန်ပျူတာ၏ RAM (Random Access Memory)။</li> <li><strong>🤗 Datasets Library</strong>: Hugging Face က ထုတ်လုပ်ထားတဲ့ library တစ်ခုဖြစ်ပြီး AI မော်ဒယ်တွေ လေ့ကျင့်ဖို့အတွက် ဒေတာအစုအဝေး (datasets) တွေကို လွယ်လွယ်ကူကူ ဝင်ရောက်ရယူ၊ စီမံခန့်ခွဲပြီး အသုံးပြုနိုင်စေပါတယ်။</li> <li><strong>🤗 Tokenizers Library</strong>: Rust ဘာသာနဲ့ ရေးသားထားတဲ့ Hugging Face library တစ်ခုဖြစ်ပြီး မြန်ဆန်ထိရောက်တဲ့ tokenization ကို လုပ်ဆောင်ပေးသည်။</li> <li><strong>Multiprocessing</strong>: ကွန်ပျူတာ၏ processors အများအပြားကို အသုံးပြု၍ လုပ်ငန်းများကို တစ်ပြိုင်နက်တည်း လုပ်ဆောင်ခြင်း။</li> <li><strong>Language Model</strong>: လူသားဘာသာစကား၏ ဖြန့်ဝေမှုကို နားလည်ရန် လေ့ကျင့်ထားသော AI မော်ဒယ်တစ်ခု။ ၎င်းသည် စာသားထုတ်လုပ်ခြင်း၊ ဘာသာပြန်ခြင်း စသည့်လုပ်ငန်းများတွင် အသုံးပြုနိုင်သည်။</li> <li><strong>“Fast” Tokenizer</strong>: Rust ဘာသာစကားဖြင့် အကောင်အထည်ဖော်ထားသော tokenizer ဖြစ်ပြီး Python-based “slow” tokenizers များထက် အလွန်မြန်ဆန်သည်။</li> <li><strong>“Slow” Tokenizer</strong>: Python ဘာသာစကားဖြင့် အကောင်အထည်ဖော်ထားသော tokenizer။</li> <li><strong>Batch (of inputs)</strong>: မတူညီသော input များစွာကို 
တစ်ပြိုင်နက်တည်း လုပ်ဆောင်နိုင်ရန် အုပ်စုဖွဲ့ခြင်း။</li> <li><strong>Parallelism</strong>: လုပ်ငန်းများစွာကို တစ်ပြိုင်နက်တည်း လုပ်ဆောင်ခြင်း။</li> <li><strong>Rust</strong>: System programming language တစ်ခုဖြစ်ပြီး performance မြင့်မားသော applications များ တည်ဆောက်ရာတွင် အသုံးပြုသည်။</li> <li><strong>Padding</strong>: input sequences များ၏ အရှည်ကို တူညီစေရန်အတွက် အပို tokens များ ထည့်သွင်းခြင်း။</li> <li><strong>Truncation</strong>: input sequences များကို သတ်မှတ်ထားသော အရှည်တစ်ခုအထိ ဖြတ်တောက်ခြင်း။</li> <li><strong>Offset Mappings</strong>: token တစ်ခုစီသည် မူရင်းစာသား၏ မည်သည့်စတင်ခြင်းနှင့် အဆုံးသတ် character index များကြားတွင် ရှိနေသည်ကို ဖော်ပြသော map။</li> <li><strong><code>token-classification</code> Pipeline</strong>: <code>pipeline()</code> function ကို အသုံးပြု၍ token classification task ကို လုပ်ဆောင်ရန် တည်ဆောက်ထားသော pipeline။</li> <li><strong>Entities</strong>: Named Entity Recognition (NER) တွင် ဖော်ထုတ်ရမည့် အရာများ (ဥပမာ- လူပုဂ္ဂိုလ်၊ နေရာဒေသ၊ အဖွဲ့အစည်း)။</li> <li><strong>Label</strong>: Classification task တစ်ခုတွင် data point တစ်ခုအား သတ်မှတ်ထားသော အမျိုးအစား။</li> <li><strong><code>B-XXX</code> Label</strong>: Named Entity Recognition (NER) တွင် entity အမျိုးအစား <code>XXX</code> ၏ စတင်ခြင်း token ကို ကိုယ်စားပြုသော label (“Beginning” of entity)။</li> <li><strong><code>I-XXX</code> Label</strong>: Named Entity Recognition (NER) တွင် entity အမျိုးအစား <code>XXX</code> ၏ အတွင်းပိုင်း token ကို ကိုယ်စားပြုသော label (“Inside” entity)။</li> <li><strong><code>question-answering</code> Pipeline</strong>: <code>pipeline()</code> function ကို အသုံးပြု၍ question answering task ကို လုပ်ဆောင်ရန် တည်ဆောက်ထားသော pipeline။</li> <li><strong>Contexts</strong>: Question answering task တွင် မေးခွန်းအတွက် အဖြေပါဝင်နိုင်သည့် စာသားအပိုဒ်။</li> <li><strong>Truncate (Context)</strong>: ရှည်လျားသော context ကို model ၏ maximum length အထိ ဖြတ်တောက်ခြင်း။</li> <li><strong>Overlap (Context)</strong>: ရှည်လျားသော context ကို အပိုင်းပိုင်းပိုင်းဖြတ်ရာတွင် အပိုင်းများကြားတွင် 
တူညီသော စာသားအချို့ ထပ်နေခြင်း။</li> <li><strong>Normalization</strong>: စာသားကို သန့်ရှင်းရေးလုပ်ခြင်း (ဥပမာ- needless whitespace ဖယ်ရှားခြင်း၊ lowercasing, accents ဖယ်ရှားခြင်း)။</li> <li><strong>Data Augmentation</strong>: datasets ၏ အရွယ်အစားနှင့် မတူကွဲပြားမှုကို တိုးမြှင့်ရန်အတွက် လက်ရှိဒေတာကို ပြောင်းလဲခြင်း သို့မဟုတ် ဒေတာအသစ်များ ဖန်တီးခြင်းနည်းလမ်း။</li> <li><strong>Rare Words</strong>: corpus ထဲတွင် အကြိမ်ရေနည်းပါးစွာသာ ပေါ်ပေါက်သော စကားလုံးများ။</li> <li><strong>Post-processing</strong>: Model ၏ output များကို နောက်ဆုံးအသုံးပြုမှုအတွက် ပြင်ဆင်ခြင်း လုပ်ငန်းစဉ်။</li> <li><strong>Embeddings</strong>: စကားလုံးများ၊ စာကြောင်းများ သို့မဟုတ် အခြားဒေတာများကို ဂဏန်းဆိုင်ရာ vector များအဖြစ် ကိုယ်စားပြုခြင်း။</li> <li><strong>Mean (Average)</strong>: ပျမ်းမျှတန်ဖိုး။</li> <li><strong>Standard Deviation (Std)</strong>: data points များသည် mean (ပျမ်းမျှ) မှ မည်မျှကွာဝေးနေသည်ကို တိုင်းတာသော သင်္ချာဆိုင်ရာတန်ဖိုး။</li> <li><strong>Pixel Values</strong>: ပုံရိပ်တစ်ခုရှိ pixel တစ်ခုစီ၏ အရောင် သို့မဟုတ် အလင်းအမှောင် တန်ဖိုးများ။</li> <li><strong>Computer Vision</strong>: ကွန်ပျူတာများကို ပုံရိပ်များ သို့မဟုတ် ဗီဒီယိုများမှ အချက်အလက်များ နားလည်စေရန် သင်ကြားပေးခြင်း။</li> <li><strong>Pre-tokenization</strong>: Subword tokenization မလုပ်ဆောင်မီ စာသားကို ပိုမိုသေးငယ်သော entities (ဥပမာ- words) အဖြစ် အကြိုပိုင်းခြားခြင်း။</li> <li><strong>Subword Tokenizer</strong>: စကားလုံးများကို သေးငယ်သော subword units (ဥပမာ- word pieces, byte-pair encodings) များအဖြစ် ပိုင်းခြားသော tokenizer။</li> <li><strong>Masking (Random Masking)</strong>: data augmentation technique တစ်ခုဖြစ်ပြီး input data အစိတ်အပိုင်းအချို့ကို ကျပန်းဖုံးကွယ်ထားခြင်း။</li> <li><strong>Tokenizer Model</strong>: Tokenization လုပ်ငန်းစဉ်ကို လုပ်ဆောင်ပေးသော model။</li> <li><strong>BPE (Byte-Pair Encoding)</strong>: Subword tokenization algorithm တစ်မျိုး။ small vocabulary မှ စတင်ပြီး အများဆုံးဖြစ်ပေါ်သော tokens တွဲများကို merge လုပ်ခြင်းဖြင့် merge rules များကို သင်ယူသည်။ words များကို characters များအဖြစ် 
ပိုင်းခြားပြီး merge rules များကို အသုံးပြု၍ subwords များအဖြစ် tokenize လုပ်သည်။</li> <li><strong>Vocabulary</strong>: tokenizer သို့မဟုတ် model တစ်ခုက သိရှိနားလည်ပြီး ကိုင်တွယ်နိုင်သော ထူးခြားသည့် tokens များ စုစုပေါင်း။</li> <li><strong>Merge Rules</strong>: BPE နှင့် WordPiece algorithm များတွင် tokens များကို ပေါင်းစပ်ရန် သင်ယူထားသော စည်းမျဉ်းများ။</li> <li><strong>WordPiece</strong>: Subword tokenization algorithm တစ်မျိုး။ small vocabulary မှ စတင်ပြီး အကြိမ်များစွာ ဖြစ်ပေါ်ပြီး တစ်ဦးချင်းစီ အစိတ်အပိုင်းတွေက နည်းနည်းပဲ ဖြစ်ပေါ်တဲ့ pairs တွေကို အလေးပေးတဲ့ score တစ်ခုကို အမြင့်ဆုံးဖြစ်စေမယ့် tokens တွဲကို merge လုပ်ခြင်းဖြင့် merge rules များကို သင်ယူသည်။ words များကို vocabulary ထဲမှာရှိတဲ့ word ရဲ့ အစကနေ စတင်တဲ့ အရှည်ဆုံး subword ကို ရှာဖွေပြီး၊ ကျန်တဲ့ text အတွက် လုပ်ငန်းစဉ်ကို ထပ်ခါတလဲလဲ လုပ်ဆောင်ခြင်းဖြင့် subwords တွေအဖြစ် tokenize လုပ်သည်။</li> <li><strong>Segmentation</strong>: စာသားတစ်ခုကို သေးငယ်သော အစိတ်အပိုင်းများ (ဥပမာ- tokens) အဖြစ် ပိုင်းခြားခြင်း။</li> <li><strong>Unigram</strong>: Subword tokenization algorithm တစ်မျိုး။ big vocabulary မှ စတင်ပြီး whole corpus ပေါ်မှာ တွက်ချက်ထားတဲ့ loss ကို အနည်းဆုံးဖြစ်စေမယ့် tokens တွေကို ဖယ်ရှားခြင်းဖြင့် vocabulary ကို လိုက်လျောညီထွေဖြစ်အောင် ပြောင်းလဲသည်။ words တွေကို model အရ tokens တွေအဖြစ် အများဆုံး ဖြစ်နိုင်ခြေရှိတဲ့ segmentation ကို ရှာဖွေခြင်းဖြင့် subwords တွေအဖြစ် tokenize လုပ်သည်။</li> <li><strong>Loss (Corpus Loss)</strong>: Model ၏ ခန့်မှန်းချက်များနှင့် အမှန်တကယ် labels များကြား ကွာခြားမှုကို whole corpus တစ်ခုလုံးအတွက် တိုင်းတာသော တန်ဖိုး။</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/my/chapter6/10.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Client bootstrap for this page. The braces keep the `const` bindings
		// below block-scoped so they do not leak into the global scope.

		// Deliberately assigned WITHOUT a declaration: in this classic
		// (non-module) script that creates a global that other scripts can see.
		// Presumably the entry modules imported below read it — confirm before
		// renaming or declaring it.
		__sveltekit_tyugt6 = {
			assets: "/docs/course/pr_1114/my",
			base: "/docs/course/pr_1114/my",
			env: {}
		};

		// `document.currentScript` is read synchronously, while this script is
		// still executing; the parent of the <script> tag is handed to
		// `kit.start` as its second argument.
		const host = document.currentScript.parentElement;

		// Two-entry payload, both null, passed through as the `data` option.
		const payload = [null, null];

		// Options object given to `kit.start`, built once up front.
		const startOptions = {
			node_ids: [0, 45],
			data: payload,
			form: null,
			error: null
		};

		// Request both entry modules in parallel; start only after both load.
		const entryModules = [
			import("/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js"),
			import("/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, host, startOptions);
		});
	}
	</script>

Xet Storage Details

Size: 67.8 kB
Xet hash: ac7036cb9b93da5ad4150321f358df621e61f6ac30a25e0bcd87a719ba152bef

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.