Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / course /pr_1114 /my /chapter6 /4.html

rtrm

about 1 month ago

download

raw

59.9 kB

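	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Normalization နှင့် Pre-tokenization&quot;,&quot;local&quot;:&quot;normalization-and-pre-tokenization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Normalization&quot;,&quot;local&quot;:&quot;normalization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Pre-tokenization&quot;,&quot;local&quot;:&quot;pre-tokenization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;SentencePiece&quot;,&quot;local&quot;:&quot;sentencepiece&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Algorithm Overview&quot;,&quot;local&quot;:&quot;algorithm-overview&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">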
	<link href="/docs/course/pr_1114/my/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/scheduler.893fe8c9.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/singletons.10fda3ce.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.bce52c8a.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/paths.89c82153.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/preload-helper.b1a719fd.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.b1df2166.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/0.510afdc1.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/49.e8fb3bfe.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.762ed9cc.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/Youtube.ec5d7916.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CodeBlock.6cef0479.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CourseFloatingBanner.c1c08878.js">
	<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/FrameworkSwitchCourse.4480e339.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Normalization နှင့် Pre-tokenization","local":"normalization-and-pre-tokenization","sections":[{"title":"Normalization","local":"normalization","sections":[],"depth":2},{"title":"Pre-tokenization","local":"pre-tokenization","sections":[],"depth":2},{"title":"SentencePiece","local":"sentencepiece","sections":[],"depth":2},{"title":"Algorithm Overview","local":"algorithm-overview","sections":[],"depth":2},{"title":"ဝေါဟာရ ရှင်းလင်းချက် (Glossary)","local":"ဝဟရ-ရငလငခက-glossary","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="bg-white leading-none border border-gray-100 rounded-lg flex p-0.5 w-56 text-sm mb-4"><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-l bg-red-50 dark:bg-transparent text-red-600" href="?fw=pt"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><defs><clipPath id="a"><rect x="3.05" y="0.5" width="25.73" height="31" fill="none"></rect></clipPath></defs><g clip-path="url(#a)"><path d="M24.94,9.51a12.81,12.81,0,0,1,0,18.16,12.68,12.68,0,0,1-18,0,12.81,12.81,0,0,1,0-18.16l9-9V5l-.84.83-6,6a9.58,9.58,0,1,0,13.55,0ZM20.44,9a1.68,1.68,0,1,1,1.67-1.67A1.68,1.68,0,0,1,20.44,9Z" fill="#ee4c2c"></path></g></svg> Pytorch </a><a class="flex justify-center flex-1 py-1.5 px-2.5 focus:outline-none !no-underline rounded-r text-gray-500 filter grayscale" href="?fw=tf"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="0.94em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 274"><path d="M145.726 
42.065v42.07l72.861 42.07v-42.07l-72.86-42.07zM0 84.135v42.07l36.43 21.03V105.17L0 84.135zm109.291 21.035l-36.43 21.034v126.2l36.43 21.035v-84.135l36.435 21.035v-42.07l-36.435-21.034V105.17z" fill="#E55B2D"></path><path d="M145.726 42.065L36.43 105.17v42.065l72.861-42.065v42.065l36.435-21.03v-84.14zM255.022 63.1l-36.435 21.035v42.07l36.435-21.035V63.1zm-72.865 84.135l-36.43 21.035v42.07l36.43-21.036v-42.07zm-36.43 63.104l-36.436-21.035v84.135l36.435-21.035V210.34z" fill="#ED8E24"></path><path d="M145.726 0L0 84.135l36.43 21.035l109.296-63.105l72.861 42.07L255.022 63.1L145.726 0zm0 126.204l-36.435 21.03l36.435 21.036l36.43-21.035l-36.43-21.03z" fill="#F8BF3C"></path></svg> TensorFlow </a></div> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 
disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="normalization-and-pre-tokenization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#normalization-and-pre-tokenization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Normalization နှင့် Pre-tokenization</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0" style=""><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" 
src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section4.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section4.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-hjb97q">Transformer models တွေနဲ့ အသုံးပြုတဲ့ အသုံးအများဆုံး subword tokenization 
algorithms သုံးခု (Byte-Pair Encoding [BPE]၊ WordPiece နဲ့ Unigram) ကို နက်နက်နဲနဲ လေ့လာမဝင်မီ၊ tokenizer တစ်ခုစီက text ကို ဘယ်လို preprocessing လုပ်လဲဆိုတာ အရင်ဆုံး ကြည့်ရပါမယ်။ tokenization pipeline ရဲ့ အဆင့်တွေကို မြင့်မားသောအဆင့် (high-level) overview တစ်ခုကတော့ အောက်ပါအတိုင်းပါ။</p> <div class="flex justify-center" data-svelte-h="svelte-oxfng3"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline.svg" alt="The tokenization pipeline."> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline-dark.svg" alt="The tokenization pipeline."></div> <p data-svelte-h="svelte-4hzmpv">text တစ်ခုကို (၎င်း၏ model အရ) subtokens အဖြစ် ပိုင်းခြားခြင်းမပြုမီ၊ tokenizer သည် အဆင့်နှစ်ဆင့်ကို လုပ်ဆောင်ပါတယ်- <em>normalization</em> နဲ့ <em>pre-tokenization</em> တို့ ဖြစ်ပါတယ်။</p> <h2 class="relative group"><a id="normalization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#normalization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> 
<span>Normalization</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/4IIC2jI9CaU" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-4ztzuq">normalization အဆင့်မှာ မလိုအပ်တဲ့ whitespace တွေ ဖယ်ရှားတာ၊ lowercasing လုပ်တာ၊ နဲ့/သို့မဟုတ် accents တွေ ဖယ်ရှားတာလိုမျိုး အထွေထွေသန့်ရှင်းရေးတွေ ပါဝင်ပါတယ်။ သင် <a href="http://www.unicode.org/reports/tr15/" rel="nofollow">Unicode normalization</a> (NFC ဒါမှမဟုတ် NFKC လိုမျိုး) နဲ့ ရင်းနှီးတယ်ဆိုရင်၊ ဒါကလည်း tokenizer က အသုံးချနိုင်တဲ့ အရာတစ်ခုပါပဲ။</p> <p data-svelte-h="svelte-l5cxfq">🤗 Transformers <code>tokenizer</code> မှာ <code>backend_tokenizer</code> လို့ခေါ်တဲ့ attribute တစ်ခုရှိပြီး အောက်ခံ tokenizer ကို 🤗 Tokenizers library ကနေ ဝင်ရောက်ကြည့်ရှုနိုင်စေပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: 
transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer

	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"bert-base-uncased"</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-built_in">type</span>(tokenizer.backend_tokenizer))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&lt;<span class="hljs-keyword">class</span> <span class="hljs-string">'tokenizers.Tokenizer'</span>&gt;<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fyvj8h"><code>tokenizer</code> object ရဲ့ <code>normalizer</code> attribute မှာ <code>normalize_str()</code> method တစ်ခုရှိပြီး normalization ကို ဘယ်လိုလုပ်ဆောင်လဲဆိုတာ ကြည့်ဖို့ ကျွန်တော်တို့ အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" 
type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(tokenizer.backend_tokenizer.normalizer.normalize_str(<span class="hljs-string">"Héllò hôw are ü?"</span>))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow 
left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">'hello how are u?'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ik4uf8">ဒီဥပမာမှာ၊ ကျွန်တော်တို့ <code>bert-base-uncased</code> checkpoint ကို ရွေးချယ်ခဲ့တာကြောင့်၊ normalization က lowercasing လုပ်ပြီး accents တွေကို ဖယ်ရှားခဲ့ပါတယ်။</p> <blockquote class="tip" data-svelte-h="svelte-1q327zz"><p>✏️ <strong>စမ်းသပ်ကြည့်ပါ။</strong> <code>bert-base-cased</code> checkpoint ကနေ tokenizer တစ်ခုကို load လုပ်ပြီး ဥပမာတူတူကို ၎င်းဆီ ပေးပို့ပါ။ cased နဲ့ uncased versions တွေကြားက အဓိကကွာခြားချက်တွေက ဘာတွေလဲ။</p></blockquote> <h2 class="relative group"><a id="pre-tokenization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pre-tokenization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Pre-tokenization</span></h2> <iframe class="w-full xl:w-4/6 h-80" 
src="https://www.youtube-nocookie.com/embed/grlLV8AIXug" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-rsqvij">နောက်အပိုင်းတွေမှာ ကျွန်တော်တို့ တွေ့ရမယ့်အတိုင်း၊ tokenizer တစ်ခုကို raw text တစ်ခုတည်းနဲ့ train လုပ်လို့ မရပါဘူး။ အဲဒီအစား၊ ကျွန်တော်တို့ဟာ texts တွေကို words တွေလို သေးငယ်တဲ့ entities တွေအဖြစ် အရင်ဆုံး ပိုင်းခြားဖို့ လိုအပ်ပါတယ်။ အဲဒီနေရာမှာ pre-tokenization အဆင့်က ပါဝင်လာပါတယ်။ <a href="/course/chapter2">Chapter 2</a> မှာ ကျွန်တော်တို့ တွေ့ခဲ့ရတဲ့အတိုင်း၊ word-based tokenizer တစ်ခုက raw text တစ်ခုကို whitespace နဲ့ punctuation တွေပေါ်မှာ ရိုးရှင်းစွာ words တွေအဖြစ် ပိုင်းခြားနိုင်ပါတယ်။ အဲဒီ words တွေဟာ tokenizer က ၎င်းရဲ့ training လုပ်နေစဉ်အတွင်း သင်ယူနိုင်တဲ့ subtokens တွေရဲ့ boundary တွေ ဖြစ်ပါလိမ့်မယ်။</p> <p data-svelte-h="svelte-1qgbaf4">fast tokenizer တစ်ခုက pre-tokenization ကို ဘယ်လိုလုပ်ဆောင်လဲဆိုတာ ကြည့်ဖို့၊ <code>tokenizer</code> object ရဲ့ <code>pre_tokenizer</code> attribute ရဲ့ <code>pre_tokenize_str()</code> method ကို ကျွန်တော်တို့ အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight 
rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Hello, how are  you?"</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">'Hello'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">','</span>, (<span class="hljs-number">5</span>, <span class="hljs-number">6</span>)), (<span class="hljs-string">'how'</span>, (<span 
class="hljs-number">7</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'are'</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">'you'</span>, (<span class="hljs-number">16</span>, <span class="hljs-number">19</span>)), (<span class="hljs-string">'?'</span>, (<span class="hljs-number">19</span>, <span class="hljs-number">20</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sh3fol">tokenizer က offsets တွေကို အရင်ကတည်းက ခြေရာခံထားတာကို သတိပြုပါ။ ဒါက ယခင်အပိုင်းမှာ ကျွန်တော်တို့ အသုံးပြုခဲ့တဲ့ offset mapping ကို ဘယ်လိုပေးနိုင်လဲဆိုတာပါပဲ။ ဒီနေရာမှာ tokenizer က spaces နှစ်ခုကို လျစ်လျူရှုပြီး တစ်ခုတည်းနဲ့ အစားထိုးပါတယ်၊ ဒါပေမယ့် <code>are</code> နဲ့ <code>you</code> ကြားက offset ကတော့ အဲဒါကို ထည့်သွင်းစဉ်းစားဖို့ ခုန်ကျော်သွားပါတယ်။</p> <p data-svelte-h="svelte-18fu2q2">ကျွန်တော်တို့ BERT tokenizer ကို အသုံးပြုနေတာကြောင့်၊ pre-tokenization မှာ whitespace နဲ့ punctuation တွေပေါ်မှာ ပိုင်းခြားတာ ပါဝင်ပါတယ်။ တခြား tokenizers တွေမှာ ဒီအဆင့်အတွက် မတူညီတဲ့ စည်းမျဉ်းတွေ ရှိနိုင်ပါတယ်။ ဥပမာ၊ ကျွန်တော်တို့ GPT-2 tokenizer ကို အသုံးပြုမယ်ဆိုရင်…</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 
leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"gpt2"</span>)
	tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Hello, how are  you?"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-es2u0r">ဒါကလည်း whitespace နဲ့ punctuation တွေပေါ်မှာ ပိုင်းခြားပါလိမ့်မယ်၊ ဒါပေမယ့် spaces တွေကို ထိန်းသိမ်းထားပြီး ၎င်းတို့ကို <code>Ġ</code> symbol နဲ့ အစားထိုးပါလိမ့်မယ်။ ဒါက tokens တွေကို decode လုပ်ရင် original spaces တွေကို ပြန်လည်ရယူနိုင်စေပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">'Hello'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">','</span>, (<span class="hljs-number">5</span>, <span class="hljs-number">6</span>)), (<span class="hljs-string">'Ġhow'</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span 
class="hljs-string">'Ġare'</span>, (<span class="hljs-number">10</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">'Ġ'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">15</span>)), (<span class="hljs-string">'Ġyou'</span>, (<span class="hljs-number">15</span>, <span class="hljs-number">19</span>)),
	(<span class="hljs-string">'?'</span>, (<span class="hljs-number">19</span>, <span class="hljs-number">20</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-b3a04">BERT tokenizer နဲ့ မတူဘဲ၊ ဒီ tokenizer က double space ကို လျစ်လျူရှုတာ မရှိဘူးဆိုတာလည်း သတိပြုပါ။</p> <p data-svelte-h="svelte-1a74n16">နောက်ဆုံး ဥပမာတစ်ခုအနေနဲ့၊ SentencePiece algorithm ပေါ်မှာ အခြေခံထားတဲ့ T5 tokenizer ကို ကြည့်ရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"t5-small"</span>)
	tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Hello, how are  you?"</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">'▁Hello,'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">6</span>)), (<span class="hljs-string">'▁how'</span>, (<span class="hljs-number">7</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'▁are'</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">'▁you?'</span>, (<span class="hljs-number">16</span>, <span class="hljs-number">20</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14rks5o">GPT-2 tokenizer နဲ့ ဆင်တူစွာ၊ ဒါက spaces တွေကို ထိန်းသိမ်းထားပြီး ၎င်းတို့ကို သီးခြား token တစ်ခု 
(<code>_</code>) နဲ့ အစားထိုးပါတယ်။ ဒါပေမယ့် T5 tokenizer က whitespace တွေပေါ်မှာပဲ ပိုင်းခြားပြီး punctuation တွေပေါ်မှာ မဟုတ်ပါဘူး။ ဒါ့အပြင် sentence အစမှာ ( <code>Hello</code> မတိုင်ခင်) default အားဖြင့် space တစ်ခု ထည့်ထားပြီး <code>are</code> နဲ့ <code>you</code> ကြားက double space ကို လျစ်လျူရှုခဲ့တာကိုလည်း သတိပြုပါ။</p> <p data-svelte-h="svelte-1no9joc">ကွဲပြားခြားနားတဲ့ tokenizers အချို့က text ကို ဘယ်လို process လုပ်လဲဆိုတာ အနည်းငယ် မြင်တွေ့ခဲ့ရပြီဆိုတော့၊ အောက်ခံ algorithms တွေကိုယ်တိုင် လေ့လာကြည့်နိုင်ပါပြီ။ ကျွန်တော်တို့ဟာ အကျယ်တဝင့် အသုံးဝင်တဲ့ SentencePiece ကို အမြန်ကြည့်ခြင်းဖြင့် စတင်ပါမယ်။ ထို့နောက်၊ နောက်ထပ် သုံးပိုင်းမှာ၊ subword tokenization အတွက် အဓိက algorithms သုံးခု ဘယ်လိုအလုပ်လုပ်လဲဆိုတာကို စစ်ဆေးကြည့်ပါမယ်။</p> <h2 class="relative group"><a id="sentencepiece" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sentencepiece"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>SentencePiece</span></h2> <p data-svelte-h="svelte-woz2t3"><a href="https://github.com/google/sentencepiece" rel="nofollow">SentencePiece</a> ဟာ text ကို preprocessing လုပ်ဖို့အတွက် tokenization algorithm တစ်ခုဖြစ်ပြီး နောက်ထပ် 
သုံးအပိုင်းမှာ ကျွန်တော်တို့ မြင်တွေ့ရမယ့် models တွေထဲက ဘယ်တစ်ခုနဲ့မဆို အသုံးပြုနိုင်ပါတယ်။ ဒါက text ကို Unicode characters sequence တစ်ခုအဖြစ် သတ်မှတ်ပြီး၊ spaces တွေကို <code>▁</code> ဆိုတဲ့ special character တစ်ခုနဲ့ အစားထိုးပါတယ်။ Unigram algorithm ( <a href="/course/chapter6/7">အပိုင်း ၇</a> ကို ကြည့်ပါ) နဲ့ ပေါင်းစပ်အသုံးပြုတဲ့အခါ၊ pre-tokenization အဆင့်တောင် မလိုအပ်ပါဘူး။ ဒါက space character ကို အသုံးမပြုတဲ့ ဘာသာစကားတွေ (ဥပမာ- တရုတ် ဒါမှမဟုတ် ဂျပန်) အတွက် အလွန်အသုံးဝင်ပါတယ်။</p> <p data-svelte-h="svelte-54xa8k">SentencePiece ရဲ့ အခြားအဓိက feature ကတော့ <em>reversible tokenization</em> ပါ၊ spaces တွေကို သီးခြားစီ စီမံဆောင်ရွက်မှု မရှိတဲ့အတွက်၊ tokens တွေကို decode လုပ်တာက ၎င်းတို့ကို concatenate လုပ်ပြီး <code>_</code> တွေကို spaces တွေနဲ့ အစားထိုးခြင်းဖြင့် ရိုးရှင်းစွာ လုပ်ဆောင်ပါတယ်။ — ဒါက normalized text ကို ရရှိစေပါတယ်။ ကျွန်တော်တို့ အရင်က တွေ့ခဲ့ရတဲ့အတိုင်း၊ BERT tokenizer က ထပ်နေတဲ့ spaces တွေကို ဖယ်ရှားတာကြောင့်၊ ၎င်းရဲ့ tokenization က reversible မဟုတ်ပါဘူး။</p> <h2 class="relative group"><a id="algorithm-overview" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#algorithm-overview"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>Algorithm Overview</span></h2> <p data-svelte-h="svelte-171uft3">အောက်ပါအပိုင်းတွေမှာ၊ အဓိက subword tokenization algorithms သုံးခုဖြစ်တဲ့ BPE (GPT-2 နဲ့ အခြား models တွေက အသုံးပြု)၊ WordPiece (BERT ကဲ့သို့သော models တွေက အသုံးပြု) နဲ့ Unigram (T5 နဲ့ အခြား models တွေက အသုံးပြု) တွေကို နက်နက်နဲနဲ လေ့လာသွားပါမယ်။ မစတင်မီ၊ ၎င်းတို့တစ်ခုချင်းစီ ဘယ်လိုအလုပ်လုပ်လဲဆိုတာကို အမြန် overview တစ်ခု ပေးလိုက်ပါတယ်။ ဒီဇယားက အခုထိ သင့်အတွက် အဓိပ္ပာယ်မရှိသေးရင် နောက်အပိုင်းတစ်ခုစီကို ဖတ်ပြီးနောက် ဒီဇယားကို ပြန်ကြည့်ဖို့ မတွန့်ဆုတ်ပါနဲ့။</p> <table data-svelte-h="svelte-13sjwm1"><thead><tr><th align="center">Model</th> <th align="center">BPE</th> <th align="center">WordPiece</th> <th align="center">Unigram</th></tr></thead> <tbody><tr><td align="center">Training</td> <td align="center">small vocabulary ကနေ စတင်ပြီး tokens တွေကို merge လုပ်ဖို့ စည်းမျဉ်းတွေ သင်ယူ</td> <td align="center">small vocabulary ကနေ စတင်ပြီး tokens တွေကို merge လုပ်ဖို့ စည်းမျဉ်းတွေ သင်ယူ</td> <td align="center">large vocabulary ကနေ စတင်ပြီး tokens တွေကို ဖယ်ရှားဖို့ စည်းမျဉ်းတွေ သင်ယူ</td></tr> <tr><td align="center">Training step</td> <td align="center">အများဆုံး common pair နဲ့ ကိုက်ညီတဲ့ tokens တွေကို merge လုပ်</td> <td align="center">pair ရဲ့ frequency ပေါ် အခြေခံပြီး best score နဲ့ ကိုက်ညီတဲ့ tokens တွေကို merge လုပ်၊ တစ်ခုချင်းစီ token က less frequent ဖြစ်တဲ့ pairs တွေကို အလေးထား</td> <td align="center">whole corpus ပေါ်မှာ တွက်ချက်ထားတဲ့ loss ကို အနည်းဆုံးဖြစ်စေမယ့် vocabulary ထဲက tokens အားလုံးကို ဖယ်ရှား</td></tr> <tr><td align="center">Learns</td> <td align="center">Merge rules တွေနဲ့ vocabulary</td> <td align="center">Vocabulary တစ်ခုတည်း</td> <td align="center">token တစ်ခုစီအတွက် score ပါဝင်တဲ့ vocabulary</td></tr> <tr><td align="center">Encoding</td> <td align="center">word ကို characters တွေအဖြစ် ပိုင်းခြားပြီး training လုပ်နေစဉ် သင်ယူခဲ့တဲ့ merges တွေကို အသုံးပြု</td> <td align="center">vocabulary ထဲမှာရှိတဲ့ word ရဲ့ အစကနေ စတင်တဲ့ အရှည်ဆုံး 
subword ကို ရှာဖွေ၊ ပြီးတော့ word ရဲ့ ကျန်တာအတွက် အတူတူလုပ်</td> <td align="center">training လုပ်နေစဉ် သင်ယူခဲ့တဲ့ scores တွေကို အသုံးပြုပြီး tokens တွေအဖြစ် split လုပ်ဖို့ အများဆုံး ဖြစ်နိုင်ခြေရှိတာကို ရှာဖွေ</td></tr></tbody></table> <p data-svelte-h="svelte-96gsc9">အခု BPE ကို စတင်လေ့လာကြရအောင်!</p> <h2 class="relative group"><a id="ဝဟရ-ရငလငခက-glossary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဝဟရ-ရငလငခက-glossary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဝေါဟာရ ရှင်းလင်းချက် (Glossary)</span></h2> <ul data-svelte-h="svelte-1cikgj2"><li><strong>Subword Tokenization Algorithms</strong>: စကားလုံးများကို သေးငယ်သော subword units (ဥပမာ- word pieces, byte-pair encodings) များအဖြစ် ပိုင်းခြားသော tokenization နည်းလမ်းများ။ ၎င်းသည် vocabulary အရွယ်အစားကို ထိန်းချုပ်ရန်နှင့် out-of-vocabulary (OOV) ပြဿနာများကို ဖြေရှင်းရန် ကူညီပေးသည်။</li> <li><strong>Transformer Models</strong>: Natural Language Processing (NLP) မှာ အောင်မြင်မှုများစွာရရှိခဲ့တဲ့ deep learning architecture တစ်မျိုးပါ။</li> <li><strong>Byte-Pair Encoding (BPE)</strong>: Subword tokenization algorithm တစ်မျိုး။</li> <li><strong>WordPiece</strong>: 
Subword tokenization algorithm တစ်မျိုး။</li> <li><strong>Unigram</strong>: Subword tokenization algorithm တစ်မျိုး။</li> <li><strong>Preprocessing</strong>: ဒေတာများကို model က နားလည်ပြီး လုပ်ဆောင်နိုင်တဲ့ ပုံစံအဖြစ် ပြောင်းလဲပြင်ဆင်ခြင်း လုပ်ငန်းစဉ်။</li> <li><strong>Tokenization Pipeline</strong>: စာသားကို AI မော်ဒယ်များ လုပ်ဆောင်နိုင်သော ဂဏန်းဆိုင်ရာ ကိုယ်စားပြုမှုအဖြစ် ပြောင်းလဲရန် လိုအပ်သော အဆင့်များ (ဥပမာ- normalization, pre-tokenization, tokenization, post-processing)။</li> <li><strong>Normalization</strong>: စာသားကို သန့်ရှင်းရေးလုပ်ခြင်း (ဥပမာ- needless whitespace ဖယ်ရှားခြင်း၊ lowercasing, accents ဖယ်ရှားခြင်း)။</li> <li><strong>Pre-tokenization</strong>: Subword tokenization မလုပ်ဆောင်မီ စာသားကို ပိုမိုသေးငယ်သော entities (ဥပမာ- words) အဖြစ် အကြိုပိုင်းခြားခြင်း။</li> <li><strong>Whitespace</strong>: စာသားများကြားရှိ နေရာလွတ်များ (space, tab, newline)။</li> <li><strong>Lowercasing</strong>: စာလုံးများကို အသေးစာလုံးများအဖြစ် ပြောင်းလဲခြင်း။</li> <li><strong>Accents</strong>: စာလုံးများပေါ်တွင် တင်ရှိသော အသံထွက် သင်္ကေတများ (ဥပမာ- é, ü)။</li> <li><strong>Unicode Normalization</strong>: Unicode character များကို တူညီသော စံနှုန်းတစ်ခုအဖြစ် ပြောင်းလဲခြင်း။</li> <li><strong>NFC/NFKC</strong>: Unicode normalization forms များ။</li> <li><strong>🤗 Transformers <code>tokenizer</code></strong>: Hugging Face Transformers library မှ tokenizer object။</li> <li><strong><code>backend_tokenizer</code> Attribute</strong>: <code>tokenizer</code> object မှ underlying tokenizer (🤗 Tokenizers library က) ကို ဝင်ရောက်ကြည့်ရှုနိုင်စေသော attribute။</li> <li><strong>🤗 Tokenizers Library</strong>: Rust ဘာသာနဲ့ ရေးသားထားတဲ့ Hugging Face library တစ်ခုဖြစ်ပြီး မြန်ဆန်ထိရောက်တဲ့ tokenization ကို လုပ်ဆောင်ပေးသည်။</li> <li><strong><code>tokenizers.Tokenizer</code></strong>: 🤗 Tokenizers library မှ base tokenizer class။</li> <li><strong><code>normalizer</code> Attribute</strong>: <code>tokenizer</code> object မှ normalization ကို လုပ်ဆောင်ပေးသော object။</li> 
<li><strong><code>normalize_str()</code> Method</strong>: Normalizer object မှ string တစ်ခုကို normalize လုပ်သော method။</li> <li><strong><code>bert-base-uncased</code></strong>: BERT model ၏ base version အတွက် checkpoint identifier (uncased version)။</li> <li><strong>Cased/Uncased Versions</strong>: cased version က စာလုံးအကြီးအသေးကို ခွဲခြားပြီး uncased version က စာလုံးအကြီးအသေးကို မခွဲခြားပါ။</li> <li><strong>Raw Text</strong>: မည်သည့် preprocessing မျှ မလုပ်ဆောင်ရသေးသော စာသား။</li> <li><strong>Words</strong>: စာသားကို ပိုင်းခြားထားသော အခြေခံယူနစ်များ။</li> <li><strong>Boundaries</strong>: အရာနှစ်ခုကြား ပိုင်းခြားထားသော အစွန်းများ။</li> <li><strong>Subtokens</strong>: Subword tokenization ပြီးနောက် ရရှိသော သေးငယ်သော tokens များ။</li> <li><strong>Word-based Tokenizer</strong>: စကားလုံးများကို အခြေခံ၍ စာသားကို ပိုင်းခြားသော tokenizer။</li> <li><strong>Punctuation</strong>: စာသားများတွင် အသုံးပြုသော သတ်ပုံအမှတ်အသားများ (ဥပမာ- comma, period, question mark)။</li> <li><strong><code>pre_tokenizer</code> Attribute</strong>: <code>tokenizer</code> object မှ pre-tokenization ကို လုပ်ဆောင်ပေးသော object။</li> <li><strong><code>pre_tokenize_str()</code> Method</strong>: Pre-tokenizer object မှ string တစ်ခုကို pre-tokenize လုပ်သော method။</li> <li><strong>Offsets</strong>: token တစ်ခုစီသည် မူရင်းစာသား၏ မည်သည့်စတင်ခြင်းနှင့် အဆုံးသတ် character index များကြားတွင် ရှိနေသည်ကို ဖော်ပြသော map။</li> <li><strong>Offset Mapping</strong>: token တစ်ခုစီသည် မူရင်းစာသား၏ မည်သည့်စတင်ခြင်းနှင့် အဆုံးသတ် character index များကြားတွင် ရှိနေသည်ကို ဖော်ပြသော map။</li> <li><strong>BERT Tokenizer</strong>: BERT model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong>GPT-2 Tokenizer</strong>: GPT-2 model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong><code>gpt2</code></strong>: GPT-2 model ၏ identifier။</li> <li><strong><code>Ġ</code> Symbol</strong>: GPT-2 tokenizer တွင် space ကို ကိုယ်စားပြုသော symbol။</li> <li><strong>Decode Tokens</strong>: tokens များကို မူရင်း text အဖြစ် ပြန်ပြောင်းခြင်း။</li> 
<li><strong>T5 Tokenizer</strong>: T5 model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong><code>t5-small</code></strong>: T5 model ၏ small version အတွက် identifier။</li> <li><strong>Reversible Tokenization</strong>: tokens များကို decode လုပ်တဲ့အခါ မူရင်း text (သို့မဟုတ် normalized text) ကို ပြန်လည်ရယူနိုင်သော tokenization အမျိုးအစား။</li> <li><strong>Normalized Text</strong>: Normalization လုပ်ထားသော စာသား။</li> <li><strong>Vocabulary</strong>: tokenizer သို့မဟုတ် model တစ်ခုက သိရှိနားလည်ပြီး ကိုင်တွယ်နိုင်သော ထူးခြားသည့် tokens များ စုစုပေါင်း။</li> <li><strong>Merge Rules</strong>: BPE နှင့် WordPiece algorithm များတွင် tokens များကို ပေါင်းစပ်ရန် သင်ယူထားသော စည်းမျဉ်းများ။</li> <li><strong>Corpus (Training)</strong>: Model သို့မဟုတ် tokenizer ကို လေ့ကျင့်ရန် အသုံးပြုသော စာသားအစုအဝေးကြီး။</li> <li><strong>Score (Unigram)</strong>: Unigram algorithm တွင် token တစ်ခုစီနှင့် ဆက်စပ်နေသော တန်ဖိုး။</li> <li><strong>Loss</strong>: Model ၏ ခန့်မှန်းချက်များနှင့် အမှန်တကယ် labels များကြား ကွာခြားမှုကို တိုင်းတာသော တန်ဖိုး။</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/my/chapter6/4.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Client bootstrap: publish the runtime path settings, load the two entry
		// modules, then start the app on the element that contains this script.

		// Assigned without a declaration on purpose, so the settings object is
		// reachable as a global under this build-specific name.
		__sveltekit_tyugt6 = {
			assets: "/docs/course/pr_1114/my",
			base: "/docs/course/pr_1114/my",
			env: {}
		};

		// Capture the mount point synchronously, while this <script> is still the
		// one being executed; document.currentScript is not usable from the
		// promise callback below.
		const mountPoint = document.currentScript.parentElement;

		// Data array handed to kit.start alongside node_ids; both slots are null.
		const nodeData = [null, null];

		// Kick off both dynamic imports immediately so they load in parallel.
		const entryModules = [
			import("/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js"),
			import("/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js")
		];

		// Arrow function bound to a const (not a function declaration) so that no
		// extra name leaks out of this block scope.
		const boot = ([kit, app]) => {
			kit.start(app, mountPoint, {
				node_ids: [0, 49],
				data: nodeData,
				form: null,
				error: null
			});
		};

		Promise.all(entryModules).then(boot);
	}
	</script>

Xet Storage Details

Size:: 59.9 kB
Xet hash:: 2ad6ebd8c144ca924fdfc05af27c5ea5d4a7a4a50a5465266e33f16832a0ae25

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.