Buckets:

rtrm's picture
download
raw
183 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Tokenizer တစ်ခုကို အဆင့်ဆင့် တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-tokenizer-block-by-block&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Corpus တစ်ခု ရယူခြင်း&quot;,&quot;local&quot;:&quot;acquiring-a-corpus&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ WordPiece Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-wordpiece-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ BPE Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-bpe-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ Unigram Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-unigram-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1114/my/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/scheduler.893fe8c9.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/singletons.10fda3ce.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.bce52c8a.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/paths.89c82153.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/preload-helper.b1a719fd.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.b1df2166.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/0.510afdc1.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/53.3b37bc16.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.762ed9cc.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/Youtube.ec5d7916.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CodeBlock.6cef0479.js">
<link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CourseFloatingBanner.c1c08878.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Tokenizer တစ်ခုကို အဆင့်ဆင့် တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-tokenizer-block-by-block&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Corpus တစ်ခု ရယူခြင်း&quot;,&quot;local&quot;:&quot;acquiring-a-corpus&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ WordPiece Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-wordpiece-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ BPE Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-bpe-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ Unigram Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-unigram-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 
max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="building-a-tokenizer-block-by-block" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-tokenizer-block-by-block"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 
1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tokenizer တစ်ခုကို အဆင့်ဆင့် တည်ဆောက်ခြင်း</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0" style=""><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a 
href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section8.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section8.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-141i56c">ယခင်အပိုင်းတွေမှာ ကျွန်တော်တို့ တွေ့ခဲ့ရတဲ့အတိုင်း၊ tokenization မှာ အဆင့်များစွာ ပါဝင်ပါတယ်။</p> <ul data-svelte-h="svelte-x7mng"><li>Normalization (မလိုအပ်တဲ့ spaces တွေ ဒါမှမဟုတ် accents တွေ ဖယ်ရှားတာ၊ Unicode normalization စတာတွေလိုမျိုး လိုအပ်တယ်လို့ ယူဆရတဲ့ text ကို သန့်ရှင်းရေးလုပ်ခြင်း)</li> <li>Pre-tokenization (input ကို words တွေအဖြစ် ပိုင်းခြားခြင်း)</li> <li>inputs ကို model ကနေတစ်ဆင့် run ခြင်း (pre-tokenize လုပ်ထားတဲ့ words တွေကို အသုံးပြုပြီး tokens sequence တစ်ခု ထုတ်လုပ်ခြင်း)</li> <li>Post-processing (tokenizer ရဲ့ special tokens တွေ ထည့်သွင်းခြင်း၊ attention mask နဲ့ token type IDs တွေ ထုတ်လုပ်ခြင်း)</li></ul> <p data-svelte-h="svelte-mxtm6p">သတိရစေရန်၊ ဒီမှာ overall process ကို ထပ်ကြည့်ရအောင်။</p> <div class="flex justify-center" data-svelte-h="svelte-oxfng3"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline.svg" alt="The tokenization pipeline."> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline-dark.svg" alt="The tokenization pipeline."></div> <p data-svelte-h="svelte-1spiys7">🤗 Tokenizers library ကို အဲဒီအဆင့်တစ်ခုစီအတွက် ရွေးချယ်စရာများစွာ ပံ့ပိုးပေးဖို့ တည်ဆောက်ထားပြီး၊ သင်ဟာ ဒါတွေကို ရောနှောပြီး ပေါင်းစပ်နိုင်ပါတယ်။ ဒီအပိုင်းမှာ ကျွန်တော်တို့ <a href="/course/chapter6/2">အပိုင်း ၂</a> မှာ လုပ်ခဲ့သလိုဟောင်းနွမ်းတဲ့ tokenizer 
တစ်ခုကနေ tokenizer အသစ်တစ်ခုကို train လုပ်မယ့်အစား၊ tokenizer တစ်ခုကို အစကနေ ဘယ်လိုတည်ဆောက်နိုင်လဲဆိုတာ ကြည့်ရပါမယ်။ အဲဒီအခါ သင်စိတ်ကူးနိုင်တဲ့ မည်သည့် tokenizer အမျိုးအစားကိုမဆို တည်ဆောက်နိုင်ပါလိမ့်မယ်။</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/MR8tZm5ViWU" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1u5uy4g">ပိုပြီးတိတိကျကျပြောရရင်၊ library ကို ဗဟိုချက်မဖြစ်တဲ့ <code>Tokenizer</code> class ကိုယ်တိုင်နဲ့ building blocks တွေကို submodule တွေအဖြစ် အုပ်စုဖွဲ့ထားပါတယ်။</p> <ul data-svelte-h="svelte-1tg4kr"><li><code>normalizers</code> မှာ သင်အသုံးပြုနိုင်တဲ့ <code>Normalizer</code> အမျိုးအစားအားလုံး (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/normalizers" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>pre_tokenizers</code> မှာ သင်အသုံးပြုနိုင်တဲ့ <code>PreTokenizer</code> အမျိုးအစားအားလုံး (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/pre-tokenizers" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>models</code> မှာ <code>BPE</code>, <code>WordPiece</code>, နဲ့ <code>Unigram</code> လိုမျိုး သင်အသုံးပြုနိုင်တဲ့ <code>Model</code> အမျိုးအစားမျိုးစုံ (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/models" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>trainers</code> မှာ သင် corpus တစ်ခုပေါ်မှာ model ကို train လုပ်ဖို့ အသုံးပြုနိုင်တဲ့ <code>Trainer</code> အမျိုးအစားအမျိုးမျိုး (model အမျိုးအစားတစ်ခုစီအတွက် တစ်ခုစီ၊ အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/trainers" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>post_processors</code> မှာ သင်အသုံးပြုနိုင်တဲ့ <code>PostProcessor</code> အမျိုးအစားမျိုးစုံ (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/post-processors" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) 
ပါဝင်ပါတယ်။</li> <li><code>decoders</code> မှာ tokenization ရဲ့ outputs တွေကို decode လုပ်ဖို့ သင်အသုံးပြုနိုင်တဲ့ <code>Decoder</code> အမျိုးအစားမျိုးစုံ (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/components#decoders" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li></ul> <p data-svelte-h="svelte-1y9rqvk">building blocks တွေရဲ့ စာရင်းအပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/components" rel="nofollow">ဒီနေရာမှာ</a> ရှာတွေ့နိုင်ပါတယ်။</p> <h2 class="relative group"><a id="acquiring-a-corpus" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#acquiring-a-corpus"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Corpus တစ်ခု ရယူခြင်း</span></h2> <p data-svelte-h="svelte-wo9qzi">ကျွန်တော်တို့ရဲ့ tokenizer အသစ်ကို train လုပ်ဖို့အတွက်၊ သေးငယ်တဲ့ text corpus တစ်ခုကို အသုံးပြုပါမယ် (ဒါမှ ဥပမာတွေက မြန်မြန်ဆန်ဆန် run မှာပါ)။ corpus ရယူခြင်းအဆင့်တွေက <a href="/course/chapter6/2">ဒီအခန်းရဲ့ အစပိုင်း</a> မှာ ကျွန်တော်တို့ လုပ်ခဲ့တဲ့အဆင့်တွေနဲ့ ဆင်တူပါတယ်။ ဒါပေမယ့် ဒီတစ်ကြိမ်မှာတော့ <a href="https://huggingface.co/datasets/wikitext" rel="nofollow">WikiText-2</a> dataset ကို အသုံးပြုပါမယ်။</p> <div 
class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
dataset = load_dataset(<span class="hljs-string">&quot;wikitext&quot;</span>, name=<span class="hljs-string">&quot;wikitext-2-raw-v1&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_training_corpus</span>():
    <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(dataset), <span class="hljs-number">1000</span>):
        <span class="hljs-keyword">yield</span> dataset[i : i + <span class="hljs-number">1000</span>][<span class="hljs-string">&quot;text&quot;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1skrsoe"><code>get_training_corpus()</code> function က batches of 1,000 texts တွေကို yield လုပ်မယ့် generator တစ်ခုဖြစ်ပြီး၊ ဒါတွေကို tokenizer ကို train ဖို့ ကျွန်တော်တို့ အသုံးပြုပါမယ်။</p> <p data-svelte-h="svelte-1biyl0i">🤗 Tokenizers တွေကို text files တွေပေါ်မှာ တိုက်ရိုက် train လုပ်နိုင်ပါတယ်။ WikiText-2 ကနေ inputs/texts တွေအားလုံး ပါဝင်တဲ့ text file တစ်ခုကို locally အသုံးပြုနိုင်အောင် ဘယ်လို generate လုပ်ရမလဲဆိုတာ ဒီမှာ ဖော်ပြထားပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(<span 
class="hljs-string">&quot;wikitext-2.txt&quot;</span>, <span class="hljs-string">&quot;w&quot;</span>, encoding=<span class="hljs-string">&quot;utf-8&quot;</span>) <span class="hljs-keyword">as</span> f:
    <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(dataset)):
        f.write(dataset[i][<span class="hljs-string">&quot;text&quot;</span>] + <span class="hljs-string">&quot;\n&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17nrx09">နောက်မှာတော့ သင့်ကိုယ်ပိုင် BERT, GPT-2, နဲ့ XLNet tokenizers တွေကို အဆင့်ဆင့် ဘယ်လိုတည်ဆောက်ရမလဲဆိုတာ ပြသပေးပါမယ်။ ဒါက ကျွန်တော်တို့ကို အဓိက tokenization algorithms သုံးခုဖြစ်တဲ့ WordPiece, BPE, နဲ့ Unigram တို့ရဲ့ ဥပမာတစ်ခုစီကို ပေးပါလိမ့်မယ်။ BERT နဲ့ စတင်ကြရအောင်။</p> <h2 class="relative group"><a id="building-a-wordpiece-tokenizer-from-scratch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-wordpiece-tokenizer-from-scratch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အစကနေ WordPiece Tokenizer တစ်ခု တည်ဆောက်ခြင်း</span></h2> <p data-svelte-h="svelte-19mkw5x">🤗 Tokenizers library နဲ့ tokenizer တစ်ခုတည်ဆောက်ဖို့အတွက်၊ ကျွန်တော်တို့ဟာ <code>models</code> တစ်ခုနဲ့ <code>Tokenizer</code> object တစ်ခုကို instantiate လုပ်ခြင်းဖြင့် စတင်ပြီး၊ ၎င်းရဲ့ <code>normalizer</code>, <code>pre_tokenizer</code>, <code>post_processor</code>, နဲ့ <code>decoder</code> attributes တွေကို ကျွန်တော်တို့ လိုချင်တဲ့ တန်ဖိုးတွေဆီ 
သတ်မှတ်ပေးပါတယ်။</p> <p data-svelte-h="svelte-1qm18z7">ဒီဥပမာအတွက်၊ WordPiece model တစ်ခုနဲ့ <code>Tokenizer</code> တစ်ခုကို ကျွန်တော်တို့ ဖန်တီးပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
tokenizer = Tokenizer(models.WordPiece(unk_token=<span class="hljs-string">&quot;[UNK]&quot;</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-r5m8ig"><code>unk_token</code> ကို သတ်မှတ်ပေးရမှာပါ။ ဒါမှ model က မမြင်ဖူးသေးတဲ့ characters တွေ ကြုံတွေ့ရတဲ့အခါ ဘာကို ပြန်ပေးရမလဲဆိုတာ သိမှာပါ။ ဒီနေရာမှာ ကျွန်တော်တို့ သတ်မှတ်နိုင်တဲ့ တခြား arguments တွေကတော့ model ရဲ့ <code>vocab</code> (ကျွန်တော်တို့ model ကို train မှာဖြစ်တဲ့အတွက် ဒါကို သတ်မှတ်ဖို့ မလိုအပ်ပါဘူး) နဲ့ <code>max_input_chars_per_word</code> (word တစ်ခုစီအတွက် အမြင့်ဆုံးအရှည်ကို သတ်မှတ်ပေးပြီး၊ ဒီတန်ဖိုးထက် ပိုရှည်တဲ့ words တွေကို ပိုင်းခြားပါလိမ့်မယ်) တို့ ပါဝင်ပါတယ်။</p> <p data-svelte-h="svelte-q1n86n">tokenization ရဲ့ ပထမအဆင့်က normalization ဖြစ်တာကြောင့်၊ အဲဒါနဲ့ စတင်ကြပါစို့။ BERT ကို အများအားဖြင့် အသုံးပြုတဲ့အတွက် BERT အတွက် သတ်မှတ်နိုင်တဲ့ classic options တွေနဲ့ <code>BertNormalizer</code> တစ်ခုရှိပါတယ်၊ <code>lowercase</code> နဲ့ <code>strip_accents</code> က ရှင်းပြစရာမလိုပါဘူး၊ <code>clean_text</code> က control characters အားလုံးကို ဖယ်ရှားပြီး ထပ်နေတဲ့ spaces တွေကို တစ်ခုတည်းနဲ့ အစားထိုးပါတယ်၊ <code>handle_chinese_chars</code> က Chinese characters တွေပတ်ပတ်လည်မှာ spaces တွေ ထည့်ပေးပါတယ်။ <code>bert-base-uncased</code> tokenizer ကို ပြန်လည်ထုတ်လုပ်ဖို့၊ ဒီ normalizer ကို သတ်မှတ်ပေးနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect 
fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.normalizer = normalizers.BertNormalizer(lowercase=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hqg2ho">သို့သော်လည်း၊ အထွေထွေအားဖြင့်၊ tokenizer အသစ်တစ်ခု တည်ဆောက်တဲ့အခါ 🤗 Tokenizers library ထဲမှာ အကောင်အထည်ဖော်ထားပြီးသား ဒီလိုအသုံးဝင်တဲ့ normalizer ကို သင်ရရှိမှာ မဟုတ်ပါဘူး၊ ဒါကြောင့် BERT normalizer ကို ကိုယ်တိုင် ဘယ်လိုဖန်တီးရမလဲဆိုတာ ကြည့်ရအောင်။ library က <code>Lowercase</code> normalizer နဲ့ <code>StripAccents</code> normalizer ကို ပံ့ပိုးပေးပြီး၊ <code>Sequence</code> ကို အသုံးပြုပြီး normalizers များစွာကို ပေါင်းစပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow 
left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.normalizer = normalizers.<span class="hljs-type">Sequence</span>(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lw62pa">ကျွန်တော်တို့ <code>NFD</code> Unicode normalizer ကိုလည်း အသုံးပြုနေပါတယ်၊ ဘာလို့လဲဆိုတော့ မဟုတ်ရင် <code>StripAccents</code> normalizer က accented characters တွေကို မှန်ကန်စွာ မှတ်မိမှာ မဟုတ်ဘဲ ၎င်းတို့ကို ဖယ်ရှားနိုင်မှာ မဟုတ်ပါဘူး။</p> <p data-svelte-h="svelte-136j7i5">အရင်က တွေ့ခဲ့ရတဲ့အတိုင်း၊ <code>normalizer</code> ရဲ့ <code>normalize_str()</code> method ကို အသုံးပြုပြီး ပေးထားတဲ့ text ပေါ်မှာ ဒါက ဘယ်လိုအကျိုးသက်ရောက်မှု ရှိလဲဆိုတာ ကြည့်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(tokenizer.normalizer.normalize_str(<span class="hljs-string">&quot;Héllò hôw are ü?&quot;</span>))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button 
class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->hello how are u?<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-13a6mef"><p><strong>ဆက်လက်လေ့လာရန်</strong> ယခင် normalizers ၏ versions နှစ်ခုကို unicode character <code>u&quot;\u0085&quot;</code> ပါဝင်သော string တစ်ခုပေါ်တွင် စမ်းသပ်ပါက၊ ဤ normalizers နှစ်ခုသည် အတိအကျတူညီခြင်းမရှိသည်ကို သင်သတိထားမိပါလိမ့်မည်။
<code>normalizers.Sequence</code> ပါသော version ကို အလွန်အမင်း ရှုပ်ထွေးမှုမဖြစ်စေရန်၊ <code>clean_text</code> argument ကို <code>True</code> (၎င်းသည် default behavior ဖြစ်သည်) ဟု သတ်မှတ်ထားသောအခါ <code>BertNormalizer</code> လိုအပ်သည့် Regex replacements များကို ကျွန်ုပ်တို့ ထည့်သွင်းမထားပါ။ သို့သော် စိတ်မပူပါနှင့်၊ အသုံးဝင်သော <code>BertNormalizer</code> ကို အသုံးမပြုဘဲ <code>normalizers.Replace</code> နှစ်ခုကို normalizers sequence တွင် ထပ်ထည့်ခြင်းဖြင့် အတိအကျတူညီသော normalization ကို ရရှိနိုင်ပါသည်။</p></blockquote> <p data-svelte-h="svelte-b1cmuk">နောက်တစ်ခုက pre-tokenization အဆင့်ပါ။ ထပ်မံပြီး၊ ကျွန်တော်တို့ အသုံးပြုနိုင်တဲ့ prebuilt <code>BertPreTokenizer</code> တစ်ခုရှိပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()<!-- HTML_TAG_END --></pre></div> <p 
data-svelte-h="svelte-1jfyjke">ဒါမှမဟုတ် အစကနေ တည်ဆောက်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-wm20ec"><code>Whitespace</code> pre-tokenizer က whitespace နဲ့ letters, digits, ဒါမှမဟုတ် underscore character မဟုတ်တဲ့ characters အားလုံးကို ပိုင်းခြားတာကြောင့်၊ ဒါက နည်းပညာအရ whitespace နဲ့ punctuation တွေပေါ်မှာ ပိုင်းခြားတယ်ဆိုတာ သတိပြုပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" 
fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">&quot;Let&#x27;s test my pre-tokenizer.&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 
transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">&#x27;Let&#x27;</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">&quot;&#x27;&quot;</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">4</span>)), (<span class="hljs-string">&#x27;s&#x27;</span>, (<span class="hljs-number">4</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">&#x27;test&#x27;</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">&#x27;my&#x27;</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span class="hljs-string">&#x27;pre&#x27;</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">17</span>)),
(<span class="hljs-string">&#x27;-&#x27;</span>, (<span class="hljs-number">17</span>, <span class="hljs-number">18</span>)), (<span class="hljs-string">&#x27;tokenizer&#x27;</span>, (<span class="hljs-number">18</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">&#x27;.&#x27;</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ioiki1">အကယ်၍ သင်ဟာ whitespace ပေါ်မှာပဲ ပိုင်းခြားချင်တယ်ဆိုရင်၊ <code>WhitespaceSplit</code> pre-tokenizer ကို အစားအသုံးပြုသင့်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pre_tokenizer = pre_tokenizers.WhitespaceSplit()
pre_tokenizer.pre_tokenize_str(<span class="hljs-string">&quot;Let&#x27;s test my pre-tokenizer.&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">&quot;Let&#x27;s&quot;</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">&#x27;test&#x27;</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">&#x27;my&#x27;</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span class="hljs-string">&#x27;pre-tokenizer.&#x27;</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">28</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hx4h33">normalizers တွေနဲ့ တူတူပဲ၊ pre-tokenizers များစွာကို 
ပေါင်းစပ်ဖို့ <code>Sequence</code> ကို အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pre_tokenizer = pre_tokenizers.<span class="hljs-type">Sequence</span>(
[pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)
pre_tokenizer.pre_tokenize_str(<span class="hljs-string">&quot;Let&#x27;s test my pre-tokenizer.&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">&#x27;Let&#x27;</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">&quot;&#x27;&quot;</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">4</span>)), (<span class="hljs-string">&#x27;s&#x27;</span>, (<span class="hljs-number">4</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">&#x27;test&#x27;</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">&#x27;my&#x27;</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span 
class="hljs-string">&#x27;pre&#x27;</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">17</span>)),
(<span class="hljs-string">&#x27;-&#x27;</span>, (<span class="hljs-number">17</span>, <span class="hljs-number">18</span>)), (<span class="hljs-string">&#x27;tokenizer&#x27;</span>, (<span class="hljs-number">18</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">&#x27;.&#x27;</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-h2yxwq">tokenization pipeline ရဲ့ နောက်တစ်ဆင့်က inputs တွေကို model ကနေတစ်ဆင့် run ခြင်းပါ။ ကျွန်တော်တို့ model ကို initialization မှာ သတ်မှတ်ထားပြီးသားဖြစ်ပေမယ့်၊ ဒါကို train လုပ်ဖို့တော့ လိုအပ်ပါသေးတယ်။ ဒါအတွက် <code>WordPieceTrainer</code> လိုအပ်ပါလိမ့်မယ်။ 🤗 Tokenizers မှာ trainer တစ်ခုကို instantiate လုပ်တဲ့အခါ မှတ်ထားရမယ့် အဓိကအချက်ကတော့ သင်အသုံးပြုဖို့ ရည်ရွယ်ထားတဲ့ special tokens အားလုံးကို ၎င်းဆီ ပေးဖို့ လိုအပ်ပါတယ် — မဟုတ်ရင် ၎င်းတို့ training corpus ထဲမှာ မပါဝင်တဲ့အတွက် vocabulary ထဲကို ထည့်သွင်းပေးမှာ မဟုတ်ပါဘူး။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 
w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->special_tokens = [<span class="hljs-string">&quot;[UNK]&quot;</span>, <span class="hljs-string">&quot;[PAD]&quot;</span>, <span class="hljs-string">&quot;[CLS]&quot;</span>, <span class="hljs-string">&quot;[SEP]&quot;</span>, <span class="hljs-string">&quot;[MASK]&quot;</span>]
trainer = trainers.WordPieceTrainer(vocab_size=<span class="hljs-number">25000</span>, special_tokens=special_tokens)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2i9ubf"><code>vocab_size</code> နဲ့ <code>special_tokens</code> ကို သတ်မှတ်ခြင်းအပြင်၊ <code>min_frequency</code> (token တစ်ခု vocabulary ထဲမှာ ပါဝင်ဖို့ ဘယ်အကြိမ်ရေ အနည်းဆုံး ပေါ်လာရမလဲ) ကို သတ်မှတ်နိုင်ပါတယ် ဒါမှမဟုတ် <code>continuing_subword_prefix</code> ကို ပြောင်းလဲနိုင်ပါတယ် (ကျွန်တော်တို့ <code>##</code> နဲ့ မတူတာတစ်ခုကို အသုံးပြုချင်ရင်)။</p> <p data-svelte-h="svelte-1p6vkx5">ကျွန်တော်တို့ အစောပိုင်းက သတ်မှတ်ခဲ့တဲ့ iterator ကို အသုံးပြုပြီး model ကို train လုပ်ဖို့အတွက်၊ ဒီ command ကို run ရုံပါပဲ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)<!-- HTML_TAG_END 
--></pre></div> <p data-svelte-h="svelte-46cjj3">ကျွန်တော်တို့ tokenizer ကို train လုပ်ဖို့ text files တွေကိုလည်း အသုံးပြုနိုင်ပါတယ်။ ဒါက အောက်ပါအတိုင်း ဖြစ်ပါလိမ့်မယ် (ကျွန်တော်တို့ model ကို အရင်ဆုံး empty <code>WordPiece</code> တစ်ခုနဲ့ reinitialize လုပ်ပါတယ်)-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.model = models.WordPiece(unk_token=<span class="hljs-string">&quot;[UNK]&quot;</span>)
tokenizer.train([<span class="hljs-string">&quot;wikitext-2.txt&quot;</span>], trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9gaf68">ကိစ္စနှစ်ခုလုံးမှာ၊ <code>encode()</code> method ကို ခေါ်ခြင်းဖြင့် tokenizer ကို text တစ်ခုပေါ်မှာ စမ်းသပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">&quot;Let&#x27;s test this tokenizer.&quot;</span>)
<span class="hljs-built_in">print</span>(encoding.tokens)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;let&#x27;</span>, <span class="hljs-string">&quot;&#x27;&quot;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;test&#x27;</span>, <span class="hljs-string">&#x27;this&#x27;</span>, <span class="hljs-string">&#x27;tok&#x27;</span>, <span class="hljs-string">&#x27;##eni&#x27;</span>, <span class="hljs-string">&#x27;##zer&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jy6r65">ရရှိတဲ့ <code>encoding</code> ဟာ <code>Encoding</code> တစ်ခုဖြစ်ပြီး၊ ၎င်းရဲ့ attributes များစွာ ( <code>ids</code>, <code>type_ids</code>, <code>tokens</code>, 
<code>offsets</code>, <code>attention_mask</code>, <code>special_tokens_mask</code>, နဲ့ <code>overflowing</code>) ထဲမှာ tokenizer ရဲ့ လိုအပ်တဲ့ outputs အားလုံး ပါဝင်ပါတယ်။</p> <p data-svelte-h="svelte-15p3n4u">tokenization pipeline ရဲ့ နောက်ဆုံးအဆင့်က post-processing ပါ။ ကျွန်တော်တို့ <code>[CLS]</code> token ကို အစမှာ ထည့်သွင်းဖို့နဲ့ <code>[SEP]</code> token ကို အဆုံးမှာ ထည့်သွင်းဖို့ လိုအပ်ပါတယ် (ဒါမှမဟုတ် sentence pair ရှိရင် sentence တစ်ခုစီရဲ့ နောက်မှာ)။ ဒါအတွက် <code>TemplateProcessing</code> ကို ကျွန်တော်တို့ အသုံးပြုပါမယ်၊ ဒါပေမယ့် ပထမဆုံး vocabulary ထဲက <code>[CLS]</code> နဲ့ <code>[SEP]</code> tokens တွေရဲ့ IDs တွေကို သိဖို့ လိုအပ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->cls_token_id = tokenizer.token_to_id(<span class="hljs-string">&quot;[CLS]&quot;</span>)
sep_token_id = tokenizer.token_to_id(<span class="hljs-string">&quot;[SEP]&quot;</span>)
<span class="hljs-built_in">print</span>(cls_token_id, sep_token_id)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-number">2</span>, <span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gzy7xj"><code>TemplateProcessing</code> အတွက် template ကို ရေးဖို့၊ single sentence တစ်ခုနဲ့ sentence pair တစ်ခုကို ဘယ်လို ကိုင်တွယ်ရမယ်ဆိုတာ သတ်မှတ်ရပါမယ်။ နှစ်ခုလုံးအတွက်၊ ကျွန်တော်တို့ အသုံးပြုချင်တဲ့ special tokens တွေကို ရေးပါတယ်၊ ပထမ (သို့မဟုတ် single) sentence ကို <code>$A</code> နဲ့ ကိုယ်စားပြုပြီး၊ ဒုတိယ sentence (pair တစ်ခုကို encoding လုပ်ရင်) ကို <code>$B</code> နဲ့ ကိုယ်စားပြုပါတယ်။ ဒါတွေတစ်ခုစီ (special tokens နဲ့ sentences) အတွက်၊ colon နောက်မှာ သက်ဆိုင်ရာ token type ID ကိုလည်း သတ်မှတ်ပါတယ်။</p> <p data-svelte-h="svelte-9e75ys">classic 
BERT template ကို အောက်ပါအတိုင်း သတ်မှတ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.post_processor = processors.TemplateProcessing(
single=<span class="hljs-string">f&quot;[CLS]:0 $A:0 [SEP]:0&quot;</span>,
pair=<span class="hljs-string">f&quot;[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1&quot;</span>,
special_tokens=[(<span class="hljs-string">&quot;[CLS]&quot;</span>, cls_token_id), (<span class="hljs-string">&quot;[SEP]&quot;</span>, sep_token_id)],
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rzgr0m">special tokens တွေရဲ့ IDs တွေကို ပေးပို့ဖို့ လိုအပ်တယ်ဆိုတာ သတိပြုပါ။ ဒါမှ tokenizer က ၎င်းတို့ကို ၎င်းတို့ရဲ့ IDs တွေအဖြစ် မှန်ကန်စွာ ပြောင်းလဲနိုင်မှာပါ။</p> <p data-svelte-h="svelte-i1t2lx">ဒါကို ထည့်သွင်းပြီးတာနဲ့၊ ကျွန်တော်တို့ရဲ့ ယခင်ဥပမာကို ပြန်ကြည့်မယ်ဆိုရင် အောက်ပါရလဒ်ကို ရရှိပါလိမ့်မယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">&quot;Let&#x27;s test this tokenizer.&quot;</span>)
<span class="hljs-built_in">print</span>(encoding.tokens)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;[CLS]&#x27;</span>, <span class="hljs-string">&#x27;let&#x27;</span>, <span class="hljs-string">&quot;&#x27;&quot;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;test&#x27;</span>, <span class="hljs-string">&#x27;this&#x27;</span>, <span class="hljs-string">&#x27;tok&#x27;</span>, <span class="hljs-string">&#x27;##eni&#x27;</span>, <span class="hljs-string">&#x27;##zer&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;[SEP]&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17sob19">ပြီးတော့ sentence pair တစ်ခုပေါ်မှာဆိုရင်၊ မှန်ကန်တဲ့ ရလဒ်ကို 
ကျွန်တော်တို့ ရရှိပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">&quot;Let&#x27;s test this tokenizer...&quot;</span>, <span class="hljs-string">&quot;on a pair of sentences.&quot;</span>)
<span class="hljs-built_in">print</span>(encoding.tokens)
<span class="hljs-built_in">print</span>(encoding.type_ids)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;[CLS]&#x27;</span>, <span class="hljs-string">&#x27;let&#x27;</span>, <span class="hljs-string">&quot;&#x27;&quot;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;test&#x27;</span>, <span class="hljs-string">&#x27;this&#x27;</span>, <span class="hljs-string">&#x27;tok&#x27;</span>, <span class="hljs-string">&#x27;##eni&#x27;</span>, <span class="hljs-string">&#x27;##zer&#x27;</span>, <span class="hljs-string">&#x27;...&#x27;</span>, <span class="hljs-string">&#x27;[SEP]&#x27;</span>, <span class="hljs-string">&#x27;on&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span 
class="hljs-string">&#x27;pair&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;sentences&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;[SEP]&#x27;</span>]
[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xtsg0m">ဒီ tokenizer ကို အစကနေ တည်ဆောက်တာ နီးပါးပြီးစီးပါပြီ — နောက်ဆုံးအဆင့်က decoder တစ်ခု ထည့်သွင်းဖို့ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre 
class=""><!-- HTML_TAG_START -->tokenizer.decoder = decoders.WordPiece(prefix=<span class="hljs-string">&quot;##&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jeksy2">ကျွန်တော်တို့ရဲ့ ယခင် <code>encoding</code> ပေါ်မှာ စမ်းသပ်ကြည့်ရအောင်…</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decode(encoding.ids)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&quot;let&#x27;s test this tokenizer... on a pair of sentences.&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-s3e4l1">ကောင်းပါပြီ! ကျွန်တော်တို့ရဲ့ tokenizer ကို ဒီလို JSON file တစ်ခုတည်းမှာ သိမ်းဆည်းနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div 
class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.save(<span class="hljs-string">&quot;tokenizer.json&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gsf4pl">အဲဒီနောက် <code>from_file()</code> method နဲ့ <code>Tokenizer</code> object တစ်ခုထဲကို အဲဒီ file ကို ပြန်လည် load လုပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->new_tokenizer = Tokenizer.from_file(<span class="hljs-string">&quot;tokenizer.json&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-saxts2">ဒီ tokenizer ကို 🤗 Transformers မှာ အသုံးပြုဖို့အတွက်၊ ကျွန်တော်တို့ဟာ ဒါကို 
<code>PreTrainedTokenizerFast</code> ထဲမှာ wrap လုပ်ရပါမယ်။ ကျွန်တော်တို့ဟာ generic class ကို အသုံးပြုနိုင်ပါတယ် ဒါမှမဟုတ် ကျွန်တော်တို့ရဲ့ tokenizer က လက်ရှိ model တစ်ခုနဲ့ ကိုက်ညီတယ်ဆိုရင် အဲဒီ class ကို အသုံးပြုနိုင်ပါတယ် (ဒီနေရာမှာ <code>BertTokenizerFast</code>)။ သင်ဟာ ဒီသင်ခန်းစာကို tokenizer အသစ်တစ်ခု တည်ဆောက်ဖို့ အသုံးပြုတယ်ဆိုရင်၊ ပထမ option ကို အသုံးပြုရပါလိမ့်မယ်။</p> <p data-svelte-h="svelte-c5lp68">tokenizer ကို <code>PreTrainedTokenizerFast</code> ထဲမှာ wrap လုပ်ဖို့၊ ကျွန်တော်တို့ တည်ဆောက်ခဲ့တဲ့ tokenizer ကို <code>tokenizer_object</code> အဖြစ် ပေးနိုင်ပါတယ် ဒါမှမဟုတ် ကျွန်တော်တို့ သိမ်းဆည်းခဲ့တဲ့ tokenizer file ကို <code>tokenizer_file</code> အဖြစ် ပေးနိုင်ပါတယ်။ အဓိက မှတ်ထားရမယ့်အချက်ကတော့ ကျွန်တော်တို့ special tokens အားလုံးကို ကိုယ်တိုင် သတ်မှတ်ပေးရပါမယ်၊ ဘာလို့လဲဆိုတော့ အဲဒီ class က <code>tokenizer</code> object ကနေ ဘယ် token က mask token လဲ၊ <code>[CLS]</code> token လဲ စတာတွေကို မသိနိုင်လို့ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" 
style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    <span class="hljs-comment"># tokenizer_file=&quot;tokenizer.json&quot;, # You can load from the tokenizer file, alternatively</span>
    unk_token=<span class="hljs-string">&quot;[UNK]&quot;</span>,
    pad_token=<span class="hljs-string">&quot;[PAD]&quot;</span>,
    cls_token=<span class="hljs-string">&quot;[CLS]&quot;</span>,
    sep_token=<span class="hljs-string">&quot;[SEP]&quot;</span>,
    mask_token=<span class="hljs-string">&quot;[MASK]&quot;</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3muxye">အကယ်၍ သင်ဟာ သီးခြား tokenizer class တစ်ခု (ဥပမာ - <code>BertTokenizerFast</code>) ကို အသုံးပြုနေတယ်ဆိုရင်၊ default tokens တွေနဲ့ ကွာခြားတဲ့ special tokens တွေကိုသာ သတ်မှတ်ပေးဖို့ လိုအပ်ပါလိမ့်မယ် (ဒီနေရာမှာတော့ မရှိပါဘူး)။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> BertTokenizerFast
wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3pvusq">အဲဒီနောက် ဒီ tokenizer ကို တခြား 🤗 Transformers tokenizer တွေလိုပဲ သင်အသုံးပြုနိုင်ပါတယ်။ <code>save_pretrained()</code> method နဲ့ သိမ်းဆည်းနိုင်ပါတယ် ဒါမှမဟုတ် <code>push_to_hub()</code> method နဲ့ Hub ကို upload လုပ်နိုင်ပါတယ်။</p> <p data-svelte-h="svelte-1lyflqe">ကျွန်တော်တို့ WordPiece tokenizer တစ်ခုကို ဘယ်လိုတည်ဆောက်ရမယ်ဆိုတာ မြင်တွေ့ခဲ့ရပြီဆိုတော့၊ BPE tokenizer တစ်ခုအတွက်လည်း အတူတူလုပ်ကြစို့။ သင်အဆင့်တွေအားလုံးကို သိပြီးသားဖြစ်တဲ့အတွက် နည်းနည်းပိုမြန်မြန် သွားပါမယ်၊ ခြားနားချက်တွေကိုပဲ မီးမောင်းထိုးပြပါမယ်။</p> <h2 class="relative group"><a id="building-a-bpe-tokenizer-from-scratch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-bpe-tokenizer-from-scratch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အစကနေ BPE Tokenizer တစ်ခု တည်ဆောက်ခြင်း</span></h2> <p data-svelte-h="svelte-1hdw5sa">အခု GPT-2 tokenizer တစ်ခု တည်ဆောက်ကြစို့။ BERT tokenizer အတွက်လိုပဲ၊ BPE model တစ်ခုနဲ့ <code>Tokenizer</code> တစ်ခုကို initialize လုပ်ခြင်းဖြင့် စတင်ပါမယ်။</p> <div class="code-block 
relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = Tokenizer(models.BPE())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ec922a">BERT အတွက်လိုပဲ၊ ကျွန်တော်တို့မှာ vocabulary ရှိရင် (ဒီကိစ္စမှာ <code>vocab</code> နဲ့ <code>merges</code> ကို ပေးဖို့ လိုပါလိမ့်မယ်) ဒီ model ကို vocabulary နဲ့ initialize လုပ်နိုင်ပါတယ်။ ဒါပေမယ့် ကျွန်တော်တို့ အစကနေ train မှာဖြစ်တဲ့အတွက်၊ ဒါကို လုပ်ဖို့မလိုအပ်ပါဘူး။ GPT-2 က byte-level BPE ကို အသုံးပြုပြီး ဒါက <code>unk_token</code> မလိုအပ်တဲ့အတွက် <code>unk_token</code> ကို သတ်မှတ်ပေးဖို့လည်း ကျွန်တော်တို့ မလိုအပ်ပါဘူး။</p> <p data-svelte-h="svelte-1uahpvd">GPT-2 က normalizer ကို အသုံးမပြုတာကြောင့်၊ အဲဒီအဆင့်ကို ကျော်ပြီး pre-tokenization ကို တိုက်ရိုက်သွားပါမယ်-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm 
focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xea6z8">ဒီနေရာမှာ <code>ByteLevel</code> ကို ကျွန်တော်တို့ ထည့်သွင်းခဲ့တဲ့ option က sentence အစမှာ space မထည့်ဖို့ပါပဲ (ဒါက default အားဖြင့် ထည့်ပါတယ်)။ အရင်ကလို ဥပမာ text တစ်ခုရဲ့ pre-tokenization ကို ကြည့်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">&quot;Let&#x27;s test pre-tokenization!&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: 
transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">&#x27;Let&#x27;</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">&quot;&#x27;s&quot;</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">&#x27;Ġtest&#x27;</span>, (<span class="hljs-number">5</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">&#x27;Ġpre&#x27;</span>, (<span class="hljs-number">10</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">&#x27;-&#x27;</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">15</span>)),
(<span class="hljs-string">&#x27;tokenization&#x27;</span>, (<span class="hljs-number">15</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">&#x27;!&#x27;</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8cpkli">နောက်တစ်ခုက training လိုအပ်တဲ့ model ပါ။ GPT-2 အတွက်၊ တစ်ခုတည်းသော special token က end-of-text token ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer = trainers.BpeTrainer(vocab_size=<span class="hljs-number">25000</span>, special_tokens=[<span class="hljs-string">&quot;&lt;|endoftext|&gt;&quot;</span>])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1x3ijoa"><code>WordPieceTrainer</code> နဲ့ တူတူပဲ၊ <code>vocab_size</code> နဲ့ <code>special_tokens</code> အပြင်၊ ကျွန်တော်တို့ လိုအပ်ရင် <code>min_frequency</code> ကို သတ်မှတ်နိုင်ပါတယ်၊ ဒါမှမဟုတ် end-of-word suffix (<code>&lt;/w&gt;</code> လိုမျိုး) ရှိရင် <code>end_of_word_suffix</code> နဲ့ သတ်မှတ်နိုင်ပါတယ်။</p> <p data-svelte-h="svelte-qeny4o">ဒီ tokenizer ကို text files တွေပေါ်မှာလည်း train လုပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.model = models.BPE()
tokenizer.train([<span class="hljs-string">&quot;wikitext-2.txt&quot;</span>], trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-f8svam">sample text တစ်ခုရဲ့ tokenization ကို ကြည့်ရအောင်-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">&quot;Let&#x27;s test this tokenizer.&quot;</span>)
<span class="hljs-built_in">print</span>(encoding.tokens)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;L&#x27;</span>, <span class="hljs-string">&#x27;et&#x27;</span>, <span class="hljs-string">&quot;&#x27;&quot;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;Ġtest&#x27;</span>, <span class="hljs-string">&#x27;Ġthis&#x27;</span>, <span class="hljs-string">&#x27;Ġto&#x27;</span>, <span class="hljs-string">&#x27;ken&#x27;</span>, <span class="hljs-string">&#x27;izer&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cv10os">GPT-2 tokenizer အတွက် byte-level post-processing ကို အောက်ပါအတိုင်း ကျွန်တော်တို့ အသုံးပြုပါတယ်။</p> <div class="code-block relative 
"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.post_processor = processors.ByteLevel(trim_offsets=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-au8l8c"><code>trim_offsets = False</code> option က post-processor ကို ‘Ġ’ နဲ့ စတင်တဲ့ tokens တွေရဲ့ offsets တွေကို ရှိတဲ့အတိုင်း ထားဖို့ ညွှန်ပြပါတယ်။ ဒီနည်းနဲ့ offsets တွေရဲ့ အစက word ရဲ့ ပထမဆုံး character ကို ညွှန်ပြမယ့်အစား word ရဲ့ ရှေ့က space ကို ညွှန်ပြပါလိမ့်မယ် (space က နည်းပညာအရ token ရဲ့ အစိတ်အပိုင်းဖြစ်တာကြောင့်)။ ကျွန်တော်တို့ အခု encode လုပ်ခဲ့တဲ့ text နဲ့ ရလဒ်ကို ကြည့်ရအောင်။ <code>&#39;Ġtest&#39;</code> က index 4 မှာရှိတဲ့ token ဖြစ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition 
duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sentence = <span class="hljs-string">&quot;Let&#x27;s test this tokenizer.&quot;</span>
encoding = tokenizer.encode(sentence)
start, end = encoding.offsets[<span class="hljs-number">4</span>]
sentence[start:end]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&#x27; test&#x27;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1a1apnj">နောက်ဆုံးအနေနဲ့၊ byte-level decoder တစ်ခု ထည့်သွင်းပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decoder = decoders.ByteLevel()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-675ell">ပြီးတော့ ဒါက မှန်ကန်စွာ အလုပ်လုပ်လားဆိုတာ ထပ်မံစစ်ဆေးနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; 
border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decode(encoding.ids)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&quot;Let&#x27;s test this tokenizer.&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fws21w">ကောင်းပါပြီ! 
အခု ပြီးစီးပြီဆိုတော့၊ tokenizer ကို အရင်လိုပဲ သိမ်းဆည်းနိုင်ပြီး၊ 🤗 Transformers မှာ အသုံးပြုချင်တယ်ဆိုရင် <code>PreTrainedTokenizerFast</code> ဒါမှမဟုတ် <code>GPT2TokenizerFast</code> ထဲမှာ wrap လုပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    bos_token=<span class="hljs-string">&quot;&lt;|endoftext|&gt;&quot;</span>,
    eos_token=<span class="hljs-string">&quot;&lt;|endoftext|&gt;&quot;</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pzgdft">ဒါမှမဟုတ်:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> GPT2TokenizerFast
wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17tmyse">နောက်ဆုံး ဥပမာအနေနဲ့၊ Unigram tokenizer တစ်ခုကို အစကနေ ဘယ်လိုတည်ဆောက်ရမလဲဆိုတာ ကျွန်တော်တို့ ပြသပေးပါမယ်။</p> <h2 class="relative group"><a id="building-a-unigram-tokenizer-from-scratch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-unigram-tokenizer-from-scratch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အစကနေ Unigram Tokenizer တစ်ခု တည်ဆောက်ခြင်း</span></h2> <p data-svelte-h="svelte-1p4z3wo">အခု XLNet tokenizer တစ်ခု တည်ဆောက်ကြစို့။ ယခင် tokenizers တွေအတွက်လိုပဲ၊ Unigram model တစ်ခုနဲ့ <code>Tokenizer</code> တစ်ခုကို initialize လုပ်ခြင်းဖြင့် စတင်ပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = Tokenizer(models.Unigram())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6jmcsf">ထပ်မံပြီး၊ ကျွန်တော်တို့မှာ vocabulary ရှိရင် ဒီ model ကို vocabulary နဲ့ initialize လုပ်နိုင်ပါတယ်။</p> <p data-svelte-h="svelte-1z0uul8">normalization အတွက်၊ XLNet က replacements အနည်းငယ် (SentencePiece ကနေလာတာပါ) ကို အသုံးပြုပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded 
font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Regex
tokenizer.normalizer = normalizers.<span class="hljs-type">Sequence</span>(
    [
        normalizers.Replace(<span class="hljs-string">&quot;``&quot;</span>, <span class="hljs-string">&#x27;&quot;&#x27;</span>),
        normalizers.Replace(<span class="hljs-string">&quot;&#x27;&#x27;&quot;</span>, <span class="hljs-string">&#x27;&quot;&#x27;</span>),
        normalizers.NFKD(),
        normalizers.StripAccents(),
        normalizers.Replace(Regex(<span class="hljs-string">&quot; {2,}&quot;</span>), <span class="hljs-string">&quot; &quot;</span>),
    ]
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jnf39y">ဒါက <code>``</code> နဲ့ <code>&#39;&#39;</code> တွေကို <code>&quot;</code> နဲ့ အစားထိုးပြီး spaces နှစ်ခု သို့မဟုတ် ထို့ထက်ပိုတဲ့ sequence တွေကို single space တစ်ခုနဲ့ အစားထိုးပါတယ်၊ ဒါ့အပြင် tokenize လုပ်မယ့် texts ထဲက accents တွေကို ဖယ်ရှားပါတယ်။</p> <p data-svelte-h="svelte-17fu9q6">မည်သည့် SentencePiece tokenizer အတွက်မဆို အသုံးပြုရမယ့် pre-tokenizer က <code>Metaspace</code> ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1x0owi0">အရင်ကလို ဥပမာ text တစ်ခုရဲ့ pre-tokenization ကို ကြည့်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 
cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">&quot;Let&#x27;s test the pre-tokenizer!&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute 
pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">&quot;▁Let&#x27;s&quot;</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">&#x27;▁test&#x27;</span>, (<span class="hljs-number">5</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">&#x27;▁the&#x27;</span>, (<span class="hljs-number">10</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">&#x27;▁pre-tokenizer!&#x27;</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">29</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xd39o2">နောက်တစ်ခုက training လိုအပ်တဲ့ model ပါ။ XLNet မှာ special tokens တွေ အတော်လေး များပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded 
font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->special_tokens = [<span class="hljs-string">&quot;&lt;cls&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;sep&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;unk&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;pad&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;mask&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;s&gt;&quot;</span>, <span class="hljs-string">&quot;&lt;/s&gt;&quot;</span>]
trainer = trainers.UnigramTrainer(
    vocab_size=<span class="hljs-number">25000</span>, special_tokens=special_tokens, unk_token=<span class="hljs-string">&quot;&lt;unk&gt;&quot;</span>
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rhmk1c"><code>UnigramTrainer</code> အတွက် မမေ့မလျော့ ထည့်သွင်းရမယ့် အရေးကြီးတဲ့ argument တစ်ခုက <code>unk_token</code> ပါ။ ကျွန်တော်တို့ဟာ Unigram algorithm အတွက် သီးခြား arguments တွေဖြစ်တဲ့ tokens တွေ ဖယ်ရှားတဲ့ အဆင့်တစ်ခုစီအတွက် <code>shrinking_factor</code> (default က 0.75) ဒါမှမဟုတ် ပေးထားတဲ့ token တစ်ခုရဲ့ အမြင့်ဆုံးအရှည်ကို သတ်မှတ်ဖို့ <code>max_piece_length</code> (default က 16) တို့ကိုလည်း ပေးနိုင်ပါတယ်။</p> <p data-svelte-h="svelte-qeny4o">ဒီ tokenizer ကို text files တွေပေါ်မှာလည်း train လုပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.model = models.Unigram()
tokenizer.train([<span class="hljs-string">&quot;wikitext-2.txt&quot;</span>], trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-867lno">sample text တစ်ခုရဲ့ tokenization ကို ကြည့်ရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">&quot;Let&#x27;s test this tokenizer.&quot;</span>)
<span class="hljs-built_in">print</span>(encoding.tokens)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;▁Let&#x27;</span>, <span class="hljs-string">&quot;&#x27;&quot;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;▁test&#x27;</span>, <span class="hljs-string">&#x27;▁this&#x27;</span>, <span class="hljs-string">&#x27;▁to&#x27;</span>, <span class="hljs-string">&#x27;ken&#x27;</span>, <span class="hljs-string">&#x27;izer&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e4tu7q">XLNet ရဲ့ ထူးခြားချက်တစ်ခုကတော့ <code>&lt;cls&gt;</code> token ကို sentence ရဲ့ အဆုံးမှာ ထားပြီး၊ type ID ကို 2 (အခြား tokens တွေနဲ့ ကွဲပြားစေရန်) ပေးထားတာပါပဲ။ ရလဒ်အနေနဲ့ ဒါက 
ဘယ်ဘက်မှာ padding လုပ်တာပါ။ special tokens အားလုံးနဲ့ token type IDs တွေကို BERT အတွက်လို template တစ်ခုနဲ့ ကျွန်တော်တို့ ကိုင်တွယ်နိုင်ပါတယ်။ ဒါပေမယ့် ပထမဆုံး <code>&lt;cls&gt;</code> နဲ့ <code>&lt;sep&gt;</code> tokens တွေရဲ့ IDs တွေကို ရယူရပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->cls_token_id = tokenizer.token_to_id(<span class="hljs-string">&quot;&lt;cls&gt;&quot;</span>)
sep_token_id = tokenizer.token_to_id(<span class="hljs-string">&quot;&lt;sep&gt;&quot;</span>)
<span class="hljs-built_in">print</span>(cls_token_id, sep_token_id)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">0</span> <span class="hljs-number">1</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-j837r6">template က ဒီလိုမျိုး ဖြစ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.post_processor = processors.TemplateProcessing(
    single=<span class="hljs-string">&quot;$A:0 &lt;sep&gt;:0 &lt;cls&gt;:2&quot;</span>,
    pair=<span class="hljs-string">&quot;$A:0 &lt;sep&gt;:0 $B:1 &lt;sep&gt;:1 &lt;cls&gt;:2&quot;</span>,
    special_tokens=[(<span class="hljs-string">&quot;&lt;sep&gt;&quot;</span>, sep_token_id), (<span class="hljs-string">&quot;&lt;cls&gt;&quot;</span>, cls_token_id)],
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1amd51r">ပြီးတော့ sentence pair တစ်ခုကို encode လုပ်ခြင်းဖြင့် ဒါက အလုပ်လုပ်လားဆိုတာ စမ်းသပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">&quot;Let&#x27;s test this tokenizer...&quot;</span>, <span class="hljs-string">&quot;on a pair of sentences!&quot;</span>)
<span class="hljs-built_in">print</span>(encoding.tokens)
<span class="hljs-built_in">print</span>(encoding.type_ids)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;▁Let&#x27;</span>, <span class="hljs-string">&quot;&#x27;&quot;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;▁test&#x27;</span>, <span class="hljs-string">&#x27;▁this&#x27;</span>, <span class="hljs-string">&#x27;▁to&#x27;</span>, <span class="hljs-string">&#x27;ken&#x27;</span>, <span class="hljs-string">&#x27;izer&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;&lt;sep&gt;&#x27;</span>, <span class="hljs-string">&#x27;▁&#x27;</span>, <span 
class="hljs-string">&#x27;on&#x27;</span>, <span class="hljs-string">&#x27;▁&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;▁pair&#x27;</span>,
<span class="hljs-string">&#x27;▁of&#x27;</span>, <span class="hljs-string">&#x27;▁sentence&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;!&#x27;</span>, <span class="hljs-string">&#x27;&lt;sep&gt;&#x27;</span>, <span class="hljs-string">&#x27;&lt;cls&gt;&#x27;</span>]
[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1btthbu">နောက်ဆုံးအနေနဲ့၊ <code>Metaspace</code> decoder တစ်ခု ထည့်သွင်းပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 
border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decoder = decoders.Metaspace()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vv71hp">ပြီးတော့ ဒီ tokenizer နဲ့ ပြီးပါပြီ! ကျွန်တော်တို့ tokenizer ကို အရင်လိုပဲ သိမ်းဆည်းနိုင်ပြီး၊ 🤗 Transformers မှာ အသုံးပြုချင်တယ်ဆိုရင် <code>PreTrainedTokenizerFast</code> ဒါမှမဟုတ် <code>XLNetTokenizerFast</code> ထဲမှာ wrap လုပ်နိုင်ပါတယ်။ <code>PreTrainedTokenizerFast</code> ကို အသုံးပြုတဲ့အခါ သတိပြုရမယ့်အချက်တစ်ခုကတော့ special tokens တွေအပြင်၊ Hugging Face library ကို ဘယ်ဘက်ကနေ padding လုပ်ဖို့ ကျွန်တော်တို့ ပြောပြဖို့ လိုအပ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span 
class="hljs-keyword">import</span> PreTrainedTokenizerFast
wrapped_tokenizer = PreTrainedTokenizerFast(
tokenizer_object=tokenizer,
bos_token=<span class="hljs-string">&quot;&lt;s&gt;&quot;</span>,
eos_token=<span class="hljs-string">&quot;&lt;/s&gt;&quot;</span>,
unk_token=<span class="hljs-string">&quot;&lt;unk&gt;&quot;</span>,
pad_token=<span class="hljs-string">&quot;&lt;pad&gt;&quot;</span>,
cls_token=<span class="hljs-string">&quot;&lt;cls&gt;&quot;</span>,
sep_token=<span class="hljs-string">&quot;&lt;sep&gt;&quot;</span>,
mask_token=<span class="hljs-string">&quot;&lt;mask&gt;&quot;</span>,
padding_side=<span class="hljs-string">&quot;left&quot;</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pzgdft">ဒါမှမဟုတ်:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> XLNetTokenizerFast
wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mip31h">existing tokenizers တွေကို တည်ဆောက်ရာမှာ building blocks အမျိုးမျိုးကို ဘယ်လိုအသုံးပြုလဲဆိုတာ သင်မြင်တွေ့ခဲ့ရပြီဆိုတော့၊ 🤗 Tokenizers library နဲ့ သင်လိုချင်တဲ့ မည်သည့် tokenizer ကိုမဆို ရေးနိုင်ပြီး 🤗 Transformers မှာ အသုံးပြုနိုင်ပါလိမ့်မယ်။</p> <h2 class="relative group"><a id="ဝဟရ-ရငလငခက-glossary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဝဟရ-ရငလငခက-glossary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဝေါဟာရ ရှင်းလင်းချက် (Glossary)</span></h2> <ul data-svelte-h="svelte-90urvs"><li><strong>Tokenization</strong>: စာသား (သို့မဟုတ် အခြားဒေတာ) ကို AI မော်ဒယ်များ စီမံဆောင်ရွက်နိုင်ရန် tokens တွေအဖြစ် ပိုင်းခြားပေးသည့် လုပ်ငန်းစဉ်။</li> <li><strong>Normalization</strong>: စာသားကို သန့်ရှင်းရေးလုပ်ခြင်း (ဥပမာ- needless whitespace ဖယ်ရှားခြင်း၊ lowercasing, accents ဖယ်ရှားခြင်း)။</li> <li><strong>Pre-tokenization</strong>: Subword tokenization မလုပ်ဆောင်မီ စာသားကို ပိုမိုသေးငယ်သော entities (ဥပမာ- words) အဖြစ် အကြိုပိုင်းခြားခြင်း။</li> <li><strong>Tokens Sequence</strong>: စာသားကို 
ပိုင်းခြားပြီးနောက် ရရှိသော tokens များ၏ အစဉ်လိုက်။</li> <li><strong>Post-processing</strong>: Model ၏ output များကို နောက်ဆုံးအသုံးပြုမှုအတွက် ပြင်ဆင်ခြင်း လုပ်ငန်းစဉ်။</li> <li><strong>Special Tokens</strong>: Tokenizer သို့မဟုတ် model အတွက် သီးခြားအဓိပ္ပာယ်ရှိသော tokens များ (ဥပမာ- <code>[CLS]</code>, <code>[SEP]</code>, <code>[PAD]</code>)။</li> <li><strong>Attention Mask</strong>: မော်ဒယ်ကို အာရုံစိုက်သင့်သည့် tokens များနှင့် လျစ်လျူရှုသင့်သည့် (padding) tokens များကို ခွဲခြားပေးသည့် binary mask။</li> <li><strong>Token Type IDs</strong>: Sentence pair လုပ်ငန်းများတွင် input sequence တစ်ခုစီမှ token တစ်ခုစီသည် မည်သည့် sentence (ပထမ သို့မဟုတ် ဒုတိယ) နှင့် သက်ဆိုင်သည်ကို ဖော်ပြပေးသော IDs များ။</li> <li><strong>🤗 Tokenizers Library</strong>: Rust ဘာသာနဲ့ ရေးသားထားတဲ့ Hugging Face library တစ်ခုဖြစ်ပြီး မြန်ဆန်ထိရောက်တဲ့ tokenization ကို လုပ်ဆောင်ပေးသည်။</li> <li><strong><code>Tokenizer</code> Class</strong>: 🤗 Tokenizers library မှ အဓိက tokenizer class။</li> <li><strong><code>normalizers</code> Submodule</strong>: Normalization building blocks များ ပါဝင်သော submodule။</li> <li><strong><code>Normalizer</code></strong>: Normalization logic ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong><code>pre_tokenizers</code> Submodule</strong>: Pre-tokenization building blocks များ ပါဝင်သော submodule။</li> <li><strong><code>PreTokenizer</code></strong>: Pre-tokenization logic ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong><code>models</code> Submodule</strong>: Subword tokenization models များ (BPE, WordPiece, Unigram) ပါဝင်သော submodule။</li> <li><strong><code>Model</code></strong>: Subword tokenization algorithm ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong>BPE (Byte-Pair Encoding)</strong>: Subword tokenization algorithm တစ်မျိုး။</li> <li><strong>WordPiece</strong>: Subword tokenization algorithm တစ်မျိုး။</li> <li><strong>Unigram</strong>: Subword tokenization algorithm တစ်မျိုး။</li> <li><strong><code>trainers</code> Submodule</strong>: Model training အတွက် 
trainers များ ပါဝင်သော submodule။</li> <li><strong><code>Trainer</code></strong>: Model ကို corpus တစ်ခုပေါ်တွင် train လုပ်ရန် အသုံးပြုသော class။</li> <li><strong>Corpus</strong>: စာသား (သို့မဟုတ် အခြားဒေတာ) အစုအဝေးကြီးတစ်ခု။</li> <li><strong><code>processors</code> Submodule</strong>: Post-processing building blocks များ ပါဝင်သော submodule။</li> <li><strong><code>PostProcessor</code></strong>: Post-processing logic ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong><code>decoders</code> Submodule</strong>: Tokenization outputs များကို decode လုပ်ရန် decoders များ ပါဝင်သော submodule။</li> <li><strong><code>Decoder</code></strong>: Decoding logic ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong><code>get_training_corpus()</code> Function</strong>: Tokenizer ကို လေ့ကျင့်ရန်အတွက် batches of texts များကို yield လုပ်သော generator function။</li> <li><strong>Generator</strong>: Python တွင် iteration လုပ်နိုင်သော object တစ်ခုဖြစ်ပြီး ၎င်းသည် အရာအားလုံးကို memory ထဲသို့ တစ်ပြိုင်နက်တည်း သိမ်းဆည်းမထားဘဲ လိုအပ်သလို တန်ဖိုးများကို ထုတ်ပေးသည်။</li> <li><strong>WikiText-2 Dataset</strong>: ဘာသာစကား model များကို လေ့ကျင့်ရန် အသုံးပြုသော dataset တစ်ခု။</li> <li><strong>Text Files</strong>: စာသားအချက်အလက်များသာ ပါဝင်သော ဖိုင်များ။</li> <li><strong>BERT Tokenizer</strong>: BERT model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong>GPT-2 Tokenizer</strong>: GPT-2 model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong>XLNet Tokenizer</strong>: XLNet model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong><code>models.WordPiece(unk_token=&quot;[UNK]&quot;)</code></strong>: Unknown token အဖြစ် <code>[UNK]</code> ကို အသုံးပြုသော WordPiece model ကို ဖန်တီးခြင်း။</li> <li><strong><code>unk_token</code></strong>: Model က မသိသော tokens များကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>vocab</code></strong>: Model ၏ vocabulary (သိရှိသော tokens များစာရင်း)။</li> <li><strong><code>max_input_chars_per_word</code></strong>: word တစ်ခုအတွက် အမြင့်ဆုံး character အရေအတွက်။</li> 
<li><strong><code>BertNormalizer</code></strong>: BERT tokenizer အတွက် အကြိုတည်ဆောက်ထားသော normalizer class။</li> <li><strong><code>lowercase</code></strong>: စာလုံးများကို အသေးစာလုံးများအဖြစ် ပြောင်းလဲခြင်း။</li> <li><strong><code>strip_accents</code></strong>: စာလုံးများပေါ်ရှိ accents များကို ဖယ်ရှားခြင်း။</li> <li><strong><code>clean_text</code></strong>: Control characters များကို ဖယ်ရှားခြင်းနှင့် ထပ်နေသော spaces များကို တစ်ခုတည်းဖြင့် အစားထိုးခြင်း။</li> <li><strong><code>handle_chinese_chars</code></strong>: Chinese characters များပတ်ပတ်လည်တွင် spaces တွေ ထည့်သွင်းခြင်း။</li> <li><strong><code>bert-base-uncased</code></strong>: BERT model ၏ base version အတွက် checkpoint identifier (uncased version)။</li> <li><strong><code>normalizers.Sequence</code></strong>: normalizers များစွာကို ပေါင်းစပ်ရန် အသုံးပြုသော class။</li> <li><strong><code>normalizers.NFD()</code></strong>: Unicode normalization form D (Canonical Decomposition) ကို အသုံးပြုသော normalizer။</li> <li><strong><code>normalizers.Lowercase()</code></strong>: စာလုံးများကို အသေးစာလုံးများအဖြစ် ပြောင်းလဲသော normalizer။</li> <li><strong><code>normalizers.StripAccents()</code></strong>: accents များကို ဖယ်ရှားသော normalizer။</li> <li><strong><code>normalize_str()</code> Method</strong>: Normalizer object မှ string တစ်ခုကို normalize လုပ်သော method။</li> <li><strong><code>BertPreTokenizer</code></strong>: BERT tokenizer အတွက် အကြိုတည်ဆောက်ထားသော pre-tokenizer class။</li> <li><strong><code>pre_tokenizers.Whitespace()</code></strong>: whitespace နှင့် punctuation ဖြင့် ပိုင်းခြားသော pre-tokenizer။</li> <li><strong><code>pre_tokenizers.WhitespaceSplit()</code></strong>: whitespace ဖြင့်သာ ပိုင်းခြားသော pre-tokenizer။</li> <li><strong><code>pre_tokenizers.Punctuation()</code></strong>: punctuation ဖြင့် ပိုင်းခြားသော pre-tokenizer။</li> <li><strong><code>pre_tokenize_str()</code> Method</strong>: Pre-tokenizer object မှ string တစ်ခုကို pre-tokenize လုပ်သော method။</li> 
<li><strong><code>WordPieceTrainer</code></strong>: WordPiece model ကို train လုပ်ရန်အတွက် trainer class။</li> <li><strong><code>vocab_size</code></strong>: vocabulary ၏ အမြင့်ဆုံးအရွယ်အစား။</li> <li><strong><code>special_tokens</code></strong>: Model ၏ special tokens များ။</li> <li><strong><code>min_frequency</code></strong>: token တစ်ခု vocabulary ထဲမှာ ပါဝင်ဖို့ ဘယ်အကြိမ်ရေ အနည်းဆုံး ပေါ်လာရမလဲ။</li> <li><strong><code>continuing_subword_prefix</code></strong>: subword တစ်ခု ဆက်နေကြောင်း ဖော်ပြသော prefix (ဥပမာ- <code>##</code>)။</li> <li><strong><code>train_from_iterator()</code> Method</strong>: iterator မှ data ကို အသုံးပြုပြီး tokenizer ကို train လုပ်သော method။</li> <li><strong><code>train()</code> Method</strong>: text files များမှ data ကို အသုံးပြုပြီး tokenizer ကို train လုပ်သော method။</li> <li><strong><code>encode()</code> Method</strong>: စာသားကို tokens ID များအဖြစ် ပြောင်းလဲပေးသော tokenizer method။</li> <li><strong><code>Encoding</code> Object</strong>: <code>encode()</code> method မှ ပြန်ပေးသော object ဖြစ်ပြီး encoded inputs အားလုံး ပါဝင်သည်။</li> <li><strong><code>ids</code></strong>: Encoded tokens များ၏ ID များ။</li> <li><strong><code>type_ids</code></strong>: Token type IDs များ။</li> <li><strong><code>tokens</code></strong>: Tokenized string များ၏ list။</li> <li><strong><code>offsets</code></strong>: Offset mapping များ။</li> <li><strong><code>attention_mask</code></strong>: Attention mask။</li> <li><strong><code>special_tokens_mask</code></strong>: Special tokens mask။</li> <li><strong><code>overflowing</code></strong>: Truncate လုပ်ထားသော tokens များ။</li> <li><strong><code>token_to_id()</code> Method</strong>: Token string ကို ၎င်း၏ ID သို့ ပြောင်းလဲပေးသော tokenizer method။</li> <li><strong><code>TemplateProcessing</code></strong>: Post-processing အတွက် template ကို အသုံးပြုသော processor class။</li> <li><strong><code>single</code></strong>: single sentence အတွက် template။</li> <li><strong><code>pair</code></strong>: sentence pair အတွက် 
template။</li> <li><strong><code>$A</code></strong>: ပထမ sentence ကို ကိုယ်စားပြုသော placeholder။</li> <li><strong><code>$B</code></strong>: ဒုတိယ sentence ကို ကိုယ်စားပြုသော placeholder။</li> <li><strong><code>WordPiece Decoder</code></strong>: WordPiece tokens များကို text အဖြစ် ပြန်ပြောင်းပေးသော decoder။</li> <li><strong><code>prefix=&quot;##&quot;</code></strong>: WordPiece decoder အတွက် subword prefix။</li> <li><strong><code>decode()</code> Method</strong>: Token IDs များကို text အဖြစ် ပြန်ပြောင်းပေးသော tokenizer method။</li> <li><strong>JSON File</strong>: JavaScript Object Notation format ဖြင့် သိမ်းဆည်းထားသော ဖိုင်။</li> <li><strong><code>save(&quot;tokenizer.json&quot;)</code></strong>: Tokenizer ကို JSON file အဖြစ် သိမ်းဆည်းသော method။</li> <li><strong><code>from_file(&quot;tokenizer.json&quot;)</code></strong>: JSON file မှ tokenizer ကို load လုပ်သော method။</li> <li><strong><code>PreTrainedTokenizerFast</code></strong>: 🤗 Transformers library မှ generic fast tokenizer class။</li> <li><strong><code>BertTokenizerFast</code></strong>: BERT model အတွက် သီးခြား fast tokenizer class။</li> <li><strong><code>tokenizer_object</code></strong>: <code>PreTrainedTokenizerFast</code> သို့ ပေးပို့သော tokenizer object။</li> <li><strong><code>tokenizer_file</code></strong>: <code>PreTrainedTokenizerFast</code> သို့ ပေးပို့သော tokenizer file ၏ path။</li> <li><strong><code>bos_token</code> (Beginning Of Sentence Token)</strong>: Sentence ၏ အစကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>eos_token</code> (End Of Sentence Token)</strong>: Sentence ၏ အဆုံးကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>pad_token</code> (Padding Token)</strong>: Sequence များကို တူညီသောအရှည်ဖြစ်အောင် ဖြည့်စွက်ရန် အသုံးပြုသော special token။</li> <li><strong><code>cls_token</code></strong>: BERT model တွင် sequence ၏ အစကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>sep_token</code></strong>: BERT model တွင် sentence တစ်ခု၏ အဆုံး သို့မဟုတ် sentence နှစ်ခုကြား 
ပိုင်းခြားရန် အသုံးပြုသော special token။</li> <li><strong><code>mask_token</code></strong>: Masked Language Modeling (MLM) တွင် စကားလုံးများကို ဝှက်ထားရန် အသုံးပြုသော special token။</li> <li><strong><code>save_pretrained()</code> Method</strong>: <code>PreTrainedTokenizerFast</code> ကို pretrained model အဖြစ် သိမ်းဆည်းသော method။</li> <li><strong><code>push_to_hub()</code> Method</strong>: <code>PreTrainedTokenizerFast</code> ကို Hugging Face Hub သို့ upload လုပ်သော method။</li> <li><strong><code>models.BPE()</code></strong>: BPE model ကို ဖန်တီးခြင်း။</li> <li><strong><code>merges</code></strong>: BPE model တွင် token merge rules များ။</li> <li><strong>Byte-level BPE</strong>: BPE tokenization တစ်မျိုးဖြစ်ပြီး bytes များကို အခြေခံ၍ လုပ်ဆောင်သည်။</li> <li><strong><code>pre_tokenizers.ByteLevel()</code></strong>: Byte-level pre-tokenization ကို လုပ်ဆောင်သော pre-tokenizer။</li> <li><strong><code>add_prefix_space=False</code></strong>: ByteLevel pre-tokenizer အတွက် sentence အစတွင် space မထည့်ရန် သတ်မှတ်ခြင်း။</li> <li><strong><code>BpeTrainer</code></strong>: BPE model ကို train လုပ်ရန်အတွက် trainer class။</li> <li><strong><code>end_of_word_suffix</code></strong>: word ၏ အဆုံးကို ကိုယ်စားပြုသော suffix (ဥပမာ- <code>&lt;/w&gt;</code>)။</li> <li><strong><code>Ġ</code> Symbol</strong>: GPT-2 tokenizer တွင် space ကို ကိုယ်စားပြုသော symbol။</li> <li><strong><code>processors.ByteLevel(trim_offsets=False)</code></strong>: Byte-level post-processing ကို လုပ်ဆောင်သော processor။ <code>trim_offsets=False</code> က offsets များကို ပြင်ဆင်မွမ်းမံခြင်း မပြုလုပ်စေပါ။</li> <li><strong><code>decoders.ByteLevel()</code></strong>: Byte-level tokens များကို text အဖြစ် ပြန်ပြောင်းပေးသော decoder။</li> <li><strong><code>models.Unigram()</code></strong>: Unigram model ကို ဖန်တီးခြင်း။</li> <li><strong><code>tokenizers.Regex</code></strong>: Regex (Regular Expression) ကို အသုံးပြုရန် class။</li> <li><strong><code>normalizers.Replace(&quot;``&quot;, &#39;&quot;&#39;)</code></strong>: 
<code>``</code> ကို <code>&quot;</code> ဖြင့် အစားထိုးသော normalizer။</li> <li><strong><code>normalizers.Replace(&quot;&#39;&#39;&quot;, &#39;&quot;&#39;)</code></strong>: <code>&#39;&#39;</code> ကို <code>&quot;</code> ဖြင့် အစားထိုးသော normalizer။</li> <li><strong><code>normalizers.NFKD()</code></strong>: Unicode normalization form KD (Compatibility Decomposition) ကို အသုံးပြုသော normalizer။</li> <li><strong><code>normalizers.Replace(Regex(&quot; {2,}&quot;), &quot; &quot;)</code></strong>: spaces နှစ်ခု သို့မဟုတ် ထို့ထက်ပိုသော sequence များကို single space တစ်ခုဖြင့် အစားထိုးသော normalizer။</li> <li><strong><code>pre_tokenizers.Metaspace()</code></strong>: Metaspace pre-tokenization ကို လုပ်ဆောင်သော pre-tokenizer။ (spaces များကို special character ဖြင့် အစားထိုးသည်)။</li> <li><strong><code>UnigramTrainer</code></strong>: Unigram model ကို train လုပ်ရန်အတွက် trainer class။</li> <li><strong><code>shrinking_factor</code></strong>: Unigram training တွင် vocabulary မှ tokens များကို ဖယ်ရှားသည့်အခါ အသုံးပြုသော factor။</li> <li><strong><code>max_piece_length</code></strong>: token တစ်ခု၏ အမြင့်ဆုံးအရှည်။</li> <li><strong><code>&lt;cls&gt;</code> Token</strong>: XLNet model တွင် sequence ၏ အဆုံးကို ကိုယ်စားပြုသော special token (token type ID 2 ဖြင့်)။</li> <li><strong><code>&lt;sep&gt;</code> Token</strong>: XLNet model တွင် sentence တစ်ခု၏ အဆုံးကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>padding_side=&quot;left&quot;</code></strong>: Padding ကို sequence ၏ ဘယ်ဘက်ခြမ်းတွင် လုပ်ဆောင်ရန် သတ်မှတ်ခြင်း။</li> <li><strong><code>XLNetTokenizerFast</code></strong>: XLNet model အတွက် သီးခြား fast tokenizer class။</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/my/chapter6/8.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Inline client bootstrap for this page.
// NOTE(review): this looks like build-generated SvelteKit output (hashed file
// names, `__sveltekit_*` global) — presumably it should be regenerated rather
// than hand-edited; confirm against the build.
// The bare block keeps `element` and `data` block-scoped instead of leaking
// them as globals.
{
// Assigned without a declaration in this script, so it is written as a global;
// presumably the imported modules below read the asset/base paths from it.
__sveltekit_tyugt6 = {
assets: "/docs/course/pr_1114/my",
base: "/docs/course/pr_1114/my",
env: {}
};
// Must be read synchronously: document.currentScript is only set while this
// script is executing, so it would be null inside the promise callback below.
// `element` is the DOM node that contains this <script> tag.
const element = document.currentScript.parentElement;
// Two null entries passed through to kit.start() as `data`.
// NOTE(review): presumably one slot per entry in `node_ids` — confirm.
const data = [null,null];
// Load both entry modules in parallel, then start the app once both resolve.
Promise.all([
import("/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js"),
import("/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js")
]).then(([kit, app]) => {
// Hand the app module, the mount element and the initial state to the
// start module's `start` export.
kit.start(app, element, {
node_ids: [0, 53],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
183 kB
·
Xet hash:
3e78d3e0ea8ee360c98d25b44d9a8d08ba6f2e94a67be61deeb128e90acc2688

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.