Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Tokenizer တစ်ခုကို အဆင့်ဆင့် တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-tokenizer-block-by-block&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Corpus တစ်ခု ရယူခြင်း&quot;,&quot;local&quot;:&quot;acquiring-a-corpus&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ WordPiece Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-wordpiece-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ BPE Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-bpe-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;အစကနေ Unigram Tokenizer တစ်ခု တည်ဆောက်ခြင်း&quot;,&quot;local&quot;:&quot;building-a-unigram-tokenizer-from-scratch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;ဝေါဟာရ ရှင်းလင်းချက် (Glossary)&quot;,&quot;local&quot;:&quot;ဝဟရ-ရငလငခက-glossary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |
| <link href="/docs/course/pr_1114/my/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/scheduler.893fe8c9.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/singletons.10fda3ce.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.bce52c8a.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/paths.89c82153.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/preload-helper.b1a719fd.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/index.b1df2166.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/0.510afdc1.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/nodes/53.3b37bc16.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.762ed9cc.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/Youtube.ec5d7916.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CodeBlock.6cef0479.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1114/my/_app/immutable/chunks/CourseFloatingBanner.c1c08878.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Tokenizer တစ်ခုကို အဆင့်ဆင့် တည်ဆောက်ခြင်း","local":"building-a-tokenizer-block-by-block","sections":[{"title":"Corpus တစ်ခု ရယူခြင်း","local":"acquiring-a-corpus","sections":[],"depth":2},{"title":"အစကနေ WordPiece Tokenizer တစ်ခု တည်ဆောက်ခြင်း","local":"building-a-wordpiece-tokenizer-from-scratch","sections":[],"depth":2},{"title":"အစကနေ BPE Tokenizer တစ်ခု တည်ဆောက်ခြင်း","local":"building-a-bpe-tokenizer-from-scratch","sections":[],"depth":2},{"title":"အစကနေ Unigram Tokenizer တစ်ခု တည်ဆောက်ခြင်း","local":"building-a-unigram-tokenizer-from-scratch","sections":[],"depth":2},{"title":"ဝေါဟာရ ရှင်းလင်းချက် (Glossary)","local":"ဝဟရ-ရငလငခက-glossary","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="building-a-tokenizer-block-by-block" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-tokenizer-block-by-block"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tokenizer တစ်ခုကို အဆင့်ဆင့် တည်ဆောက်ခြင်း</span></h1> <div 
class="flex space-x-1 absolute z-10 right-0 top-0" style=""><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section8.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section8.ipynb" target="_blank"><img alt="Open In Studio Lab" 
class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-141i56c">ယခင်အပိုင်းတွေမှာ ကျွန်တော်တို့ တွေ့ခဲ့ရတဲ့အတိုင်း၊ tokenization မှာ အဆင့်များစွာ ပါဝင်ပါတယ်။</p> <ul data-svelte-h="svelte-x7mng"><li>Normalization (မလိုအပ်တဲ့ spaces တွေ ဒါမှမဟုတ် accents တွေ ဖယ်ရှားတာ၊ Unicode normalization စတာတွေလိုမျိုး လိုအပ်တယ်လို့ ယူဆရတဲ့ text ကို သန့်ရှင်းရေးလုပ်ခြင်း)</li> <li>Pre-tokenization (input ကို words တွေအဖြစ် ပိုင်းခြားခြင်း)</li> <li>inputs ကို model ကနေတစ်ဆင့် run ခြင်း (pre-tokenize လုပ်ထားတဲ့ words တွေကို အသုံးပြုပြီး tokens sequence တစ်ခု ထုတ်လုပ်ခြင်း)</li> <li>Post-processing (tokenizer ရဲ့ special tokens တွေ ထည့်သွင်းခြင်း၊ attention mask နဲ့ token type IDs တွေ ထုတ်လုပ်ခြင်း)</li></ul> <p data-svelte-h="svelte-mxtm6p">သတိရစေရန်၊ ဒီမှာ overall process ကို ထပ်ကြည့်ရအောင်။</p> <div class="flex justify-center" data-svelte-h="svelte-oxfng3"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline.svg" alt="The tokenization pipeline."> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline-dark.svg" alt="The tokenization pipeline."></div> <p data-svelte-h="svelte-1spiys7">🤗 Tokenizers library ကို အဲဒီအဆင့်တစ်ခုစီအတွက် ရွေးချယ်စရာများစွာ ပံ့ပိုးပေးဖို့ တည်ဆောက်ထားပြီး၊ သင်ဟာ ဒါတွေကို ရောနှောပြီး ပေါင်းစပ်နိုင်ပါတယ်။ ဒီအပိုင်းမှာ ကျွန်တော်တို့ <a href="/course/chapter6/2">အပိုင်း ၂</a> မှာ လုပ်ခဲ့သလိုဟောင်းနွမ်းတဲ့ tokenizer တစ်ခုကနေ tokenizer အသစ်တစ်ခုကို train လုပ်မယ့်အစား၊ tokenizer တစ်ခုကို အစကနေ ဘယ်လိုတည်ဆောက်နိုင်လဲဆိုတာ ကြည့်ရပါမယ်။ အဲဒီအခါ သင်စိတ်ကူးနိုင်တဲ့ မည်သည့် tokenizer အမျိုးအစားကိုမဆို တည်ဆောက်နိုင်ပါလိမ့်မယ်။</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/MR8tZm5ViWU" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; 
encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1u5uy4g">ပိုပြီးတိတိကျကျပြောရရင်၊ library ကို ဗဟိုချက်မဖြစ်တဲ့ <code>Tokenizer</code> class ကိုယ်တိုင်နဲ့ building blocks တွေကို submodule တွေအဖြစ် အုပ်စုဖွဲ့ထားပါတယ်။</p> <ul data-svelte-h="svelte-1tg4kr"><li><code>normalizers</code> မှာ သင်အသုံးပြုနိုင်တဲ့ <code>Normalizer</code> အမျိုးအစားအားလုံး (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/normalizers" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>pre_tokenizers</code> မှာ သင်အသုံးပြုနိုင်တဲ့ <code>PreTokenizer</code> အမျိုးအစားအားလုံး (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/pre-tokenizers" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>models</code> မှာ <code>BPE</code>, <code>WordPiece</code>, နဲ့ <code>Unigram</code> လိုမျိုး သင်အသုံးပြုနိုင်တဲ့ <code>Model</code> အမျိုးအစားမျိုးစုံ (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/models" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>trainers</code> မှာ သင် corpus တစ်ခုပေါ်မှာ model ကို train လုပ်ဖို့ အသုံးပြုနိုင်တဲ့ <code>Trainer</code> အမျိုးအစားအမျိုးမျိုး (model အမျိုးအစားတစ်ခုစီအတွက် တစ်ခုစီ၊ အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/trainers" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>post_processors</code> မှာ သင်အသုံးပြုနိုင်တဲ့ <code>PostProcessor</code> အမျိုးအစားမျိုးစုံ (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/api/post-processors" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li> <li><code>decoders</code> မှာ tokenization ရဲ့ outputs တွေကို decode လုပ်ဖို့ သင်အသုံးပြုနိုင်တဲ့ <code>Decoder</code> အမျိုးအစားမျိုးစုံ (အပြည့်အစုံကို <a href="https://huggingface.co/docs/tokenizers/components#decoders" rel="nofollow">ဒီနေရာမှာ</a> ကြည့်ပါ) ပါဝင်ပါတယ်။</li></ul> <p data-svelte-h="svelte-1y9rqvk">building blocks တွေရဲ့ စာရင်းအပြည့်အစုံကို <a 
href="https://huggingface.co/docs/tokenizers/components" rel="nofollow">ဒီနေရာမှာ</a> ရှာတွေ့နိုင်ပါတယ်။</p> <h2 class="relative group"><a id="acquiring-a-corpus" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#acquiring-a-corpus"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Corpus တစ်ခု ရယူခြင်း</span></h2> <p data-svelte-h="svelte-wo9qzi">ကျွန်တော်တို့ရဲ့ tokenizer အသစ်ကို train လုပ်ဖို့အတွက်၊ သေးငယ်တဲ့ text corpus တစ်ခုကို အသုံးပြုပါမယ် (ဒါမှ ဥပမာတွေက မြန်မြန်ဆန်ဆန် run မှာပါ)။ corpus ရယူခြင်းအဆင့်တွေက <a href="/course/chapter6/2">ဒီအခန်းရဲ့ အစပိုင်း</a> မှာ ကျွန်တော်တို့ လုပ်ခဲ့တဲ့အဆင့်တွေနဲ့ ဆင်တူပါတယ်။ ဒါပေမယ့် ဒီတစ်ကြိမ်မှာတော့ <a href="https://huggingface.co/datasets/wikitext" rel="nofollow">WikiText-2</a> dataset ကို အသုံးပြုပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| dataset = load_dataset(<span class="hljs-string">"wikitext"</span>, name=<span class="hljs-string">"wikitext-2-raw-v1"</span>, split=<span class="hljs-string">"train"</span>) | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">get_training_corpus</span>(): | |
| <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(dataset), <span class="hljs-number">1000</span>): | |
| <span class="hljs-keyword">yield</span> dataset[i : i + <span class="hljs-number">1000</span>][<span class="hljs-string">"text"</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1skrsoe"><code>get_training_corpus()</code> function က batches of 1,000 texts တွေကို yield လုပ်မယ့် generator တစ်ခုဖြစ်ပြီး၊ ဒါတွေကို tokenizer ကို train ဖို့ ကျွန်တော်တို့ အသုံးပြုပါမယ်။</p> <p data-svelte-h="svelte-1biyl0i">🤗 Tokenizers တွေကို text files တွေပေါ်မှာ တိုက်ရိုက် train လုပ်နိုင်ပါတယ်။ WikiText-2 ကနေ inputs/texts တွေအားလုံး ပါဝင်တဲ့ text file တစ်ခုကို locally အသုံးပြုနိုင်အောင် ဘယ်လို generate လုပ်ရမလဲဆိုတာ ဒီမှာ ဖော်ပြထားပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(<span class="hljs-string">"wikitext-2.txt"</span>, <span 
class="hljs-string">"w"</span>, encoding=<span class="hljs-string">"utf-8"</span>) <span class="hljs-keyword">as</span> f: | |
| <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(dataset)): | |
| f.write(dataset[i][<span class="hljs-string">"text"</span>] + <span class="hljs-string">"\n"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17nrx09">နောက်မှာတော့ သင့်ကိုယ်ပိုင် BERT, GPT-2, နဲ့ XLNet tokenizers တွေကို အဆင့်ဆင့် ဘယ်လိုတည်ဆောက်ရမလဲဆိုတာ ပြသပေးပါမယ်။ ဒါက ကျွန်တော်တို့ကို အဓိက tokenization algorithms သုံးခုဖြစ်တဲ့ WordPiece, BPE, နဲ့ Unigram တို့ရဲ့ ဥပမာတစ်ခုစီကို ပေးပါလိမ့်မယ်။ BERT နဲ့ စတင်ကြရအောင်။</p> <h2 class="relative group"><a id="building-a-wordpiece-tokenizer-from-scratch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-wordpiece-tokenizer-from-scratch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အစကနေ WordPiece Tokenizer တစ်ခု တည်ဆောက်ခြင်း</span></h2> <p data-svelte-h="svelte-19mkw5x">🤗 Tokenizers library နဲ့ tokenizer တစ်ခုတည်ဆောက်ဖို့အတွက်၊ ကျွန်တော်တို့ဟာ <code>model</code> တစ်ခုနဲ့ <code>Tokenizer</code> object တစ်ခုကို instantiate လုပ်ခြင်းဖြင့် စတင်ပြီး၊ ၎င်းရဲ့ <code>normalizer</code>, <code>pre_tokenizer</code>, <code>post_processor</code>, နဲ့ <code>decoder</code> attributes တွေကို ကျွန်တော်တို့ လိုချင်တဲ့ တန်ဖိုးတွေဆီ သတ်မှတ်ပေးပါတယ်။</p> <p 
data-svelte-h="svelte-1qm18z7">ဒီဥပမာအတွက်၊ WordPiece model တစ်ခုနဲ့ <code>Tokenizer</code> တစ်ခုကို ကျွန်တော်တို့ ဖန်တီးပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> ( | |
| decoders, | |
| models, | |
| normalizers, | |
| pre_tokenizers, | |
| processors, | |
| trainers, | |
| Tokenizer, | |
| ) | |
| tokenizer = Tokenizer(models.WordPiece(unk_token=<span class="hljs-string">"[UNK]"</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-r5m8ig"><code>unk_token</code> ကို သတ်မှတ်ပေးရမှာပါ။ ဒါမှ model က မမြင်ဖူးသေးတဲ့ characters တွေ ကြုံတွေ့ရတဲ့အခါ ဘာကို ပြန်ပေးရမလဲဆိုတာ သိမှာပါ။ ဒီနေရာမှာ ကျွန်တော်တို့ သတ်မှတ်နိုင်တဲ့ တခြား arguments တွေကတော့ model ရဲ့ <code>vocab</code> (ကျွန်တော်တို့ model ကို train မှာဖြစ်တဲ့အတွက် ဒါကို သတ်မှတ်ဖို့ မလိုအပ်ပါဘူး) နဲ့ <code>max_input_chars_per_word</code> (word တစ်ခုစီအတွက် အမြင့်ဆုံးအရှည်ကို သတ်မှတ်ပေးပြီး၊ ဒီတန်ဖိုးထက် ပိုရှည်တဲ့ words တွေကို ပိုင်းခြားပါလိမ့်မယ်) တို့ ပါဝင်ပါတယ်။</p> <p data-svelte-h="svelte-q1n86n">tokenization ရဲ့ ပထမအဆင့်က normalization ဖြစ်တာကြောင့်၊ အဲဒါနဲ့ စတင်ကြပါစို့။ BERT ကို အများအားဖြင့် အသုံးပြုတဲ့အတွက် BERT အတွက် သတ်မှတ်နိုင်တဲ့ classic options တွေနဲ့ <code>BertNormalizer</code> တစ်ခုရှိပါတယ်၊ <code>lowercase</code> နဲ့ <code>strip_accents</code> က ရှင်းပြစရာမလိုပါဘူး၊ <code>clean_text</code> က control characters အားလုံးကို ဖယ်ရှားပြီး ထပ်နေတဲ့ spaces တွေကို တစ်ခုတည်းနဲ့ အစားထိုးပါတယ်၊ <code>handle_chinese_chars</code> က Chinese characters တွေပတ်ပတ်လည်မှာ spaces တွေ ထည့်ပေးပါတယ်။ <code>bert-base-uncased</code> tokenizer ကို ပြန်လည်ထုတ်လုပ်ဖို့၊ ဒီ normalizer ကို သတ်မှတ်ပေးနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect 
fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.normalizer = normalizers.BertNormalizer(lowercase=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hqg2ho">သို့သော်လည်း၊ အထွေထွေအားဖြင့်၊ tokenizer အသစ်တစ်ခု တည်ဆောက်တဲ့အခါ 🤗 Tokenizers library ထဲမှာ အကောင်အထည်ဖော်ထားပြီးသား ဒီလိုအသုံးဝင်တဲ့ normalizer ကို သင်ရရှိမှာ မဟုတ်ပါဘူး၊ ဒါကြောင့် BERT normalizer ကို ကိုယ်တိုင် ဘယ်လိုဖန်တီးရမလဲဆိုတာ ကြည့်ရအောင်။ library က <code>Lowercase</code> normalizer နဲ့ <code>StripAccents</code> normalizer ကို ပံ့ပိုးပေးပြီး၊ <code>Sequence</code> ကို အသုံးပြုပြီး normalizers များစွာကို ပေါင်းစပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow 
left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.normalizer = normalizers.<span class="hljs-type">Sequence</span>( | |
| [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()] | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lw62pa">ကျွန်တော်တို့ <code>NFD</code> Unicode normalizer ကိုလည်း အသုံးပြုနေပါတယ်၊ ဘာလို့လဲဆိုတော့ မဟုတ်ရင် <code>StripAccents</code> normalizer က accented characters တွေကို မှန်ကန်စွာ မှတ်မိမှာ မဟုတ်ဘဲ ၎င်းတို့ကို ဖယ်ရှားနိုင်မှာ မဟုတ်ပါဘူး။</p> <p data-svelte-h="svelte-136j7i5">အရင်က တွေ့ခဲ့ရတဲ့အတိုင်း၊ <code>normalizer</code> ရဲ့ <code>normalize_str()</code> method ကို အသုံးပြုပြီး ပေးထားတဲ့ text ပေါ်မှာ ဒါက ဘယ်လိုအကျိုးသက်ရောက်မှု ရှိလဲဆိုတာ ကြည့်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(tokenizer.normalizer.normalize_str(<span class="hljs-string">"Héllò hôw are ü?"</span>))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button 
class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->hello how are u?<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-13a6mef"><p><strong>ဆက်လက်လေ့လာရန်</strong> ယခင် normalizers ၏ versions နှစ်ခုကို unicode character <code>u"\u0085"</code> ပါဝင်သော string တစ်ခုပေါ်တွင် စမ်းသပ်ပါက၊ ဤ normalizers နှစ်ခုသည် အတိအကျတူညီခြင်းမရှိသည်ကို သင်သတိထားမိပါလိမ့်မည်။ | |
| <code>normalizers.Sequence</code> ပါသော version ကို အလွန်အမင်း ရှုပ်ထွေးမှုမဖြစ်စေရန်၊ <code>clean_text</code> argument ကို <code>True</code> (၎င်းသည် default behavior ဖြစ်သည်) ဟု သတ်မှတ်ထားသောအခါ <code>BertNormalizer</code> လိုအပ်သည့် Regex replacements များကို ကျွန်ုပ်တို့ ထည့်သွင်းမထားပါ။ သို့သော် စိတ်မပူပါနှင့်၊ အသုံးဝင်သော <code>BertNormalizer</code> ကို အသုံးမပြုဘဲ <code>normalizers.Replace</code> နှစ်ခုကို normalizers sequence တွင် ထပ်ထည့်ခြင်းဖြင့် အတိအကျတူညီသော normalization ကို ရရှိနိုင်ပါသည်။</p></blockquote> <p data-svelte-h="svelte-b1cmuk">နောက်တစ်ခုက pre-tokenization အဆင့်ပါ။ ထပ်မံပြီး၊ ကျွန်တော်တို့ အသုံးပြုနိုင်တဲ့ prebuilt <code>BertPreTokenizer</code> တစ်ခုရှိပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()<!-- HTML_TAG_END --></pre></div> 
<p data-svelte-h="svelte-1jfyjke">ဒါမှမဟုတ် အစကနေ တည်ဆောက်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-wm20ec"><code>Whitespace</code> pre-tokenizer က whitespace နဲ့ letters, digits, ဒါမှမဟုတ် underscore character မဟုတ်တဲ့ characters အားလုံးကို ပိုင်းခြားတာကြောင့်၊ ဒါက နည်းပညာအရ whitespace နဲ့ punctuation တွေပေါ်မှာ ပိုင်းခြားတယ်ဆိုတာ သတိပြုပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" 
fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test my pre-tokenizer."</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform 
-translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">'Let'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">"'"</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">4</span>)), (<span class="hljs-string">'s'</span>, (<span class="hljs-number">4</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'test'</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'my'</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span class="hljs-string">'pre'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">17</span>)), | |
| (<span class="hljs-string">'-'</span>, (<span class="hljs-number">17</span>, <span class="hljs-number">18</span>)), (<span class="hljs-string">'tokenizer'</span>, (<span class="hljs-number">18</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">'.'</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ioiki1">အကယ်၍ သင်ဟာ whitespace ပေါ်မှာပဲ ပိုင်းခြားချင်တယ်ဆိုရင်၊ <code>WhitespaceSplit</code> pre-tokenizer ကို အစားအသုံးပြုသင့်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pre_tokenizer = pre_tokenizers.WhitespaceSplit() | |
| pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test my pre-tokenizer."</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">"Let's"</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'test'</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'my'</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span class="hljs-string">'pre-tokenizer.'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">28</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hx4h33">normalizers တွေနဲ့ တူတူပဲ၊ pre-tokenizers များစွာကို ပေါင်းစပ်ဖို့ <code>Sequence</code> ကို 
အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pre_tokenizer = pre_tokenizers.<span class="hljs-type">Sequence</span>( | |
| [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()] | |
| ) | |
| pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test my pre-tokenizer."</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">'Let'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">"'"</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">4</span>)), (<span class="hljs-string">'s'</span>, (<span class="hljs-number">4</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'test'</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'my'</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span class="hljs-string">'pre'</span>, (<span class="hljs-number">14</span>, 
<span class="hljs-number">17</span>)), | |
| (<span class="hljs-string">'-'</span>, (<span class="hljs-number">17</span>, <span class="hljs-number">18</span>)), (<span class="hljs-string">'tokenizer'</span>, (<span class="hljs-number">18</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">'.'</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-h2yxwq">tokenization pipeline ရဲ့ နောက်တစ်ဆင့်က inputs တွေကို model ကနေတစ်ဆင့် run ခြင်းပါ။ ကျွန်တော်တို့ model ကို initialization မှာ သတ်မှတ်ထားပြီးသားဖြစ်ပေမယ့်၊ ဒါကို train လုပ်ဖို့တော့ လိုအပ်ပါသေးတယ်။ ဒါအတွက် <code>WordPieceTrainer</code> လိုအပ်ပါလိမ့်မယ်။ 🤗 Tokenizers မှာ trainer တစ်ခုကို instantiate လုပ်တဲ့အခါ မှတ်ထားရမယ့် အဓိကအချက်ကတော့ သင်အသုံးပြုဖို့ ရည်ရွယ်ထားတဲ့ special tokens အားလုံးကို ၎င်းဆီ ပေးဖို့ လိုအပ်ပါတယ် — မဟုတ်ရင် ၎င်းတို့ training corpus ထဲမှာ မပါဝင်တဲ့အတွက် vocabulary ထဲကို ထည့်သွင်းပေးမှာ မဟုတ်ပါဘူး။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 
border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->special_tokens = [<span class="hljs-string">"[UNK]"</span>, <span class="hljs-string">"[PAD]"</span>, <span class="hljs-string">"[CLS]"</span>, <span class="hljs-string">"[SEP]"</span>, <span class="hljs-string">"[MASK]"</span>] | |
| trainer = trainers.WordPieceTrainer(vocab_size=<span class="hljs-number">25000</span>, special_tokens=special_tokens)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2i9ubf"><code>vocab_size</code> နဲ့ <code>special_tokens</code> ကို သတ်မှတ်ခြင်းအပြင်၊ <code>min_frequency</code> (token တစ်ခု vocabulary ထဲမှာ ပါဝင်ဖို့ အနည်းဆုံး ဘယ်နှစ်ကြိမ် ပေါ်လာရမလဲ) ကို သတ်မှတ်နိုင်ပါတယ်၊ ဒါမှမဟုတ် <code>continuing_subword_prefix</code> ကို ပြောင်းလဲနိုင်ပါတယ် (ကျွန်တော်တို့ <code>##</code> နဲ့ မတူတာတစ်ခုကို အသုံးပြုချင်ရင်)။</p> <p data-svelte-h="svelte-1p6vkx5">ကျွန်တော်တို့ အစောပိုင်းက သတ်မှတ်ခဲ့တဲ့ iterator ကို အသုံးပြုပြီး model ကို train လုပ်ဖို့အတွက်၊ ဒီ command ကို run ရုံပါပဲ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)<!-- HTML_TAG_END 
--></pre></div> <p data-svelte-h="svelte-46cjj3">ကျွန်တော်တို့ tokenizer ကို train လုပ်ဖို့ text files တွေကိုလည်း အသုံးပြုနိုင်ပါတယ်။ ဒါက အောက်ပါအတိုင်း ဖြစ်ပါလိမ့်မယ် (ကျွန်တော်တို့ model ကို အရင်ဆုံး empty <code>WordPiece</code> တစ်ခုနဲ့ reinitialize လုပ်ပါတယ်)-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.model = models.WordPiece(unk_token=<span class="hljs-string">"[UNK]"</span>) | |
| tokenizer.train([<span class="hljs-string">"wikitext-2.txt"</span>], trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9gaf68">ကိစ္စနှစ်ခုလုံးမှာ၊ <code>encode()</code> method ကို ခေါ်ခြင်းဖြင့် tokenizer ကို text တစ်ခုပေါ်မှာ စမ်းသပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer."</span>) | |
| <span class="hljs-built_in">print</span>(encoding.tokens)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'test'</span>, <span class="hljs-string">'this'</span>, <span class="hljs-string">'tok'</span>, <span class="hljs-string">'##eni'</span>, <span class="hljs-string">'##zer'</span>, <span class="hljs-string">'.'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jy6r65">ရရှိတဲ့ <code>encoding</code> ဟာ <code>Encoding</code> တစ်ခုဖြစ်ပြီး၊ ၎င်းရဲ့ attributes များစွာ ( <code>ids</code>, <code>type_ids</code>, <code>tokens</code>, <code>offsets</code>, <code>attention_mask</code>, <code>special_tokens_mask</code>, နဲ့ 
<code>overflowing</code>) ထဲမှာ tokenizer ရဲ့ လိုအပ်တဲ့ outputs အားလုံး ပါဝင်ပါတယ်။</p> <p data-svelte-h="svelte-15p3n4u">tokenization pipeline ရဲ့ နောက်ဆုံးအဆင့်က post-processing ပါ။ ကျွန်တော်တို့ <code>[CLS]</code> token ကို အစမှာ ထည့်သွင်းဖို့နဲ့ <code>[SEP]</code> token ကို အဆုံးမှာ ထည့်သွင်းဖို့ လိုအပ်ပါတယ် (ဒါမှမဟုတ် sentence pair ရှိရင် sentence တစ်ခုစီရဲ့ နောက်မှာ)။ ဒါအတွက် <code>TemplateProcessing</code> ကို ကျွန်တော်တို့ အသုံးပြုပါမယ်၊ ဒါပေမယ့် ပထမဆုံး vocabulary ထဲက <code>[CLS]</code> နဲ့ <code>[SEP]</code> tokens တွေရဲ့ IDs တွေကို သိဖို့ လိုအပ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->cls_token_id = tokenizer.token_to_id(<span class="hljs-string">"[CLS]"</span>) | |
| sep_token_id = tokenizer.token_to_id(<span class="hljs-string">"[SEP]"</span>) | |
| <span class="hljs-built_in">print</span>(cls_token_id, sep_token_id)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-number">2</span>, <span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gzy7xj"><code>TemplateProcessing</code> အတွက် template ကို ရေးဖို့၊ single sentence တစ်ခုနဲ့ sentence pair တစ်ခုကို ဘယ်လို ကိုင်တွယ်ရမယ်ဆိုတာ သတ်မှတ်ရပါမယ်။ နှစ်ခုလုံးအတွက်၊ ကျွန်တော်တို့ အသုံးပြုချင်တဲ့ special tokens တွေကို ရေးပါတယ်၊ ပထမ (သို့မဟုတ် single) sentence ကို <code>$A</code> နဲ့ ကိုယ်စားပြုပြီး၊ ဒုတိယ sentence (pair တစ်ခုကို encoding လုပ်ရင်) ကို <code>$B</code> နဲ့ ကိုယ်စားပြုပါတယ်။ ဒါတွေတစ်ခုစီ (special tokens နဲ့ sentences) အတွက်၊ colon နောက်မှာ သက်ဆိုင်ရာ token type ID ကိုလည်း သတ်မှတ်ပါတယ်။</p> <p data-svelte-h="svelte-9e75ys">classic 
BERT template ကို အောက်ပါအတိုင်း သတ်မှတ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.post_processor = processors.TemplateProcessing( | |
| single=<span class="hljs-string">f"[CLS]:0 $A:0 [SEP]:0"</span>, | |
| pair=<span class="hljs-string">f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1"</span>, | |
| special_tokens=[(<span class="hljs-string">"[CLS]"</span>, cls_token_id), (<span class="hljs-string">"[SEP]"</span>, sep_token_id)], | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rzgr0m">special tokens တွေရဲ့ IDs တွေကို ပေးပို့ဖို့ လိုအပ်တယ်ဆိုတာ သတိပြုပါ။ ဒါမှ tokenizer က ၎င်းတို့ကို ၎င်းတို့ရဲ့ IDs တွေအဖြစ် မှန်ကန်စွာ ပြောင်းလဲနိုင်မှာပါ။</p> <p data-svelte-h="svelte-i1t2lx">ဒါကို ထည့်သွင်းပြီးတာနဲ့၊ ကျွန်တော်တို့ရဲ့ ယခင်ဥပမာကို ပြန်ကြည့်မယ်ဆိုရင် အောက်ပါရလဒ်ကို ရရှိပါလိမ့်မယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer."</span>) | |
| <span class="hljs-built_in">print</span>(encoding.tokens)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'[CLS]'</span>, <span class="hljs-string">'let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'test'</span>, <span class="hljs-string">'this'</span>, <span class="hljs-string">'tok'</span>, <span class="hljs-string">'##eni'</span>, <span class="hljs-string">'##zer'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'[SEP]'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17sob19">ပြီးတော့ sentence pair တစ်ခုပေါ်မှာဆိုရင်၊ မှန်ကန်တဲ့ ရလဒ်ကို ကျွန်တော်တို့ ရရှိပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button 
class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer..."</span>, <span class="hljs-string">"on a pair of sentences."</span>) | |
| <span class="hljs-built_in">print</span>(encoding.tokens) | |
| <span class="hljs-built_in">print</span>(encoding.type_ids)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'[CLS]'</span>, <span class="hljs-string">'let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'test'</span>, <span class="hljs-string">'this'</span>, <span class="hljs-string">'tok'</span>, <span class="hljs-string">'##eni'</span>, <span class="hljs-string">'##zer'</span>, <span class="hljs-string">'...'</span>, <span class="hljs-string">'[SEP]'</span>, <span class="hljs-string">'on'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'pair'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'sentences'</span>, <span class="hljs-string">'.'</span>, <span 
class="hljs-string">'[SEP]'</span>] | |
| [<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xtsg0m">ဒီ tokenizer ကို အစကနေ တည်ဆောက်တာ နီးပါးပြီးစီးပါပြီ — နောက်ဆုံးအဆင့်က decoder တစ်ခု ထည့်သွင်းဖို့ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre 
class=""><!-- HTML_TAG_START -->tokenizer.decoder = decoders.WordPiece(prefix=<span class="hljs-string">"##"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jeksy2">ကျွန်တော်တို့ရဲ့ ယခင် <code>encoding</code> ပေါ်မှာ စမ်းသပ်ကြည့်ရအောင်…</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decode(encoding.ids)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">"let's test this tokenizer... on a pair of sentences."</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-s3e4l1">ကောင်းပါပြီ! ကျွန်တော်တို့ရဲ့ tokenizer ကို ဒီလို JSON file တစ်ခုတည်းမှာ သိမ်းဆည်းနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform 
-translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.save(<span class="hljs-string">"tokenizer.json"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gsf4pl">အဲဒီနောက် <code>from_file()</code> method နဲ့ <code>Tokenizer</code> object တစ်ခုထဲကို အဲဒီ file ကို ပြန်လည် load လုပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->new_tokenizer = Tokenizer.from_file(<span class="hljs-string">"tokenizer.json"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-saxts2">ဒီ tokenizer ကို 🤗 Transformers မှာ အသုံးပြုဖို့အတွက်၊ ကျွန်တော်တို့ဟာ ဒါကို <code>PreTrainedTokenizerFast</code> ထဲမှာ wrap လုပ်ရပါမယ်။ ကျွန်တော်တို့ဟာ generic class 
ကို အသုံးပြုနိုင်ပါတယ် ဒါမှမဟုတ် ကျွန်တော်တို့ရဲ့ tokenizer က လက်ရှိ model တစ်ခုနဲ့ ကိုက်ညီတယ်ဆိုရင် အဲဒီ class ကို အသုံးပြုနိုင်ပါတယ် (ဒီနေရာမှာ <code>BertTokenizerFast</code>)။ သင်ဟာ ဒီသင်ခန်းစာကို tokenizer အသစ်တစ်ခု တည်ဆောက်ဖို့ အသုံးပြုတယ်ဆိုရင်၊ ပထမ option ကို အသုံးပြုရပါလိမ့်မယ်။</p> <p data-svelte-h="svelte-c5lp68">tokenizer ကို <code>PreTrainedTokenizerFast</code> ထဲမှာ wrap လုပ်ဖို့၊ ကျွန်တော်တို့ တည်ဆောက်ခဲ့တဲ့ tokenizer ကို <code>tokenizer_object</code> အဖြစ် ပေးနိုင်ပါတယ် ဒါမှမဟုတ် ကျွန်တော်တို့ သိမ်းဆည်းခဲ့တဲ့ tokenizer file ကို <code>tokenizer_file</code> အဖြစ် ပေးနိုင်ပါတယ်။ အဓိက မှတ်ထားရမယ့်အချက်ကတော့ ကျွန်တော်တို့ special tokens အားလုံးကို ကိုယ်တိုင် သတ်မှတ်ပေးရပါမယ်၊ ဘာလို့လဲဆိုတော့ အဲဒီ class က <code>tokenizer</code> object ကနေ ဘယ် token က mask token လဲ၊ <code>[CLS]</code> token လဲ စတာတွေကို မသိနိုင်လို့ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast | |
| wrapped_tokenizer = PreTrainedTokenizerFast( | |
| tokenizer_object=tokenizer, | |
| <span class="hljs-comment"># tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively</span> | |
| unk_token=<span class="hljs-string">"[UNK]"</span>, | |
| pad_token=<span class="hljs-string">"[PAD]"</span>, | |
| cls_token=<span class="hljs-string">"[CLS]"</span>, | |
| sep_token=<span class="hljs-string">"[SEP]"</span>, | |
| mask_token=<span class="hljs-string">"[MASK]"</span>, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3muxye">အကယ်၍ သင်ဟာ သီးခြား tokenizer class တစ်ခု (ဥပမာ - <code>BertTokenizerFast</code>) ကို အသုံးပြုနေတယ်ဆိုရင်၊ default tokens တွေနဲ့ ကွာခြားတဲ့ special tokens တွေကိုသာ သတ်မှတ်ပေးဖို့ လိုအပ်ပါလိမ့်မယ် (ဒီနေရာမှာတော့ မရှိပါဘူး)။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> BertTokenizerFast | |
| wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3pvusq">အဲဒီနောက် ဒီ tokenizer ကို တခြား 🤗 Transformers tokenizer တွေလိုပဲ သင်အသုံးပြုနိုင်ပါတယ်။ <code>save_pretrained()</code> method နဲ့ သိမ်းဆည်းနိုင်ပါတယ် ဒါမှမဟုတ် <code>push_to_hub()</code> method နဲ့ Hub ကို upload လုပ်နိုင်ပါတယ်။</p> <p data-svelte-h="svelte-1lyflqe">ကျွန်တော်တို့ WordPiece tokenizer တစ်ခုကို ဘယ်လိုတည်ဆောက်ရမယ်ဆိုတာ မြင်တွေ့ခဲ့ရပြီဆိုတော့၊ BPE tokenizer တစ်ခုအတွက်လည်း အတူတူလုပ်ကြစို့။ သင်အဆင့်တွေအားလုံးကို သိပြီးသားဖြစ်တဲ့အတွက် နည်းနည်းပိုမြန်မြန် သွားပါမယ်၊ ခြားနားချက်တွေကိုပဲ မီးမောင်းထိုးပြပါမယ်။</p> <h2 class="relative group"><a id="building-a-bpe-tokenizer-from-scratch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-bpe-tokenizer-from-scratch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အစကနေ BPE Tokenizer တစ်ခု တည်ဆောက်ခြင်း</span></h2> <p data-svelte-h="svelte-1hdw5sa">အခု GPT-2 tokenizer တစ်ခု တည်ဆောက်ကြစို့။ BERT tokenizer အတွက်လိုပဲ၊ BPE model တစ်ခုနဲ့ <code>Tokenizer</code> တစ်ခုကို initialize လုပ်ခြင်းဖြင့် စတင်ပါမယ်။</p> <div class="code-block 
relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = Tokenizer(models.BPE())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ec922a">BERT အတွက်လိုပဲ၊ ကျွန်တော်တို့မှာ vocabulary ရှိရင် (ဒီကိစ္စမှာ <code>vocab</code> နဲ့ <code>merges</code> ကို ပေးဖို့ လိုပါလိမ့်မယ်) ဒီ model ကို vocabulary နဲ့ initialize လုပ်နိုင်ပါတယ်။ ဒါပေမယ့် ကျွန်တော်တို့ အစကနေ train မှာဖြစ်တဲ့အတွက်၊ ဒါကို လုပ်ဖို့မလိုအပ်ပါဘူး။ GPT-2 က byte-level BPE ကို အသုံးပြုပြီး ဒါက <code>unk_token</code> မလိုအပ်တဲ့အတွက် <code>unk_token</code> ကို သတ်မှတ်ပေးဖို့လည်း ကျွန်တော်တို့ မလိုအပ်ပါဘူး။</p> <p data-svelte-h="svelte-1uahpvd">GPT-2 က normalizer ကို အသုံးမပြုတာကြောင့်၊ အဲဒီအဆင့်ကို ကျော်ပြီး pre-tokenization ကို တိုက်ရိုက်သွားပါမယ်-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm 
focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xea6z8">ဒီနေရာမှာ <code>ByteLevel</code> ကို ကျွန်တော်တို့ ထည့်သွင်းခဲ့တဲ့ option က sentence အစမှာ space မထည့်ဖို့ပါပဲ (ဒါက default အားဖြင့် ထည့်ပါတယ်)။ အရင်ကလို ဥပမာ text တစ်ခုရဲ့ pre-tokenization ကို ကြည့်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test pre-tokenization!"</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">'Let'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">"'s"</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'Ġtest'</span>, (<span class="hljs-number">5</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'Ġpre'</span>, (<span class="hljs-number">10</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">'-'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">15</span>)), | |
| (<span class="hljs-string">'tokenization'</span>, (<span class="hljs-number">15</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">'!'</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8cpkli">နောက်တစ်ခုက training လိုအပ်တဲ့ model ပါ။ GPT-2 အတွက်၊ တစ်ခုတည်းသော special token က end-of-text token ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trainer = trainers.BpeTrainer(vocab_size=<span class="hljs-number">25000</span>, special_tokens=[<span class="hljs-string">"&lt;|endoftext|&gt;"</span>]) | |
| tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1x3ijoa"><code>WordPieceTrainer</code> နဲ့ တူတူပဲ၊ <code>vocab_size</code> နဲ့ <code>special_tokens</code> အပြင်၊ ကျွန်တော်တို့ လိုအပ်ရင် <code>min_frequency</code> ကို သတ်မှတ်နိုင်ပါတယ်၊ ဒါမှမဟုတ် end-of-word suffix ( <code>&lt;/w&gt;</code> လိုမျိုး) ရှိရင် <code>end_of_word_suffix</code> နဲ့ သတ်မှတ်နိုင်ပါတယ်။</p> <p data-svelte-h="svelte-qeny4o">ဒီ tokenizer ကို text files တွေပေါ်မှာလည်း train လုပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.model = models.BPE() | |
| tokenizer.train([<span class="hljs-string">"wikitext-2.txt"</span>], trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-f8svam">sample text တစ်ခုရဲ့ tokenization ကို ကြည့်ရအောင်-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer."</span>) | |
| <span class="hljs-built_in">print</span>(encoding.tokens)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'L'</span>, <span class="hljs-string">'et'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'Ġtest'</span>, <span class="hljs-string">'Ġthis'</span>, <span class="hljs-string">'Ġto'</span>, <span class="hljs-string">'ken'</span>, <span class="hljs-string">'izer'</span>, <span class="hljs-string">'.'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cv10os">GPT-2 tokenizer အတွက် byte-level post-processing ကို အောက်ပါအတိုင်း ကျွန်တော်တို့ အသုံးပြုပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm 
focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.post_processor = processors.ByteLevel(trim_offsets=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-au8l8c"><code>trim_offsets = False</code> option က post-processor ကို ‘Ġ’ နဲ့ စတင်တဲ့ tokens တွေရဲ့ offsets တွေကို ရှိတဲ့အတိုင်း ထားဖို့ ညွှန်ပြပါတယ်။ ဒီနည်းနဲ့ offsets တွေရဲ့ အစက word ရဲ့ ပထမဆုံး character ကို ညွှန်ပြမယ့်အစား word ရဲ့ ရှေ့က space ကို ညွှန်ပြပါလိမ့်မယ် (space က နည်းပညာအရ token ရဲ့ အစိတ်အပိုင်းဖြစ်တာကြောင့်)။ ကျွန်တော်တို့ အခု encode လုပ်ခဲ့တဲ့ text နဲ့ ရလဒ်ကို ကြည့်ရအောင်။ <code>'Ġtest'</code> က index 4 မှာရှိတဲ့ token ဖြစ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sentence = <span class="hljs-string">"Let's test this tokenizer."</span> | |
| encoding = tokenizer.encode(sentence) | |
| start, end = encoding.offsets[<span class="hljs-number">4</span>] | |
| sentence[start:end]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">' test'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1a1apnj">နောက်ဆုံးအနေနဲ့၊ byte-level decoder တစ်ခု ထည့်သွင်းပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decoder = decoders.ByteLevel()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-675ell">ပြီးတော့ ဒါက မှန်ကန်စွာ အလုပ်လုပ်လားဆိုတာ ထပ်မံစစ်ဆေးနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; 
border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decode(encoding.ids)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">"Let's test this tokenizer."</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fws21w">ကောင်းပါပြီ! 
အခု ပြီးစီးပြီဆိုတော့၊ tokenizer ကို အရင်လိုပဲ သိမ်းဆည်းနိုင်ပြီး၊ 🤗 Transformers မှာ အသုံးပြုချင်တယ်ဆိုရင် <code>PreTrainedTokenizerFast</code> ဒါမှမဟုတ် <code>GPT2TokenizerFast</code> ထဲမှာ wrap လုပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast | |
| wrapped_tokenizer = PreTrainedTokenizerFast( | |
| tokenizer_object=tokenizer, | |
| bos_token=<span class="hljs-string">"&lt;|endoftext|&gt;"</span>, | |
| eos_token=<span class="hljs-string">"&lt;|endoftext|&gt;"</span>, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pzgdft">ဒါမှမဟုတ်:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> GPT2TokenizerFast | |
| wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17tmyse">နောက်ဆုံး ဥပမာအနေနဲ့၊ Unigram tokenizer တစ်ခုကို အစကနေ ဘယ်လိုတည်ဆောက်ရမလဲဆိုတာ ကျွန်တော်တို့ ပြသပေးပါမယ်။</p> <h2 class="relative group"><a id="building-a-unigram-tokenizer-from-scratch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-unigram-tokenizer-from-scratch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အစကနေ Unigram Tokenizer တစ်ခု တည်ဆောက်ခြင်း</span></h2> <p data-svelte-h="svelte-1p4z3wo">အခု XLNet tokenizer တစ်ခု တည်ဆောက်ကြစို့။ ယခင် tokenizers တွေအတွက်လိုပဲ၊ Unigram model တစ်ခုနဲ့ <code>Tokenizer</code> တစ်ခုကို initialize လုပ်ခြင်းဖြင့် စတင်ပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = Tokenizer(models.Unigram())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6jmcsf">ထပ်မံပြီး၊ ကျွန်တော်တို့မှာ vocabulary ရှိရင် ဒီ model ကို vocabulary နဲ့ initialize လုပ်နိုင်ပါတယ်။</p> <p data-svelte-h="svelte-1z0uul8">normalization အတွက်၊ XLNet က replacements အနည်းငယ် (SentencePiece ကနေလာတာပါ) ကို အသုံးပြုပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded 
font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Regex | |
| tokenizer.normalizer = normalizers.<span class="hljs-type">Sequence</span>( | |
| [ | |
| normalizers.Replace(<span class="hljs-string">"``"</span>, <span class="hljs-string">'"'</span>), | |
| normalizers.Replace(<span class="hljs-string">"''"</span>, <span class="hljs-string">'"'</span>), | |
| normalizers.NFKD(), | |
| normalizers.StripAccents(), | |
| normalizers.Replace(Regex(<span class="hljs-string">" {2,}"</span>), <span class="hljs-string">" "</span>), | |
| ] | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jnf39y">ဒါက <code>``</code> နဲ့ <code>''</code> တွေကို <code>"</code> နဲ့ အစားထိုးပြီး spaces နှစ်ခု သို့မဟုတ် ထို့ထက်ပိုတဲ့ sequence တွေကို single space တစ်ခုနဲ့ အစားထိုးပါတယ်၊ ဒါ့အပြင် tokenize လုပ်မယ့် texts ထဲက accents တွေကို ဖယ်ရှားပါတယ်။</p> <p data-svelte-h="svelte-17fu9q6">မည်သည့် SentencePiece tokenizer အတွက်မဆို အသုံးပြုရမယ့် pre-tokenizer က <code>Metaspace</code> ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1x0owi0">အရင်ကလို ဥပမာ text တစ်ခုရဲ့ pre-tokenization ကို ကြည့်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 
cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test the pre-tokenizer!"</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none 
transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">"▁Let's"</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'▁test'</span>, (<span class="hljs-number">5</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'▁the'</span>, (<span class="hljs-number">10</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">'▁pre-tokenizer!'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">29</span>))]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xd39o2">နောက်တစ်ခုက training လိုအပ်တဲ့ model ပါ။ XLNet မှာ special tokens တွေ အတော်လေး များပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 
translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->special_tokens = [<span class="hljs-string">"&lt;cls&gt;"</span>, <span class="hljs-string">"&lt;sep&gt;"</span>, <span class="hljs-string">"&lt;unk&gt;"</span>, <span class="hljs-string">"&lt;pad&gt;"</span>, <span class="hljs-string">"&lt;mask&gt;"</span>, <span class="hljs-string">"&lt;s&gt;"</span>, <span class="hljs-string">"&lt;/s&gt;"</span>] | |
| trainer = trainers.UnigramTrainer( | |
| vocab_size=<span class="hljs-number">25000</span>, special_tokens=special_tokens, unk_token=<span class="hljs-string">"&lt;unk&gt;"</span> | |
| ) | |
| tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rhmk1c"><code>UnigramTrainer</code> အတွက် မမေ့မလျော့ ထည့်သွင်းရမယ့် အရေးကြီးတဲ့ argument တစ်ခုက <code>unk_token</code> ပါ။ ကျွန်တော်တို့ဟာ Unigram algorithm အတွက် သီးခြား arguments တွေဖြစ်တဲ့ tokens တွေ ဖယ်ရှားတဲ့ အဆင့်တစ်ခုစီအတွက် <code>shrinking_factor</code> (default က 0.75) ဒါမှမဟုတ် ပေးထားတဲ့ token တစ်ခုရဲ့ အမြင့်ဆုံးအရှည်ကို သတ်မှတ်ဖို့ <code>max_piece_length</code> (default က 16) တို့ကိုလည်း ပေးနိုင်ပါတယ်။</p> <p data-svelte-h="svelte-qeny4o">ဒီ tokenizer ကို text files တွေပေါ်မှာလည်း train လုပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.model = models.Unigram() | |
| tokenizer.train([<span class="hljs-string">"wikitext-2.txt"</span>], trainer=trainer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-867lno">sample text တစ်ခုရဲ့ tokenization ကို ကြည့်ရအောင်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer."</span>) | |
| <span class="hljs-built_in">print</span>(encoding.tokens)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'▁Let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'▁test'</span>, <span class="hljs-string">'▁this'</span>, <span class="hljs-string">'▁to'</span>, <span class="hljs-string">'ken'</span>, <span class="hljs-string">'izer'</span>, <span class="hljs-string">'.'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e4tu7q">XLNet ရဲ့ ထူးခြားချက်တစ်ခုကတော့ <code>&lt;cls&gt;</code> token ကို sentence ရဲ့ အဆုံးမှာ ထားပြီး၊ type ID ကို 2 (အခြား tokens တွေနဲ့ ကွဲပြားစေရန်) ပေးထားတာပါပဲ။ ရလဒ်အနေနဲ့ ဒါက ဘယ်ဘက်မှာ padding လုပ်တာပါ။ special tokens အားလုံးနဲ့ token type IDs တွေကို BERT အတွက်လို template 
တစ်ခုနဲ့ ကျွန်တော်တို့ ကိုင်တွယ်နိုင်ပါတယ်။ ဒါပေမယ့် ပထမဆုံး <code>&lt;cls&gt;</code> နဲ့ <code>&lt;sep&gt;</code> tokens တွေရဲ့ IDs တွေကို ရယူရပါမယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->cls_token_id = tokenizer.token_to_id(<span class="hljs-string">"&lt;cls&gt;"</span>) | |
| sep_token_id = tokenizer.token_to_id(<span class="hljs-string">"&lt;sep&gt;"</span>) | |
| <span class="hljs-built_in">print</span>(cls_token_id, sep_token_id)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">0</span> <span class="hljs-number">1</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-j837r6">template က ဒီလိုမျိုး ဖြစ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.post_processor = processors.TemplateProcessing( | |
| single=<span class="hljs-string">"$A:0 &lt;sep&gt;:0 &lt;cls&gt;:2"</span>, | |
| pair=<span class="hljs-string">"$A:0 &lt;sep&gt;:0 $B:1 &lt;sep&gt;:1 &lt;cls&gt;:2"</span>, | |
| special_tokens=[(<span class="hljs-string">"&lt;sep&gt;"</span>, sep_token_id), (<span class="hljs-string">"&lt;cls&gt;"</span>, cls_token_id)], | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1amd51r">ပြီးတော့ sentence pair တစ်ခုကို encode လုပ်ခြင်းဖြင့် ဒါက အလုပ်လုပ်လားဆိုတာ စမ်းသပ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer..."</span>, <span class="hljs-string">"on a pair of sentences!"</span>) | |
| <span class="hljs-built_in">print</span>(encoding.tokens) | |
| <span class="hljs-built_in">print</span>(encoding.type_ids)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'▁Let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'▁test'</span>, <span class="hljs-string">'▁this'</span>, <span class="hljs-string">'▁to'</span>, <span class="hljs-string">'ken'</span>, <span class="hljs-string">'izer'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'&lt;sep&gt;'</span>, <span class="hljs-string">'▁'</span>, <span class="hljs-string">'on'</span>, <span class="hljs-string">'▁'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'▁pair'</span>, | |
| <span class="hljs-string">'▁of'</span>, <span class="hljs-string">'▁sentence'</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'!'</span>, <span class="hljs-string">'&lt;sep&gt;'</span>, <span class="hljs-string">'&lt;cls&gt;'</span>] | |
| [<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1btthbu">နောက်ဆုံးအနေနဲ့၊ <code>Metaspace</code> decoder တစ်ခု ထည့်သွင်းပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 
border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.decoder = decoders.Metaspace()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vv71hp">ပြီးတော့ ဒီ tokenizer နဲ့ ပြီးပါပြီ! ကျွန်တော်တို့ tokenizer ကို အရင်လိုပဲ သိမ်းဆည်းနိုင်ပြီး၊ 🤗 Transformers မှာ အသုံးပြုချင်တယ်ဆိုရင် <code>PreTrainedTokenizerFast</code> ဒါမှမဟုတ် <code>XLNetTokenizerFast</code> ထဲမှာ wrap လုပ်နိုင်ပါတယ်။ <code>PreTrainedTokenizerFast</code> ကို အသုံးပြုတဲ့အခါ သတိပြုရမယ့်အချက်တစ်ခုကတော့ special tokens တွေအပြင်၊ Hugging Face library ကို ဘယ်ဘက်ကနေ padding လုပ်ဖို့ ကျွန်တော်တို့ ပြောပြဖို့ လိုအပ်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span 
class="hljs-keyword">import</span> PreTrainedTokenizerFast | |
| wrapped_tokenizer = PreTrainedTokenizerFast( | |
| tokenizer_object=tokenizer, | |
| bos_token=<span class="hljs-string">"<s>"</span>, | |
| eos_token=<span class="hljs-string">"</s>"</span>, | |
| unk_token=<span class="hljs-string">"<unk>"</span>, | |
| pad_token=<span class="hljs-string">"<pad>"</span>, | |
| cls_token=<span class="hljs-string">"<cls>"</span>, | |
| sep_token=<span class="hljs-string">"<sep>"</span>, | |
| mask_token=<span class="hljs-string">"<mask>"</span>, | |
| padding_side=<span class="hljs-string">"left"</span>, | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pzgdft">ဒါမှမဟုတ်:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> XLNetTokenizerFast | |
| wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mip31h">existing tokenizers တွေကို တည်ဆောက်ရာမှာ building blocks အမျိုးမျိုးကို ဘယ်လိုအသုံးပြုလဲဆိုတာ သင်မြင်တွေ့ခဲ့ရပြီဆိုတော့၊ 🤗 Tokenizers library နဲ့ သင်လိုချင်တဲ့ မည်သည့် tokenizer ကိုမဆို ရေးနိုင်ပြီး 🤗 Transformers မှာ အသုံးပြုနိုင်ပါလိမ့်မယ်။</p> <h2 class="relative group"><a id="ဝဟရ-ရငလငခက-glossary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဝဟရ-ရငလငခက-glossary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဝေါဟာရ ရှင်းလင်းချက် (Glossary)</span></h2> <ul data-svelte-h="svelte-90urvs"><li><strong>Tokenization</strong>: စာသား (သို့မဟုတ် အခြားဒေတာ) ကို AI မော်ဒယ်များ စီမံဆောင်ရွက်နိုင်ရန် tokens တွေအဖြစ် ပိုင်းခြားပေးသည့် လုပ်ငန်းစဉ်။</li> <li><strong>Normalization</strong>: စာသားကို သန့်ရှင်းရေးလုပ်ခြင်း (ဥပမာ- needless whitespace ဖယ်ရှားခြင်း၊ lowercasing, accents ဖယ်ရှားခြင်း)။</li> <li><strong>Pre-tokenization</strong>: Subword tokenization မလုပ်ဆောင်မီ စာသားကို ပိုမိုသေးငယ်သော entities (ဥပမာ- words) အဖြစ် အကြိုပိုင်းခြားခြင်း။</li> <li><strong>Tokens Sequence</strong>: စာသားကို 
ပိုင်းခြားပြီးနောက် ရရှိသော tokens များ၏ အစဉ်လိုက်။</li> <li><strong>Post-processing</strong>: Model ၏ output များကို နောက်ဆုံးအသုံးပြုမှုအတွက် ပြင်ဆင်ခြင်း လုပ်ငန်းစဉ်။</li> <li><strong>Special Tokens</strong>: Tokenizer သို့မဟုတ် model အတွက် သီးခြားအဓိပ္ပာယ်ရှိသော tokens များ (ဥပမာ- <code>[CLS]</code>, <code>[SEP]</code>, <code>[PAD]</code>)။</li> <li><strong>Attention Mask</strong>: မော်ဒယ်ကို အာရုံစိုက်သင့်သည့် tokens များနှင့် လျစ်လျူရှုသင့်သည့် (padding) tokens များကို ခွဲခြားပေးသည့် binary mask။</li> <li><strong>Token Type IDs</strong>: Sentence pair လုပ်ငန်းများတွင် input sequence တစ်ခုစီမှ token တစ်ခုစီသည် မည်သည့် sentence (ပထမ သို့မဟုတ် ဒုတိယ) နှင့် သက်ဆိုင်သည်ကို ဖော်ပြပေးသော IDs များ။</li> <li><strong>🤗 Tokenizers Library</strong>: Rust ဘာသာနဲ့ ရေးသားထားတဲ့ Hugging Face library တစ်ခုဖြစ်ပြီး မြန်ဆန်ထိရောက်တဲ့ tokenization ကို လုပ်ဆောင်ပေးသည်။</li> <li><strong><code>Tokenizer</code> Class</strong>: 🤗 Tokenizers library မှ အဓိက tokenizer class။</li> <li><strong><code>normalizers</code> Submodule</strong>: Normalization building blocks များ ပါဝင်သော submodule။</li> <li><strong><code>Normalizer</code></strong>: Normalization logic ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong><code>pre_tokenizers</code> Submodule</strong>: Pre-tokenization building blocks များ ပါဝင်သော submodule။</li> <li><strong><code>PreTokenizer</code></strong>: Pre-tokenization logic ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong><code>models</code> Submodule</strong>: Subword tokenization models များ (BPE, WordPiece, Unigram) ပါဝင်သော submodule။</li> <li><strong><code>Model</code></strong>: Subword tokenization algorithm ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong>BPE (Byte-Pair Encoding)</strong>: Subword tokenization algorithm တစ်မျိုး။</li> <li><strong>WordPiece</strong>: Subword tokenization algorithm တစ်မျိုး။</li> <li><strong>Unigram</strong>: Subword tokenization algorithm တစ်မျိုး။</li> <li><strong><code>trainers</code> Submodule</strong>: Model training အတွက် 
trainers များ ပါဝင်သော submodule။</li> <li><strong><code>Trainer</code></strong>: Model ကို corpus တစ်ခုပေါ်တွင် train လုပ်ရန် အသုံးပြုသော class။</li> <li><strong>Corpus</strong>: စာသား (သို့မဟုတ် အခြားဒေတာ) အစုအဝေးကြီးတစ်ခု။</li> <li><strong><code>post_processors</code> Submodule</strong>: Post-processing building blocks များ ပါဝင်သော submodule။</li> <li><strong><code>PostProcessor</code></strong>: Post-processing logic ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong><code>decoders</code> Submodule</strong>: Tokenization outputs များကို decode လုပ်ရန် decoders များ ပါဝင်သော submodule။</li> <li><strong><code>Decoder</code></strong>: Decoding logic ကို အကောင်အထည်ဖော်ထားသော class။</li> <li><strong><code>get_training_corpus()</code> Function</strong>: Tokenizer ကို လေ့ကျင့်ရန်အတွက် batches of texts များကို yield လုပ်သော generator function။</li> <li><strong>Generator</strong>: Python တွင် iteration လုပ်နိုင်သော object တစ်ခုဖြစ်ပြီး ၎င်းသည် အရာအားလုံးကို memory ထဲသို့ တစ်ပြိုင်နက်တည်း သိမ်းဆည်းမထားဘဲ လိုအပ်သလို တန်ဖိုးများကို ထုတ်ပေးသည်။</li> <li><strong>WikiText-2 Dataset</strong>: ဘာသာစကား model များကို လေ့ကျင့်ရန် အသုံးပြုသော dataset တစ်ခု။</li> <li><strong>Text Files</strong>: စာသားအချက်အလက်များသာ ပါဝင်သော ဖိုင်များ။</li> <li><strong>BERT Tokenizer</strong>: BERT model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong>GPT-2 Tokenizer</strong>: GPT-2 model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong>XLNet Tokenizer</strong>: XLNet model အတွက် အသုံးပြုသော tokenizer။</li> <li><strong><code>models.WordPiece(unk_token="[UNK]")</code></strong>: Unknown token အဖြစ် <code>[UNK]</code> ကို အသုံးပြုသော WordPiece model ကို ဖန်တီးခြင်း။</li> <li><strong><code>unk_token</code></strong>: Model က မသိသော tokens များကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>vocab</code></strong>: Model ၏ vocabulary (သိရှိသော tokens များစာရင်း)။</li> <li><strong><code>max_input_chars_per_word</code></strong>: word တစ်ခုအတွက် အမြင့်ဆုံး character အရေအတွက်။</li> 
<li><strong><code>BertNormalizer</code></strong>: BERT tokenizer အတွက် အကြိုတည်ဆောက်ထားသော normalizer class။</li> <li><strong><code>lowercase</code></strong>: စာလုံးများကို အသေးစာလုံးများအဖြစ် ပြောင်းလဲခြင်း။</li> <li><strong><code>strip_accents</code></strong>: စာလုံးများပေါ်ရှိ accents များကို ဖယ်ရှားခြင်း။</li> <li><strong><code>clean_text</code></strong>: Control characters များကို ဖယ်ရှားခြင်းနှင့် ထပ်နေသော spaces များကို တစ်ခုတည်းဖြင့် အစားထိုးခြင်း။</li> <li><strong><code>handle_chinese_chars</code></strong>: Chinese characters များပတ်ပတ်လည်တွင် spaces တွေ ထည့်သွင်းခြင်း။</li> <li><strong><code>bert-base-uncased</code></strong>: BERT model ၏ base version အတွက် checkpoint identifier (uncased version)။</li> <li><strong><code>normalizers.Sequence</code></strong>: normalizers များစွာကို ပေါင်းစပ်ရန် အသုံးပြုသော class။</li> <li><strong><code>normalizers.NFD()</code></strong>: Unicode normalization form D (Canonical Decomposition) ကို အသုံးပြုသော normalizer။</li> <li><strong><code>normalizers.Lowercase()</code></strong>: စာလုံးများကို အသေးစာလုံးများအဖြစ် ပြောင်းလဲသော normalizer။</li> <li><strong><code>normalizers.StripAccents()</code></strong>: accents များကို ဖယ်ရှားသော normalizer။</li> <li><strong><code>normalize_str()</code> Method</strong>: Normalizer object မှ string တစ်ခုကို normalize လုပ်သော method။</li> <li><strong><code>BertPreTokenizer</code></strong>: BERT tokenizer အတွက် အကြိုတည်ဆောက်ထားသော pre-tokenizer class။</li> <li><strong><code>pre_tokenizers.Whitespace()</code></strong>: whitespace နှင့် punctuation ဖြင့် ပိုင်းခြားသော pre-tokenizer။</li> <li><strong><code>pre_tokenizers.WhitespaceSplit()</code></strong>: whitespace ဖြင့်သာ ပိုင်းခြားသော pre-tokenizer။</li> <li><strong><code>pre_tokenizers.Punctuation()</code></strong>: punctuation ဖြင့် ပိုင်းခြားသော pre-tokenizer။</li> <li><strong><code>pre_tokenize_str()</code> Method</strong>: Pre-tokenizer object မှ string တစ်ခုကို pre-tokenize လုပ်သော method။</li> 
<li><strong><code>WordPieceTrainer</code></strong>: WordPiece model ကို train လုပ်ရန်အတွက် trainer class။</li> <li><strong><code>vocab_size</code></strong>: vocabulary ၏ အမြင့်ဆုံးအရွယ်အစား။</li> <li><strong><code>special_tokens</code></strong>: Model ၏ special tokens များ။</li> <li><strong><code>min_frequency</code></strong>: token တစ်ခု vocabulary ထဲမှာ ပါဝင်ဖို့ ဘယ်အကြိမ်ရေ အနည်းဆုံး ပေါ်လာရမလဲ။</li> <li><strong><code>continuing_subword_prefix</code></strong>: subword တစ်ခု ဆက်နေကြောင်း ဖော်ပြသော prefix (ဥပမာ- <code>##</code>)။</li> <li><strong><code>train_from_iterator()</code> Method</strong>: iterator မှ data ကို အသုံးပြုပြီး tokenizer ကို train လုပ်သော method။</li> <li><strong><code>train()</code> Method</strong>: text files များမှ data ကို အသုံးပြုပြီး tokenizer ကို train လုပ်သော method။</li> <li><strong><code>encode()</code> Method</strong>: စာသားကို tokens ID များအဖြစ် ပြောင်းလဲပေးသော tokenizer method။</li> <li><strong><code>Encoding</code> Object</strong>: <code>encode()</code> method မှ ပြန်ပေးသော object ဖြစ်ပြီး encoded inputs အားလုံး ပါဝင်သည်။</li> <li><strong><code>ids</code></strong>: Encoded tokens များ၏ ID များ။</li> <li><strong><code>type_ids</code></strong>: Token type IDs များ။</li> <li><strong><code>tokens</code></strong>: Tokenized string များ၏ list။</li> <li><strong><code>offsets</code></strong>: Offset mapping များ။</li> <li><strong><code>attention_mask</code></strong>: Attention mask။</li> <li><strong><code>special_tokens_mask</code></strong>: Special tokens mask။</li> <li><strong><code>overflowing</code></strong>: Truncate လုပ်ထားသော tokens များ။</li> <li><strong><code>token_to_id()</code> Method</strong>: Token string ကို ၎င်း၏ ID သို့ ပြောင်းလဲပေးသော tokenizer method။</li> <li><strong><code>TemplateProcessing</code></strong>: Post-processing အတွက် template ကို အသုံးပြုသော processor class။</li> <li><strong><code>single</code></strong>: single sentence အတွက် template။</li> <li><strong><code>pair</code></strong>: sentence pair အတွက် 
template။</li> <li><strong><code>$A</code></strong>: ပထမ sentence ကို ကိုယ်စားပြုသော placeholder။</li> <li><strong><code>$B</code></strong>: ဒုတိယ sentence ကို ကိုယ်စားပြုသော placeholder။</li> <li><strong><code>WordPiece Decoder</code></strong>: WordPiece tokens များကို text အဖြစ် ပြန်ပြောင်းပေးသော decoder။</li> <li><strong><code>prefix="##"</code></strong>: WordPiece decoder အတွက် subword prefix။</li> <li><strong><code>decode()</code> Method</strong>: Token IDs များကို text အဖြစ် ပြန်ပြောင်းပေးသော tokenizer method။</li> <li><strong>JSON File</strong>: JavaScript Object Notation format ဖြင့် သိမ်းဆည်းထားသော ဖိုင်။</li> <li><strong><code>save("tokenizer.json")</code></strong>: Tokenizer ကို JSON file အဖြစ် သိမ်းဆည်းသော method။</li> <li><strong><code>from_file("tokenizer.json")</code></strong>: JSON file မှ tokenizer ကို load လုပ်သော method။</li> <li><strong><code>PreTrainedTokenizerFast</code></strong>: 🤗 Transformers library မှ generic fast tokenizer class။</li> <li><strong><code>BertTokenizerFast</code></strong>: BERT model အတွက် သီးခြား fast tokenizer class။</li> <li><strong><code>tokenizer_object</code></strong>: <code>PreTrainedTokenizerFast</code> သို့ ပေးပို့သော tokenizer object။</li> <li><strong><code>tokenizer_file</code></strong>: <code>PreTrainedTokenizerFast</code> သို့ ပေးပို့သော tokenizer file ၏ path။</li> <li><strong><code>bos_token</code> (Beginning Of Sentence Token)</strong>: Sentence ၏ အစကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>eos_token</code> (End Of Sentence Token)</strong>: Sentence ၏ အဆုံးကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>pad_token</code> (Padding Token)</strong>: Sequence များကို တူညီသောအရှည်ဖြစ်အောင် ဖြည့်စွက်ရန် အသုံးပြုသော special token။</li> <li><strong><code>cls_token</code></strong>: BERT model တွင် sequence ၏ အစကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>sep_token</code></strong>: BERT model တွင် sentence တစ်ခု၏ အဆုံး သို့မဟုတ် sentence နှစ်ခုကြား ပိုင်းခြားရန် အသုံးပြုသော special 
token။</li> <li><strong><code>mask_token</code></strong>: Masked Language Modeling (MLM) တွင် စကားလုံးများကို ဝှက်ထားရန် အသုံးပြုသော special token။</li> <li><strong><code>save_pretrained()</code> Method</strong>: <code>PreTrainedTokenizerFast</code> ကို pretrained model အဖြစ် သိမ်းဆည်းသော method။</li> <li><strong><code>push_to_hub()</code> Method</strong>: <code>PreTrainedTokenizerFast</code> ကို Hugging Face Hub သို့ upload လုပ်သော method။</li> <li><strong><code>models.BPE()</code></strong>: BPE model ကို ဖန်တီးခြင်း။</li> <li><strong><code>merges</code></strong>: BPE model တွင် token merge rules များ။</li> <li><strong>Byte-level BPE</strong>: BPE tokenization တစ်မျိုးဖြစ်ပြီး bytes များကို အခြေခံ၍ လုပ်ဆောင်သည်။</li> <li><strong><code>pre_tokenizers.ByteLevel()</code></strong>: Byte-level pre-tokenization ကို လုပ်ဆောင်သော pre-tokenizer။</li> <li><strong><code>add_prefix_space=False</code></strong>: ByteLevel pre-tokenizer အတွက် sentence အစတွင် space မထည့်ရန် သတ်မှတ်ခြင်း။</li> <li><strong><code>BpeTrainer</code></strong>: BPE model ကို train လုပ်ရန်အတွက် trainer class။</li> <li><strong><code>end_of_word_suffix</code></strong>: word ၏ အဆုံးကို ကိုယ်စားပြုသော suffix (ဥပမာ- <code></w></code>)။</li> <li><strong><code>Ġ</code> Symbol</strong>: GPT-2 tokenizer တွင် space ကို ကိုယ်စားပြုသော symbol။</li> <li><strong><code>processors.ByteLevel(trim_offsets=False)</code></strong>: Byte-level post-processing ကို လုပ်ဆောင်သော processor။ <code>trim_offsets=False</code> က offsets များကို ပြင်ဆင်မွမ်းမံခြင်း မပြုလုပ်စေပါ။</li> <li><strong><code>decoders.ByteLevel()</code></strong>: Byte-level tokens များကို text အဖြစ် ပြန်ပြောင်းပေးသော decoder။</li> <li><strong><code>models.Unigram()</code></strong>: Unigram model ကို ဖန်တီးခြင်း။</li> <li><strong><code>tokenizers.Regex</code></strong>: Regex (Regular Expression) ကို အသုံးပြုရန် class။</li> <li><strong><code>normalizers.Replace("``", '"')</code></strong>: <code>``</code> ကို <code>"</code> ဖြင့် အစားထိုးသော normalizer။</li> 
<li><strong><code>normalizers.Replace("''", '"')</code></strong>: <code>''</code> ကို <code>"</code> ဖြင့် အစားထိုးသော normalizer။</li> <li><strong><code>normalizers.NFKD()</code></strong>: Unicode normalization form KD (Compatibility Decomposition) ကို အသုံးပြုသော normalizer။</li> <li><strong><code>normalizers.Replace(Regex(" {2,}"), " ")</code></strong>: spaces နှစ်ခု သို့မဟုတ် ထို့ထက်ပိုသော sequence များကို single space တစ်ခုဖြင့် အစားထိုးသော normalizer။</li> <li><strong><code>pre_tokenizers.Metaspace()</code></strong>: Metaspace pre-tokenization ကို လုပ်ဆောင်သော pre-tokenizer။ (spaces များကို special character ဖြင့် အစားထိုးသည်)။</li> <li><strong><code>UnigramTrainer</code></strong>: Unigram model ကို train လုပ်ရန်အတွက် trainer class။</li> <li><strong><code>shrinking_factor</code></strong>: Unigram training တွင် vocabulary မှ tokens များကို ဖယ်ရှားသည့်အခါ အသုံးပြုသော factor။</li> <li><strong><code>max_piece_length</code></strong>: token တစ်ခု၏ အမြင့်ဆုံးအရှည်။</li> <li><strong><code><cls></code> Token</strong>: XLNet model တွင် sequence ၏ အဆုံးကို ကိုယ်စားပြုသော special token (token type ID 2 ဖြင့်)။</li> <li><strong><code><sep></code> Token</strong>: XLNet model တွင် sentence တစ်ခု၏ အဆုံးကို ကိုယ်စားပြုသော special token။</li> <li><strong><code>padding_side="left"</code></strong>: Padding ကို sequence ၏ ဘယ်ဘက်ခြမ်းတွင် လုပ်ဆောင်ရန် သတ်မှတ်ခြင်း။</li> <li><strong><code>XLNetTokenizerFast</code></strong>: XLNet model အတွက် သီးခြား fast tokenizer class။</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/my/chapter6/8.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path 
d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_tyugt6 = { | |
| assets: "/docs/course/pr_1114/my", | |
| base: "/docs/course/pr_1114/my", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1114/my/_app/immutable/entry/start.14794ee9.js"), | |
| import("/docs/course/pr_1114/my/_app/immutable/entry/app.a133f5c6.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 53], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 183 kB
- Xet hash:
- 3e78d3e0ea8ee360c98d25b44d9a8d08ba6f2e94a67be61deeb128e90acc2688
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.