Buckets:

rtrm's picture
download
raw
69.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Melatih Tokenizer Baru dari Tokenizer Lama&quot;,&quot;local&quot;:&quot;training-a-new-tokenizer-from-an-old-one&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Menyusun Korpus&quot;,&quot;local&quot;:&quot;assembling-a-corpus&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Melatih Tokenizer Baru&quot;,&quot;local&quot;:&quot;training-a-new-tokenizer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Menyimpan Tokenizer&quot;,&quot;local&quot;:&quot;saving-the-tokenizer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1054/id/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/entry/start.4f92af03.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/scheduler.36a0863c.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/singletons.7dc7b9a4.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/index.733708bb.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/paths.cf097d06.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/entry/app.19cef1b6.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/index.156fee99.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/nodes/0.1203e4a0.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/nodes/46.97cd9f44.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/Tip.8a648467.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/Youtube.a5d6d567.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/CodeBlock.4cf998e6.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/CourseFloatingBanner.16bb8bff.js">
<link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/getInferenceSnippets.472bc46d.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Melatih Tokenizer Baru dari Tokenizer Lama&quot;,&quot;local&quot;:&quot;training-a-new-tokenizer-from-an-old-one&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Menyusun Korpus&quot;,&quot;local&quot;:&quot;assembling-a-corpus&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Melatih Tokenizer Baru&quot;,&quot;local&quot;:&quot;training-a-new-tokenizer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Menyimpan Tokenizer&quot;,&quot;local&quot;:&quot;saving-the-tokenizer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="training-a-new-tokenizer-from-an-old-one" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-a-new-tokenizer-from-an-old-one"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Melatih Tokenizer Baru dari Tokenizer Lama</span></h1> <div class="flex 
space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section2.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section2.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" 
src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1rb50p7">Jika model bahasa tidak tersedia dalam bahasa yang Anda minati, atau jika korpus Anda sangat berbeda dari korpus yang digunakan untuk melatih model tersebut, kemungkinan besar Anda ingin melatih ulang model dari awal dengan tokenizer yang disesuaikan untuk data Anda. Ini berarti Anda perlu melatih tokenizer baru pada dataset Anda. Tapi apa sebenarnya artinya? Ketika pertama kali kita membahas tokenizer di <a href="/course/chapter2">Bab 2</a>, kita melihat bahwa sebagian besar model Transformer menggunakan <em>algoritma tokenisasi sub-kata</em>. Untuk mengidentifikasi sub-kata mana yang paling relevan dan sering muncul dalam korpus, tokenizer perlu memproses seluruh teks dalam korpus tersebut — proses ini disebut <em>pelatihan</em>. Aturan spesifik yang digunakan dalam proses ini tergantung pada jenis tokenizer, dan kita akan membahas tiga algoritma utama nanti dalam bab ini.</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/DJimQynXZsQ" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-l969u4">⚠️ Melatih tokenizer <strong>tidak sama</strong> dengan melatih model! Pelatihan model menggunakan <em>stochastic gradient descent</em> untuk mengurangi nilai <em>loss</em> secara bertahap pada setiap <em>batch</em>. Proses ini bersifat acak (artinya Anda perlu mengatur seed agar mendapatkan hasil yang sama jika melatih ulang). 
Sementara itu, pelatihan tokenizer adalah proses statistik untuk mengidentifikasi sub-kata terbaik dari sebuah korpus, dan aturan yang digunakan tergantung pada algoritma tokenisasi. Proses ini <strong>deterministik</strong>, artinya hasilnya akan selalu sama jika Anda menggunakan algoritma dan korpus yang sama.</p></div> <h2 class="relative group"><a id="assembling-a-corpus" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#assembling-a-corpus"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Menyusun Korpus</span></h2> <p data-svelte-h="svelte-5za60b">Ada API yang sangat sederhana di pustaka 🤗 Transformers yang dapat Anda gunakan untuk melatih tokenizer baru dengan karakteristik yang sama seperti tokenizer yang sudah ada: <code>AutoTokenizer.train_new_from_iterator()</code>. Untuk melihat cara kerjanya, bayangkan kita ingin melatih GPT-2 dari awal, tetapi dalam bahasa selain bahasa Inggris. Tugas pertama kita adalah mengumpulkan banyak data dalam bahasa tersebut untuk dijadikan korpus pelatihan. 
Untuk memberikan contoh yang bisa dimengerti semua orang, kita tidak akan menggunakan bahasa seperti Rusia atau Mandarin, tetapi justru menggunakan bahasa Inggris yang bersifat khusus: kode Python.</p> <p data-svelte-h="svelte-12qv4as">Pustaka <a href="https://github.com/huggingface/datasets" rel="nofollow">🤗 Datasets</a> dapat membantu kita menyusun korpus dari kode sumber Python. Kita akan menggunakan fungsi <code>load_dataset()</code> seperti biasa untuk mengunduh dan menyimpan <em>cache</em> dari dataset <a href="https://huggingface.co/datasets/code_search_net" rel="nofollow">CodeSearchNet</a>. Dataset ini dibuat untuk <a href="https://wandb.ai/github/CodeSearchNet/benchmark" rel="nofollow">tantangan CodeSearchNet</a> dan berisi jutaan fungsi dari pustaka open source di GitHub dalam berbagai bahasa pemrograman. Di sini, kita akan memuat bagian Python dari dataset tersebut:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; 
border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-comment"># Ini bisa memakan waktu beberapa menit, jadi siapkan kopi atau teh sambil menunggu!</span>
raw_datasets = load_dataset(<span class="hljs-string">&quot;code_search_net&quot;</span>, <span class="hljs-string">&quot;python&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hp0rwn">Kita dapat melihat <em>split</em> pelatihan untuk mengetahui kolom apa saja yang tersedia:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->raw_datasets[<span class="hljs-string">&quot;train&quot;</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
    features: [<span class="hljs-string">&#x27;repository_name&#x27;</span>, <span class="hljs-string">&#x27;func_path_in_repository&#x27;</span>, <span class="hljs-string">&#x27;func_name&#x27;</span>, <span class="hljs-string">&#x27;whole_func_string&#x27;</span>, <span class="hljs-string">&#x27;language&#x27;</span>,
      <span class="hljs-string">&#x27;func_code_string&#x27;</span>, <span class="hljs-string">&#x27;func_code_tokens&#x27;</span>, <span class="hljs-string">&#x27;func_documentation_string&#x27;</span>, <span class="hljs-string">&#x27;func_documentation_tokens&#x27;</span>, <span class="hljs-string">&#x27;split_name&#x27;</span>,
      <span class="hljs-string">&#x27;func_code_url&#x27;</span>
    ],
    num_rows: <span class="hljs-number">412178</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cevh20">Kita dapat melihat bahwa dataset ini memisahkan docstring dari kode, dan menyarankan tokenisasi untuk keduanya. Di sini, kita hanya akan menggunakan kolom <code>whole_func_string</code> untuk melatih tokenizer kita. Kita bisa melihat contoh salah satu fungsi dengan mengakses indeks tertentu pada <em>split</em> pelatihan:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(raw_datasets[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">123456</span>][<span class="hljs-string">&quot;whole_func_string&quot;</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4g6qjg">yang seharusnya mencetak:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button 
class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">handle_simple_responses</span>(<span class="hljs-params">
      self, timeout_ms=<span class="hljs-literal">None</span>, info_cb=DEFAULT_MESSAGE_CALLBACK</span>):
    <span class="hljs-string">&quot;&quot;&quot;Accepts normal responses from the device.
    Args:
      timeout_ms: Timeout in milliseconds to wait for each response.
      info_cb: Optional callback for text sent from the bootloader.
    Returns:
      OKAY packet&#x27;s message.
    &quot;&quot;&quot;</span>
    <span class="hljs-keyword">return</span> self._accept_responses(<span class="hljs-string">&#x27;OKAY&#x27;</span>, info_cb, timeout_ms=timeout_ms)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-tnveqh">Hal pertama yang perlu kita lakukan adalah mengubah dataset menjadi sebuah <em>iterator</em> berupa daftar-daftar teks — misalnya, daftar dari daftar teks. Menggunakan daftar teks akan memungkinkan tokenizer kita bekerja lebih cepat (melatih dalam batch teks alih-alih memproses teks satu per satu), dan sebaiknya menggunakan iterator jika kita ingin menghindari menyimpan semuanya sekaligus di dalam memori. Jika korpus Anda sangat besar, Anda akan ingin memanfaatkan kenyataan bahwa 🤗 Datasets tidak memuat semuanya ke dalam RAM, melainkan menyimpan elemen-elemen dataset di disk.</p> <p data-svelte-h="svelte-vwxm9z">Melakukan hal berikut akan membuat daftar yang berisi daftar 1.000 teks per elemen, tetapi akan memuat semuanya ke memori:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black 
border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Jangan aktifkan baris ini kecuali dataset Anda kecil!</span>
<span class="hljs-comment"># training_corpus = [raw_datasets[&quot;train&quot;][i: i + 1000][&quot;whole_func_string&quot;] for i in range(0, len(raw_datasets[&quot;train&quot;]), 1000)]</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-iryghc">Dengan menggunakan generator Python, kita dapat mencegah Python memuat semuanya ke dalam memori sebelum dibutuhkan. Untuk membuat generator seperti itu, cukup ganti tanda kurung siku dengan tanda kurung biasa:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->training_corpus = (
    raw_datasets[<span class="hljs-string">&quot;train&quot;</span>][i : i + <span class="hljs-number">1000</span>][<span class="hljs-string">&quot;whole_func_string&quot;</span>]
    <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(raw_datasets[<span class="hljs-string">&quot;train&quot;</span>]), <span class="hljs-number">1000</span>)
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1034ity">Baris kode ini tidak mengambil elemen apa pun dari dataset; ini hanya membuat sebuah objek yang bisa Anda gunakan dalam <code>for</code> loop di Python. Teks-teksnya hanya akan dimuat saat Anda membutuhkannya (yaitu, ketika Anda berada pada langkah dalam <code>for</code> loop yang memerlukannya), dan hanya 1.000 teks yang akan dimuat sekaligus. Dengan cara ini, Anda tidak akan kehabisan memori bahkan jika sedang memproses dataset yang sangat besar.</p> <p data-svelte-h="svelte-d0jtqq">Masalah dari objek generator adalah bahwa ia hanya bisa digunakan sekali. Jadi, alih-alih memberikan daftar 10 angka dua kali seperti ini:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->gen = (i <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> 
<span class="hljs-built_in">range</span>(<span class="hljs-number">10</span>))
<span class="hljs-built_in">print</span>(<span class="hljs-built_in">list</span>(gen))
<span class="hljs-built_in">print</span>(<span class="hljs-built_in">list</span>(gen))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-y6g5qi">kita malah mendapat:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>, <span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">9</span>]
[]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-iwqf2r">Itulah sebabnya kita mendefinisikan fungsi yang mengembalikan generator:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_training_corpus</span>():
    <span class="hljs-keyword">return</span> (
        raw_datasets[<span class="hljs-string">&quot;train&quot;</span>][i : i + <span class="hljs-number">1000</span>][<span class="hljs-string">&quot;whole_func_string&quot;</span>]
        <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(raw_datasets[<span class="hljs-string">&quot;train&quot;</span>]), <span class="hljs-number">1000</span>)
    )
training_corpus = get_training_corpus()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vbcud5">Anda juga bisa mendefinisikan generator menggunakan perulangan <code>for</code> dengan pernyataan <code>yield</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_training_corpus</span>():
    dataset = raw_datasets[<span class="hljs-string">&quot;train&quot;</span>]
    <span class="hljs-keyword">for</span> start_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(dataset), <span class="hljs-number">1000</span>):
        samples = dataset[start_idx : start_idx + <span class="hljs-number">1000</span>]
        <span class="hljs-keyword">yield</span> samples[<span class="hljs-string">&quot;whole_func_string&quot;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gebrnh">yang akan menghasilkan generator yang sama, tetapi memungkinkan Anda menggunakan logika yang lebih kompleks daripada yang bisa dilakukan dengan list comprehension.</p> <h2 class="relative group"><a id="training-a-new-tokenizer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-a-new-tokenizer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Melatih Tokenizer Baru</span></h2> <p data-svelte-h="svelte-3m5pr">Sekarang kita memiliki korpus dalam bentuk iterator yang berisi <em>batch</em> teks, kita siap untuk melatih tokenizer baru. 
Untuk melakukan ini, pertama-tama kita perlu memuat tokenizer yang ingin kita pasangkan dengan model kita (dalam hal ini, GPT-2):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;gpt2&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kcp6yc">Meskipun kita akan melatih tokenizer baru, ini adalah langkah yang baik agar kita tidak memulai semuanya dari nol. Dengan cara ini, kita tidak perlu menentukan apa pun tentang algoritma tokenisasi atau token khusus yang ingin digunakan; tokenizer baru kita akan persis seperti milik GPT-2, dan satu-satunya hal yang akan berubah adalah kosakatanya, yang akan ditentukan berdasarkan pelatihan pada korpus kita.</p> <p data-svelte-h="svelte-18ex75p">Pertama, mari kita lihat bagaimana tokenizer ini menangani contoh fungsi:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->example = <span class="hljs-string">&#x27;&#x27;&#x27;def add_numbers(a, b):
&quot;&quot;&quot;Add the two numbers `a` and `b`.&quot;&quot;&quot;
return a + b&#x27;&#x27;&#x27;</span>
tokens = old_tokenizer.tokenize(example)
tokens<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;def&#x27;</span>, <span class="hljs-string">&#x27;Ġadd&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;n&#x27;</span>, <span class="hljs-string">&#x27;umbers&#x27;</span>, <span class="hljs-string">&#x27;(&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġb&#x27;</span>, <span class="hljs-string">&#x27;):&#x27;</span>, <span class="hljs-string">&#x27;Ċ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&quot;&quot;&quot;&#x27;</span>, 
<span class="hljs-string">&#x27;Add&#x27;</span>, <span class="hljs-string">&#x27;Ġthe&#x27;</span>, <span class="hljs-string">&#x27;Ġtwo&#x27;</span>,
<span class="hljs-string">&#x27;Ġnumbers&#x27;</span>, <span class="hljs-string">&#x27;Ġ`&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;`&#x27;</span>, <span class="hljs-string">&#x27;Ġand&#x27;</span>, <span class="hljs-string">&#x27;Ġ`&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;`&#x27;</span>, <span class="hljs-string">&#x27;.&quot;&#x27;</span>, <span class="hljs-string">&#x27;&quot;&quot;&#x27;</span>, <span class="hljs-string">&#x27;Ċ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&#x27;</span>, <span class="hljs-string">&#x27;Ġreturn&#x27;</span>, <span class="hljs-string">&#x27;Ġa&#x27;</span>, <span class="hljs-string">&#x27;Ġ+&#x27;</span>, <span class="hljs-string">&#x27;Ġb&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-128w08f">Tokenizer ini memiliki beberapa simbol khusus seperti <code>Ġ</code> dan <code>Ċ</code> yang menunjukkan spasi dan baris baru. Seperti yang terlihat, ini kurang efisien: tokenizer ini mengembalikan token terpisah untuk setiap spasi, padahal bisa saja mengelompokkannya (karena indentasi empat atau delapan spasi sangat umum dalam kode). Tokenizer juga memecah nama fungsi dengan cara yang aneh karena tidak terbiasa dengan karakter <code>_</code>.</p> <p data-svelte-h="svelte-15gbjlx">Mari kita latih tokenizer baru dan lihat apakah hal ini dapat diatasi. 
Kita akan menggunakan metode <code>train_new_from_iterator()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, <span class="hljs-number">52000</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gfvl3m">Perintah ini mungkin memakan waktu jika korpus Anda besar, tapi untuk dataset sebesar 1.6 GB, ini berjalan sangat cepat (sekitar 1 menit 16 detik pada CPU AMD Ryzen 9 3900X dengan 12 core).</p> <p data-svelte-h="svelte-1dpl9gk">Perlu dicatat bahwa <code>AutoTokenizer.train_new_from_iterator()</code> hanya berfungsi jika tokenizer yang Anda gunakan adalah tokenizer “cepat”. 
Seperti yang akan Anda lihat di bagian selanjutnya, pustaka 🤗 Transformers memiliki dua jenis tokenizer: beberapa ditulis murni dalam Python dan lainnya (yang cepat) didukung oleh pustaka 🤗 Tokenizers, yang ditulis dalam bahasa pemrograman <a href="https://www.rust-lang.org" rel="nofollow">Rust</a>. Python adalah bahasa yang paling sering digunakan untuk aplikasi data science dan deep learning, tetapi ketika sesuatu perlu diparalelkan agar lebih cepat, maka harus ditulis dalam bahasa lain. Sebagai contoh, perkalian matriks yang menjadi inti dari perhitungan model ditulis dalam CUDA, sebuah pustaka C yang dioptimalkan untuk GPU.</p> <p data-svelte-h="svelte-ynm5ri">Melatih tokenizer dari nol dengan Python murni akan sangat lambat, itulah sebabnya pustaka 🤗 Tokenizers dikembangkan. Namun, sama seperti Anda tidak perlu mempelajari CUDA untuk menjalankan model Anda di GPU, Anda juga tidak perlu belajar Rust untuk menggunakan tokenizer cepat. Pustaka 🤗 Tokenizers menyediakan binding ke Python untuk berbagai metode yang secara internal memanggil kode Rust; contohnya untuk melakukan pelatihan tokenizer secara paralel, atau seperti yang kita lihat di <a href="/course/chapter3">Bab 3</a>, untuk melakukan tokenisasi pada batch input.</p> <p data-svelte-h="svelte-y9it41">Sebagian besar model Transformer memiliki tokenizer cepat yang tersedia (ada beberapa pengecualian yang bisa Anda lihat <a href="https://huggingface.co/transformers/#supported-frameworks" rel="nofollow">di sini</a>), dan API <code>AutoTokenizer</code> akan selalu memilih tokenizer cepat untuk Anda jika tersedia. Pada bagian selanjutnya, kita akan melihat beberapa fitur khusus lain yang dimiliki tokenizer cepat, yang akan sangat berguna untuk tugas-tugas seperti klasifikasi token dan question answering. 
Namun sebelum masuk ke sana, mari kita coba tokenizer baru kita pada contoh sebelumnya:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokens = tokenizer.tokenize(example)
tokens<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;def&#x27;</span>, <span class="hljs-string">&#x27;Ġadd&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;numbers&#x27;</span>, <span class="hljs-string">&#x27;(&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġb&#x27;</span>, <span class="hljs-string">&#x27;):&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġ&quot;&quot;&quot;&#x27;</span>, <span class="hljs-string">&#x27;Add&#x27;</span>, <span class="hljs-string">&#x27;Ġthe&#x27;</span>, <span class="hljs-string">&#x27;Ġtwo&#x27;</span>, <span 
class="hljs-string">&#x27;Ġnumbers&#x27;</span>, <span class="hljs-string">&#x27;Ġ`&#x27;</span>,
<span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;`&#x27;</span>, <span class="hljs-string">&#x27;Ġand&#x27;</span>, <span class="hljs-string">&#x27;Ġ`&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;`.&quot;&quot;&quot;&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġreturn&#x27;</span>, <span class="hljs-string">&#x27;Ġa&#x27;</span>, <span class="hljs-string">&#x27;Ġ+&#x27;</span>, <span class="hljs-string">&#x27;Ġb&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1uiabme">Di sini kita kembali melihat simbol-simbol khusus <code>Ġ</code> dan <code>Ċ</code> yang menandakan spasi dan baris baru, tetapi kita juga bisa melihat bahwa tokenizer kita telah mempelajari beberapa token yang sangat spesifik untuk korpus fungsi Python: misalnya, ada token <code>ĊĠĠĠ</code> yang merepresentasikan indentasi, dan token <code>Ġ&quot;&quot;&quot;</code> yang merepresentasikan tiga tanda kutip yang memulai sebuah docstring. Tokenizer juga berhasil membagi nama fungsi pada karakter <code>_</code>. 
Ini merupakan representasi yang cukup ringkas; sebagai perbandingan, menggunakan tokenizer Bahasa Inggris biasa pada contoh yang sama akan menghasilkan urutan token yang lebih panjang:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(<span class="hljs-built_in">len</span>(tokens))
<span class="hljs-built_in">print</span>(<span class="hljs-built_in">len</span>(old_tokenizer.tokenize(example)))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">27</span>
<span class="hljs-number">36</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1yir781">Mari kita lihat contoh lain:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->example = <span class="hljs-string">&quot;&quot;&quot;class LinearLayer():
def __init__(self, input_size, output_size):
self.weight = torch.randn(input_size, output_size)
self.bias = torch.zeros(output_size)
def __call__(self, x):
return x @ self.weights + self.bias
&quot;&quot;&quot;</span>
tokenizer.tokenize(example)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;class&#x27;</span>, <span class="hljs-string">&#x27;ĠLinear&#x27;</span>, <span class="hljs-string">&#x27;Layer&#x27;</span>, <span class="hljs-string">&#x27;():&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġdef&#x27;</span>, <span class="hljs-string">&#x27;Ġ__&#x27;</span>, <span class="hljs-string">&#x27;init&#x27;</span>, <span class="hljs-string">&#x27;__(&#x27;</span>, <span class="hljs-string">&#x27;self&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġinput&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;size&#x27;</span>, <span 
class="hljs-string">&#x27;,&#x27;</span>,
<span class="hljs-string">&#x27;Ġoutput&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;):&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠĠĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġself&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;weight&#x27;</span>, <span class="hljs-string">&#x27;Ġ=&#x27;</span>, <span class="hljs-string">&#x27;Ġtorch&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;randn&#x27;</span>, <span class="hljs-string">&#x27;(&#x27;</span>, <span class="hljs-string">&#x27;input&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>,
<span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġoutput&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;)&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠĠĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġself&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;bias&#x27;</span>, <span class="hljs-string">&#x27;Ġ=&#x27;</span>, <span class="hljs-string">&#x27;Ġtorch&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;zeros&#x27;</span>, <span class="hljs-string">&#x27;(&#x27;</span>,
<span class="hljs-string">&#x27;output&#x27;</span>, <span class="hljs-string">&#x27;_&#x27;</span>, <span class="hljs-string">&#x27;size&#x27;</span>, <span class="hljs-string">&#x27;)&#x27;</span>, <span class="hljs-string">&#x27;ĊĊĠĠĠ&#x27;</span>, <span class="hljs-string">&#x27;Ġdef&#x27;</span>, <span class="hljs-string">&#x27;Ġ__&#x27;</span>, <span class="hljs-string">&#x27;call&#x27;</span>, <span class="hljs-string">&#x27;__(&#x27;</span>, <span class="hljs-string">&#x27;self&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;Ġx&#x27;</span>, <span class="hljs-string">&#x27;):&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠĠĠĠĠ&#x27;</span>,
<span class="hljs-string">&#x27;Ġreturn&#x27;</span>, <span class="hljs-string">&#x27;Ġx&#x27;</span>, <span class="hljs-string">&#x27;Ġ@&#x27;</span>, <span class="hljs-string">&#x27;Ġself&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;weights&#x27;</span>, <span class="hljs-string">&#x27;Ġ+&#x27;</span>, <span class="hljs-string">&#x27;Ġself&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;bias&#x27;</span>, <span class="hljs-string">&#x27;ĊĠĠĠĠ&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8w7sda">Selain token indentasi, kita juga bisa melihat token untuk indentasi ganda: <code>ĊĠĠĠĠĠĠĠ</code>. Kata-kata khusus Python seperti <code>class</code>, <code>init</code>, <code>call</code>, <code>self</code>, dan <code>return</code> masing-masing ditokenisasi sebagai satu token, dan kita bisa melihat bahwa selain memisahkan pada karakter <code>_</code> dan <code>.</code>, tokenizer ini bahkan memecah nama camel case dengan benar: <code>LinearLayer</code> ditokenisasi menjadi <code>[&quot;ĠLinear&quot;, &quot;Layer&quot;]</code>.</p> <h2 class="relative group"><a id="saving-the-tokenizer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#saving-the-tokenizer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 
40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Menyimpan Tokenizer</span></h2> <p data-svelte-h="svelte-1indufy">Untuk memastikan kita bisa menggunakannya kembali di masa depan, kita perlu menyimpan tokenizer baru ini. Seperti pada model, kita bisa menggunakan metode <code>save_pretrained()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.save_pretrained(<span class="hljs-string">&quot;code-search-net-tokenizer&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kiqpty">Perintah ini akan membuat folder baru bernama <em>code-search-net-tokenizer</em>, yang berisi semua file yang dibutuhkan untuk memuat kembali tokenizer tersebut. 
Jika Anda ingin membagikannya dengan kolega atau teman, Anda bisa mengunggahnya ke Hugging Face Hub setelah login. Jika Anda bekerja di notebook, ada fungsi khusus yang bisa digunakan:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-tqq3hh">Ini akan menampilkan widget untuk memasukkan kredensial akun Hugging Face Anda. Jika Anda tidak bekerja di notebook, cukup ketik perintah berikut di terminal:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->huggingface-cli login<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g7x9fo">Setelah login, Anda bisa mendorong (push) tokenizer ke Hub dengan perintah berikut:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" 
fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.push_to_hub(<span class="hljs-string">&quot;code-search-net-tokenizer&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mr46w0">Ini akan membuat repositori baru di namespace Anda dengan nama <code>code-search-net-tokenizer</code>, berisi file tokenizer. 
Anda kemudian bisa memuat tokenizer dari mana saja menggunakan metode <code>from_pretrained()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Ganti &quot;huggingface-course&quot; dengan namespace Anda sendiri jika ingin menggunakan tokenizer Anda</span>
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;huggingface-course/code-search-net-tokenizer&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nuuorx">Sekarang Anda sudah siap untuk melatih model bahasa dari nol dan menyetelnya ulang untuk tugas Anda! Kita akan membahas itu di <a href="/course/chapter7">Bab 7</a>, tapi sebelumnya, pada sisa bab ini kita akan melihat lebih dekat tokenizer cepat dan menjelajahi secara rinci apa yang sebenarnya terjadi ketika kita memanggil metode <code>train_new_from_iterator()</code>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/id/chapter6/2.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
// Client-side bootstrap for this page: publishes the path configuration on a
// global, then loads the two entry modules and hands control to kit.start().
{
// No declaration keyword, so in this classic (non-module) script the object
// becomes a global. Presumably the imported modules read it; that code is not
// visible here.
__sveltekit_ojy514 = {
// `assets` and `base` share the same deployment prefix for this docs build.
assets: "/docs/course/pr_1054/id",
base: "/docs/course/pr_1054/id",
env: {}
};
// The element that directly contains this <script> tag; passed to kit.start().
const element = document.currentScript.parentElement;
// Two null placeholders; presumably one slot per entry in `node_ids` below.
const data = [null,null];
// Load both entry modules concurrently and start only once both have resolved.
// The same two files are listed as modulepreload links in the document head.
Promise.all([
import("/docs/course/pr_1054/id/_app/immutable/entry/start.4f92af03.js"),
import("/docs/course/pr_1054/id/_app/immutable/entry/app.19cef1b6.js")
]).then(([kit, app]) => {
// Hand the app module, the target element and the initial state to start().
kit.start(app, element, {
// The ids match the nodes/0.*.js and nodes/46.*.js chunks preloaded in the head.
node_ids: [0, 46],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
69.5 kB
·
Xet hash:
8cbf67f3afd08922061b1f3f4845d5c501a94d01f9063cadd1d65c2c8b9016e9

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.