Buckets:

rtrm's picture
download
raw
94.6 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Unigram標記化&quot;,&quot;local&quot;:&quot;unigram標記化&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;訓練算法&quot;,&quot;local&quot;:&quot;訓練算法&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;標記化算法&quot;,&quot;local&quot;:&quot;標記化算法&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;回到訓練&quot;,&quot;local&quot;:&quot;回到訓練&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;實現 Unigram&quot;,&quot;local&quot;:&quot;實現-unigram&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/zh-TW/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/entry/start.01945326.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/singletons.40763523.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/paths.677dc4ce.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/entry/app.e8597ff2.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/nodes/0.decbb3b3.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/nodes/51.819fba0d.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/CourseFloatingBanner.9ff4c771.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/DocNotebookDropdown.efc1fb7c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/zh-TW/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Unigram標記化&quot;,&quot;local&quot;:&quot;unigram標記化&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;訓練算法&quot;,&quot;local&quot;:&quot;訓練算法&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;標記化算法&quot;,&quot;local&quot;:&quot;標記化算法&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;回到訓練&quot;,&quot;local&quot;:&quot;回到訓練&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;實現 Unigram&quot;,&quot;local&quot;:&quot;實現-unigram&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="unigram標記化" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#unigram標記化"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Unigram標記化</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/zh-CN/chapter6/section7.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/zh-CN/chapter6/section7.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-h4znh1">在 SentencePiece 中經常使用 Unigram 算法,該算法是 AlBERT、T5、mBART、Big Bird 和 XLNet 等模型使用的標記化算法。</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/TGZfZVuF9Yc" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1ib3nke">💡 本節深入介紹了 Unigram,甚至展示了一個完整的實現。如果你只想大致瞭解標記化算法,可以跳到最後。</p></div> <h2 class="relative group"><a id="訓練算法" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#訓練算法"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>訓練算法</span></h2> <p data-svelte-h="svelte-fafpi7">與 BPE 和 WordPiece 相比,Unigram 在另一個方向上工作:它從一個較大的詞彙表開始,然後從中刪除標記,直到達到所需的詞彙表大小。有多種選項可用於構建基本詞彙表:例如,我們可以採用預標記化單詞中最常見的子串,或者在具有大詞彙量的初始語料庫上應用 BPE。</p> <p data-svelte-h="svelte-a06pm7">在訓練的每一步,Unigram 算法都會在給定當前詞彙的情況下計算語料庫的損失。然後,對於詞彙表中的每個符號,算法計算如果刪除該符號,整體損失會增加多少,並尋找增加最少的符號。這些符號對語料庫的整體損失影響較小,因此從某種意義上說,它們「不太需要」並且是移除的最佳候選者。</p> <p data-svelte-h="svelte-nn6gyb">這是一個非常昂貴的操作,所以我們不只是刪除與最低損失增加相關的單個符號,而且\(p\) (\(p\)是一個可以控制的超參數,通常是 10 或 20)與最低損失增加相關的符號的百分比。然後重複這個過程,直到詞彙量達到所需的大小。</p> <p data-svelte-h="svelte-u53jkf">請注意:我們從不刪除基本字符,以確保可以標記任何單詞。</p> <p data-svelte-h="svelte-1h9hxxp">這或許仍然有點模糊:算法的主要部分是計算語料庫的損失,並查看當我們從詞彙表中刪除一些標記時它會如何變化,但我們還沒有解釋如何做到這一點。這一步依賴於 Unigram 模型的標記化算法,因此我們接下來將深入研究。</p> <p data-svelte-h="svelte-n9yffr">我們將重用前面示例中的語料庫:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e6kc1v">對於此示例,我們將採用初始詞彙表的所有嚴格子字符串:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-selector-attr">[<span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>, <span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>, <span class="hljs-string">&quot;un&quot;</span>, <span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;bu&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-string">&quot;gs&quot;</span>, <span class="hljs-string">&quot;ugs&quot;</span>]</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="標記化算法" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#標記化算法"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>標記化算法</span></h2> <p data-svelte-h="svelte-1l17a1l">Unigram 模型是一種語言模型,它認為每個標記都獨立於它之前的標記。它是最簡單的語言模型,從某種意義上說, 給定先前上下文的標記 X 的概率就是標記 X 的概率。因此,如果我們使用 Unigram 語言模型生成文本,我們將始終預測最常見的標記。</p> <p data-svelte-h="svelte-1ebda5t">給定標記的概率是它在原始語料庫中的頻率(我們找到它的次數),除以詞彙表中所有標記的所有頻率的總和(以確保概率總和為 1)。例如, <code>&quot;ug&quot;</code><code>&quot;hug&quot;</code><code>&quot;pug&quot;</code> 以及 <code>&quot;hugs&quot;</code> 中,所以它在我們的語料庫中的頻率為 20。</p> <p data-svelte-h="svelte-22ww7u">以下是詞彙表中所有可能的子詞的出現頻率:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&quot;h&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;u&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">36</span>) (<span class="hljs-string">&quot;g&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">20</span>) (<span class="hljs-string">&quot;hu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;ug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">20</span>) (<span class="hljs-string">&quot;p&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">17</span>) (<span class="hljs-string">&quot;pu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">17</span>) (<span class="hljs-string">&quot;n&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">16</span>)
(<span class="hljs-string">&quot;un&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">16</span>) (<span class="hljs-string">&quot;b&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>) (<span class="hljs-string">&quot;bu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>) (<span class="hljs-string">&quot;s&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>) (<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;gs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>) (<span class="hljs-string">&quot;ugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-lmx393">所以,所有頻率之和為210, 並且子詞 <code>&quot;ug&quot;</code> 出現的概率是 20/210。</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-12n4dxv">✏️ <strong>現在輪到你了!</strong> 編寫代碼來計算上面的頻率,並仔細檢查顯示的結果以及總和是否正確。</p></div> <p>現在,為了對給定的單詞進行標記,我們將所有可能的分割視為標記,並根據 Unigram 模型計算每個分割的概率。由於所有標記都被認為是獨立的,所以這個概率只是每個標記概率的乘積。例如 <code data-svelte-h="svelte-1gjdq76">&quot;pug&quot;</code> 的標記化 <code data-svelte-h="svelte-1n2m4po">[&quot;p&quot;, &quot;u&quot;, &quot;g&quot;]</code> 的概率為:
<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>P</mi><mo stretchy="false">(</mo><mo stretchy="false">[</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>p</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">]</mo><mo stretchy="false">)</mo><mo>=</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>p</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>=</mo><mfrac><mn>5</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>36</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>20</mn><mn>210</mn></mfrac><mo>=</mo><mn>0.000389</mn></mrow><annotation encoding="application/x-tex">P([``p&quot;, ``u&quot;, ``g&quot;]) = P(``p&quot;) \times P(``u&quot;) \times P(``g&quot;) = \frac{5}{210} \times \frac{36}{210} \times \frac{20}{210} = 0.000389</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">([</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">])</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">5</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">36</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">20</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0.000389</span></span></span></span></span><!-- HTML_TAG_END --></p> <p>相比之下,標記化 <code data-svelte-h="svelte-42m5r0">[&quot;pu&quot;, &quot;g&quot;]</code> 的概率為:
<!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>P</mi><mo stretchy="false">(</mo><mo stretchy="false">[</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>p</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">]</mo><mo stretchy="false">)</mo><mo>=</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>p</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal"></mi><mi mathvariant="normal"></mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>=</mo><mfrac><mn>5</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>20</mn><mn>210</mn></mfrac><mo>=</mo><mn>0.0022676</mn></mrow><annotation encoding="application/x-tex">P([``pu&quot;, ``g&quot;]) = P(``pu&quot;) \times P(``g&quot;) = \frac{5}{210} \times \frac{20}{210} = 0.0022676</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">([</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">])</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">5</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">20</span></span></span></span><span class="vlist-s"></span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0.0022676</span></span></span></span></span><!-- HTML_TAG_END --></p> <p data-svelte-h="svelte-b8cnhl">所以一個更有可能。一般來說,具有儘可能少的標記的標記化將具有最高的概率(因為每個標記重複除以 210),這對應於我們直觀想要的:將一個單詞分成儘可能少的標記。</p> <p data-svelte-h="svelte-17k48dh">使用 Unigram 模型對單詞進行分詞是概率最高的分詞。在示例 <code>&quot;pug&quot;</code> 中,這裡是我們為每個可能的分割獲得的概率:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] : 0.000389
[<span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>] : 0.0022676
[<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] : 0.0022676<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-50f1bh">所以 <code>&quot;pug&quot;</code> 將被標記為 <code>[&quot;p&quot;, &quot;ug&quot;]</code> 或者 <code>[&quot;pu&quot;, &quot;g&quot;]</code>,取決於首先遇到這些分割中的哪一個(請注意:在更大的語料庫中,這樣的相等的情況很少見)。</p> <p data-svelte-h="svelte-1g71ole">在這種情況下,很容易找到所有可能的分割並計算它們的概率,但一般來說會有點困難。有一種用於此的經典算法,稱為 <em>維特比(Viterbi)算法</em>。本質上,我們可以構建一個圖來檢測給定單詞的可能分割,如果從<em>a</em><em>b</em>的子詞在詞彙表中,則從字符<em>a</em>到字符<em>b</em>之間存在一個分支,並將子詞的概率歸因於該分支。</p> <p data-svelte-h="svelte-111hyu0">為了在該圖中找到將具有最佳分數的路徑,維特比算法為單詞中的每個位置確定在該位置結束的具有最佳分數的分段。由於我們從開始到結束,可以通過循環遍歷以當前位置結尾的所有子詞,然後使用該子詞開始位置的最佳標記化分數來找到最佳分數。然後,我們只需要展開到達終點所採取的路徑。</p> <p data-svelte-h="svelte-fupgnj">讓我們看一個使用我們的詞彙表和單詞 <code>&quot;unhug&quot;</code> 的例子。對於每個位置,以最好的分數結尾的子詞如下:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">Character</span> <span class="hljs-number">0</span> (u): <span class="hljs-string">&quot;u&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">171429</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">1</span> (n): <span class="hljs-string">&quot;un&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">076191</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">2</span> (h): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;h&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">3</span> (u): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;hu&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">4</span> (g): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;hug&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-lmz6dt">因此 <code>&quot;unhug&quot;</code> 將被標記為 <code>[&quot;un&quot;, &quot;hug&quot;]</code></p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-k3fq22">✏️ <strong>現在輪到你了!</strong> 確定單詞 <code>&quot;huggun&quot;</code> 的標記化及其分數。</p></div> <h2 class="relative group"><a id="回到訓練" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#回到訓練"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>回到訓練</span></h2> <p data-svelte-h="svelte-1vz95kf">現在我們已經瞭解了標記化的工作原理,我們可以更深入地研究訓練期間使用的損失。在任何給定的階段,這個損失是通過對語料庫中的每個單詞進行標記來計算的,使用當前詞彙表和由語料庫中每個標記的頻率確定的 Unigram 模型(如前所述)。</p> <p data-svelte-h="svelte-1stzqjp">語料庫中的每個詞都有一個分數,損失是這些分數的負對數似然 — 即所有詞的語料庫中所有詞的總和 <code>-log(P(word))</code></p> <p data-svelte-h="svelte-1dfk8a">讓我們用以下語料庫回到我們的例子:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ssfuyg">每個單詞的標記化及其各自的分數是:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&quot;hug&quot;</span>: [<span class="hljs-string">&quot;hug&quot;</span>] <span class="hljs-comment">(score 0.071428)</span>
<span class="hljs-string">&quot;pug&quot;</span>: [<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] <span class="hljs-comment">(score 0.007710)</span>
<span class="hljs-string">&quot;pun&quot;</span>: [<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>] <span class="hljs-comment">(score 0.006168)</span>
<span class="hljs-string">&quot;bun&quot;</span>: [<span class="hljs-string">&quot;bu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>] <span class="hljs-comment">(score 0.001451)</span>
<span class="hljs-string">&quot;hugs&quot;</span>: [<span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>] <span class="hljs-comment">(score 0.001701)</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-t39rlk">所以損失是:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attribute">10</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">071428</span>)) + <span class="hljs-number">5</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">007710</span>)) + <span class="hljs-number">12</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">006168</span>)) + <span class="hljs-number">4</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">001451</span>)) + <span class="hljs-number">5</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">001701</span>)) = <span class="hljs-number">169</span>.<span class="hljs-number">8</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g0xzx7">現在我們需要計算刪除每個標記如何影響損失。這相當乏味,所以我們在這裡只對兩個標記進行操作,並保存整個過程以備有代碼來幫助我們。在這個(非常)特殊的情況下,我們對所有單詞有兩個等效的標記:正如我們之前看到的,例如, <code>&quot;pug&quot;</code> 可以以相同的分數被標記為 <code>[&quot;p&quot;, &quot;ug&quot;]</code>。因此,去除詞彙表中的 <code>&quot;pu&quot;</code> 標記將給出完全相同的損失。</p> <p data-svelte-h="svelte-1aotov0">另一方面,去除 <code>&quot;hug&quot;</code> 損失變得更糟, 因為 <code>&quot;hug&quot;</code><code>&quot;hugs&quot;</code> 的標記化會變成:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&quot;hug&quot;</span>: [<span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] <span class="hljs-comment">(score 0.006802)</span>
<span class="hljs-string">&quot;hugs&quot;</span>: [<span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;gs&quot;</span>] <span class="hljs-comment">(score 0.001701)</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1714xsh">這些變化將導致損失增加:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->- <span class="hljs-number">10</span> * (<span class="hljs-name">-log</span>(<span class="hljs-number">0.071428</span>)) + <span class="hljs-number">10</span> * (<span class="hljs-name">-log</span>(<span class="hljs-number">0.006802</span>)) = <span class="hljs-number">23.5</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-i4jgj3">因此, 標記 <code>&quot;pu&quot;</code>可能會從詞彙表中刪除,但不會刪除 <code>&quot;hug&quot;</code>.</p> <h2 class="relative group"><a id="實現-unigram" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#實現-unigram"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>實現 Unigram</span></h2> <p data-svelte-h="svelte-t9jgc0">現在讓我們在代碼中實現我們迄今為止看到的所有內容。與 BPE 和 WordPiece 一樣,這不是 Unigram 算法的有效實現(恰恰相反),但它應該可以幫助你更好地理解它。</p> <p data-svelte-h="svelte-jt4fjc">我們將使用與之前相同的語料庫作為示例:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->corpus = [
<span class="hljs-string">&quot;This is the Hugging Face course.&quot;</span>,
<span class="hljs-string">&quot;This chapter is about tokenization.&quot;</span>,
<span class="hljs-string">&quot;This section shows several tokenizer algorithms.&quot;</span>,
<span class="hljs-string">&quot;Hopefully, you will be able to understand how they are trained and generate tokens.&quot;</span>,
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-5xnt85">這一次,我們將使用 <code>xlnet-base-cased</code> 作為我們的模型:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;xlnet-base-cased&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ymtcjm">與 BPE 和 WordPiece 一樣,我們首先計算語料庫中每個單詞的出現次數:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> defaultdict
word_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> corpus:
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
new_words = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> new_words:
word_freqs[word] += <span class="hljs-number">1</span>
word_freqs<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2uyc79">然後,我們需要將我們的詞彙表初始化為大於我們最終想要的詞彙量。我們必須包含所有基本字符(否則我們將無法標記每個單詞),但對於較大的子字符串,我們將只保留最常見的字符,因此我們按頻率對它們進行排序:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->char_freqs = defaultdict(<span class="hljs-built_in">int</span>)
subwords_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word)):
char_freqs[word[i]] += freq
<span class="hljs-comment"># Loop through the subwords of length at least 2</span>
<span class="hljs-keyword">for</span> j <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(i + <span class="hljs-number">2</span>, <span class="hljs-built_in">len</span>(word) + <span class="hljs-number">1</span>):
subwords_freqs[word[i:j]] += freq
<span class="hljs-comment"># Sort subwords by frequency</span>
sorted_subwords = <span class="hljs-built_in">sorted</span>(subwords_freqs.items(), key=<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-number">1</span>], reverse=<span class="hljs-literal">True</span>)
sorted_subwords[:<span class="hljs-number">10</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[(<span class="hljs-string">&#x27;▁t&#x27;</span>, <span class="hljs-number">7</span>), (<span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;er&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;▁a&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;▁to&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;en&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;▁T&#x27;</span>, <span class="hljs-number">3</span>), (<span class="hljs-string">&#x27;▁Th&#x27;</span>, <span class="hljs-number">3</span>), (<span class="hljs-string">&#x27;▁Thi&#x27;</span>, <span class="hljs-number">3</span>)]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1bjkyee">我們用最優的子詞對字符進行分組,以獲得大小為 300 的初始詞彙表:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->token_freqs = <span class="hljs-built_in">list</span>(char_freqs.items()) + sorted_subwords[: <span class="hljs-number">300</span> - <span class="hljs-built_in">len</span>(char_freqs)]
token_freqs = {token: freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs}<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-11qe3u0">💡 SentencePiece 使用一種稱為增強後綴數組(ESA)的更高效算法來創建初始詞彙表。</p></div> <p data-svelte-h="svelte-1l0k6hk">接下來,我們計算所有頻率的總和,將頻率轉換為概率。對於我們的模型,我們將存儲概率的對數,因為添加對數比乘以小數在數值上更穩定,這將簡化模型損失的計算:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> math <span class="hljs-keyword">import</span> log
total_sum = <span class="hljs-built_in">sum</span>([freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()])
model = {token: -log(freq / total_sum) <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-h6hkh8">N現在主要功能是使用 Viterbi 算法標記單詞的功能。正如我們之前看到的,該算法計算單詞的每個子串的最佳分段,我們將其存儲在名為 <code>best_segmentations</code> 的變量中。我們將在單詞的每個位置(從 0 到其總長度)存儲一個字典,有兩個鍵:最佳分割中最後一個標記的開始索引,以及最佳分割的分數。使用最後一個標記的開始索引,一旦列表完全填充,我們將能夠檢索完整的分段。</p> <p data-svelte-h="svelte-1i18saf">填充列表只需兩個循環:主循環遍歷每個起始位置,第二個循環嘗試從該起始位置開始的所有子字符串。如果子串在詞彙表中,我們有一個新的詞分段,直到該結束位置,我們將其與 <code>best_segmentations</code> 相比較。</p> <p data-svelte-h="svelte-15hu4m3">一旦主循環完成,我們就從結尾開始,從一個開始位置跳到下一個,記錄我們前進的標記,直到我們到達單詞的開頭:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">encode_word</span>(<span class="hljs-params">word, model</span>):
best_segmentations = [{<span class="hljs-string">&quot;start&quot;</span>: <span class="hljs-number">0</span>, <span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-number">1</span>}] + [
{<span class="hljs-string">&quot;start&quot;</span>: <span class="hljs-literal">None</span>, <span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-literal">None</span>} <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word))
]
<span class="hljs-keyword">for</span> start_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word)):
<span class="hljs-comment"># This should be properly filled by the previous steps of the loop</span>
best_score_at_start = best_segmentations[start_idx][<span class="hljs-string">&quot;score&quot;</span>]
<span class="hljs-keyword">for</span> end_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(start_idx + <span class="hljs-number">1</span>, <span class="hljs-built_in">len</span>(word) + <span class="hljs-number">1</span>):
token = word[start_idx:end_idx]
<span class="hljs-keyword">if</span> token <span class="hljs-keyword">in</span> model <span class="hljs-keyword">and</span> best_score_at_start <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
score = model[token] + best_score_at_start
<span class="hljs-comment"># If we have found a better segmentation ending at end_idx, we update</span>
<span class="hljs-keyword">if</span> (
best_segmentations[end_idx][<span class="hljs-string">&quot;score&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>
<span class="hljs-keyword">or</span> best_segmentations[end_idx][<span class="hljs-string">&quot;score&quot;</span>] &gt; score
):
best_segmentations[end_idx] = {<span class="hljs-string">&quot;start&quot;</span>: start_idx, <span class="hljs-string">&quot;score&quot;</span>: score}
segmentation = best_segmentations[-<span class="hljs-number">1</span>]
<span class="hljs-keyword">if</span> segmentation[<span class="hljs-string">&quot;score&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>:
<span class="hljs-comment"># We did not find a tokenization of the word -&gt; unknown</span>
<span class="hljs-keyword">return</span> [<span class="hljs-string">&quot;&lt;unk&gt;&quot;</span>], <span class="hljs-literal">None</span>
score = segmentation[<span class="hljs-string">&quot;score&quot;</span>]
start = segmentation[<span class="hljs-string">&quot;start&quot;</span>]
end = <span class="hljs-built_in">len</span>(word)
tokens = []
<span class="hljs-keyword">while</span> start != <span class="hljs-number">0</span>:
tokens.insert(<span class="hljs-number">0</span>, word[start:end])
next_start = best_segmentations[start][<span class="hljs-string">&quot;start&quot;</span>]
end = start
start = next_start
tokens.insert(<span class="hljs-number">0</span>, word[start:end])
<span class="hljs-keyword">return</span> tokens, score<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10ed263">我們已經可以在一些詞上嘗試我們的初始模型:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;Hopefully&quot;</span>, model))
<span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;This&quot;</span>, model))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->([<span class="hljs-string">&#x27;H&#x27;</span>, <span class="hljs-string">&#x27;o&#x27;</span>, <span class="hljs-string">&#x27;p&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;f&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>, <span class="hljs-string">&#x27;ll&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>], <span class="hljs-number">41.5157494601402</span>)
([<span class="hljs-string">&#x27;This&#x27;</span>], <span class="hljs-number">6.288267030694535</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zeax3o">現在很容易計算模型在語料庫上的損失!</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_loss</span>(<span class="hljs-params">model</span>):
loss = <span class="hljs-number">0</span>
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
_, word_loss = encode_word(word, model)
loss += freq * word_loss
<span class="hljs-keyword">return</span> loss<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-x7et8m">我們可以檢查它是否適用於我們擁有的模型:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->compute_loss(model)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">413.10377642940875</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1yc4ap5">計算每個標記的分數也不是很難;我們只需要計算通過刪除每個標記獲得的模型的損失:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> copy
<span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_scores</span>(<span class="hljs-params">model</span>):
scores = {}
model_loss = compute_loss(model)
<span class="hljs-keyword">for</span> token, score <span class="hljs-keyword">in</span> model.items():
<span class="hljs-comment"># We always keep tokens of length 1</span>
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(token) == <span class="hljs-number">1</span>:
<span class="hljs-keyword">continue</span>
model_without_token = copy.deepcopy(model)
_ = model_without_token.pop(token)
scores[token] = compute_loss(model_without_token) - model_loss
<span class="hljs-keyword">return</span> scores<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-q0jiv8">我們可以在給定的標記上嘗試:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->scores = compute_scores(model)
<span class="hljs-built_in">print</span>(scores[<span class="hljs-string">&quot;ll&quot;</span>])
<span class="hljs-built_in">print</span>(scores[<span class="hljs-string">&quot;his&quot;</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rlo4pw">自從 <code>&quot;ll&quot;</code> 用於標記化 <code>&quot;Hopefully&quot;</code>, 刪除它可能會讓我們使用標記 <code>&quot;l&quot;</code> 兩次相反,我們預計它將產生正損失。 <code>&quot;his&quot;</code> 僅在單詞<code>&quot;This&quot;</code> 內使用,它被標記為自身,所以我們期望它的損失為零。結果如下:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">6.376412403623874</span>
<span class="hljs-number">0.0</span><!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-c2m34b">💡 這種方法非常低效,因此 SentencePiece 使用了沒有標記 X 的模型損失的近似值:它不是從頭開始,而是通過其在剩餘詞彙表中的分段替換標記 X。這樣,所有分數可以與模型損失同時計算。</p></div> <p data-svelte-h="svelte-g5nxig">完成所有這些後,我們需要做的最後一件事是將模型使用的特殊標記添加到詞彙表中,然後循環直到我們從詞彙表中修剪了足夠的標記以達到我們想要的大小:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->percent_to_remove = <span class="hljs-number">0.1</span>
<span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(model) &gt; <span class="hljs-number">100</span>:
scores = compute_scores(model)
sorted_scores = <span class="hljs-built_in">sorted</span>(scores.items(), key=<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-number">1</span>])
<span class="hljs-comment"># Remove percent_to_remove tokens with the lowest scores.</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">int</span>(<span class="hljs-built_in">len</span>(model) * percent_to_remove)):
_ = token_freqs.pop(sorted_scores[i][<span class="hljs-number">0</span>])
total_sum = <span class="hljs-built_in">sum</span>([freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()])
model = {token: -log(freq / total_sum) <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1evuc8t">然後,為了標記一些文本,我們只需要應用預標記化,然後使用我們的 <code>encode_word()</code> 函數:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize</span>(<span class="hljs-params">text, model</span>):
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
pre_tokenized_text = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
encoded_words = [encode_word(word, model)[<span class="hljs-number">0</span>] <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> pre_tokenized_text]
<span class="hljs-keyword">return</span> <span class="hljs-built_in">sum</span>(encoded_words, [])
tokenize(<span class="hljs-string">&quot;This is the Hugging Face course.&quot;</span>, model)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;▁This&#x27;</span>, <span class="hljs-string">&#x27;▁is&#x27;</span>, <span class="hljs-string">&#x27;▁the&#x27;</span>, <span class="hljs-string">&#x27;▁Hugging&#x27;</span>, <span class="hljs-string">&#x27;▁Face&#x27;</span>, <span class="hljs-string">&#x27;&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;ou&#x27;</span>, <span class="hljs-string">&#x27;r&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sdpsve">Unigram 就是這樣!希望現在你感覺自己是標記器所有方面的專家。在下一節中,我們將深入研究 🤗 Tokenizers 庫的構建塊,並向您展示如何使用它們來構建您自己的標記器。</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/zh-TW/chapter6/7.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_14z83n9 = {
assets: "/docs/course/pr_1069/zh-TW",
base: "/docs/course/pr_1069/zh-TW",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1069/zh-TW/_app/immutable/entry/start.01945326.js"),
import("/docs/course/pr_1069/zh-TW/_app/immutable/entry/app.e8597ff2.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 51],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
94.6 kB
·
Xet hash:
2fc636d3fce658f13e19dc29060edfef8ff2354a5004c0f0e4eae2587fc93d67

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.