Buckets:

rtrm's picture
download
raw
93.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;WordPiece tokenization 算法&quot;,&quot;local&quot;:&quot;WordPiece tokenization算法&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;WordPiece 训练&quot;,&quot;local&quot;:&quot;WordPiece 训练&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;tokenization 算法&quot;,&quot;local&quot;:&quot;tokenization 算法&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;实现 WordPiece&quot;,&quot;local&quot;:&quot;实现 WordPiece&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1021/zh-CN/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/entry/start.f3a1a511.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/singletons.9bf55235.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/paths.0ba10750.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/entry/app.c39e37cf.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/nodes/0.dad18ce3.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/nodes/49.02638f23.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/CourseFloatingBanner.9ff4c771.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/DocNotebookDropdown.efc1fb7c.js">
<link rel="modulepreload" href="/docs/course/pr_1021/zh-CN/_app/immutable/chunks/getInferenceSnippets.ebf8be91.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;WordPiece tokenization 算法&quot;,&quot;local&quot;:&quot;WordPiece tokenization算法&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;WordPiece 训练&quot;,&quot;local&quot;:&quot;WordPiece 训练&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;tokenization 算法&quot;,&quot;local&quot;:&quot;tokenization 算法&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;实现 WordPiece&quot;,&quot;local&quot;:&quot;实现 WordPiece&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="WordPiece tokenization算法" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#WordPiece tokenization算法"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>WordPiece tokenization 算法</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/zh-CN/chapter6/section6.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/zh-CN/chapter6/section6.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1fqctxb">WordPiece 是 Google 开发的用于 BERT 预训练的分词算法。自此之后,很多基于 BERT 的 Transformer 模型都复用了这种方法,比如 DistilBERT,MobileBERT,Funnel Transformers 和 MPNET。它在训练方面与 BPE 非常类似,但实际的分词方法有所不同。</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/qpv6ms_t_1A" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1jw9ndh">💡 本节详细讲述了 WordPiece,甚至展示了一个完整的实现。如果你只想对这个分词算法有个大概的理解,可以直接跳到最后。</p></div> <h2 class="relative group"><a id="WordPiece 训练" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#WordPiece 训练"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>WordPiece 训练</span></h2> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-1i7swwn">⚠️ Google 从未开源 WordPiece 训练算法的实现,因此以下是我们基于已发表文献的最佳猜测。它可能并非 100% 准确的。</p></div> <p data-svelte-h="svelte-11u566e">与BPE 一样,WordPiece 也是从包含模型使用的特殊 tokens 和初始字母表的小词汇表开始的。由于它是通过添加前缀(如 BERT 中的 <code>##</code> )来识别子词的,每个词最初都会通过在词内部所有字符前添加该前缀进行分割。因此,例如 <code>&quot;word&quot;</code> 将被这样分割:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->w ##o ##r ##d<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sszbho">因此,初始字母表包含所有出现在单词第一个位置的字符,以及出现在单词内部并带有 WordPiece 前缀的字符。</p> <p data-svelte-h="svelte-1592x9q">然后,同样像 BPE 一样,WordPiece 会学习合并规则。主要的不同之处在于合并对的选择方式。WordPiece 不是选择频率最高的对,而是对每对计算一个得分,使用以下公式:</p> <p><!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mrow><mi mathvariant="normal">s</mi><mi mathvariant="normal">c</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi></mrow><mo>=</mo><mo stretchy="false">(</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">p</mi><mi mathvariant="normal">a</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">r</mi></mrow><mo stretchy="false">)</mo><mi mathvariant="normal">/</mi><mo stretchy="false">(</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">s</mi><mi mathvariant="normal">t</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">l</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">m</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">t</mi></mrow><mo>×</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">s</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">c</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">d</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">l</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">m</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">t</mi></mrow><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">\mathrm{score} = (\mathrm{freq\_of\_pair}) / (\mathrm{freq\_of\_first\_element} \times\mathrm{freq\_of\_second\_element})</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord"><span class="mord mathrm">score</span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.06em;vertical-align:-0.31em;"></span><span class="mopen">(</span><span class="mord"><span class="mord mathrm">freq_of_pair</span></span><span class="mclose">)</span><span class="mord">/</span><span class="mopen">(</span><span class="mord"><span class="mord mathrm">freq_of_first_element</span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.06em;vertical-align:-0.31em;"></span><span class="mord"><span class="mord mathrm">freq_of_second_element</span></span><span class="mclose">)</span></span></span></span></span><!-- HTML_TAG_END --></p> <p data-svelte-h="svelte-1cke216">通过将两部分合在一起的频率除以其中各部分的频率的乘积,该算法优先合并那些在词汇表中单独出现出现的对。例如,即使 <code>(&quot;un&quot;, &quot;##able&quot;)</code> 这对在词汇表中出现的频率很高,它也不一定会被合并,因为 <code>&quot;un&quot;</code><code>&quot;##able&quot;</code> 这两对可能会在很多其他词中出现,频率很高。相比之下,像 <code>(&quot;hu&quot;, &quot;##gging&quot;)</code> 这样的对可能会更快地被合并(假设单词“hugging”在词汇表中出现的频率很高),因为 <code>&quot;hu&quot;</code><code>&quot;##gging&quot;</code> 可能分别出现的频率较低。</p> <p data-svelte-h="svelte-uw1lny">我们使用与 BPE 示例相同的词汇表:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dsawga">经过分割之后将会是:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#s</span>&quot;</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7tgvkb">所以最初的词汇表将会是 <code>[&quot;b&quot;, &quot;h&quot;, &quot;p&quot;, &quot;##g&quot;, &quot;##n&quot;, &quot;##s&quot;, &quot;##u&quot;]</code> (如果我们暂时忽略特殊 tokens )。出现频率最高的一对是 <code>(&quot;##u&quot;, &quot;##g&quot;)</code> (目前 20 次),但 <code>&quot;##u&quot;</code> 和其他单词一起出现的频率非常高,所以它的分数不是最高的(分数是 1 / 36)。所有带有 <code>&quot;##u&quot;</code> 的对实际上都有相同的分数(1 / 36),所以分数最高的对是 <code>(&quot;##g&quot;, &quot;##s&quot;)</code> —— 唯一没有 <code>&quot;##u&quot;</code> 的对——分数是 1 / 20,所以学习的第一个合并是 <code>(&quot;##g&quot;, &quot;##s&quot;) -&gt; (&quot;##gs&quot;)</code></p> <p data-svelte-h="svelte-10vwmky">请注意,当我们合并时,我们会删除两个 tokens 之间的 <code>##</code> ,所以我们将 <code>&quot;##gs&quot;</code> 添加到词汇表中,并将语料库的单词按照改规则进行合并:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->词汇表: [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#s</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>]
语料库: (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tt5wup">此时, <code>&quot;##u&quot;</code> 出现在所有可能的对中,因此它们最终都具有相同的分数。在这种情况下,第一个对会被合并,于是我们得到了 <code>(&quot;h&quot;, &quot;##u&quot;) -&gt; &quot;hu&quot;</code> 规则:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->词汇表: [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#s</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-string">&quot;hu&quot;</span>]
语料库: (<span class="hljs-string">&quot;hu&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;hu&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nj9dhv">然后,下一个最佳得分的对是 <code>(&quot;hu&quot;, &quot;##g&quot;)</code><code>(&quot;hu&quot;, &quot;##gs&quot;)</code> (得分为 1/15,而所有其他配对的得分为 1/21),因此得分最高的第一对合并:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->词汇表: [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#s</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;hug&quot;</span>]
语料库: (<span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;hu&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ib8ifw">然后我们就按此方式继续,直到我们达到所需的词汇表大小。</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1q0b1w">✏️ <strong>现在轮到你了!</strong> 下一个合并规则是什么?</p></div> <h2 class="relative group"><a id="tokenization 算法" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tokenization 算法"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>tokenization 算法</span></h2> <p data-svelte-h="svelte-1g7xvnn">WordPiece 和 BPE 的分词方式有所不同,WordPiece 只保存最终词汇表,而不保存学习到的合并规则。WordPiece 从待分词的词开始,找到词汇表中最长的子词,然后在其处分割。例如,如果我们使用上述示例中学习到的词汇表,对于词 <code>&quot;hugs&quot;</code> ,从开始处的最长子词在词汇表中是 <code>&quot;hug&quot;</code> ,所以我们在那里分割,得到 <code>[&quot;hug&quot;, &quot;##s&quot;]</code> 。然后我们继续处理 <code>&quot;##s&quot;</code> ,它在词汇表中,所以 <code>&quot;hugs&quot;</code> 的分词结果是 <code>[&quot;hug&quot;, &quot;##s&quot;]</code></p> <p data-svelte-h="svelte-1om85t3">如果使用 BPE,我们会按照学习到的合并规则进行合并,并将其分词为 <code>[&quot;hu&quot;, &quot;##gs&quot;]</code> ,不同的字词分词算法所以最终得到的编码是不同的。</p> <p data-svelte-h="svelte-trjwfq">再举一个例子,让我们看看 <code>&quot;bugs&quot;</code> 将如何分词的。 <code>&quot;b&quot;</code> 是从词汇表中单词开头开始的最长子词,所以我们在那里分割并得到 <code>[&quot;b&quot;, &quot;##ugs&quot;]</code> 。然后 <code>&quot;##u&quot;</code> 是词汇表中从 <code>&quot;##ugs&quot;</code> 开始的最长的子词,所以我们在那里拆分并得到 <code>[&quot;b&quot;, &quot;##u, &quot;##gs&quot;]</code> 。最后, <code>&quot;##gs&quot;</code> 在词汇表中,因此 <code>&quot;bugs&quot;</code> 的分词结果是: <code>[&quot;b&quot;, &quot;##u, &quot;##gs&quot;]</code></p> <p data-svelte-h="svelte-160l13j">当分词过程中无法在词汇库中找到该子词时,整个词会被标记为 unknown(未知)—— 例如, <code>&quot;mug&quot;</code> 将被标记为 <code>[&quot;[UNK]&quot;]</code><code>&quot;bum&quot;</code> 也是如此(即使我们的词汇表中包含 <code>&quot;b&quot;</code><code>&quot;##u&quot;</code> 开始,但是 <code>&quot;##m&quot;</code> 不在词汇表中,因此最终的分词结果只会是 <code>[&quot;[UNK]&quot;]</code> ,而不是 <code>[&quot;b&quot;, &quot;##u&quot;, &quot;[UNK]&quot;]</code> )。这是与 BPE 的另一个区别,BPE 只会将不在词汇库中的单个字符标记为 unknown。</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1yqamlc">✏️ <strong>现在轮到你了!</strong> <code>&quot;pugs&quot;</code> 将被如何分词?</p></div> <h2 class="relative group"><a id="实现 WordPiece" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#实现 WordPiece"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>实现 WordPiece</span></h2> <p data-svelte-h="svelte-1hl7db2">现在让我们看一下 WordPiece 算法的实现。与 BPE 一样,这只是教学示例,你不能在大型语料库上使用。</p> <p data-svelte-h="svelte-17nhne6">我们将使用与 BPE 示例中相同的语料库:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->corpus = [
<span class="hljs-string">&quot;This is the Hugging Face Course.&quot;</span>,
<span class="hljs-string">&quot;This chapter is about tokenization.&quot;</span>,
<span class="hljs-string">&quot;This section shows several tokenizer algorithms.&quot;</span>,
<span class="hljs-string">&quot;Hopefully, you will be able to understand how they are trained and generate tokens.&quot;</span>,
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qxzm28">首先,我们需要将语料库预分词为单词。由于我们正在复刻 WordPiece tokenizer (如 BERT),因此我们将使用 <code>bert-base-cased</code> tokenizer 进行预分词:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;bert-base-cased&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1a86zo">然后我们在进行预分词的同时,计算语料库中每个单词的频率:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> defaultdict
word_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> corpus:
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
new_words = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> new_words:
word_freqs[word] += <span class="hljs-number">1</span>
word_freqs<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->defaultdict(
<span class="hljs-built_in">int</span>, {<span class="hljs-string">&#x27;This&#x27;</span>: <span class="hljs-number">3</span>, <span class="hljs-string">&#x27;is&#x27;</span>: <span class="hljs-number">2</span>, <span class="hljs-string">&#x27;the&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Hugging&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Face&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Course&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;.&#x27;</span>: <span class="hljs-number">4</span>, <span class="hljs-string">&#x27;chapter&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;about&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;tokenization&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;section&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;shows&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;several&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;tokenizer&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;algorithms&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Hopefully&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;,&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;you&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;will&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;be&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;able&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;to&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;understand&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;how&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;they&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;are&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;trained&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;and&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;generate&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;tokens&#x27;</span>: <span class="hljs-number">1</span>})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mul76c">如我们之前看到的,字母表是一个独特的集合,由所有单词的第一个字母以及所有以 <code>##</code> 为前缀和在单词中的其他字母组成:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->alphabet = []
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys():
<span class="hljs-keyword">if</span> word[<span class="hljs-number">0</span>] <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> alphabet:
alphabet.append(word[<span class="hljs-number">0</span>])
<span class="hljs-keyword">for</span> letter <span class="hljs-keyword">in</span> word[<span class="hljs-number">1</span>:]:
<span class="hljs-keyword">if</span> <span class="hljs-string">f&quot;##<span class="hljs-subst">{letter}</span>&quot;</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> alphabet:
alphabet.append(<span class="hljs-string">f&quot;##<span class="hljs-subst">{letter}</span>&quot;</span>)
alphabet.sort()
alphabet
<span class="hljs-built_in">print</span>(alphabet)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;##a&#x27;</span>, <span class="hljs-string">&#x27;##b&#x27;</span>, <span class="hljs-string">&#x27;##c&#x27;</span>, <span class="hljs-string">&#x27;##d&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;##f&#x27;</span>, <span class="hljs-string">&#x27;##g&#x27;</span>, <span class="hljs-string">&#x27;##h&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##k&#x27;</span>, <span class="hljs-string">&#x27;##l&#x27;</span>, <span class="hljs-string">&#x27;##m&#x27;</span>, <span class="hljs-string">&#x27;##n&#x27;</span>, <span class="hljs-string">&#x27;##o&#x27;</span>, <span class="hljs-string">&#x27;##p&#x27;</span>, <span class="hljs-string">&#x27;##r&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>,
<span class="hljs-string">&#x27;##t&#x27;</span>, <span class="hljs-string">&#x27;##u&#x27;</span>, <span class="hljs-string">&#x27;##v&#x27;</span>, <span class="hljs-string">&#x27;##w&#x27;</span>, <span class="hljs-string">&#x27;##y&#x27;</span>, <span class="hljs-string">&#x27;##z&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;C&#x27;</span>, <span class="hljs-string">&#x27;F&#x27;</span>, <span class="hljs-string">&#x27;H&#x27;</span>, <span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;g&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>,
<span class="hljs-string">&#x27;w&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6gvlr8">我们还在该词汇表的开头添加了模型使用的特殊 tokens,在使用 BERT 的情况下,特殊 tokens 是 <code>[&quot;[PAD]&quot;, &quot;[UNK]&quot;, &quot;[CLS]&quot;, &quot;[SEP]&quot;, &quot;[MASK]&quot;]</code></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->vocab = [<span class="hljs-string">&quot;[PAD]&quot;</span>, <span class="hljs-string">&quot;[UNK]&quot;</span>, <span class="hljs-string">&quot;[CLS]&quot;</span>, <span class="hljs-string">&quot;[SEP]&quot;</span>, <span class="hljs-string">&quot;[MASK]&quot;</span>] + alphabet.copy()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15qyvc2">接下来我们需要将每个单词进行分割,除了第一个字母外,其他字母都需要以 <code>##</code> 为前缀:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->splits = {
word: [c <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-string">f&quot;##<span class="hljs-subst">{c}</span>&quot;</span> <span class="hljs-keyword">for</span> i, c <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(word)]
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys()
}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3upalw">现在我们已经准备好训练了,让我们编写一个函数来计算每对的分数。我们需要在训练的每个步骤中使用它:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_pair_scores</span>(<span class="hljs-params">splits</span>):
letter_freqs = defaultdict(<span class="hljs-built_in">int</span>)
pair_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
split = splits[word]
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>:
letter_freqs[split[<span class="hljs-number">0</span>]] += freq
<span class="hljs-keyword">continue</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>):
pair = (split[i], split[i + <span class="hljs-number">1</span>])
letter_freqs[split[i]] += freq
pair_freqs[pair] += freq
letter_freqs[split[-<span class="hljs-number">1</span>]] += freq
scores = {
pair: freq / (letter_freqs[pair[<span class="hljs-number">0</span>]] * letter_freqs[pair[<span class="hljs-number">1</span>]])
<span class="hljs-keyword">for</span> pair, freq <span class="hljs-keyword">in</span> pair_freqs.items()
}
<span class="hljs-keyword">return</span> scores<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cihmgg">让我们来看看在初始分割后的部分字典:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pair_scores = compute_pair_scores(splits)
<span class="hljs-keyword">for</span> i, key <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(pair_scores.keys()):
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{key}</span>: <span class="hljs-subst">{pair_scores[key]}</span>&quot;</span>)
<span class="hljs-keyword">if</span> i &gt;= <span class="hljs-number">5</span>:
<span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;##h&#x27;</span>): <span class="hljs-number">0.125</span>
(<span class="hljs-string">&#x27;##h&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>): <span class="hljs-number">0.03409090909090909</span>
(<span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>): <span class="hljs-number">0.02727272727272727</span>
(<span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>): <span class="hljs-number">0.1</span>
(<span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;##h&#x27;</span>): <span class="hljs-number">0.03571428571428571</span>
(<span class="hljs-string">&#x27;##h&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>): <span class="hljs-number">0.011904761904761904</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tie2v3">现在,只需要一个快速循环就可以找到得分最高的对:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->best_pair = <span class="hljs-string">&quot;&quot;</span>
max_score = <span class="hljs-literal">None</span>
<span class="hljs-keyword">for</span> pair, score <span class="hljs-keyword">in</span> pair_scores.items():
<span class="hljs-keyword">if</span> max_score <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_score &lt; score:
best_pair = pair
max_score = score
<span class="hljs-built_in">print</span>(best_pair, max_score)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;##b&#x27;</span>) <span class="hljs-number">0.2</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-97mtnh">所以第一个要学习的合并是 <code>(&#39;a&#39;, &#39;##b&#39;) -&gt; &#39;ab&#39;</code> ,并且我们添加 <code>&#39;ab&#39;</code> 到词汇表中:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->vocab.append(<span class="hljs-string">&quot;ab&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-os95fs">接下来,我们需要对 <code>splits</code> 字典进行这种合并。让我们为此写另一个函数:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">merge_pair</span>(<span class="hljs-params">a, b, splits</span>):
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs:
split = splits[word]
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>:
<span class="hljs-keyword">continue</span>
i = <span class="hljs-number">0</span>
<span class="hljs-keyword">while</span> i &lt; <span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>:
<span class="hljs-keyword">if</span> split[i] == a <span class="hljs-keyword">and</span> split[i + <span class="hljs-number">1</span>] == b:
merge = a + b[<span class="hljs-number">2</span>:] <span class="hljs-keyword">if</span> b.startswith(<span class="hljs-string">&quot;##&quot;</span>) <span class="hljs-keyword">else</span> a + b
split = split[:i] + [merge] + split[i + <span class="hljs-number">2</span> :]
<span class="hljs-keyword">else</span>:
i += <span class="hljs-number">1</span>
splits[word] = split
<span class="hljs-keyword">return</span> splits<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y048j">我们可以看看第一次合并的结果:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->splits = merge_pair(<span class="hljs-string">&quot;a&quot;</span>, <span class="hljs-string">&quot;##b&quot;</span>, splits)
splits[<span class="hljs-string">&quot;about&quot;</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;ab&#x27;</span>, <span class="hljs-string">&#x27;##o&#x27;</span>, <span class="hljs-string">&#x27;##u&#x27;</span>, <span class="hljs-string">&#x27;##t&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7tzu4v">现在我们有了合并循环的所有代码。让我们设定词汇表的大小为 70:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->vocab_size = <span class="hljs-number">70</span>
<span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(vocab) &lt; vocab_size:
scores = compute_pair_scores(splits)
best_pair, max_score = <span class="hljs-string">&quot;&quot;</span>, <span class="hljs-literal">None</span>
<span class="hljs-keyword">for</span> pair, score <span class="hljs-keyword">in</span> scores.items():
<span class="hljs-keyword">if</span> max_score <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_score &lt; score:
best_pair = pair
max_score = score
splits = merge_pair(*best_pair, splits)
new_token = (
best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>][<span class="hljs-number">2</span>:]
<span class="hljs-keyword">if</span> best_pair[<span class="hljs-number">1</span>].startswith(<span class="hljs-string">&quot;##&quot;</span>)
<span class="hljs-keyword">else</span> best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>]
)
vocab.append(new_token)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-q8i5ms">然后我们可以查看生成的词汇表:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(vocab)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;[PAD]&#x27;</span>, <span class="hljs-string">&#x27;[UNK]&#x27;</span>, <span class="hljs-string">&#x27;[CLS]&#x27;</span>, <span class="hljs-string">&#x27;[SEP]&#x27;</span>, <span class="hljs-string">&#x27;[MASK]&#x27;</span>, <span class="hljs-string">&#x27;##a&#x27;</span>, <span class="hljs-string">&#x27;##b&#x27;</span>, <span class="hljs-string">&#x27;##c&#x27;</span>, <span class="hljs-string">&#x27;##d&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;##f&#x27;</span>, <span class="hljs-string">&#x27;##g&#x27;</span>, <span class="hljs-string">&#x27;##h&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##k&#x27;</span>,
<span class="hljs-string">&#x27;##l&#x27;</span>, <span class="hljs-string">&#x27;##m&#x27;</span>, <span class="hljs-string">&#x27;##n&#x27;</span>, <span class="hljs-string">&#x27;##o&#x27;</span>, <span class="hljs-string">&#x27;##p&#x27;</span>, <span class="hljs-string">&#x27;##r&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>, <span class="hljs-string">&#x27;##t&#x27;</span>, <span class="hljs-string">&#x27;##u&#x27;</span>, <span class="hljs-string">&#x27;##v&#x27;</span>, <span class="hljs-string">&#x27;##w&#x27;</span>, <span class="hljs-string">&#x27;##y&#x27;</span>, <span class="hljs-string">&#x27;##z&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;C&#x27;</span>, <span class="hljs-string">&#x27;F&#x27;</span>, <span class="hljs-string">&#x27;H&#x27;</span>,
<span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;g&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>, <span class="hljs-string">&#x27;w&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>, <span class="hljs-string">&#x27;ab&#x27;</span>,<span class="hljs-string">&#x27;##fu&#x27;</span>, <span class="hljs-string">&#x27;Fa&#x27;</span>, <span class="hljs-string">&#x27;Fac&#x27;</span>, <span class="hljs-string">&#x27;##ct&#x27;</span>, <span class="hljs-string">&#x27;##ful&#x27;</span>, <span class="hljs-string">&#x27;##full&#x27;</span>, <span class="hljs-string">&#x27;##fully&#x27;</span>,
<span class="hljs-string">&#x27;Th&#x27;</span>, <span class="hljs-string">&#x27;ch&#x27;</span>, <span class="hljs-string">&#x27;##hm&#x27;</span>, <span class="hljs-string">&#x27;cha&#x27;</span>, <span class="hljs-string">&#x27;chap&#x27;</span>, <span class="hljs-string">&#x27;chapt&#x27;</span>, <span class="hljs-string">&#x27;##thm&#x27;</span>, <span class="hljs-string">&#x27;Hu&#x27;</span>, <span class="hljs-string">&#x27;Hug&#x27;</span>, <span class="hljs-string">&#x27;Hugg&#x27;</span>, <span class="hljs-string">&#x27;sh&#x27;</span>, <span class="hljs-string">&#x27;th&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;##thms&#x27;</span>, <span class="hljs-string">&#x27;##za&#x27;</span>, <span class="hljs-string">&#x27;##zat&#x27;</span>,
<span class="hljs-string">&#x27;##ut&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e50qfc">如我们所见,相较于 BPE(字节对编码),此分词器在学习单词部分作为 tokens 时稍快一些。</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-syspc2">💡 在同一语料库上使用 <code>train_new_from_iterator()</code> 不会产生完全相同的词汇表。这是因为 🤗 Tokenizers 库没有为训练实现 WordPiece(因为我们不完全确定它的真实实现方式),而是使用了 BPE。</p></div> <p data-svelte-h="svelte-ddxha3">要对新文本进行分词,我们先预分词,再进行分割,然后在每个词上使用分词算法。也就是说,我们寻找从第一个词开始的最大子词并将其分割,然后我们对第二部分重复此过程,以此类推,对该词以及文本中的后续词进行分割:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">encode_word</span>(<span class="hljs-params">word</span>):
tokens = []
<span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(word) &gt; <span class="hljs-number">0</span>:
i = <span class="hljs-built_in">len</span>(word)
<span class="hljs-keyword">while</span> i &gt; <span class="hljs-number">0</span> <span class="hljs-keyword">and</span> word[:i] <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> vocab:
i -= <span class="hljs-number">1</span>
<span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
<span class="hljs-keyword">return</span> [<span class="hljs-string">&quot;[UNK]&quot;</span>]
tokens.append(word[:i])
word = word[i:]
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(word) &gt; <span class="hljs-number">0</span>:
word = <span class="hljs-string">f&quot;##<span class="hljs-subst">{word}</span>&quot;</span>
<span class="hljs-keyword">return</span> tokens<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4xc89b">让我们使用词汇表中的一个词和一个不在词汇表中的词上测试一下:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;Hugging&quot;</span>))
<span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;HOgging&quot;</span>))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;Hugg&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##n&#x27;</span>, <span class="hljs-string">&#x27;##g&#x27;</span>]
[<span class="hljs-string">&#x27;[UNK]&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qypl47">现在,让我们编写一个对文本分词的函数:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize</span>(<span class="hljs-params">text</span>):
pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
pre_tokenized_text = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> pre_tokenize_result]
encoded_words = [encode_word(word) <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> pre_tokenized_text]
<span class="hljs-keyword">return</span> <span class="hljs-built_in">sum</span>(encoded_words, [])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-35pr86">我们可以在任何文本上尝试:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenize(<span class="hljs-string">&quot;This is the Hugging Face course!&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;Th&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;th&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;Hugg&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##n&#x27;</span>, <span class="hljs-string">&#x27;##g&#x27;</span>, <span class="hljs-string">&#x27;Fac&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;##o&#x27;</span>, <span class="hljs-string">&#x27;##u&#x27;</span>, <span class="hljs-string">&#x27;##r&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>,
<span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;[UNK]&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-tt696e">这就是 WordPiece 算法的全部内容!现在让我们来看看 Unigram。</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/zh-CN/chapter6/6.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_9aawfw = {
assets: "/docs/course/pr_1021/zh-CN",
base: "/docs/course/pr_1021/zh-CN",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1021/zh-CN/_app/immutable/entry/start.f3a1a511.js"),
import("/docs/course/pr_1021/zh-CN/_app/immutable/entry/app.c39e37cf.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 49],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
93.8 kB
·
Xet hash:
8dca3933078dfa2b348a7ceb6efd34ca627e076f6e53a21e8ef6f4928c8300f5

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.