Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Tokenización WordPiece","local":"wordpiece-tokenization","sections":[{"title":"Algoritmo de Entrenamiento","local":"training-algorithm","sections":[],"depth":2},{"title":"Algoritmo de Tokenización","local":"tokenization-algorithm","sections":[],"depth":2},{"title":"Implementando WordPiece","local":"implementing-wordpiece","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/course/pr_1069/es/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/entry/start.b7b528c6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/scheduler.37c15a92.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/singletons.e1c0df1c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/index.18351ede.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/paths.2d1184ba.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/entry/app.ea1cc000.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/index.2bf4358c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/nodes/0.4f78a9c4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/nodes/40.e0c1de6d.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/Tip.363c041f.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/Youtube.1e50a667.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/CodeBlock.4e987730.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/CourseFloatingBanner.6add7356.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/Heading.8ada512a.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/getInferenceSnippets.b37612c0.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Tokenización WordPiece","local":"wordpiece-tokenization","sections":[{"title":"Algoritmo de Entrenamiento","local":"training-algorithm","sections":[],"depth":2},{"title":"Algoritmo de Tokenización","local":"tokenization-algorithm","sections":[],"depth":2},{"title":"Implementando WordPiece","local":"implementing-wordpiece","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="wordpiece-tokenization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#wordpiece-tokenization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tokenización WordPiece</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-6-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section6.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section6.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-fxspyi">WordPiece es el algoritmo de tokenización que Google desarrolló para pre-entrenar BERT. Ha sido reutilizado un varios modelos Transformers basados en BERT, tales como DistilBERT, MobileBERT, Funnel Transformers, y MPNET. Es muy similar a BPE en términos del entrenamiento, pero la tokenización se hace de distinta manera.</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/qpv6ms_t_1A" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1ov2mxe">💡 Esta sección cubre WordPiece en profundidad, yendo tan lejos como para mostrar una implementación completa. Puedes saltarte hasta el final si sólo quieres una descripción general del algoritmo de tokenización.</p></div> <h2 class="relative group"><a id="training-algorithm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-algorithm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Algoritmo de Entrenamiento</span></h2> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-13njxon">⚠️ Google nunca liberó el código (open-sourced) su implementación del algoritmo de entrenamiento de WordPiece, por tanto lo que sigue es nuestra mejor suposición badado en la literatura publicada. Puede no ser 100% preciso.</p></div> <p data-svelte-h="svelte-1mryzpb">Al igual que BPE, WordPiece comienza a partir de un pequeño vocabulario incluyendo los tokens especiales utilizados por el modelo y el alfabeto inicial. Dado que identifica subpalabras (subwords) agregando un prefijo (como <code>##</code> para el caso de BERT), cada palabra está inicialmente separada agregando dicho prefijo a todos los caracteres dentro de la palabra. Por lo que por ejemplo la palabra <code>"word"</code> queda separada así:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->w ##o ##r ##d<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17a9wfo">Por lo tanto, el alfabeto inicial contiene todos los caracteres presentes al comienzo de una palabra y los caracteres presente dentro de una palabra precedida por el prefijo de WordPiece.</p> <p>Luego, de nuevo al igual que BPE, WordPiece aprende reglas de fusión. La principal diferencia es la forma que el par fusionado es seleccionado. Envex de seleccionar el par más frecuente, WordPiece calcula un puntaje para cada par, utilizando la siguiente formula: | |
| <!-- HTML_TAG_START --><span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mrow><mi mathvariant="normal">s</mi><mi mathvariant="normal">c</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi></mrow><mo>=</mo><mo stretchy="false">(</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">p</mi><mi mathvariant="normal">a</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">r</mi></mrow><mo stretchy="false">)</mo><mi mathvariant="normal">/</mi><mo stretchy="false">(</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">s</mi><mi mathvariant="normal">t</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">l</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">m</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">t</mi></mrow><mo>×</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">s</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">c</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">d</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">l</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">m</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">t</mi></mrow><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">\mathrm{score} = (\mathrm{freq\_of\_pair}) / (\mathrm{freq\_of\_first\_element} \times \mathrm{freq\_of\_second\_element})</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord"><span class="mord mathrm">score</span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.06em;vertical-align:-0.31em;"></span><span class="mopen">(</span><span class="mord"><span class="mord mathrm">freq_of_pair</span></span><span class="mclose">)</span><span class="mord">/</span><span class="mopen">(</span><span class="mord"><span class="mord mathrm">freq_of_first_element</span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.06em;vertical-align:-0.31em;"></span><span class="mord"><span class="mord mathrm">freq_of_second_element</span></span><span class="mclose">)</span></span></span></span></span><!-- HTML_TAG_END --></p> <p data-svelte-h="svelte-vgeqzk">Dividiendo por la frecuencia del par por el producto de las frecuencias de cada una de sus partes, el algoritmo prioriza la fusión de pares donde las partes individuales son menos frecuentes en el vocabulario. Por ejemplo, no fusionará necesariamente <code>("un", "##able")</code> incluso si ese par ocurre de manera muy frecuente en el vocabulario, porque los dos pares <code>"un"</code> y <code>"##able"</code> muy probablemente aparecerán en un montón de otras palabras y tendrán una alta frecuencia. En contraste con un par como <code>("hu", "##gging")</code> los cuales son probablemente menos frecuentes individualmente.</p> <p data-svelte-h="svelte-bz1ymu">Miremos el mismo vocabulario que usamos en el ejemplo de entrenamiento de BPE:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">"hug"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"pug"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"pun"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"bun"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"hugs"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sfg1j5">Las separaciones acá serán:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">"h"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">"b"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">"h"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#s</span>"</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7dymnf">por lo que el vocabulario inicial será <code>["b", "h", "p", "##g", "##n", "##s", "##u"]</code> (si nos olvidamos de los tokens especiales por ahora). El par más frecuente es <code>("##u", "##g")</code> (presente 20 veces), pero la frecuencia individual de <code>"##u"</code> es muy alta, por lo que el puntaje no es el más alto (es 1 / 36). Todos los pares con <code>"##u"</code> en realidad tienen el mismo puntaje (1 / 36), por lo que el mejor puntaje va para el par <code>("##g", "##s")</code> — el único sin <code>"##u"</code> — 1 / 20, y la primera fusión aprendida es <code>("##g", "##s") -> ("##gs")</code>.</p> <p data-svelte-h="svelte-stc5cc">Notar que cuando fusionamos, removemos el <code>##</code> entre los dos tokens, por que agregamos <code>"##gs"</code> al vocabulario y aplicamos la fusión en las palabras del corpus:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Vocabulary: [<span class="hljs-string">"b"</span>, <span class="hljs-string">"h"</span>, <span class="hljs-string">"p"</span>, <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#s</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#gs</span>"</span>] | |
| Corpus: (<span class="hljs-string">"h"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">"b"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">"h"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#gs</span>"</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16zc08u">En este punto, <code>"##u"</code> está en todos los posibles pares, por lo que todos terminan con el mismo puntaje. Digamos que en este caso, el primer par se fusiona, <code>("h", "##u") -> "hu"</code>. Esto nos lleva a:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Vocabulary: [<span class="hljs-string">"b"</span>, <span class="hljs-string">"h"</span>, <span class="hljs-string">"p"</span>, <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#s</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#gs</span>"</span>, <span class="hljs-string">"hu"</span>] | |
| Corpus: (<span class="hljs-string">"hu"</span> <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">"b"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">"hu"</span> <span class="hljs-string">"#<span class="hljs-subst">#gs</span>"</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mtj12r">Luego el siguiente mejor puntaje está compartido por <code>("hu", "##g")</code> y <code>("hu", "##gs")</code> (con 1/15, comparado con 1/21 para todos los otros pares), por lo que el primer par con el puntaje más alto se fusiona:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Vocabulary: [<span class="hljs-string">"b"</span>, <span class="hljs-string">"h"</span>, <span class="hljs-string">"p"</span>, <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#s</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span>, <span class="hljs-string">"#<span class="hljs-subst">#gs</span>"</span>, <span class="hljs-string">"hu"</span>, <span class="hljs-string">"hug"</span>] | |
| Corpus: (<span class="hljs-string">"hug"</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#g</span>"</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">"b"</span> <span class="hljs-string">"#<span class="hljs-subst">#u</span>"</span> <span class="hljs-string">"#<span class="hljs-subst">#n</span>"</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">"hu"</span> <span class="hljs-string">"#<span class="hljs-subst">#gs</span>"</span>, <span class="hljs-number">5</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1bxgya3">y continuamos como esto hasta que alcancemos el tamaño de vocabulario deseado.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1ppumjp">✏️ <strong>Ahora es tu turno!</strong> Cuál será la siguiente regla de fusioń?</p></div> <h2 class="relative group"><a id="tokenization-algorithm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tokenization-algorithm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Algoritmo de Tokenización</span></h2> <p data-svelte-h="svelte-3rztuk">La tokenización difiere en WordPiece y BPE en que WordPiece sólo guarda el vocabulario final, no las reglas de fusión aprendidas. Comenzando a partir de la palabra a tokenizar, WordPiece encuentra la subpalabra más larga que está en el vocabulario, luego la separa. Por ejemplo, su usamos el vocabulario aprendido en el ejemplo anterior, para la palabra <code>"hugs"</code> la subpalabra más larga comenzando desde el inicio que está dentro del vocabulario es <code>"hug"</code>, por lo que separamos ahí y obtenemos <code>["hug", "##s"]</code>. Luego continuamos con <code>"##s"</code>, el cuál está en el vocabulario, por lo que la tokenización de <code>"hugs"</code> es <code>["hug", "##s"]</code>.</p> <p data-svelte-h="svelte-10uttaj">Con BPE, habríamos aplicado las fusiones aprendidas en orden y tokenizado esto como <code>["hu", "##gs"]</code>, por lo que la codificación es diferente.</p> <p data-svelte-h="svelte-rmaccr">Como otro ejemplo, veamos como la palabra <code>"bugs"</code> sería tokenizado. <code>"b"</code> es la subpalabra más larga comenzando del inicio de la palabra que está en el vocabulario, por lo que separamos ahí y obtenemos <code>["b", "##ugs"]</code>. Luego <code>"##u"</code> es la subpalabra más larga somenzando desde el inicio de <code>"##ugs"</code> que está en el vocabulario, por lo que separamos ahí y obtenemos <code>["b", "##u, "##gs"]</code>. Finalmente, <code>"##gs"</code> está en el vocabulario, por lo que esta última lista es la tokenización de <code>"bugs"</code>.</p> <p data-svelte-h="svelte-kd7qkr">Cuando la tokenización llega a la etapa donde ya no es posible encontrar una subpalabra en el vocabulario, la palabra entera es tokenizada como desconocida — Por ejemplo, <code>"mug"</code> sería tokenizada como <code>["[UNK]"]</code>, al igual que <code>"bum"</code> (incluso si podemos comenzar con <code>"b"</code> y <code>"##u"</code>, <code>"##m"</code> no está en el vocabulario, y la tokenización resultante será sólo <code>["[UNK]"]</code>, y no <code>["b", "##u", "[UNK]"]</code>). Este es otra diferencia con respecto a BPE, el cual sólo clasificaría los caracteres individuales que no están en el vocabulario como desconocido.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1abemxt">✏️ <strong>Ahora es tu turno!</strong> ¿Cómo se tokenizaría la palabra <code>"pugs"</code>?</p></div> <h2 class="relative group"><a id="implementing-wordpiece" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#implementing-wordpiece"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Implementando WordPiece</span></h2> <p data-svelte-h="svelte-kj5p78">Ahora echemos un vistazo a una implementación del algoritmo WordPiece. Al igual que BPE, este es sólo pedagócico y no podrás aplicar esto en corpus grande.</p> <p data-svelte-h="svelte-1tcizvu">Usaremos el mismo corpus que en el ejemplo de BPE:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->corpus = [ | |
| <span class="hljs-string">"This is the Hugging Face Course."</span>, | |
| <span class="hljs-string">"This chapter is about tokenization."</span>, | |
| <span class="hljs-string">"This section shows several tokenizer algorithms."</span>, | |
| <span class="hljs-string">"Hopefully, you will be able to understand how they are trained and generate tokens."</span>, | |
| ]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-uklssn">Primero, necesitamos pre-tokenizar el corpus en palabras. Dado que estamos replicando el tokenizador WordPiece (como BERT), usaremos el tokenizador <code>bert-base-cased</code> para la pre-tokenización:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"bert-base-cased"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-4qz6iz">Luego calculamos las frecuencias de cada palabra en el corpus mientras hacemos la pre-tokenización:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> defaultdict | |
| word_freqs = defaultdict(<span class="hljs-built_in">int</span>) | |
| <span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> corpus: | |
| words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text) | |
| new_words = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets] | |
| <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> new_words: | |
| word_freqs[word] += <span class="hljs-number">1</span> | |
| word_freqs<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->defaultdict( | |
| <span class="hljs-built_in">int</span>, {<span class="hljs-string">'This'</span>: <span class="hljs-number">3</span>, <span class="hljs-string">'is'</span>: <span class="hljs-number">2</span>, <span class="hljs-string">'the'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Hugging'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Face'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Course'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'.'</span>: <span class="hljs-number">4</span>, <span class="hljs-string">'chapter'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'about'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">'tokenization'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'section'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'shows'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'several'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'tokenizer'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'algorithms'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Hopefully'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">','</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'you'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'will'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'be'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'able'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'to'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'understand'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'how'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'they'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'are'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">'trained'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'and'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'generate'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'tokens'</span>: <span class="hljs-number">1</span>})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1exookl">Como vimos antes, el alfabeto es el único conjunto compuesto de todas las primeras letras de las palabras, y todas las otras letras que aparecen con el prefijo <code>##</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->alphabet = [] | |
| <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys(): | |
| <span class="hljs-keyword">if</span> word[<span class="hljs-number">0</span>] <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> alphabet: | |
| alphabet.append(word[<span class="hljs-number">0</span>]) | |
| <span class="hljs-keyword">for</span> letter <span class="hljs-keyword">in</span> word[<span class="hljs-number">1</span>:]: | |
| <span class="hljs-keyword">if</span> <span class="hljs-string">f"##<span class="hljs-subst">{letter}</span>"</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> alphabet: | |
| alphabet.append(<span class="hljs-string">f"##<span class="hljs-subst">{letter}</span>"</span>) | |
| alphabet.sort() | |
| alphabet | |
| <span class="hljs-built_in">print</span>(alphabet)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'##a'</span>, <span class="hljs-string">'##b'</span>, <span class="hljs-string">'##c'</span>, <span class="hljs-string">'##d'</span>, <span class="hljs-string">'##e'</span>, <span class="hljs-string">'##f'</span>, <span class="hljs-string">'##g'</span>, <span class="hljs-string">'##h'</span>, <span class="hljs-string">'##i'</span>, <span class="hljs-string">'##k'</span>, <span class="hljs-string">'##l'</span>, <span class="hljs-string">'##m'</span>, <span class="hljs-string">'##n'</span>, <span class="hljs-string">'##o'</span>, <span class="hljs-string">'##p'</span>, <span class="hljs-string">'##r'</span>, <span class="hljs-string">'##s'</span>, | |
| <span class="hljs-string">'##t'</span>, <span class="hljs-string">'##u'</span>, <span class="hljs-string">'##v'</span>, <span class="hljs-string">'##w'</span>, <span class="hljs-string">'##y'</span>, <span class="hljs-string">'##z'</span>, <span class="hljs-string">','</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'C'</span>, <span class="hljs-string">'F'</span>, <span class="hljs-string">'H'</span>, <span class="hljs-string">'T'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'b'</span>, <span class="hljs-string">'c'</span>, <span class="hljs-string">'g'</span>, <span class="hljs-string">'h'</span>, <span class="hljs-string">'i'</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'t'</span>, <span class="hljs-string">'u'</span>, | |
| <span class="hljs-string">'w'</span>, <span class="hljs-string">'y'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fcw3da">También agregamos los tokens especiales usados por el modelo al inicio de ese vocabulario. En el caso de BERT, es la lista <code>["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->vocab = [<span class="hljs-string">"[PAD]"</span>, <span class="hljs-string">"[UNK]"</span>, <span class="hljs-string">"[CLS]"</span>, <span class="hljs-string">"[SEP]"</span>, <span class="hljs-string">"[MASK]"</span>] + alphabet.copy()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rlai7c">A continuación necesitamos separar cada palabra, con todas las letras que no tienen el prefijo <code>##</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->splits = { | |
| word: [c <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-string">f"##<span class="hljs-subst">{c}</span>"</span> <span class="hljs-keyword">for</span> i, c <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(word)] | |
| <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys() | |
| }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nodrch">Ahora que estamos listos para el entrenamiento, escribamos una función que calcule el puntaje para cada par. Usaremos esto en cada etapa del entrenamiento:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_pair_scores</span>(<span class="hljs-params">splits</span>): | |
| letter_freqs = defaultdict(<span class="hljs-built_in">int</span>) | |
| pair_freqs = defaultdict(<span class="hljs-built_in">int</span>) | |
| <span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items(): | |
| split = splits[word] | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>: | |
| letter_freqs[split[<span class="hljs-number">0</span>]] += freq | |
| <span class="hljs-keyword">continue</span> | |
| <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>): | |
| pair = (split[i], split[i + <span class="hljs-number">1</span>]) | |
| letter_freqs[split[i]] += freq | |
| pair_freqs[pair] += freq | |
| letter_freqs[split[-<span class="hljs-number">1</span>]] += freq | |
| scores = { | |
| pair: freq / (letter_freqs[pair[<span class="hljs-number">0</span>]] * letter_freqs[pair[<span class="hljs-number">1</span>]]) | |
| <span class="hljs-keyword">for</span> pair, freq <span class="hljs-keyword">in</span> pair_freqs.items() | |
| } | |
| <span class="hljs-keyword">return</span> scores<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wchqi3">Echemos un vistazo a parte de este diccionario luego de las separaciones iniciales:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pair_scores = compute_pair_scores(splits) | |
| <span class="hljs-keyword">for</span> i, key <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(pair_scores.keys()): | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"<span class="hljs-subst">{key}</span>: <span class="hljs-subst">{pair_scores[key]}</span>"</span>) | |
| <span class="hljs-keyword">if</span> i >= <span class="hljs-number">5</span>: | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">'T'</span>, <span class="hljs-string">'##h'</span>): <span class="hljs-number">0.125</span> | |
| (<span class="hljs-string">'##h'</span>, <span class="hljs-string">'##i'</span>): <span class="hljs-number">0.03409090909090909</span> | |
| (<span class="hljs-string">'##i'</span>, <span class="hljs-string">'##s'</span>): <span class="hljs-number">0.02727272727272727</span> | |
| (<span class="hljs-string">'i'</span>, <span class="hljs-string">'##s'</span>): <span class="hljs-number">0.1</span> | |
| (<span class="hljs-string">'t'</span>, <span class="hljs-string">'##h'</span>): <span class="hljs-number">0.03571428571428571</span> | |
| (<span class="hljs-string">'##h'</span>, <span class="hljs-string">'##e'</span>): <span class="hljs-number">0.011904761904761904</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-bo5zx0">Ahora, encontrar el par con el mejor puntaje sólo toma un rápido ciclo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->best_pair = <span class="hljs-string">""</span> | |
| max_score = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">for</span> pair, score <span class="hljs-keyword">in</span> pair_scores.items(): | |
| <span class="hljs-keyword">if</span> max_score <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_score < score: | |
| best_pair = pair | |
| max_score = score | |
| <span class="hljs-built_in">print</span>(best_pair, max_score)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-string">'a'</span>, <span class="hljs-string">'##b'</span>) <span class="hljs-number">0.2</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-k1zuko">Por lo que la primera fusión a aprender es <code>('a', '##b') -> 'ab'</code>, y agregamos <code>'ab'</code> al vocabulario:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->vocab.append(<span class="hljs-string">"ab"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-pfmm82">Para continuar, necesitamos aplicar esa fusión en nuestro diccionario de separaciones (<code>splits</code> dictionary). Escribamos otra función para esto:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">merge_pair</span>(<span class="hljs-params">a, b, splits</span>): | |
| <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs: | |
| split = splits[word] | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>: | |
| <span class="hljs-keyword">continue</span> | |
| i = <span class="hljs-number">0</span> | |
| <span class="hljs-keyword">while</span> i < <span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>: | |
| <span class="hljs-keyword">if</span> split[i] == a <span class="hljs-keyword">and</span> split[i + <span class="hljs-number">1</span>] == b: | |
| merge = a + b[<span class="hljs-number">2</span>:] <span class="hljs-keyword">if</span> b.startswith(<span class="hljs-string">"##"</span>) <span class="hljs-keyword">else</span> a + b | |
| split = split[:i] + [merge] + split[i + <span class="hljs-number">2</span> :] | |
| <span class="hljs-keyword">else</span>: | |
| i += <span class="hljs-number">1</span> | |
| splits[word] = split | |
| <span class="hljs-keyword">return</span> splits<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kl1g0x">Y podemos mirar el resultado de la primera fusión:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->splits = merge_pair(<span class="hljs-string">"a"</span>, <span class="hljs-string">"##b"</span>, splits) | |
| splits[<span class="hljs-string">"about"</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'ab'</span>, <span class="hljs-string">'##o'</span>, <span class="hljs-string">'##u'</span>, <span class="hljs-string">'##t'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-wgf1cr">Ahora tenemos todos los que necesitamos para iterar hasta haber aprendido todas las fusiones que queramos. Apuntemos a un tamaño de vocabulario de 70:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->vocab_size = <span class="hljs-number">70</span> | |
| <span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(vocab) < vocab_size: | |
| scores = compute_pair_scores(splits) | |
| best_pair, max_score = <span class="hljs-string">""</span>, <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">for</span> pair, score <span class="hljs-keyword">in</span> scores.items(): | |
| <span class="hljs-keyword">if</span> max_score <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_score < score: | |
| best_pair = pair | |
| max_score = score | |
| splits = merge_pair(*best_pair, splits) | |
| new_token = ( | |
| best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>][<span class="hljs-number">2</span>:] | |
| <span class="hljs-keyword">if</span> best_pair[<span class="hljs-number">1</span>].startswith(<span class="hljs-string">"##"</span>) | |
| <span class="hljs-keyword">else</span> best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>] | |
| ) | |
| vocab.append(new_token)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kb4vz1">Luego podemos ver el vocabulario generado:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(vocab)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'[PAD]'</span>, <span class="hljs-string">'[UNK]'</span>, <span class="hljs-string">'[CLS]'</span>, <span class="hljs-string">'[SEP]'</span>, <span class="hljs-string">'[MASK]'</span>, <span class="hljs-string">'##a'</span>, <span class="hljs-string">'##b'</span>, <span class="hljs-string">'##c'</span>, <span class="hljs-string">'##d'</span>, <span class="hljs-string">'##e'</span>, <span class="hljs-string">'##f'</span>, <span class="hljs-string">'##g'</span>, <span class="hljs-string">'##h'</span>, <span class="hljs-string">'##i'</span>, <span class="hljs-string">'##k'</span>, | |
| <span class="hljs-string">'##l'</span>, <span class="hljs-string">'##m'</span>, <span class="hljs-string">'##n'</span>, <span class="hljs-string">'##o'</span>, <span class="hljs-string">'##p'</span>, <span class="hljs-string">'##r'</span>, <span class="hljs-string">'##s'</span>, <span class="hljs-string">'##t'</span>, <span class="hljs-string">'##u'</span>, <span class="hljs-string">'##v'</span>, <span class="hljs-string">'##w'</span>, <span class="hljs-string">'##y'</span>, <span class="hljs-string">'##z'</span>, <span class="hljs-string">','</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'C'</span>, <span class="hljs-string">'F'</span>, <span class="hljs-string">'H'</span>, | |
| <span class="hljs-string">'T'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'b'</span>, <span class="hljs-string">'c'</span>, <span class="hljs-string">'g'</span>, <span class="hljs-string">'h'</span>, <span class="hljs-string">'i'</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'t'</span>, <span class="hljs-string">'u'</span>, <span class="hljs-string">'w'</span>, <span class="hljs-string">'y'</span>, <span class="hljs-string">'ab'</span>, <span class="hljs-string">'##fu'</span>, <span class="hljs-string">'Fa'</span>, <span class="hljs-string">'Fac'</span>, <span class="hljs-string">'##ct'</span>, <span class="hljs-string">'##ful'</span>, <span class="hljs-string">'##full'</span>, <span class="hljs-string">'##fully'</span>, | |
| <span class="hljs-string">'Th'</span>, <span class="hljs-string">'ch'</span>, <span class="hljs-string">'##hm'</span>, <span class="hljs-string">'cha'</span>, <span class="hljs-string">'chap'</span>, <span class="hljs-string">'chapt'</span>, <span class="hljs-string">'##thm'</span>, <span class="hljs-string">'Hu'</span>, <span class="hljs-string">'Hug'</span>, <span class="hljs-string">'Hugg'</span>, <span class="hljs-string">'sh'</span>, <span class="hljs-string">'th'</span>, <span class="hljs-string">'is'</span>, <span class="hljs-string">'##thms'</span>, <span class="hljs-string">'##za'</span>, <span class="hljs-string">'##zat'</span>, | |
| <span class="hljs-string">'##ut'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1eqqial">Como podemos ver, comparado con BPE, este tokenizador aprende partes de palabras como tokens un poco más rápido.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1k620vj">💡 Usar <code>train_new_from_iterator()</code> en el mismo corpus no resultará en exactamente el mismo vocabulario. Esto porque la librería 🤗 Tokenizers no implementa WordPiece para el entrenamiento (dado que no estamos completamente seguros de su funcionamiento interno), en vez de eso utiliza BPE.</p></div> <p data-svelte-h="svelte-gxxlml">Para tokenizar un nuevo texto, lo pre-tokenizamos, lo separamos, y luego aplicamos el algoritmo de tokenización para cada palabra. Es decir, miramos la subpalabra más grande comenzando al inicio de la primera palabra y la separamos, luego repetimos el proceso en la segunda parte, y así pará el resto de dicha palabra y de las siguientes palabras en el texto:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">encode_word</span>(<span class="hljs-params">word</span>): | |
| tokens = [] | |
| <span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(word) > <span class="hljs-number">0</span>: | |
| i = <span class="hljs-built_in">len</span>(word) | |
| <span class="hljs-keyword">while</span> i > <span class="hljs-number">0</span> <span class="hljs-keyword">and</span> word[:i] <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> vocab: | |
| i -= <span class="hljs-number">1</span> | |
| <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>: | |
| <span class="hljs-keyword">return</span> [<span class="hljs-string">"[UNK]"</span>] | |
| tokens.append(word[:i]) | |
| word = word[i:] | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(word) > <span class="hljs-number">0</span>: | |
| word = <span class="hljs-string">f"##<span class="hljs-subst">{word}</span>"</span> | |
| <span class="hljs-keyword">return</span> tokens<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rmo9di">Probémoslo en una palabra que esté en el vocabulario, y en otra que no esté:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">"Hugging"</span>)) | |
| <span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">"HOgging"</span>))<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'Hugg'</span>, <span class="hljs-string">'##i'</span>, <span class="hljs-string">'##n'</span>, <span class="hljs-string">'##g'</span>] | |
| [<span class="hljs-string">'[UNK]'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-s23tws">Ahora, escribamos una función que tokenize un texto:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize</span>(<span class="hljs-params">text</span>): | |
| pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text) | |
| pre_tokenized_text = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> pre_tokenize_result] | |
| encoded_words = [encode_word(word) <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> pre_tokenized_text] | |
| <span class="hljs-keyword">return</span> <span class="hljs-built_in">sum</span>(encoded_words, [])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tuu6w3">Podemos probar en cualquier texto:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenize(<span class="hljs-string">"This is the Hugging Face course!"</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">'Th'</span>, <span class="hljs-string">'##i'</span>, <span class="hljs-string">'##s'</span>, <span class="hljs-string">'is'</span>, <span class="hljs-string">'th'</span>, <span class="hljs-string">'##e'</span>, <span class="hljs-string">'Hugg'</span>, <span class="hljs-string">'##i'</span>, <span class="hljs-string">'##n'</span>, <span class="hljs-string">'##g'</span>, <span class="hljs-string">'Fac'</span>, <span class="hljs-string">'##e'</span>, <span class="hljs-string">'c'</span>, <span class="hljs-string">'##o'</span>, <span class="hljs-string">'##u'</span>, <span class="hljs-string">'##r'</span>, <span class="hljs-string">'##s'</span>, | |
| <span class="hljs-string">'##e'</span>, <span class="hljs-string">'[UNK]'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7e6vha">Eso es todo para el algoritmo WordPiece! Ahora echemos un visto a Unigram.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/es/chapter6/6.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_brjtkr = { | |
| assets: "/docs/course/pr_1069/es", | |
| base: "/docs/course/pr_1069/es", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1069/es/_app/immutable/entry/start.b7b528c6.js"), | |
| import("/docs/course/pr_1069/es/_app/immutable/entry/app.ea1cc000.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 40], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 94.6 kB
- Xet hash:
- 264a8984b11f60115027e1dd9f49f2a67058b75003329256f873d75089a0cecb
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.