| <link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Summary of the tokenizers","local":"summary-of-the-tokenizers","sections":[{"title":"Introduction","local":"introduction","sections":[],"depth":2},{"title":"Subword tokenization","local":"subword-tokenization","sections":[{"title":"Byte-Pair Encoding (BPE)","local":"byte-pair-encoding-bpe","sections":[{"title":"Byte-level BPE","local":"byte-level-bpe","sections":[],"depth":4}],"depth":3},{"title":"WordPiece","local":"wordpiece","sections":[],"depth":3},{"title":"Unigram","local":"unigram","sections":[],"depth":3},{"title":"SentencePiece","local":"sentencepiece","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="summary-of-the-tokenizers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#summary-of-the-tokenizers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Summary of the tokenizers</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <p data-svelte-h="svelte-1h07nug">On this page, we will have a closer look at tokenization.</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/VFp38yj8h3A" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-ej4gda">As we saw in <a href="preprocessing">the preprocessing tutorial</a>, tokenizing a text is splitting it into words or | |
| subwords, which then are converted to ids through a look-up table. Converting words or subwords to ids is | |
| straightforward, so in this summary, we will focus on splitting a text into words or subwords (i.e. tokenizing a text). | |
| More specifically, we will look at the three main types of tokenizers used in ๐ค Transformers: <a href="#byte-pair-encoding">Byte-Pair Encoding | |
| (BPE)</a>, <a href="#wordpiece">WordPiece</a>, and <a href="#sentencepiece">SentencePiece</a>, and show examples | |
| of which tokenizer type is used by which model.</p> <p data-svelte-h="svelte-1rtpvmf">Note that on each model page, you can look at the documentation of the associated tokenizer to know which tokenizer | |
| type was used by the pretrained model. For instance, if we look at <a href="/docs/transformers/pr_33913/en/model_doc/bert#transformers.BertTokenizer">BertTokenizer</a>, we can see | |
| that the model uses <a href="#wordpiece">WordPiece</a>.</p> <h2 class="relative group"><a id="introduction" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#introduction"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Introduction</span></h2> <p data-svelte-h="svelte-67zbo9">Splitting a text into smaller chunks is a task that is harder than it looks, and there are multiple ways of doing so. | |
For instance, let's look at the sentence `"Don't you love 🤗 Transformers? We sure do."`

<Youtube id="nhJxYji1aho"/>

A simple way of tokenizing this text is to split it by spaces, which would give:

```
["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."]
```

This is a sensible first step, but if we look at the tokens `"Transformers?"` and `"do."`, we notice that the punctuation is attached to the words `"Transformers"` and `"do"`, which is suboptimal. We should take the punctuation into account so that a model does not have to learn a different representation of a word for every possible punctuation symbol that could follow it, which would explode the number of representations the model has to learn. Taking punctuation into account, tokenizing our example text would give:

```
["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."]
```

Better. However, the way the tokenization dealt with the word `"Don't"` is still a drawback. `"Don't"` stands for `"do not"`, so it would be better tokenized as `["Do", "n't"]`. This is where things start getting complicated, and it is part of the reason each model has its own tokenizer type. Depending on the rules we apply for tokenizing a text, a different tokenized output is generated for the same text. A pretrained model only performs properly if you feed it an input that was tokenized with the same rules that were used to tokenize its training data.
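To make the two splitting strategies above concrete, here is a minimal sketch in plain Python (the regular expression is only an illustrative approximation, not the rule set of any real tokenizer):

```python
import re

text = "Don't you love 🤗 Transformers? We sure do."

# Space tokenization: punctuation stays glued to the words.
print(text.split())
# ["Don't", 'you', 'love', '🤗', 'Transformers?', 'We', 'sure', 'do.']

# Space and punctuation tokenization: split punctuation off as separate tokens.
print(re.findall(r"\w+|[^\w\s]", text))
# ['Don', "'", 't', 'you', 'love', '🤗', 'Transformers', '?', 'We', 'sure', 'do', '.']
```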
[spaCy](https://spacy.io/) and [Moses](http://www.statmt.org/moses/?n=Development.GetStarted) are two popular rule-based tokenizers. Applying them to our example, *spaCy* and *Moses* would output something like:

```
["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."]
```

As can be seen, space and punctuation tokenization as well as rule-based tokenization are used here. Space and punctuation tokenization and rule-based tokenization are both examples of word tokenization, which is loosely defined as splitting sentences into words. While it's the most intuitive way to split texts into smaller chunks, this tokenization method can lead to problems for massive text corpora. In this case, space and punctuation tokenization usually generates a very big vocabulary (the set of all unique words and tokens used). *E.g.*, [Transformer XL](model_doc/transfo-xl) uses space and punctuation tokenization, resulting in a vocabulary size of 267,735!

Such a big vocabulary size forces the model to have an enormous embedding matrix as the input and output layer, which increases both memory and time complexity. In general, transformers models rarely have a vocabulary size greater than 50,000, especially if they are pretrained only on a single language.
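To get a feeling for the scale, a quick back-of-the-envelope calculation (the hidden size of 1024 is an assumed value for illustration only; actual model dimensions vary):

```python
# The embedding matrix alone has vocab_size * hidden_size parameters.
vocab_size, hidden_size = 267_735, 1024  # hypothetical hidden size
print(f"{vocab_size * hidden_size:,} embedding parameters")
# 274,160,640 embedding parameters
```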
So if simple space and punctuation tokenization is unsatisfactory, why not simply tokenize on characters?

<Youtube id="ssLq_EK2jLE"/>

While character tokenization is very simple and would greatly reduce memory and time complexity, it makes it much harder for the model to learn meaningful input representations. *E.g.* learning a meaningful context-independent representation for the letter `"t"` is much harder than learning a context-independent representation for the word `"today"`. Therefore, character tokenization is often accompanied by a loss of performance. So to get the best of both worlds, transformers models use a hybrid between word-level and character-level tokenization called **subword** tokenization.
## Subword tokenization

<Youtube id="zHvTiHr506c"/>

Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller subwords, but rare words should be decomposed into meaningful subwords. For instance `"annoyingly"` might be considered a rare word and could be decomposed into `"annoying"` and `"ly"`. Both `"annoying"` and `"ly"` as stand-alone subwords would appear more frequently, while at the same time the meaning of `"annoyingly"` is kept by the composite meaning of `"annoying"` and `"ly"`. This is especially useful in agglutinative languages such as Turkish, where you can form (almost) arbitrarily long complex words by stringing together subwords.

Subword tokenization allows the model to have a reasonable vocabulary size while being able to learn meaningful context-independent representations. In addition, subword tokenization enables the model to process words it has never seen before, by decomposing them into known subwords. For instance, the [BertTokenizer](model_doc/bert#transformers.BertTokenizer) tokenizes `"I have a new GPU!"` as follows:

```python
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
>>> tokenizer.tokenize("I have a new GPU!")
["i", "have", "a", "new", "gp", "##u", "!"]
```
| [<span class="hljs-string">"i"</span>, <span class="hljs-string">"have"</span>, <span class="hljs-string">"a"</span>, <span class="hljs-string">"new"</span>, <span class="hljs-string">"gp"</span>, <span class="hljs-string">"##u"</span>, <span class="hljs-string">"!"</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mzno7">Because we are considering the uncased model, the sentence was lowercased first. We can see that the words <code>["i", "have", "a", "new"]</code> are present in the tokenizerโs vocabulary, but the word <code>"gpu"</code> is not. Consequently, the | |
| tokenizer splits <code>"gpu"</code> into known subwords: <code>["gp" and "##u"]</code>. <code>"##"</code> means that the rest of the token should | |
| be attached to the previous one, without space (for decoding or reversal of the tokenization).</p> <p data-svelte-h="svelte-n44w6">As another example, <a href="/docs/transformers/pr_33913/en/model_doc/xlnet#transformers.XLNetTokenizer">XLNetTokenizer</a> tokenizes our previously exemplary text as follows:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> XLNetTokenizer | |
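As a small illustration of that convention, the tokens above could be rejoined into a readable string like this (a sketch, not the tokenizer's actual decoding implementation):

```python
tokens = ["i", "have", "a", "new", "gp", "##u", "!"]

words = []
for token in tokens:
    if token.startswith("##"):
        words[-1] += token[2:]  # glue the continuation onto the previous token
    else:
        words.append(token)

print(" ".join(words))  # i have a new gpu !
```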
| <span class="hljs-meta">>>> </span>tokenizer = XLNetTokenizer.from_pretrained(<span class="hljs-string">"xlnet/xlnet-base-cased"</span>) | |
| <span class="hljs-meta">>>> </span>tokenizer.tokenize(<span class="hljs-string">"Don't you love ๐ค Transformers? We sure do."</span>) | |
| [<span class="hljs-string">"โDon"</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">"t"</span>, <span class="hljs-string">"โyou"</span>, <span class="hljs-string">"โlove"</span>, <span class="hljs-string">"โ"</span>, <span class="hljs-string">"๐ค"</span>, <span class="hljs-string">"โ"</span>, <span class="hljs-string">"Transform"</span>, <span class="hljs-string">"ers"</span>, <span class="hljs-string">"?"</span>, <span class="hljs-string">"โWe"</span>, <span class="hljs-string">"โsure"</span>, <span class="hljs-string">"โdo"</span>, <span class="hljs-string">"."</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-uhdxhc">Weโll get back to the meaning of those <code>"โ"</code> when we look at <a href="#sentencepiece">SentencePiece</a>. As one can see, | |
| the rare word <code>"Transformers"</code> has been split into the more frequent subwords <code>"Transform"</code> and <code>"ers"</code>.</p> <p data-svelte-h="svelte-129rtng">Letโs now look at how the different subword tokenization algorithms work. Note that all of those tokenization | |
| algorithms rely on some form of training which is usually done on the corpus the corresponding model will be trained | |
<a id="byte-pair-encoding"></a>

### Byte-Pair Encoding (BPE)

Byte-Pair Encoding (BPE) was introduced in [Neural Machine Translation of Rare Words with Subword Units (Sennrich et al., 2015)](https://arxiv.org/abs/1508.07909). BPE relies on a pre-tokenizer that splits the training data into words. Pre-tokenization can be as simple as space tokenization, e.g. [GPT-2](model_doc/gpt2), [RoBERTa](model_doc/roberta). More advanced pre-tokenization includes rule-based tokenization, e.g. [XLM](model_doc/xlm) and [FlauBERT](model_doc/flaubert), which use Moses for most languages, or [GPT](model_doc/openai-gpt), which uses spaCy and ftfy, to count the frequency of each word in the training corpus.

After pre-tokenization, a set of unique words has been created and the frequency with which each word occurred in the training data has been determined. Next, BPE creates a base vocabulary consisting of all symbols that occur in the set of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until the vocabulary has attained the desired vocabulary size. Note that the desired vocabulary size is a hyperparameter to define before training the tokenizer.

As an example, let's assume that after pre-tokenization, the following set of words including their frequencies has been determined:
```
("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5)
```

Consequently, the base vocabulary is `["b", "g", "h", "n", "p", "s", "u"]`. Splitting all words into symbols of the base vocabulary, we obtain:

```
("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5)
```

BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs most frequently. In the example above, `"h"` followed by `"u"` is present *10 + 5 = 15* times (10 times in the 10 occurrences of `"hug"`, 5 times in the 5 occurrences of `"hugs"`). However, the most frequent symbol pair is `"u"` followed by `"g"`, occurring *10 + 5 + 5 = 20* times in total. Thus, the first merge rule the tokenizer learns is to group all `"u"` symbols followed by a `"g"` symbol together. Next, `"ug"` is added to the vocabulary. The set of words then becomes

```
("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5)
```

BPE then identifies the next most common symbol pair. It's `"u"` followed by `"n"`, which occurs *12 + 4 = 16* times. `"u"`, `"n"` is merged to `"un"` and added to the vocabulary. The next most frequent symbol pair is `"h"` followed by `"ug"`, occurring *10 + 5 = 15* times. Again the pair is merged and `"hug"` can be added to the vocabulary.

At this stage, the vocabulary is `["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"]` and our set of unique words is represented as

```
("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5)
```

Assuming that the Byte-Pair Encoding training would stop at this point, the learned merge rules would then be applied to new words (as long as those new words do not include symbols that were not in the base vocabulary). For instance, the word `"bug"` would be tokenized to `["b", "ug"]` but `"mug"` would be tokenized as `["<unk>", "ug"]` since the symbol `"m"` is not in the base vocabulary. In general, single letters such as `"m"` are not replaced by the `"<unk>"` symbol because the training data usually includes at least one occurrence of each letter, but it is likely to happen for very special characters like emojis.
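The training loop described above is small enough to sketch in a few lines of Python. This is a toy illustration of the algorithm, not the optimized implementation in 🤗 Tokenizers:

```python
from collections import Counter

# The toy corpus from above: each word split into base symbols, with its frequency.
corpus = {("h", "u", "g"): 10, ("p", "u", "g"): 5, ("p", "u", "n"): 12,
          ("b", "u", "n"): 4, ("h", "u", "g", "s"): 5}

def merge_step(corpus):
    # Count every adjacent symbol pair, weighted by word frequency.
    pairs = Counter()
    for word, freq in corpus.items():
        for pair in zip(word, word[1:]):
            pairs[pair] += freq
    best = max(pairs, key=pairs.get)  # the most frequent pair becomes a merge rule
    new_corpus = {}
    for word, freq in corpus.items():  # apply the new merge rule everywhere
        symbols, i = [], 0
        while i < len(word):
            if i + 1 < len(word) and (word[i], word[i + 1]) == best:
                symbols.append(word[i] + word[i + 1])
                i += 2
            else:
                symbols.append(word[i])
                i += 1
        new_corpus[tuple(symbols)] = freq
    return new_corpus, best

for _ in range(3):  # learn the three merges walked through above
    corpus, rule = merge_step(corpus)
    print(rule)
# ('u', 'g')
# ('u', 'n')
# ('h', 'ug')
```

Encoding a new word such as `"bug"` then amounts to splitting it into base symbols and replaying the learned merge rules in order.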
As mentioned earlier, the vocabulary size, *i.e.* the base vocabulary size + the number of merges, is a hyperparameter to choose. For instance [GPT](model_doc/openai-gpt) has a vocabulary size of 40,478 since they have 478 base characters and chose to stop training after 40,000 merges.

#### Byte-level BPE

A base vocabulary that includes all possible base characters can be quite large if *e.g.* all unicode characters are considered as base characters. To have a better base vocabulary, [GPT-2](https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) uses bytes as the base vocabulary, which is a clever trick to force the base vocabulary to be of size 256 while ensuring that every base character is included in the vocabulary. With some additional rules to deal with punctuation, GPT-2's tokenizer can tokenize every text without the need for the `<unk>` symbol. [GPT-2](model_doc/gpt2) has a vocabulary size of 50,257, which corresponds to the 256 byte base tokens, a special end-of-text token, and the symbols learned with 50,000 merges.
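A quick check of both claims with the pretrained GPT-2 tokenizer (model id as hosted on the Hugging Face Hub); the emoji from our earlier example is simply broken into several byte-level tokens instead of becoming `<unk>`:

```python
>>> from transformers import GPT2Tokenizer
>>> tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2")
>>> tokenizer.vocab_size
50257
>>> tokens = tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
>>> tokenizer.unk_token in tokens  # byte-level coverage: no <unk> needed
False
```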
<a id="wordpiece"></a>

### WordPiece

WordPiece is the subword tokenization algorithm used for [BERT](model_doc/bert), [DistilBERT](model_doc/distilbert), and [Electra](model_doc/electra). The algorithm was outlined in [Japanese and Korean Voice Search (Schuster et al., 2012)](https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf) and is very similar to BPE. WordPiece first initializes the vocabulary to include every character present in the training data and progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the most frequent symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary.

So what does this mean exactly? Referring to the previous example, maximizing the likelihood of the training data is equivalent to finding the symbol pair whose probability divided by the probabilities of its first symbol followed by its second symbol is the greatest among all symbol pairs. *E.g.* `"u"`, followed by `"g"` would only have been merged if the probability of `"ug"` divided by `"u"`, `"g"` would have been greater than for any other symbol pair. Intuitively, WordPiece is slightly different from BPE in that it evaluates what it *loses* by merging two symbols to ensure it's *worth it*.
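Using the toy counts from the BPE section, the merge score WordPiece maximizes could be sketched as follows (an illustrative computation, not the original training code):

```python
def wordpiece_score(pair_count, count_a, count_b, total_symbols):
    # p(ab) / (p(a) * p(b)): how much likelihood is gained by merging "a" and "b".
    p_ab = pair_count / total_symbols
    p_a = count_a / total_symbols
    p_b = count_b / total_symbols
    return p_ab / (p_a * p_b)

# "u" followed by "g" occurs 20 times; "u" occurs 36 times and "g" 20 times,
# out of 113 symbols in the toy corpus.
print(wordpiece_score(20, 36, 20, 113))  # ~3.14
```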
<a id="unigram"></a>

### Unigram

Unigram is a subword tokenization algorithm introduced in [Subword Regularization: Improving Neural Network Translation Models with Multiple Subword Candidates (Kudo, 2018)](https://arxiv.org/pdf/1804.10959.pdf). In contrast to BPE or WordPiece, Unigram initializes its base vocabulary to a large number of symbols and progressively trims it down to obtain a smaller vocabulary. The base vocabulary could for instance correspond to all pre-tokenized words and the most common substrings. Unigram is not used directly for any of the models in 🤗 Transformers, but it's used in conjunction with [SentencePiece](#sentencepiece).

At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training data given the current vocabulary and a unigram language model. Then, for each symbol in the vocabulary, the algorithm computes how much the overall loss would increase if the symbol were removed from the vocabulary. Unigram then removes p percent (with p usually being 10% or 20%) of the symbols whose loss increase is the lowest, *i.e.* those symbols that least affect the overall loss over the training data. This process is repeated until the vocabulary has reached the desired size. The Unigram algorithm always keeps the base characters so that any word can be tokenized.
Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary:

```
["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"],
```

`"hugs"` could be tokenized as `["hug", "s"]`, `["h", "ug", "s"]` or `["h", "u", "g", "s"]`. So which one to choose? Unigram saves the probability of each token in the training corpus on top of saving the vocabulary, so that the probability of each possible tokenization can be computed after training. In practice, the algorithm simply picks the most likely tokenization, but it also offers the possibility to sample one of the possible tokenizations according to their probabilities.
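A sketch of how such saved probabilities decide between the candidate tokenizations of `"hugs"` (the token probabilities below are made up for illustration):

```python
import math

# Hypothetical unigram probabilities stored next to the vocabulary.
p = {"h": 0.05, "u": 0.06, "g": 0.05, "s": 0.04, "ug": 0.03, "hug": 0.02}

candidates = [["hug", "s"], ["h", "ug", "s"], ["h", "u", "g", "s"]]

def likelihood(tokens):
    # The probability of a tokenization is the product of its token probabilities.
    return math.prod(p[t] for t in tokens)

for tokens in candidates:
    print(tokens, likelihood(tokens))
print("picked:", max(candidates, key=likelihood))  # ['hug', 's'] is most likely here
```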
Those probabilities are defined by the loss the tokenizer is trained on. Assuming that the training data consists of the words \\(x_{1}, \dots, x_{N}\\) and that the set of all possible tokenizations for a word \\(x_{i}\\) is defined as \\(S(x_{i})\\), then the overall loss is defined as

$$\mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )$$
<a id="sentencepiece"></a>

### SentencePiece

All tokenization algorithms described so far have the same problem: it is assumed that the input text uses spaces to separate words. However, not all languages use spaces to separate words. One possible solution is to use language-specific pre-tokenizers, *e.g.* [XLM](model_doc/xlm) uses a specific Chinese, Japanese, and Thai pre-tokenizer. To solve this problem more generally, [SentencePiece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing (Kudo et al., 2018)](https://arxiv.org/pdf/1808.06226.pdf) treats the input as a raw input stream, thus including the space in the set of characters to use. It then uses the BPE or unigram algorithm to construct the appropriate vocabulary.

The [XLNetTokenizer](model_doc/xlnet#transformers.XLNetTokenizer) uses SentencePiece, for example, which is also why in the example earlier the `"▁"` character was included in the vocabulary. Decoding with SentencePiece is very easy since all tokens can just be concatenated and `"▁"` is replaced by a space.
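As a sketch of that decoding step, applied to the XLNet output from earlier:

```python
tokens = ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]

# Concatenate everything, then turn each "▁" back into a space.
print("".join(tokens).replace("▁", " ").strip())
# Don't you love 🤗 Transformers? We sure do.
```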
All transformers models in the library that use SentencePiece use it in combination with unigram. Examples of models using SentencePiece are [ALBERT](model_doc/albert), [XLNet](model_doc/xlnet), [Marian](model_doc/marian), and [T5](model_doc/t5).