Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / course /pr_1069 /en /_app /immutable /nodes /75.4e6d989a.js

rtrm's picture

about 1 month ago

80.8 kB

	import{s as Vi,o as Wi,n as Bi}from"../chunks/scheduler.37c15a92.js";import{S as _i,i as Qi,g as i,s as n,r as o,A as Ai,h as p,f as s,c as a,j as Zi,u as c,x as r,k as pa,y as Ri,a as l,v as M,d as m,t as y,w as h}from"../chunks/index.7cb9c9b8.js";import{T as Ni}from"../chunks/Tip.d10b3fc9.js";import{Y as Xi}from"../chunks/Youtube.8666c400.js";import{C as u}from"../chunks/CodeBlock.abae2786.js";import{C as Gi}from"../chunks/CourseFloatingBanner.df82c153.js";import{H as ms,E as qi}from"../chunks/getInferenceSnippets.f9350a3f.js";function Li(ys){let J,j=`<strong>To go further</strong> If you test the two versions of the previous normalizers on a string containing the unicode character <code>u"\\u0085"</code> you will surely notice that these two normalizers are not exactly equivalent.
	To not over-complicate the version with <code>normalizers.Sequence</code> too much , we haven’t included the Regex replacements that the <code>BertNormalizer</code> requires when the <code>clean_text</code> argument is set to <code>True</code> - which is the default behavior. But don’t worry: it is possible to get exactly the same normalization without using the handy <code>BertNormalizer</code> by adding two <code>normalizers.Replace</code>’s to the normalizers sequence.`;return{c(){J=i("p"),J.innerHTML=j},l(d){J=p(d,"P",{"data-svelte-h":!0}),r(J)!=="svelte-fwgfxt"&&(J.innerHTML=j)},m(d,cs){l(d,J,cs)},p:Bi,d(d){d&&s(J)}}}function Hi(ys){let J,j,d,cs,k,hs,w,us,b,ra="As we’ve seen in the previous sections, tokenization comprises several steps:",Js,$,oa="<li>Normalization (any cleanup of the text that is deemed necessary, such as removing spaces or accents, Unicode normalization, etc.)</li> <li>Pre-tokenization (splitting the input into words)</li> <li>Running the input through the model (using the pre-tokenized words to produce a sequence of tokens)</li> <li>Post-processing (adding the special tokens of the tokenizer, generating the attention mask and token type IDs)</li>",ds,g,ca="As a reminder, here’s another look at the overall process:",fs,f,Ma='<img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline.svg" alt="The tokenization pipeline."/> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter6/tokenization_pipeline-dark.svg" alt="The tokenization pipeline."/>',Ts,U,ma='The 🤗 Tokenizers library has been built to provide several options for each of those steps, which you can mix and match together. In this section we’ll see how we can build a tokenizer from scratch, as opposed to training a new tokenizer from an old one as we did in <a href="/course/chapter6/2">section 2</a>. You’ll then be able to build any kind of tokenizer you can think of!',js,x,ks,z,ya="More precisely, the library is built around a central <code>Tokenizer</code> class with the building blocks regrouped in submodules:",ws,I,ha='<li><code>normalizers</code> contains all the possible types of <code>Normalizer</code> you can use (complete list <a href="https://huggingface.co/docs/tokenizers/api/normalizers" rel="nofollow">here</a>).</li> <li><code>pre_tokenizers</code> contains all the possible types of <code>PreTokenizer</code> you can use (complete list <a href="https://huggingface.co/docs/tokenizers/api/pre-tokenizers" rel="nofollow">here</a>).</li> <li><code>models</code> contains the various types of <code>Model</code> you can use, like <code>BPE</code>, <code>WordPiece</code>, and <code>Unigram</code> (complete list <a href="https://huggingface.co/docs/tokenizers/api/models" rel="nofollow">here</a>).</li> <li><code>trainers</code> contains all the different types of <code>Trainer</code> you can use to train your model on a corpus (one per type of model; complete list <a href="https://huggingface.co/docs/tokenizers/api/trainers" rel="nofollow">here</a>).</li> <li><code>post_processors</code> contains the various types of <code>PostProcessor</code> you can use (complete list <a href="https://huggingface.co/docs/tokenizers/api/post-processors" rel="nofollow">here</a>).</li> <li><code>decoders</code> contains the various types of <code>Decoder</code> you can use to decode the outputs of tokenization (complete list <a href="https://huggingface.co/docs/tokenizers/components#decoders" rel="nofollow">here</a>).</li>',bs,C,ua='You can find the whole list of building blocks <a href="https://huggingface.co/docs/tokenizers/components" rel="nofollow">here</a>.',$s,v,gs,Z,Ja='To train our new tokenizer, we will use a small corpus of text (so the examples run fast). The steps for acquiring the corpus are similar to the ones we took at the <a href="/course/chapter6/2">beginning of this chapter</a>, but this time we’ll use the <a href="https://huggingface.co/datasets/wikitext" rel="nofollow">WikiText-2</a> dataset:',Us,V,xs,W,da="The function <code>get_training_corpus()</code> is a generator that will yield batches of 1,000 texts, which we will use to train the tokenizer.",zs,B,fa="🤗 Tokenizers can also be trained on text files directly. Here’s how we can generate a text file containing all the texts/inputs from WikiText-2 that we can use locally:",Is,_,Cs,Q,Ta="Next we’ll show you how to build your own BERT, GPT-2, and XLNet tokenizers, block by block. That will give us an example of each of the three main tokenization algorithms: WordPiece, BPE, and Unigram. Let’s start with BERT!",vs,A,Zs,R,ja="To build a tokenizer with the 🤗 Tokenizers library, we start by instantiating a <code>Tokenizer</code> object with a <code>model</code>, then set its <code>normalizer</code>, <code>pre_tokenizer</code>, <code>post_processor</code>, and <code>decoder</code> attributes to the values we want.",Vs,N,ka="For this example, we’ll create a <code>Tokenizer</code> with a WordPiece model:",Ws,X,Bs,G,wa="We have to specify the <code>unk_token</code> so the model knows what to return when it encounters characters it hasn’t seen before. Other arguments we can set here include the <code>vocab</code> of our model (we’re going to train the model, so we don’t need to set this) and <code>max_input_chars_per_word</code>, which specifies a maximum length for each word (words longer than the value passed will be split).",_s,q,ba="The first step of tokenization is normalization, so let’s begin with that. Since BERT is widely used, there is a <code>BertNormalizer</code> with the classic options we can set for BERT: <code>lowercase</code> and <code>strip_accents</code>, which are self-explanatory; <code>clean_text</code> to remove all control characters and replace repeating spaces with a single one; and <code>handle_chinese_chars</code>, which places spaces around Chinese characters. To replicate the <code>bert-base-uncased</code> tokenizer, we can just set this normalizer:",Qs,L,As,H,$a="Generally speaking, however, when building a new tokenizer you won’t have access to such a handy normalizer already implemented in the 🤗 Tokenizers library — so let’s see how to create the BERT normalizer by hand. The library provides a <code>Lowercase</code> normalizer and a <code>StripAccents</code> normalizer, and you can compose several normalizers using a <code>Sequence</code>:",Rs,D,Ns,E,ga="We’re also using an <code>NFD</code> Unicode normalizer, as otherwise the <code>StripAccents</code> normalizer won’t properly recognize the accented characters and thus won’t strip them out.",Xs,S,Ua="As we’ve seen before, we can use the <code>normalize_str()</code> method of the <code>normalizer</code> to check out the effects it has on a given text:",Gs,P,qs,F,Ls,T,Hs,Y,xa="Next is the pre-tokenization step. Again, there is a prebuilt <code>BertPreTokenizer</code> that we can use:",Ds,K,Es,O,za="Or we can build it from scratch:",Ss,ee,Ps,te,Ia="Note that the <code>Whitespace</code> pre-tokenizer splits on whitespace and all characters that are not letters, digits, or the underscore character, so it technically splits on whitespace and punctuation:",Fs,se,Ys,le,Ks,ne,Ca="If you only want to split on whitespace, you should use the <code>WhitespaceSplit</code> pre-tokenizer instead:",Os,ae,el,ie,tl,pe,va="Like with normalizers, you can use a <code>Sequence</code> to compose several pre-tokenizers:",sl,re,ll,oe,nl,ce,Za="The next step in the tokenization pipeline is running the inputs through the model. We already specified our model in the initialization, but we still need to train it, which will require a <code>WordPieceTrainer</code>. The main thing to remember when instantiating a trainer in 🤗 Tokenizers is that you need to pass it all the special tokens you intend to use — otherwise it won’t add them to the vocabulary, since they are not in the training corpus:",al,Me,il,me,Va="As well as specifying the <code>vocab_size</code> and <code>special_tokens</code>, we can set the <code>min_frequency</code> (the number of times a token must appear to be included in the vocabulary) or change the <code>continuing_subword_prefix</code> (if we want to use something different from <code>##</code>).",pl,ye,Wa="To train our model using the iterator we defined earlier, we just have to execute this command:",rl,he,ol,ue,Ba="We can also use text files to train our tokenizer, which would look like this (we reinitialize the model with an empty <code>WordPiece</code> beforehand):",cl,Je,Ml,de,_a="In both cases, we can then test the tokenizer on a text by calling the <code>encode()</code> method:",ml,fe,yl,Te,hl,je,Qa="The <code>encoding</code> obtained is an <code>Encoding</code>, which contains all the necessary outputs of the tokenizer in its various attributes: <code>ids</code>, <code>type_ids</code>, <code>tokens</code>, <code>offsets</code>, <code>attention_mask</code>, <code>special_tokens_mask</code>, and <code>overflowing</code>.",ul,ke,Aa="The last step in the tokenization pipeline is post-processing. We need to add the <code>[CLS]</code> token at the beginning and the <code>[SEP]</code> token at the end (or after each sentence, if we have a pair of sentences). We will use a <code>TemplateProcessor</code> for this, but first we need to know the IDs of the <code>[CLS]</code> and <code>[SEP]</code> tokens in the vocabulary:",Jl,we,dl,be,fl,$e,Ra="To write the template for the <code>TemplateProcessor</code>, we have to specify how to treat a single sentence and a pair of sentences. For both, we write the special tokens we want to use; the first (or single) sentence is represented by <code>$A</code>, while the second sentence (if encoding a pair) is represented by <code>$B</code>. For each of these (special tokens and sentences), we also specify the corresponding token type ID after a colon.",Tl,ge,Na="The classic BERT template is thus defined as follows:",jl,Ue,kl,xe,Xa="Note that we need to pass along the IDs of the special tokens, so the tokenizer can properly convert them to their IDs.",wl,ze,Ga="Once this is added, going back to our previous example will give:",bl,Ie,$l,Ce,gl,ve,qa="And on a pair of sentences, we get the proper result:",Ul,Ze,xl,Ve,zl,We,La="We’ve almost finished building this tokenizer from scratch — the last step is to include a decoder:",Il,Be,Cl,_e,Ha="Let’s test it on our previous <code>encoding</code>:",vl,Qe,Zl,Ae,Vl,Re,Da="Great! We can save our tokenizer in a single JSON file like this:",Wl,Ne,Bl,Xe,Ea="We can then reload that file in a <code>Tokenizer</code> object with the <code>from_file()</code> method:",_l,Ge,Ql,qe,Sa="To use this tokenizer in 🤗 Transformers, we have to wrap it in a <code>PreTrainedTokenizerFast</code>. We can either use the generic class or, if our tokenizer corresponds to an existing model, use that class (here, <code>BertTokenizerFast</code>). If you apply this lesson to build a brand new tokenizer, you will have to use the first option.",Al,Le,Pa="To wrap the tokenizer in a <code>PreTrainedTokenizerFast</code>, we can either pass the tokenizer we built as a <code>tokenizer_object</code> or pass the tokenizer file we saved as <code>tokenizer_file</code>. The key thing to remember is that we have to manually set all the special tokens, since that class can’t infer from the <code>tokenizer</code> object which token is the mask token, the <code>[CLS]</code> token, etc.:",Rl,He,Nl,De,Fa="If you are using a specific tokenizer class (like <code>BertTokenizerFast</code>), you will only need to specify the special tokens that are different from the default ones (here, none):",Xl,Ee,Gl,Se,Ya="You can then use this tokenizer like any other 🤗 Transformers tokenizer. You can save it with the <code>save_pretrained()</code> method, or upload it to the Hub with the <code>push_to_hub()</code> method.",ql,Pe,Ka="Now that we’ve seen how to build a WordPiece tokenizer, let’s do the same for a BPE tokenizer. We’ll go a bit faster since you know all the steps, and only highlight the differences.",Ll,Fe,Hl,Ye,Oa="Let’s now build a GPT-2 tokenizer. Like for the BERT tokenizer, we start by initializing a <code>Tokenizer</code> with a BPE model:",Dl,Ke,El,Oe,ei="Also like for BERT, we could initialize this model with a vocabulary if we had one (we would need to pass the <code>vocab</code> and <code>merges</code> in this case), but since we will train from scratch, we don’t need to do that. We also don’t need to specify an <code>unk_token</code> because GPT-2 uses byte-level BPE, which doesn’t require it.",Sl,et,ti="GPT-2 does not use a normalizer, so we skip that step and go directly to the pre-tokenization:",Pl,tt,Fl,st,si="The option we added to <code>ByteLevel</code> here is to not add a space at the beginning of a sentence (which is the default otherwise). We can have a look at the pre-tokenization of an example text like before:",Yl,lt,Kl,nt,Ol,at,li="Next is the model, which needs training. For GPT-2, the only special token is the end-of-text token:",en,it,tn,pt,ni="Like with the <code>WordPieceTrainer</code>, as well as the <code>vocab_size</code> and <code>special_tokens</code>, we can specify the <code>min_frequency</code> if we want to, or if we have an end-of-word suffix (like <code></w></code>), we can set it with <code>end_of_word_suffix</code>.",sn,rt,ai="This tokenizer can also be trained on text files:",ln,ot,nn,ct,ii="Let’s have a look at the tokenization of a sample text:",an,Mt,pn,mt,rn,yt,pi="We apply the byte-level post-processing for the GPT-2 tokenizer as follows:",on,ht,cn,ut,ri="The <code>trim_offsets = False</code> option indicates to the post-processor that we should leave the offsets of tokens that begin with ‘Ġ’ as they are: this way the start of the offsets will point to the space before the word, not the first character of the word (since the space is technically part of the token). Let’s have a look at the result with the text we just encoded, where <code>'Ġtest'</code> is the token at index 4:",Mn,Jt,mn,dt,yn,ft,oi="Finally, we add a byte-level decoder:",hn,Tt,un,jt,ci="and we can double-check it works properly:",Jn,kt,dn,wt,fn,bt,Mi="Great! Now that we’re done, we can save the tokenizer like before, and wrap it in a <code>PreTrainedTokenizerFast</code> or <code>GPT2TokenizerFast</code> if we want to use it in 🤗 Transformers:",Tn,$t,jn,gt,mi="or:",kn,Ut,wn,xt,yi="As the last example, we’ll show you how to build a Unigram tokenizer from scratch.",bn,zt,$n,It,hi="Let’s now build an XLNet tokenizer. Like for the previous tokenizers, we start by initializing a <code>Tokenizer</code> with a Unigram model:",gn,Ct,Un,vt,ui="Again, we could initialize this model with a vocabulary if we had one.",xn,Zt,Ji="For the normalization, XLNet uses a few replacements (which come from SentencePiece):",zn,Vt,In,Wt,di="This replaces <code>“</code> and <code>”</code> with <code>”</code> and any sequence of two or more spaces with a single space, as well as removing the accents in the texts to tokenize.",Cn,Bt,fi="The pre-tokenizer to use for any SentencePiece tokenizer is <code>Metaspace</code>:",vn,_t,Zn,Qt,Ti="We can have a look at the pre-tokenization of an example text like before:",Vn,At,Wn,Rt,Bn,Nt,ji="Next is the model, which needs training. XLNet has quite a few special tokens:",_n,Xt,Qn,Gt,ki="A very important argument not to forget for the <code>UnigramTrainer</code> is the <code>unk_token</code>. We can also pass along other arguments specific to the Unigram algorithm, such as the <code>shrinking_factor</code> for each step where we remove tokens (defaults to 0.75) or the <code>max_piece_length</code> to specify the maximum length of a given token (defaults to 16).",An,qt,wi="This tokenizer can also be trained on text files:",Rn,Lt,Nn,Ht,bi="Let’s have a look at the tokenization of a sample text:",Xn,Dt,Gn,Et,qn,St,$i="A peculiarity of XLNet is that it puts the <code><cls></code> token at the end of the sentence, with a type ID of 2 (to distinguish it from the other tokens). It’s padding on the left, as a result. We can deal with all the special tokens and token type IDs with a template, like for BERT, but first we have to get the IDs of the <code><cls></code> and <code><sep></code> tokens:",Ln,Pt,Hn,Ft,Dn,Yt,gi="The template looks like this:",En,Kt,Sn,Ot,Ui="And we can test it works by encoding a pair of sentences:",Pn,es,Fn,ts,Yn,ss,xi="Finally, we add a <code>Metaspace</code> decoder:",Kn,ls,On,ns,zi="and we’re done with this tokenizer! We can save the tokenizer like before, and wrap it in a <code>PreTrainedTokenizerFast</code> or <code>XLNetTokenizerFast</code> if we want to use it in 🤗 Transformers. One thing to note when using <code>PreTrainedTokenizerFast</code> is that on top of the special tokens, we need to tell the 🤗 Transformers library to pad on the left:",ea,as,ta,is,Ii="Or alternatively:",sa,ps,la,rs,Ci="Now that you have seen how the various building blocks are used to build existing tokenizers, you should be able to write any tokenizer you want with the 🤗 Tokenizers library and be able to use it in 🤗 Transformers.",na,os,aa,Ms,ia;return k=new ms({props:{title:"Building a tokenizer, block by block",local:"building-a-tokenizer-block-by-block",headingTag:"h1"}}),w=new Gi({props:{chapter:6,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section8.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section8.ipynb"}]}}),x=new Xi({props:{id:"MR8tZm5ViWU"}}),v=new ms({props:{title:"Acquiring a corpus",local:"acquiring-a-corpus",headingTag:"h2"}}),V=new u({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ3aWtpdGV4dCUyMiUyQyUyMG5hbWUlM0QlMjJ3aWtpdGV4dC0yLXJhdy12MSUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBJTBBJTBBZGVmJTIwZ2V0X3RyYWluaW5nX2NvcnB1cygpJTNBJTBBJTIwJTIwJTIwJTIwZm9yJTIwaSUyMGluJTIwcmFuZ2UoMCUyQyUyMGxlbihkYXRhc2V0KSUyQyUyMDEwMDApJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIweWllbGQlMjBkYXRhc2V0JTVCaSUyMCUzQSUyMGklMjAlMkIlMjAxMDAwJTVEJTVCJTIydGV4dCUyMiU1RA==",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset

	dataset = load_dataset(<span class="hljs-string">"wikitext"</span>, name=<span class="hljs-string">"wikitext-2-raw-v1"</span>, split=<span class="hljs-string">"train"</span>)


	<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_training_corpus</span>():
	<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(dataset), <span class="hljs-number">1000</span>):
	<span class="hljs-keyword">yield</span> dataset[i : i + <span class="hljs-number">1000</span>][<span class="hljs-string">"text"</span>]`,wrap:!1}}),_=new u({props:{code:"d2l0aCUyMG9wZW4oJTIyd2lraXRleHQtMi50eHQlMjIlMkMlMjAlMjJ3JTIyJTJDJTIwZW5jb2RpbmclM0QlMjJ1dGYtOCUyMiklMjBhcyUyMGYlM0ElMEElMjAlMjAlMjAlMjBmb3IlMjBpJTIwaW4lMjByYW5nZShsZW4oZGF0YXNldCkpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZi53cml0ZShkYXRhc2V0JTVCaSU1RCU1QiUyMnRleHQlMjIlNUQlMjAlMkIlMjAlMjIlNUNuJTIyKQ==",highlighted:`<span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(<span class="hljs-string">"wikitext-2.txt"</span>, <span class="hljs-string">"w"</span>, encoding=<span class="hljs-string">"utf-8"</span>) <span class="hljs-keyword">as</span> f:
	<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(dataset)):
	f.write(dataset[i][<span class="hljs-string">"text"</span>] + <span class="hljs-string">"\\n"</span>)`,wrap:!1}}),A=new ms({props:{title:"Building a WordPiece tokenizer from scratch",local:"building-a-wordpiece-tokenizer-from-scratch",headingTag:"h2"}}),X=new u({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjAoJTBBJTIwJTIwJTIwJTIwZGVjb2RlcnMlMkMlMEElMjAlMjAlMjAlMjBtb2RlbHMlMkMlMEElMjAlMjAlMjAlMjBub3JtYWxpemVycyUyQyUwQSUyMCUyMCUyMCUyMHByZV90b2tlbml6ZXJzJTJDJTBBJTIwJTIwJTIwJTIwcHJvY2Vzc29ycyUyQyUwQSUyMCUyMCUyMCUyMHRyYWluZXJzJTJDJTBBJTIwJTIwJTIwJTIwVG9rZW5pemVyJTJDJTBBKSUwQSUwQXRva2VuaXplciUyMCUzRCUyMFRva2VuaXplcihtb2RlbHMuV29yZFBpZWNlKHVua190b2tlbiUzRCUyMiU1QlVOSyU1RCUyMikp",highlighted:`<span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> (
	decoders,
	models,
	normalizers,
	pre_tokenizers,
	processors,
	trainers,
	Tokenizer,
	)

	tokenizer = Tokenizer(models.WordPiece(unk_token=<span class="hljs-string">"[UNK]"</span>))`,wrap:!1}}),L=new u({props:{code:"dG9rZW5pemVyLm5vcm1hbGl6ZXIlMjAlM0QlMjBub3JtYWxpemVycy5CZXJ0Tm9ybWFsaXplcihsb3dlcmNhc2UlM0RUcnVlKQ==",highlighted:'tokenizer.normalizer = normalizers.BertNormalizer(lowercase=<span class="hljs-literal">True</span>)',wrap:!1}}),D=new u({props:{code:"dG9rZW5pemVyLm5vcm1hbGl6ZXIlMjAlM0QlMjBub3JtYWxpemVycy5TZXF1ZW5jZSglMEElMjAlMjAlMjAlMjAlNUJub3JtYWxpemVycy5ORkQoKSUyQyUyMG5vcm1hbGl6ZXJzLkxvd2VyY2FzZSgpJTJDJTIwbm9ybWFsaXplcnMuU3RyaXBBY2NlbnRzKCklNUQlMEEp",highlighted:`tokenizer.normalizer = normalizers.<span class="hljs-type">Sequence</span>(
	[normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
	)`,wrap:!1}}),P=new u({props:{code:"cHJpbnQodG9rZW5pemVyLm5vcm1hbGl6ZXIubm9ybWFsaXplX3N0ciglMjJIJUMzJUE5bGwlQzMlQjIlMjBoJUMzJUI0dyUyMGFyZSUyMCVDMyVCQyUzRiUyMikp",highlighted:'<span class="hljs-built_in">print</span>(tokenizer.normalizer.normalize_str(<span class="hljs-string">"Héllò hôw are ü?"</span>))',wrap:!1}}),F=new u({props:{code:"aGVsbG8lMjBob3clMjBhcmUlMjB1JTNG",highlighted:"hello how are u?",wrap:!1}}),T=new Ni({props:{$$slots:{default:[Li]},$$scope:{ctx:ys}}}),K=new u({props:{code:"dG9rZW5pemVyLnByZV90b2tlbml6ZXIlMjAlM0QlMjBwcmVfdG9rZW5pemVycy5CZXJ0UHJlVG9rZW5pemVyKCk=",highlighted:"tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()",wrap:!1}}),ee=new u({props:{code:"dG9rZW5pemVyLnByZV90b2tlbml6ZXIlMjAlM0QlMjBwcmVfdG9rZW5pemVycy5XaGl0ZXNwYWNlKCk=",highlighted:"tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()",wrap:!1}}),se=new u({props:{code:"dG9rZW5pemVyLnByZV90b2tlbml6ZXIucHJlX3Rva2VuaXplX3N0ciglMjJMZXQncyUyMHRlc3QlMjBteSUyMHByZS10b2tlbml6ZXIuJTIyKQ==",highlighted:'tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test my pre-tokenizer."</span>)',wrap:!1}}),le=new u({props:{code:"JTVCKCdMZXQnJTJDJTIwKDAlMkMlMjAzKSklMkMlMjAoJTIyJyUyMiUyQyUyMCgzJTJDJTIwNCkpJTJDJTIwKCdzJyUyQyUyMCg0JTJDJTIwNSkpJTJDJTIwKCd0ZXN0JyUyQyUyMCg2JTJDJTIwMTApKSUyQyUyMCgnbXknJTJDJTIwKDExJTJDJTIwMTMpKSUyQyUyMCgncHJlJyUyQyUyMCgxNCUyQyUyMDE3KSklMkMlMEElMjAoJy0nJTJDJTIwKDE3JTJDJTIwMTgpKSUyQyUyMCgndG9rZW5pemVyJyUyQyUyMCgxOCUyQyUyMDI3KSklMkMlMjAoJy4nJTJDJTIwKDI3JTJDJTIwMjgpKSU1RA==",highlighted:`[(<span class="hljs-string">'Let'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">"'"</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">4</span>)), (<span class="hljs-string">'s'</span>, (<span class="hljs-number">4</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'test'</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'my'</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span class="hljs-string">'pre'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">17</span>)),
	(<span class="hljs-string">'-'</span>, (<span class="hljs-number">17</span>, <span class="hljs-number">18</span>)), (<span class="hljs-string">'tokenizer'</span>, (<span class="hljs-number">18</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">'.'</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]`,wrap:!1}}),ae=new u({props:{code:"cHJlX3Rva2VuaXplciUyMCUzRCUyMHByZV90b2tlbml6ZXJzLldoaXRlc3BhY2VTcGxpdCgpJTBBcHJlX3Rva2VuaXplci5wcmVfdG9rZW5pemVfc3RyKCUyMkxldCdzJTIwdGVzdCUyMG15JTIwcHJlLXRva2VuaXplci4lMjIp",highlighted:`pre_tokenizer = pre_tokenizers.WhitespaceSplit()
	pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test my pre-tokenizer."</span>)`,wrap:!1}}),ie=new u({props:{code:"JTVCKCUyMkxldCdzJTIyJTJDJTIwKDAlMkMlMjA1KSklMkMlMjAoJ3Rlc3QnJTJDJTIwKDYlMkMlMjAxMCkpJTJDJTIwKCdteSclMkMlMjAoMTElMkMlMjAxMykpJTJDJTIwKCdwcmUtdG9rZW5pemVyLiclMkMlMjAoMTQlMkMlMjAyOCkpJTVE",highlighted:'[(<span class="hljs-string">"Let's"</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'test'</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'my'</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span class="hljs-string">'pre-tokenizer.'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">28</span>))]',wrap:!1}}),re=new u({props:{code:"cHJlX3Rva2VuaXplciUyMCUzRCUyMHByZV90b2tlbml6ZXJzLlNlcXVlbmNlKCUwQSUyMCUyMCUyMCUyMCU1QnByZV90b2tlbml6ZXJzLldoaXRlc3BhY2VTcGxpdCgpJTJDJTIwcHJlX3Rva2VuaXplcnMuUHVuY3R1YXRpb24oKSU1RCUwQSklMEFwcmVfdG9rZW5pemVyLnByZV90b2tlbml6ZV9zdHIoJTIyTGV0J3MlMjB0ZXN0JTIwbXklMjBwcmUtdG9rZW5pemVyLiUyMik=",highlighted:`pre_tokenizer = pre_tokenizers.<span class="hljs-type">Sequence</span>(
	[pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
	)
	pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test my pre-tokenizer."</span>)`,wrap:!1}}),oe=new u({props:{code:"JTVCKCdMZXQnJTJDJTIwKDAlMkMlMjAzKSklMkMlMjAoJTIyJyUyMiUyQyUyMCgzJTJDJTIwNCkpJTJDJTIwKCdzJyUyQyUyMCg0JTJDJTIwNSkpJTJDJTIwKCd0ZXN0JyUyQyUyMCg2JTJDJTIwMTApKSUyQyUyMCgnbXknJTJDJTIwKDExJTJDJTIwMTMpKSUyQyUyMCgncHJlJyUyQyUyMCgxNCUyQyUyMDE3KSklMkMlMEElMjAoJy0nJTJDJTIwKDE3JTJDJTIwMTgpKSUyQyUyMCgndG9rZW5pemVyJyUyQyUyMCgxOCUyQyUyMDI3KSklMkMlMjAoJy4nJTJDJTIwKDI3JTJDJTIwMjgpKSU1RA==",highlighted:`[(<span class="hljs-string">'Let'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">"'"</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">4</span>)), (<span class="hljs-string">'s'</span>, (<span class="hljs-number">4</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'test'</span>, (<span class="hljs-number">6</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'my'</span>, (<span class="hljs-number">11</span>, <span class="hljs-number">13</span>)), (<span class="hljs-string">'pre'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">17</span>)),
	(<span class="hljs-string">'-'</span>, (<span class="hljs-number">17</span>, <span class="hljs-number">18</span>)), (<span class="hljs-string">'tokenizer'</span>, (<span class="hljs-number">18</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">'.'</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]`,wrap:!1}}),Me=new u({props:{code:"c3BlY2lhbF90b2tlbnMlMjAlM0QlMjAlNUIlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjIlNUJQQUQlNUQlMjIlMkMlMjAlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlMkMlMjAlMjIlNUJNQVNLJTVEJTIyJTVEJTBBdHJhaW5lciUyMCUzRCUyMHRyYWluZXJzLldvcmRQaWVjZVRyYWluZXIodm9jYWJfc2l6ZSUzRDI1MDAwJTJDJTIwc3BlY2lhbF90b2tlbnMlM0RzcGVjaWFsX3Rva2Vucyk=",highlighted:`special_tokens = [<span class="hljs-string">"[UNK]"</span>, <span class="hljs-string">"[PAD]"</span>, <span class="hljs-string">"[CLS]"</span>, <span class="hljs-string">"[SEP]"</span>, <span class="hljs-string">"[MASK]"</span>]
	trainer = trainers.WordPieceTrainer(vocab_size=<span class="hljs-number">25000</span>, special_tokens=special_tokens)`,wrap:!1}}),he=new u({props:{code:"dG9rZW5pemVyLnRyYWluX2Zyb21faXRlcmF0b3IoZ2V0X3RyYWluaW5nX2NvcnB1cygpJTJDJTIwdHJhaW5lciUzRHRyYWluZXIp",highlighted:"tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)",wrap:!1}}),Je=new u({props:{code:"dG9rZW5pemVyLm1vZGVsJTIwJTNEJTIwbW9kZWxzLldvcmRQaWVjZSh1bmtfdG9rZW4lM0QlMjIlNUJVTkslNUQlMjIpJTBBdG9rZW5pemVyLnRyYWluKCU1QiUyMndpa2l0ZXh0LTIudHh0JTIyJTVEJTJDJTIwdHJhaW5lciUzRHRyYWluZXIp",highlighted:`tokenizer.model = models.WordPiece(unk_token=<span class="hljs-string">"[UNK]"</span>)
	tokenizer.train([<span class="hljs-string">"wikitext-2.txt"</span>], trainer=trainer)`,wrap:!1}}),fe=new u({props:{code:"ZW5jb2RpbmclMjAlM0QlMjB0b2tlbml6ZXIuZW5jb2RlKCUyMkxldCdzJTIwdGVzdCUyMHRoaXMlMjB0b2tlbml6ZXIuJTIyKSUwQXByaW50KGVuY29kaW5nLnRva2Vucyk=",highlighted:`encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer."</span>)
	<span class="hljs-built_in">print</span>(encoding.tokens)`,wrap:!1}}),Te=new u({props:{code:"JTVCJ2xldCclMkMlMjAlMjInJTIyJTJDJTIwJ3MnJTJDJTIwJ3Rlc3QnJTJDJTIwJ3RoaXMnJTJDJTIwJ3RvayclMkMlMjAnJTIzJTIzZW5pJyUyQyUyMCclMjMlMjN6ZXInJTJDJTIwJy4nJTVE",highlighted:'[<span class="hljs-string">'let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'test'</span>, <span class="hljs-string">'this'</span>, <span class="hljs-string">'tok'</span>, <span class="hljs-string">'##eni'</span>, <span class="hljs-string">'##zer'</span>, <span class="hljs-string">'.'</span>]',wrap:!1}}),we=new u({props:{code:"Y2xzX3Rva2VuX2lkJTIwJTNEJTIwdG9rZW5pemVyLnRva2VuX3RvX2lkKCUyMiU1QkNMUyU1RCUyMiklMEFzZXBfdG9rZW5faWQlMjAlM0QlMjB0b2tlbml6ZXIudG9rZW5fdG9faWQoJTIyJTVCU0VQJTVEJTIyKSUwQXByaW50KGNsc190b2tlbl9pZCUyQyUyMHNlcF90b2tlbl9pZCk=",highlighted:`cls_token_id = tokenizer.token_to_id(<span class="hljs-string">"[CLS]"</span>)
	sep_token_id = tokenizer.token_to_id(<span class="hljs-string">"[SEP]"</span>)
	<span class="hljs-built_in">print</span>(cls_token_id, sep_token_id)`,wrap:!1}}),be=new u({props:{code:"KDIlMkMlMjAzKQ==",highlighted:'(<span class="hljs-number">2</span>, <span class="hljs-number">3</span>)',wrap:!1}}),Ue=new u({props:{code:"dG9rZW5pemVyLnBvc3RfcHJvY2Vzc29yJTIwJTNEJTIwcHJvY2Vzc29ycy5UZW1wbGF0ZVByb2Nlc3NpbmcoJTBBJTIwJTIwJTIwJTIwc2luZ2xlJTNEZiUyMiU1QkNMUyU1RCUzQTAlMjAlMjRBJTNBMCUyMCU1QlNFUCU1RCUzQTAlMjIlMkMlMEElMjAlMjAlMjAlMjBwYWlyJTNEZiUyMiU1QkNMUyU1RCUzQTAlMjAlMjRBJTNBMCUyMCU1QlNFUCU1RCUzQTAlMjAlMjRCJTNBMSUyMCU1QlNFUCU1RCUzQTElMjIlMkMlMEElMjAlMjAlMjAlMjBzcGVjaWFsX3Rva2VucyUzRCU1QiglMjIlNUJDTFMlNUQlMjIlMkMlMjBjbHNfdG9rZW5faWQpJTJDJTIwKCUyMiU1QlNFUCU1RCUyMiUyQyUyMHNlcF90b2tlbl9pZCklNUQlMkMlMEEp",highlighted:`tokenizer.post_processor = processors.TemplateProcessing(
	single=<span class="hljs-string">f"[CLS]:0 $A:0 [SEP]:0"</span>,
	pair=<span class="hljs-string">f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1"</span>,
	special_tokens=[(<span class="hljs-string">"[CLS]"</span>, cls_token_id), (<span class="hljs-string">"[SEP]"</span>, sep_token_id)],
	)`,wrap:!1}}),Ie=new u({props:{code:"ZW5jb2RpbmclMjAlM0QlMjB0b2tlbml6ZXIuZW5jb2RlKCUyMkxldCdzJTIwdGVzdCUyMHRoaXMlMjB0b2tlbml6ZXIuJTIyKSUwQXByaW50KGVuY29kaW5nLnRva2Vucyk=",highlighted:`encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer."</span>)
	<span class="hljs-built_in">print</span>(encoding.tokens)`,wrap:!1}}),Ce=new u({props:{code:"JTVCJyU1QkNMUyU1RCclMkMlMjAnbGV0JyUyQyUyMCUyMiclMjIlMkMlMjAncyclMkMlMjAndGVzdCclMkMlMjAndGhpcyclMkMlMjAndG9rJyUyQyUyMCclMjMlMjNlbmknJTJDJTIwJyUyMyUyM3plciclMkMlMjAnLiclMkMlMjAnJTVCU0VQJTVEJyU1RA==",highlighted:'[<span class="hljs-string">'[CLS]'</span>, <span class="hljs-string">'let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'test'</span>, <span class="hljs-string">'this'</span>, <span class="hljs-string">'tok'</span>, <span class="hljs-string">'##eni'</span>, <span class="hljs-string">'##zer'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'[SEP]'</span>]',wrap:!1}}),Ze=new u({props:{code:"ZW5jb2RpbmclMjAlM0QlMjB0b2tlbml6ZXIuZW5jb2RlKCUyMkxldCdzJTIwdGVzdCUyMHRoaXMlMjB0b2tlbml6ZXIuLi4lMjIlMkMlMjAlMjJvbiUyMGElMjBwYWlyJTIwb2YlMjBzZW50ZW5jZXMuJTIyKSUwQXByaW50KGVuY29kaW5nLnRva2VucyklMEFwcmludChlbmNvZGluZy50eXBlX2lkcyk=",highlighted:`encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer..."</span>, <span class="hljs-string">"on a pair of sentences."</span>)
	<span class="hljs-built_in">print</span>(encoding.tokens)
	<span class="hljs-built_in">print</span>(encoding.type_ids)`,wrap:!1}}),Ve=new u({props:{code:"JTVCJyU1QkNMUyU1RCclMkMlMjAnbGV0JyUyQyUyMCUyMiclMjIlMkMlMjAncyclMkMlMjAndGVzdCclMkMlMjAndGhpcyclMkMlMjAndG9rJyUyQyUyMCclMjMlMjNlbmknJTJDJTIwJyUyMyUyM3plciclMkMlMjAnLi4uJyUyQyUyMCclNUJTRVAlNUQnJTJDJTIwJ29uJyUyQyUyMCdhJyUyQyUyMCdwYWlyJyUyQyUyMCdvZiclMkMlMjAnc2VudGVuY2VzJyUyQyUyMCcuJyUyQyUyMCclNUJTRVAlNUQnJTVEJTBBJTVCMCUyQyUyMDAlMkMlMjAwJTJDJTIwMCUyQyUyMDAlMkMlMjAwJTJDJTIwMCUyQyUyMDAlMkMlMjAwJTJDJTIwMCUyQyUyMDAlMkMlMjAxJTJDJTIwMSUyQyUyMDElMkMlMjAxJTJDJTIwMSUyQyUyMDElMkMlMjAxJTVE",highlighted:`[<span class="hljs-string">'[CLS]'</span>, <span class="hljs-string">'let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'test'</span>, <span class="hljs-string">'this'</span>, <span class="hljs-string">'tok'</span>, <span class="hljs-string">'##eni'</span>, <span class="hljs-string">'##zer'</span>, <span class="hljs-string">'...'</span>, <span class="hljs-string">'[SEP]'</span>, <span class="hljs-string">'on'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'pair'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'sentences'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'[SEP]'</span>]
	[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]`,wrap:!1}}),Be=new u({props:{code:"dG9rZW5pemVyLmRlY29kZXIlMjAlM0QlMjBkZWNvZGVycy5Xb3JkUGllY2UocHJlZml4JTNEJTIyJTIzJTIzJTIyKQ==",highlighted:'tokenizer.decoder = decoders.WordPiece(prefix=<span class="hljs-string">"##"</span>)',wrap:!1}}),Qe=new u({props:{code:"dG9rZW5pemVyLmRlY29kZShlbmNvZGluZy5pZHMp",highlighted:"tokenizer.decode(encoding.ids)",wrap:!1}}),Ae=new u({props:{code:"JTIybGV0J3MlMjB0ZXN0JTIwdGhpcyUyMHRva2VuaXplci4uLiUyMG9uJTIwYSUyMHBhaXIlMjBvZiUyMHNlbnRlbmNlcy4lMjI=",highlighted:'<span class="hljs-string">"let's test this tokenizer... on a pair of sentences."</span>',wrap:!1}}),Ne=new u({props:{code:"dG9rZW5pemVyLnNhdmUoJTIydG9rZW5pemVyLmpzb24lMjIp",highlighted:'tokenizer.save(<span class="hljs-string">"tokenizer.json"</span>)',wrap:!1}}),Ge=new u({props:{code:"bmV3X3Rva2VuaXplciUyMCUzRCUyMFRva2VuaXplci5mcm9tX2ZpbGUoJTIydG9rZW5pemVyLmpzb24lMjIp",highlighted:'new_tokenizer = Tokenizer.from_file(<span class="hljs-string">"tokenizer.json"</span>)',wrap:!1}}),He=new u({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBd3JhcHBlZF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCglMEElMjAlMjAlMjAlMjB0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyJTJDJTBBJTIwJTIwJTIwJTIwJTIzJTIwdG9rZW5pemVyX2ZpbGUlM0QlMjJ0b2tlbml6ZXIuanNvbiUyMiUyQyUyMCUyMyUyMFlvdSUyMGNhbiUyMGxvYWQlMjBmcm9tJTIwdGhlJTIwdG9rZW5pemVyJTIwZmlsZSUyQyUyMGFsdGVybmF0aXZlbHklMEElMjAlMjAlMjAlMjB1bmtfdG9rZW4lM0QlMjIlNUJVTkslNUQlMjIlMkMlMEElMjAlMjAlMjAlMjBwYWRfdG9rZW4lM0QlMjIlNUJQQUQlNUQlMjIlMkMlMEElMjAlMjAlMjAlMjBjbHNfdG9rZW4lM0QlMjIlNUJDTFMlNUQlMjIlMkMlMEElMjAlMjAlMjAlMjBzZXBfdG9rZW4lM0QlMjIlNUJTRVAlNUQlMjIlMkMlMEElMjAlMjAlMjAlMjBtYXNrX3Rva2VuJTNEJTIyJTVCTUFTSyU1RCUyMiUyQyUwQSk=",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast

	wrapped_tokenizer = PreTrainedTokenizerFast(
	tokenizer_object=tokenizer,
	<span class="hljs-comment"># tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively</span>
	unk_token=<span class="hljs-string">"[UNK]"</span>,
	pad_token=<span class="hljs-string">"[PAD]"</span>,
	cls_token=<span class="hljs-string">"[CLS]"</span>,
	sep_token=<span class="hljs-string">"[SEP]"</span>,
	mask_token=<span class="hljs-string">"[MASK]"</span>,
	)`,wrap:!1}}),Ee=new u({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEJlcnRUb2tlbml6ZXJGYXN0JTBBJTBBd3JhcHBlZF90b2tlbml6ZXIlMjAlM0QlMjBCZXJ0VG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> BertTokenizerFast

	wrapped_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)`,wrap:!1}}),Fe=new ms({props:{title:"Building a BPE tokenizer from scratch",local:"building-a-bpe-tokenizer-from-scratch",headingTag:"h2"}}),Ke=new u({props:{code:"dG9rZW5pemVyJTIwJTNEJTIwVG9rZW5pemVyKG1vZGVscy5CUEUoKSk=",highlighted:"tokenizer = Tokenizer(models.BPE())",wrap:!1}}),tt=new u({props:{code:"dG9rZW5pemVyLnByZV90b2tlbml6ZXIlMjAlM0QlMjBwcmVfdG9rZW5pemVycy5CeXRlTGV2ZWwoYWRkX3ByZWZpeF9zcGFjZSUzREZhbHNlKQ==",highlighted:'tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=<span class="hljs-literal">False</span>)',wrap:!1}}),lt=new u({props:{code:"dG9rZW5pemVyLnByZV90b2tlbml6ZXIucHJlX3Rva2VuaXplX3N0ciglMjJMZXQncyUyMHRlc3QlMjBwcmUtdG9rZW5pemF0aW9uISUyMik=",highlighted:'tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test pre-tokenization!"</span>)',wrap:!1}}),nt=new u({props:{code:"JTVCKCdMZXQnJTJDJTIwKDAlMkMlMjAzKSklMkMlMjAoJTIyJ3MlMjIlMkMlMjAoMyUyQyUyMDUpKSUyQyUyMCgnJUM0JUEwdGVzdCclMkMlMjAoNSUyQyUyMDEwKSklMkMlMjAoJyVDNCVBMHByZSclMkMlMjAoMTAlMkMlMjAxNCkpJTJDJTIwKCctJyUyQyUyMCgxNCUyQyUyMDE1KSklMkMlMEElMjAoJ3Rva2VuaXphdGlvbiclMkMlMjAoMTUlMkMlMjAyNykpJTJDJTIwKCchJyUyQyUyMCgyNyUyQyUyMDI4KSklNUQ=",highlighted:`[(<span class="hljs-string">'Let'</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">3</span>)), (<span class="hljs-string">"'s"</span>, (<span class="hljs-number">3</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'Ġtest'</span>, (<span class="hljs-number">5</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'Ġpre'</span>, (<span class="hljs-number">10</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">'-'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">15</span>)),
	(<span class="hljs-string">'tokenization'</span>, (<span class="hljs-number">15</span>, <span class="hljs-number">27</span>)), (<span class="hljs-string">'!'</span>, (<span class="hljs-number">27</span>, <span class="hljs-number">28</span>))]`,wrap:!1}}),it=new u({props:{code:"dHJhaW5lciUyMCUzRCUyMHRyYWluZXJzLkJwZVRyYWluZXIodm9jYWJfc2l6ZSUzRDI1MDAwJTJDJTIwc3BlY2lhbF90b2tlbnMlM0QlNUIlMjIlM0MlN0NlbmRvZnRleHQlN0MlM0UlMjIlNUQpJTBBdG9rZW5pemVyLnRyYWluX2Zyb21faXRlcmF0b3IoZ2V0X3RyYWluaW5nX2NvcnB1cygpJTJDJTIwdHJhaW5lciUzRHRyYWluZXIp",highlighted:`trainer = trainers.BpeTrainer(vocab_size=<span class="hljs-number">25000</span>, special_tokens=[<span class="hljs-string">"<\|endoftext\|>"</span>])
	tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)`,wrap:!1}}),ot=new u({props:{code:"dG9rZW5pemVyLm1vZGVsJTIwJTNEJTIwbW9kZWxzLkJQRSgpJTBBdG9rZW5pemVyLnRyYWluKCU1QiUyMndpa2l0ZXh0LTIudHh0JTIyJTVEJTJDJTIwdHJhaW5lciUzRHRyYWluZXIp",highlighted:`tokenizer.model = models.BPE()
	tokenizer.train([<span class="hljs-string">"wikitext-2.txt"</span>], trainer=trainer)`,wrap:!1}}),Mt=new u({props:{code:"ZW5jb2RpbmclMjAlM0QlMjB0b2tlbml6ZXIuZW5jb2RlKCUyMkxldCdzJTIwdGVzdCUyMHRoaXMlMjB0b2tlbml6ZXIuJTIyKSUwQXByaW50KGVuY29kaW5nLnRva2Vucyk=",highlighted:`encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer."</span>)
	<span class="hljs-built_in">print</span>(encoding.tokens)`,wrap:!1}}),mt=new u({props:{code:"JTVCJ0wnJTJDJTIwJ2V0JyUyQyUyMCUyMiclMjIlMkMlMjAncyclMkMlMjAnJUM0JUEwdGVzdCclMkMlMjAnJUM0JUEwdGhpcyclMkMlMjAnJUM0JUEwdG8nJTJDJTIwJ2tlbiclMkMlMjAnaXplciclMkMlMjAnLiclNUQ=",highlighted:'[<span class="hljs-string">'L'</span>, <span class="hljs-string">'et'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'Ġtest'</span>, <span class="hljs-string">'Ġthis'</span>, <span class="hljs-string">'Ġto'</span>, <span class="hljs-string">'ken'</span>, <span class="hljs-string">'izer'</span>, <span class="hljs-string">'.'</span>]',wrap:!1}}),ht=new u({props:{code:"dG9rZW5pemVyLnBvc3RfcHJvY2Vzc29yJTIwJTNEJTIwcHJvY2Vzc29ycy5CeXRlTGV2ZWwodHJpbV9vZmZzZXRzJTNERmFsc2Up",highlighted:'tokenizer.post_processor = processors.ByteLevel(trim_offsets=<span class="hljs-literal">False</span>)',wrap:!1}}),Jt=new u({props:{code:"c2VudGVuY2UlMjAlM0QlMjAlMjJMZXQncyUyMHRlc3QlMjB0aGlzJTIwdG9rZW5pemVyLiUyMiUwQWVuY29kaW5nJTIwJTNEJTIwdG9rZW5pemVyLmVuY29kZShzZW50ZW5jZSklMEFzdGFydCUyQyUyMGVuZCUyMCUzRCUyMGVuY29kaW5nLm9mZnNldHMlNUI0JTVEJTBBc2VudGVuY2UlNUJzdGFydCUzQWVuZCU1RA==",highlighted:`sentence = <span class="hljs-string">"Let's test this tokenizer."</span>
	encoding = tokenizer.encode(sentence)
	start, end = encoding.offsets[<span class="hljs-number">4</span>]
	sentence[start:end]`,wrap:!1}}),dt=new u({props:{code:"JyUyMHRlc3Qn",highlighted:'<span class="hljs-string">' test'</span>',wrap:!1}}),Tt=new u({props:{code:"dG9rZW5pemVyLmRlY29kZXIlMjAlM0QlMjBkZWNvZGVycy5CeXRlTGV2ZWwoKQ==",highlighted:"tokenizer.decoder = decoders.ByteLevel()",wrap:!1}}),kt=new u({props:{code:"dG9rZW5pemVyLmRlY29kZShlbmNvZGluZy5pZHMp",highlighted:"tokenizer.decode(encoding.ids)",wrap:!1}}),wt=new u({props:{code:"JTIyTGV0J3MlMjB0ZXN0JTIwdGhpcyUyMHRva2VuaXplci4lMjI=",highlighted:'<span class="hljs-string">"Let's test this tokenizer."</span>',wrap:!1}}),$t=new u({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBd3JhcHBlZF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCglMEElMjAlMjAlMjAlMjB0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyJTJDJTBBJTIwJTIwJTIwJTIwYm9zX3Rva2VuJTNEJTIyJTNDJTdDZW5kb2Z0ZXh0JTdDJTNFJTIyJTJDJTBBJTIwJTIwJTIwJTIwZW9zX3Rva2VuJTNEJTIyJTNDJTdDZW5kb2Z0ZXh0JTdDJTNFJTIyJTJDJTBBKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast

	wrapped_tokenizer = PreTrainedTokenizerFast(
	tokenizer_object=tokenizer,
	bos_token=<span class="hljs-string">"<\|endoftext\|>"</span>,
	eos_token=<span class="hljs-string">"<\|endoftext\|>"</span>,
	)`,wrap:!1}}),Ut=new u({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEdQVDJUb2tlbml6ZXJGYXN0JTBBJTBBd3JhcHBlZF90b2tlbml6ZXIlMjAlM0QlMjBHUFQyVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> GPT2TokenizerFast

	wrapped_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer)`,wrap:!1}}),zt=new ms({props:{title:"Building a Unigram tokenizer from scratch",local:"building-a-unigram-tokenizer-from-scratch",headingTag:"h2"}}),Ct=new u({props:{code:"dG9rZW5pemVyJTIwJTNEJTIwVG9rZW5pemVyKG1vZGVscy5VbmlncmFtKCkp",highlighted:"tokenizer = Tokenizer(models.Unigram())",wrap:!1}}),Vt=new u({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBSZWdleCUwQSUwQXRva2VuaXplci5ub3JtYWxpemVyJTIwJTNEJTIwbm9ybWFsaXplcnMuU2VxdWVuY2UoJTBBJTIwJTIwJTIwJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbm9ybWFsaXplcnMuUmVwbGFjZSglMjIlNjAlNjAlMjIlMkMlMjAnJTIyJyklMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBub3JtYWxpemVycy5SZXBsYWNlKCUyMicnJTIyJTJDJTIwJyUyMicpJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbm9ybWFsaXplcnMuTkZLRCgpJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbm9ybWFsaXplcnMuU3RyaXBBY2NlbnRzKCklMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBub3JtYWxpemVycy5SZXBsYWNlKFJlZ2V4KCUyMiUyMCU3QjIlMkMlN0QlMjIpJTJDJTIwJTIyJTIwJTIyKSUyQyUwQSUyMCUyMCUyMCUyMCU1RCUwQSk=",highlighted:`<span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Regex

	tokenizer.normalizer = normalizers.<span class="hljs-type">Sequence</span>(
	[
	normalizers.Replace(<span class="hljs-string">"\`\`"</span>, <span class="hljs-string">'"'</span>),
	normalizers.Replace(<span class="hljs-string">"''"</span>, <span class="hljs-string">'"'</span>),
	normalizers.NFKD(),
	normalizers.StripAccents(),
	normalizers.Replace(Regex(<span class="hljs-string">" {2,}"</span>), <span class="hljs-string">" "</span>),
	]
	)`,wrap:!1}}),_t=new u({props:{code:"dG9rZW5pemVyLnByZV90b2tlbml6ZXIlMjAlM0QlMjBwcmVfdG9rZW5pemVycy5NZXRhc3BhY2UoKQ==",highlighted:"tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()",wrap:!1}}),At=new u({props:{code:"dG9rZW5pemVyLnByZV90b2tlbml6ZXIucHJlX3Rva2VuaXplX3N0ciglMjJMZXQncyUyMHRlc3QlMjB0aGUlMjBwcmUtdG9rZW5pemVyISUyMik=",highlighted:'tokenizer.pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Let's test the pre-tokenizer!"</span>)',wrap:!1}}),Rt=new u({props:{code:"JTVCKCUyMiVFMiU5NiU4MUxldCdzJTIyJTJDJTIwKDAlMkMlMjA1KSklMkMlMjAoJyVFMiU5NiU4MXRlc3QnJTJDJTIwKDUlMkMlMjAxMCkpJTJDJTIwKCclRTIlOTYlODF0aGUnJTJDJTIwKDEwJTJDJTIwMTQpKSUyQyUyMCgnJUUyJTk2JTgxcHJlLXRva2VuaXplciEnJTJDJTIwKDE0JTJDJTIwMjkpKSU1RA==",highlighted:'[(<span class="hljs-string">"▁Let's"</span>, (<span class="hljs-number">0</span>, <span class="hljs-number">5</span>)), (<span class="hljs-string">'▁test'</span>, (<span class="hljs-number">5</span>, <span class="hljs-number">10</span>)), (<span class="hljs-string">'▁the'</span>, (<span class="hljs-number">10</span>, <span class="hljs-number">14</span>)), (<span class="hljs-string">'▁pre-tokenizer!'</span>, (<span class="hljs-number">14</span>, <span class="hljs-number">29</span>))]',wrap:!1}}),Xt=new u({props:{code:"c3BlY2lhbF90b2tlbnMlMjAlM0QlMjAlNUIlMjIlM0NjbHMlM0UlMjIlMkMlMjAlMjIlM0NzZXAlM0UlMjIlMkMlMjAlMjIlM0N1bmslM0UlMjIlMkMlMjAlMjIlM0NwYWQlM0UlMjIlMkMlMjAlMjIlM0NtYXNrJTNFJTIyJTJDJTIwJTIyJTNDcyUzRSUyMiUyQyUyMCUyMiUzQyUyRnMlM0UlMjIlNUQlMEF0cmFpbmVyJTIwJTNEJTIwdHJhaW5lcnMuVW5pZ3JhbVRyYWluZXIoJTBBJTIwJTIwJTIwJTIwdm9jYWJfc2l6ZSUzRDI1MDAwJTJDJTIwc3BlY2lhbF90b2tlbnMlM0RzcGVjaWFsX3Rva2VucyUyQyUyMHVua190b2tlbiUzRCUyMiUzQ3VuayUzRSUyMiUwQSklMEF0b2tlbml6ZXIudHJhaW5fZnJvbV9pdGVyYXRvcihnZXRfdHJhaW5pbmdfY29ycHVzKCklMkMlMjB0cmFpbmVyJTNEdHJhaW5lcik=",highlighted:`special_tokens = [<span class="hljs-string">"<cls>"</span>, <span class="hljs-string">"<sep>"</span>, <span class="hljs-string">"<unk>"</span>, <span class="hljs-string">"<pad>"</span>, <span class="hljs-string">"<mask>"</span>, <span class="hljs-string">"<s>"</span>, <span class="hljs-string">"</s>"</span>]
	trainer = trainers.UnigramTrainer(
	vocab_size=<span class="hljs-number">25000</span>, special_tokens=special_tokens, unk_token=<span class="hljs-string">"<unk>"</span>
	)
	tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)`,wrap:!1}}),Lt=new u({props:{code:"dG9rZW5pemVyLm1vZGVsJTIwJTNEJTIwbW9kZWxzLlVuaWdyYW0oKSUwQXRva2VuaXplci50cmFpbiglNUIlMjJ3aWtpdGV4dC0yLnR4dCUyMiU1RCUyQyUyMHRyYWluZXIlM0R0cmFpbmVyKQ==",highlighted:`tokenizer.model = models.Unigram()
	tokenizer.train([<span class="hljs-string">"wikitext-2.txt"</span>], trainer=trainer)`,wrap:!1}}),Dt=new u({props:{code:"ZW5jb2RpbmclMjAlM0QlMjB0b2tlbml6ZXIuZW5jb2RlKCUyMkxldCdzJTIwdGVzdCUyMHRoaXMlMjB0b2tlbml6ZXIuJTIyKSUwQXByaW50KGVuY29kaW5nLnRva2Vucyk=",highlighted:`encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer."</span>)
	<span class="hljs-built_in">print</span>(encoding.tokens)`,wrap:!1}}),Et=new u({props:{code:"JTVCJyVFMiU5NiU4MUxldCclMkMlMjAlMjInJTIyJTJDJTIwJ3MnJTJDJTIwJyVFMiU5NiU4MXRlc3QnJTJDJTIwJyVFMiU5NiU4MXRoaXMnJTJDJTIwJyVFMiU5NiU4MXRvJyUyQyUyMCdrZW4nJTJDJTIwJ2l6ZXInJTJDJTIwJy4nJTVE",highlighted:'[<span class="hljs-string">'▁Let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'▁test'</span>, <span class="hljs-string">'▁this'</span>, <span class="hljs-string">'▁to'</span>, <span class="hljs-string">'ken'</span>, <span class="hljs-string">'izer'</span>, <span class="hljs-string">'.'</span>]',wrap:!1}}),Pt=new u({props:{code:"Y2xzX3Rva2VuX2lkJTIwJTNEJTIwdG9rZW5pemVyLnRva2VuX3RvX2lkKCUyMiUzQ2NscyUzRSUyMiklMEFzZXBfdG9rZW5faWQlMjAlM0QlMjB0b2tlbml6ZXIudG9rZW5fdG9faWQoJTIyJTNDc2VwJTNFJTIyKSUwQXByaW50KGNsc190b2tlbl9pZCUyQyUyMHNlcF90b2tlbl9pZCk=",highlighted:`cls_token_id = tokenizer.token_to_id(<span class="hljs-string">"<cls>"</span>)
	sep_token_id = tokenizer.token_to_id(<span class="hljs-string">"<sep>"</span>)
	<span class="hljs-built_in">print</span>(cls_token_id, sep_token_id)`,wrap:!1}}),Ft=new u({props:{code:"MCUyMDE=",highlighted:'<span class="hljs-number">0</span> <span class="hljs-number">1</span>',wrap:!1}}),Kt=new u({props:{code:"dG9rZW5pemVyLnBvc3RfcHJvY2Vzc29yJTIwJTNEJTIwcHJvY2Vzc29ycy5UZW1wbGF0ZVByb2Nlc3NpbmcoJTBBJTIwJTIwJTIwJTIwc2luZ2xlJTNEJTIyJTI0QSUzQTAlMjAlM0NzZXAlM0UlM0EwJTIwJTNDY2xzJTNFJTNBMiUyMiUyQyUwQSUyMCUyMCUyMCUyMHBhaXIlM0QlMjIlMjRBJTNBMCUyMCUzQ3NlcCUzRSUzQTAlMjAlMjRCJTNBMSUyMCUzQ3NlcCUzRSUzQTElMjAlM0NjbHMlM0UlM0EyJTIyJTJDJTBBJTIwJTIwJTIwJTIwc3BlY2lhbF90b2tlbnMlM0QlNUIoJTIyJTNDc2VwJTNFJTIyJTJDJTIwc2VwX3Rva2VuX2lkKSUyQyUyMCglMjIlM0NjbHMlM0UlMjIlMkMlMjBjbHNfdG9rZW5faWQpJTVEJTJDJTBBKQ==",highlighted:`tokenizer.post_processor = processors.TemplateProcessing(
	single=<span class="hljs-string">"$A:0 <sep>:0 <cls>:2"</span>,
	pair=<span class="hljs-string">"$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2"</span>,
	special_tokens=[(<span class="hljs-string">"<sep>"</span>, sep_token_id), (<span class="hljs-string">"<cls>"</span>, cls_token_id)],
	)`,wrap:!1}}),es=new u({props:{code:"ZW5jb2RpbmclMjAlM0QlMjB0b2tlbml6ZXIuZW5jb2RlKCUyMkxldCdzJTIwdGVzdCUyMHRoaXMlMjB0b2tlbml6ZXIuLi4lMjIlMkMlMjAlMjJvbiUyMGElMjBwYWlyJTIwb2YlMjBzZW50ZW5jZXMhJTIyKSUwQXByaW50KGVuY29kaW5nLnRva2VucyklMEFwcmludChlbmNvZGluZy50eXBlX2lkcyk=",highlighted:`encoding = tokenizer.encode(<span class="hljs-string">"Let's test this tokenizer..."</span>, <span class="hljs-string">"on a pair of sentences!"</span>)
	<span class="hljs-built_in">print</span>(encoding.tokens)
	<span class="hljs-built_in">print</span>(encoding.type_ids)`,wrap:!1}}),ts=new u({props:{code:"JTVCJyVFMiU5NiU4MUxldCclMkMlMjAlMjInJTIyJTJDJTIwJ3MnJTJDJTIwJyVFMiU5NiU4MXRlc3QnJTJDJTIwJyVFMiU5NiU4MXRoaXMnJTJDJTIwJyVFMiU5NiU4MXRvJyUyQyUyMCdrZW4nJTJDJTIwJ2l6ZXInJTJDJTIwJy4nJTJDJTIwJy4nJTJDJTIwJy4nJTJDJTIwJyUzQ3NlcCUzRSclMkMlMjAnJUUyJTk2JTgxJyUyQyUyMCdvbiclMkMlMjAnJUUyJTk2JTgxJyUyQyUyMCdhJyUyQyUyMCclRTIlOTYlODFwYWlyJyUyQyUyMCUwQSUyMCUyMCclRTIlOTYlODFvZiclMkMlMjAnJUUyJTk2JTgxc2VudGVuY2UnJTJDJTIwJ3MnJTJDJTIwJyEnJTJDJTIwJyUzQ3NlcCUzRSclMkMlMjAnJTNDY2xzJTNFJyU1RCUwQSU1QjAlMkMlMjAwJTJDJTIwMCUyQyUyMDAlMkMlMjAwJTJDJTIwMCUyQyUyMDAlMkMlMjAwJTJDJTIwMCUyQyUyMDAlMkMlMjAwJTJDJTIwMCUyQyUyMDElMkMlMjAxJTJDJTIwMSUyQyUyMDElMkMlMjAxJTJDJTIwMSUyQyUyMDElMkMlMjAxJTJDJTIwMSUyQyUyMDElMkMlMjAyJTVE",highlighted:`[<span class="hljs-string">'▁Let'</span>, <span class="hljs-string">"'"</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'▁test'</span>, <span class="hljs-string">'▁this'</span>, <span class="hljs-string">'▁to'</span>, <span class="hljs-string">'ken'</span>, <span class="hljs-string">'izer'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'<sep>'</span>, <span class="hljs-string">'▁'</span>, <span class="hljs-string">'on'</span>, <span class="hljs-string">'▁'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'▁pair'</span>,
	<span class="hljs-string">'▁of'</span>, <span class="hljs-string">'▁sentence'</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'!'</span>, <span class="hljs-string">'<sep>'</span>, <span class="hljs-string">'<cls>'</span>]
	[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>]`,wrap:!1}}),ls=new u({props:{code:"dG9rZW5pemVyLmRlY29kZXIlMjAlM0QlMjBkZWNvZGVycy5NZXRhc3BhY2UoKQ==",highlighted:"tokenizer.decoder = decoders.Metaspace()",wrap:!1}}),as=new u({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBd3JhcHBlZF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCglMEElMjAlMjAlMjAlMjB0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyJTJDJTBBJTIwJTIwJTIwJTIwYm9zX3Rva2VuJTNEJTIyJTNDcyUzRSUyMiUyQyUwQSUyMCUyMCUyMCUyMGVvc190b2tlbiUzRCUyMiUzQyUyRnMlM0UlMjIlMkMlMEElMjAlMjAlMjAlMjB1bmtfdG9rZW4lM0QlMjIlM0N1bmslM0UlMjIlMkMlMEElMjAlMjAlMjAlMjBwYWRfdG9rZW4lM0QlMjIlM0NwYWQlM0UlMjIlMkMlMEElMjAlMjAlMjAlMjBjbHNfdG9rZW4lM0QlMjIlM0NjbHMlM0UlMjIlMkMlMEElMjAlMjAlMjAlMjBzZXBfdG9rZW4lM0QlMjIlM0NzZXAlM0UlMjIlMkMlMEElMjAlMjAlMjAlMjBtYXNrX3Rva2VuJTNEJTIyJTNDbWFzayUzRSUyMiUyQyUwQSUyMCUyMCUyMCUyMHBhZGRpbmdfc2lkZSUzRCUyMmxlZnQlMjIlMkMlMEEp",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast

	wrapped_tokenizer = PreTrainedTokenizerFast(
	tokenizer_object=tokenizer,
	bos_token=<span class="hljs-string">"<s>"</span>,
	eos_token=<span class="hljs-string">"</s>"</span>,
	unk_token=<span class="hljs-string">"<unk>"</span>,
	pad_token=<span class="hljs-string">"<pad>"</span>,
	cls_token=<span class="hljs-string">"<cls>"</span>,
	sep_token=<span class="hljs-string">"<sep>"</span>,
	mask_token=<span class="hljs-string">"<mask>"</span>,
	padding_side=<span class="hljs-string">"left"</span>,
	)`,wrap:!1}}),ps=new u({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFhMTmV0VG9rZW5pemVyRmFzdCUwQSUwQXdyYXBwZWRfdG9rZW5pemVyJTIwJTNEJTIwWExOZXRUb2tlbml6ZXJGYXN0KHRva2VuaXplcl9vYmplY3QlM0R0b2tlbml6ZXIp",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> XLNetTokenizerFast

	wrapped_tokenizer = XLNetTokenizerFast(tokenizer_object=tokenizer)`,wrap:!1}}),os=new qi({props:{source:"https://github.com/huggingface/course/blob/main/chapters/en/chapter6/8.mdx"}}),{c(){J=i("meta"),j=n(),d=i("p"),cs=n(),o(k.$$.fragment),hs=n(),o(w.$$.fragment),us=n(),b=i("p"),b.textContent=ra,Js=n(),$=i("ul"),$.innerHTML=oa,ds=n(),g=i("p"),g.textContent=ca,fs=n(),f=i("div"),f.innerHTML=Ma,Ts=n(),U=i("p"),U.innerHTML=ma,js=n(),o(x.$$.fragment),ks=n(),z=i("p"),z.innerHTML=ya,ws=n(),I=i("ul"),I.innerHTML=ha,bs=n(),C=i("p"),C.innerHTML=ua,$s=n(),o(v.$$.fragment),gs=n(),Z=i("p"),Z.innerHTML=Ja,Us=n(),o(V.$$.fragment),xs=n(),W=i("p"),W.innerHTML=da,zs=n(),B=i("p"),B.textContent=fa,Is=n(),o(_.$$.fragment),Cs=n(),Q=i("p"),Q.textContent=Ta,vs=n(),o(A.$$.fragment),Zs=n(),R=i("p"),R.innerHTML=ja,Vs=n(),N=i("p"),N.innerHTML=ka,Ws=n(),o(X.$$.fragment),Bs=n(),G=i("p"),G.innerHTML=wa,_s=n(),q=i("p"),q.innerHTML=ba,Qs=n(),o(L.$$.fragment),As=n(),H=i("p"),H.innerHTML=$a,Rs=n(),o(D.$$.fragment),Ns=n(),E=i("p"),E.innerHTML=ga,Xs=n(),S=i("p"),S.innerHTML=Ua,Gs=n(),o(P.$$.fragment),qs=n(),o(F.$$.fragment),Ls=n(),o(T.$$.fragment),Hs=n(),Y=i("p"),Y.innerHTML=xa,Ds=n(),o(K.$$.fragment),Es=n(),O=i("p"),O.textContent=za,Ss=n(),o(ee.$$.fragment),Ps=n(),te=i("p"),te.innerHTML=Ia,Fs=n(),o(se.$$.fragment),Ys=n(),o(le.$$.fragment),Ks=n(),ne=i("p"),ne.innerHTML=Ca,Os=n(),o(ae.$$.fragment),el=n(),o(ie.$$.fragment),tl=n(),pe=i("p"),pe.innerHTML=va,sl=n(),o(re.$$.fragment),ll=n(),o(oe.$$.fragment),nl=n(),ce=i("p"),ce.innerHTML=Za,al=n(),o(Me.$$.fragment),il=n(),me=i("p"),me.innerHTML=Va,pl=n(),ye=i("p"),ye.textContent=Wa,rl=n(),o(he.$$.fragment),ol=n(),ue=i("p"),ue.innerHTML=Ba,cl=n(),o(Je.$$.fragment),Ml=n(),de=i("p"),de.innerHTML=_a,ml=n(),o(fe.$$.fragment),yl=n(),o(Te.$$.fragment),hl=n(),je=i("p"),je.innerHTML=Qa,ul=n(),ke=i("p"),ke.innerHTML=Aa,Jl=n(),o(we.$$.fragment),dl=n(),o(be.$$.fragment),fl=n(),$e=i("p"),$e.innerHTML=Ra,Tl=n(),ge=i("p"),ge.textContent=Na,jl=n(),o(Ue.$$.fragment),kl=n(),xe=i("p"),xe.textContent=Xa,wl=n(),ze=i("p"),ze.textContent=Ga,bl=n(),o(Ie.$$.fragment),$l=n(),o(Ce.$$.fragment),gl=n(),ve=i("p"),ve.textContent=qa,Ul=n(),o(Ze.$$.fragment),xl=n(),o(Ve.$$.fragment),zl=n(),We=i("p"),We.textContent=La,Il=n(),o(Be.$$.fragment),Cl=n(),_e=i("p"),_e.innerHTML=Ha,vl=n(),o(Qe.$$.fragment),Zl=n(),o(Ae.$$.fragment),Vl=n(),Re=i("p"),Re.textContent=Da,Wl=n(),o(Ne.$$.fragment),Bl=n(),Xe=i("p"),Xe.innerHTML=Ea,_l=n(),o(Ge.$$.fragment),Ql=n(),qe=i("p"),qe.innerHTML=Sa,Al=n(),Le=i("p"),Le.innerHTML=Pa,Rl=n(),o(He.$$.fragment),Nl=n(),De=i("p"),De.innerHTML=Fa,Xl=n(),o(Ee.$$.fragment),Gl=n(),Se=i("p"),Se.innerHTML=Ya,ql=n(),Pe=i("p"),Pe.textContent=Ka,Ll=n(),o(Fe.$$.fragment),Hl=n(),Ye=i("p"),Ye.innerHTML=Oa,Dl=n(),o(Ke.$$.fragment),El=n(),Oe=i("p"),Oe.innerHTML=ei,Sl=n(),et=i("p"),et.textContent=ti,Pl=n(),o(tt.$$.fragment),Fl=n(),st=i("p"),st.innerHTML=si,Yl=n(),o(lt.$$.fragment),Kl=n(),o(nt.$$.fragment),Ol=n(),at=i("p"),at.textContent=li,en=n(),o(it.$$.fragment),tn=n(),pt=i("p"),pt.innerHTML=ni,sn=n(),rt=i("p"),rt.textContent=ai,ln=n(),o(ot.$$.fragment),nn=n(),ct=i("p"),ct.textContent=ii,an=n(),o(Mt.$$.fragment),pn=n(),o(mt.$$.fragment),rn=n(),yt=i("p"),yt.textContent=pi,on=n(),o(ht.$$.fragment),cn=n(),ut=i("p"),ut.innerHTML=ri,Mn=n(),o(Jt.$$.fragment),mn=n(),o(dt.$$.fragment),yn=n(),ft=i("p"),ft.textContent=oi,hn=n(),o(Tt.$$.fragment),un=n(),jt=i("p"),jt.textContent=ci,Jn=n(),o(kt.$$.fragment),dn=n(),o(wt.$$.fragment),fn=n(),bt=i("p"),bt.innerHTML=Mi,Tn=n(),o($t.$$.fragment),jn=n(),gt=i("p"),gt.textContent=mi,kn=n(),o(Ut.$$.fragment),wn=n(),xt=i("p"),xt.textContent=yi,bn=n(),o(zt.$$.fragment),$n=n(),It=i("p"),It.innerHTML=hi,gn=n(),o(Ct.$$.fragment),Un=n(),vt=i("p"),vt.textContent=ui,xn=n(),Zt=i("p"),Zt.textContent=Ji,zn=n(),o(Vt.$$.fragment),In=n(),Wt=i("p"),Wt.innerHTML=di,Cn=n(),Bt=i("p"),Bt.innerHTML=fi,vn=n(),o(_t.$$.fragment),Zn=n(),Qt=i("p"),Qt.textContent=Ti,Vn=n(),o(At.$$.fragment),Wn=n(),o(Rt.$$.fragment),Bn=n(),Nt=i("p"),Nt.textContent=ji,_n=n(),o(Xt.$$.fragment),Qn=n(),Gt=i("p"),Gt.innerHTML=ki,An=n(),qt=i("p"),qt.textContent=wi,Rn=n(),o(Lt.$$.fragment),Nn=n(),Ht=i("p"),Ht.textContent=bi,Xn=n(),o(Dt.$$.fragment),Gn=n(),o(Et.$$.fragment),qn=n(),St=i("p"),St.innerHTML=$i,Ln=n(),o(Pt.$$.fragment),Hn=n(),o(Ft.$$.fragment),Dn=n(),Yt=i("p"),Yt.textContent=gi,En=n(),o(Kt.$$.fragment),Sn=n(),Ot=i("p"),Ot.textContent=Ui,Pn=n(),o(es.$$.fragment),Fn=n(),o(ts.$$.fragment),Yn=n(),ss=i("p"),ss.innerHTML=xi,Kn=n(),o(ls.$$.fragment),On=n(),ns=i("p"),ns.innerHTML=zi,ea=n(),o(as.$$.fragment),ta=n(),is=i("p"),is.textContent=Ii,sa=n(),o(ps.$$.fragment),la=n(),rs=i("p"),rs.textContent=Ci,na=n(),o(os.$$.fragment),aa=n(),Ms=i("p"),this.h()},l(e){const t=Ai("svelte-u9bgzb",document.head);J=p(t,"META",{name:!0,content:!0}),t.forEach(s),j=a(e),d=p(e,"P",{}),Zi(d).forEach(s),cs=a(e),c(k.$$.fragment,e),hs=a(e),c(w.$$.fragment,e),us=a(e),b=p(e,"P",{"data-svelte-h":!0}),r(b)!=="svelte-3qwu2n"&&(b.textContent=ra),Js=a(e),$=p(e,"UL",{"data-svelte-h":!0}),r($)!=="svelte-12vigqn"&&($.innerHTML=oa),ds=a(e),g=p(e,"P",{"data-svelte-h":!0}),r(g)!=="svelte-1msje3z"&&(g.textContent=ca),fs=a(e),f=p(e,"DIV",{class:!0,"data-svelte-h":!0}),r(f)!=="svelte-oxfng3"&&(f.innerHTML=Ma),Ts=a(e),U=p(e,"P",{"data-svelte-h":!0}),r(U)!=="svelte-il604i"&&(U.innerHTML=ma),js=a(e),c(x.$$.fragment,e),ks=a(e),z=p(e,"P",{"data-svelte-h":!0}),r(z)!=="svelte-3r1vi2"&&(z.innerHTML=ya),ws=a(e),I=p(e,"UL",{"data-svelte-h":!0}),r(I)!=="svelte-fnf8g6"&&(I.innerHTML=ha),bs=a(e),C=p(e,"P",{"data-svelte-h":!0}),r(C)!=="svelte-13r2p99"&&(C.innerHTML=ua),$s=a(e),c(v.$$.fragment,e),gs=a(e),Z=p(e,"P",{"data-svelte-h":!0}),r(Z)!=="svelte-1njyr3z"&&(Z.innerHTML=Ja),Us=a(e),c(V.$$.fragment,e),xs=a(e),W=p(e,"P",{"data-svelte-h":!0}),r(W)!=="svelte-cmf9ti"&&(W.innerHTML=da),zs=a(e),B=p(e,"P",{"data-svelte-h":!0}),r(B)!=="svelte-1b40lyu"&&(B.textContent=fa),Is=a(e),c(_.$$.fragment,e),Cs=a(e),Q=p(e,"P",{"data-svelte-h":!0}),r(Q)!=="svelte-14xvwqr"&&(Q.textContent=Ta),vs=a(e),c(A.$$.fragment,e),Zs=a(e),R=p(e,"P",{"data-svelte-h":!0}),r(R)!=="svelte-kr34bb"&&(R.innerHTML=ja),Vs=a(e),N=p(e,"P",{"data-svelte-h":!0}),r(N)!=="svelte-ldh8kh"&&(N.innerHTML=ka),Ws=a(e),c(X.$$.fragment,e),Bs=a(e),G=p(e,"P",{"data-svelte-h":!0}),r(G)!=="svelte-szj6vq"&&(G.innerHTML=wa),_s=a(e),q=p(e,"P",{"data-svelte-h":!0}),r(q)!=="svelte-ch9g0x"&&(q.innerHTML=ba),Qs=a(e),c(L.$$.fragment,e),As=a(e),H=p(e,"P",{"data-svelte-h":!0}),r(H)!=="svelte-g661z7"&&(H.innerHTML=$a),Rs=a(e),c(D.$$.fragment,e),Ns=a(e),E=p(e,"P",{"data-svelte-h":!0}),r(E)!=="svelte-sqrbhp"&&(E.innerHTML=ga),Xs=a(e),S=p(e,"P",{"data-svelte-h":!0}),r(S)!=="svelte-1ifl1uh"&&(S.innerHTML=Ua),Gs=a(e),c(P.$$.fragment,e),qs=a(e),c(F.$$.fragment,e),Ls=a(e),c(T.$$.fragment,e),Hs=a(e),Y=p(e,"P",{"data-svelte-h":!0}),r(Y)!=="svelte-1hzmz8v"&&(Y.innerHTML=xa),Ds=a(e),c(K.$$.fragment,e),Es=a(e),O=p(e,"P",{"data-svelte-h":!0}),r(O)!=="svelte-5f2ppa"&&(O.textContent=za),Ss=a(e),c(ee.$$.fragment,e),Ps=a(e),te=p(e,"P",{"data-svelte-h":!0}),r(te)!=="svelte-17g5tkh"&&(te.innerHTML=Ia),Fs=a(e),c(se.$$.fragment,e),Ys=a(e),c(le.$$.fragment,e),Ks=a(e),ne=p(e,"P",{"data-svelte-h":!0}),r(ne)!=="svelte-1nzvxks"&&(ne.innerHTML=Ca),Os=a(e),c(ae.$$.fragment,e),el=a(e),c(ie.$$.fragment,e),tl=a(e),pe=p(e,"P",{"data-svelte-h":!0}),r(pe)!=="svelte-e8ls79"&&(pe.innerHTML=va),sl=a(e),c(re.$$.fragment,e),ll=a(e),c(oe.$$.fragment,e),nl=a(e),ce=p(e,"P",{"data-svelte-h":!0}),r(ce)!=="svelte-1flaqiu"&&(ce.innerHTML=Za),al=a(e),c(Me.$$.fragment,e),il=a(e),me=p(e,"P",{"data-svelte-h":!0}),r(me)!=="svelte-zfisnc"&&(me.innerHTML=Va),pl=a(e),ye=p(e,"P",{"data-svelte-h":!0}),r(ye)!=="svelte-fxwv2n"&&(ye.textContent=Wa),rl=a(e),c(he.$$.fragment,e),ol=a(e),ue=p(e,"P",{"data-svelte-h":!0}),r(ue)!=="svelte-wisurr"&&(ue.innerHTML=Ba),cl=a(e),c(Je.$$.fragment,e),Ml=a(e),de=p(e,"P",{"data-svelte-h":!0}),r(de)!=="svelte-132nejo"&&(de.innerHTML=_a),ml=a(e),c(fe.$$.fragment,e),yl=a(e),c(Te.$$.fragment,e),hl=a(e),je=p(e,"P",{"data-svelte-h":!0}),r(je)!=="svelte-gj3rj2"&&(je.innerHTML=Qa),ul=a(e),ke=p(e,"P",{"data-svelte-h":!0}),r(ke)!=="svelte-1oy2o1b"&&(ke.innerHTML=Aa),Jl=a(e),c(we.$$.fragment,e),dl=a(e),c(be.$$.fragment,e),fl=a(e),$e=p(e,"P",{"data-svelte-h":!0}),r($e)!=="svelte-z3selt"&&($e.innerHTML=Ra),Tl=a(e),ge=p(e,"P",{"data-svelte-h":!0}),r(ge)!=="svelte-q1nzmh"&&(ge.textContent=Na),jl=a(e),c(Ue.$$.fragment,e),kl=a(e),xe=p(e,"P",{"data-svelte-h":!0}),r(xe)!=="svelte-14xkv0z"&&(xe.textContent=Xa),wl=a(e),ze=p(e,"P",{"data-svelte-h":!0}),r(ze)!=="svelte-1i25qf9"&&(ze.textContent=Ga),bl=a(e),c(Ie.$$.fragment,e),$l=a(e),c(Ce.$$.fragment,e),gl=a(e),ve=p(e,"P",{"data-svelte-h":!0}),r(ve)!=="svelte-th6dsg"&&(ve.textContent=qa),Ul=a(e),c(Ze.$$.fragment,e),xl=a(e),c(Ve.$$.fragment,e),zl=a(e),We=p(e,"P",{"data-svelte-h":!0}),r(We)!=="svelte-xbo1fy"&&(We.textContent=La),Il=a(e),c(Be.$$.fragment,e),Cl=a(e),_e=p(e,"P",{"data-svelte-h":!0}),r(_e)!=="svelte-10hc9jg"&&(_e.innerHTML=Ha),vl=a(e),c(Qe.$$.fragment,e),Zl=a(e),c(Ae.$$.fragment,e),Vl=a(e),Re=p(e,"P",{"data-svelte-h":!0}),r(Re)!=="svelte-bx3ks1"&&(Re.textContent=Da),Wl=a(e),c(Ne.$$.fragment,e),Bl=a(e),Xe=p(e,"P",{"data-svelte-h":!0}),r(Xe)!=="svelte-1j80g7l"&&(Xe.innerHTML=Ea),_l=a(e),c(Ge.$$.fragment,e),Ql=a(e),qe=p(e,"P",{"data-svelte-h":!0}),r(qe)!=="svelte-138iql"&&(qe.innerHTML=Sa),Al=a(e),Le=p(e,"P",{"data-svelte-h":!0}),r(Le)!=="svelte-k6u833"&&(Le.innerHTML=Pa),Rl=a(e),c(He.$$.fragment,e),Nl=a(e),De=p(e,"P",{"data-svelte-h":!0}),r(De)!=="svelte-1o14t6o"&&(De.innerHTML=Fa),Xl=a(e),c(Ee.$$.fragment,e),Gl=a(e),Se=p(e,"P",{"data-svelte-h":!0}),r(Se)!=="svelte-1y1ppn4"&&(Se.innerHTML=Ya),ql=a(e),Pe=p(e,"P",{"data-svelte-h":!0}),r(Pe)!=="svelte-1w2ckw7"&&(Pe.textContent=Ka),Ll=a(e),c(Fe.$$.fragment,e),Hl=a(e),Ye=p(e,"P",{"data-svelte-h":!0}),r(Ye)!=="svelte-16961ta"&&(Ye.innerHTML=Oa),Dl=a(e),c(Ke.$$.fragment,e),El=a(e),Oe=p(e,"P",{"data-svelte-h":!0}),r(Oe)!=="svelte-1bovaty"&&(Oe.innerHTML=ei),Sl=a(e),et=p(e,"P",{"data-svelte-h":!0}),r(et)!=="svelte-1ezw1yn"&&(et.textContent=ti),Pl=a(e),c(tt.$$.fragment,e),Fl=a(e),st=p(e,"P",{"data-svelte-h":!0}),r(st)!=="svelte-1bw3nt0"&&(st.innerHTML=si),Yl=a(e),c(lt.$$.fragment,e),Kl=a(e),c(nt.$$.fragment,e),Ol=a(e),at=p(e,"P",{"data-svelte-h":!0}),r(at)!=="svelte-7y4l0y"&&(at.textContent=li),en=a(e),c(it.$$.fragment,e),tn=a(e),pt=p(e,"P",{"data-svelte-h":!0}),r(pt)!=="svelte-bv31bn"&&(pt.innerHTML=ni),sn=a(e),rt=p(e,"P",{"data-svelte-h":!0}),r(rt)!=="svelte-11ph16f"&&(rt.textContent=ai),ln=a(e),c(ot.$$.fragment,e),nn=a(e),ct=p(e,"P",{"data-svelte-h":!0}),r(ct)!=="svelte-1hgpn33"&&(ct.textContent=ii),an=a(e),c(Mt.$$.fragment,e),pn=a(e),c(mt.$$.fragment,e),rn=a(e),yt=p(e,"P",{"data-svelte-h":!0}),r(yt)!=="svelte-n1vz05"&&(yt.textContent=pi),on=a(e),c(ht.$$.fragment,e),cn=a(e),ut=p(e,"P",{"data-svelte-h":!0}),r(ut)!=="svelte-ghzhch"&&(ut.innerHTML=ri),Mn=a(e),c(Jt.$$.fragment,e),mn=a(e),c(dt.$$.fragment,e),yn=a(e),ft=p(e,"P",{"data-svelte-h":!0}),r(ft)!=="svelte-1h8tvqm"&&(ft.textContent=oi),hn=a(e),c(Tt.$$.fragment,e),un=a(e),jt=p(e,"P",{"data-svelte-h":!0}),r(jt)!=="svelte-1507zp"&&(jt.textContent=ci),Jn=a(e),c(kt.$$.fragment,e),dn=a(e),c(wt.$$.fragment,e),fn=a(e),bt=p(e,"P",{"data-svelte-h":!0}),r(bt)!=="svelte-1uknlc4"&&(bt.innerHTML=Mi),Tn=a(e),c($t.$$.fragment,e),jn=a(e),gt=p(e,"P",{"data-svelte-h":!0}),r(gt)!=="svelte-ylttvt"&&(gt.textContent=mi),kn=a(e),c(Ut.$$.fragment,e),wn=a(e),xt=p(e,"P",{"data-svelte-h":!0}),r(xt)!=="svelte-1tk1zdg"&&(xt.textContent=yi),bn=a(e),c(zt.$$.fragment,e),$n=a(e),It=p(e,"P",{"data-svelte-h":!0}),r(It)!=="svelte-1lojr4m"&&(It.innerHTML=hi),gn=a(e),c(Ct.$$.fragment,e),Un=a(e),vt=p(e,"P",{"data-svelte-h":!0}),r(vt)!=="svelte-1g43fmt"&&(vt.textContent=ui),xn=a(e),Zt=p(e,"P",{"data-svelte-h":!0}),r(Zt)!=="svelte-d9cfcx"&&(Zt.textContent=Ji),zn=a(e),c(Vt.$$.fragment,e),In=a(e),Wt=p(e,"P",{"data-svelte-h":!0}),r(Wt)!=="svelte-qkvtqd"&&(Wt.innerHTML=di),Cn=a(e),Bt=p(e,"P",{"data-svelte-h":!0}),r(Bt)!=="svelte-8x763t"&&(Bt.innerHTML=fi),vn=a(e),c(_t.$$.fragment,e),Zn=a(e),Qt=p(e,"P",{"data-svelte-h":!0}),r(Qt)!=="svelte-1ejt7m0"&&(Qt.textContent=Ti),Vn=a(e),c(At.$$.fragment,e),Wn=a(e),c(Rt.$$.fragment,e),Bn=a(e),Nt=p(e,"P",{"data-svelte-h":!0}),r(Nt)!=="svelte-csrqog"&&(Nt.textContent=ji),_n=a(e),c(Xt.$$.fragment,e),Qn=a(e),Gt=p(e,"P",{"data-svelte-h":!0}),r(Gt)!=="svelte-11hh239"&&(Gt.innerHTML=ki),An=a(e),qt=p(e,"P",{"data-svelte-h":!0}),r(qt)!=="svelte-11ph16f"&&(qt.textContent=wi),Rn=a(e),c(Lt.$$.fragment,e),Nn=a(e),Ht=p(e,"P",{"data-svelte-h":!0}),r(Ht)!=="svelte-1hgpn33"&&(Ht.textContent=bi),Xn=a(e),c(Dt.$$.fragment,e),Gn=a(e),c(Et.$$.fragment,e),qn=a(e),St=p(e,"P",{"data-svelte-h":!0}),r(St)!=="svelte-kqa3ra"&&(St.innerHTML=$i),Ln=a(e),c(Pt.$$.fragment,e),Hn=a(e),c(Ft.$$.fragment,e),Dn=a(e),Yt=p(e,"P",{"data-svelte-h":!0}),r(Yt)!=="svelte-1xkp6h4"&&(Yt.textContent=gi),En=a(e),c(Kt.$$.fragment,e),Sn=a(e),Ot=p(e,"P",{"data-svelte-h":!0}),r(Ot)!=="svelte-11ia5pa"&&(Ot.textContent=Ui),Pn=a(e),c(es.$$.fragment,e),Fn=a(e),c(ts.$$.fragment,e),Yn=a(e),ss=p(e,"P",{"data-svelte-h":!0}),r(ss)!=="svelte-16qaug1"&&(ss.innerHTML=xi),Kn=a(e),c(ls.$$.fragment,e),On=a(e),ns=p(e,"P",{"data-svelte-h":!0}),r(ns)!=="svelte-1nk2cqv"&&(ns.innerHTML=zi),ea=a(e),c(as.$$.fragment,e),ta=a(e),is=p(e,"P",{"data-svelte-h":!0}),r(is)!=="svelte-igjp7d"&&(is.textContent=Ii),sa=a(e),c(ps.$$.fragment,e),la=a(e),rs=p(e,"P",{"data-svelte-h":!0}),r(rs)!=="svelte-l5rkk1"&&(rs.textContent=Ci),na=a(e),c(os.$$.fragment,e),aa=a(e),Ms=p(e,"P",{}),Zi(Ms).forEach(s),this.h()},h(){pa(J,"name","hf:doc:metadata"),pa(J,"content",Di),pa(f,"class","flex justify-center")},m(e,t){Ri(document.head,J),l(e,j,t),l(e,d,t),l(e,cs,t),M(k,e,t),l(e,hs,t),M(w,e,t),l(e,us,t),l(e,b,t),l(e,Js,t),l(e,$,t),l(e,ds,t),l(e,g,t),l(e,fs,t),l(e,f,t),l(e,Ts,t),l(e,U,t),l(e,js,t),M(x,e,t),l(e,ks,t),l(e,z,t),l(e,ws,t),l(e,I,t),l(e,bs,t),l(e,C,t),l(e,$s,t),M(v,e,t),l(e,gs,t),l(e,Z,t),l(e,Us,t),M(V,e,t),l(e,xs,t),l(e,W,t),l(e,zs,t),l(e,B,t),l(e,Is,t),M(_,e,t),l(e,Cs,t),l(e,Q,t),l(e,vs,t),M(A,e,t),l(e,Zs,t),l(e,R,t),l(e,Vs,t),l(e,N,t),l(e,Ws,t),M(X,e,t),l(e,Bs,t),l(e,G,t),l(e,_s,t),l(e,q,t),l(e,Qs,t),M(L,e,t),l(e,As,t),l(e,H,t),l(e,Rs,t),M(D,e,t),l(e,Ns,t),l(e,E,t),l(e,Xs,t),l(e,S,t),l(e,Gs,t),M(P,e,t),l(e,qs,t),M(F,e,t),l(e,Ls,t),M(T,e,t),l(e,Hs,t),l(e,Y,t),l(e,Ds,t),M(K,e,t),l(e,Es,t),l(e,O,t),l(e,Ss,t),M(ee,e,t),l(e,Ps,t),l(e,te,t),l(e,Fs,t),M(se,e,t),l(e,Ys,t),M(le,e,t),l(e,Ks,t),l(e,ne,t),l(e,Os,t),M(ae,e,t),l(e,el,t),M(ie,e,t),l(e,tl,t),l(e,pe,t),l(e,sl,t),M(re,e,t),l(e,ll,t),M(oe,e,t),l(e,nl,t),l(e,ce,t),l(e,al,t),M(Me,e,t),l(e,il,t),l(e,me,t),l(e,pl,t),l(e,ye,t),l(e,rl,t),M(he,e,t),l(e,ol,t),l(e,ue,t),l(e,cl,t),M(Je,e,t),l(e,Ml,t),l(e,de,t),l(e,ml,t),M(fe,e,t),l(e,yl,t),M(Te,e,t),l(e,hl,t),l(e,je,t),l(e,ul,t),l(e,ke,t),l(e,Jl,t),M(we,e,t),l(e,dl,t),M(be,e,t),l(e,fl,t),l(e,$e,t),l(e,Tl,t),l(e,ge,t),l(e,jl,t),M(Ue,e,t),l(e,kl,t),l(e,xe,t),l(e,wl,t),l(e,ze,t),l(e,bl,t),M(Ie,e,t),l(e,$l,t),M(Ce,e,t),l(e,gl,t),l(e,ve,t),l(e,Ul,t),M(Ze,e,t),l(e,xl,t),M(Ve,e,t),l(e,zl,t),l(e,We,t),l(e,Il,t),M(Be,e,t),l(e,Cl,t),l(e,_e,t),l(e,vl,t),M(Qe,e,t),l(e,Zl,t),M(Ae,e,t),l(e,Vl,t),l(e,Re,t),l(e,Wl,t),M(Ne,e,t),l(e,Bl,t),l(e,Xe,t),l(e,_l,t),M(Ge,e,t),l(e,Ql,t),l(e,qe,t),l(e,Al,t),l(e,Le,t),l(e,Rl,t),M(He,e,t),l(e,Nl,t),l(e,De,t),l(e,Xl,t),M(Ee,e,t),l(e,Gl,t),l(e,Se,t),l(e,ql,t),l(e,Pe,t),l(e,Ll,t),M(Fe,e,t),l(e,Hl,t),l(e,Ye,t),l(e,Dl,t),M(Ke,e,t),l(e,El,t),l(e,Oe,t),l(e,Sl,t),l(e,et,t),l(e,Pl,t),M(tt,e,t),l(e,Fl,t),l(e,st,t),l(e,Yl,t),M(lt,e,t),l(e,Kl,t),M(nt,e,t),l(e,Ol,t),l(e,at,t),l(e,en,t),M(it,e,t),l(e,tn,t),l(e,pt,t),l(e,sn,t),l(e,rt,t),l(e,ln,t),M(ot,e,t),l(e,nn,t),l(e,ct,t),l(e,an,t),M(Mt,e,t),l(e,pn,t),M(mt,e,t),l(e,rn,t),l(e,yt,t),l(e,on,t),M(ht,e,t),l(e,cn,t),l(e,ut,t),l(e,Mn,t),M(Jt,e,t),l(e,mn,t),M(dt,e,t),l(e,yn,t),l(e,ft,t),l(e,hn,t),M(Tt,e,t),l(e,un,t),l(e,jt,t),l(e,Jn,t),M(kt,e,t),l(e,dn,t),M(wt,e,t),l(e,fn,t),l(e,bt,t),l(e,Tn,t),M($t,e,t),l(e,jn,t),l(e,gt,t),l(e,kn,t),M(Ut,e,t),l(e,wn,t),l(e,xt,t),l(e,bn,t),M(zt,e,t),l(e,$n,t),l(e,It,t),l(e,gn,t),M(Ct,e,t),l(e,Un,t),l(e,vt,t),l(e,xn,t),l(e,Zt,t),l(e,zn,t),M(Vt,e,t),l(e,In,t),l(e,Wt,t),l(e,Cn,t),l(e,Bt,t),l(e,vn,t),M(_t,e,t),l(e,Zn,t),l(e,Qt,t),l(e,Vn,t),M(At,e,t),l(e,Wn,t),M(Rt,e,t),l(e,Bn,t),l(e,Nt,t),l(e,_n,t),M(Xt,e,t),l(e,Qn,t),l(e,Gt,t),l(e,An,t),l(e,qt,t),l(e,Rn,t),M(Lt,e,t),l(e,Nn,t),l(e,Ht,t),l(e,Xn,t),M(Dt,e,t),l(e,Gn,t),M(Et,e,t),l(e,qn,t),l(e,St,t),l(e,Ln,t),M(Pt,e,t),l(e,Hn,t),M(Ft,e,t),l(e,Dn,t),l(e,Yt,t),l(e,En,t),M(Kt,e,t),l(e,Sn,t),l(e,Ot,t),l(e,Pn,t),M(es,e,t),l(e,Fn,t),M(ts,e,t),l(e,Yn,t),l(e,ss,t),l(e,Kn,t),M(ls,e,t),l(e,On,t),l(e,ns,t),l(e,ea,t),M(as,e,t),l(e,ta,t),l(e,is,t),l(e,sa,t),M(ps,e,t),l(e,la,t),l(e,rs,t),l(e,na,t),M(os,e,t),l(e,aa,t),l(e,Ms,t),ia=!0},p(e,[t]){const vi={};t&2&&(vi.$$scope={dirty:t,ctx:e}),T.$set(vi)},i(e){ia\|\|(m(k.$$.fragment,e),m(w.$$.fragment,e),m(x.$$.fragment,e),m(v.$$.fragment,e),m(V.$$.fragment,e),m(_.$$.fragment,e),m(A.$$.fragment,e),m(X.$$.fragment,e),m(L.$$.fragment,e),m(D.$$.fragment,e),m(P.$$.fragment,e),m(F.$$.fragment,e),m(T.$$.fragment,e),m(K.$$.fragment,e),m(ee.$$.fragment,e),m(se.$$.fragment,e),m(le.$$.fragment,e),m(ae.$$.fragment,e),m(ie.$$.fragment,e),m(re.$$.fragment,e),m(oe.$$.fragment,e),m(Me.$$.fragment,e),m(he.$$.fragment,e),m(Je.$$.fragment,e),m(fe.$$.fragment,e),m(Te.$$.fragment,e),m(we.$$.fragment,e),m(be.$$.fragment,e),m(Ue.$$.fragment,e),m(Ie.$$.fragment,e),m(Ce.$$.fragment,e),m(Ze.$$.fragment,e),m(Ve.$$.fragment,e),m(Be.$$.fragment,e),m(Qe.$$.fragment,e),m(Ae.$$.fragment,e),m(Ne.$$.fragment,e),m(Ge.$$.fragment,e),m(He.$$.fragment,e),m(Ee.$$.fragment,e),m(Fe.$$.fragment,e),m(Ke.$$.fragment,e),m(tt.$$.fragment,e),m(lt.$$.fragment,e),m(nt.$$.fragment,e),m(it.$$.fragment,e),m(ot.$$.fragment,e),m(Mt.$$.fragment,e),m(mt.$$.fragment,e),m(ht.$$.fragment,e),m(Jt.$$.fragment,e),m(dt.$$.fragment,e),m(Tt.$$.fragment,e),m(kt.$$.fragment,e),m(wt.$$.fragment,e),m($t.$$.fragment,e),m(Ut.$$.fragment,e),m(zt.$$.fragment,e),m(Ct.$$.fragment,e),m(Vt.$$.fragment,e),m(_t.$$.fragment,e),m(At.$$.fragment,e),m(Rt.$$.fragment,e),m(Xt.$$.fragment,e),m(Lt.$$.fragment,e),m(Dt.$$.fragment,e),m(Et.$$.fragment,e),m(Pt.$$.fragment,e),m(Ft.$$.fragment,e),m(Kt.$$.fragment,e),m(es.$$.fragment,e),m(ts.$$.fragment,e),m(ls.$$.fragment,e),m(as.$$.fragment,e),m(ps.$$.fragment,e),m(os.$$.fragment,e),ia=!0)},o(e){y(k.$$.fragment,e),y(w.$$.fragment,e),y(x.$$.fragment,e),y(v.$$.fragment,e),y(V.$$.fragment,e),y(_.$$.fragment,e),y(A.$$.fragment,e),y(X.$$.fragment,e),y(L.$$.fragment,e),y(D.$$.fragment,e),y(P.$$.fragment,e),y(F.$$.fragment,e),y(T.$$.fragment,e),y(K.$$.fragment,e),y(ee.$$.fragment,e),y(se.$$.fragment,e),y(le.$$.fragment,e),y(ae.$$.fragment,e),y(ie.$$.fragment,e),y(re.$$.fragment,e),y(oe.$$.fragment,e),y(Me.$$.fragment,e),y(he.$$.fragment,e),y(Je.$$.fragment,e),y(fe.$$.fragment,e),y(Te.$$.fragment,e),y(we.$$.fragment,e),y(be.$$.fragment,e),y(Ue.$$.fragment,e),y(Ie.$$.fragment,e),y(Ce.$$.fragment,e),y(Ze.$$.fragment,e),y(Ve.$$.fragment,e),y(Be.$$.fragment,e),y(Qe.$$.fragment,e),y(Ae.$$.fragment,e),y(Ne.$$.fragment,e),y(Ge.$$.fragment,e),y(He.$$.fragment,e),y(Ee.$$.fragment,e),y(Fe.$$.fragment,e),y(Ke.$$.fragment,e),y(tt.$$.fragment,e),y(lt.$$.fragment,e),y(nt.$$.fragment,e),y(it.$$.fragment,e),y(ot.$$.fragment,e),y(Mt.$$.fragment,e),y(mt.$$.fragment,e),y(ht.$$.fragment,e),y(Jt.$$.fragment,e),y(dt.$$.fragment,e),y(Tt.$$.fragment,e),y(kt.$$.fragment,e),y(wt.$$.fragment,e),y($t.$$.fragment,e),y(Ut.$$.fragment,e),y(zt.$$.fragment,e),y(Ct.$$.fragment,e),y(Vt.$$.fragment,e),y(_t.$$.fragment,e),y(At.$$.fragment,e),y(Rt.$$.fragment,e),y(Xt.$$.fragment,e),y(Lt.$$.fragment,e),y(Dt.$$.fragment,e),y(Et.$$.fragment,e),y(Pt.$$.fragment,e),y(Ft.$$.fragment,e),y(Kt.$$.fragment,e),y(es.$$.fragment,e),y(ts.$$.fragment,e),y(ls.$$.fragment,e),y(as.$$.fragment,e),y(ps.$$.fragment,e),y(os.$$.fragment,e),ia=!1},d(e){e&&(s(j),s(d),s(cs),s(hs),s(us),s(b),s(Js),s($),s(ds),s(g),s(fs),s(f),s(Ts),s(U),s(js),s(ks),s(z),s(ws),s(I),s(bs),s(C),s($s),s(gs),s(Z),s(Us),s(xs),s(W),s(zs),s(B),s(Is),s(Cs),s(Q),s(vs),s(Zs),s(R),s(Vs),s(N),s(Ws),s(Bs),s(G),s(_s),s(q),s(Qs),s(As),s(H),s(Rs),s(Ns),s(E),s(Xs),s(S),s(Gs),s(qs),s(Ls),s(Hs),s(Y),s(Ds),s(Es),s(O),s(Ss),s(Ps),s(te),s(Fs),s(Ys),s(Ks),s(ne),s(Os),s(el),s(tl),s(pe),s(sl),s(ll),s(nl),s(ce),s(al),s(il),s(me),s(pl),s(ye),s(rl),s(ol),s(ue),s(cl),s(Ml),s(de),s(ml),s(yl),s(hl),s(je),s(ul),s(ke),s(Jl),s(dl),s(fl),s($e),s(Tl),s(ge),s(jl),s(kl),s(xe),s(wl),s(ze),s(bl),s($l),s(gl),s(ve),s(Ul),s(xl),s(zl),s(We),s(Il),s(Cl),s(_e),s(vl),s(Zl),s(Vl),s(Re),s(Wl),s(Bl),s(Xe),s(_l),s(Ql),s(qe),s(Al),s(Le),s(Rl),s(Nl),s(De),s(Xl),s(Gl),s(Se),s(ql),s(Pe),s(Ll),s(Hl),s(Ye),s(Dl),s(El),s(Oe),s(Sl),s(et),s(Pl),s(Fl),s(st),s(Yl),s(Kl),s(Ol),s(at),s(en),s(tn),s(pt),s(sn),s(rt),s(ln),s(nn),s(ct),s(an),s(pn),s(rn),s(yt),s(on),s(cn),s(ut),s(Mn),s(mn),s(yn),s(ft),s(hn),s(un),s(jt),s(Jn),s(dn),s(fn),s(bt),s(Tn),s(jn),s(gt),s(kn),s(wn),s(xt),s(bn),s($n),s(It),s(gn),s(Un),s(vt),s(xn),s(Zt),s(zn),s(In),s(Wt),s(Cn),s(Bt),s(vn),s(Zn),s(Qt),s(Vn),s(Wn),s(Bn),s(Nt),s(_n),s(Qn),s(Gt),s(An),s(qt),s(Rn),s(Nn),s(Ht),s(Xn),s(Gn),s(qn),s(St),s(Ln),s(Hn),s(Dn),s(Yt),s(En),s(Sn),s(Ot),s(Pn),s(Fn),s(Yn),s(ss),s(Kn),s(On),s(ns),s(ea),s(ta),s(is),s(sa),s(la),s(rs),s(na),s(aa),s(Ms)),s(J),h(k,e),h(w,e),h(x,e),h(v,e),h(V,e),h(_,e),h(A,e),h(X,e),h(L,e),h(D,e),h(P,e),h(F,e),h(T,e),h(K,e),h(ee,e),h(se,e),h(le,e),h(ae,e),h(ie,e),h(re,e),h(oe,e),h(Me,e),h(he,e),h(Je,e),h(fe,e),h(Te,e),h(we,e),h(be,e),h(Ue,e),h(Ie,e),h(Ce,e),h(Ze,e),h(Ve,e),h(Be,e),h(Qe,e),h(Ae,e),h(Ne,e),h(Ge,e),h(He,e),h(Ee,e),h(Fe,e),h(Ke,e),h(tt,e),h(lt,e),h(nt,e),h(it,e),h(ot,e),h(Mt,e),h(mt,e),h(ht,e),h(Jt,e),h(dt,e),h(Tt,e),h(kt,e),h(wt,e),h($t,e),h(Ut,e),h(zt,e),h(Ct,e),h(Vt,e),h(_t,e),h(At,e),h(Rt,e),h(Xt,e),h(Lt,e),h(Dt,e),h(Et,e),h(Pt,e),h(Ft,e),h(Kt,e),h(es,e),h(ts,e),h(ls,e),h(as,e),h(ps,e),h(os,e)}}}const Di='{"title":"Building a tokenizer, block by block","local":"building-a-tokenizer-block-by-block","sections":[{"title":"Acquiring a corpus","local":"acquiring-a-corpus","sections":[],"depth":2},{"title":"Building a WordPiece tokenizer from scratch","local":"building-a-wordpiece-tokenizer-from-scratch","sections":[],"depth":2},{"title":"Building a BPE tokenizer from scratch","local":"building-a-bpe-tokenizer-from-scratch","sections":[],"depth":2},{"title":"Building a Unigram tokenizer from scratch","local":"building-a-unigram-tokenizer-from-scratch","sections":[],"depth":2}],"depth":1}';function Ei(ys){return Wi(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class tp extends _i{constructor(J){super(),Qi(this,J,Ei,Hi,Vi,{})}}export{tp as component};

Xet Storage Details

Size:: 80.8 kB
Xet hash:: 63e85fb79435810216641859ade4b214c92b8270f2050a60f078aca50e4a7398

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.