Buckets:

rtrm's picture
download
raw
72.8 kB
import{s as wn,o as In,n as Ls}from"../chunks/scheduler.37c15a92.js";import{S as dn,i as gn,g as M,s as t,r,m as fn,H as xn,A as bn,h as i,f as a,c as e,j as ka,u as y,x as J,n as Cn,B as qn,k as mn,y as Un,a as n,v as o,d as c,t as j,w as h}from"../chunks/index.7cb9c9b8.js";import{T as Ps}from"../chunks/Tip.d10b3fc9.js";import{Y as kn}from"../chunks/Youtube.8666c400.js";import{C as m}from"../chunks/CodeBlock.abae2786.js";import{C as $n}from"../chunks/CourseFloatingBanner.df82c153.js";import{H as Ca,E as An}from"../chunks/getInferenceSnippets.f9350a3f.js";function zn(w){let p,U="💡 This section covers WordPiece in depth, going as far as showing a full implementation. You can skip to the end if you just want a general overview of the tokenization algorithm.";return{c(){p=M("p"),p.textContent=U},l(u){p=i(u,"P",{"data-svelte-h":!0}),J(p)!=="svelte-7v3wq0"&&(p.textContent=U)},m(u,T){n(u,p,T)},p:Ls,d(u){u&&a(p)}}}function vn(w){let p,U="⚠️ Google never open-sourced its implementation of the training algorithm of WordPiece, so what follows is our best guess based on the published literature. It may not be 100% accurate.";return{c(){p=M("p"),p.textContent=U},l(u){p=i(u,"P",{"data-svelte-h":!0}),J(p)!=="svelte-rd0zod"&&(p.textContent=U)},m(u,T){n(u,p,T)},p:Ls,d(u){u&&a(p)}}}function Qn(w){let p,U="✏️ <strong>Now your turn!</strong> What will the next merge rule be?";return{c(){p=M("p"),p.innerHTML=U},l(u){p=i(u,"P",{"data-svelte-h":!0}),J(p)!=="svelte-4fg9cy"&&(p.innerHTML=U)},m(u,T){n(u,p,T)},p:Ls,d(u){u&&a(p)}}}function Bn(w){let p,U="✏️ <strong>Now your turn!</strong> How will the word <code>&quot;pugs&quot;</code> be tokenized?";return{c(){p=M("p"),p.innerHTML=U},l(u){p=i(u,"P",{"data-svelte-h":!0}),J(p)!=="svelte-1r124bw"&&(p.innerHTML=U)},m(u,T){n(u,p,T)},p:Ls,d(u){u&&a(p)}}}function Nn(w){let p,U="💡 Using <code>train_new_from_iterator()</code> on the same corpus won’t result in the exact same vocabulary. This is because the 🤗 Tokenizers library does not implement WordPiece for the training (since we are not completely sure of its internals), but uses BPE instead.";return{c(){p=M("p"),p.innerHTML=U},l(u){p=i(u,"P",{"data-svelte-h":!0}),J(p)!=="svelte-166hjxq"&&(p.innerHTML=U)},m(u,T){n(u,p,T)},p:Ls,d(u){u&&a(p)}}}function Zn(w){let p,U,u,T,b,Ks,C,Os,q,$a="WordPiece is the tokenization algorithm Google developed to pretrain BERT. It has since been reused in quite a few Transformer models based on BERT, such as DistilBERT, MobileBERT, Funnel Transformers, and MPNET. It’s very similar to BPE in terms of the training, but the actual tokenization is done differently.",sl,k,ll,I,al,$,nl,d,tl,A,Aa="Like BPE, WordPiece starts from a small vocabulary including the special tokens used by the model and the initial alphabet. Since it identifies subwords by adding a prefix (like <code>##</code> for BERT), each word is initially split by adding that prefix to all the characters inside the word. So, for instance, <code>&quot;word&quot;</code> gets split like this:",el,z,pl,v,za="Thus, the initial alphabet contains all the characters present at the beginning of a word and the characters present inside a word preceded by the WordPiece prefix.",Ml,Q,qa,il,Tn='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mrow><mi mathvariant="normal">s</mi><mi mathvariant="normal">c</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi></mrow><mo>=</mo><mo stretchy="false">(</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">p</mi><mi mathvariant="normal">a</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">r</mi></mrow><mo stretchy="false">)</mo><mi mathvariant="normal">/</mi><mo stretchy="false">(</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">i</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">s</mi><mi mathvariant="normal">t</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">l</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">m</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">t</mi></mrow><mo>×</mo><mrow><mi mathvariant="normal">f</mi><mi mathvariant="normal">r</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">q</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">f</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">s</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">c</mi><mi mathvariant="normal">o</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">d</mi><mi mathvariant="normal">_</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">l</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">m</mi><mi mathvariant="normal">e</mi><mi mathvariant="normal">n</mi><mi mathvariant="normal">t</mi></mrow><mo stretchy="false">)</mo></mrow><annotation encoding="application/x-tex">\\mathrm{score} = (\\mathrm{freq\\_of\\_pair}) / (\\mathrm{freq\\_of\\_first\\_element} \\times \\mathrm{freq\\_of\\_second\\_element})</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord"><span class="mord mathrm">score</span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1.06em;vertical-align:-0.31em;"></span><span class="mopen">(</span><span class="mord"><span class="mord mathrm">freq_of_pair</span></span><span class="mclose">)</span><span class="mord">/</span><span class="mopen">(</span><span class="mord"><span class="mord mathrm">freq_of_first_element</span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1.06em;vertical-align:-0.31em;"></span><span class="mord"><span class="mord mathrm">freq_of_second_element</span></span><span class="mclose">)</span></span></span></span></span>',rl,B,va="By dividing the frequency of the pair by the product of the frequencies of each of its parts, the algorithm prioritizes the merging of pairs where the individual parts are less frequent in the vocabulary. For instance, it won’t necessarily merge <code>(&quot;un&quot;, &quot;##able&quot;)</code> even if that pair occurs very frequently in the vocabulary, because the two pairs <code>&quot;un&quot;</code> and <code>&quot;##able&quot;</code> will likely each appear in a lot of other words and have a high frequency. In contrast, a pair like <code>(&quot;hu&quot;, &quot;##gging&quot;)</code> will probably be merged faster (assuming the word “hugging” appears often in the vocabulary) since <code>&quot;hu&quot;</code> and <code>&quot;##gging&quot;</code> are likely to be less frequent individually.",yl,N,Qa="Let’s look at the same vocabulary we used in the BPE training example:",ol,Z,cl,_,Ba="The splits here will be:",jl,E,hl,S,Na="so the initial vocabulary will be <code>[&quot;b&quot;, &quot;h&quot;, &quot;p&quot;, &quot;##g&quot;, &quot;##n&quot;, &quot;##s&quot;, &quot;##u&quot;]</code> (if we forget about special tokens for now). The most frequent pair is <code>(&quot;##u&quot;, &quot;##g&quot;)</code> (present 20 times), but the individual frequency of <code>&quot;##u&quot;</code> is very high, so its score is not the highest (it’s 1 / 36). All pairs with a <code>&quot;##u&quot;</code> actually have that same score (1 / 36), so the best score goes to the pair <code>(&quot;##g&quot;, &quot;##s&quot;)</code> — the only one without a <code>&quot;##u&quot;</code> — at 1 / 20, and the first merge learned is <code>(&quot;##g&quot;, &quot;##s&quot;) -&gt; (&quot;##gs&quot;)</code>.",Jl,G,Za="Note that when we merge, we remove the <code>##</code> between the two tokens, so we add <code>&quot;##gs&quot;</code> to the vocabulary and apply the merge in the words of the corpus:",ul,V,ml,W,_a="At this point, <code>&quot;##u&quot;</code> is in all the possible pairs, so they all end up with the same score. Let’s say that in this case, the first pair is merged, so <code>(&quot;h&quot;, &quot;##u&quot;) -&gt; &quot;hu&quot;</code>. This takes us to:",Ul,X,Tl,H,Ea="Then the next best score is shared by <code>(&quot;hu&quot;, &quot;##g&quot;)</code> and <code>(&quot;hu&quot;, &quot;##gs&quot;)</code> (with 1/15, compared to 1/21 for all the other pairs), so the first pair with the biggest score is merged:",wl,R,Il,Y,Sa="and we continue like this until we reach the desired vocabulary size.",dl,g,gl,D,fl,F,Ga="Tokenization differs in WordPiece and BPE in that WordPiece only saves the final vocabulary, not the merge rules learned. Starting from the word to tokenize, WordPiece finds the longest subword that is in the vocabulary, then splits on it. For instance, if we use the vocabulary learned in the example above, for the word <code>&quot;hugs&quot;</code> the longest subword starting from the beginning that is inside the vocabulary is <code>&quot;hug&quot;</code>, so we split there and get <code>[&quot;hug&quot;, &quot;##s&quot;]</code>. We then continue with <code>&quot;##s&quot;</code>, which is in the vocabulary, so the tokenization of <code>&quot;hugs&quot;</code> is <code>[&quot;hug&quot;, &quot;##s&quot;]</code>.",xl,P,Va="With BPE, we would have applied the merges learned in order and tokenized this as <code>[&quot;hu&quot;, &quot;##gs&quot;]</code>, so the encoding is different.",bl,L,Wa="As another example, let’s see how the word <code>&quot;bugs&quot;</code> would be tokenized. <code>&quot;b&quot;</code> is the longest subword starting at the beginning of the word that is in the vocabulary, so we split there and get <code>[&quot;b&quot;, &quot;##ugs&quot;]</code>. Then <code>&quot;##u&quot;</code> is the longest subword starting at the beginning of <code>&quot;##ugs&quot;</code> that is in the vocabulary, so we split there and get <code>[&quot;b&quot;, &quot;##u, &quot;##gs&quot;]</code>. Finally, <code>&quot;##gs&quot;</code> is in the vocabulary, so this last list is the tokenization of <code>&quot;bugs&quot;</code>.",Cl,K,Xa="When the tokenization gets to a stage where it’s not possible to find a subword in the vocabulary, the whole word is tokenized as unknown — so, for instance, <code>&quot;mug&quot;</code> would be tokenized as <code>[&quot;[UNK]&quot;]</code>, as would <code>&quot;bum&quot;</code> (even if we can begin with <code>&quot;b&quot;</code> and <code>&quot;##u&quot;</code>, <code>&quot;##m&quot;</code> is not the vocabulary, and the resulting tokenization will just be <code>[&quot;[UNK]&quot;]</code>, not <code>[&quot;b&quot;, &quot;##u&quot;, &quot;[UNK]&quot;]</code>). This is another difference from BPE, which would only classify the individual characters not in the vocabulary as unknown.",ql,f,kl,O,$l,ss,Ha="Now let’s take a look at an implementation of the WordPiece algorithm. Like with BPE, this is just pedagogical, and you won’t able to use this on a big corpus.",Al,ls,Ra="We will use the same corpus as in the BPE example:",zl,as,vl,ns,Ya="First, we need to pre-tokenize the corpus into words. Since we are replicating a WordPiece tokenizer (like BERT), we will use the <code>bert-base-cased</code> tokenizer for the pre-tokenization:",Ql,ts,Bl,es,Da="Then we compute the frequencies of each word in the corpus as we do the pre-tokenization:",Nl,ps,Zl,Ms,_l,is,Fa="As we saw before, the alphabet is the unique set composed of all the first letters of words, and all the other letters that appear in words prefixed by <code>##</code>:",El,rs,Sl,ys,Gl,os,Pa="We also add the special tokens used by the model at the beginning of that vocabulary. In the case of BERT, it’s the list <code>[&quot;[PAD]&quot;, &quot;[UNK]&quot;, &quot;[CLS]&quot;, &quot;[SEP]&quot;, &quot;[MASK]&quot;]</code>:",Vl,cs,Wl,js,La="Next we need to split each word, with all the letters that are not the first prefixed by <code>##</code>:",Xl,hs,Hl,Js,Ka="Now that we are ready for training, let’s write a function that computes the score of each pair. We’ll need to use this at each step of the training:",Rl,us,Yl,ms,Oa="Let’s have a look at a part of this dictionary after the initial splits:",Dl,Us,Fl,Ts,Pl,ws,sn="Now, finding the pair with the best score only takes a quick loop:",Ll,Is,Kl,ds,Ol,gs,ln="So the first merge to learn is <code>(&#39;a&#39;, &#39;##b&#39;) -&gt; &#39;ab&#39;</code>, and we add <code>&#39;ab&#39;</code> to the vocabulary:",sa,fs,la,xs,an="To continue, we need to apply that merge in our <code>splits</code> dictionary. Let’s write another function for this:",aa,bs,na,Cs,nn="And we can have a look at the result of the first merge:",ta,qs,ea,ks,pa,$s,tn="Now we have everything we need to loop until we have learned all the merges we want. Let’s aim for a vocab size of 70:",Ma,As,ia,zs,en="We can then look at the generated vocabulary:",ra,vs,ya,Qs,oa,Bs,pn="As we can see, compared to BPE, this tokenizer learns parts of words as tokens a bit faster.",ca,x,ja,Ns,Mn="To tokenize a new text, we pre-tokenize it, split it, then apply the tokenization algorithm on each word. That is, we look for the biggest subword starting at the beginning of the first word and split it, then we repeat the process on the second part, and so on for the rest of that word and the following words in the text:",ha,Zs,Ja,_s,rn="Let’s test it on one word that’s in the vocabulary, and another that isn’t:",ua,Es,ma,Ss,Ua,Gs,yn="Now, let’s write a function that tokenizes a text:",Ta,Vs,wa,Ws,on="We can try it on any text:",Ia,Xs,da,Hs,ga,Rs,cn="That’s it for the WordPiece algorithm! Now let’s take a look at Unigram.",fa,Ys,xa,Fs,ba;return b=new Ca({props:{title:"WordPiece tokenization",local:"wordpiece-tokenization",headingTag:"h1"}}),C=new $n({props:{chapter:6,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section6.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section6.ipynb"}]}}),k=new kn({props:{id:"qpv6ms_t_1A"}}),I=new Ps({props:{$$slots:{default:[zn]},$$scope:{ctx:w}}}),$=new Ca({props:{title:"Training algorithm",local:"training-algorithm",headingTag:"h2"}}),d=new Ps({props:{warning:!0,$$slots:{default:[vn]},$$scope:{ctx:w}}}),z=new m({props:{code:"dyUyMCUyMyUyM28lMjAlMjMlMjNyJTIwJTIzJTIzZA==",highlighted:"w ##o ##r ##d",wrap:!1}}),Z=new m({props:{code:"KCUyMmh1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwdWclMjIlMkMlMjA1KSUyQyUyMCglMjJwdW4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYnVuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHVncyUyMiUyQyUyMDUp",highlighted:'(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)',wrap:!1}}),E=new m({props:{code:"KCUyMmglMjIlMjAlMjIlMjMlMjN1JTIyJTIwJTIyJTIzJTIzZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwJTIyJTIwJTIyJTIzJTIzdSUyMiUyMCUyMiUyMyUyM2clMjIlMkMlMjA1KSUyQyUyMCglMjJwJTIyJTIwJTIyJTIzJTIzdSUyMiUyMCUyMiUyMyUyM24lMjIlMkMlMjAxMiklMkMlMjAoJTIyYiUyMiUyMCUyMiUyMyUyM3UlMjIlMjAlMjIlMjMlMjNuJTIyJTJDJTIwNCklMkMlMjAoJTIyaCUyMiUyMCUyMiUyMyUyM3UlMjIlMjAlMjIlMjMlMjNnJTIyJTIwJTIyJTIzJTIzcyUyMiUyQyUyMDUp",highlighted:'(<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#s</span>&quot;</span>, <span class="hljs-number">5</span>)',wrap:!1}}),V=new m({props:{code:"Vm9jYWJ1bGFyeSUzQSUyMCU1QiUyMmIlMjIlMkMlMjAlMjJoJTIyJTJDJTIwJTIycCUyMiUyQyUyMCUyMiUyMyUyM2clMjIlMkMlMjAlMjIlMjMlMjNuJTIyJTJDJTIwJTIyJTIzJTIzcyUyMiUyQyUyMCUyMiUyMyUyM3UlMjIlMkMlMjAlMjIlMjMlMjNncyUyMiU1RCUwQUNvcnB1cyUzQSUyMCglMjJoJTIyJTIwJTIyJTIzJTIzdSUyMiUyMCUyMiUyMyUyM2clMjIlMkMlMjAxMCklMkMlMjAoJTIycCUyMiUyMCUyMiUyMyUyM3UlMjIlMjAlMjIlMjMlMjNnJTIyJTJDJTIwNSklMkMlMjAoJTIycCUyMiUyMCUyMiUyMyUyM3UlMjIlMjAlMjIlMjMlMjNuJTIyJTJDJTIwMTIpJTJDJTIwKCUyMmIlMjIlMjAlMjIlMjMlMjN1JTIyJTIwJTIyJTIzJTIzbiUyMiUyQyUyMDQpJTJDJTIwKCUyMmglMjIlMjAlMjIlMjMlMjN1JTIyJTIwJTIyJTIzJTIzZ3MlMjIlMkMlMjA1KQ==",highlighted:`Vocabulary: [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#s</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>]
Corpus: (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;h&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-number">5</span>)`,wrap:!1}}),X=new m({props:{code:"Vm9jYWJ1bGFyeSUzQSUyMCU1QiUyMmIlMjIlMkMlMjAlMjJoJTIyJTJDJTIwJTIycCUyMiUyQyUyMCUyMiUyMyUyM2clMjIlMkMlMjAlMjIlMjMlMjNuJTIyJTJDJTIwJTIyJTIzJTIzcyUyMiUyQyUyMCUyMiUyMyUyM3UlMjIlMkMlMjAlMjIlMjMlMjNncyUyMiUyQyUyMCUyMmh1JTIyJTVEJTBBQ29ycHVzJTNBJTIwKCUyMmh1JTIyJTIwJTIyJTIzJTIzZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwJTIyJTIwJTIyJTIzJTIzdSUyMiUyMCUyMiUyMyUyM2clMjIlMkMlMjA1KSUyQyUyMCglMjJwJTIyJTIwJTIyJTIzJTIzdSUyMiUyMCUyMiUyMyUyM24lMjIlMkMlMjAxMiklMkMlMjAoJTIyYiUyMiUyMCUyMiUyMyUyM3UlMjIlMjAlMjIlMjMlMjNuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHUlMjIlMjAlMjIlMjMlMjNncyUyMiUyQyUyMDUp",highlighted:`Vocabulary: [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#s</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-string">&quot;hu&quot;</span>]
Corpus: (<span class="hljs-string">&quot;hu&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;hu&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-number">5</span>)`,wrap:!1}}),R=new m({props:{code:"Vm9jYWJ1bGFyeSUzQSUyMCU1QiUyMmIlMjIlMkMlMjAlMjJoJTIyJTJDJTIwJTIycCUyMiUyQyUyMCUyMiUyMyUyM2clMjIlMkMlMjAlMjIlMjMlMjNuJTIyJTJDJTIwJTIyJTIzJTIzcyUyMiUyQyUyMCUyMiUyMyUyM3UlMjIlMkMlMjAlMjIlMjMlMjNncyUyMiUyQyUyMCUyMmh1JTIyJTJDJTIwJTIyaHVnJTIyJTVEJTBBQ29ycHVzJTNBJTIwKCUyMmh1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwJTIyJTIwJTIyJTIzJTIzdSUyMiUyMCUyMiUyMyUyM2clMjIlMkMlMjA1KSUyQyUyMCglMjJwJTIyJTIwJTIyJTIzJTIzdSUyMiUyMCUyMiUyMyUyM24lMjIlMkMlMjAxMiklMkMlMjAoJTIyYiUyMiUyMCUyMiUyMyUyM3UlMjIlMjAlMjIlMjMlMjNuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHUlMjIlMjAlMjIlMjMlMjNncyUyMiUyQyUyMDUp",highlighted:`Vocabulary: [<span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#s</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span>, <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;hug&quot;</span>]
Corpus: (<span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#g</span>&quot;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&quot;p&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">&quot;b&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#u</span>&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#n</span>&quot;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&quot;hu&quot;</span> <span class="hljs-string">&quot;#<span class="hljs-subst">#gs</span>&quot;</span>, <span class="hljs-number">5</span>)`,wrap:!1}}),g=new Ps({props:{$$slots:{default:[Qn]},$$scope:{ctx:w}}}),D=new Ca({props:{title:"Tokenization algorithm",local:"tokenization-algorithm",headingTag:"h2"}}),f=new Ps({props:{$$slots:{default:[Bn]},$$scope:{ctx:w}}}),O=new Ca({props:{title:"Implementing WordPiece",local:"implementing-wordpiece",headingTag:"h2"}}),as=new m({props:{code:"Y29ycHVzJTIwJTNEJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIyVGhpcyUyMGlzJTIwdGhlJTIwSHVnZ2luZyUyMEZhY2UlMjBDb3Vyc2UuJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIyVGhpcyUyMGNoYXB0ZXIlMjBpcyUyMGFib3V0JTIwdG9rZW5pemF0aW9uLiUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMlRoaXMlMjBzZWN0aW9uJTIwc2hvd3MlMjBzZXZlcmFsJTIwdG9rZW5pemVyJTIwYWxnb3JpdGhtcy4lMjIlMkMlMEElMjAlMjAlMjAlMjAlMjJIb3BlZnVsbHklMkMlMjB5b3UlMjB3aWxsJTIwYmUlMjBhYmxlJTIwdG8lMjB1bmRlcnN0YW5kJTIwaG93JTIwdGhleSUyMGFyZSUyMHRyYWluZWQlMjBhbmQlMjBnZW5lcmF0ZSUyMHRva2Vucy4lMjIlMkMlMEElNUQ=",highlighted:`corpus = [
<span class="hljs-string">&quot;This is the Hugging Face Course.&quot;</span>,
<span class="hljs-string">&quot;This chapter is about tokenization.&quot;</span>,
<span class="hljs-string">&quot;This section shows several tokenizer algorithms.&quot;</span>,
<span class="hljs-string">&quot;Hopefully, you will be able to understand how they are trained and generate tokens.&quot;</span>,
]`,wrap:!1}}),ts=new m({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZCglMjJiZXJ0LWJhc2UtY2FzZWQlMjIp",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;bert-base-cased&quot;</span>)`,wrap:!1}}),ps=new m({props:{code:"ZnJvbSUyMGNvbGxlY3Rpb25zJTIwaW1wb3J0JTIwZGVmYXVsdGRpY3QlMEElMEF3b3JkX2ZyZXFzJTIwJTNEJTIwZGVmYXVsdGRpY3QoaW50KSUwQWZvciUyMHRleHQlMjBpbiUyMGNvcnB1cyUzQSUwQSUyMCUyMCUyMCUyMHdvcmRzX3dpdGhfb2Zmc2V0cyUyMCUzRCUyMHRva2VuaXplci5iYWNrZW5kX3Rva2VuaXplci5wcmVfdG9rZW5pemVyLnByZV90b2tlbml6ZV9zdHIodGV4dCklMEElMjAlMjAlMjAlMjBuZXdfd29yZHMlMjAlM0QlMjAlNUJ3b3JkJTIwZm9yJTIwd29yZCUyQyUyMG9mZnNldCUyMGluJTIwd29yZHNfd2l0aF9vZmZzZXRzJTVEJTBBJTIwJTIwJTIwJTIwZm9yJTIwd29yZCUyMGluJTIwbmV3X3dvcmRzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwd29yZF9mcmVxcyU1QndvcmQlNUQlMjAlMkIlM0QlMjAxJTBBJTBBd29yZF9mcmVxcw==",highlighted:`<span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> defaultdict
word_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> corpus:
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
new_words = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> new_words:
word_freqs[word] += <span class="hljs-number">1</span>
word_freqs`,wrap:!1}}),Ms=new m({props:{code:"ZGVmYXVsdGRpY3QoJTBBJTIwJTIwJTIwJTIwaW50JTJDJTIwJTdCJ1RoaXMnJTNBJTIwMyUyQyUyMCdpcyclM0ElMjAyJTJDJTIwJ3RoZSclM0ElMjAxJTJDJTIwJ0h1Z2dpbmcnJTNBJTIwMSUyQyUyMCdGYWNlJyUzQSUyMDElMkMlMjAnQ291cnNlJyUzQSUyMDElMkMlMjAnLiclM0ElMjA0JTJDJTIwJ2NoYXB0ZXInJTNBJTIwMSUyQyUyMCdhYm91dCclM0ElMjAxJTJDJTBBJTIwJTIwJTIwJTIwJ3Rva2VuaXphdGlvbiclM0ElMjAxJTJDJTIwJ3NlY3Rpb24nJTNBJTIwMSUyQyUyMCdzaG93cyclM0ElMjAxJTJDJTIwJ3NldmVyYWwnJTNBJTIwMSUyQyUyMCd0b2tlbml6ZXInJTNBJTIwMSUyQyUyMCdhbGdvcml0aG1zJyUzQSUyMDElMkMlMjAnSG9wZWZ1bGx5JyUzQSUyMDElMkMlMEElMjAlMjAlMjAlMjAnJTJDJyUzQSUyMDElMkMlMjAneW91JyUzQSUyMDElMkMlMjAnd2lsbCclM0ElMjAxJTJDJTIwJ2JlJyUzQSUyMDElMkMlMjAnYWJsZSclM0ElMjAxJTJDJTIwJ3RvJyUzQSUyMDElMkMlMjAndW5kZXJzdGFuZCclM0ElMjAxJTJDJTIwJ2hvdyclM0ElMjAxJTJDJTIwJ3RoZXknJTNBJTIwMSUyQyUyMCdhcmUnJTNBJTIwMSUyQyUwQSUyMCUyMCUyMCUyMCd0cmFpbmVkJyUzQSUyMDElMkMlMjAnYW5kJyUzQSUyMDElMkMlMjAnZ2VuZXJhdGUnJTNBJTIwMSUyQyUyMCd0b2tlbnMnJTNBJTIwMSU3RCk=",highlighted:`defaultdict(
<span class="hljs-built_in">int</span>, {<span class="hljs-string">&#x27;This&#x27;</span>: <span class="hljs-number">3</span>, <span class="hljs-string">&#x27;is&#x27;</span>: <span class="hljs-number">2</span>, <span class="hljs-string">&#x27;the&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Hugging&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Face&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Course&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;.&#x27;</span>: <span class="hljs-number">4</span>, <span class="hljs-string">&#x27;chapter&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;about&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;tokenization&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;section&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;shows&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;several&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;tokenizer&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;algorithms&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;Hopefully&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;,&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;you&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;will&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;be&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;able&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;to&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;understand&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;how&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;they&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;are&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;trained&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;and&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;generate&#x27;</span>: <span class="hljs-number">1</span>, <span class="hljs-string">&#x27;tokens&#x27;</span>: <span class="hljs-number">1</span>})`,wrap:!1}}),rs=new m({props:{code:"YWxwaGFiZXQlMjAlM0QlMjAlNUIlNUQlMEFmb3IlMjB3b3JkJTIwaW4lMjB3b3JkX2ZyZXFzLmtleXMoKSUzQSUwQSUyMCUyMCUyMCUyMGlmJTIwd29yZCU1QjAlNUQlMjBub3QlMjBpbiUyMGFscGhhYmV0JTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwYWxwaGFiZXQuYXBwZW5kKHdvcmQlNUIwJTVEKSUwQSUyMCUyMCUyMCUyMGZvciUyMGxldHRlciUyMGluJTIwd29yZCU1QjElM0ElNUQlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMGYlMjIlMjMlMjMlN0JsZXR0ZXIlN0QlMjIlMjBub3QlMjBpbiUyMGFscGhhYmV0JTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwYWxwaGFiZXQuYXBwZW5kKGYlMjIlMjMlMjMlN0JsZXR0ZXIlN0QlMjIpJTBBJTBBYWxwaGFiZXQuc29ydCgpJTBBYWxwaGFiZXQlMEElMEFwcmludChhbHBoYWJldCk=",highlighted:`alphabet = []
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys():
<span class="hljs-keyword">if</span> word[<span class="hljs-number">0</span>] <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> alphabet:
alphabet.append(word[<span class="hljs-number">0</span>])
<span class="hljs-keyword">for</span> letter <span class="hljs-keyword">in</span> word[<span class="hljs-number">1</span>:]:
<span class="hljs-keyword">if</span> <span class="hljs-string">f&quot;##<span class="hljs-subst">{letter}</span>&quot;</span> <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> alphabet:
alphabet.append(<span class="hljs-string">f&quot;##<span class="hljs-subst">{letter}</span>&quot;</span>)
alphabet.sort()
alphabet
<span class="hljs-built_in">print</span>(alphabet)`,wrap:!1}}),ys=new m({props:{code:"JTVCJyUyMyUyM2EnJTJDJTIwJyUyMyUyM2InJTJDJTIwJyUyMyUyM2MnJTJDJTIwJyUyMyUyM2QnJTJDJTIwJyUyMyUyM2UnJTJDJTIwJyUyMyUyM2YnJTJDJTIwJyUyMyUyM2cnJTJDJTIwJyUyMyUyM2gnJTJDJTIwJyUyMyUyM2knJTJDJTIwJyUyMyUyM2snJTJDJTIwJyUyMyUyM2wnJTJDJTIwJyUyMyUyM20nJTJDJTIwJyUyMyUyM24nJTJDJTIwJyUyMyUyM28nJTJDJTIwJyUyMyUyM3AnJTJDJTIwJyUyMyUyM3InJTJDJTIwJyUyMyUyM3MnJTJDJTBBJTIwJyUyMyUyM3QnJTJDJTIwJyUyMyUyM3UnJTJDJTIwJyUyMyUyM3YnJTJDJTIwJyUyMyUyM3cnJTJDJTIwJyUyMyUyM3knJTJDJTIwJyUyMyUyM3onJTJDJTIwJyUyQyclMkMlMjAnLiclMkMlMjAnQyclMkMlMjAnRiclMkMlMjAnSCclMkMlMjAnVCclMkMlMjAnYSclMkMlMjAnYiclMkMlMjAnYyclMkMlMjAnZyclMkMlMjAnaCclMkMlMjAnaSclMkMlMjAncyclMkMlMjAndCclMkMlMjAndSclMkMlMEElMjAndyclMkMlMjAneSclNUQ=",highlighted:`[<span class="hljs-string">&#x27;##a&#x27;</span>, <span class="hljs-string">&#x27;##b&#x27;</span>, <span class="hljs-string">&#x27;##c&#x27;</span>, <span class="hljs-string">&#x27;##d&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;##f&#x27;</span>, <span class="hljs-string">&#x27;##g&#x27;</span>, <span class="hljs-string">&#x27;##h&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##k&#x27;</span>, <span class="hljs-string">&#x27;##l&#x27;</span>, <span class="hljs-string">&#x27;##m&#x27;</span>, <span class="hljs-string">&#x27;##n&#x27;</span>, <span class="hljs-string">&#x27;##o&#x27;</span>, <span class="hljs-string">&#x27;##p&#x27;</span>, <span class="hljs-string">&#x27;##r&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>,
<span class="hljs-string">&#x27;##t&#x27;</span>, <span class="hljs-string">&#x27;##u&#x27;</span>, <span class="hljs-string">&#x27;##v&#x27;</span>, <span class="hljs-string">&#x27;##w&#x27;</span>, <span class="hljs-string">&#x27;##y&#x27;</span>, <span class="hljs-string">&#x27;##z&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;C&#x27;</span>, <span class="hljs-string">&#x27;F&#x27;</span>, <span class="hljs-string">&#x27;H&#x27;</span>, <span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;g&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>,
<span class="hljs-string">&#x27;w&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>]`,wrap:!1}}),cs=new m({props:{code:"dm9jYWIlMjAlM0QlMjAlNUIlMjIlNUJQQUQlNUQlMjIlMkMlMjAlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlMkMlMjAlMjIlNUJNQVNLJTVEJTIyJTVEJTIwJTJCJTIwYWxwaGFiZXQuY29weSgp",highlighted:'vocab = [<span class="hljs-string">&quot;[PAD]&quot;</span>, <span class="hljs-string">&quot;[UNK]&quot;</span>, <span class="hljs-string">&quot;[CLS]&quot;</span>, <span class="hljs-string">&quot;[SEP]&quot;</span>, <span class="hljs-string">&quot;[MASK]&quot;</span>] + alphabet.copy()',wrap:!1}}),hs=new m({props:{code:"c3BsaXRzJTIwJTNEJTIwJTdCJTBBJTIwJTIwJTIwJTIwd29yZCUzQSUyMCU1QmMlMjBpZiUyMGklMjAlM0QlM0QlMjAwJTIwZWxzZSUyMGYlMjIlMjMlMjMlN0JjJTdEJTIyJTIwZm9yJTIwaSUyQyUyMGMlMjBpbiUyMGVudW1lcmF0ZSh3b3JkKSU1RCUwQSUyMCUyMCUyMCUyMGZvciUyMHdvcmQlMjBpbiUyMHdvcmRfZnJlcXMua2V5cygpJTBBJTdE",highlighted:`splits = {
word: [c <span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span> <span class="hljs-keyword">else</span> <span class="hljs-string">f&quot;##<span class="hljs-subst">{c}</span>&quot;</span> <span class="hljs-keyword">for</span> i, c <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(word)]
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys()
}`,wrap:!1}}),us=new m({props:{code:"ZGVmJTIwY29tcHV0ZV9wYWlyX3Njb3JlcyhzcGxpdHMpJTNBJTBBJTIwJTIwJTIwJTIwbGV0dGVyX2ZyZXFzJTIwJTNEJTIwZGVmYXVsdGRpY3QoaW50KSUwQSUyMCUyMCUyMCUyMHBhaXJfZnJlcXMlMjAlM0QlMjBkZWZhdWx0ZGljdChpbnQpJTBBJTIwJTIwJTIwJTIwZm9yJTIwd29yZCUyQyUyMGZyZXElMjBpbiUyMHdvcmRfZnJlcXMuaXRlbXMoKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNwbGl0JTIwJTNEJTIwc3BsaXRzJTVCd29yZCU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwbGVuKHNwbGl0KSUyMCUzRCUzRCUyMDElM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBsZXR0ZXJfZnJlcXMlNUJzcGxpdCU1QjAlNUQlNUQlMjAlMkIlM0QlMjBmcmVxJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwY29udGludWUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBmb3IlMjBpJTIwaW4lMjByYW5nZShsZW4oc3BsaXQpJTIwLSUyMDEpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcGFpciUyMCUzRCUyMChzcGxpdCU1QmklNUQlMkMlMjBzcGxpdCU1QmklMjAlMkIlMjAxJTVEKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGxldHRlcl9mcmVxcyU1QnNwbGl0JTVCaSU1RCU1RCUyMCUyQiUzRCUyMGZyZXElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwYWlyX2ZyZXFzJTVCcGFpciU1RCUyMCUyQiUzRCUyMGZyZXElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBsZXR0ZXJfZnJlcXMlNUJzcGxpdCU1Qi0xJTVEJTVEJTIwJTJCJTNEJTIwZnJlcSUwQSUwQSUyMCUyMCUyMCUyMHNjb3JlcyUyMCUzRCUyMCU3QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHBhaXIlM0ElMjBmcmVxJTIwJTJGJTIwKGxldHRlcl9mcmVxcyU1QnBhaXIlNUIwJTVEJTVEJTIwKiUyMGxldHRlcl9mcmVxcyU1QnBhaXIlNUIxJTVEJTVEKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGZvciUyMHBhaXIlMkMlMjBmcmVxJTIwaW4lMjBwYWlyX2ZyZXFzLml0ZW1zKCklMEElMjAlMjAlMjAlMjAlN0QlMEElMjAlMjAlMjAlMjByZXR1cm4lMjBzY29yZXM=",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_pair_scores</span>(<span class="hljs-params">splits</span>):
letter_freqs = defaultdict(<span class="hljs-built_in">int</span>)
pair_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
split = splits[word]
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>:
letter_freqs[split[<span class="hljs-number">0</span>]] += freq
<span class="hljs-keyword">continue</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>):
pair = (split[i], split[i + <span class="hljs-number">1</span>])
letter_freqs[split[i]] += freq
pair_freqs[pair] += freq
letter_freqs[split[-<span class="hljs-number">1</span>]] += freq
scores = {
pair: freq / (letter_freqs[pair[<span class="hljs-number">0</span>]] * letter_freqs[pair[<span class="hljs-number">1</span>]])
<span class="hljs-keyword">for</span> pair, freq <span class="hljs-keyword">in</span> pair_freqs.items()
}
<span class="hljs-keyword">return</span> scores`,wrap:!1}}),Us=new m({props:{code:"cGFpcl9zY29yZXMlMjAlM0QlMjBjb21wdXRlX3BhaXJfc2NvcmVzKHNwbGl0cyklMEFmb3IlMjBpJTJDJTIwa2V5JTIwaW4lMjBlbnVtZXJhdGUocGFpcl9zY29yZXMua2V5cygpKSUzQSUwQSUyMCUyMCUyMCUyMHByaW50KGYlMjIlN0JrZXklN0QlM0ElMjAlN0JwYWlyX3Njb3JlcyU1QmtleSU1RCU3RCUyMiklMEElMjAlMjAlMjAlMjBpZiUyMGklMjAlM0UlM0QlMjA1JTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwYnJlYWs=",highlighted:`pair_scores = compute_pair_scores(splits)
<span class="hljs-keyword">for</span> i, key <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(pair_scores.keys()):
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{key}</span>: <span class="hljs-subst">{pair_scores[key]}</span>&quot;</span>)
<span class="hljs-keyword">if</span> i &gt;= <span class="hljs-number">5</span>:
<span class="hljs-keyword">break</span>`,wrap:!1}}),Ts=new m({props:{code:"KCdUJyUyQyUyMCclMjMlMjNoJyklM0ElMjAwLjEyNSUwQSgnJTIzJTIzaCclMkMlMjAnJTIzJTIzaScpJTNBJTIwMC4wMzQwOTA5MDkwOTA5MDkwOSUwQSgnJTIzJTIzaSclMkMlMjAnJTIzJTIzcycpJTNBJTIwMC4wMjcyNzI3MjcyNzI3MjcyNyUwQSgnaSclMkMlMjAnJTIzJTIzcycpJTNBJTIwMC4xJTBBKCd0JyUyQyUyMCclMjMlMjNoJyklM0ElMjAwLjAzNTcxNDI4NTcxNDI4NTcxJTBBKCclMjMlMjNoJyUyQyUyMCclMjMlMjNlJyklM0ElMjAwLjAxMTkwNDc2MTkwNDc2MTkwNA==",highlighted:`(<span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;##h&#x27;</span>): <span class="hljs-number">0.125</span>
(<span class="hljs-string">&#x27;##h&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>): <span class="hljs-number">0.03409090909090909</span>
(<span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>): <span class="hljs-number">0.02727272727272727</span>
(<span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>): <span class="hljs-number">0.1</span>
(<span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;##h&#x27;</span>): <span class="hljs-number">0.03571428571428571</span>
(<span class="hljs-string">&#x27;##h&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>): <span class="hljs-number">0.011904761904761904</span>`,wrap:!1}}),Is=new m({props:{code:"YmVzdF9wYWlyJTIwJTNEJTIwJTIyJTIyJTBBbWF4X3Njb3JlJTIwJTNEJTIwTm9uZSUwQWZvciUyMHBhaXIlMkMlMjBzY29yZSUyMGluJTIwcGFpcl9zY29yZXMuaXRlbXMoKSUzQSUwQSUyMCUyMCUyMCUyMGlmJTIwbWF4X3Njb3JlJTIwaXMlMjBOb25lJTIwb3IlMjBtYXhfc2NvcmUlMjAlM0MlMjBzY29yZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGJlc3RfcGFpciUyMCUzRCUyMHBhaXIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBtYXhfc2NvcmUlMjAlM0QlMjBzY29yZSUwQSUwQXByaW50KGJlc3RfcGFpciUyQyUyMG1heF9zY29yZSk=",highlighted:`best_pair = <span class="hljs-string">&quot;&quot;</span>
max_score = <span class="hljs-literal">None</span>
<span class="hljs-keyword">for</span> pair, score <span class="hljs-keyword">in</span> pair_scores.items():
<span class="hljs-keyword">if</span> max_score <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_score &lt; score:
best_pair = pair
max_score = score
<span class="hljs-built_in">print</span>(best_pair, max_score)`,wrap:!1}}),ds=new m({props:{code:"KCdhJyUyQyUyMCclMjMlMjNiJyklMjAwLjI=",highlighted:'(<span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;##b&#x27;</span>) <span class="hljs-number">0.2</span>',wrap:!1}}),fs=new m({props:{code:"dm9jYWIuYXBwZW5kKCUyMmFiJTIyKQ==",highlighted:'vocab.append(<span class="hljs-string">&quot;ab&quot;</span>)',wrap:!1}}),bs=new m({props:{code:"ZGVmJTIwbWVyZ2VfcGFpcihhJTJDJTIwYiUyQyUyMHNwbGl0cyklM0ElMEElMjAlMjAlMjAlMjBmb3IlMjB3b3JkJTIwaW4lMjB3b3JkX2ZyZXFzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc3BsaXQlMjAlM0QlMjBzcGxpdHMlNUJ3b3JkJTVEJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaWYlMjBsZW4oc3BsaXQpJTIwJTNEJTNEJTIwMSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGNvbnRpbnVlJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaSUyMCUzRCUyMDAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3aGlsZSUyMGklMjAlM0MlMjBsZW4oc3BsaXQpJTIwLSUyMDElM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMHNwbGl0JTVCaSU1RCUyMCUzRCUzRCUyMGElMjBhbmQlMjBzcGxpdCU1QmklMjAlMkIlMjAxJTVEJTIwJTNEJTNEJTIwYiUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG1lcmdlJTIwJTNEJTIwYSUyMCUyQiUyMGIlNUIyJTNBJTVEJTIwaWYlMjBiLnN0YXJ0c3dpdGgoJTIyJTIzJTIzJTIyKSUyMGVsc2UlMjBhJTIwJTJCJTIwYiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNwbGl0JTIwJTNEJTIwc3BsaXQlNUIlM0FpJTVEJTIwJTJCJTIwJTVCbWVyZ2UlNUQlMjAlMkIlMjBzcGxpdCU1QmklMjAlMkIlMjAyJTIwJTNBJTVEJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGklMjAlMkIlM0QlMjAxJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc3BsaXRzJTVCd29yZCU1RCUyMCUzRCUyMHNwbGl0JTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwc3BsaXRz",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">merge_pair</span>(<span class="hljs-params">a, b, splits</span>):
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs:
split = splits[word]
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>:
<span class="hljs-keyword">continue</span>
i = <span class="hljs-number">0</span>
<span class="hljs-keyword">while</span> i &lt; <span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>:
<span class="hljs-keyword">if</span> split[i] == a <span class="hljs-keyword">and</span> split[i + <span class="hljs-number">1</span>] == b:
merge = a + b[<span class="hljs-number">2</span>:] <span class="hljs-keyword">if</span> b.startswith(<span class="hljs-string">&quot;##&quot;</span>) <span class="hljs-keyword">else</span> a + b
split = split[:i] + [merge] + split[i + <span class="hljs-number">2</span> :]
<span class="hljs-keyword">else</span>:
i += <span class="hljs-number">1</span>
splits[word] = split
<span class="hljs-keyword">return</span> splits`,wrap:!1}}),qs=new m({props:{code:"c3BsaXRzJTIwJTNEJTIwbWVyZ2VfcGFpciglMjJhJTIyJTJDJTIwJTIyJTIzJTIzYiUyMiUyQyUyMHNwbGl0cyklMEFzcGxpdHMlNUIlMjJhYm91dCUyMiU1RA==",highlighted:`splits = merge_pair(<span class="hljs-string">&quot;a&quot;</span>, <span class="hljs-string">&quot;##b&quot;</span>, splits)
splits[<span class="hljs-string">&quot;about&quot;</span>]`,wrap:!1}}),ks=new m({props:{code:"JTVCJ2FiJyUyQyUyMCclMjMlMjNvJyUyQyUyMCclMjMlMjN1JyUyQyUyMCclMjMlMjN0JyU1RA==",highlighted:'[<span class="hljs-string">&#x27;ab&#x27;</span>, <span class="hljs-string">&#x27;##o&#x27;</span>, <span class="hljs-string">&#x27;##u&#x27;</span>, <span class="hljs-string">&#x27;##t&#x27;</span>]',wrap:!1}}),As=new m({props:{code:"dm9jYWJfc2l6ZSUyMCUzRCUyMDcwJTBBd2hpbGUlMjBsZW4odm9jYWIpJTIwJTNDJTIwdm9jYWJfc2l6ZSUzQSUwQSUyMCUyMCUyMCUyMHNjb3JlcyUyMCUzRCUyMGNvbXB1dGVfcGFpcl9zY29yZXMoc3BsaXRzKSUwQSUyMCUyMCUyMCUyMGJlc3RfcGFpciUyQyUyMG1heF9zY29yZSUyMCUzRCUyMCUyMiUyMiUyQyUyME5vbmUlMEElMjAlMjAlMjAlMjBmb3IlMjBwYWlyJTJDJTIwc2NvcmUlMjBpbiUyMHNjb3Jlcy5pdGVtcygpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaWYlMjBtYXhfc2NvcmUlMjBpcyUyME5vbmUlMjBvciUyMG1heF9zY29yZSUyMCUzQyUyMHNjb3JlJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwYmVzdF9wYWlyJTIwJTNEJTIwcGFpciUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG1heF9zY29yZSUyMCUzRCUyMHNjb3JlJTBBJTIwJTIwJTIwJTIwc3BsaXRzJTIwJTNEJTIwbWVyZ2VfcGFpcigqYmVzdF9wYWlyJTJDJTIwc3BsaXRzKSUwQSUyMCUyMCUyMCUyMG5ld190b2tlbiUyMCUzRCUyMCglMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBiZXN0X3BhaXIlNUIwJTVEJTIwJTJCJTIwYmVzdF9wYWlyJTVCMSU1RCU1QjIlM0ElNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMGJlc3RfcGFpciU1QjElNUQuc3RhcnRzd2l0aCglMjIlMjMlMjMlMjIpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUyMGJlc3RfcGFpciU1QjAlNUQlMjAlMkIlMjBiZXN0X3BhaXIlNUIxJTVEJTBBJTIwJTIwJTIwJTIwKSUwQSUyMCUyMCUyMCUyMHZvY2FiLmFwcGVuZChuZXdfdG9rZW4p",highlighted:`vocab_size = <span class="hljs-number">70</span>
<span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(vocab) &lt; vocab_size:
scores = compute_pair_scores(splits)
best_pair, max_score = <span class="hljs-string">&quot;&quot;</span>, <span class="hljs-literal">None</span>
<span class="hljs-keyword">for</span> pair, score <span class="hljs-keyword">in</span> scores.items():
<span class="hljs-keyword">if</span> max_score <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_score &lt; score:
best_pair = pair
max_score = score
splits = merge_pair(*best_pair, splits)
new_token = (
best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>][<span class="hljs-number">2</span>:]
<span class="hljs-keyword">if</span> best_pair[<span class="hljs-number">1</span>].startswith(<span class="hljs-string">&quot;##&quot;</span>)
<span class="hljs-keyword">else</span> best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>]
)
vocab.append(new_token)`,wrap:!1}}),vs=new m({props:{code:"cHJpbnQodm9jYWIp",highlighted:'<span class="hljs-built_in">print</span>(vocab)',wrap:!1}}),Qs=new m({props:{code:"JTVCJyU1QlBBRCU1RCclMkMlMjAnJTVCVU5LJTVEJyUyQyUyMCclNUJDTFMlNUQnJTJDJTIwJyU1QlNFUCU1RCclMkMlMjAnJTVCTUFTSyU1RCclMkMlMjAnJTIzJTIzYSclMkMlMjAnJTIzJTIzYiclMkMlMjAnJTIzJTIzYyclMkMlMjAnJTIzJTIzZCclMkMlMjAnJTIzJTIzZSclMkMlMjAnJTIzJTIzZiclMkMlMjAnJTIzJTIzZyclMkMlMjAnJTIzJTIzaCclMkMlMjAnJTIzJTIzaSclMkMlMjAnJTIzJTIzayclMkMlMEElMjAnJTIzJTIzbCclMkMlMjAnJTIzJTIzbSclMkMlMjAnJTIzJTIzbiclMkMlMjAnJTIzJTIzbyclMkMlMjAnJTIzJTIzcCclMkMlMjAnJTIzJTIzciclMkMlMjAnJTIzJTIzcyclMkMlMjAnJTIzJTIzdCclMkMlMjAnJTIzJTIzdSclMkMlMjAnJTIzJTIzdiclMkMlMjAnJTIzJTIzdyclMkMlMjAnJTIzJTIzeSclMkMlMjAnJTIzJTIzeiclMkMlMjAnJTJDJyUyQyUyMCcuJyUyQyUyMCdDJyUyQyUyMCdGJyUyQyUyMCdIJyUyQyUwQSUyMCdUJyUyQyUyMCdhJyUyQyUyMCdiJyUyQyUyMCdjJyUyQyUyMCdnJyUyQyUyMCdoJyUyQyUyMCdpJyUyQyUyMCdzJyUyQyUyMCd0JyUyQyUyMCd1JyUyQyUyMCd3JyUyQyUyMCd5JyUyQyUyMCdhYiclMkMlMjAnJTIzJTIzZnUnJTJDJTIwJ0ZhJyUyQyUyMCdGYWMnJTJDJTIwJyUyMyUyM2N0JyUyQyUyMCclMjMlMjNmdWwnJTJDJTIwJyUyMyUyM2Z1bGwnJTJDJTIwJyUyMyUyM2Z1bGx5JyUyQyUwQSUyMCdUaCclMkMlMjAnY2gnJTJDJTIwJyUyMyUyM2htJyUyQyUyMCdjaGEnJTJDJTIwJ2NoYXAnJTJDJTIwJ2NoYXB0JyUyQyUyMCclMjMlMjN0aG0nJTJDJTIwJ0h1JyUyQyUyMCdIdWcnJTJDJTIwJ0h1Z2cnJTJDJTIwJ3NoJyUyQyUyMCd0aCclMkMlMjAnaXMnJTJDJTIwJyUyMyUyM3RobXMnJTJDJTIwJyUyMyUyM3phJyUyQyUyMCclMjMlMjN6YXQnJTJDJTBBJTIwJyUyMyUyM3V0JyU1RA==",highlighted:`[<span class="hljs-string">&#x27;[PAD]&#x27;</span>, <span class="hljs-string">&#x27;[UNK]&#x27;</span>, <span class="hljs-string">&#x27;[CLS]&#x27;</span>, <span class="hljs-string">&#x27;[SEP]&#x27;</span>, <span class="hljs-string">&#x27;[MASK]&#x27;</span>, <span class="hljs-string">&#x27;##a&#x27;</span>, <span class="hljs-string">&#x27;##b&#x27;</span>, <span class="hljs-string">&#x27;##c&#x27;</span>, <span class="hljs-string">&#x27;##d&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;##f&#x27;</span>, <span class="hljs-string">&#x27;##g&#x27;</span>, <span class="hljs-string">&#x27;##h&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##k&#x27;</span>,
<span class="hljs-string">&#x27;##l&#x27;</span>, <span class="hljs-string">&#x27;##m&#x27;</span>, <span class="hljs-string">&#x27;##n&#x27;</span>, <span class="hljs-string">&#x27;##o&#x27;</span>, <span class="hljs-string">&#x27;##p&#x27;</span>, <span class="hljs-string">&#x27;##r&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>, <span class="hljs-string">&#x27;##t&#x27;</span>, <span class="hljs-string">&#x27;##u&#x27;</span>, <span class="hljs-string">&#x27;##v&#x27;</span>, <span class="hljs-string">&#x27;##w&#x27;</span>, <span class="hljs-string">&#x27;##y&#x27;</span>, <span class="hljs-string">&#x27;##z&#x27;</span>, <span class="hljs-string">&#x27;,&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;C&#x27;</span>, <span class="hljs-string">&#x27;F&#x27;</span>, <span class="hljs-string">&#x27;H&#x27;</span>,
<span class="hljs-string">&#x27;T&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;b&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;g&#x27;</span>, <span class="hljs-string">&#x27;h&#x27;</span>, <span class="hljs-string">&#x27;i&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;t&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>, <span class="hljs-string">&#x27;w&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>, <span class="hljs-string">&#x27;ab&#x27;</span>, <span class="hljs-string">&#x27;##fu&#x27;</span>, <span class="hljs-string">&#x27;Fa&#x27;</span>, <span class="hljs-string">&#x27;Fac&#x27;</span>, <span class="hljs-string">&#x27;##ct&#x27;</span>, <span class="hljs-string">&#x27;##ful&#x27;</span>, <span class="hljs-string">&#x27;##full&#x27;</span>, <span class="hljs-string">&#x27;##fully&#x27;</span>,
<span class="hljs-string">&#x27;Th&#x27;</span>, <span class="hljs-string">&#x27;ch&#x27;</span>, <span class="hljs-string">&#x27;##hm&#x27;</span>, <span class="hljs-string">&#x27;cha&#x27;</span>, <span class="hljs-string">&#x27;chap&#x27;</span>, <span class="hljs-string">&#x27;chapt&#x27;</span>, <span class="hljs-string">&#x27;##thm&#x27;</span>, <span class="hljs-string">&#x27;Hu&#x27;</span>, <span class="hljs-string">&#x27;Hug&#x27;</span>, <span class="hljs-string">&#x27;Hugg&#x27;</span>, <span class="hljs-string">&#x27;sh&#x27;</span>, <span class="hljs-string">&#x27;th&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;##thms&#x27;</span>, <span class="hljs-string">&#x27;##za&#x27;</span>, <span class="hljs-string">&#x27;##zat&#x27;</span>,
<span class="hljs-string">&#x27;##ut&#x27;</span>]`,wrap:!1}}),x=new Ps({props:{$$slots:{default:[Nn]},$$scope:{ctx:w}}}),Zs=new m({props:{code:"ZGVmJTIwZW5jb2RlX3dvcmQod29yZCklM0ElMEElMjAlMjAlMjAlMjB0b2tlbnMlMjAlM0QlMjAlNUIlNUQlMEElMjAlMjAlMjAlMjB3aGlsZSUyMGxlbih3b3JkKSUyMCUzRSUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpJTIwJTNEJTIwbGVuKHdvcmQpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwd2hpbGUlMjBpJTIwJTNFJTIwMCUyMGFuZCUyMHdvcmQlNUIlM0FpJTVEJTIwbm90JTIwaW4lMjB2b2NhYiUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGklMjAtJTNEJTIwMSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRCUzRCUyMDAlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjByZXR1cm4lMjAlNUIlMjIlNUJVTkslNUQlMjIlNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB0b2tlbnMuYXBwZW5kKHdvcmQlNUIlM0FpJTVEKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHdvcmQlMjAlM0QlMjB3b3JkJTVCaSUzQSU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwbGVuKHdvcmQpJTIwJTNFJTIwMCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHdvcmQlMjAlM0QlMjBmJTIyJTIzJTIzJTdCd29yZCU3RCUyMiUwQSUyMCUyMCUyMCUyMHJldHVybiUyMHRva2Vucw==",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">encode_word</span>(<span class="hljs-params">word</span>):
tokens = []
<span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(word) &gt; <span class="hljs-number">0</span>:
i = <span class="hljs-built_in">len</span>(word)
<span class="hljs-keyword">while</span> i &gt; <span class="hljs-number">0</span> <span class="hljs-keyword">and</span> word[:i] <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> vocab:
i -= <span class="hljs-number">1</span>
<span class="hljs-keyword">if</span> i == <span class="hljs-number">0</span>:
<span class="hljs-keyword">return</span> [<span class="hljs-string">&quot;[UNK]&quot;</span>]
tokens.append(word[:i])
word = word[i:]
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(word) &gt; <span class="hljs-number">0</span>:
word = <span class="hljs-string">f&quot;##<span class="hljs-subst">{word}</span>&quot;</span>
<span class="hljs-keyword">return</span> tokens`,wrap:!1}}),Es=new m({props:{code:"cHJpbnQoZW5jb2RlX3dvcmQoJTIySHVnZ2luZyUyMikpJTBBcHJpbnQoZW5jb2RlX3dvcmQoJTIySE9nZ2luZyUyMikp",highlighted:`<span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;Hugging&quot;</span>))
<span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;HOgging&quot;</span>))`,wrap:!1}}),Ss=new m({props:{code:"JTVCJ0h1Z2cnJTJDJTIwJyUyMyUyM2knJTJDJTIwJyUyMyUyM24nJTJDJTIwJyUyMyUyM2cnJTVEJTBBJTVCJyU1QlVOSyU1RCclNUQ=",highlighted:`[<span class="hljs-string">&#x27;Hugg&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##n&#x27;</span>, <span class="hljs-string">&#x27;##g&#x27;</span>]
[<span class="hljs-string">&#x27;[UNK]&#x27;</span>]`,wrap:!1}}),Vs=new m({props:{code:"ZGVmJTIwdG9rZW5pemUodGV4dCklM0ElMEElMjAlMjAlMjAlMjBwcmVfdG9rZW5pemVfcmVzdWx0JTIwJTNEJTIwdG9rZW5pemVyLl90b2tlbml6ZXIucHJlX3Rva2VuaXplci5wcmVfdG9rZW5pemVfc3RyKHRleHQpJTBBJTIwJTIwJTIwJTIwcHJlX3Rva2VuaXplZF90ZXh0JTIwJTNEJTIwJTVCd29yZCUyMGZvciUyMHdvcmQlMkMlMjBvZmZzZXQlMjBpbiUyMHByZV90b2tlbml6ZV9yZXN1bHQlNUQlMEElMjAlMjAlMjAlMjBlbmNvZGVkX3dvcmRzJTIwJTNEJTIwJTVCZW5jb2RlX3dvcmQod29yZCklMjBmb3IlMjB3b3JkJTIwaW4lMjBwcmVfdG9rZW5pemVkX3RleHQlNUQlMEElMjAlMjAlMjAlMjByZXR1cm4lMjBzdW0oZW5jb2RlZF93b3JkcyUyQyUyMCU1QiU1RCk=",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize</span>(<span class="hljs-params">text</span>):
pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
pre_tokenized_text = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> pre_tokenize_result]
encoded_words = [encode_word(word) <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> pre_tokenized_text]
<span class="hljs-keyword">return</span> <span class="hljs-built_in">sum</span>(encoded_words, [])`,wrap:!1}}),Xs=new m({props:{code:"dG9rZW5pemUoJTIyVGhpcyUyMGlzJTIwdGhlJTIwSHVnZ2luZyUyMEZhY2UlMjBjb3Vyc2UhJTIyKQ==",highlighted:'tokenize(<span class="hljs-string">&quot;This is the Hugging Face course!&quot;</span>)',wrap:!1}}),Hs=new m({props:{code:"JTVCJ1RoJyUyQyUyMCclMjMlMjNpJyUyQyUyMCclMjMlMjNzJyUyQyUyMCdpcyclMkMlMjAndGgnJTJDJTIwJyUyMyUyM2UnJTJDJTIwJ0h1Z2cnJTJDJTIwJyUyMyUyM2knJTJDJTIwJyUyMyUyM24nJTJDJTIwJyUyMyUyM2cnJTJDJTIwJ0ZhYyclMkMlMjAnJTIzJTIzZSclMkMlMjAnYyclMkMlMjAnJTIzJTIzbyclMkMlMjAnJTIzJTIzdSclMkMlMjAnJTIzJTIzciclMkMlMjAnJTIzJTIzcyclMkMlMEElMjAnJTIzJTIzZSclMkMlMjAnJTVCVU5LJTVEJyU1RA==",highlighted:`[<span class="hljs-string">&#x27;Th&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;th&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;Hugg&#x27;</span>, <span class="hljs-string">&#x27;##i&#x27;</span>, <span class="hljs-string">&#x27;##n&#x27;</span>, <span class="hljs-string">&#x27;##g&#x27;</span>, <span class="hljs-string">&#x27;Fac&#x27;</span>, <span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;##o&#x27;</span>, <span class="hljs-string">&#x27;##u&#x27;</span>, <span class="hljs-string">&#x27;##r&#x27;</span>, <span class="hljs-string">&#x27;##s&#x27;</span>,
<span class="hljs-string">&#x27;##e&#x27;</span>, <span class="hljs-string">&#x27;[UNK]&#x27;</span>]`,wrap:!1}}),Ys=new An({props:{source:"https://github.com/huggingface/course/blob/main/chapters/en/chapter6/6.mdx"}}),{c(){p=M("meta"),U=t(),u=M("p"),T=t(),r(b.$$.fragment),Ks=t(),r(C.$$.fragment),Os=t(),q=M("p"),q.textContent=$a,sl=t(),r(k.$$.fragment),ll=t(),r(I.$$.fragment),al=t(),r($.$$.fragment),nl=t(),r(d.$$.fragment),tl=t(),A=M("p"),A.innerHTML=Aa,el=t(),r(z.$$.fragment),pl=t(),v=M("p"),v.textContent=za,Ml=t(),Q=M("p"),qa=fn(`Then, again like BPE, WordPiece learns merge rules. The main difference is the way the pair to be merged is selected. Instead of selecting the most frequent pair, WordPiece computes a score for each pair, using the following formula:
`),il=new xn(!1),rl=t(),B=M("p"),B.innerHTML=va,yl=t(),N=M("p"),N.textContent=Qa,ol=t(),r(Z.$$.fragment),cl=t(),_=M("p"),_.textContent=Ba,jl=t(),r(E.$$.fragment),hl=t(),S=M("p"),S.innerHTML=Na,Jl=t(),G=M("p"),G.innerHTML=Za,ul=t(),r(V.$$.fragment),ml=t(),W=M("p"),W.innerHTML=_a,Ul=t(),r(X.$$.fragment),Tl=t(),H=M("p"),H.innerHTML=Ea,wl=t(),r(R.$$.fragment),Il=t(),Y=M("p"),Y.textContent=Sa,dl=t(),r(g.$$.fragment),gl=t(),r(D.$$.fragment),fl=t(),F=M("p"),F.innerHTML=Ga,xl=t(),P=M("p"),P.innerHTML=Va,bl=t(),L=M("p"),L.innerHTML=Wa,Cl=t(),K=M("p"),K.innerHTML=Xa,ql=t(),r(f.$$.fragment),kl=t(),r(O.$$.fragment),$l=t(),ss=M("p"),ss.textContent=Ha,Al=t(),ls=M("p"),ls.textContent=Ra,zl=t(),r(as.$$.fragment),vl=t(),ns=M("p"),ns.innerHTML=Ya,Ql=t(),r(ts.$$.fragment),Bl=t(),es=M("p"),es.textContent=Da,Nl=t(),r(ps.$$.fragment),Zl=t(),r(Ms.$$.fragment),_l=t(),is=M("p"),is.innerHTML=Fa,El=t(),r(rs.$$.fragment),Sl=t(),r(ys.$$.fragment),Gl=t(),os=M("p"),os.innerHTML=Pa,Vl=t(),r(cs.$$.fragment),Wl=t(),js=M("p"),js.innerHTML=La,Xl=t(),r(hs.$$.fragment),Hl=t(),Js=M("p"),Js.textContent=Ka,Rl=t(),r(us.$$.fragment),Yl=t(),ms=M("p"),ms.textContent=Oa,Dl=t(),r(Us.$$.fragment),Fl=t(),r(Ts.$$.fragment),Pl=t(),ws=M("p"),ws.textContent=sn,Ll=t(),r(Is.$$.fragment),Kl=t(),r(ds.$$.fragment),Ol=t(),gs=M("p"),gs.innerHTML=ln,sa=t(),r(fs.$$.fragment),la=t(),xs=M("p"),xs.innerHTML=an,aa=t(),r(bs.$$.fragment),na=t(),Cs=M("p"),Cs.textContent=nn,ta=t(),r(qs.$$.fragment),ea=t(),r(ks.$$.fragment),pa=t(),$s=M("p"),$s.textContent=tn,Ma=t(),r(As.$$.fragment),ia=t(),zs=M("p"),zs.textContent=en,ra=t(),r(vs.$$.fragment),ya=t(),r(Qs.$$.fragment),oa=t(),Bs=M("p"),Bs.textContent=pn,ca=t(),r(x.$$.fragment),ja=t(),Ns=M("p"),Ns.textContent=Mn,ha=t(),r(Zs.$$.fragment),Ja=t(),_s=M("p"),_s.textContent=rn,ua=t(),r(Es.$$.fragment),ma=t(),r(Ss.$$.fragment),Ua=t(),Gs=M("p"),Gs.textContent=yn,Ta=t(),r(Vs.$$.fragment),wa=t(),Ws=M("p"),Ws.textContent=on,Ia=t(),r(Xs.$$.fragment),da=t(),r(Hs.$$.fragment),ga=t(),Rs=M("p"),Rs.textContent=cn,fa=t(),r(Ys.$$.fragment),xa=t(),Fs=M("p"),this.h()},l(s){const l=bn("svelte-u9bgzb",document.head);p=i(l,"META",{name:!0,content:!0}),l.forEach(a),U=e(s),u=i(s,"P",{}),ka(u).forEach(a),T=e(s),y(b.$$.fragment,s),Ks=e(s),y(C.$$.fragment,s),Os=e(s),q=i(s,"P",{"data-svelte-h":!0}),J(q)!=="svelte-1qentm6"&&(q.textContent=$a),sl=e(s),y(k.$$.fragment,s),ll=e(s),y(I.$$.fragment,s),al=e(s),y($.$$.fragment,s),nl=e(s),y(d.$$.fragment,s),tl=e(s),A=i(s,"P",{"data-svelte-h":!0}),J(A)!=="svelte-103h849"&&(A.innerHTML=Aa),el=e(s),y(z.$$.fragment,s),pl=e(s),v=i(s,"P",{"data-svelte-h":!0}),J(v)!=="svelte-pvxu6x"&&(v.textContent=za),Ml=e(s),Q=i(s,"P",{});var Ds=ka(Q);qa=Cn(Ds,`Then, again like BPE, WordPiece learns merge rules. The main difference is the way the pair to be merged is selected. Instead of selecting the most frequent pair, WordPiece computes a score for each pair, using the following formula:
`),il=qn(Ds,!1),Ds.forEach(a),rl=e(s),B=i(s,"P",{"data-svelte-h":!0}),J(B)!=="svelte-7xl8bf"&&(B.innerHTML=va),yl=e(s),N=i(s,"P",{"data-svelte-h":!0}),J(N)!=="svelte-1reb4z4"&&(N.textContent=Qa),ol=e(s),y(Z.$$.fragment,s),cl=e(s),_=i(s,"P",{"data-svelte-h":!0}),J(_)!=="svelte-pb62xd"&&(_.textContent=Ba),jl=e(s),y(E.$$.fragment,s),hl=e(s),S=i(s,"P",{"data-svelte-h":!0}),J(S)!=="svelte-q0haar"&&(S.innerHTML=Na),Jl=e(s),G=i(s,"P",{"data-svelte-h":!0}),J(G)!=="svelte-12rhmr5"&&(G.innerHTML=Za),ul=e(s),y(V.$$.fragment,s),ml=e(s),W=i(s,"P",{"data-svelte-h":!0}),J(W)!=="svelte-151eli3"&&(W.innerHTML=_a),Ul=e(s),y(X.$$.fragment,s),Tl=e(s),H=i(s,"P",{"data-svelte-h":!0}),J(H)!=="svelte-svusn0"&&(H.innerHTML=Ea),wl=e(s),y(R.$$.fragment,s),Il=e(s),Y=i(s,"P",{"data-svelte-h":!0}),J(Y)!=="svelte-1vaqaxm"&&(Y.textContent=Sa),dl=e(s),y(g.$$.fragment,s),gl=e(s),y(D.$$.fragment,s),fl=e(s),F=i(s,"P",{"data-svelte-h":!0}),J(F)!=="svelte-v4c1z1"&&(F.innerHTML=Ga),xl=e(s),P=i(s,"P",{"data-svelte-h":!0}),J(P)!=="svelte-x3syme"&&(P.innerHTML=Va),bl=e(s),L=i(s,"P",{"data-svelte-h":!0}),J(L)!=="svelte-gkhun5"&&(L.innerHTML=Wa),Cl=e(s),K=i(s,"P",{"data-svelte-h":!0}),J(K)!=="svelte-19tvv0p"&&(K.innerHTML=Xa),ql=e(s),y(f.$$.fragment,s),kl=e(s),y(O.$$.fragment,s),$l=e(s),ss=i(s,"P",{"data-svelte-h":!0}),J(ss)!=="svelte-148jv4w"&&(ss.textContent=Ha),Al=e(s),ls=i(s,"P",{"data-svelte-h":!0}),J(ls)!=="svelte-j22yiv"&&(ls.textContent=Ra),zl=e(s),y(as.$$.fragment,s),vl=e(s),ns=i(s,"P",{"data-svelte-h":!0}),J(ns)!=="svelte-1kypsn8"&&(ns.innerHTML=Ya),Ql=e(s),y(ts.$$.fragment,s),Bl=e(s),es=i(s,"P",{"data-svelte-h":!0}),J(es)!=="svelte-1piuede"&&(es.textContent=Da),Nl=e(s),y(ps.$$.fragment,s),Zl=e(s),y(Ms.$$.fragment,s),_l=e(s),is=i(s,"P",{"data-svelte-h":!0}),J(is)!=="svelte-g6680z"&&(is.innerHTML=Fa),El=e(s),y(rs.$$.fragment,s),Sl=e(s),y(ys.$$.fragment,s),Gl=e(s),os=i(s,"P",{"data-svelte-h":!0}),J(os)!=="svelte-9qkygh"&&(os.innerHTML=Pa),Vl=e(s),y(cs.$$.fragment,s),Wl=e(s),js=i(s,"P",{"data-svelte-h":!0}),J(js)!=="svelte-1k9a5j8"&&(js.innerHTML=La),Xl=e(s),y(hs.$$.fragment,s),Hl=e(s),Js=i(s,"P",{"data-svelte-h":!0}),J(Js)!=="svelte-1mtlptv"&&(Js.textContent=Ka),Rl=e(s),y(us.$$.fragment,s),Yl=e(s),ms=i(s,"P",{"data-svelte-h":!0}),J(ms)!=="svelte-h8brnl"&&(ms.textContent=Oa),Dl=e(s),y(Us.$$.fragment,s),Fl=e(s),y(Ts.$$.fragment,s),Pl=e(s),ws=i(s,"P",{"data-svelte-h":!0}),J(ws)!=="svelte-drj1oh"&&(ws.textContent=sn),Ll=e(s),y(Is.$$.fragment,s),Kl=e(s),y(ds.$$.fragment,s),Ol=e(s),gs=i(s,"P",{"data-svelte-h":!0}),J(gs)!=="svelte-1mufquv"&&(gs.innerHTML=ln),sa=e(s),y(fs.$$.fragment,s),la=e(s),xs=i(s,"P",{"data-svelte-h":!0}),J(xs)!=="svelte-1pmzgqr"&&(xs.innerHTML=an),aa=e(s),y(bs.$$.fragment,s),na=e(s),Cs=i(s,"P",{"data-svelte-h":!0}),J(Cs)!=="svelte-d7zdjw"&&(Cs.textContent=nn),ta=e(s),y(qs.$$.fragment,s),ea=e(s),y(ks.$$.fragment,s),pa=e(s),$s=i(s,"P",{"data-svelte-h":!0}),J($s)!=="svelte-vl065q"&&($s.textContent=tn),Ma=e(s),y(As.$$.fragment,s),ia=e(s),zs=i(s,"P",{"data-svelte-h":!0}),J(zs)!=="svelte-15nhah9"&&(zs.textContent=en),ra=e(s),y(vs.$$.fragment,s),ya=e(s),y(Qs.$$.fragment,s),oa=e(s),Bs=i(s,"P",{"data-svelte-h":!0}),J(Bs)!=="svelte-15fyq8b"&&(Bs.textContent=pn),ca=e(s),y(x.$$.fragment,s),ja=e(s),Ns=i(s,"P",{"data-svelte-h":!0}),J(Ns)!=="svelte-1pemxmn"&&(Ns.textContent=Mn),ha=e(s),y(Zs.$$.fragment,s),Ja=e(s),_s=i(s,"P",{"data-svelte-h":!0}),J(_s)!=="svelte-1hqbv66"&&(_s.textContent=rn),ua=e(s),y(Es.$$.fragment,s),ma=e(s),y(Ss.$$.fragment,s),Ua=e(s),Gs=i(s,"P",{"data-svelte-h":!0}),J(Gs)!=="svelte-1euf4wm"&&(Gs.textContent=yn),Ta=e(s),y(Vs.$$.fragment,s),wa=e(s),Ws=i(s,"P",{"data-svelte-h":!0}),J(Ws)!=="svelte-mvn7u4"&&(Ws.textContent=on),Ia=e(s),y(Xs.$$.fragment,s),da=e(s),y(Hs.$$.fragment,s),ga=e(s),Rs=i(s,"P",{"data-svelte-h":!0}),J(Rs)!=="svelte-1acligk"&&(Rs.textContent=cn),fa=e(s),y(Ys.$$.fragment,s),xa=e(s),Fs=i(s,"P",{}),ka(Fs).forEach(a),this.h()},h(){mn(p,"name","hf:doc:metadata"),mn(p,"content",_n),il.a=null},m(s,l){Un(document.head,p),n(s,U,l),n(s,u,l),n(s,T,l),o(b,s,l),n(s,Ks,l),o(C,s,l),n(s,Os,l),n(s,q,l),n(s,sl,l),o(k,s,l),n(s,ll,l),o(I,s,l),n(s,al,l),o($,s,l),n(s,nl,l),o(d,s,l),n(s,tl,l),n(s,A,l),n(s,el,l),o(z,s,l),n(s,pl,l),n(s,v,l),n(s,Ml,l),n(s,Q,l),Un(Q,qa),il.m(Tn,Q),n(s,rl,l),n(s,B,l),n(s,yl,l),n(s,N,l),n(s,ol,l),o(Z,s,l),n(s,cl,l),n(s,_,l),n(s,jl,l),o(E,s,l),n(s,hl,l),n(s,S,l),n(s,Jl,l),n(s,G,l),n(s,ul,l),o(V,s,l),n(s,ml,l),n(s,W,l),n(s,Ul,l),o(X,s,l),n(s,Tl,l),n(s,H,l),n(s,wl,l),o(R,s,l),n(s,Il,l),n(s,Y,l),n(s,dl,l),o(g,s,l),n(s,gl,l),o(D,s,l),n(s,fl,l),n(s,F,l),n(s,xl,l),n(s,P,l),n(s,bl,l),n(s,L,l),n(s,Cl,l),n(s,K,l),n(s,ql,l),o(f,s,l),n(s,kl,l),o(O,s,l),n(s,$l,l),n(s,ss,l),n(s,Al,l),n(s,ls,l),n(s,zl,l),o(as,s,l),n(s,vl,l),n(s,ns,l),n(s,Ql,l),o(ts,s,l),n(s,Bl,l),n(s,es,l),n(s,Nl,l),o(ps,s,l),n(s,Zl,l),o(Ms,s,l),n(s,_l,l),n(s,is,l),n(s,El,l),o(rs,s,l),n(s,Sl,l),o(ys,s,l),n(s,Gl,l),n(s,os,l),n(s,Vl,l),o(cs,s,l),n(s,Wl,l),n(s,js,l),n(s,Xl,l),o(hs,s,l),n(s,Hl,l),n(s,Js,l),n(s,Rl,l),o(us,s,l),n(s,Yl,l),n(s,ms,l),n(s,Dl,l),o(Us,s,l),n(s,Fl,l),o(Ts,s,l),n(s,Pl,l),n(s,ws,l),n(s,Ll,l),o(Is,s,l),n(s,Kl,l),o(ds,s,l),n(s,Ol,l),n(s,gs,l),n(s,sa,l),o(fs,s,l),n(s,la,l),n(s,xs,l),n(s,aa,l),o(bs,s,l),n(s,na,l),n(s,Cs,l),n(s,ta,l),o(qs,s,l),n(s,ea,l),o(ks,s,l),n(s,pa,l),n(s,$s,l),n(s,Ma,l),o(As,s,l),n(s,ia,l),n(s,zs,l),n(s,ra,l),o(vs,s,l),n(s,ya,l),o(Qs,s,l),n(s,oa,l),n(s,Bs,l),n(s,ca,l),o(x,s,l),n(s,ja,l),n(s,Ns,l),n(s,ha,l),o(Zs,s,l),n(s,Ja,l),n(s,_s,l),n(s,ua,l),o(Es,s,l),n(s,ma,l),o(Ss,s,l),n(s,Ua,l),n(s,Gs,l),n(s,Ta,l),o(Vs,s,l),n(s,wa,l),n(s,Ws,l),n(s,Ia,l),o(Xs,s,l),n(s,da,l),o(Hs,s,l),n(s,ga,l),n(s,Rs,l),n(s,fa,l),o(Ys,s,l),n(s,xa,l),n(s,Fs,l),ba=!0},p(s,[l]){const Ds={};l&2&&(Ds.$$scope={dirty:l,ctx:s}),I.$set(Ds);const jn={};l&2&&(jn.$$scope={dirty:l,ctx:s}),d.$set(jn);const hn={};l&2&&(hn.$$scope={dirty:l,ctx:s}),g.$set(hn);const Jn={};l&2&&(Jn.$$scope={dirty:l,ctx:s}),f.$set(Jn);const un={};l&2&&(un.$$scope={dirty:l,ctx:s}),x.$set(un)},i(s){ba||(c(b.$$.fragment,s),c(C.$$.fragment,s),c(k.$$.fragment,s),c(I.$$.fragment,s),c($.$$.fragment,s),c(d.$$.fragment,s),c(z.$$.fragment,s),c(Z.$$.fragment,s),c(E.$$.fragment,s),c(V.$$.fragment,s),c(X.$$.fragment,s),c(R.$$.fragment,s),c(g.$$.fragment,s),c(D.$$.fragment,s),c(f.$$.fragment,s),c(O.$$.fragment,s),c(as.$$.fragment,s),c(ts.$$.fragment,s),c(ps.$$.fragment,s),c(Ms.$$.fragment,s),c(rs.$$.fragment,s),c(ys.$$.fragment,s),c(cs.$$.fragment,s),c(hs.$$.fragment,s),c(us.$$.fragment,s),c(Us.$$.fragment,s),c(Ts.$$.fragment,s),c(Is.$$.fragment,s),c(ds.$$.fragment,s),c(fs.$$.fragment,s),c(bs.$$.fragment,s),c(qs.$$.fragment,s),c(ks.$$.fragment,s),c(As.$$.fragment,s),c(vs.$$.fragment,s),c(Qs.$$.fragment,s),c(x.$$.fragment,s),c(Zs.$$.fragment,s),c(Es.$$.fragment,s),c(Ss.$$.fragment,s),c(Vs.$$.fragment,s),c(Xs.$$.fragment,s),c(Hs.$$.fragment,s),c(Ys.$$.fragment,s),ba=!0)},o(s){j(b.$$.fragment,s),j(C.$$.fragment,s),j(k.$$.fragment,s),j(I.$$.fragment,s),j($.$$.fragment,s),j(d.$$.fragment,s),j(z.$$.fragment,s),j(Z.$$.fragment,s),j(E.$$.fragment,s),j(V.$$.fragment,s),j(X.$$.fragment,s),j(R.$$.fragment,s),j(g.$$.fragment,s),j(D.$$.fragment,s),j(f.$$.fragment,s),j(O.$$.fragment,s),j(as.$$.fragment,s),j(ts.$$.fragment,s),j(ps.$$.fragment,s),j(Ms.$$.fragment,s),j(rs.$$.fragment,s),j(ys.$$.fragment,s),j(cs.$$.fragment,s),j(hs.$$.fragment,s),j(us.$$.fragment,s),j(Us.$$.fragment,s),j(Ts.$$.fragment,s),j(Is.$$.fragment,s),j(ds.$$.fragment,s),j(fs.$$.fragment,s),j(bs.$$.fragment,s),j(qs.$$.fragment,s),j(ks.$$.fragment,s),j(As.$$.fragment,s),j(vs.$$.fragment,s),j(Qs.$$.fragment,s),j(x.$$.fragment,s),j(Zs.$$.fragment,s),j(Es.$$.fragment,s),j(Ss.$$.fragment,s),j(Vs.$$.fragment,s),j(Xs.$$.fragment,s),j(Hs.$$.fragment,s),j(Ys.$$.fragment,s),ba=!1},d(s){s&&(a(U),a(u),a(T),a(Ks),a(Os),a(q),a(sl),a(ll),a(al),a(nl),a(tl),a(A),a(el),a(pl),a(v),a(Ml),a(Q),a(rl),a(B),a(yl),a(N),a(ol),a(cl),a(_),a(jl),a(hl),a(S),a(Jl),a(G),a(ul),a(ml),a(W),a(Ul),a(Tl),a(H),a(wl),a(Il),a(Y),a(dl),a(gl),a(fl),a(F),a(xl),a(P),a(bl),a(L),a(Cl),a(K),a(ql),a(kl),a($l),a(ss),a(Al),a(ls),a(zl),a(vl),a(ns),a(Ql),a(Bl),a(es),a(Nl),a(Zl),a(_l),a(is),a(El),a(Sl),a(Gl),a(os),a(Vl),a(Wl),a(js),a(Xl),a(Hl),a(Js),a(Rl),a(Yl),a(ms),a(Dl),a(Fl),a(Pl),a(ws),a(Ll),a(Kl),a(Ol),a(gs),a(sa),a(la),a(xs),a(aa),a(na),a(Cs),a(ta),a(ea),a(pa),a($s),a(Ma),a(ia),a(zs),a(ra),a(ya),a(oa),a(Bs),a(ca),a(ja),a(Ns),a(ha),a(Ja),a(_s),a(ua),a(ma),a(Ua),a(Gs),a(Ta),a(wa),a(Ws),a(Ia),a(da),a(ga),a(Rs),a(fa),a(xa),a(Fs)),a(p),h(b,s),h(C,s),h(k,s),h(I,s),h($,s),h(d,s),h(z,s),h(Z,s),h(E,s),h(V,s),h(X,s),h(R,s),h(g,s),h(D,s),h(f,s),h(O,s),h(as,s),h(ts,s),h(ps,s),h(Ms,s),h(rs,s),h(ys,s),h(cs,s),h(hs,s),h(us,s),h(Us,s),h(Ts,s),h(Is,s),h(ds,s),h(fs,s),h(bs,s),h(qs,s),h(ks,s),h(As,s),h(vs,s),h(Qs,s),h(x,s),h(Zs,s),h(Es,s),h(Ss,s),h(Vs,s),h(Xs,s),h(Hs,s),h(Ys,s)}}}const _n='{"title":"WordPiece tokenization","local":"wordpiece-tokenization","sections":[{"title":"Training algorithm","local":"training-algorithm","sections":[],"depth":2},{"title":"Tokenization algorithm","local":"tokenization-algorithm","sections":[],"depth":2},{"title":"Implementing WordPiece","local":"implementing-wordpiece","sections":[],"depth":2}],"depth":1}';function En(w){return In(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Yn extends dn{constructor(p){super(),gn(this,p,En,Zn,wn,{})}}export{Yn as component};

Xet Storage Details

Size:
72.8 kB
·
Xet hash:
d65dfb894794564f28a2f544cb63bc02dc8e05672fafb47a0a00bf87be955366

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.