Buckets:

rtrm's picture
download
raw
79.1 kB
import{s as Tn,o as fn,n as wt}from"../chunks/scheduler.37c15a92.js";import{S as gn,i as Cn,g as p,s as n,r as m,m as mt,H as wa,A as In,h as o,f as e,c as l,j as Ut,u as c,x as r,n as ct,B as ja,k as Un,y as T,a,v as h,d as M,t as u,w as y}from"../chunks/index.7cb9c9b8.js";import{T as yt}from"../chunks/Tip.d10b3fc9.js";import{Y as kn}from"../chunks/Youtube.8666c400.js";import{C as j}from"../chunks/CodeBlock.abae2786.js";import{C as vn}from"../chunks/CourseFloatingBanner.df82c153.js";import{H as dt,E as $n}from"../chunks/getInferenceSnippets.a2135f3c.js";function qn(J){let i,U="💡 This section covers Unigram in depth, going as far as showing a full implementation. You can skip to the end if you just want a general overview of the tokenization algorithm.";return{c(){i=p("p"),i.textContent=U},l(w){i=o(w,"P",{"data-svelte-h":!0}),r(i)!=="svelte-11nzsd1"&&(i.textContent=U)},m(w,d){a(w,i,d)},p:wt,d(w){w&&e(i)}}}function xn(J){let i,U="✏️ <strong>Now your turn!</strong> Write the code to compute the frequencies above and double-check that the results shown are correct, as well as the total sum.";return{c(){i=p("p"),i.innerHTML=U},l(w){i=o(w,"P",{"data-svelte-h":!0}),r(i)!=="svelte-kyc6dp"&&(i.innerHTML=U)},m(w,d){a(w,i,d)},p:wt,d(w){w&&e(i)}}}function An(J){let i,U="✏️ <strong>Now your turn!</strong> Determine the tokenization of the word <code>&quot;huggun&quot;</code>, and its score.";return{c(){i=p("p"),i.innerHTML=U},l(w){i=o(w,"P",{"data-svelte-h":!0}),r(i)!=="svelte-zwh8th"&&(i.innerHTML=U)},m(w,d){a(w,i,d)},p:wt,d(w){w&&e(i)}}}function Zn(J){let i,U="💡 SentencePiece uses a more efficient algorithm called Enhanced Suffix Array (ESA) to create the initial vocabulary.";return{c(){i=p("p"),i.textContent=U},l(w){i=o(w,"P",{"data-svelte-h":!0}),r(i)!=="svelte-1s00c64"&&(i.textContent=U)},m(w,d){a(w,i,d)},p:wt,d(w){w&&e(i)}}}function _n(J){let i,U="💡 This approach is very inefficient, so SentencePiece uses an approximation of the loss of the model without token X: instead 
of starting from scratch, it just replaces token X by its segmentation in the vocabulary that is left. This way, all the scores can be computed at once at the same time as the model loss.";return{c(){i=p("p"),i.textContent=U},l(w){i=o(w,"P",{"data-svelte-h":!0}),r(i)!=="svelte-p8xm7n"&&(i.textContent=U)},m(w,d){a(w,i,d)},p:wt,d(w){w&&e(i)}}}function Nn(J){let i,U="The XLNetTokenizer uses SentencePiece which is why the <code>&quot;_&quot;</code> character is included. To decode with SentencePiece, concatenate all the tokens and replace <code>&quot;_&quot;</code> with a space.";return{c(){i=p("p"),i.innerHTML=U},l(w){i=o(w,"P",{"data-svelte-h":!0}),r(i)!=="svelte-nohclw"&&(i.innerHTML=U)},m(w,d){a(w,i,d)},p:wt,d(w){w&&e(i)}}}function zn(J){let i,U,w,d,Z,Jt,_,bt,N,Ua='The Unigram algorithm is used in combination with <a href="https://huggingface.co/papers/1808.06226" rel="nofollow">SentencePiece</a>, which is the tokenization algorithm used by models like AlBERT, T5, mBART, Big Bird, and XLNet.',Tt,z,da="SentencePiece addresses the fact that not all languages use spaces to separate words. Instead, SentencePiece treats the input as a raw input stream which includes the space in the set of characters to use. Then it can use the Unigram algorithm to construct the appropriate vocabulary.",ft,V,gt,C,Ct,Q,It,B,Ja="Compared to BPE and WordPiece, Unigram works in the other direction: it starts from a big vocabulary and removes tokens from it until it reaches the desired vocabulary size. There are several options to use to build that base vocabulary: we can take the most common substrings in pre-tokenized words, for instance, or apply BPE on the initial corpus with a large vocabulary size.",kt,G,ba="At each step of the training, the Unigram algorithm computes a loss over the corpus given the current vocabulary. 
Then, for each symbol in the vocabulary, the algorithm computes how much the overall loss would increase if the symbol was removed, and looks for the symbols that would increase it the least. Those symbols have a lower effect on the overall loss over the corpus, so in a sense they are “less needed” and are the best candidates for removal.",vt,I,ma,$t,dn='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>p</mi></mrow><annotation encoding="application/x-tex">p</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">p</span></span></span></span>',qt,xt,S,Ta="Note that we never remove the base characters, to make sure any word can be tokenized.",At,R,fa="Now, this is still a bit vague: the main part of the algorithm is to compute a loss over the corpus and see how it changes when we remove some tokens from the vocabulary, but we haven’t explained how to do this yet. This step relies on the tokenization algorithm of a Unigram model, so we’ll dive into this next.",Zt,H,ga="We’ll reuse the corpus from the previous examples:",_t,E,Nt,X,Ca="and for this example, we will take all strict substrings for the initial vocabulary :",zt,W,Vt,K,Qt,Y,Ia="A Unigram model is a type of language model that considers each token to be independent of the tokens before it. It’s the simplest language model, in the sense that the probability of token X given the previous context is just the probability of token X. So, if we used a Unigram language model to generate text, we would always predict the most common token.",Bt,P,ka="The probability of a given token is its frequency (the number of times we find it) in the original corpus, divided by the sum of all frequencies of all tokens in the vocabulary (to make sure the probabilities sum up to 1). 
For instance, <code>&quot;ug&quot;</code> is present in <code>&quot;hug&quot;</code>, <code>&quot;pug&quot;</code>, and <code>&quot;hugs&quot;</code>, so it has a frequency of 20 in our corpus.",Gt,D,va="Here are the frequencies of all the possible subwords in the vocabulary:",St,F,Rt,L,$a="So, the sum of all frequencies is 210, and the probability of the subword <code>&quot;ug&quot;</code> is thus 20/210.",Ht,k,Et,b,ca,ht,qa='["p", "u", "g"]',ha,Mt,xa='"pug"',Ma,Xt,Jn='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>P</mi><mo stretchy="false">(</mo><mo stretchy="false">[</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>p</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">]</mo><mo stretchy="false">)</mo><mo>=</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>p</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>=</mo><mfrac><mn>5</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>36</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>20</mn><mn>210</mn></mfrac><mo>=</mo><mn>0.000389</mn></mrow><annotation encoding="application/x-tex">P([``p&quot;, ``u&quot;, ``g&quot;]) = P(``p&quot;) \\times P(``u&quot;) \\times P(``g&quot;) = 
\\frac{5}{210} \\times \\frac{36}{210} \\times \\frac{20}{210} = 0.000389</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">([</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">])</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" 
style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">5</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span 
class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">36</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">20</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0.000389</span></span></span></span></span>',Wt,g,ua,ut,Aa='["pu", "g"]',ya,Kt,bn='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>P</mi><mo stretchy="false">(</mo><mo 
stretchy="false">[</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>p</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">]</mo><mo stretchy="false">)</mo><mo>=</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>p</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>=</mo><mfrac><mn>5</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>20</mn><mn>210</mn></mfrac><mo>=</mo><mn>0.0022676</mn></mrow><annotation encoding="application/x-tex">P([``pu&quot;, ``g&quot;]) = P(``pu&quot;) \\times P(``g&quot;) = \\frac{5}{210} \\times \\frac{20}{210} = 0.0022676</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">([</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">])</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span 
class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">5</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen 
nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">20</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0.0022676</span></span></span></span></span>',Yt,O,Za="so that one is way more likely. In general, tokenizations with the least tokens possible will have the highest probability (because of that division by 210 repeated for each token), which corresponds to what we want intuitively: to split a word into the least number of tokens possible.",Pt,ss,_a="The tokenization of a word with the Unigram model is then the tokenization with the highest probability. 
In the example of <code>&quot;pug&quot;</code>, here are the probabilities we would get for each possible segmentation:",Dt,ts,Ft,es,Na="So, <code>&quot;pug&quot;</code> would be tokenized as <code>[&quot;p&quot;, &quot;ug&quot;]</code> or <code>[&quot;pu&quot;, &quot;g&quot;]</code>, depending on which of those segmentations is encountered first (note that in a larger corpus, equality cases like this will be rare).",Lt,as,za="In this case, it was easy to find all the possible segmentations and compute their probabilities, but in general it’s going to be a bit harder. There is a classic algorithm used for this, called the <em>Viterbi algorithm</em>. Essentially, we can build a graph to detect the possible segmentations of a given word by saying there is a branch from character <em>a</em> to character <em>b</em> if the subword from <em>a</em> to <em>b</em> is in the vocabulary, and attribute to that branch the probability of the subword.",Ot,ns,Va="To find the path in that graph that is going to have the best score the Viterbi algorithm determines, for each position in the word, the segmentation with the best score that ends at that position. Since we go from the beginning to the end, that best score can be found by looping through all subwords ending at the current position and then using the best tokenization score from the position this subword begins at. Then, we just have to unroll the path taken to arrive at the end.",se,ls,Qa="Let’s take a look at an example using our vocabulary and the word <code>&quot;unhug&quot;</code>. For each position, the subwords with the best scores ending there are the following:",te,ps,ee,os,Ba="Thus <code>&quot;unhug&quot;</code> would be tokenized as <code>[&quot;un&quot;, &quot;hug&quot;]</code>.",ae,v,ne,is,le,rs,Ga="Now that we have seen how the tokenization works, we can dive a little more deeply into the loss used during training. 
At any given stage, this loss is computed by tokenizing every word in the corpus, using the current vocabulary and the Unigram model determined by the frequencies of each token in the corpus (as seen before).",pe,ms,Sa="Each word in the corpus has a score, and the loss is the negative log likelihood of those scores — that is, the sum for all the words in the corpus of all the <code>-log(P(word))</code>.",oe,cs,Ra="Let’s go back to our example with the following corpus:",ie,hs,re,Ms,Ha="The tokenization of each word with their respective scores is:",me,us,ce,ys,Ea="So the loss is:",he,ws,Me,js,Xa="Now we need to compute how removing each token affects the loss. This is rather tedious, so we’ll just do it for two tokens here and save the whole process for when we have code to help us. In this (very) particular case, we had two equivalent tokenizations of all the words: as we saw earlier, for example, <code>&quot;pug&quot;</code> could be tokenized <code>[&quot;p&quot;, &quot;ug&quot;]</code> with the same score. Thus, removing the <code>&quot;pu&quot;</code> token from the vocabulary will give the exact same loss.",ue,Us,Wa="On the other hand, removing <code>&quot;hug&quot;</code> will make the loss worse, because the tokenization of <code>&quot;hug&quot;</code> and <code>&quot;hugs&quot;</code> will become:",ye,ds,we,Js,Ka="These changes will cause the loss to rise by:",je,bs,Ue,Ts,Ya="Therefore, the token <code>&quot;pu&quot;</code> will probably be removed from the vocabulary, but not <code>&quot;hug&quot;</code>.",de,fs,Je,gs,Pa="Now let’s implement everything we’ve seen so far in code. 
Like with BPE and WordPiece, this is not an efficient implementation of the Unigram algorithm (quite the opposite), but it should help you understand it a bit better.",be,Cs,Da="We will use the same corpus as before as an example:",Te,Is,fe,ks,Fa="This time, we will use <code>xlnet-base-cased</code> as our model:",ge,vs,Ce,$s,La="Like for BPE and WordPiece, we begin by counting the number of occurrences of each word in the corpus:",Ie,qs,ke,xs,Oa="Then, we need to initialize our vocabulary to something larger than the vocab size we will want at the end. We have to include all the basic characters (otherwise we won’t be able to tokenize every word), but for the bigger substrings we’ll only keep the most common ones, so we sort them by frequency:",ve,As,$e,Zs,qe,_s,sn="We group the characters with the best subwords to arrive at an initial vocabulary of size 300:",xe,Ns,Ae,$,Ze,zs,tn="Next, we compute the sum of all frequencies, to convert the frequencies into probabilities. For our model we will store the logarithms of the probabilities, because it’s more numerically stable to add logarithms than to multiply small numbers, and this will simplify the computation of the loss of the model:",_e,Vs,Ne,Qs,en="Now the main function is the one that tokenizes words using the Viterbi algorithm. As we saw before, that algorithm computes the best segmentation of each substring of the word, which we will store in a variable named <code>best_segmentations</code>. We will store one dictionary per position in the word (from 0 to its total length), with two keys: the index of the start of the last token in the best segmentation, and the score of the best segmentation. With the index of the start of the last token, we will be able to retrieve the full segmentation once the list is completely populated.",ze,Bs,an="Populating the list is done with just two loops: the main loop goes over each start position, and the second loop tries all substrings beginning at that start position. 
If the substring is in the vocabulary, we have a new segmentation of the word up until that end position, which we compare to what is in <code>best_segmentations</code>.",Ve,Gs,nn="Once the main loop is finished, we just start from the end and hop from one start position to the next, recording the tokens as we go, until we reach the start of the word:",Qe,Ss,Be,Rs,ln="We can already try our initial model on some words:",Ge,Hs,Se,Es,Re,Xs,pn="Now it’s easy to compute the loss of the model on the corpus!",He,Ws,Ee,Ks,on="We can check it works on the model we have:",Xe,Ys,We,Ps,Ke,Ds,rn="Computing the scores for each token is not very hard either; we just have to compute the loss for the models obtained by deleting each token:",Ye,Fs,Pe,Ls,mn="We can try it on a given token:",De,Os,Fe,st,cn="Since <code>&quot;ll&quot;</code> is used in the tokenization of <code>&quot;Hopefully&quot;</code>, and removing it will probably make us use the token <code>&quot;l&quot;</code> twice instead, we expect it will have a positive loss. <code>&quot;his&quot;</code> is only used inside the word <code>&quot;This&quot;</code>, which is tokenized as itself, so we expect it to have a zero loss. Here are the results:",Le,tt,Oe,q,sa,et,hn="With all of this in place, the last thing we need to do is add the special tokens used by the model to the vocabulary, then loop until we have pruned enough tokens from the vocabulary to reach our desired size:",ta,at,ea,nt,Mn="Then, to tokenize some text, we just need to apply the pre-tokenization and then use our <code>encode_word()</code> function:",aa,lt,na,pt,la,x,pa,ot,un="That’s it for Unigram! Hopefully by now you’re feeling like an expert in all things tokenizer. 
In the next section, we will delve into the building blocks of the 🤗 Tokenizers library, and show you how you can use them to build your own tokenizer.",oa,it,ia,jt,ra;return Z=new dt({props:{title:"Unigram tokenization",local:"unigram-tokenization",headingTag:"h1"}}),_=new vn({props:{chapter:6,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section7.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section7.ipynb"}]}}),V=new kn({props:{id:"TGZfZVuF9Yc"}}),C=new yt({props:{$$slots:{default:[qn]},$$scope:{ctx:J}}}),Q=new dt({props:{title:"Training algorithm",local:"training-algorithm",headingTag:"h2"}}),E=new j({props:{code:"KCUyMmh1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwdWclMjIlMkMlMjA1KSUyQyUyMCglMjJwdW4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYnVuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHVncyUyMiUyQyUyMDUp",highlighted:'(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)',wrap:!1}}),W=new 
j({props:{code:"JTVCJTIyaCUyMiUyQyUyMCUyMnUlMjIlMkMlMjAlMjJnJTIyJTJDJTIwJTIyaHUlMjIlMkMlMjAlMjJ1ZyUyMiUyQyUyMCUyMnAlMjIlMkMlMjAlMjJwdSUyMiUyQyUyMCUyMm4lMjIlMkMlMjAlMjJ1biUyMiUyQyUyMCUyMmIlMjIlMkMlMjAlMjJidSUyMiUyQyUyMCUyMnMlMjIlMkMlMjAlMjJodWclMjIlMkMlMjAlMjJncyUyMiUyQyUyMCUyMnVncyUyMiU1RA==",highlighted:'<span class="hljs-selector-attr">[<span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>, <span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>, <span class="hljs-string">&quot;un&quot;</span>, <span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;bu&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-string">&quot;gs&quot;</span>, <span class="hljs-string">&quot;ugs&quot;</span>]</span>',wrap:!1}}),K=new dt({props:{title:"Tokenization algorithm",local:"tokenization-algorithm",headingTag:"h2"}}),F=new j({props:{code:"KCUyMmglMjIlMkMlMjAxNSklMjAoJTIydSUyMiUyQyUyMDM2KSUyMCglMjJnJTIyJTJDJTIwMjApJTIwKCUyMmh1JTIyJTJDJTIwMTUpJTIwKCUyMnVnJTIyJTJDJTIwMjApJTIwKCUyMnAlMjIlMkMlMjAxNyklMjAoJTIycHUlMjIlMkMlMjAxNyklMjAoJTIybiUyMiUyQyUyMDE2KSUwQSglMjJ1biUyMiUyQyUyMDE2KSUyMCglMjJiJTIyJTJDJTIwNCklMjAoJTIyYnUlMjIlMkMlMjA0KSUyMCglMjJzJTIyJTJDJTIwNSklMjAoJTIyaHVnJTIyJTJDJTIwMTUpJTIwKCUyMmdzJTIyJTJDJTIwNSklMjAoJTIydWdzJTIyJTJDJTIwNSk=",highlighted:`(<span class="hljs-string">&quot;h&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;u&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">36</span>) (<span class="hljs-string">&quot;g&quot;</span><span class="hljs-punctuation">,</span> <span 
class="hljs-number">20</span>) (<span class="hljs-string">&quot;hu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;ug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">20</span>) (<span class="hljs-string">&quot;p&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">17</span>) (<span class="hljs-string">&quot;pu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">17</span>) (<span class="hljs-string">&quot;n&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">16</span>)
(<span class="hljs-string">&quot;un&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">16</span>) (<span class="hljs-string">&quot;b&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>) (<span class="hljs-string">&quot;bu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>) (<span class="hljs-string">&quot;s&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>) (<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;gs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>) (<span class="hljs-string">&quot;ugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)`,wrap:!1}}),k=new yt({props:{$$slots:{default:[xn]},$$scope:{ctx:J}}}),ts=new j({props:{code:"JTVCJTIycCUyMiUyQyUyMCUyMnUlMjIlMkMlMjAlMjJnJTIyJTVEJTIwJTNBJTIwMC4wMDAzODklMEElNUIlMjJwJTIyJTJDJTIwJTIydWclMjIlNUQlMjAlM0ElMjAwLjAwMjI2NzYlMEElNUIlMjJwdSUyMiUyQyUyMCUyMmclMjIlNUQlMjAlM0ElMjAwLjAwMjI2NzY=",highlighted:`[<span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] : 0.000389
[<span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>] : 0.0022676
[<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] : 0.0022676`,wrap:!1}}),ps=new j({props:{code:"Q2hhcmFjdGVyJTIwMCUyMCh1KSUzQSUyMCUyMnUlMjIlMjAoc2NvcmUlMjAwLjE3MTQyOSklMEFDaGFyYWN0ZXIlMjAxJTIwKG4pJTNBJTIwJTIydW4lMjIlMjAoc2NvcmUlMjAwLjA3NjE5MSklMEFDaGFyYWN0ZXIlMjAyJTIwKGgpJTNBJTIwJTIydW4lMjIlMjAlMjJoJTIyJTIwKHNjb3JlJTIwMC4wMDU0NDIpJTBBQ2hhcmFjdGVyJTIwMyUyMCh1KSUzQSUyMCUyMnVuJTIyJTIwJTIyaHUlMjIlMjAoc2NvcmUlMjAwLjAwNTQ0MiklMEFDaGFyYWN0ZXIlMjA0JTIwKGcpJTNBJTIwJTIydW4lMjIlMjAlMjJodWclMjIlMjAoc2NvcmUlMjAwLjAwNTQ0Mik=",highlighted:`<span class="hljs-attribute">Character</span> <span class="hljs-number">0</span> (u): <span class="hljs-string">&quot;u&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">171429</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">1</span> (n): <span class="hljs-string">&quot;un&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">076191</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">2</span> (h): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;h&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">3</span> (u): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;hu&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">4</span> (g): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;hug&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)`,wrap:!1}}),v=new yt({props:{$$slots:{default:[An]},$$scope:{ctx:J}}}),is=new dt({props:{title:"Back to training",local:"back-to-training",headingTag:"h2"}}),hs=new j({props:{code:"KCUyMmh1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwdWclMjIlMkMlMjA1KSUyQyUyMCglMjJwdW4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYnVuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHVncyUyMiUyQyUyMDUp",highlighted:'(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)',wrap:!1}}),us=new j({props:{code:"JTIyaHVnJTIyJTNBJTIwJTVCJTIyaHVnJTIyJTVEJTIwKHNjb3JlJTIwMC4wNzE0MjgpJTBBJTIycHVnJTIyJTNBJTIwJTVCJTIycHUlMjIlMkMlMjAlMjJnJTIyJTVEJTIwKHNjb3JlJTIwMC4wMDc3MTApJTBBJTIycHVuJTIyJTNBJTIwJTVCJTIycHUlMjIlMkMlMjAlMjJuJTIyJTVEJTIwKHNjb3JlJTIwMC4wMDYxNjgpJTBBJTIyYnVuJTIyJTNBJTIwJTVCJTIyYnUlMjIlMkMlMjAlMjJuJTIyJTVEJTIwKHNjb3JlJTIwMC4wMDE0NTEpJTBBJTIyaHVncyUyMiUzQSUyMCU1QiUyMmh1ZyUyMiUyQyUyMCUyMnMlMjIlNUQlMjAoc2NvcmUlMjAwLjAwMTcwMSk=",highlighted:`<span class="hljs-string">&quot;hug&quot;</span>: [<span class="hljs-string">&quot;hug&quot;</span>] <span 
class="hljs-comment">(score 0.071428)</span>
<span class="hljs-string">&quot;pug&quot;</span>: [<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] <span class="hljs-comment">(score 0.007710)</span>
<span class="hljs-string">&quot;pun&quot;</span>: [<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>] <span class="hljs-comment">(score 0.006168)</span>
<span class="hljs-string">&quot;bun&quot;</span>: [<span class="hljs-string">&quot;bu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>] <span class="hljs-comment">(score 0.001451)</span>
<span class="hljs-string">&quot;hugs&quot;</span>: [<span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>] <span class="hljs-comment">(score 0.001701)</span>`,wrap:!1}}),ws=new j({props:{code:"MTAlMjAqJTIwKC1sb2coMC4wNzE0MjgpKSUyMCUyQiUyMDUlMjAqJTIwKC1sb2coMC4wMDc3MTApKSUyMCUyQiUyMDEyJTIwKiUyMCgtbG9nKDAuMDA2MTY4KSklMjAlMkIlMjA0JTIwKiUyMCgtbG9nKDAuMDAxNDUxKSklMjAlMkIlMjA1JTIwKiUyMCgtbG9nKDAuMDAxNzAxKSklMjAlM0QlMjAxNjkuOA==",highlighted:'<span class="hljs-attribute">10</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">071428</span>)) + <span class="hljs-number">5</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">007710</span>)) + <span class="hljs-number">12</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">006168</span>)) + <span class="hljs-number">4</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">001451</span>)) + <span class="hljs-number">5</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">001701</span>)) = <span class="hljs-number">169</span>.<span class="hljs-number">8</span>',wrap:!1}}),ds=new j({props:{code:"JTIyaHVnJTIyJTNBJTIwJTVCJTIyaHUlMjIlMkMlMjAlMjJnJTIyJTVEJTIwKHNjb3JlJTIwMC4wMDY4MDIpJTBBJTIyaHVncyUyMiUzQSUyMCU1QiUyMmh1JTIyJTJDJTIwJTIyZ3MlMjIlNUQlMjAoc2NvcmUlMjAwLjAwMTcwMSk=",highlighted:`<span class="hljs-string">&quot;hug&quot;</span>: [<span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] <span class="hljs-comment">(score 0.006802)</span>
<span class="hljs-string">&quot;hugs&quot;</span>: [<span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;gs&quot;</span>] <span class="hljs-comment">(score 0.001701)</span>`,wrap:!1}}),bs=new j({props:{code:"LSUyMDEwJTIwKiUyMCgtbG9nKDAuMDcxNDI4KSklMjAlMkIlMjAxMCUyMColMjAoLWxvZygwLjAwNjgwMikpJTIwJTNEJTIwMjMuNQ==",highlighted:'- <span class="hljs-number">10</span> * (<span class="hljs-name">-log</span>(<span class="hljs-number">0.071428</span>)) + <span class="hljs-number">10</span> * (<span class="hljs-name">-log</span>(<span class="hljs-number">0.006802</span>)) = <span class="hljs-number">23.5</span>',wrap:!1}}),fs=new dt({props:{title:"Implementing Unigram",local:"implementing-unigram",headingTag:"h2"}}),Is=new j({props:{code:"Y29ycHVzJTIwJTNEJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIyVGhpcyUyMGlzJTIwdGhlJTIwSHVnZ2luZyUyMEZhY2UlMjBDb3Vyc2UuJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIyVGhpcyUyMGNoYXB0ZXIlMjBpcyUyMGFib3V0JTIwdG9rZW5pemF0aW9uLiUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMlRoaXMlMjBzZWN0aW9uJTIwc2hvd3MlMjBzZXZlcmFsJTIwdG9rZW5pemVyJTIwYWxnb3JpdGhtcy4lMjIlMkMlMEElMjAlMjAlMjAlMjAlMjJIb3BlZnVsbHklMkMlMjB5b3UlMjB3aWxsJTIwYmUlMjBhYmxlJTIwdG8lMjB1bmRlcnN0YW5kJTIwaG93JTIwdGhleSUyMGFyZSUyMHRyYWluZWQlMjBhbmQlMjBnZW5lcmF0ZSUyMHRva2Vucy4lMjIlMkMlMEElNUQ=",highlighted:`corpus = [
<span class="hljs-string">&quot;This is the Hugging Face Course.&quot;</span>,
<span class="hljs-string">&quot;This chapter is about tokenization.&quot;</span>,
<span class="hljs-string">&quot;This section shows several tokenizer algorithms.&quot;</span>,
<span class="hljs-string">&quot;Hopefully, you will be able to understand how they are trained and generate tokens.&quot;</span>,
]`,wrap:!1}}),vs=new j({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZCglMjJ4bG5ldC1iYXNlLWNhc2VkJTIyKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;xlnet-base-cased&quot;</span>)`,wrap:!1}}),qs=new j({props:{code:"ZnJvbSUyMGNvbGxlY3Rpb25zJTIwaW1wb3J0JTIwZGVmYXVsdGRpY3QlMEElMEF3b3JkX2ZyZXFzJTIwJTNEJTIwZGVmYXVsdGRpY3QoaW50KSUwQWZvciUyMHRleHQlMjBpbiUyMGNvcnB1cyUzQSUwQSUyMCUyMCUyMCUyMHdvcmRzX3dpdGhfb2Zmc2V0cyUyMCUzRCUyMHRva2VuaXplci5iYWNrZW5kX3Rva2VuaXplci5wcmVfdG9rZW5pemVyLnByZV90b2tlbml6ZV9zdHIodGV4dCklMEElMjAlMjAlMjAlMjBuZXdfd29yZHMlMjAlM0QlMjAlNUJ3b3JkJTIwZm9yJTIwd29yZCUyQyUyMG9mZnNldCUyMGluJTIwd29yZHNfd2l0aF9vZmZzZXRzJTVEJTBBJTIwJTIwJTIwJTIwZm9yJTIwd29yZCUyMGluJTIwbmV3X3dvcmRzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwd29yZF9mcmVxcyU1QndvcmQlNUQlMjAlMkIlM0QlMjAxJTBBJTBBd29yZF9mcmVxcw==",highlighted:`<span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> defaultdict
word_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> corpus:
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
new_words = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> new_words:
word_freqs[word] += <span class="hljs-number">1</span>
word_freqs`,wrap:!1}}),As=new j({props:{code:"Y2hhcl9mcmVxcyUyMCUzRCUyMGRlZmF1bHRkaWN0KGludCklMEFzdWJ3b3Jkc19mcmVxcyUyMCUzRCUyMGRlZmF1bHRkaWN0KGludCklMEFmb3IlMjB3b3JkJTJDJTIwZnJlcSUyMGluJTIwd29yZF9mcmVxcy5pdGVtcygpJTNBJTBBJTIwJTIwJTIwJTIwZm9yJTIwaSUyMGluJTIwcmFuZ2UobGVuKHdvcmQpKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGNoYXJfZnJlcXMlNUJ3b3JkJTVCaSU1RCU1RCUyMCUyQiUzRCUyMGZyZXElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBMb29wJTIwdGhyb3VnaCUyMHRoZSUyMHN1YndvcmRzJTIwb2YlMjBsZW5ndGglMjBhdCUyMGxlYXN0JTIwMiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGZvciUyMGolMjBpbiUyMHJhbmdlKGklMjAlMkIlMjAyJTJDJTIwbGVuKHdvcmQpJTIwJTJCJTIwMSklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzdWJ3b3Jkc19mcmVxcyU1QndvcmQlNUJpJTNBaiU1RCU1RCUyMCUyQiUzRCUyMGZyZXElMEElMEElMjMlMjBTb3J0JTIwc3Vid29yZHMlMjBieSUyMGZyZXF1ZW5jeSUwQXNvcnRlZF9zdWJ3b3JkcyUyMCUzRCUyMHNvcnRlZChzdWJ3b3Jkc19mcmVxcy5pdGVtcygpJTJDJTIwa2V5JTNEbGFtYmRhJTIweCUzQSUyMHglNUIxJTVEJTJDJTIwcmV2ZXJzZSUzRFRydWUpJTBBc29ydGVkX3N1YndvcmRzJTVCJTNBMTAlNUQ=",highlighted:`char_freqs = defaultdict(<span class="hljs-built_in">int</span>)
subwords_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word)):
char_freqs[word[i]] += freq
<span class="hljs-comment"># Loop through the subwords of length at least 2</span>
<span class="hljs-keyword">for</span> j <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(i + <span class="hljs-number">2</span>, <span class="hljs-built_in">len</span>(word) + <span class="hljs-number">1</span>):
subwords_freqs[word[i:j]] += freq
<span class="hljs-comment"># Sort subwords by frequency</span>
sorted_subwords = <span class="hljs-built_in">sorted</span>(subwords_freqs.items(), key=<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-number">1</span>], reverse=<span class="hljs-literal">True</span>)
sorted_subwords[:<span class="hljs-number">10</span>]`,wrap:!1}}),Zs=new j({props:{code:"JTVCKCclRTIlOTYlODF0JyUyQyUyMDcpJTJDJTIwKCdpcyclMkMlMjA1KSUyQyUyMCgnZXInJTJDJTIwNSklMkMlMjAoJyVFMiU5NiU4MWEnJTJDJTIwNSklMkMlMjAoJyVFMiU5NiU4MXRvJyUyQyUyMDQpJTJDJTIwKCd0byclMkMlMjA0KSUyQyUyMCgnZW4nJTJDJTIwNCklMkMlMjAoJyVFMiU5NiU4MVQnJTJDJTIwMyklMkMlMjAoJyVFMiU5NiU4MVRoJyUyQyUyMDMpJTJDJTIwKCclRTIlOTYlODFUaGknJTJDJTIwMyklNUQ=",highlighted:'[(<span class="hljs-string">&#x27;▁t&#x27;</span>, <span class="hljs-number">7</span>), (<span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;er&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;▁a&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;▁to&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;en&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;▁T&#x27;</span>, <span class="hljs-number">3</span>), (<span class="hljs-string">&#x27;▁Th&#x27;</span>, <span class="hljs-number">3</span>), (<span class="hljs-string">&#x27;▁Thi&#x27;</span>, <span class="hljs-number">3</span>)]',wrap:!1}}),Ns=new j({props:{code:"dG9rZW5fZnJlcXMlMjAlM0QlMjBsaXN0KGNoYXJfZnJlcXMuaXRlbXMoKSklMjAlMkIlMjBzb3J0ZWRfc3Vid29yZHMlNUIlM0ElMjAzMDAlMjAtJTIwbGVuKGNoYXJfZnJlcXMpJTVEJTBBdG9rZW5fZnJlcXMlMjAlM0QlMjAlN0J0b2tlbiUzQSUyMGZyZXElMjBmb3IlMjB0b2tlbiUyQyUyMGZyZXElMjBpbiUyMHRva2VuX2ZyZXFzJTdE",highlighted:`token_freqs = <span class="hljs-built_in">list</span>(char_freqs.items()) + sorted_subwords[: <span class="hljs-number">300</span> - <span class="hljs-built_in">len</span>(char_freqs)]
token_freqs = {token: freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs}`,wrap:!1}}),$=new yt({props:{$$slots:{default:[Zn]},$$scope:{ctx:J}}}),Vs=new j({props:{code:"ZnJvbSUyMG1hdGglMjBpbXBvcnQlMjBsb2clMEElMEF0b3RhbF9zdW0lMjAlM0QlMjBzdW0oJTVCZnJlcSUyMGZvciUyMHRva2VuJTJDJTIwZnJlcSUyMGluJTIwdG9rZW5fZnJlcXMuaXRlbXMoKSU1RCklMEFtb2RlbCUyMCUzRCUyMCU3QnRva2VuJTNBJTIwLWxvZyhmcmVxJTIwJTJGJTIwdG90YWxfc3VtKSUyMGZvciUyMHRva2VuJTJDJTIwZnJlcSUyMGluJTIwdG9rZW5fZnJlcXMuaXRlbXMoKSU3RA==",highlighted:`<span class="hljs-keyword">from</span> math <span class="hljs-keyword">import</span> log
total_sum = <span class="hljs-built_in">sum</span>([freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()])
model = {token: -log(freq / total_sum) <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()}`,wrap:!1}}),Ss=new j({props:{code:"ZGVmJTIwZW5jb2RlX3dvcmQod29yZCUyQyUyMG1vZGVsKSUzQSUwQSUyMCUyMCUyMCUyMGJlc3Rfc2VnbWVudGF0aW9ucyUyMCUzRCUyMCU1QiU3QiUyMnN0YXJ0JTIyJTNBJTIwMCUyQyUyMCUyMnNjb3JlJTIyJTNBJTIwMSU3RCU1RCUyMCUyQiUyMCU1QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnN0YXJ0JTIyJTNBJTIwTm9uZSUyQyUyMCUyMnNjb3JlJTIyJTNBJTIwTm9uZSU3RCUyMGZvciUyMF8lMjBpbiUyMHJhbmdlKGxlbih3b3JkKSklMEElMjAlMjAlMjAlMjAlNUQlMEElMjAlMjAlMjAlMjBmb3IlMjBzdGFydF9pZHglMjBpbiUyMHJhbmdlKGxlbih3b3JkKSklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBUaGlzJTIwc2hvdWxkJTIwYmUlMjBwcm9wZXJseSUyMGZpbGxlZCUyMGJ5JTIwdGhlJTIwcHJldmlvdXMlMjBzdGVwcyUyMG9mJTIwdGhlJTIwbG9vcCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGJlc3Rfc2NvcmVfYXRfc3RhcnQlMjAlM0QlMjBiZXN0X3NlZ21lbnRhdGlvbnMlNUJzdGFydF9pZHglNUQlNUIlMjJzY29yZSUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGZvciUyMGVuZF9pZHglMjBpbiUyMHJhbmdlKHN0YXJ0X2lkeCUyMCUyQiUyMDElMkMlMjBsZW4od29yZCklMjAlMkIlMjAxKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2VuJTIwJTNEJTIwd29yZCU1QnN0YXJ0X2lkeCUzQWVuZF9pZHglNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMHRva2VuJTIwaW4lMjBtb2RlbCUyMGFuZCUyMGJlc3Rfc2NvcmVfYXRfc3RhcnQlMjBpcyUyMG5vdCUyME5vbmUlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzY29yZSUyMCUzRCUyMG1vZGVsJTVCdG9rZW4lNUQlMjAlMkIlMjBiZXN0X3Njb3JlX2F0X3N0YXJ0JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwSWYlMjB3ZSUyMGhhdmUlMjBmb3VuZCUyMGElMjBiZXR0ZXIlMjBzZWdtZW50YXRpb24lMjBlbmRpbmclMjBhdCUyMGVuZF9pZHglMkMlMjB3ZSUyMHVwZGF0ZSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGJlc3Rfc2VnbWVudGF0aW9ucyU1QmVuZF9pZHglNUQlNUIlMjJzY29yZSUyMiU1RCUyMGlzJTIwTm9uZSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG9yJTIwYmVzdF
9zZWdtZW50YXRpb25zJTVCZW5kX2lkeCU1RCU1QiUyMnNjb3JlJTIyJTVEJTIwJTNFJTIwc2NvcmUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjApJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwYmVzdF9zZWdtZW50YXRpb25zJTVCZW5kX2lkeCU1RCUyMCUzRCUyMCU3QiUyMnN0YXJ0JTIyJTNBJTIwc3RhcnRfaWR4JTJDJTIwJTIyc2NvcmUlMjIlM0ElMjBzY29yZSU3RCUwQSUwQSUyMCUyMCUyMCUyMHNlZ21lbnRhdGlvbiUyMCUzRCUyMGJlc3Rfc2VnbWVudGF0aW9ucyU1Qi0xJTVEJTBBJTIwJTIwJTIwJTIwaWYlMjBzZWdtZW50YXRpb24lNUIlMjJzY29yZSUyMiU1RCUyMGlzJTIwTm9uZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMFdlJTIwZGlkJTIwbm90JTIwZmluZCUyMGElMjB0b2tlbml6YXRpb24lMjBvZiUyMHRoZSUyMHdvcmQlMjAtJTNFJTIwdW5rbm93biUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJldHVybiUyMCU1QiUyMiUzQ3VuayUzRSUyMiU1RCUyQyUyME5vbmUlMEElMEElMjAlMjAlMjAlMjBzY29yZSUyMCUzRCUyMHNlZ21lbnRhdGlvbiU1QiUyMnNjb3JlJTIyJTVEJTBBJTIwJTIwJTIwJTIwc3RhcnQlMjAlM0QlMjBzZWdtZW50YXRpb24lNUIlMjJzdGFydCUyMiU1RCUwQSUyMCUyMCUyMCUyMGVuZCUyMCUzRCUyMGxlbih3b3JkKSUwQSUyMCUyMCUyMCUyMHRva2VucyUyMCUzRCUyMCU1QiU1RCUwQSUyMCUyMCUyMCUyMHdoaWxlJTIwc3RhcnQlMjAhJTNEJTIwMCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2Vucy5pbnNlcnQoMCUyQyUyMHdvcmQlNUJzdGFydCUzQWVuZCU1RCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBuZXh0X3N0YXJ0JTIwJTNEJTIwYmVzdF9zZWdtZW50YXRpb25zJTVCc3RhcnQlNUQlNUIlMjJzdGFydCUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGVuZCUyMCUzRCUyMHN0YXJ0JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc3RhcnQlMjAlM0QlMjBuZXh0X3N0YXJ0JTBBJTIwJTIwJTIwJTIwdG9rZW5zLmluc2VydCgwJTJDJTIwd29yZCU1QnN0YXJ0JTNBZW5kJTVEKSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMHRva2VucyUyQyUyMHNjb3Jl",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">encode_word</span>(<span class="hljs-params">word, model</span>):
best_segmentations = [{<span class="hljs-string">&quot;start&quot;</span>: <span class="hljs-number">0</span>, <span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-number">1</span>}] + [
{<span class="hljs-string">&quot;start&quot;</span>: <span class="hljs-literal">None</span>, <span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-literal">None</span>} <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word))
]
<span class="hljs-keyword">for</span> start_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word)):
<span class="hljs-comment"># This should be properly filled by the previous steps of the loop</span>
best_score_at_start = best_segmentations[start_idx][<span class="hljs-string">&quot;score&quot;</span>]
<span class="hljs-keyword">for</span> end_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(start_idx + <span class="hljs-number">1</span>, <span class="hljs-built_in">len</span>(word) + <span class="hljs-number">1</span>):
token = word[start_idx:end_idx]
<span class="hljs-keyword">if</span> token <span class="hljs-keyword">in</span> model <span class="hljs-keyword">and</span> best_score_at_start <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
score = model[token] + best_score_at_start
<span class="hljs-comment"># If we have found a better segmentation ending at end_idx, we update</span>
<span class="hljs-keyword">if</span> (
best_segmentations[end_idx][<span class="hljs-string">&quot;score&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>
<span class="hljs-keyword">or</span> best_segmentations[end_idx][<span class="hljs-string">&quot;score&quot;</span>] &gt; score
):
best_segmentations[end_idx] = {<span class="hljs-string">&quot;start&quot;</span>: start_idx, <span class="hljs-string">&quot;score&quot;</span>: score}
segmentation = best_segmentations[-<span class="hljs-number">1</span>]
<span class="hljs-keyword">if</span> segmentation[<span class="hljs-string">&quot;score&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>:
<span class="hljs-comment"># We did not find a tokenization of the word -&gt; unknown</span>
<span class="hljs-keyword">return</span> [<span class="hljs-string">&quot;&lt;unk&gt;&quot;</span>], <span class="hljs-literal">None</span>
score = segmentation[<span class="hljs-string">&quot;score&quot;</span>]
start = segmentation[<span class="hljs-string">&quot;start&quot;</span>]
end = <span class="hljs-built_in">len</span>(word)
tokens = []
<span class="hljs-keyword">while</span> start != <span class="hljs-number">0</span>:
tokens.insert(<span class="hljs-number">0</span>, word[start:end])
next_start = best_segmentations[start][<span class="hljs-string">&quot;start&quot;</span>]
end = start
start = next_start
tokens.insert(<span class="hljs-number">0</span>, word[start:end])
<span class="hljs-keyword">return</span> tokens, score`,wrap:!1}}),Hs=new j({props:{code:"cHJpbnQoZW5jb2RlX3dvcmQoJTIySG9wZWZ1bGx5JTIyJTJDJTIwbW9kZWwpKSUwQXByaW50KGVuY29kZV93b3JkKCUyMlRoaXMlMjIlMkMlMjBtb2RlbCkp",highlighted:`<span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;Hopefully&quot;</span>, model))
<span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;This&quot;</span>, model))`,wrap:!1}}),Es=new j({props:{code:"KCU1QidIJyUyQyUyMCdvJyUyQyUyMCdwJyUyQyUyMCdlJyUyQyUyMCdmJyUyQyUyMCd1JyUyQyUyMCdsbCclMkMlMjAneSclNUQlMkMlMjA0MS41MTU3NDk0NjAxNDAyKSUwQSglNUInVGhpcyclNUQlMkMlMjA2LjI4ODI2NzAzMDY5NDUzNSk=",highlighted:`([<span class="hljs-string">&#x27;H&#x27;</span>, <span class="hljs-string">&#x27;o&#x27;</span>, <span class="hljs-string">&#x27;p&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;f&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>, <span class="hljs-string">&#x27;ll&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>], <span class="hljs-number">41.5157494601402</span>)
([<span class="hljs-string">&#x27;This&#x27;</span>], <span class="hljs-number">6.288267030694535</span>)`,wrap:!1}}),Ws=new j({props:{code:"ZGVmJTIwY29tcHV0ZV9sb3NzKG1vZGVsKSUzQSUwQSUyMCUyMCUyMCUyMGxvc3MlMjAlM0QlMjAwJTBBJTIwJTIwJTIwJTIwZm9yJTIwd29yZCUyQyUyMGZyZXElMjBpbiUyMHdvcmRfZnJlcXMuaXRlbXMoKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMF8lMkMlMjB3b3JkX2xvc3MlMjAlM0QlMjBlbmNvZGVfd29yZCh3b3JkJTJDJTIwbW9kZWwpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbG9zcyUyMCUyQiUzRCUyMGZyZXElMjAqJTIwd29yZF9sb3NzJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwbG9zcw==",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_loss</span>(<span class="hljs-params">model</span>):
loss = <span class="hljs-number">0</span>
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
_, word_loss = encode_word(word, model)
loss += freq * word_loss
<span class="hljs-keyword">return</span> loss`,wrap:!1}}),Ys=new j({props:{code:"Y29tcHV0ZV9sb3NzKG1vZGVsKQ==",highlighted:"compute_loss(model)",wrap:!1}}),Ps=new j({props:{code:"NDEzLjEwMzc3NjQyOTQwODc1",highlighted:'<span class="hljs-number">413.10377642940875</span>',wrap:!1}}),Fs=new j({props:{code:"aW1wb3J0JTIwY29weSUwQSUwQSUwQWRlZiUyMGNvbXB1dGVfc2NvcmVzKG1vZGVsKSUzQSUwQSUyMCUyMCUyMCUyMHNjb3JlcyUyMCUzRCUyMCU3QiU3RCUwQSUyMCUyMCUyMCUyMG1vZGVsX2xvc3MlMjAlM0QlMjBjb21wdXRlX2xvc3MobW9kZWwpJTBBJTIwJTIwJTIwJTIwZm9yJTIwdG9rZW4lMkMlMjBzY29yZSUyMGluJTIwbW9kZWwuaXRlbXMoKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMFdlJTIwYWx3YXlzJTIwa2VlcCUyMHRva2VucyUyMG9mJTIwbGVuZ3RoJTIwMSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwbGVuKHRva2VuKSUyMCUzRCUzRCUyMDElM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBjb250aW51ZSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG1vZGVsX3dpdGhvdXRfdG9rZW4lMjAlM0QlMjBjb3B5LmRlZXBjb3B5KG1vZGVsKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMF8lMjAlM0QlMjBtb2RlbF93aXRob3V0X3Rva2VuLnBvcCh0b2tlbiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzY29yZXMlNUJ0b2tlbiU1RCUyMCUzRCUyMGNvbXB1dGVfbG9zcyhtb2RlbF93aXRob3V0X3Rva2VuKSUyMC0lMjBtb2RlbF9sb3NzJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwc2NvcmVz",highlighted:`<span class="hljs-keyword">import</span> copy
<span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_scores</span>(<span class="hljs-params">model</span>):
scores = {}
model_loss = compute_loss(model)
<span class="hljs-keyword">for</span> token, score <span class="hljs-keyword">in</span> model.items():
<span class="hljs-comment"># We always keep tokens of length 1</span>
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(token) == <span class="hljs-number">1</span>:
<span class="hljs-keyword">continue</span>
model_without_token = copy.deepcopy(model)
_ = model_without_token.pop(token)
scores[token] = compute_loss(model_without_token) - model_loss
<span class="hljs-keyword">return</span> scores`,wrap:!1}}),Os=new j({props:{code:"c2NvcmVzJTIwJTNEJTIwY29tcHV0ZV9zY29yZXMobW9kZWwpJTBBcHJpbnQoc2NvcmVzJTVCJTIybGwlMjIlNUQpJTBBcHJpbnQoc2NvcmVzJTVCJTIyaGlzJTIyJTVEKQ==",highlighted:`scores = compute_scores(model)
<span class="hljs-built_in">print</span>(scores[<span class="hljs-string">&quot;ll&quot;</span>])
<span class="hljs-built_in">print</span>(scores[<span class="hljs-string">&quot;his&quot;</span>])`,wrap:!1}}),tt=new j({props:{code:"Ni4zNzY0MTI0MDM2MjM4NzQlMEEwLjA=",highlighted:`<span class="hljs-number">6.376412403623874</span>
<span class="hljs-number">0.0</span>`,wrap:!1}}),q=new yt({props:{$$slots:{default:[_n]},$$scope:{ctx:J}}}),at=new j({props:{code:"cGVyY2VudF90b19yZW1vdmUlMjAlM0QlMjAwLjElMEF3aGlsZSUyMGxlbihtb2RlbCklMjAlM0UlMjAxMDAlM0ElMEElMjAlMjAlMjAlMjBzY29yZXMlMjAlM0QlMjBjb21wdXRlX3Njb3Jlcyhtb2RlbCklMEElMjAlMjAlMjAlMjBzb3J0ZWRfc2NvcmVzJTIwJTNEJTIwc29ydGVkKHNjb3Jlcy5pdGVtcygpJTJDJTIwa2V5JTNEbGFtYmRhJTIweCUzQSUyMHglNUIxJTVEKSUwQSUyMCUyMCUyMCUyMCUyMyUyMFJlbW92ZSUyMHBlcmNlbnRfdG9fcmVtb3ZlJTIwdG9rZW5zJTIwd2l0aCUyMHRoZSUyMGxvd2VzdCUyMHNjb3Jlcy4lMEElMjAlMjAlMjAlMjBmb3IlMjBpJTIwaW4lMjByYW5nZShpbnQobGVuKG1vZGVsKSUyMColMjBwZXJjZW50X3RvX3JlbW92ZSkpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwXyUyMCUzRCUyMHRva2VuX2ZyZXFzLnBvcChzb3J0ZWRfc2NvcmVzJTVCaSU1RCU1QjAlNUQpJTBBJTBBJTIwJTIwJTIwJTIwdG90YWxfc3VtJTIwJTNEJTIwc3VtKCU1QmZyZXElMjBmb3IlMjB0b2tlbiUyQyUyMGZyZXElMjBpbiUyMHRva2VuX2ZyZXFzLml0ZW1zKCklNUQpJTBBJTIwJTIwJTIwJTIwbW9kZWwlMjAlM0QlMjAlN0J0b2tlbiUzQSUyMC1sb2coZnJlcSUyMCUyRiUyMHRvdGFsX3N1bSklMjBmb3IlMjB0b2tlbiUyQyUyMGZyZXElMjBpbiUyMHRva2VuX2ZyZXFzLml0ZW1zKCklN0Q=",highlighted:`percent_to_remove = <span class="hljs-number">0.1</span>
<span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(model) &gt; <span class="hljs-number">100</span>:
scores = compute_scores(model)
sorted_scores = <span class="hljs-built_in">sorted</span>(scores.items(), key=<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-number">1</span>])
<span class="hljs-comment"># Remove percent_to_remove tokens with the lowest scores.</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">int</span>(<span class="hljs-built_in">len</span>(model) * percent_to_remove)):
_ = token_freqs.pop(sorted_scores[i][<span class="hljs-number">0</span>])
total_sum = <span class="hljs-built_in">sum</span>([freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()])
model = {token: -log(freq / total_sum) <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()}`,wrap:!1}}),lt=new j({props:{code:"ZGVmJTIwdG9rZW5pemUodGV4dCUyQyUyMG1vZGVsKSUzQSUwQSUyMCUyMCUyMCUyMHdvcmRzX3dpdGhfb2Zmc2V0cyUyMCUzRCUyMHRva2VuaXplci5iYWNrZW5kX3Rva2VuaXplci5wcmVfdG9rZW5pemVyLnByZV90b2tlbml6ZV9zdHIodGV4dCklMEElMjAlMjAlMjAlMjBwcmVfdG9rZW5pemVkX3RleHQlMjAlM0QlMjAlNUJ3b3JkJTIwZm9yJTIwd29yZCUyQyUyMG9mZnNldCUyMGluJTIwd29yZHNfd2l0aF9vZmZzZXRzJTVEJTBBJTIwJTIwJTIwJTIwZW5jb2RlZF93b3JkcyUyMCUzRCUyMCU1QmVuY29kZV93b3JkKHdvcmQlMkMlMjBtb2RlbCklNUIwJTVEJTIwZm9yJTIwd29yZCUyMGluJTIwcHJlX3Rva2VuaXplZF90ZXh0JTVEJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwc3VtKGVuY29kZWRfd29yZHMlMkMlMjAlNUIlNUQpJTBBJTBBJTBBdG9rZW5pemUoJTIyVGhpcyUyMGlzJTIwdGhlJTIwSHVnZ2luZyUyMEZhY2UlMjBjb3Vyc2UuJTIyJTJDJTIwbW9kZWwp",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize</span>(<span class="hljs-params">text, model</span>):
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
pre_tokenized_text = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
encoded_words = [encode_word(word, model)[<span class="hljs-number">0</span>] <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> pre_tokenized_text]
<span class="hljs-keyword">return</span> <span class="hljs-built_in">sum</span>(encoded_words, [])
tokenize(<span class="hljs-string">&quot;This is the Hugging Face course.&quot;</span>, model)`,wrap:!1}}),pt=new j({props:{code:"JTVCJyVFMiU5NiU4MVRoaXMnJTJDJTIwJyVFMiU5NiU4MWlzJyUyQyUyMCclRTIlOTYlODF0aGUnJTJDJTIwJyVFMiU5NiU4MUh1Z2dpbmcnJTJDJTIwJyVFMiU5NiU4MUZhY2UnJTJDJTIwJyVFMiU5NiU4MSclMkMlMjAnYyclMkMlMjAnb3UnJTJDJTIwJ3InJTJDJTIwJ3MnJTJDJTIwJ2UnJTJDJTIwJy4nJTVE",highlighted:'[<span class="hljs-string">&#x27;▁This&#x27;</span>, <span class="hljs-string">&#x27;▁is&#x27;</span>, <span class="hljs-string">&#x27;▁the&#x27;</span>, <span class="hljs-string">&#x27;▁Hugging&#x27;</span>, <span class="hljs-string">&#x27;▁Face&#x27;</span>, <span class="hljs-string">&#x27;▁&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;ou&#x27;</span>, <span class="hljs-string">&#x27;r&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>]',wrap:!1}}),x=new yt({props:{$$slots:{default:[Nn]},$$scope:{ctx:J}}}),it=new $n({props:{source:"https://github.com/huggingface/course/blob/main/chapters/en/chapter6/7.mdx"}}),{c(){i=p("meta"),U=n(),w=p("p"),d=n(),m(Z.$$.fragment),Jt=n(),m(_.$$.fragment),bt=n(),N=p("p"),N.innerHTML=Ua,Tt=n(),z=p("p"),z.textContent=da,ft=n(),m(V.$$.fragment),gt=n(),m(C.$$.fragment),Ct=n(),m(Q.$$.fragment),It=n(),B=p("p"),B.textContent=Ja,kt=n(),G=p("p"),G.textContent=ba,vt=n(),I=p("p"),ma=mt("This is all a very costly operation, so we don’t just remove the single symbol associated with the lowest loss increase, but the"),$t=new wa(!1),qt=mt(" (\\(p\\) being a hyperparameter you can control, usually 10 or 20) percent of the symbols associated with the lowest loss increase. 
This process is then repeated until the vocabulary has reached the desired size."),xt=n(),S=p("p"),S.textContent=Ta,At=n(),R=p("p"),R.textContent=fa,Zt=n(),H=p("p"),H.textContent=ga,_t=n(),m(E.$$.fragment),Nt=n(),X=p("p"),X.textContent=Ca,zt=n(),m(W.$$.fragment),Vt=n(),m(K.$$.fragment),Qt=n(),Y=p("p"),Y.textContent=Ia,Bt=n(),P=p("p"),P.innerHTML=ka,Gt=n(),D=p("p"),D.textContent=va,St=n(),m(F.$$.fragment),Rt=n(),L=p("p"),L.innerHTML=$a,Ht=n(),m(k.$$.fragment),Et=n(),b=p("p"),ca=mt("Now, to tokenize a given word, we look at all the possible segmentations into tokens and compute the probability of each according to the Unigram model. Since all tokens are considered independent, this probability is just the product of the probability of each token. For instance, the tokenization "),ht=p("code"),ht.textContent=qa,ha=mt(" of "),Mt=p("code"),Mt.textContent=xa,Ma=mt(` has the probability:
`),Xt=new wa(!1),Wt=n(),g=p("p"),ua=mt("Comparatively, the tokenization "),ut=p("code"),ut.textContent=Aa,ya=mt(` has the probability:
`),Kt=new wa(!1),Yt=n(),O=p("p"),O.textContent=Za,Pt=n(),ss=p("p"),ss.innerHTML=_a,Dt=n(),m(ts.$$.fragment),Ft=n(),es=p("p"),es.innerHTML=Na,Lt=n(),as=p("p"),as.innerHTML=za,Ot=n(),ns=p("p"),ns.textContent=Va,se=n(),ls=p("p"),ls.innerHTML=Qa,te=n(),m(ps.$$.fragment),ee=n(),os=p("p"),os.innerHTML=Ba,ae=n(),m(v.$$.fragment),ne=n(),m(is.$$.fragment),le=n(),rs=p("p"),rs.textContent=Ga,pe=n(),ms=p("p"),ms.innerHTML=Sa,oe=n(),cs=p("p"),cs.textContent=Ra,ie=n(),m(hs.$$.fragment),re=n(),Ms=p("p"),Ms.textContent=Ha,me=n(),m(us.$$.fragment),ce=n(),ys=p("p"),ys.textContent=Ea,he=n(),m(ws.$$.fragment),Me=n(),js=p("p"),js.innerHTML=Xa,ue=n(),Us=p("p"),Us.innerHTML=Wa,ye=n(),m(ds.$$.fragment),we=n(),Js=p("p"),Js.textContent=Ka,je=n(),m(bs.$$.fragment),Ue=n(),Ts=p("p"),Ts.innerHTML=Ya,de=n(),m(fs.$$.fragment),Je=n(),gs=p("p"),gs.textContent=Pa,be=n(),Cs=p("p"),Cs.textContent=Da,Te=n(),m(Is.$$.fragment),fe=n(),ks=p("p"),ks.innerHTML=Fa,ge=n(),m(vs.$$.fragment),Ce=n(),$s=p("p"),$s.textContent=La,Ie=n(),m(qs.$$.fragment),ke=n(),xs=p("p"),xs.textContent=Oa,ve=n(),m(As.$$.fragment),$e=n(),m(Zs.$$.fragment),qe=n(),_s=p("p"),_s.textContent=sn,xe=n(),m(Ns.$$.fragment),Ae=n(),m($.$$.fragment),Ze=n(),zs=p("p"),zs.textContent=tn,_e=n(),m(Vs.$$.fragment),Ne=n(),Qs=p("p"),Qs.innerHTML=en,ze=n(),Bs=p("p"),Bs.innerHTML=an,Ve=n(),Gs=p("p"),Gs.textContent=nn,Qe=n(),m(Ss.$$.fragment),Be=n(),Rs=p("p"),Rs.textContent=ln,Ge=n(),m(Hs.$$.fragment),Se=n(),m(Es.$$.fragment),Re=n(),Xs=p("p"),Xs.textContent=pn,He=n(),m(Ws.$$.fragment),Ee=n(),Ks=p("p"),Ks.textContent=on,Xe=n(),m(Ys.$$.fragment),We=n(),m(Ps.$$.fragment),Ke=n(),Ds=p("p"),Ds.textContent=rn,Ye=n(),m(Fs.$$.fragment),Pe=n(),Ls=p("p"),Ls.textContent=mn,De=n(),m(Os.$$.fragment),Fe=n(),st=p("p"),st.innerHTML=cn,Le=n(),m(tt.$$.fragment),Oe=n(),m(q.$$.fragment),sa=n(),et=p("p"),et.textContent=hn,ta=n(),m(at.$$.fragment),ea=n(),nt=p("p"),nt.innerHTML=Mn,aa=n(),m(lt.$$.fragment),na=n(),m(pt.$$.fragment),la=n(),m(x.$$.fragment),pa=n(),ot=p("p"),ot.textCon
tent=un,oa=n(),m(it.$$.fragment),ia=n(),jt=p("p"),this.h()},l(s){const t=In("svelte-u9bgzb",document.head);i=o(t,"META",{name:!0,content:!0}),t.forEach(e),U=l(s),w=o(s,"P",{}),Ut(w).forEach(e),d=l(s),c(Z.$$.fragment,s),Jt=l(s),c(_.$$.fragment,s),bt=l(s),N=o(s,"P",{"data-svelte-h":!0}),r(N)!=="svelte-oo0thc"&&(N.innerHTML=Ua),Tt=l(s),z=o(s,"P",{"data-svelte-h":!0}),r(z)!=="svelte-vloklr"&&(z.textContent=da),ft=l(s),c(V.$$.fragment,s),gt=l(s),c(C.$$.fragment,s),Ct=l(s),c(Q.$$.fragment,s),It=l(s),B=o(s,"P",{"data-svelte-h":!0}),r(B)!=="svelte-16vv0fb"&&(B.textContent=Ja),kt=l(s),G=o(s,"P",{"data-svelte-h":!0}),r(G)!=="svelte-1yvt99t"&&(G.textContent=ba),vt=l(s),I=o(s,"P",{});var rt=Ut(I);ma=ct(rt,"This is all a very costly operation, so we don’t just remove the single symbol associated with the lowest loss increase, but the"),$t=ja(rt,!1),qt=ct(rt," (\\(p\\) being a hyperparameter you can control, usually 10 or 20) percent of the symbols associated with the lowest loss increase. This process is then repeated until the vocabulary has reached the desired size."),rt.forEach(e),xt=l(s),S=o(s,"P",{"data-svelte-h":!0}),r(S)!=="svelte-1bd68el"&&(S.textContent=Ta),At=l(s),R=o(s,"P",{"data-svelte-h":!0}),r(R)!=="svelte-18sgwt4"&&(R.textContent=fa),Zt=l(s),H=o(s,"P",{"data-svelte-h":!0}),r(H)!=="svelte-1rnxwlp"&&(H.textContent=ga),_t=l(s),c(E.$$.fragment,s),Nt=l(s),X=o(s,"P",{"data-svelte-h":!0}),r(X)!=="svelte-1sp8vuv"&&(X.textContent=Ca),zt=l(s),c(W.$$.fragment,s),Vt=l(s),c(K.$$.fragment,s),Qt=l(s),Y=o(s,"P",{"data-svelte-h":!0}),r(Y)!=="svelte-17i13c1"&&(Y.textContent=Ia),Bt=l(s),P=o(s,"P",{"data-svelte-h":!0}),r(P)!=="svelte-1ywxof8"&&(P.innerHTML=ka),Gt=l(s),D=o(s,"P",{"data-svelte-h":!0}),r(D)!=="svelte-1im6u9i"&&(D.textContent=va),St=l(s),c(F.$$.fragment,s),Rt=l(s),L=o(s,"P",{"data-svelte-h":!0}),r(L)!=="svelte-16cln8v"&&(L.innerHTML=$a),Ht=l(s),c(k.$$.fragment,s),Et=l(s),b=o(s,"P",{});var f=Ut(b);ca=ct(f,"Now, to tokenize a given word, we look at all the possible 
segmentations into tokens and compute the probability of each according to the Unigram model. Since all tokens are considered independent, this probability is just the product of the probability of each token. For instance, the tokenization "),ht=o(f,"CODE",{"data-svelte-h":!0}),r(ht)!=="svelte-1n2m4po"&&(ht.textContent=qa),ha=ct(f," of "),Mt=o(f,"CODE",{"data-svelte-h":!0}),r(Mt)!=="svelte-1gjdq76"&&(Mt.textContent=xa),Ma=ct(f,` has the probability:
`),Xt=ja(f,!1),f.forEach(e),Wt=l(s),g=o(s,"P",{});var A=Ut(g);ua=ct(A,"Comparatively, the tokenization "),ut=o(A,"CODE",{"data-svelte-h":!0}),r(ut)!=="svelte-42m5r0"&&(ut.textContent=Aa),ya=ct(A,` has the probability:
`),Kt=ja(A,!1),A.forEach(e),Yt=l(s),O=o(s,"P",{"data-svelte-h":!0}),r(O)!=="svelte-g2doa3"&&(O.textContent=Za),Pt=l(s),ss=o(s,"P",{"data-svelte-h":!0}),r(ss)!=="svelte-9ovu4k"&&(ss.innerHTML=_a),Dt=l(s),c(ts.$$.fragment,s),Ft=l(s),es=o(s,"P",{"data-svelte-h":!0}),r(es)!=="svelte-by7atq"&&(es.innerHTML=Na),Lt=l(s),as=o(s,"P",{"data-svelte-h":!0}),r(as)!=="svelte-1re84pi"&&(as.innerHTML=za),Ot=l(s),ns=o(s,"P",{"data-svelte-h":!0}),r(ns)!=="svelte-4w9cp8"&&(ns.textContent=Va),se=l(s),ls=o(s,"P",{"data-svelte-h":!0}),r(ls)!=="svelte-1v3e60i"&&(ls.innerHTML=Qa),te=l(s),c(ps.$$.fragment,s),ee=l(s),os=o(s,"P",{"data-svelte-h":!0}),r(os)!=="svelte-17ww2xx"&&(os.innerHTML=Ba),ae=l(s),c(v.$$.fragment,s),ne=l(s),c(is.$$.fragment,s),le=l(s),rs=o(s,"P",{"data-svelte-h":!0}),r(rs)!=="svelte-1mxwvxi"&&(rs.textContent=Ga),pe=l(s),ms=o(s,"P",{"data-svelte-h":!0}),r(ms)!=="svelte-r1ko6w"&&(ms.innerHTML=Sa),oe=l(s),cs=o(s,"P",{"data-svelte-h":!0}),r(cs)!=="svelte-1y2guu9"&&(cs.textContent=Ra),ie=l(s),c(hs.$$.fragment,s),re=l(s),Ms=o(s,"P",{"data-svelte-h":!0}),r(Ms)!=="svelte-smnwx3"&&(Ms.textContent=Ha),me=l(s),c(us.$$.fragment,s),ce=l(s),ys=o(s,"P",{"data-svelte-h":!0}),r(ys)!=="svelte-14jcfwu"&&(ys.textContent=Ea),he=l(s),c(ws.$$.fragment,s),Me=l(s),js=o(s,"P",{"data-svelte-h":!0}),r(js)!=="svelte-1rsqmv"&&(js.innerHTML=Xa),ue=l(s),Us=o(s,"P",{"data-svelte-h":!0}),r(Us)!=="svelte-1e3tr4j"&&(Us.innerHTML=Wa),ye=l(s),c(ds.$$.fragment,s),we=l(s),Js=o(s,"P",{"data-svelte-h":!0}),r(Js)!=="svelte-15gzb9u"&&(Js.textContent=Ka),je=l(s),c(bs.$$.fragment,s),Ue=l(s),Ts=o(s,"P",{"data-svelte-h":!0}),r(Ts)!=="svelte-1gwabz2"&&(Ts.innerHTML=Ya),de=l(s),c(fs.$$.fragment,s),Je=l(s),gs=o(s,"P",{"data-svelte-h":!0}),r(gs)!=="svelte-1p0q4mo"&&(gs.textContent=Pa),be=l(s),Cs=o(s,"P",{"data-svelte-h":!0}),r(Cs)!=="svelte-160kau6"&&(Cs.textContent=Da),Te=l(s),c(Is.$$.fragment,s),fe=l(s),ks=o(s,"P",{"data-svelte-h":!0}),r(ks)!=="svelte-5hfan0"&&(ks.innerHTML=Fa),ge=l(s),c(vs.$$.fragment,s),Ce=l(s),$s=o(s,
"P",{"data-svelte-h":!0}),r($s)!=="svelte-1p0ikvu"&&($s.textContent=La),Ie=l(s),c(qs.$$.fragment,s),ke=l(s),xs=o(s,"P",{"data-svelte-h":!0}),r(xs)!=="svelte-1njxouy"&&(xs.textContent=Oa),ve=l(s),c(As.$$.fragment,s),$e=l(s),c(Zs.$$.fragment,s),qe=l(s),_s=o(s,"P",{"data-svelte-h":!0}),r(_s)!=="svelte-1rbeurr"&&(_s.textContent=sn),xe=l(s),c(Ns.$$.fragment,s),Ae=l(s),c($.$$.fragment,s),Ze=l(s),zs=o(s,"P",{"data-svelte-h":!0}),r(zs)!=="svelte-kpz2jw"&&(zs.textContent=tn),_e=l(s),c(Vs.$$.fragment,s),Ne=l(s),Qs=o(s,"P",{"data-svelte-h":!0}),r(Qs)!=="svelte-1vp6twv"&&(Qs.innerHTML=en),ze=l(s),Bs=o(s,"P",{"data-svelte-h":!0}),r(Bs)!=="svelte-8o0mkc"&&(Bs.innerHTML=an),Ve=l(s),Gs=o(s,"P",{"data-svelte-h":!0}),r(Gs)!=="svelte-76z975"&&(Gs.textContent=nn),Qe=l(s),c(Ss.$$.fragment,s),Be=l(s),Rs=o(s,"P",{"data-svelte-h":!0}),r(Rs)!=="svelte-1riuhoo"&&(Rs.textContent=ln),Ge=l(s),c(Hs.$$.fragment,s),Se=l(s),c(Es.$$.fragment,s),Re=l(s),Xs=o(s,"P",{"data-svelte-h":!0}),r(Xs)!=="svelte-ncpc2j"&&(Xs.textContent=pn),He=l(s),c(Ws.$$.fragment,s),Ee=l(s),Ks=o(s,"P",{"data-svelte-h":!0}),r(Ks)!=="svelte-p7t1po"&&(Ks.textContent=on),Xe=l(s),c(Ys.$$.fragment,s),We=l(s),c(Ps.$$.fragment,s),Ke=l(s),Ds=o(s,"P",{"data-svelte-h":!0}),r(Ds)!=="svelte-emtee"&&(Ds.textContent=rn),Ye=l(s),c(Fs.$$.fragment,s),Pe=l(s),Ls=o(s,"P",{"data-svelte-h":!0}),r(Ls)!=="svelte-qxfbyo"&&(Ls.textContent=mn),De=l(s),c(Os.$$.fragment,s),Fe=l(s),st=o(s,"P",{"data-svelte-h":!0}),r(st)!=="svelte-zu2ggv"&&(st.innerHTML=cn),Le=l(s),c(tt.$$.fragment,s),Oe=l(s),c(q.$$.fragment,s),sa=l(s),et=o(s,"P",{"data-svelte-h":!0}),r(et)!=="svelte-1vz3e3j"&&(et.textContent=hn),ta=l(s),c(at.$$.fragment,s),ea=l(s),nt=o(s,"P",{"data-svelte-h":!0}),r(nt)!=="svelte-m3sdwl"&&(nt.innerHTML=Mn),aa=l(s),c(lt.$$.fragment,s),na=l(s),c(pt.$$.fragment,s),la=l(s),c(x.$$.fragment,s),pa=l(s),ot=o(s,"P",{"data-svelte-h":!0}),r(ot)!=="svelte-1hk3m3g"&&(ot.textContent=un),oa=l(s),c(it.$$.fragment,s),ia=l(s),jt=o(s,"P",{}),Ut(jt).forEach(e),this.h()},h(){U
n(i,"name","hf:doc:metadata"),Un(i,"content",Vn),$t.a=qt,Xt.a=null,Kt.a=null},m(s,t){T(document.head,i),a(s,U,t),a(s,w,t),a(s,d,t),h(Z,s,t),a(s,Jt,t),h(_,s,t),a(s,bt,t),a(s,N,t),a(s,Tt,t),a(s,z,t),a(s,ft,t),h(V,s,t),a(s,gt,t),h(C,s,t),a(s,Ct,t),h(Q,s,t),a(s,It,t),a(s,B,t),a(s,kt,t),a(s,G,t),a(s,vt,t),a(s,I,t),T(I,ma),$t.m(dn,I),T(I,qt),a(s,xt,t),a(s,S,t),a(s,At,t),a(s,R,t),a(s,Zt,t),a(s,H,t),a(s,_t,t),h(E,s,t),a(s,Nt,t),a(s,X,t),a(s,zt,t),h(W,s,t),a(s,Vt,t),h(K,s,t),a(s,Qt,t),a(s,Y,t),a(s,Bt,t),a(s,P,t),a(s,Gt,t),a(s,D,t),a(s,St,t),h(F,s,t),a(s,Rt,t),a(s,L,t),a(s,Ht,t),h(k,s,t),a(s,Et,t),a(s,b,t),T(b,ca),T(b,ht),T(b,ha),T(b,Mt),T(b,Ma),Xt.m(Jn,b),a(s,Wt,t),a(s,g,t),T(g,ua),T(g,ut),T(g,ya),Kt.m(bn,g),a(s,Yt,t),a(s,O,t),a(s,Pt,t),a(s,ss,t),a(s,Dt,t),h(ts,s,t),a(s,Ft,t),a(s,es,t),a(s,Lt,t),a(s,as,t),a(s,Ot,t),a(s,ns,t),a(s,se,t),a(s,ls,t),a(s,te,t),h(ps,s,t),a(s,ee,t),a(s,os,t),a(s,ae,t),h(v,s,t),a(s,ne,t),h(is,s,t),a(s,le,t),a(s,rs,t),a(s,pe,t),a(s,ms,t),a(s,oe,t),a(s,cs,t),a(s,ie,t),h(hs,s,t),a(s,re,t),a(s,Ms,t),a(s,me,t),h(us,s,t),a(s,ce,t),a(s,ys,t),a(s,he,t),h(ws,s,t),a(s,Me,t),a(s,js,t),a(s,ue,t),a(s,Us,t),a(s,ye,t),h(ds,s,t),a(s,we,t),a(s,Js,t),a(s,je,t),h(bs,s,t),a(s,Ue,t),a(s,Ts,t),a(s,de,t),h(fs,s,t),a(s,Je,t),a(s,gs,t),a(s,be,t),a(s,Cs,t),a(s,Te,t),h(Is,s,t),a(s,fe,t),a(s,ks,t),a(s,ge,t),h(vs,s,t),a(s,Ce,t),a(s,$s,t),a(s,Ie,t),h(qs,s,t),a(s,ke,t),a(s,xs,t),a(s,ve,t),h(As,s,t),a(s,$e,t),h(Zs,s,t),a(s,qe,t),a(s,_s,t),a(s,xe,t),h(Ns,s,t),a(s,Ae,t),h($,s,t),a(s,Ze,t),a(s,zs,t),a(s,_e,t),h(Vs,s,t),a(s,Ne,t),a(s,Qs,t),a(s,ze,t),a(s,Bs,t),a(s,Ve,t),a(s,Gs,t),a(s,Qe,t),h(Ss,s,t),a(s,Be,t),a(s,Rs,t),a(s,Ge,t),h(Hs,s,t),a(s,Se,t),h(Es,s,t),a(s,Re,t),a(s,Xs,t),a(s,He,t),h(Ws,s,t),a(s,Ee,t),a(s,Ks,t),a(s,Xe,t),h(Ys,s,t),a(s,We,t),h(Ps,s,t),a(s,Ke,t),a(s,Ds,t),a(s,Ye,t),h(Fs,s,t),a(s,Pe,t),a(s,Ls,t),a(s,De,t),h(Os,s,t),a(s,Fe,t),a(s,st,t),a(s,Le,t),h(tt,s,t),a(s,Oe,t),h(q,s,t),a(s,sa,t),a(s,et,t),a(s,ta,t),h(at,s,t),a(s,ea,t),a(s,nt,t),a(s,aa,t),h(lt,s,t),a(s,na,t),h(pt,
s,t),a(s,la,t),h(x,s,t),a(s,pa,t),a(s,ot,t),a(s,oa,t),h(it,s,t),a(s,ia,t),a(s,jt,t),ra=!0},p(s,[t]){const rt={};t&2&&(rt.$$scope={dirty:t,ctx:s}),C.$set(rt);const f={};t&2&&(f.$$scope={dirty:t,ctx:s}),k.$set(f);const A={};t&2&&(A.$$scope={dirty:t,ctx:s}),v.$set(A);const yn={};t&2&&(yn.$$scope={dirty:t,ctx:s}),$.$set(yn);const wn={};t&2&&(wn.$$scope={dirty:t,ctx:s}),q.$set(wn);const jn={};t&2&&(jn.$$scope={dirty:t,ctx:s}),x.$set(jn)},i(s){ra||(M(Z.$$.fragment,s),M(_.$$.fragment,s),M(V.$$.fragment,s),M(C.$$.fragment,s),M(Q.$$.fragment,s),M(E.$$.fragment,s),M(W.$$.fragment,s),M(K.$$.fragment,s),M(F.$$.fragment,s),M(k.$$.fragment,s),M(ts.$$.fragment,s),M(ps.$$.fragment,s),M(v.$$.fragment,s),M(is.$$.fragment,s),M(hs.$$.fragment,s),M(us.$$.fragment,s),M(ws.$$.fragment,s),M(ds.$$.fragment,s),M(bs.$$.fragment,s),M(fs.$$.fragment,s),M(Is.$$.fragment,s),M(vs.$$.fragment,s),M(qs.$$.fragment,s),M(As.$$.fragment,s),M(Zs.$$.fragment,s),M(Ns.$$.fragment,s),M($.$$.fragment,s),M(Vs.$$.fragment,s),M(Ss.$$.fragment,s),M(Hs.$$.fragment,s),M(Es.$$.fragment,s),M(Ws.$$.fragment,s),M(Ys.$$.fragment,s),M(Ps.$$.fragment,s),M(Fs.$$.fragment,s),M(Os.$$.fragment,s),M(tt.$$.fragment,s),M(q.$$.fragment,s),M(at.$$.fragment,s),M(lt.$$.fragment,s),M(pt.$$.fragment,s),M(x.$$.fragment,s),M(it.$$.fragment,s),ra=!0)},o(s){u(Z.$$.fragment,s),u(_.$$.fragment,s),u(V.$$.fragment,s),u(C.$$.fragment,s),u(Q.$$.fragment,s),u(E.$$.fragment,s),u(W.$$.fragment,s),u(K.$$.fragment,s),u(F.$$.fragment,s),u(k.$$.fragment,s),u(ts.$$.fragment,s),u(ps.$$.fragment,s),u(v.$$.fragment,s),u(is.$$.fragment,s),u(hs.$$.fragment,s),u(us.$$.fragment,s),u(ws.$$.fragment,s),u(ds.$$.fragment,s),u(bs.$$.fragment,s),u(fs.$$.fragment,s),u(Is.$$.fragment,s),u(vs.$$.fragment,s),u(qs.$$.fragment,s),u(As.$$.fragment,s),u(Zs.$$.fragment,s),u(Ns.$$.fragment,s),u($.$$.fragment,s),u(Vs.$$.fragment,s),u(Ss.$$.fragment,s),u(Hs.$$.fragment,s),u(Es.$$.fragment,s),u(Ws.$$.fragment,s),u(Ys.$$.fragment,s),u(Ps.$$.fragment,s),u(Fs.$$.fragment,s),u(Os.$
$.fragment,s),u(tt.$$.fragment,s),u(q.$$.fragment,s),u(at.$$.fragment,s),u(lt.$$.fragment,s),u(pt.$$.fragment,s),u(x.$$.fragment,s),u(it.$$.fragment,s),ra=!1},d(s){s&&(e(U),e(w),e(d),e(Jt),e(bt),e(N),e(Tt),e(z),e(ft),e(gt),e(Ct),e(It),e(B),e(kt),e(G),e(vt),e(I),e(xt),e(S),e(At),e(R),e(Zt),e(H),e(_t),e(Nt),e(X),e(zt),e(Vt),e(Qt),e(Y),e(Bt),e(P),e(Gt),e(D),e(St),e(Rt),e(L),e(Ht),e(Et),e(b),e(Wt),e(g),e(Yt),e(O),e(Pt),e(ss),e(Dt),e(Ft),e(es),e(Lt),e(as),e(Ot),e(ns),e(se),e(ls),e(te),e(ee),e(os),e(ae),e(ne),e(le),e(rs),e(pe),e(ms),e(oe),e(cs),e(ie),e(re),e(Ms),e(me),e(ce),e(ys),e(he),e(Me),e(js),e(ue),e(Us),e(ye),e(we),e(Js),e(je),e(Ue),e(Ts),e(de),e(Je),e(gs),e(be),e(Cs),e(Te),e(fe),e(ks),e(ge),e(Ce),e($s),e(Ie),e(ke),e(xs),e(ve),e($e),e(qe),e(_s),e(xe),e(Ae),e(Ze),e(zs),e(_e),e(Ne),e(Qs),e(ze),e(Bs),e(Ve),e(Gs),e(Qe),e(Be),e(Rs),e(Ge),e(Se),e(Re),e(Xs),e(He),e(Ee),e(Ks),e(Xe),e(We),e(Ke),e(Ds),e(Ye),e(Pe),e(Ls),e(De),e(Fe),e(st),e(Le),e(Oe),e(sa),e(et),e(ta),e(ea),e(nt),e(aa),e(na),e(la),e(pa),e(ot),e(oa),e(ia),e(jt)),e(i),y(Z,s),y(_,s),y(V,s),y(C,s),y(Q,s),y(E,s),y(W,s),y(K,s),y(F,s),y(k,s),y(ts,s),y(ps,s),y(v,s),y(is,s),y(hs,s),y(us,s),y(ws,s),y(ds,s),y(bs,s),y(fs,s),y(Is,s),y(vs,s),y(qs,s),y(As,s),y(Zs,s),y(Ns,s),y($,s),y(Vs,s),y(Ss,s),y(Hs,s),y(Es,s),y(Ws,s),y(Ys,s),y(Ps,s),y(Fs,s),y(Os,s),y(tt,s),y(q,s),y(at,s),y(lt,s),y(pt,s),y(x,s),y(it,s)}}}/*
 * End of the page fragment factory. The d(s) method above calls e(...) on the
 * fragment's top-level nodes only when `s` is truthy, always calls e(i) on the
 * META element appended to document.head, and calls y(component, s) for every
 * child component. NOTE(review): e/u/y are minified imports from
 * ../chunks/index.7cb9c9b8.js; they are presumably Svelte's detach,
 * transition_out and destroy_component helpers - confirm against that chunk.
 *
 * Vn: JSON-encoded page metadata (title, anchor id and the depth-2 section
 * list). It is passed to Un(i,"content",Vn) in the fragment's h() method, where
 * `i` is the META element whose name is set to "hf:doc:metadata".
 */const Vn='{"title":"Unigram tokenization","local":"unigram-tokenization","sections":[{"title":"Training algorithm","local":"training-algorithm","sections":[],"depth":2},{"title":"Tokenization algorithm","local":"tokenization-algorithm","sections":[],"depth":2},{"title":"Back to training","local":"back-to-training","sections":[],"depth":2},{"title":"Implementing Unigram","local":"implementing-unigram","sections":[],"depth":2}],"depth":1}';/*
 * Qn(J): instance function handed to Cn by the Wn constructor. It registers a
 * callback through fn (imported as `o` from ../chunks/scheduler.37c15a92.js;
 * presumably Svelte's onMount - confirm) and returns an empty array. The
 * parameter J is unused.
 * NOTE(review): the callback reads the "fw" query parameter from
 * window.location.search but discards the value, so it has no observable
 * effect here; it looks like a leftover of a framework-selection variable that
 * the compiler optimised away.
 */function Qn(J){return fn(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}/*
 * Wn: the page component class. It extends gn (imported as `S` from the index
 * chunk) and, after super(), calls Cn(this,i,Qn,zn,Tn,{}) with the constructor
 * argument `i`, the instance function Qn, zn (defined outside this excerpt;
 * presumably the fragment factory that ends above), Tn from the scheduler
 * chunk, and an empty object.
 */class Wn extends gn{constructor(i){super(),Cn(this,i,Qn,zn,Tn,{})}}/* The module's only export: Wn, under the name `component`. */export{Wn as component};

Xet Storage Details

Size:
79.1 kB
·
Xet hash:
fdc39ee08a210ad87388f3c7ff8f87e8835c254b9f4fc86a477fa89d48ff6fbf

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.