Buckets:

rtrm's picture
download
raw
79.3 kB
import{s as jl,o as dl,n as Ua}from"../chunks/scheduler.37c15a92.js";import{S as Ul,i as hl,g as p,s as l,r as m,m as ra,H as ut,A as Jl,h as i,f as e,c as n,j as ya,u as o,x as r,n as ca,E as Mt,k as ol,y as T,a as t,v as u,d as M,t as y,w as j}from"../chunks/index.2bf4358c.js";import{T as ja}from"../chunks/Tip.363c041f.js";import{Y as wl}from"../chunks/Youtube.1e50a667.js";import{C as U}from"../chunks/CodeBlock.4e987730.js";import{C as bl}from"../chunks/CourseFloatingBanner.6add7356.js";import{H as da,E as Tl}from"../chunks/getInferenceSnippets.ebf8be91.js";function fl(w){let c,h="💡 Această secțiune acoperă Unigram în profunzime, mergând până la prezentarea unei implementări complete. Puteți sări la sfârșit dacă doriți doar o prezentare generală a algoritmului de tokenizare.";return{c(){c=p("p"),c.textContent=h},l(d){c=i(d,"P",{"data-svelte-h":!0}),r(c)!=="svelte-1cbwd1p"&&(c.textContent=h)},m(d,J){t(d,c,J)},p:Ua,d(d){d&&e(c)}}}function gl(w){let c,h="✏️ <strong>Acum este rândul tău!</strong> Scrie codul pentru a calcula frecvențele de mai sus și verifică de două ori dacă rezultatele afișate sunt corecte, precum și suma totală.";return{c(){c=p("p"),c.innerHTML=h},l(d){c=i(d,"P",{"data-svelte-h":!0}),r(c)!=="svelte-1owpu8w"&&(c.innerHTML=h)},m(d,J){t(d,c,J)},p:Ua,d(d){d&&e(c)}}}function Cl(w){let c,h="✏️ <strong>Acum e rândul tău!</strong> Determinați tokenizarea cuvântului <code>&quot;huggun&quot;</code> și scorul acestuia.";return{c(){c=p("p"),c.innerHTML=h},l(d){c=i(d,"P",{"data-svelte-h":!0}),r(c)!=="svelte-if10wz"&&(c.innerHTML=h)},m(d,J){t(d,c,J)},p:Ua,d(d){d&&e(c)}}}function Il(w){let c,h="💡 SentencePiece utilizează un algoritm mai eficient numit Enhanced Suffix Array (ESA) pentru a crea vocabularul inițial.";return{c(){c=p("p"),c.textContent=h},l(d){c=i(d,"P",{"data-svelte-h":!0}),r(c)!=="svelte-7v30c1"&&(c.textContent=h)},m(d,J){t(d,c,J)},p:Ua,d(d){d&&e(c)}}}function vl(w){let c,h="💡 Această abordare este foarte ineficientă, astfel încât SentencePiece utilizează o aproximare a pierderii modelului fără simbolul X: în loc să înceapă de la zero, înlocuiește simbolul X cu segmentarea sa în vocabularul rămas. În acest fel, toate scorurile pot fi calculate odată, în același timp cu pierderea modelului.";return{c(){c=p("p"),c.textContent=h},l(d){c=i(d,"P",{"data-svelte-h":!0}),r(c)!=="svelte-1qowk2a"&&(c.textContent=h)},m(d,J){t(d,c,J)},p:Ua,d(d){d&&e(c)}}}function kl(w){let c,h,d,J,A,ha,z,Ja,Z,yt="Algoritmul Unigram este adesea utilizat în SentencePiece, care este algoritmul de tokenizare utilizat de modele precum AlBERT, T5, mBART, Big Bird și XLNet.",wa,_,ba,C,Ta,V,fa,N,jt="În comparație cu BPE și WordPiece, Unigram lucrează în cealaltă direcție: pornește de la un vocabular mare și elimină tokeni din acesta până când ajunge la dimensiunea dorită. Există mai multe opțiuni pentru a construi acel vocabular de bază: putem lua, de exemplu, cele mai comune substrings din cuvintele pre-tokenizate sau putem aplica BPE pe corpusul inițial cu o dimensiune mare a vocabularului.",ga,Q,dt="La fiecare etapă a antrenării, algoritmul Unigram calculează o pierdere pe corpus oferit, având în vedere vocabularul curent. Apoi, pentru fiecare simbol din vocabular, algoritmul calculează cu cât ar crește pierderea globală dacă simbolul ar fi eliminat și caută simbolurile care ar crește cel mai puțin pierderea. Aceste simboluri au cel mai redus efect asupra pierderii globale din corpus, deci, într-un fel, sunt “mai puțin necesare” și sunt cei mai buni candidați pentru eliminare.",Ca,I,pt,Ia,ul='<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>p</mi></mrow><annotation encoding="application/x-tex">p</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">p</span></span></span></span>',va,ka,B,Ut="Rețineți că nu eliminăm niciodată caracterele de bază, pentru a ne asigura că orice cuvânt poate fi tokenizat.",$a,G,ht="Acum, acest lucru este încă puțin vag: partea principală a algoritmului este de a calcula o pierdere asupra corpusului și de a vedea cum se schimbă atunci când eliminăm unele tokenuri din vocabular, dar nu am explicat încă cum să facem acest lucru. Acest pas se bazează pe algoritmul de tokenizare al unui model Unigram, așa că îl vom analiza în continuare.",qa,S,Jt="Vom reutiliza corpusul din exemplele anterioare:",xa,R,Aa,E,wt="iar pentru acest exemplu, vom lua toate substringurile stricte pentru vocabularul inițial:",za,H,Za,X,_a,W,bt="Un model Unigram este un tip de model lingvistic care consideră că fiecare token este independent de tokenii anteriori. Este cel mai simplu model lingvistic, în sensul că probabilitatea simbolului X având în vedere contextul anterior este doar probabilitatea simbolului X. Astfel, dacă am utiliza un model lingvistic Unigram pentru a genera text, am prezice întotdeauna simbolul cel mai frecvent.",Va,D,Tt="Probabilitatea unui token dat este frecvența sa (numărul de ori în care îl găsim) în corpusul original, împărțită la suma tuturor aparițiilor tuturor tokenilor din vocabular (pentru a ne asigura că probabilitățile sunt egale cu 1). De exemplu, <code>&quot;ug&quot;</code> este prezent în <code>&quot;hug&quot;</code>, <code>&quot;pug&quot;</code>, și <code>&quot;hugs&quot;</code>, deci are o frecvență de 20 în corpusul nostru.",Na,K,ft="Iată frecvențele tuturor subcuvintelor posibile din vocabular:",Qa,Y,Ba,P,gt="Astfel, suma tuturor frecvențelor este 210, iar probabilitatea subcuvântului <code>&quot;ug&quot;</code> este 20/210.",Ga,v,Sa,b,it,ma,Ct='["p", "u", "g"]',rt,oa,It='"pug"',ct,Ra,Ml='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>P</mi><mo stretchy="false">(</mo><mo stretchy="false">[</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>p</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">]</mo><mo stretchy="false">)</mo><mo>=</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>p</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>=</mo><mfrac><mn>5</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>36</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>20</mn><mn>210</mn></mfrac><mo>=</mo><mn>0.000389</mn></mrow><annotation encoding="application/x-tex">P([``p&quot;, ``u&quot;, ``g&quot;]) = P(``p&quot;) \\times P(``u&quot;) \\times P(``g&quot;) = \\frac{5}{210} \\times \\frac{36}{210} \\times \\frac{20}{210} = 0.000389</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">([</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">])</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">5</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">36</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">20</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0.000389</span></span></span></span></span>',Ea,g,mt,ua,vt='["pu", "g"]',ot,Ha,yl='<span class="katex-display"><span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><semantics><mrow><mi>P</mi><mo stretchy="false">(</mo><mo stretchy="false">[</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>p</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo separator="true">,</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">]</mo><mo stretchy="false">)</mo><mo>=</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>p</mi><mi>u</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>×</mo><mi>P</mi><mo stretchy="false">(</mo><mi mathvariant="normal">‘</mi><mi mathvariant="normal">‘</mi><mi>g</mi><mi mathvariant="normal">&quot;</mi><mo stretchy="false">)</mo><mo>=</mo><mfrac><mn>5</mn><mn>210</mn></mfrac><mo>×</mo><mfrac><mn>20</mn><mn>210</mn></mfrac><mo>=</mo><mn>0.0022676</mn></mrow><annotation encoding="application/x-tex">P([``pu&quot;, ``g&quot;]) = P(``pu&quot;) \\times P(``g&quot;) = \\frac{5}{210} \\times \\frac{20}{210} = 0.0022676</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">([</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">])</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal">p</span><span class="mord mathnormal">u</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:1em;vertical-align:-0.25em;"></span><span class="mord mathnormal" style="margin-right:0.13889em;">P</span><span class="mopen">(</span><span class="mord">‘‘</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord">&quot;</span><span class="mclose">)</span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">5</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2222em;"></span><span class="mbin">×</span><span class="mspace" style="margin-right:0.2222em;"></span></span><span class="base"><span class="strut" style="height:2.0074em;vertical-align:-0.686em;"></span><span class="mord"><span class="mopen nulldelimiter"></span><span class="mfrac"><span class="vlist-t vlist-t2"><span class="vlist-r"><span class="vlist" style="height:1.3214em;"><span style="top:-2.314em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">210</span></span></span><span style="top:-3.23em;"><span class="pstrut" style="height:3em;"></span><span class="frac-line" style="border-bottom-width:0.04em;"></span></span><span style="top:-3.677em;"><span class="pstrut" style="height:3em;"></span><span class="mord"><span class="mord">20</span></span></span></span><span class="vlist-s">​</span></span><span class="vlist-r"><span class="vlist" style="height:0.686em;"><span></span></span></span></span></span><span class="mclose nulldelimiter"></span></span><span class="mspace" style="margin-right:0.2778em;"></span><span class="mrel">=</span><span class="mspace" style="margin-right:0.2778em;"></span></span><span class="base"><span class="strut" style="height:0.6444em;"></span><span class="mord">0.0022676</span></span></span></span></span>',Xa,F,kt="astfel încât una este mult mai probabilă decât alta. În general, tokenizările cu cei mai puțini tokeni posibili vor avea cea mai mare probabilitate (din cauza acelei împărțiri la 210 repetată pentru fiecare token), ceea ce corespunde cu ceea ce dorim intuitiv: să împărțim un cuvânt în cel mai mic număr de tokenuri posibil.",Wa,L,$t="Tokenizarea unui cuvânt cu modelul Unigram este atunci tokenizarea cu cea mai mare probabilitate. În exemplul <code>&quot;pug&quot;</code>, iată probabilitățile pe care le-am obține pentru fiecare segmentare posibilă:",Da,O,Ka,ss,qt="Astfel, <code>&quot;pug&quot;</code> ar fi tokenizat ca <code>[&quot;p&quot;, &quot;ug&quot;]</code> sau <code>[&quot;pu&quot;, &quot;g&quot;]</code>, în funcție de care dintre aceste segmentări este întâlnită prima (rețineți că într-un corpus mai mare, cazurile de egalitate ca acesta vor fi rare).",Ya,as,xt="În acest caz, a fost ușor să găsim toate segmentările posibile și să le calculăm probabilitățile, dar în general va fi puțin mai greu. Există un algoritm clasic utilizat pentru acest lucru, numit <em>algoritmul Viterbi</em>. În esență, putem construi un grafic pentru a detecta segmentările posibile ale unui cuvânt dat, spunând că există o ramură de la caracterul <em>a</em> la caracterul <em>b</em> dacă subcuvântul de la <em>a</em> la <em>b</em> se află în vocabular, și atribuind ramurii respective probabilitatea subcuvântului.",Pa,es,At="Pentru a găsi calea din acest grafic care va avea cel mai bun scor, algoritmul Viterbi determină, pentru fiecare poziție din cuvânt, segmentarea cu cel mai bun scor care se termină la poziția respectivă. Deoarece mergem de la început la sfârșit, cel mai bun scor poate fi găsit prin parcurgerea în buclă a tuturor subcuvintelor care se termină la poziția curentă și apoi folosind cel mai bun scor de tokenizare de la poziția la care începe acest subcuvânt. Apoi, trebuie doar să derulăm calea parcursă pentru a ajunge la sfârșit.",Fa,ts,zt="Să aruncăm o privire la un exemplu folosind vocabularul nostru și cuvântul <code>&quot;unhug&quot;</code>. Pentru fiecare poziție, subcuvintele cu cele mai bune scoruri care se termină acolo sunt următoarele:",La,ls,Oa,ns,Zt="Astfel, <code>&quot;unhug&quot;</code> ar fi tokenizat ca <code>[&quot;un&quot;, &quot;hug&quot;]</code>.",se,k,ae,ps,ee,is,_t="Acum că am văzut cum funcționează tokenizarea, putem analiza mai în profunzime pierderea utilizată în timpul antrenării. În orice etapă dată, această pierdere este calculată prin tokenizarea fiecărui cuvânt din corpus, utilizând vocabularul curent și modelul Unigram determinat de frecvențele fiecărui token din corpus (după cum am văzut mai devreme).",te,rs,Vt="Fiecare cuvânt din corpus are un scor, iar pierderea este negative log likelihood a acestor scoruri - adică suma pentru toate cuvintele din corpus a tuturor <code>-log(P(word))</code>.",le,cs,Nt="Să ne întoarcem la exemplul nostru cu următorul corpus:",ne,ms,pe,os,Qt="Tokenizarea fiecărui cuvânt cu scorurile lor respective este:",ie,us,re,Ms,Bt="Deci, pierderea este:",ce,ys,me,js,Gt="Acum trebuie să calculăm modul în care eliminarea fiecărui token afectează pierderea. Acest lucru este destul de plictisitor, așa că îl vom face doar pentru doi tokeni aici și vom păstra întregul proces pentru atunci când vom avea cod care să ne ajute. În acest caz (foarte) special, aveam două tokenizări echivalente ale tuturor cuvintelor: după cum am văzut mai devreme, de exemplu, <code>&quot;pug&quot;</code> ar putea fi tokenizat <code>[&quot;p&quot;, &quot;ug&quot;]</code> cu același scor. Astfel, eliminarea simbolului <code>&quot;pu&quot;</code> din vocabular va produce exact aceeași pierdere.",oe,ds,St="Pe de altă parte, eliminarea lui <code>&quot;hug&quot;</code> va agrava pierderea, deoarece tokenizarea lui <code>&quot;hug&quot;</code> și <code>&quot;hugs&quot;</code> va deveni:",ue,Us,Me,hs,Rt="Aceste modificări vor determina creșterea pierderii cu:",ye,Js,je,ws,Et="Prin urmare, tokenul <code>&quot;pu&quot;</code> va fi probabil eliminat din vocabular, dar nu și <code>&quot;hug&quot;</code>.",de,bs,Ue,Ts,Ht="Acum să implementăm în cod tot ceea ce am văzut până acum. Ca și în cazul BPE și WordPiece, aceasta nu este o implementare eficientă a algoritmului Unigram (dimpotrivă), dar ar trebui să vă ajute să-l înțelegeți puțin mai bine.",he,fs,Xt="Vom folosi ca exemplu același corpus ca și până acum:",Je,gs,we,Cs,Wt="De data aceasta, vom folosi <code>xlnet-base-cased</code> ca modelul nostru:",be,Is,Te,vs,Dt="Ca și pentru BPE și WordPiece, începem prin a număra numărul de apariții ale fiecărui cuvânt în corpus:",fe,ks,ge,$s,Kt="Apoi, trebuie să inițializăm vocabularul nostru la ceva mai mare decât dimensiunea vocabularului pe care o vom dori la final. Trebuie să includem toate caracterele de bază (altfel nu vom putea tokeniza fiecare cuvânt), dar pentru substringurile mai mari le vom păstra doar pe cele mai comune, așa că le vom sorta după frecvență:",Ce,qs,Ie,xs,ve,As,Yt="Grupăm caracterele cu cele mai bune subcuvinte pentru a ajunge la un vocabular inițial de dimensiunea 300:",ke,zs,$e,$,qe,Zs,Pt="În continuare, calculăm suma tuturor frecvențelor, pentru a converti frecvențele în probabilități. Pentru modelul nostru, vom stoca logaritmii probabilităților, deoarece este mai stabil din punct de vedere numeric să adăugăm logaritmi decât să multiplicăm numere mici, iar acest lucru va simplifica calcularea pierderii modelului:",xe,_s,Ae,Vs,Ft="Acum funcția principală este cea care tokenizează cuvintele folosind algoritmul Viterbi. După cum am văzut mai devreme, acest algoritm calculează cea mai bună segmentare a fiecărui substring din cuvânt, pe care o vom stoca într-o variabilă numită <code>best_segmentations</code>. Vom stoca un dicționar pentru fiecare poziție din cuvânt (de la 0 la lungimea totală a acestuia), cu două chei: indicele de început al ultimului token din cea mai bună segmentare și scorul celei mai bune segmentări. Cu ajutorul indicelui de început al ultimului token, vom putea extrage segmentarea completă odată ce lista este complet populată.",ze,Ns,Lt="Popularea listei se face cu doar două bucle: bucla principală trece peste fiecare poziție de început, iar a doua buclă încearcă toate subcuvintele care încep la acea poziție de început. Dacă substringul se află în vocabular, avem o nouă segmentare a cuvântului până la acea poziție finală, pe care o comparăm cu cea din <code>best_segmentations</code>.",Ze,Qs,Ot="Odată ce bucla principală este terminată, pornim de la sfârșit și sărim de la o poziție de început la alta, înregistrând tokenii pe parcurs, până când ajungem la începutul cuvântului:",_e,Bs,Ve,Gs,sl="Putem încerca deja modelul nostru inițial pe câteva cuvinte:",Ne,Ss,Qe,Rs,Be,Es,al="Acum este ușor de calculat pierderea modelului pe corpus!",Ge,Hs,Se,Xs,el="Putem verifica dacă funcționează pe modelul pe care îl avem:",Re,Ws,Ee,Ds,He,Ks,tl="Nici calcularea scorurilor pentru fiecare token nu este foarte dificilă; trebuie doar să calculăm pierderea pentru modelele obținute prin ștergerea fiecărui tokeb:",Xe,Ys,We,Ps,ll="Îl putem încerca pe un token dat:",De,Fs,Ke,Ls,nl="Deoarece <code>&quot;ll&quot;</code> este folosit în tokenizarea lui <code>&quot;Hopefully&quot;</code>, iar eliminarea lui ne va face, probabil, să folosim tokenul <code>&quot;l&quot;</code> de două ori în schimb, ne așteptăm să aibă o pierdere pozitivă. <code>&quot;his&quot;</code> este folosit doar în interiorul cuvântului <code>&quot;This&quot;</code>, care este tokenizat ca el însuși, deci ne așteptăm să aibă o pierdere zero. Iată rezultatele:",Ye,Os,Pe,q,Fe,sa,pl="Cu toate acestea la locul lor, ultimul lucru pe care trebuie să îl facem este să adăugăm la vocabular tokeni speciali utilizate de model, apoi să facem o buclă până când am eliminat suficienți tokeni din vocabular pentru a ajunge la dimensiunea dorită:",Le,aa,Oe,ea,il="Apoi, pentru a tokeniza un text, trebuie doar să aplicăm pre-tokenizarea și apoi să folosim funcția <code>encode_word()</code>:",st,ta,at,la,et,na,rl="Asta e tot pentru Unigram! Sperăm că până acum vă simțiți ca un expert în toate lucrurile legate de tokenizer. În secțiunea următoare, vom aprofunda elementele de bază ale bibliotecii 🤗 Tokenizers și vă vom arăta cum le puteți utiliza pentru a vă construi propriul tokenizer.",tt,pa,lt,Ma,nt;return A=new da({props:{title:"Tokenizarea Unigram",local:"unigram-tokenization",headingTag:"h1"}}),z=new bl({props:{chapter:6,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section7.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section7.ipynb"}]}}),_=new wl({props:{id:"TGZfZVuF9Yc"}}),C=new ja({props:{$$slots:{default:[fl]},$$scope:{ctx:w}}}),V=new da({props:{title:"Algoritm de antrenare",local:"training-algorithm",headingTag:"h2"}}),R=new U({props:{code:"KCUyMmh1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwdWclMjIlMkMlMjA1KSUyQyUyMCglMjJwdW4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYnVuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHVncyUyMiUyQyUyMDUp",highlighted:'(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)',wrap:!1}}),H=new U({props:{code:"JTVCJTIyaCUyMiUyQyUyMCUyMnUlMjIlMkMlMjAlMjJnJTIyJTJDJTIwJTIyaHUlMjIlMkMlMjAlMjJ1ZyUyMiUyQyUyMCUyMnAlMjIlMkMlMjAlMjJwdSUyMiUyQyUyMCUyMm4lMjIlMkMlMjAlMjJ1biUyMiUyQyUyMCUyMmIlMjIlMkMlMjAlMjJidSUyMiUyQyUyMCUyMnMlMjIlMkMlMjAlMjJodWclMjIlMkMlMjAlMjJncyUyMiUyQyUyMCUyMnVncyUyMiU1RA==",highlighted:'<span class="hljs-selector-attr">[<span class="hljs-string">&quot;h&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>, <span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>, <span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>, <span class="hljs-string">&quot;un&quot;</span>, <span class="hljs-string">&quot;b&quot;</span>, <span class="hljs-string">&quot;bu&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>, <span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-string">&quot;gs&quot;</span>, <span class="hljs-string">&quot;ugs&quot;</span>]</span>',wrap:!1}}),X=new da({props:{title:"Algoritm de tokenizare",local:"tokenization-algorithm",headingTag:"h2"}}),Y=new U({props:{code:"KCUyMmglMjIlMkMlMjAxNSklMjAoJTIydSUyMiUyQyUyMDM2KSUyMCglMjJnJTIyJTJDJTIwMjApJTIwKCUyMmh1JTIyJTJDJTIwMTUpJTIwKCUyMnVnJTIyJTJDJTIwMjApJTIwKCUyMnAlMjIlMkMlMjAxNyklMjAoJTIycHUlMjIlMkMlMjAxNyklMjAoJTIybiUyMiUyQyUyMDE2KSUwQSglMjJ1biUyMiUyQyUyMDE2KSUyMCglMjJiJTIyJTJDJTIwNCklMjAoJTIyYnUlMjIlMkMlMjA0KSUyMCglMjJzJTIyJTJDJTIwNSklMjAoJTIyaHVnJTIyJTJDJTIwMTUpJTIwKCUyMmdzJTIyJTJDJTIwNSklMjAoJTIydWdzJTIyJTJDJTIwNSk=",highlighted:`(<span class="hljs-string">&quot;h&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;u&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">36</span>) (<span class="hljs-string">&quot;g&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">20</span>) (<span class="hljs-string">&quot;hu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;ug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">20</span>) (<span class="hljs-string">&quot;p&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">17</span>) (<span class="hljs-string">&quot;pu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">17</span>) (<span class="hljs-string">&quot;n&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">16</span>)
(<span class="hljs-string">&quot;un&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">16</span>) (<span class="hljs-string">&quot;b&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>) (<span class="hljs-string">&quot;bu&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>) (<span class="hljs-string">&quot;s&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>) (<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">15</span>) (<span class="hljs-string">&quot;gs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>) (<span class="hljs-string">&quot;ugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)`,wrap:!1}}),v=new ja({props:{$$slots:{default:[gl]},$$scope:{ctx:w}}}),O=new U({props:{code:"JTVCJTIycCUyMiUyQyUyMCUyMnUlMjIlMkMlMjAlMjJnJTIyJTVEJTIwJTNBJTIwMC4wMDAzODklMEElNUIlMjJwJTIyJTJDJTIwJTIydWclMjIlNUQlMjAlM0ElMjAwLjAwMjI2NzYlMEElNUIlMjJwdSUyMiUyQyUyMCUyMmclMjIlNUQlMjAlM0ElMjAwLjAwMjI2NzY=",highlighted:`[<span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;u&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] : 0.000389
[<span class="hljs-string">&quot;p&quot;</span>, <span class="hljs-string">&quot;ug&quot;</span>] : 0.0022676
[<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] : 0.0022676`,wrap:!1}}),ls=new U({props:{code:"Q2hhcmFjdGVyJTIwMCUyMCh1KSUzQSUyMCUyMnUlMjIlMjAoc2NvcmUlMjAwLjE3MTQyOSklMEFDaGFyYWN0ZXIlMjAxJTIwKG4pJTNBJTIwJTIydW4lMjIlMjAoc2NvcmUlMjAwLjA3NjE5MSklMEFDaGFyYWN0ZXIlMjAyJTIwKGgpJTNBJTIwJTIydW4lMjIlMjAlMjJoJTIyJTIwKHNjb3JlJTIwMC4wMDU0NDIpJTBBQ2hhcmFjdGVyJTIwMyUyMCh1KSUzQSUyMCUyMnVuJTIyJTIwJTIyaHUlMjIlMjAoc2NvcmUlMjAwLjAwNTQ0MiklMEFDaGFyYWN0ZXIlMjA0JTIwKGcpJTNBJTIwJTIydW4lMjIlMjAlMjJodWclMjIlMjAoc2NvcmUlMjAwLjAwNTQ0Mik=",highlighted:`<span class="hljs-attribute">Character</span> <span class="hljs-number">0</span> (u): <span class="hljs-string">&quot;u&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">171429</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">1</span> (n): <span class="hljs-string">&quot;un&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">076191</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">2</span> (h): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;h&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">3</span> (u): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;hu&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)
<span class="hljs-attribute">Character</span> <span class="hljs-number">4</span> (g): <span class="hljs-string">&quot;un&quot;</span> <span class="hljs-string">&quot;hug&quot;</span> (score <span class="hljs-number">0</span>.<span class="hljs-number">005442</span>)`,wrap:!1}}),k=new ja({props:{$$slots:{default:[Cl]},$$scope:{ctx:w}}}),ps=new da({props:{title:"Înapoi la antrenare",local:"back-to-training",headingTag:"h2"}}),ms=new U({props:{code:"KCUyMmh1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwdWclMjIlMkMlMjA1KSUyQyUyMCglMjJwdW4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYnVuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHVncyUyMiUyQyUyMDUp",highlighted:'(<span class="hljs-string">&quot;hug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pug&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;pun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;bun&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">&quot;hugs&quot;</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)',wrap:!1}}),us=new U({props:{code:"JTIyaHVnJTIyJTNBJTIwJTVCJTIyaHVnJTIyJTVEJTIwKHNjb3JlJTIwMC4wNzE0MjgpJTBBJTIycHVnJTIyJTNBJTIwJTVCJTIycHUlMjIlMkMlMjAlMjJnJTIyJTVEJTIwKHNjb3JlJTIwMC4wMDc3MTApJTBBJTIycHVuJTIyJTNBJTIwJTVCJTIycHUlMjIlMkMlMjAlMjJuJTIyJTVEJTIwKHNjb3JlJTIwMC4wMDYxNjgpJTBBJTIyYnVuJTIyJTNBJTIwJTVCJTIyYnUlMjIlMkMlMjAlMjJuJTIyJTVEJTIwKHNjb3JlJTIwMC4wMDE0NTEpJTBBJTIyaHVncyUyMiUzQSUyMCU1QiUyMmh1ZyUyMiUyQyUyMCUyMnMlMjIlNUQlMjAoc2NvcmUlMjAwLjAwMTcwMSk=",highlighted:`<span class="hljs-string">&quot;hug&quot;</span>: [<span class="hljs-string">&quot;hug&quot;</span>] <span class="hljs-comment">(score 0.071428)</span>
<span class="hljs-string">&quot;pug&quot;</span>: [<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] <span class="hljs-comment">(score 0.007710)</span>
<span class="hljs-string">&quot;pun&quot;</span>: [<span class="hljs-string">&quot;pu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>] <span class="hljs-comment">(score 0.006168)</span>
<span class="hljs-string">&quot;bun&quot;</span>: [<span class="hljs-string">&quot;bu&quot;</span>, <span class="hljs-string">&quot;n&quot;</span>] <span class="hljs-comment">(score 0.001451)</span>
<span class="hljs-string">&quot;hugs&quot;</span>: [<span class="hljs-string">&quot;hug&quot;</span>, <span class="hljs-string">&quot;s&quot;</span>] <span class="hljs-comment">(score 0.001701)</span>`,wrap:!1}}),ys=new U({props:{code:"MTAlMjAqJTIwKC1sb2coMC4wNzE0MjgpKSUyMCUyQiUyMDUlMjAqJTIwKC1sb2coMC4wMDc3MTApKSUyMCUyQiUyMDEyJTIwKiUyMCgtbG9nKDAuMDA2MTY4KSklMjAlMkIlMjA0JTIwKiUyMCgtbG9nKDAuMDAxNDUxKSklMjAlMkIlMjA1JTIwKiUyMCgtbG9nKDAuMDAxNzAxKSklMjAlM0QlMjAxNjkuOA==",highlighted:'<span class="hljs-attribute">10</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">071428</span>)) + <span class="hljs-number">5</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">007710</span>)) + <span class="hljs-number">12</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">006168</span>)) + <span class="hljs-number">4</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">001451</span>)) + <span class="hljs-number">5</span> * (-log(<span class="hljs-number">0</span>.<span class="hljs-number">001701</span>)) = <span class="hljs-number">169</span>.<span class="hljs-number">8</span>',wrap:!1}}),Us=new U({props:{code:"JTIyaHVnJTIyJTNBJTIwJTVCJTIyaHUlMjIlMkMlMjAlMjJnJTIyJTVEJTIwKHNjb3JlJTIwMC4wMDY4MDIpJTBBJTIyaHVncyUyMiUzQSUyMCU1QiUyMmh1JTIyJTJDJTIwJTIyZ3MlMjIlNUQlMjAoc2NvcmUlMjAwLjAwMTcwMSk=",highlighted:`<span class="hljs-string">&quot;hug&quot;</span>: [<span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;g&quot;</span>] <span class="hljs-comment">(score 0.006802)</span>
<span class="hljs-string">&quot;hugs&quot;</span>: [<span class="hljs-string">&quot;hu&quot;</span>, <span class="hljs-string">&quot;gs&quot;</span>] <span class="hljs-comment">(score 0.001701)</span>`,wrap:!1}}),Js=new U({props:{code:"LSUyMDEwJTIwKiUyMCgtbG9nKDAuMDcxNDI4KSklMjAlMkIlMjAxMCUyMColMjAoLWxvZygwLjAwNjgwMikpJTIwJTNEJTIwMjMuNQ==",highlighted:'- <span class="hljs-number">10</span> * (<span class="hljs-name">-log</span>(<span class="hljs-number">0.071428</span>)) + <span class="hljs-number">10</span> * (<span class="hljs-name">-log</span>(<span class="hljs-number">0.006802</span>)) = <span class="hljs-number">23.5</span>',wrap:!1}}),bs=new da({props:{title:"Implementarea Unigram",local:"implementarea-unigram",headingTag:"h2"}}),gs=new U({props:{code:"Y29ycHVzJTIwJTNEJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIyVGhpcyUyMGlzJTIwdGhlJTIwSHVnZ2luZyUyMEZhY2UlMjBDb3Vyc2UuJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIyVGhpcyUyMGNoYXB0ZXIlMjBpcyUyMGFib3V0JTIwdG9rZW5pemF0aW9uLiUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMlRoaXMlMjBzZWN0aW9uJTIwc2hvd3MlMjBzZXZlcmFsJTIwdG9rZW5pemVyJTIwYWxnb3JpdGhtcy4lMjIlMkMlMEElMjAlMjAlMjAlMjAlMjJIb3BlZnVsbHklMkMlMjB5b3UlMjB3aWxsJTIwYmUlMjBhYmxlJTIwdG8lMjB1bmRlcnN0YW5kJTIwaG93JTIwdGhleSUyMGFyZSUyMHRyYWluZWQlMjBhbmQlMjBnZW5lcmF0ZSUyMHRva2Vucy4lMjIlMkMlMEElNUQ=",highlighted:`corpus = [
<span class="hljs-string">&quot;This is the Hugging Face Course.&quot;</span>,
<span class="hljs-string">&quot;This chapter is about tokenization.&quot;</span>,
<span class="hljs-string">&quot;This section shows several tokenizer algorithms.&quot;</span>,
<span class="hljs-string">&quot;Hopefully, you will be able to understand how they are trained and generate tokens.&quot;</span>,
]`,wrap:!1}}),Is=new U({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZCglMjJ4bG5ldC1iYXNlLWNhc2VkJTIyKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;xlnet-base-cased&quot;</span>)`,wrap:!1}}),ks=new U({props:{code:"ZnJvbSUyMGNvbGxlY3Rpb25zJTIwaW1wb3J0JTIwZGVmYXVsdGRpY3QlMEElMEF3b3JkX2ZyZXFzJTIwJTNEJTIwZGVmYXVsdGRpY3QoaW50KSUwQWZvciUyMHRleHQlMjBpbiUyMGNvcnB1cyUzQSUwQSUyMCUyMCUyMCUyMHdvcmRzX3dpdGhfb2Zmc2V0cyUyMCUzRCUyMHRva2VuaXplci5iYWNrZW5kX3Rva2VuaXplci5wcmVfdG9rZW5pemVyLnByZV90b2tlbml6ZV9zdHIodGV4dCklMEElMjAlMjAlMjAlMjBuZXdfd29yZHMlMjAlM0QlMjAlNUJ3b3JkJTIwZm9yJTIwd29yZCUyQyUyMG9mZnNldCUyMGluJTIwd29yZHNfd2l0aF9vZmZzZXRzJTVEJTBBJTIwJTIwJTIwJTIwZm9yJTIwd29yZCUyMGluJTIwbmV3X3dvcmRzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwd29yZF9mcmVxcyU1QndvcmQlNUQlMjAlMkIlM0QlMjAxJTBBJTBBd29yZF9mcmVxcw==",highlighted:`<span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> defaultdict
word_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> corpus:
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
new_words = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
<span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> new_words:
word_freqs[word] += <span class="hljs-number">1</span>
word_freqs`,wrap:!1}}),qs=new U({props:{code:"Y2hhcl9mcmVxcyUyMCUzRCUyMGRlZmF1bHRkaWN0KGludCklMEFzdWJ3b3Jkc19mcmVxcyUyMCUzRCUyMGRlZmF1bHRkaWN0KGludCklMEFmb3IlMjB3b3JkJTJDJTIwZnJlcSUyMGluJTIwd29yZF9mcmVxcy5pdGVtcygpJTNBJTBBJTIwJTIwJTIwJTIwZm9yJTIwaSUyMGluJTIwcmFuZ2UobGVuKHdvcmQpKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGNoYXJfZnJlcXMlNUJ3b3JkJTVCaSU1RCU1RCUyMCUyQiUzRCUyMGZyZXElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBMb29wJTIwdGhyb3VnaCUyMHRoZSUyMHN1YndvcmRzJTIwb2YlMjBsZW5ndGglMjBhdCUyMGxlYXN0JTIwMiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGZvciUyMGolMjBpbiUyMHJhbmdlKGklMjAlMkIlMjAyJTJDJTIwbGVuKHdvcmQpJTIwJTJCJTIwMSklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzdWJ3b3Jkc19mcmVxcyU1QndvcmQlNUJpJTNBaiU1RCU1RCUyMCUyQiUzRCUyMGZyZXElMEElMEElMjMlMjBTb3J0YXJlYSUyMHN1YmN1dmludGVsb3IlMjBkdXAlQzQlODMlMjBmcmVjdmVuJUM4JTlCJUM0JTgzJTBBc29ydGVkX3N1YndvcmRzJTIwJTNEJTIwc29ydGVkKHN1YndvcmRzX2ZyZXFzLml0ZW1zKCklMkMlMjBrZXklM0RsYW1iZGElMjB4JTNBJTIweCU1QjElNUQlMkMlMjByZXZlcnNlJTNEVHJ1ZSklMEFzb3J0ZWRfc3Vid29yZHMlNUIlM0ExMCU1RA==",highlighted:`char_freqs = defaultdict(<span class="hljs-built_in">int</span>)
subwords_freqs = defaultdict(<span class="hljs-built_in">int</span>)
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word)):
char_freqs[word[i]] += freq
<span class="hljs-comment"># Loop through the subwords of length at least 2</span>
<span class="hljs-keyword">for</span> j <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(i + <span class="hljs-number">2</span>, <span class="hljs-built_in">len</span>(word) + <span class="hljs-number">1</span>):
subwords_freqs[word[i:j]] += freq
<span class="hljs-comment"># Sortarea subcuvintelor după frecvență</span>
sorted_subwords = <span class="hljs-built_in">sorted</span>(subwords_freqs.items(), key=<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-number">1</span>], reverse=<span class="hljs-literal">True</span>)
sorted_subwords[:<span class="hljs-number">10</span>]`,wrap:!1}}),xs=new U({props:{code:"JTVCKCclRTIlOTYlODF0JyUyQyUyMDcpJTJDJTIwKCdpcyclMkMlMjA1KSUyQyUyMCgnZXInJTJDJTIwNSklMkMlMjAoJyVFMiU5NiU4MWEnJTJDJTIwNSklMkMlMjAoJyVFMiU5NiU4MXRvJyUyQyUyMDQpJTJDJTIwKCd0byclMkMlMjA0KSUyQyUyMCgnZW4nJTJDJTIwNCklMkMlMjAoJyVFMiU5NiU4MVQnJTJDJTIwMyklMkMlMjAoJyVFMiU5NiU4MVRoJyUyQyUyMDMpJTJDJTIwKCclRTIlOTYlODFUaGknJTJDJTIwMyklNUQ=",highlighted:'[(<span class="hljs-string">&#x27;▁t&#x27;</span>, <span class="hljs-number">7</span>), (<span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;er&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;▁a&#x27;</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">&#x27;▁to&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;en&#x27;</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">&#x27;▁T&#x27;</span>, <span class="hljs-number">3</span>), (<span class="hljs-string">&#x27;▁Th&#x27;</span>, <span class="hljs-number">3</span>), (<span class="hljs-string">&#x27;▁Thi&#x27;</span>, <span class="hljs-number">3</span>)]',wrap:!1}}),zs=new U({props:{code:"dG9rZW5fZnJlcXMlMjAlM0QlMjBsaXN0KGNoYXJfZnJlcXMuaXRlbXMoKSklMjAlMkIlMjBzb3J0ZWRfc3Vid29yZHMlNUIlM0ElMjAzMDAlMjAtJTIwbGVuKGNoYXJfZnJlcXMpJTVEJTBBdG9rZW5fZnJlcXMlMjAlM0QlMjAlN0J0b2tlbiUzQSUyMGZyZXElMjBmb3IlMjB0b2tlbiUyQyUyMGZyZXElMjBpbiUyMHRva2VuX2ZyZXFzJTdE",highlighted:`token_freqs = <span class="hljs-built_in">list</span>(char_freqs.items()) + sorted_subwords[: <span class="hljs-number">300</span> - <span class="hljs-built_in">len</span>(char_freqs)]
token_freqs = {token: freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs}`,wrap:!1}}),$=new ja({props:{$$slots:{default:[Il]},$$scope:{ctx:w}}}),_s=new U({props:{code:"ZnJvbSUyMG1hdGglMjBpbXBvcnQlMjBsb2clMEElMEF0b3RhbF9zdW0lMjAlM0QlMjBzdW0oJTVCZnJlcSUyMGZvciUyMHRva2VuJTJDJTIwZnJlcSUyMGluJTIwdG9rZW5fZnJlcXMuaXRlbXMoKSU1RCklMEFtb2RlbCUyMCUzRCUyMCU3QnRva2VuJTNBJTIwLWxvZyhmcmVxJTIwJTJGJTIwdG90YWxfc3VtKSUyMGZvciUyMHRva2VuJTJDJTIwZnJlcSUyMGluJTIwdG9rZW5fZnJlcXMuaXRlbXMoKSU3RA==",highlighted:`<span class="hljs-keyword">from</span> math <span class="hljs-keyword">import</span> log
total_sum = <span class="hljs-built_in">sum</span>([freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()])
model = {token: -log(freq / total_sum) <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()}`,wrap:!1}}),Bs=new U({props:{code:"ZGVmJTIwZW5jb2RlX3dvcmQod29yZCUyQyUyMG1vZGVsKSUzQSUwQSUyMCUyMCUyMCUyMGJlc3Rfc2VnbWVudGF0aW9ucyUyMCUzRCUyMCU1QiU3QiUyMnN0YXJ0JTIyJTNBJTIwMCUyQyUyMCUyMnNjb3JlJTIyJTNBJTIwMSU3RCU1RCUyMCUyQiUyMCU1QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCU3QiUyMnN0YXJ0JTIyJTNBJTIwTm9uZSUyQyUyMCUyMnNjb3JlJTIyJTNBJTIwTm9uZSU3RCUyMGZvciUyMF8lMjBpbiUyMHJhbmdlKGxlbih3b3JkKSklMEElMjAlMjAlMjAlMjAlNUQlMEElMjAlMjAlMjAlMjBmb3IlMjBzdGFydF9pZHglMjBpbiUyMHJhbmdlKGxlbih3b3JkKSklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBUaGlzJTIwc2hvdWxkJTIwYmUlMjBwcm9wZXJseSUyMGZpbGxlZCUyMGJ5JTIwdGhlJTIwcHJldmlvdXMlMjBzdGVwcyUyMG9mJTIwdGhlJTIwbG9vcCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGJlc3Rfc2NvcmVfYXRfc3RhcnQlMjAlM0QlMjBiZXN0X3NlZ21lbnRhdGlvbnMlNUJzdGFydF9pZHglNUQlNUIlMjJzY29yZSUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGZvciUyMGVuZF9pZHglMjBpbiUyMHJhbmdlKHN0YXJ0X2lkeCUyMCUyQiUyMDElMkMlMjBsZW4od29yZCklMjAlMkIlMjAxKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2VuJTIwJTNEJTIwd29yZCU1QnN0YXJ0X2lkeCUzQWVuZF9pZHglNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMHRva2VuJTIwaW4lMjBtb2RlbCUyMGFuZCUyMGJlc3Rfc2NvcmVfYXRfc3RhcnQlMjBpcyUyMG5vdCUyME5vbmUlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzY29yZSUyMCUzRCUyMG1vZGVsJTVCdG9rZW4lNUQlMjAlMkIlMjBiZXN0X3Njb3JlX2F0X3N0YXJ0JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwSWYlMjB3ZSUyMGhhdmUlMjBmb3VuZCUyMGElMjBiZXR0ZXIlMjBzZWdtZW50YXRpb24lMjBlbmRpbmclMjBhdCUyMGVuZF9pZHglMkMlMjB3ZSUyMHVwZGF0ZSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwKCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGJlc3Rfc2VnbWVudGF0aW9ucyU1QmVuZF9pZHglNUQlNUIlMjJzY29yZSUyMiU1RCUyMGlzJTIwTm9uZSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG9yJTIwYmVzdF9zZWdtZW50YXRpb25zJTVCZW5kX2lkeCU1RCU1QiUyMnNjb3JlJTIyJTVEJTIwJTNFJTIwc2NvcmUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjApJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwYmVzdF9zZWdtZW50YXRpb25zJTVCZW5kX2lkeCU1RCUyMCUzRCUyMCU3QiUyMnN0YXJ0JTIyJTNBJTIwc3RhcnRfaWR4JTJDJTIwJTIyc2NvcmUlMjIlM0ElMjBzY29yZSU3RCUwQSUwQSUyMCUyMCUyMCUyMHNlZ21lbnRhdGlvbiUyMCUzRCUyMGJlc3Rfc2VnbWVudGF0aW9ucyU1Qi0xJTVEJTBBJTIwJTIwJTIwJTIwaWYlMjBzZWdtZW50YXRpb24lNUIlMjJzY29yZSUyMiU1RCUyMGlzJTIwTm9uZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMFdlJTIwZGlkJTIwbm90JTIwZmluZCUyMGElMjB0b2tlbml6YXRpb24lMjBvZiUyMHRoZSUyMHdvcmQlMjAtJTNFJTIwdW5rbm93biUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJldHVybiUyMCU1QiUyMiUzQ3VuayUzRSUyMiU1RCUyQyUyME5vbmUlMEElMEElMjAlMjAlMjAlMjBzY29yZSUyMCUzRCUyMHNlZ21lbnRhdGlvbiU1QiUyMnNjb3JlJTIyJTVEJTBBJTIwJTIwJTIwJTIwc3RhcnQlMjAlM0QlMjBzZWdtZW50YXRpb24lNUIlMjJzdGFydCUyMiU1RCUwQSUyMCUyMCUyMCUyMGVuZCUyMCUzRCUyMGxlbih3b3JkKSUwQSUyMCUyMCUyMCUyMHRva2VucyUyMCUzRCUyMCU1QiU1RCUwQSUyMCUyMCUyMCUyMHdoaWxlJTIwc3RhcnQlMjAhJTNEJTIwMCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2Vucy5pbnNlcnQoMCUyQyUyMHdvcmQlNUJzdGFydCUzQWVuZCU1RCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBuZXh0X3N0YXJ0JTIwJTNEJTIwYmVzdF9zZWdtZW50YXRpb25zJTVCc3RhcnQlNUQlNUIlMjJzdGFydCUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGVuZCUyMCUzRCUyMHN0YXJ0JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc3RhcnQlMjAlM0QlMjBuZXh0X3N0YXJ0JTBBJTIwJTIwJTIwJTIwdG9rZW5zLmluc2VydCgwJTJDJTIwd29yZCU1QnN0YXJ0JTNBZW5kJTVEKSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMHRva2VucyUyQyUyMHNjb3Jl",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">encode_word</span>(<span class="hljs-params">word, model</span>):
best_segmentations = [{<span class="hljs-string">&quot;start&quot;</span>: <span class="hljs-number">0</span>, <span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-number">1</span>}] + [
{<span class="hljs-string">&quot;start&quot;</span>: <span class="hljs-literal">None</span>, <span class="hljs-string">&quot;score&quot;</span>: <span class="hljs-literal">None</span>} <span class="hljs-keyword">for</span> _ <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word))
]
<span class="hljs-keyword">for</span> start_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(word)):
<span class="hljs-comment"># This should be properly filled by the previous steps of the loop</span>
best_score_at_start = best_segmentations[start_idx][<span class="hljs-string">&quot;score&quot;</span>]
<span class="hljs-keyword">for</span> end_idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(start_idx + <span class="hljs-number">1</span>, <span class="hljs-built_in">len</span>(word) + <span class="hljs-number">1</span>):
token = word[start_idx:end_idx]
<span class="hljs-keyword">if</span> token <span class="hljs-keyword">in</span> model <span class="hljs-keyword">and</span> best_score_at_start <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
score = model[token] + best_score_at_start
<span class="hljs-comment"># If we have found a better segmentation ending at end_idx, we update</span>
<span class="hljs-keyword">if</span> (
best_segmentations[end_idx][<span class="hljs-string">&quot;score&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>
<span class="hljs-keyword">or</span> best_segmentations[end_idx][<span class="hljs-string">&quot;score&quot;</span>] &gt; score
):
best_segmentations[end_idx] = {<span class="hljs-string">&quot;start&quot;</span>: start_idx, <span class="hljs-string">&quot;score&quot;</span>: score}
segmentation = best_segmentations[-<span class="hljs-number">1</span>]
<span class="hljs-keyword">if</span> segmentation[<span class="hljs-string">&quot;score&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>:
<span class="hljs-comment"># We did not find a tokenization of the word -&gt; unknown</span>
<span class="hljs-keyword">return</span> [<span class="hljs-string">&quot;&lt;unk&gt;&quot;</span>], <span class="hljs-literal">None</span>
score = segmentation[<span class="hljs-string">&quot;score&quot;</span>]
start = segmentation[<span class="hljs-string">&quot;start&quot;</span>]
end = <span class="hljs-built_in">len</span>(word)
tokens = []
<span class="hljs-keyword">while</span> start != <span class="hljs-number">0</span>:
tokens.insert(<span class="hljs-number">0</span>, word[start:end])
next_start = best_segmentations[start][<span class="hljs-string">&quot;start&quot;</span>]
end = start
start = next_start
tokens.insert(<span class="hljs-number">0</span>, word[start:end])
<span class="hljs-keyword">return</span> tokens, score`,wrap:!1}}),Ss=new U({props:{code:"cHJpbnQoZW5jb2RlX3dvcmQoJTIySG9wZWZ1bGx5JTIyJTJDJTIwbW9kZWwpKSUwQXByaW50KGVuY29kZV93b3JkKCUyMlRoaXMlMjIlMkMlMjBtb2RlbCkp",highlighted:`<span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;Hopefully&quot;</span>, model))
<span class="hljs-built_in">print</span>(encode_word(<span class="hljs-string">&quot;This&quot;</span>, model))`,wrap:!1}}),Rs=new U({props:{code:"KCU1QidIJyUyQyUyMCdvJyUyQyUyMCdwJyUyQyUyMCdlJyUyQyUyMCdmJyUyQyUyMCd1JyUyQyUyMCdsbCclMkMlMjAneSclNUQlMkMlMjA0MS41MTU3NDk0NjAxNDAyKSUwQSglNUInVGhpcyclNUQlMkMlMjA2LjI4ODI2NzAzMDY5NDUzNSk=",highlighted:`([<span class="hljs-string">&#x27;H&#x27;</span>, <span class="hljs-string">&#x27;o&#x27;</span>, <span class="hljs-string">&#x27;p&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;f&#x27;</span>, <span class="hljs-string">&#x27;u&#x27;</span>, <span class="hljs-string">&#x27;ll&#x27;</span>, <span class="hljs-string">&#x27;y&#x27;</span>], <span class="hljs-number">41.5157494601402</span>)
([<span class="hljs-string">&#x27;This&#x27;</span>], <span class="hljs-number">6.288267030694535</span>)`,wrap:!1}}),Hs=new U({props:{code:"ZGVmJTIwY29tcHV0ZV9sb3NzKG1vZGVsKSUzQSUwQSUyMCUyMCUyMCUyMGxvc3MlMjAlM0QlMjAwJTBBJTIwJTIwJTIwJTIwZm9yJTIwd29yZCUyQyUyMGZyZXElMjBpbiUyMHdvcmRfZnJlcXMuaXRlbXMoKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMF8lMkMlMjB3b3JkX2xvc3MlMjAlM0QlMjBlbmNvZGVfd29yZCh3b3JkJTJDJTIwbW9kZWwpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbG9zcyUyMCUyQiUzRCUyMGZyZXElMjAqJTIwd29yZF9sb3NzJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwbG9zcw==",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_loss</span>(<span class="hljs-params">model</span>):
loss = <span class="hljs-number">0</span>
<span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items():
_, word_loss = encode_word(word, model)
loss += freq * word_loss
<span class="hljs-keyword">return</span> loss`,wrap:!1}}),Ws=new U({props:{code:"Y29tcHV0ZV9sb3NzKG1vZGVsKQ==",highlighted:"compute_loss(model)",wrap:!1}}),Ds=new U({props:{code:"NDEzLjEwMzc3NjQyOTQwODc1",highlighted:'<span class="hljs-number">413.10377642940875</span>',wrap:!1}}),Ys=new U({props:{code:"aW1wb3J0JTIwY29weSUwQSUwQSUwQWRlZiUyMGNvbXB1dGVfc2NvcmVzKG1vZGVsKSUzQSUwQSUyMCUyMCUyMCUyMHNjb3JlcyUyMCUzRCUyMCU3QiU3RCUwQSUyMCUyMCUyMCUyMG1vZGVsX2xvc3MlMjAlM0QlMjBjb21wdXRlX2xvc3MobW9kZWwpJTBBJTIwJTIwJTIwJTIwZm9yJTIwdG9rZW4lMkMlMjBzY29yZSUyMGluJTIwbW9kZWwuaXRlbXMoKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMFdlJTIwYWx3YXlzJTIwa2VlcCUyMHRva2VucyUyMG9mJTIwbGVuZ3RoJTIwMSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwbGVuKHRva2VuKSUyMCUzRCUzRCUyMDElM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBjb250aW51ZSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG1vZGVsX3dpdGhvdXRfdG9rZW4lMjAlM0QlMjBjb3B5LmRlZXBjb3B5KG1vZGVsKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMF8lMjAlM0QlMjBtb2RlbF93aXRob3V0X3Rva2VuLnBvcCh0b2tlbiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzY29yZXMlNUJ0b2tlbiU1RCUyMCUzRCUyMGNvbXB1dGVfbG9zcyhtb2RlbF93aXRob3V0X3Rva2VuKSUyMC0lMjBtb2RlbF9sb3NzJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwc2NvcmVz",highlighted:`<span class="hljs-keyword">import</span> copy
<span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_scores</span>(<span class="hljs-params">model</span>):
scores = {}
model_loss = compute_loss(model)
<span class="hljs-keyword">for</span> token, score <span class="hljs-keyword">in</span> model.items():
<span class="hljs-comment"># We always keep tokens of length 1</span>
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(token) == <span class="hljs-number">1</span>:
<span class="hljs-keyword">continue</span>
model_without_token = copy.deepcopy(model)
_ = model_without_token.pop(token)
scores[token] = compute_loss(model_without_token) - model_loss
<span class="hljs-keyword">return</span> scores`,wrap:!1}}),Fs=new U({props:{code:"c2NvcmVzJTIwJTNEJTIwY29tcHV0ZV9zY29yZXMobW9kZWwpJTBBcHJpbnQoc2NvcmVzJTVCJTIybGwlMjIlNUQpJTBBcHJpbnQoc2NvcmVzJTVCJTIyaGlzJTIyJTVEKQ==",highlighted:`scores = compute_scores(model)
<span class="hljs-built_in">print</span>(scores[<span class="hljs-string">&quot;ll&quot;</span>])
<span class="hljs-built_in">print</span>(scores[<span class="hljs-string">&quot;his&quot;</span>])`,wrap:!1}}),Os=new U({props:{code:"Ni4zNzY0MTI0MDM2MjM4NzQlMEEwLjA=",highlighted:`<span class="hljs-number">6.376412403623874</span>
<span class="hljs-number">0.0</span>`,wrap:!1}}),q=new ja({props:{$$slots:{default:[vl]},$$scope:{ctx:w}}}),aa=new U({props:{code:"cGVyY2VudF90b19yZW1vdmUlMjAlM0QlMjAwLjElMEF3aGlsZSUyMGxlbihtb2RlbCklMjAlM0UlMjAxMDAlM0ElMEElMjAlMjAlMjAlMjBzY29yZXMlMjAlM0QlMjBjb21wdXRlX3Njb3Jlcyhtb2RlbCklMEElMjAlMjAlMjAlMjBzb3J0ZWRfc2NvcmVzJTIwJTNEJTIwc29ydGVkKHNjb3Jlcy5pdGVtcygpJTJDJTIwa2V5JTNEbGFtYmRhJTIweCUzQSUyMHglNUIxJTVEKSUwQSUyMCUyMCUyMCUyMCUyMyUyMFJlbW92ZSUyMHBlcmNlbnRfdG9fcmVtb3ZlJTIwdG9rZW5zJTIwd2l0aCUyMHRoZSUyMGxvd2VzdCUyMHNjb3Jlcy4lMEElMjAlMjAlMjAlMjBmb3IlMjBpJTIwaW4lMjByYW5nZShpbnQobGVuKG1vZGVsKSUyMColMjBwZXJjZW50X3RvX3JlbW92ZSkpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwXyUyMCUzRCUyMHRva2VuX2ZyZXFzLnBvcChzb3J0ZWRfc2NvcmVzJTVCaSU1RCU1QjAlNUQpJTBBJTBBJTIwJTIwJTIwJTIwdG90YWxfc3VtJTIwJTNEJTIwc3VtKCU1QmZyZXElMjBmb3IlMjB0b2tlbiUyQyUyMGZyZXElMjBpbiUyMHRva2VuX2ZyZXFzLml0ZW1zKCklNUQpJTBBJTIwJTIwJTIwJTIwbW9kZWwlMjAlM0QlMjAlN0J0b2tlbiUzQSUyMC1sb2coZnJlcSUyMCUyRiUyMHRvdGFsX3N1bSklMjBmb3IlMjB0b2tlbiUyQyUyMGZyZXElMjBpbiUyMHRva2VuX2ZyZXFzLml0ZW1zKCklN0Q=",highlighted:`percent_to_remove = <span class="hljs-number">0.1</span>
<span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(model) &gt; <span class="hljs-number">100</span>:
scores = compute_scores(model)
sorted_scores = <span class="hljs-built_in">sorted</span>(scores.items(), key=<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-number">1</span>])
<span class="hljs-comment"># Remove percent_to_remove tokens with the lowest scores.</span>
<span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">int</span>(<span class="hljs-built_in">len</span>(model) * percent_to_remove)):
_ = token_freqs.pop(sorted_scores[i][<span class="hljs-number">0</span>])
total_sum = <span class="hljs-built_in">sum</span>([freq <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()])
model = {token: -log(freq / total_sum) <span class="hljs-keyword">for</span> token, freq <span class="hljs-keyword">in</span> token_freqs.items()}`,wrap:!1}}),ta=new U({props:{code:"ZGVmJTIwdG9rZW5pemUodGV4dCUyQyUyMG1vZGVsKSUzQSUwQSUyMCUyMCUyMCUyMHdvcmRzX3dpdGhfb2Zmc2V0cyUyMCUzRCUyMHRva2VuaXplci5iYWNrZW5kX3Rva2VuaXplci5wcmVfdG9rZW5pemVyLnByZV90b2tlbml6ZV9zdHIodGV4dCklMEElMjAlMjAlMjAlMjBwcmVfdG9rZW5pemVkX3RleHQlMjAlM0QlMjAlNUJ3b3JkJTIwZm9yJTIwd29yZCUyQyUyMG9mZnNldCUyMGluJTIwd29yZHNfd2l0aF9vZmZzZXRzJTVEJTBBJTIwJTIwJTIwJTIwZW5jb2RlZF93b3JkcyUyMCUzRCUyMCU1QmVuY29kZV93b3JkKHdvcmQlMkMlMjBtb2RlbCklNUIwJTVEJTIwZm9yJTIwd29yZCUyMGluJTIwcHJlX3Rva2VuaXplZF90ZXh0JTVEJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwc3VtKGVuY29kZWRfd29yZHMlMkMlMjAlNUIlNUQpJTBBJTBBJTBBdG9rZW5pemUoJTIyVGhpcyUyMGlzJTIwdGhlJTIwSHVnZ2luZyUyMEZhY2UlMjBjb3Vyc2UuJTIyJTJDJTIwbW9kZWwp",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize</span>(<span class="hljs-params">text, model</span>):
words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
pre_tokenized_text = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets]
encoded_words = [encode_word(word, model)[<span class="hljs-number">0</span>] <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> pre_tokenized_text]
<span class="hljs-keyword">return</span> <span class="hljs-built_in">sum</span>(encoded_words, [])
tokenize(<span class="hljs-string">&quot;This is the Hugging Face course.&quot;</span>, model)`,wrap:!1}}),la=new U({props:{code:"JTVCJyVFMiU5NiU4MVRoaXMnJTJDJTIwJyVFMiU5NiU4MWlzJyUyQyUyMCclRTIlOTYlODF0aGUnJTJDJTIwJyVFMiU5NiU4MUh1Z2dpbmcnJTJDJTIwJyVFMiU5NiU4MUZhY2UnJTJDJTIwJyVFMiU5NiU4MSclMkMlMjAnYyclMkMlMjAnb3UnJTJDJTIwJ3InJTJDJTIwJ3MnJTJDJTIwJ2UnJTJDJTIwJy4nJTVE",highlighted:'[<span class="hljs-string">&#x27;▁This&#x27;</span>, <span class="hljs-string">&#x27;▁is&#x27;</span>, <span class="hljs-string">&#x27;▁the&#x27;</span>, <span class="hljs-string">&#x27;▁Hugging&#x27;</span>, <span class="hljs-string">&#x27;▁Face&#x27;</span>, <span class="hljs-string">&#x27;▁&#x27;</span>, <span class="hljs-string">&#x27;c&#x27;</span>, <span class="hljs-string">&#x27;ou&#x27;</span>, <span class="hljs-string">&#x27;r&#x27;</span>, <span class="hljs-string">&#x27;s&#x27;</span>, <span class="hljs-string">&#x27;e&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>]',wrap:!1}}),pa=new Tl({props:{source:"https://github.com/huggingface/course/blob/main/chapters/rum/chapter6/7.mdx"}}),{c(){c=p("meta"),h=l(),d=p("p"),J=l(),m(A.$$.fragment),ha=l(),m(z.$$.fragment),Ja=l(),Z=p("p"),Z.textContent=yt,wa=l(),m(_.$$.fragment),ba=l(),m(C.$$.fragment),Ta=l(),m(V.$$.fragment),fa=l(),N=p("p"),N.textContent=jt,ga=l(),Q=p("p"),Q.textContent=dt,Ca=l(),I=p("p"),pt=ra("Aceasta este o operațiune foarte costisitoare, așa că nu eliminăm doar simbolul asociat cu cea mai mică creștere a pierderii, ci procentul"),Ia=new ut(!1),va=ra(" (\\(p\\) fiind un hyperparametru pe care îl poți controla, de obicei 10 sau 20) din simbolurile asociate cu cea mai mică creștere a pierderilor. Acest proces este se repetă până când vocabularul atinge dimensiunea dorită."),ka=l(),B=p("p"),B.textContent=Ut,$a=l(),G=p("p"),G.textContent=ht,qa=l(),S=p("p"),S.textContent=Jt,xa=l(),m(R.$$.fragment),Aa=l(),E=p("p"),E.textContent=wt,za=l(),m(H.$$.fragment),Za=l(),m(X.$$.fragment),_a=l(),W=p("p"),W.textContent=bt,Va=l(),D=p("p"),D.innerHTML=Tt,Na=l(),K=p("p"),K.textContent=ft,Qa=l(),m(Y.$$.fragment),Ba=l(),P=p("p"),P.innerHTML=gt,Ga=l(),m(v.$$.fragment),Sa=l(),b=p("p"),it=ra("Acum, pentru a tokeniza un cuvânt dat, ne uităm la toate segmentările posibile în tokeni și calculăm probabilitatea fiecăruia în conformitate cu modelul Unigram. Deoarece toate token-urile sunt considerate independente, această probabilitate este doar produsul probabilității fiecărui token. De exemplu, tokenizarea "),ma=p("code"),ma.textContent=Ct,rt=ra(" a lui "),oa=p("code"),oa.textContent=It,ct=ra(` are probabilitatea:
`),Ra=new ut(!1),Ea=l(),g=p("p"),mt=ra("Comparativ, tokenizarea "),ua=p("code"),ua.textContent=vt,ot=ra(` are probabilitatea:
`),Ha=new ut(!1),Xa=l(),F=p("p"),F.textContent=kt,Wa=l(),L=p("p"),L.innerHTML=$t,Da=l(),m(O.$$.fragment),Ka=l(),ss=p("p"),ss.innerHTML=qt,Ya=l(),as=p("p"),as.innerHTML=xt,Pa=l(),es=p("p"),es.textContent=At,Fa=l(),ts=p("p"),ts.innerHTML=zt,La=l(),m(ls.$$.fragment),Oa=l(),ns=p("p"),ns.innerHTML=Zt,se=l(),m(k.$$.fragment),ae=l(),m(ps.$$.fragment),ee=l(),is=p("p"),is.textContent=_t,te=l(),rs=p("p"),rs.innerHTML=Vt,le=l(),cs=p("p"),cs.textContent=Nt,ne=l(),m(ms.$$.fragment),pe=l(),os=p("p"),os.textContent=Qt,ie=l(),m(us.$$.fragment),re=l(),Ms=p("p"),Ms.textContent=Bt,ce=l(),m(ys.$$.fragment),me=l(),js=p("p"),js.innerHTML=Gt,oe=l(),ds=p("p"),ds.innerHTML=St,ue=l(),m(Us.$$.fragment),Me=l(),hs=p("p"),hs.textContent=Rt,ye=l(),m(Js.$$.fragment),je=l(),ws=p("p"),ws.innerHTML=Et,de=l(),m(bs.$$.fragment),Ue=l(),Ts=p("p"),Ts.textContent=Ht,he=l(),fs=p("p"),fs.textContent=Xt,Je=l(),m(gs.$$.fragment),we=l(),Cs=p("p"),Cs.innerHTML=Wt,be=l(),m(Is.$$.fragment),Te=l(),vs=p("p"),vs.textContent=Dt,fe=l(),m(ks.$$.fragment),ge=l(),$s=p("p"),$s.textContent=Kt,Ce=l(),m(qs.$$.fragment),Ie=l(),m(xs.$$.fragment),ve=l(),As=p("p"),As.textContent=Yt,ke=l(),m(zs.$$.fragment),$e=l(),m($.$$.fragment),qe=l(),Zs=p("p"),Zs.textContent=Pt,xe=l(),m(_s.$$.fragment),Ae=l(),Vs=p("p"),Vs.innerHTML=Ft,ze=l(),Ns=p("p"),Ns.innerHTML=Lt,Ze=l(),Qs=p("p"),Qs.textContent=Ot,_e=l(),m(Bs.$$.fragment),Ve=l(),Gs=p("p"),Gs.textContent=sl,Ne=l(),m(Ss.$$.fragment),Qe=l(),m(Rs.$$.fragment),Be=l(),Es=p("p"),Es.textContent=al,Ge=l(),m(Hs.$$.fragment),Se=l(),Xs=p("p"),Xs.textContent=el,Re=l(),m(Ws.$$.fragment),Ee=l(),m(Ds.$$.fragment),He=l(),Ks=p("p"),Ks.textContent=tl,Xe=l(),m(Ys.$$.fragment),We=l(),Ps=p("p"),Ps.textContent=ll,De=l(),m(Fs.$$.fragment),Ke=l(),Ls=p("p"),Ls.innerHTML=nl,Ye=l(),m(Os.$$.fragment),Pe=l(),m(q.$$.fragment),Fe=l(),sa=p("p"),sa.textContent=pl,Le=l(),m(aa.$$.fragment),Oe=l(),ea=p("p"),ea.innerHTML=il,st=l(),m(ta.$$.fragment),at=l(),m(la.$$.fragment),et=l(),na=p("p"),na.textContent=rl,tt=l(),m(pa.$$.fragment),lt=l(),Ma=p("p"),this.h()},l(s){const a=Jl("svelte-u9bgzb",document.head);c=i(a,"META",{name:!0,content:!0}),a.forEach(e),h=n(s),d=i(s,"P",{}),ya(d).forEach(e),J=n(s),o(A.$$.fragment,s),ha=n(s),o(z.$$.fragment,s),Ja=n(s),Z=i(s,"P",{"data-svelte-h":!0}),r(Z)!=="svelte-1a2x6pl"&&(Z.textContent=yt),wa=n(s),o(_.$$.fragment,s),ba=n(s),o(C.$$.fragment,s),Ta=n(s),o(V.$$.fragment,s),fa=n(s),N=i(s,"P",{"data-svelte-h":!0}),r(N)!=="svelte-1cxrmj3"&&(N.textContent=jt),ga=n(s),Q=i(s,"P",{"data-svelte-h":!0}),r(Q)!=="svelte-1x593wj"&&(Q.textContent=dt),Ca=n(s),I=i(s,"P",{});var ia=ya(I);pt=ca(ia,"Aceasta este o operațiune foarte costisitoare, așa că nu eliminăm doar simbolul asociat cu cea mai mică creștere a pierderii, ci procentul"),Ia=Mt(ia,!1),va=ca(ia," (\\(p\\) fiind un hyperparametru pe care îl poți controla, de obicei 10 sau 20) din simbolurile asociate cu cea mai mică creștere a pierderilor. Acest proces este se repetă până când vocabularul atinge dimensiunea dorită."),ia.forEach(e),ka=n(s),B=i(s,"P",{"data-svelte-h":!0}),r(B)!=="svelte-4com3a"&&(B.textContent=Ut),$a=n(s),G=i(s,"P",{"data-svelte-h":!0}),r(G)!=="svelte-tphyph"&&(G.textContent=ht),qa=n(s),S=i(s,"P",{"data-svelte-h":!0}),r(S)!=="svelte-1asmub4"&&(S.textContent=Jt),xa=n(s),o(R.$$.fragment,s),Aa=n(s),E=i(s,"P",{"data-svelte-h":!0}),r(E)!=="svelte-1fechpk"&&(E.textContent=wt),za=n(s),o(H.$$.fragment,s),Za=n(s),o(X.$$.fragment,s),_a=n(s),W=i(s,"P",{"data-svelte-h":!0}),r(W)!=="svelte-a1fcf2"&&(W.textContent=bt),Va=n(s),D=i(s,"P",{"data-svelte-h":!0}),r(D)!=="svelte-168eu2r"&&(D.innerHTML=Tt),Na=n(s),K=i(s,"P",{"data-svelte-h":!0}),r(K)!=="svelte-qvq4cu"&&(K.textContent=ft),Qa=n(s),o(Y.$$.fragment,s),Ba=n(s),P=i(s,"P",{"data-svelte-h":!0}),r(P)!=="svelte-tq8hu8"&&(P.innerHTML=gt),Ga=n(s),o(v.$$.fragment,s),Sa=n(s),b=i(s,"P",{});var f=ya(b);it=ca(f,"Acum, pentru a tokeniza un cuvânt dat, ne uităm la toate segmentările posibile în tokeni și calculăm probabilitatea fiecăruia în conformitate cu modelul Unigram. Deoarece toate token-urile sunt considerate independente, această probabilitate este doar produsul probabilității fiecărui token. De exemplu, tokenizarea "),ma=i(f,"CODE",{"data-svelte-h":!0}),r(ma)!=="svelte-1n2m4po"&&(ma.textContent=Ct),rt=ca(f," a lui "),oa=i(f,"CODE",{"data-svelte-h":!0}),r(oa)!=="svelte-1gjdq76"&&(oa.textContent=It),ct=ca(f,` are probabilitatea:
`),Ra=Mt(f,!1),f.forEach(e),Ea=n(s),g=i(s,"P",{});var x=ya(g);mt=ca(x,"Comparativ, tokenizarea "),ua=i(x,"CODE",{"data-svelte-h":!0}),r(ua)!=="svelte-42m5r0"&&(ua.textContent=vt),ot=ca(x,` are probabilitatea:
`),Ha=Mt(x,!1),x.forEach(e),Xa=n(s),F=i(s,"P",{"data-svelte-h":!0}),r(F)!=="svelte-hg99nj"&&(F.textContent=kt),Wa=n(s),L=i(s,"P",{"data-svelte-h":!0}),r(L)!=="svelte-1ee5mm7"&&(L.innerHTML=$t),Da=n(s),o(O.$$.fragment,s),Ka=n(s),ss=i(s,"P",{"data-svelte-h":!0}),r(ss)!=="svelte-3k0tl5"&&(ss.innerHTML=qt),Ya=n(s),as=i(s,"P",{"data-svelte-h":!0}),r(as)!=="svelte-z7a5sc"&&(as.innerHTML=xt),Pa=n(s),es=i(s,"P",{"data-svelte-h":!0}),r(es)!=="svelte-15nla7p"&&(es.textContent=At),Fa=n(s),ts=i(s,"P",{"data-svelte-h":!0}),r(ts)!=="svelte-1gcl29b"&&(ts.innerHTML=zt),La=n(s),o(ls.$$.fragment,s),Oa=n(s),ns=i(s,"P",{"data-svelte-h":!0}),r(ns)!=="svelte-1g1ohqe"&&(ns.innerHTML=Zt),se=n(s),o(k.$$.fragment,s),ae=n(s),o(ps.$$.fragment,s),ee=n(s),is=i(s,"P",{"data-svelte-h":!0}),r(is)!=="svelte-12vj6oy"&&(is.textContent=_t),te=n(s),rs=i(s,"P",{"data-svelte-h":!0}),r(rs)!=="svelte-1bh8ydg"&&(rs.innerHTML=Vt),le=n(s),cs=i(s,"P",{"data-svelte-h":!0}),r(cs)!=="svelte-umqha5"&&(cs.textContent=Nt),ne=n(s),o(ms.$$.fragment,s),pe=n(s),os=i(s,"P",{"data-svelte-h":!0}),r(os)!=="svelte-1qowfab"&&(os.textContent=Qt),ie=n(s),o(us.$$.fragment,s),re=n(s),Ms=i(s,"P",{"data-svelte-h":!0}),r(Ms)!=="svelte-1amw5rn"&&(Ms.textContent=Bt),ce=n(s),o(ys.$$.fragment,s),me=n(s),js=i(s,"P",{"data-svelte-h":!0}),r(js)!=="svelte-1e0a24y"&&(js.innerHTML=Gt),oe=n(s),ds=i(s,"P",{"data-svelte-h":!0}),r(ds)!=="svelte-10ssw0w"&&(ds.innerHTML=St),ue=n(s),o(Us.$$.fragment,s),Me=n(s),hs=i(s,"P",{"data-svelte-h":!0}),r(hs)!=="svelte-1bkkpl"&&(hs.textContent=Rt),ye=n(s),o(Js.$$.fragment,s),je=n(s),ws=i(s,"P",{"data-svelte-h":!0}),r(ws)!=="svelte-1nf5wyu"&&(ws.innerHTML=Et),de=n(s),o(bs.$$.fragment,s),Ue=n(s),Ts=i(s,"P",{"data-svelte-h":!0}),r(Ts)!=="svelte-1e26tsy"&&(Ts.textContent=Ht),he=n(s),fs=i(s,"P",{"data-svelte-h":!0}),r(fs)!=="svelte-1y8mf0v"&&(fs.textContent=Xt),Je=n(s),o(gs.$$.fragment,s),we=n(s),Cs=i(s,"P",{"data-svelte-h":!0}),r(Cs)!=="svelte-1e82rwh"&&(Cs.innerHTML=Wt),be=n(s),o(Is.$$.fragment,s),Te=n(s),vs=i(s,"P",{"data-svelte-h":!0}),r(vs)!=="svelte-9p88w4"&&(vs.textContent=Dt),fe=n(s),o(ks.$$.fragment,s),ge=n(s),$s=i(s,"P",{"data-svelte-h":!0}),r($s)!=="svelte-wr84u4"&&($s.textContent=Kt),Ce=n(s),o(qs.$$.fragment,s),Ie=n(s),o(xs.$$.fragment,s),ve=n(s),As=i(s,"P",{"data-svelte-h":!0}),r(As)!=="svelte-ew70bw"&&(As.textContent=Yt),ke=n(s),o(zs.$$.fragment,s),$e=n(s),o($.$$.fragment,s),qe=n(s),Zs=i(s,"P",{"data-svelte-h":!0}),r(Zs)!=="svelte-t8yzra"&&(Zs.textContent=Pt),xe=n(s),o(_s.$$.fragment,s),Ae=n(s),Vs=i(s,"P",{"data-svelte-h":!0}),r(Vs)!=="svelte-f2ayzq"&&(Vs.innerHTML=Ft),ze=n(s),Ns=i(s,"P",{"data-svelte-h":!0}),r(Ns)!=="svelte-lnoitx"&&(Ns.innerHTML=Lt),Ze=n(s),Qs=i(s,"P",{"data-svelte-h":!0}),r(Qs)!=="svelte-1m6zfgz"&&(Qs.textContent=Ot),_e=n(s),o(Bs.$$.fragment,s),Ve=n(s),Gs=i(s,"P",{"data-svelte-h":!0}),r(Gs)!=="svelte-ez83mz"&&(Gs.textContent=sl),Ne=n(s),o(Ss.$$.fragment,s),Qe=n(s),o(Rs.$$.fragment,s),Be=n(s),Es=i(s,"P",{"data-svelte-h":!0}),r(Es)!=="svelte-16fxxb9"&&(Es.textContent=al),Ge=n(s),o(Hs.$$.fragment,s),Se=n(s),Xs=i(s,"P",{"data-svelte-h":!0}),r(Xs)!=="svelte-dg0m9h"&&(Xs.textContent=el),Re=n(s),o(Ws.$$.fragment,s),Ee=n(s),o(Ds.$$.fragment,s),He=n(s),Ks=i(s,"P",{"data-svelte-h":!0}),r(Ks)!=="svelte-11aqyeu"&&(Ks.textContent=tl),Xe=n(s),o(Ys.$$.fragment,s),We=n(s),Ps=i(s,"P",{"data-svelte-h":!0}),r(Ps)!=="svelte-1m94bz1"&&(Ps.textContent=ll),De=n(s),o(Fs.$$.fragment,s),Ke=n(s),Ls=i(s,"P",{"data-svelte-h":!0}),r(Ls)!=="svelte-1jayei5"&&(Ls.innerHTML=nl),Ye=n(s),o(Os.$$.fragment,s),Pe=n(s),o(q.$$.fragment,s),Fe=n(s),sa=i(s,"P",{"data-svelte-h":!0}),r(sa)!=="svelte-zknib0"&&(sa.textContent=pl),Le=n(s),o(aa.$$.fragment,s),Oe=n(s),ea=i(s,"P",{"data-svelte-h":!0}),r(ea)!=="svelte-2p2t27"&&(ea.innerHTML=il),st=n(s),o(ta.$$.fragment,s),at=n(s),o(la.$$.fragment,s),et=n(s),na=i(s,"P",{"data-svelte-h":!0}),r(na)!=="svelte-ocw80d"&&(na.textContent=rl),tt=n(s),o(pa.$$.fragment,s),lt=n(s),Ma=i(s,"P",{}),ya(Ma).forEach(e),this.h()},h(){ol(c,"name","hf:doc:metadata"),ol(c,"content",$l),Ia.a=va,Ra.a=null,Ha.a=null},m(s,a){T(document.head,c),t(s,h,a),t(s,d,a),t(s,J,a),u(A,s,a),t(s,ha,a),u(z,s,a),t(s,Ja,a),t(s,Z,a),t(s,wa,a),u(_,s,a),t(s,ba,a),u(C,s,a),t(s,Ta,a),u(V,s,a),t(s,fa,a),t(s,N,a),t(s,ga,a),t(s,Q,a),t(s,Ca,a),t(s,I,a),T(I,pt),Ia.m(ul,I),T(I,va),t(s,ka,a),t(s,B,a),t(s,$a,a),t(s,G,a),t(s,qa,a),t(s,S,a),t(s,xa,a),u(R,s,a),t(s,Aa,a),t(s,E,a),t(s,za,a),u(H,s,a),t(s,Za,a),u(X,s,a),t(s,_a,a),t(s,W,a),t(s,Va,a),t(s,D,a),t(s,Na,a),t(s,K,a),t(s,Qa,a),u(Y,s,a),t(s,Ba,a),t(s,P,a),t(s,Ga,a),u(v,s,a),t(s,Sa,a),t(s,b,a),T(b,it),T(b,ma),T(b,rt),T(b,oa),T(b,ct),Ra.m(Ml,b),t(s,Ea,a),t(s,g,a),T(g,mt),T(g,ua),T(g,ot),Ha.m(yl,g),t(s,Xa,a),t(s,F,a),t(s,Wa,a),t(s,L,a),t(s,Da,a),u(O,s,a),t(s,Ka,a),t(s,ss,a),t(s,Ya,a),t(s,as,a),t(s,Pa,a),t(s,es,a),t(s,Fa,a),t(s,ts,a),t(s,La,a),u(ls,s,a),t(s,Oa,a),t(s,ns,a),t(s,se,a),u(k,s,a),t(s,ae,a),u(ps,s,a),t(s,ee,a),t(s,is,a),t(s,te,a),t(s,rs,a),t(s,le,a),t(s,cs,a),t(s,ne,a),u(ms,s,a),t(s,pe,a),t(s,os,a),t(s,ie,a),u(us,s,a),t(s,re,a),t(s,Ms,a),t(s,ce,a),u(ys,s,a),t(s,me,a),t(s,js,a),t(s,oe,a),t(s,ds,a),t(s,ue,a),u(Us,s,a),t(s,Me,a),t(s,hs,a),t(s,ye,a),u(Js,s,a),t(s,je,a),t(s,ws,a),t(s,de,a),u(bs,s,a),t(s,Ue,a),t(s,Ts,a),t(s,he,a),t(s,fs,a),t(s,Je,a),u(gs,s,a),t(s,we,a),t(s,Cs,a),t(s,be,a),u(Is,s,a),t(s,Te,a),t(s,vs,a),t(s,fe,a),u(ks,s,a),t(s,ge,a),t(s,$s,a),t(s,Ce,a),u(qs,s,a),t(s,Ie,a),u(xs,s,a),t(s,ve,a),t(s,As,a),t(s,ke,a),u(zs,s,a),t(s,$e,a),u($,s,a),t(s,qe,a),t(s,Zs,a),t(s,xe,a),u(_s,s,a),t(s,Ae,a),t(s,Vs,a),t(s,ze,a),t(s,Ns,a),t(s,Ze,a),t(s,Qs,a),t(s,_e,a),u(Bs,s,a),t(s,Ve,a),t(s,Gs,a),t(s,Ne,a),u(Ss,s,a),t(s,Qe,a),u(Rs,s,a),t(s,Be,a),t(s,Es,a),t(s,Ge,a),u(Hs,s,a),t(s,Se,a),t(s,Xs,a),t(s,Re,a),u(Ws,s,a),t(s,Ee,a),u(Ds,s,a),t(s,He,a),t(s,Ks,a),t(s,Xe,a),u(Ys,s,a),t(s,We,a),t(s,Ps,a),t(s,De,a),u(Fs,s,a),t(s,Ke,a),t(s,Ls,a),t(s,Ye,a),u(Os,s,a),t(s,Pe,a),u(q,s,a),t(s,Fe,a),t(s,sa,a),t(s,Le,a),u(aa,s,a),t(s,Oe,a),t(s,ea,a),t(s,st,a),u(ta,s,a),t(s,at,a),u(la,s,a),t(s,et,a),t(s,na,a),t(s,tt,a),u(pa,s,a),t(s,lt,a),t(s,Ma,a),nt=!0},p(s,[a]){const ia={};a&2&&(ia.$$scope={dirty:a,ctx:s}),C.$set(ia);const f={};a&2&&(f.$$scope={dirty:a,ctx:s}),v.$set(f);const x={};a&2&&(x.$$scope={dirty:a,ctx:s}),k.$set(x);const cl={};a&2&&(cl.$$scope={dirty:a,ctx:s}),$.$set(cl);const ml={};a&2&&(ml.$$scope={dirty:a,ctx:s}),q.$set(ml)},i(s){nt||(M(A.$$.fragment,s),M(z.$$.fragment,s),M(_.$$.fragment,s),M(C.$$.fragment,s),M(V.$$.fragment,s),M(R.$$.fragment,s),M(H.$$.fragment,s),M(X.$$.fragment,s),M(Y.$$.fragment,s),M(v.$$.fragment,s),M(O.$$.fragment,s),M(ls.$$.fragment,s),M(k.$$.fragment,s),M(ps.$$.fragment,s),M(ms.$$.fragment,s),M(us.$$.fragment,s),M(ys.$$.fragment,s),M(Us.$$.fragment,s),M(Js.$$.fragment,s),M(bs.$$.fragment,s),M(gs.$$.fragment,s),M(Is.$$.fragment,s),M(ks.$$.fragment,s),M(qs.$$.fragment,s),M(xs.$$.fragment,s),M(zs.$$.fragment,s),M($.$$.fragment,s),M(_s.$$.fragment,s),M(Bs.$$.fragment,s),M(Ss.$$.fragment,s),M(Rs.$$.fragment,s),M(Hs.$$.fragment,s),M(Ws.$$.fragment,s),M(Ds.$$.fragment,s),M(Ys.$$.fragment,s),M(Fs.$$.fragment,s),M(Os.$$.fragment,s),M(q.$$.fragment,s),M(aa.$$.fragment,s),M(ta.$$.fragment,s),M(la.$$.fragment,s),M(pa.$$.fragment,s),nt=!0)},o(s){y(A.$$.fragment,s),y(z.$$.fragment,s),y(_.$$.fragment,s),y(C.$$.fragment,s),y(V.$$.fragment,s),y(R.$$.fragment,s),y(H.$$.fragment,s),y(X.$$.fragment,s),y(Y.$$.fragment,s),y(v.$$.fragment,s),y(O.$$.fragment,s),y(ls.$$.fragment,s),y(k.$$.fragment,s),y(ps.$$.fragment,s),y(ms.$$.fragment,s),y(us.$$.fragment,s),y(ys.$$.fragment,s),y(Us.$$.fragment,s),y(Js.$$.fragment,s),y(bs.$$.fragment,s),y(gs.$$.fragment,s),y(Is.$$.fragment,s),y(ks.$$.fragment,s),y(qs.$$.fragment,s),y(xs.$$.fragment,s),y(zs.$$.fragment,s),y($.$$.fragment,s),y(_s.$$.fragment,s),y(Bs.$$.fragment,s),y(Ss.$$.fragment,s),y(Rs.$$.fragment,s),y(Hs.$$.fragment,s),y(Ws.$$.fragment,s),y(Ds.$$.fragment,s),y(Ys.$$.fragment,s),y(Fs.$$.fragment,s),y(Os.$$.fragment,s),y(q.$$.fragment,s),y(aa.$$.fragment,s),y(ta.$$.fragment,s),y(la.$$.fragment,s),y(pa.$$.fragment,s),nt=!1},d(s){s&&(e(h),e(d),e(J),e(ha),e(Ja),e(Z),e(wa),e(ba),e(Ta),e(fa),e(N),e(ga),e(Q),e(Ca),e(I),e(ka),e(B),e($a),e(G),e(qa),e(S),e(xa),e(Aa),e(E),e(za),e(Za),e(_a),e(W),e(Va),e(D),e(Na),e(K),e(Qa),e(Ba),e(P),e(Ga),e(Sa),e(b),e(Ea),e(g),e(Xa),e(F),e(Wa),e(L),e(Da),e(Ka),e(ss),e(Ya),e(as),e(Pa),e(es),e(Fa),e(ts),e(La),e(Oa),e(ns),e(se),e(ae),e(ee),e(is),e(te),e(rs),e(le),e(cs),e(ne),e(pe),e(os),e(ie),e(re),e(Ms),e(ce),e(me),e(js),e(oe),e(ds),e(ue),e(Me),e(hs),e(ye),e(je),e(ws),e(de),e(Ue),e(Ts),e(he),e(fs),e(Je),e(we),e(Cs),e(be),e(Te),e(vs),e(fe),e(ge),e($s),e(Ce),e(Ie),e(ve),e(As),e(ke),e($e),e(qe),e(Zs),e(xe),e(Ae),e(Vs),e(ze),e(Ns),e(Ze),e(Qs),e(_e),e(Ve),e(Gs),e(Ne),e(Qe),e(Be),e(Es),e(Ge),e(Se),e(Xs),e(Re),e(Ee),e(He),e(Ks),e(Xe),e(We),e(Ps),e(De),e(Ke),e(Ls),e(Ye),e(Pe),e(Fe),e(sa),e(Le),e(Oe),e(ea),e(st),e(at),e(et),e(na),e(tt),e(lt),e(Ma)),e(c),j(A,s),j(z,s),j(_,s),j(C,s),j(V,s),j(R,s),j(H,s),j(X,s),j(Y,s),j(v,s),j(O,s),j(ls,s),j(k,s),j(ps,s),j(ms,s),j(us,s),j(ys,s),j(Us,s),j(Js,s),j(bs,s),j(gs,s),j(Is,s),j(ks,s),j(qs,s),j(xs,s),j(zs,s),j($,s),j(_s,s),j(Bs,s),j(Ss,s),j(Rs,s),j(Hs,s),j(Ws,s),j(Ds,s),j(Ys,s),j(Fs,s),j(Os,s),j(q,s),j(aa,s),j(ta,s),j(la,s),j(pa,s)}}}const $l='{"title":"Tokenizarea Unigram","local":"unigram-tokenization","sections":[{"title":"Algoritm de antrenare","local":"training-algorithm","sections":[],"depth":2},{"title":"Algoritm de tokenizare","local":"tokenization-algorithm","sections":[],"depth":2},{"title":"Înapoi la antrenare","local":"back-to-training","sections":[],"depth":2},{"title":"Implementarea Unigram","local":"implementarea-unigram","sections":[],"depth":2}],"depth":1}';function ql(w){return dl(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Ql extends Ul{constructor(c){super(),hl(this,c,ql,kl,jl,{})}}export{Ql as component};

Xet Storage Details

Size:
79.3 kB
·
Xet hash:
e5f1efbe0c263bd0a67c6568f54bcc87ad0236469b688a164250b45634c52621

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.