Buckets:
| import{s as ja,o as Ja,n as Ys}from"../chunks/scheduler.37c15a92.js";import{S as Ua,i as ma,g as c,s as e,r as M,A as Ta,h as j,f as n,c as t,j as oa,u as i,x as U,k as ca,y as ha,a,v as r,d as u,t as y,w as o}from"../chunks/index.2bf4358c.js";import{T as Rs}from"../chunks/Tip.363c041f.js";import{Y as wa}from"../chunks/Youtube.1e50a667.js";import{C as m}from"../chunks/CodeBlock.4e987730.js";import{C as da}from"../chunks/CourseFloatingBanner.6add7356.js";import{H as gn,E as Ca}from"../chunks/getInferenceSnippets.ebf8be91.js";function xa(w){let p,T="💡 Cette section couvre le BPE en profondeur, allant jusqu’à montrer une implémentation complète. Vous pouvez passer directement à la fin si vous souhaitez simplement avoir un aperçu général de l’algorithme de tokenisation.";return{c(){p=c("p"),p.textContent=T},l(J){p=j(J,"P",{"data-svelte-h":!0}),U(p)!=="svelte-n0frxx"&&(p.textContent=T)},m(J,h){a(J,p,h)},p:Ys,d(J){J&&n(p)}}}function ga(w){let p,T="Les <em>tokenizers</em> du GPT-2 et de RoBERTa (qui sont assez similaires) ont une façon intelligente de gérer ce problème : ils ne considèrent pas les mots comme étant écrits avec des caractères Unicode mais avec des octets. De cette façon, le vocabulaire de base a une petite taille (256) et tous les caractères auxquels vous pouvez penser seront inclus dedans et ne finiront pas par être convertis en un <em>token</em> inconnu. 
Cette astuce est appelée <em>byte-level BPE</em>.";return{c(){p=c("p"),p.innerHTML=T},l(J){p=j(J,"P",{"data-svelte-h":!0}),U(p)!=="svelte-1ytsk3t"&&(p.innerHTML=T)},m(J,h){a(J,p,h)},p:Ys,d(J){J&&n(p)}}}function Ia(w){let p,T="✏️ <strong>A votre tour !</strong> A votre avis, quelle sera la prochaine règle de fusion ?";return{c(){p=c("p"),p.innerHTML=T},l(J){p=j(J,"P",{"data-svelte-h":!0}),U(p)!=="svelte-1nzfw18"&&(p.innerHTML=T)},m(J,h){a(J,p,h)},p:Ys,d(J){J&&n(p)}}}function fa(w){let p,T="✏️ <strong>A votre tour !</strong> Comment pensez-vous que le mot « unhug » (détacher en français) sera tokenisé ?";return{c(){p=c("p"),p.innerHTML=T},l(J){p=j(J,"P",{"data-svelte-h":!0}),U(p)!=="svelte-g2m4o1"&&(p.innerHTML=T)},m(J,h){a(J,p,h)},p:Ys,d(J){J&&n(p)}}}function ba(w){let p,T="💡 Utiliser <code>train_new_from_iterator()</code> sur le même corpus ne donnera pas exactement le même vocabulaire. C’est parce que lorsqu’il y a un choix de la paire la plus fréquente, nous avons sélectionné la première rencontrée, alors que la bibliothèque 🤗 <em>Tokenizers</em> sélectionne la première en fonction de ses identifiants internes.";return{c(){p=c("p"),p.innerHTML=T},l(J){p=j(J,"P",{"data-svelte-h":!0}),U(p)!=="svelte-83jgtn"&&(p.innerHTML=T)},m(J,h){a(J,p,h)},p:Ys,d(J){J&&n(p)}}}function qa(w){let p,T="⚠️ Notre implémentation lancera une erreur s’il y a un caractère inconnu puisque nous n’avons rien fait pour les gérer. GPT-2 n’a pas réellement de <i>token</i> inconnu (il est impossible d’obtenir un caractère inconnu en utilisant le BPE au niveau de l’octet) mais cela pourrait arriver ici car nous n’avons pas inclus tous les octets possibles dans le vocabulaire initial. 
Cet aspect du BPE dépasse le cadre de cette section, nous avons donc laissé ces détails de côté.";return{c(){p=c("p"),p.innerHTML=T},l(J){p=j(J,"P",{"data-svelte-h":!0}),U(p)!=="svelte-9fj1z7"&&(p.innerHTML=T)},m(J,h){a(J,p,h)},p:Ys,d(J){J&&n(p)}}}function $a(w){let p,T,J,h,b,Ls,q,Fs,$,In="Le <em>Byte-Pair Encoding</em> (BPE) a été initialement développé en tant qu’algorithme de compression de textes puis utilisé par OpenAI pour la tokenisation du pré-entraînement du modèle GPT. Il est utilisé par de nombreux <em>transformers</em> dont GPT, GPT-2, RoBERTa, BART et DeBERTa.",Ks,k,Os,d,sl,Q,ll,A,fn="L’entraînement du BPE commence par le calcul de l’unique ensemble de mots utilisés dans le corpus (après les étapes de normalisation et de prétokénisation), puis la construction du vocabulaire en prenant tous les symboles utilisés pour écrire ces mots. A titre d’exemple, disons que notre corpus utilise ces cinq mots :",nl,B,al,v,bn="Le vocabulaire de base sera alors <code>["b", "g", "h", "n", "p", "s", "u"]</code>. Dans le monde réel, le vocabulaire de base contient au moins tous les caractères ASCII et probablement aussi quelques caractères Unicode. Si un exemple que vous tokenisez utilise un caractère qui n’est pas dans le corpus d’entraînement, ce caractère est converti en <em>token</em> inconnu. C’est l’une des raisons pour lesquelles de nombreux modèles de NLP sont par exemple très mauvais dans l’analyse de contenus contenant des emojis.",el,C,tl,V,qn="Après avoir obtenu ce vocabulaire de base, nous ajoutons de nouveaux <em>tokens</em> jusqu’à ce que la taille souhaitée du vocabulaire soit atteinte en apprenant les fusions qui sont des règles permettant de fusionner deux éléments du vocabulaire existant pour en créer un nouveau. 
Ainsi, au début, ces fusions créeront des <em>tokens</em> de deux caractères, puis au fur et à mesure de l’entraînement, des sous-mots plus longs.",pl,E,$n="À chaque étape de l’entraînement du <em>tokenizer</em>, l’algorithme BPE recherche la paire la plus fréquente de <em>tokens</em> existants (par « paire », nous entendons ici deux <em>tokens</em> consécutifs dans un mot). Cette paire la plus fréquente est celle qui sera fusionnée. Nous rinçons et répétons pour l’étape suivante.",Ml,z,kn="Pour revenir à notre exemple précédent, supposons que les mots ont les fréquences suivantes :",il,Z,rl,G,Qn="ce qui veut dire que <code>"hug"</code> était présent 10 fois dans le corpus, <code>"pug"</code> 5 fois, <code>"pun"</code> 12 fois, <code>"bun"</code> 4 fois et <code>"hugs"</code>” 5 fois. Nous commençons l’entraînement en divisant chaque mot en caractères (ceux qui forment notre vocabulaire initial) afin de voir chaque mot comme une liste de <em>tokens</em> :",ul,S,yl,N,An="Ensuite, nous regardons les paires. La paire <code>("h", "u")</code> est présente dans les mots <code>"hug"</code> et <code>"hugs"</code>, donc 15 fois au total dans le corpus. Ce n’est cependant pas la paire la plus fréquente. Cet honneur revient à <code>("u", "g")</code> qui est présent dans <code>"hug"</code>, <code>"pug"</code>, et <code>"hugs"</code>, pour un total de 20 fois dans le vocabulaire.",ol,_,Bn="Ainsi, la première règle de fusion apprise par le <em>tokenizer</em> est <code>("u", "g") -> "ug"</code>, ce qui signifie que <code>"ug"</code> est ajouté au vocabulaire et que la paire doit être fusionnée dans tous les mots du corpus. A la fin de cette étape, le vocabulaire et le corpus ressemblent à ceci :",cl,H,jl,X,vn="Nous avons maintenant quelques paires qui aboutissent à un <em>token</em> de plus de deux caractères. Par exemple la paire <code>("h", "ug")</code> présente 15 fois dans le corpus. 
La paire la plus fréquente à ce stade est <code>("u", "n")</code>, présente 16 fois dans le corpus, donc la deuxième règle de fusion apprise est <code>("u", "n") -> "un"</code>. En ajoutant cela au vocabulaire et en fusionnant toutes les occurrences existantes, nous obtenons :",Jl,D,Ul,W,Vn="Maintenant la paire la plus fréquente est <code>("h", "ug")</code> donc nous apprenons la règle de fusion <code>("h", "ug") -> "hug"</code>. Cela nous donne donc notre premier <em>token</em> de trois lettres. Après la fusion, le corpus ressemble à ceci :",ml,R,Tl,Y,En="Et nous continuons ainsi jusqu’à ce que nous atteignions la taille de vocabulaire souhaitée.",hl,x,wl,P,dl,L,zn="La tokenisation suit de près le processus d’entraînement, dans le sens où les nouvelles entrées sont tokenisées en appliquant les étapes suivantes :",Cl,F,Zn="<li>Normalisation</li> <li>Prétokénisation</li> <li>Découpage des mots en caractères individuels</li> <li>Application des règles de fusion apprises dans l’ordre sur ces divisions.</li>",xl,K,Gn="Prenons l’exemple que nous avons utilisé pendant l’entraînement, avec les trois règles de fusion apprises :",gl,O,Il,ss,Sn="Le mot « bug » sera traduit par « [“b”, “ug”] ». Par contre, le mot « mug » (tasse en français) sera traduit par « [”[UNK]”, “ug”] » puisque la lettre « m » ne fait pas partie du vocabulaire de base. De la même façon, le mot « thug » (voyou en français) sera tokenisé en « [”[UNK]”, “hug”] » car la lettre « t » n’est pas dans le vocabulaire de base et l’application des règles de fusion résulte d’abord en la fusion de « u » et « g » et ensuite en la fusion de « hu » et « g ».",fl,g,bl,ls,ql,ns,Nn="Voyons maintenant une implémentation de l’algorithme BPE. Il ne s’agira pas d’une version optimisée que vous pourrez utiliser sur un grand corpus. 
Nous voulons simplement vous montrer le code afin que vous puissiez comprendre un peu mieux l’algorithme.",$l,as,_n="Tout d’abord, nous avons besoin d’un corpus, alors créons un corpus simple avec quelques phrases :",kl,es,Ql,ts,Hn="Ensuite, nous devons prétokeniser ce corpus en mots. Puisque nous répliquons un <em>tokenizer</em> BPE (comme celui du GPT-2), nous utiliserons le <em>tokenizer</em> <code>gpt2</code> pour la prétokénisation :",Al,ps,Bl,Ms,Xn="Ensuite, nous calculons les fréquences de chaque mot dans le corpus comme nous le faisons pour la prétokénisation :",vl,is,Vl,rs,El,us,Dn="L’étape suivante consiste à calculer le vocabulaire de base, formé par tous les caractères utilisés dans le corpus :",zl,ys,Zl,os,Gl,cs,Wn="Nous ajoutons également les <em>tokens</em> spéciaux utilisés par le modèle au début de ce vocabulaire. Dans le cas du GPT-2, le seul <em>token</em> spécial est <code>"<|endoftext|>"</code> :",Sl,js,Nl,Js,Rn="Nous devons maintenant diviser chaque mot en caractères individuels pour pouvoir commencer l’entraînement :",_l,Us,Hl,ms,Yn="Maintenant que nous sommes prêts pour l’entraînement, écrivons une fonction qui calcule la fréquence de chaque paire. Nous devrons l’utiliser à chaque étape de l’entraînement :",Xl,Ts,Dl,hs,Pn="Jetons un coup d’œil à une partie de ce dictionnaire après les premières divisions :",Wl,ws,Rl,ds,Yl,Cs,Ln="Maintenant, trouver la paire la plus fréquente ne demande qu’une rapide boucle :",Pl,xs,Ll,gs,Fl,Is,Fn="Donc la première fusion à apprendre est <code>('Ġ', 't') -> 'Ġt'</code>, et on ajoute <code>'Ġt'</code> au vocabulaire :",Kl,fs,Ol,bs,Kn="Pour continuer, nous devons appliquer cette fusion dans notre dictionnaire <code>splits</code>. Écrivons une autre fonction pour cela :",sn,qs,ln,$s,On="Et nous pouvons regarder le résultat de la première fusion :",nn,ks,an,Qs,en,As,sa="Maintenant, nous avons tout ce dont nous avons besoin pour boucler jusqu’à ce que nous ayons appris toutes les fusions que nous voulons. 
Visons une taille de vocabulaire de 50 :",tn,Bs,pn,vs,la="En conséquence, nous avons appris 19 règles de fusion (le vocabulaire initial avait une taille de 31 : 30 caractères dans l’alphabet plus le <em>token</em> spécial) :",Mn,Vs,rn,Es,un,zs,na="Et le vocabulaire est composé du <em>token</em> spécial, de l’alphabet initial, et de tous les résultats des fusions :",yn,Zs,on,Gs,cn,I,jn,Ss,aa="Pour tokeniser un nouveau texte, on le prétokenise, on le divise, puis on applique toutes les règles de fusion apprises :",Jn,Ns,Un,_s,ea="Nous pouvons essayer cela sur n’importe quel texte composé de caractères de l’alphabet :",mn,Hs,Tn,Xs,hn,f,wn,Ds,ta="C’est tout pour l’algorithme BPE ! Nous allons nous intéresser à WordPiece dans la suite.",dn,Ws,Cn,Ps,xn;return b=new gn({props:{title:"Tokénisation <i> Byte-Pair Encoding </i>",local:"tokénisation-i-byte-pair-encoding-i",headingTag:"h1"}}),q=new da({props:{chapter:6,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"English",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter6/section5.ipynb"},{label:"Français",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/fr/chapter6/section5.ipynb"},{label:"English",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter6/section5.ipynb"},{label:"Français",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/fr/chapter6/section5.ipynb"}]}}),k=new wa({props:{id:"HEikzVL-lZU"}}),d=new Rs({props:{$$slots:{default:[xa]},$$scope:{ctx:w}}}),Q=new gn({props:{title:"Algorithme d’entraînement",local:"algorithme-dentraînement",headingTag:"h2"}}),B=new 
m({props:{code:"JTIyaHVnJTIyJTJDJTIwJTIycHVnJTIyJTJDJTIwJTIycHVuJTIyJTJDJTIwJTIyYnVuJTIyJTJDJTIwJTIyaHVncyUyMiUyMCUyMyUyMCUyMmMlQzMlQTJsaW4lMjIlMkMlMjAlMjJjYXJsaW4lMjIlMkMlMjAlMjJqZXUlMjBkZSUyMG1vdHMlMjIlMkMlMjAlMjJicmlvY2hlJTIyJTJDJTIwJTIyYyVDMyVBMmxpbnMlMjI=",highlighted:'<span class="hljs-string">"hug"</span>, <span class="hljs-string">"pug"</span>, <span class="hljs-string">"pun"</span>, <span class="hljs-string">"bun"</span>, <span class="hljs-string">"hugs"</span> <span class="hljs-meta"># <span class="hljs-string">"câlin"</span>, <span class="hljs-string">"carlin"</span>, <span class="hljs-string">"jeu de mots"</span>, <span class="hljs-string">"brioche"</span>, <span class="hljs-string">"câlins"</span></span>',wrap:!1}}),C=new Rs({props:{$$slots:{default:[ga]},$$scope:{ctx:w}}}),Z=new m({props:{code:"KCUyMmh1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwdWclMjIlMkMlMjA1KSUyQyUyMCglMjJwdW4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYnVuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHVncyUyMiUyQyUyMDUp",highlighted:'(<span class="hljs-string">"hug"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"pug"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"pun"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"bun"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"hugs"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)',wrap:!1}}),S=new 
m({props:{code:"KCUyMmglMjIlMjAlMjJ1JTIyJTIwJTIyZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwJTIyJTIwJTIydSUyMiUyMCUyMmclMjIlMkMlMjA1KSUyQyUyMCglMjJwJTIyJTIwJTIydSUyMiUyMCUyMm4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYiUyMiUyMCUyMnUlMjIlMjAlMjJuJTIyJTJDJTIwNCklMkMlMjAoJTIyaCUyMiUyMCUyMnUlMjIlMjAlMjJnJTIyJTIwJTIycyUyMiUyQyUyMDUp",highlighted:'(<span class="hljs-string">"h"</span> <span class="hljs-string">"u"</span> <span class="hljs-string">"g"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">10</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"p"</span> <span class="hljs-string">"u"</span> <span class="hljs-string">"g"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"p"</span> <span class="hljs-string">"u"</span> <span class="hljs-string">"n"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">12</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"b"</span> <span class="hljs-string">"u"</span> <span class="hljs-string">"n"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">4</span>)<span class="hljs-punctuation">,</span> (<span class="hljs-string">"h"</span> <span class="hljs-string">"u"</span> <span class="hljs-string">"g"</span> <span class="hljs-string">"s"</span><span class="hljs-punctuation">,</span> <span class="hljs-number">5</span>)',wrap:!1}}),H=new m({props:{code:"Vm9jYWJ1bGFyeSUzQSUyMCU1QiUyMmIlMjIlMkMlMjAlMjJnJTIyJTJDJTIwJTIyaCUyMiUyQyUyMCUyMm4lMjIlMkMlMjAlMjJwJTIyJTJDJTIwJTIycyUyMiUyQyUyMCUyMnUlMjIlMkMlMjAlMjJ1ZyUyMiU1RCUwQUNvcnB1cyUzQSUyMCglMjJoJTIyJTIwJTIydWclMjIlMkMlMjAxMCklMkMlMjAoJTIycCUyMiUyMCUyMnVnJTIyJTJDJTIwNSklMkMlMjAoJTIycCUyMiUyMCUyMnUlMjIlMjAlMjJuJTIyJTJDJTIwMTIpJTJDJTIwKCUyMmIlMjIlMjAlMjJ1JTIyJTIwJTIybiUyMiUyQyUyMDQpJTJDJTIwKCUyMmglMjIlMjAlMjJ1ZyUyMiUyMCUyMnMlMjIlMkMlMjA1KQ==",highlighted:`<span class="hljs-symbol">Vocabulary:</span> [<span 
class="hljs-string">"b"</span>, <span class="hljs-string">"g"</span>, <span class="hljs-string">"h"</span>, <span class="hljs-string">"n"</span>, <span class="hljs-string">"p"</span>, <span class="hljs-string">"s"</span>, <span class="hljs-string">"u"</span>, <span class="hljs-string">"ug"</span>] | |
| <span class="hljs-symbol">Corpus:</span> (<span class="hljs-string">"h"</span> <span class="hljs-string">"ug"</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"ug"</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"u"</span> <span class="hljs-string">"n"</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">"b"</span> <span class="hljs-string">"u"</span> <span class="hljs-string">"n"</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">"h"</span> <span class="hljs-string">"ug"</span> <span class="hljs-string">"s"</span>, <span class="hljs-number">5</span>)`,wrap:!1}}),D=new m({props:{code:"Vm9jYWJ1bGFyeSUzQSUyMCU1QiUyMmIlMjIlMkMlMjAlMjJnJTIyJTJDJTIwJTIyaCUyMiUyQyUyMCUyMm4lMjIlMkMlMjAlMjJwJTIyJTJDJTIwJTIycyUyMiUyQyUyMCUyMnUlMjIlMkMlMjAlMjJ1ZyUyMiUyQyUyMCUyMnVuJTIyJTVEJTBBQ29ycHVzJTNBJTIwKCUyMmglMjIlMjAlMjJ1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwJTIyJTIwJTIydWclMjIlMkMlMjA1KSUyQyUyMCglMjJwJTIyJTIwJTIydW4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYiUyMiUyMCUyMnVuJTIyJTJDJTIwNCklMkMlMjAoJTIyaCUyMiUyMCUyMnVnJTIyJTIwJTIycyUyMiUyQyUyMDUp",highlighted:`<span class="hljs-symbol">Vocabulary:</span> [<span class="hljs-string">"b"</span>, <span class="hljs-string">"g"</span>, <span class="hljs-string">"h"</span>, <span class="hljs-string">"n"</span>, <span class="hljs-string">"p"</span>, <span class="hljs-string">"s"</span>, <span class="hljs-string">"u"</span>, <span class="hljs-string">"ug"</span>, <span class="hljs-string">"un"</span>] | |
| <span class="hljs-symbol">Corpus:</span> (<span class="hljs-string">"h"</span> <span class="hljs-string">"ug"</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"ug"</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"un"</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">"b"</span> <span class="hljs-string">"un"</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">"h"</span> <span class="hljs-string">"ug"</span> <span class="hljs-string">"s"</span>, <span class="hljs-number">5</span>)`,wrap:!1}}),R=new m({props:{code:"Vm9jYWJ1bGFyeSUzQSUyMCU1QiUyMmIlMjIlMkMlMjAlMjJnJTIyJTJDJTIwJTIyaCUyMiUyQyUyMCUyMm4lMjIlMkMlMjAlMjJwJTIyJTJDJTIwJTIycyUyMiUyQyUyMCUyMnUlMjIlMkMlMjAlMjJ1ZyUyMiUyQyUyMCUyMnVuJTIyJTJDJTIwJTIyaHVnJTIyJTVEJTBBQ29ycHVzJTNBJTIwKCUyMmh1ZyUyMiUyQyUyMDEwKSUyQyUyMCglMjJwJTIyJTIwJTIydWclMjIlMkMlMjA1KSUyQyUyMCglMjJwJTIyJTIwJTIydW4lMjIlMkMlMjAxMiklMkMlMjAoJTIyYiUyMiUyMCUyMnVuJTIyJTJDJTIwNCklMkMlMjAoJTIyaHVnJTIyJTIwJTIycyUyMiUyQyUyMDUp",highlighted:`<span class="hljs-symbol">Vocabulary:</span> [<span class="hljs-string">"b"</span>, <span class="hljs-string">"g"</span>, <span class="hljs-string">"h"</span>, <span class="hljs-string">"n"</span>, <span class="hljs-string">"p"</span>, <span class="hljs-string">"s"</span>, <span class="hljs-string">"u"</span>, <span class="hljs-string">"ug"</span>, <span class="hljs-string">"un"</span>, <span class="hljs-string">"hug"</span>] | |
| <span class="hljs-symbol">Corpus:</span> (<span class="hljs-string">"hug"</span>, <span class="hljs-number">10</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"ug"</span>, <span class="hljs-number">5</span>), (<span class="hljs-string">"p"</span> <span class="hljs-string">"un"</span>, <span class="hljs-number">12</span>), (<span class="hljs-string">"b"</span> <span class="hljs-string">"un"</span>, <span class="hljs-number">4</span>), (<span class="hljs-string">"hug"</span> <span class="hljs-string">"s"</span>, <span class="hljs-number">5</span>)`,wrap:!1}}),x=new Rs({props:{$$slots:{default:[Ia]},$$scope:{ctx:w}}}),P=new gn({props:{title:"Algorithme de tokenisation",local:"algorithme-de-tokenisation",headingTag:"h2"}}),O=new m({props:{code:"KCUyMnUlMjIlMkMlMjAlMjJnJTIyKSUyMC0lM0UlMjAlMjJ1ZyUyMiUwQSglMjJ1JTIyJTJDJTIwJTIybiUyMiklMjAtJTNFJTIwJTIydW4lMjIlMEEoJTIyaCUyMiUyQyUyMCUyMnVnJTIyKSUyMC0lM0UlMjAlMjJodWclMjI=",highlighted:`<span class="hljs-function"><span class="hljs-params">(<span class="hljs-string">"u"</span>, <span class="hljs-string">"g"</span>)</span> -></span> <span class="hljs-string">"ug"</span> | |
| <span class="hljs-function"><span class="hljs-params">(<span class="hljs-string">"u"</span>, <span class="hljs-string">"n"</span>)</span> -></span> <span class="hljs-string">"un"</span> | |
| <span class="hljs-function"><span class="hljs-params">(<span class="hljs-string">"h"</span>, <span class="hljs-string">"ug"</span>)</span> -></span> <span class="hljs-string">"hug"</span>`,wrap:!1}}),g=new Rs({props:{$$slots:{default:[fa]},$$scope:{ctx:w}}}),ls=new gn({props:{title:"Implémentation du BPE",local:"implémentation-du-bpe",headingTag:"h2"}}),es=new m({props:{code:"Y29ycHVzJTIwJTNEJTIwJTVCJTBBJTIwJTIwJTIwJTIwJTIyVGhpcyUyMGlzJTIwdGhlJTIwSHVnZ2luZyUyMEZhY2UlMjBDb3Vyc2UuJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIzJTIwQydlc3QlMjBsZSUyMGNvdXJzJTIwZCdIdWdnaW5nJTIwRmFjZS4lMEElMjAlMjAlMjAlMjAlMjJUaGlzJTIwY2hhcHRlciUyMGlzJTIwYWJvdXQlMjB0b2tlbml6YXRpb24uJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIzJTIwQ2UlMjBjaGFwaXRyZSUyMHRyYWl0ZSUyMGRlJTIwbGElMjB0b2tlbmlzYXRpb24uJTBBJTIwJTIwJTIwJTIwJTIyVGhpcyUyMHNlY3Rpb24lMjBzaG93cyUyMHNldmVyYWwlMjB0b2tlbml6ZXIlMjBhbGdvcml0aG1zLiUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMyUyMENldHRlJTIwc2VjdGlvbiUyMHByJUMzJUE5c2VudGUlMjBwbHVzaWV1cnMlMjBhbGdvcml0aG1lcyUyMGRlJTIwdG9rZW5pemVyLiUwQSUyMCUyMCUyMCUyMCUyMkhvcGVmdWxseSUyQyUyMHlvdSUyMHdpbGwlMjBiZSUyMGFibGUlMjB0byUyMHVuZGVyc3RhbmQlMjBob3clMjB0aGV5JTIwYXJlJTIwdHJhaW5lZCUyMGFuZCUyMGdlbmVyYXRlJTIwdG9rZW5zLiUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMyUyMEF2ZWMlMjB1biUyMHBldSUyMGRlJTIwY2hhbmNlJTJDJTIwdm91cyUyMHNlcmV6JTIwZW4lMjBtZXN1cmUlMjBkZSUyMGNvbXByZW5kcmUlMjBjb21tZW50JTIwaWxzJTIwc29udCUyMGVudHJhJUMzJUFFbiVDMyVBOXMlMjBldCUyMGclQzMlQTluJUMzJUE4cmVudCUyMGRlcyUyMHRva2Vucy4lMEElNUQ=",highlighted:`corpus = [ | |
| <span class="hljs-string">"This is the Hugging Face Course."</span>, | |
| <span class="hljs-comment"># C'est le cours d'Hugging Face.</span> | |
| <span class="hljs-string">"This chapter is about tokenization."</span>, | |
| <span class="hljs-comment"># Ce chapitre traite de la tokenisation.</span> | |
| <span class="hljs-string">"This section shows several tokenizer algorithms."</span>, | |
| <span class="hljs-comment"># Cette section présente plusieurs algorithmes de tokenizer.</span> | |
| <span class="hljs-string">"Hopefully, you will be able to understand how they are trained and generate tokens."</span>, | |
| <span class="hljs-comment"># Avec un peu de chance, vous serez en mesure de comprendre comment ils sont entraînés et génèrent des tokens.</span> | |
| ]`,wrap:!1}}),ps=new m({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZCglMjJncHQyJTIyKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"gpt2"</span>)`,wrap:!1}}),is=new m({props:{code:"ZnJvbSUyMGNvbGxlY3Rpb25zJTIwaW1wb3J0JTIwZGVmYXVsdGRpY3QlMEElMEF3b3JkX2ZyZXFzJTIwJTNEJTIwZGVmYXVsdGRpY3QoaW50KSUwQSUwQWZvciUyMHRleHQlMjBpbiUyMGNvcnB1cyUzQSUwQSUyMCUyMCUyMCUyMHdvcmRzX3dpdGhfb2Zmc2V0cyUyMCUzRCUyMHRva2VuaXplci5iYWNrZW5kX3Rva2VuaXplci5wcmVfdG9rZW5pemVyLnByZV90b2tlbml6ZV9zdHIodGV4dCklMEElMjAlMjAlMjAlMjBuZXdfd29yZHMlMjAlM0QlMjAlNUJ3b3JkJTIwZm9yJTIwd29yZCUyQyUyMG9mZnNldCUyMGluJTIwd29yZHNfd2l0aF9vZmZzZXRzJTVEJTBBJTIwJTIwJTIwJTIwZm9yJTIwd29yZCUyMGluJTIwbmV3X3dvcmRzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwd29yZF9mcmVxcyU1QndvcmQlNUQlMjAlMkIlM0QlMjAxJTBBJTBBcHJpbnQod29yZF9mcmVxcyk=",highlighted:`<span class="hljs-keyword">from</span> collections <span class="hljs-keyword">import</span> defaultdict | |
| word_freqs = defaultdict(<span class="hljs-built_in">int</span>) | |
| <span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> corpus: | |
| words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text) | |
| new_words = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> words_with_offsets] | |
| <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> new_words: | |
| word_freqs[word] += <span class="hljs-number">1</span> | |
| <span class="hljs-built_in">print</span>(word_freqs)`,wrap:!1}}),rs=new m({props:{code:"ZGVmYXVsdGRpY3QoaW50JTJDJTIwJTdCJ1RoaXMnJTNBJTIwMyUyQyUyMCclQzQlQTBpcyclM0ElMjAyJTJDJTIwJyVDNCVBMHRoZSclM0ElMjAxJTJDJTIwJyVDNCVBMEh1Z2dpbmcnJTNBJTIwMSUyQyUyMCclQzQlQTBGYWNlJyUzQSUyMDElMkMlMjAnJUM0JUEwQ291cnNlJyUzQSUyMDElMkMlMjAnLiclM0ElMjA0JTJDJTIwJyVDNCVBMGNoYXB0ZXInJTNBJTIwMSUyQyUwQSUyMCUyMCUyMCUyMCclQzQlQTBhYm91dCclM0ElMjAxJTJDJTIwJyVDNCVBMHRva2VuaXphdGlvbiclM0ElMjAxJTJDJTIwJyVDNCVBMHNlY3Rpb24nJTNBJTIwMSUyQyUyMCclQzQlQTBzaG93cyclM0ElMjAxJTJDJTIwJyVDNCVBMHNldmVyYWwnJTNBJTIwMSUyQyUyMCclQzQlQTB0b2tlbml6ZXInJTNBJTIwMSUyQyUyMCclQzQlQTBhbGdvcml0aG1zJyUzQSUyMDElMkMlMEElMjAlMjAlMjAlMjAnSG9wZWZ1bGx5JyUzQSUyMDElMkMlMjAnJTJDJyUzQSUyMDElMkMlMjAnJUM0JUEweW91JyUzQSUyMDElMkMlMjAnJUM0JUEwd2lsbCclM0ElMjAxJTJDJTIwJyVDNCVBMGJlJyUzQSUyMDElMkMlMjAnJUM0JUEwYWJsZSclM0ElMjAxJTJDJTIwJyVDNCVBMHRvJyUzQSUyMDElMkMlMjAnJUM0JUEwdW5kZXJzdGFuZCclM0ElMjAxJTJDJTIwJyVDNCVBMGhvdyclM0ElMjAxJTJDJTBBJTIwJTIwJTIwJTIwJyVDNCVBMHRoZXknJTNBJTIwMSUyQyUyMCclQzQlQTBhcmUnJTNBJTIwMSUyQyUyMCclQzQlQTB0cmFpbmVkJyUzQSUyMDElMkMlMjAnJUM0JUEwYW5kJyUzQSUyMDElMkMlMjAnJUM0JUEwZ2VuZXJhdGUnJTNBJTIwMSUyQyUyMCclQzQlQTB0b2tlbnMnJTNBJTIwMSU3RCk=",highlighted:`defaultdict(<span class="hljs-built_in">int</span>, {<span class="hljs-string">'This'</span>: <span class="hljs-number">3</span>, <span class="hljs-string">'Ġis'</span>: <span class="hljs-number">2</span>, <span class="hljs-string">'Ġthe'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'ĠHugging'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'ĠFace'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'ĠCourse'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'.'</span>: <span class="hljs-number">4</span>, <span class="hljs-string">'Ġchapter'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">'Ġabout'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġtokenization'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġsection'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġshows'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġseveral'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġtokenizer'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġalgorithms'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">'Hopefully'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">','</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġyou'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġwill'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġbe'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġable'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġto'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġunderstand'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġhow'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">'Ġthey'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġare'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġtrained'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġand'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġgenerate'</span>: <span class="hljs-number">1</span>, <span class="hljs-string">'Ġtokens'</span>: <span class="hljs-number">1</span>})`,wrap:!1}}),ys=new m({props:{code:"YWxwaGFiZXQlMjAlM0QlMjAlNUIlNUQlMEElMEFmb3IlMjB3b3JkJTIwaW4lMjB3b3JkX2ZyZXFzLmtleXMoKSUzQSUwQSUyMCUyMCUyMCUyMGZvciUyMGxldHRlciUyMGluJTIwd29yZCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwbGV0dGVyJTIwbm90JTIwaW4lMjBhbHBoYWJldCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGFscGhhYmV0LmFwcGVuZChsZXR0ZXIpJTBBYWxwaGFiZXQuc29ydCgpJTBBJTBBcHJpbnQoYWxwaGFiZXQp",highlighted:`alphabet = [] | |
| <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys(): | |
| <span class="hljs-keyword">for</span> letter <span class="hljs-keyword">in</span> word: | |
| <span class="hljs-keyword">if</span> letter <span class="hljs-keyword">not</span> <span class="hljs-keyword">in</span> alphabet: | |
| alphabet.append(letter) | |
| alphabet.sort() | |
| <span class="hljs-built_in">print</span>(alphabet)`,wrap:!1}}),os=new m({props:{code:"JTVCJTIwJyUyQyclMkMlMjAnLiclMkMlMjAnQyclMkMlMjAnRiclMkMlMjAnSCclMkMlMjAnVCclMkMlMjAnYSclMkMlMjAnYiclMkMlMjAnYyclMkMlMjAnZCclMkMlMjAnZSclMkMlMjAnZiclMkMlMjAnZyclMkMlMjAnaCclMkMlMjAnaSclMkMlMjAnayclMkMlMjAnbCclMkMlMjAnbSclMkMlMjAnbiclMkMlMjAnbyclMkMlMjAncCclMkMlMjAnciclMkMlMjAncyclMkMlMEElMjAlMjAndCclMkMlMjAndSclMkMlMjAndiclMkMlMjAndyclMkMlMjAneSclMkMlMjAneiclMkMlMjAnJUM0JUEwJyU1RA==",highlighted:`[ <span class="hljs-string">','</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'C'</span>, <span class="hljs-string">'F'</span>, <span class="hljs-string">'H'</span>, <span class="hljs-string">'T'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'b'</span>, <span class="hljs-string">'c'</span>, <span class="hljs-string">'d'</span>, <span class="hljs-string">'e'</span>, <span class="hljs-string">'f'</span>, <span class="hljs-string">'g'</span>, <span class="hljs-string">'h'</span>, <span class="hljs-string">'i'</span>, <span class="hljs-string">'k'</span>, <span class="hljs-string">'l'</span>, <span class="hljs-string">'m'</span>, <span class="hljs-string">'n'</span>, <span class="hljs-string">'o'</span>, <span class="hljs-string">'p'</span>, <span class="hljs-string">'r'</span>, <span class="hljs-string">'s'</span>, | |
| <span class="hljs-string">'t'</span>, <span class="hljs-string">'u'</span>, <span class="hljs-string">'v'</span>, <span class="hljs-string">'w'</span>, <span class="hljs-string">'y'</span>, <span class="hljs-string">'z'</span>, <span class="hljs-string">'Ġ'</span>]`,wrap:!1}}),js=new m({props:{code:"dm9jYWIlMjAlM0QlMjAlNUIlMjIlM0MlN0NlbmRvZnRleHQlN0MlM0UlMjIlNUQlMjAlMkIlMjBhbHBoYWJldC5jb3B5KCk=",highlighted:'vocab = [<span class="hljs-string">"<|endoftext|>"</span>] + alphabet.copy()',wrap:!1}}),Us=new m({props:{code:"c3BsaXRzJTIwJTNEJTIwJTdCd29yZCUzQSUyMCU1QmMlMjBmb3IlMjBjJTIwaW4lMjB3b3JkJTVEJTIwZm9yJTIwd29yZCUyMGluJTIwd29yZF9mcmVxcy5rZXlzKCklN0Q=",highlighted:'splits = {word: [c <span class="hljs-keyword">for</span> c <span class="hljs-keyword">in</span> word] <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs.keys()}',wrap:!1}}),Ts=new m({props:{code:"ZGVmJTIwY29tcHV0ZV9wYWlyX2ZyZXFzKHNwbGl0cyklM0ElMEElMjAlMjAlMjAlMjBwYWlyX2ZyZXFzJTIwJTNEJTIwZGVmYXVsdGRpY3QoaW50KSUwQSUyMCUyMCUyMCUyMGZvciUyMHdvcmQlMkMlMjBmcmVxJTIwaW4lMjB3b3JkX2ZyZXFzLml0ZW1zKCklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzcGxpdCUyMCUzRCUyMHNwbGl0cyU1QndvcmQlNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMGxlbihzcGxpdCklMjAlM0QlM0QlMjAxJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwY29udGludWUlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBmb3IlMjBpJTIwaW4lMjByYW5nZShsZW4oc3BsaXQpJTIwLSUyMDEpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcGFpciUyMCUzRCUyMChzcGxpdCU1QmklNUQlMkMlMjBzcGxpdCU1QmklMjAlMkIlMjAxJTVEKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHBhaXJfZnJlcXMlNUJwYWlyJTVEJTIwJTJCJTNEJTIwZnJlcSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMHBhaXJfZnJlcXM=",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_pair_freqs</span>(<span class="hljs-params">splits</span>): | |
| pair_freqs = defaultdict(<span class="hljs-built_in">int</span>) | |
| <span class="hljs-keyword">for</span> word, freq <span class="hljs-keyword">in</span> word_freqs.items(): | |
| split = splits[word] | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>: | |
| <span class="hljs-keyword">continue</span> | |
| <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>): | |
| pair = (split[i], split[i + <span class="hljs-number">1</span>]) | |
| pair_freqs[pair] += freq | |
| <span class="hljs-keyword">return</span> pair_freqs`,wrap:!1}}),ws=new m({props:{code:"cGFpcl9mcmVxcyUyMCUzRCUyMGNvbXB1dGVfcGFpcl9mcmVxcyhzcGxpdHMpJTBBJTBBZm9yJTIwaSUyQyUyMGtleSUyMGluJTIwZW51bWVyYXRlKHBhaXJfZnJlcXMua2V5cygpKSUzQSUwQSUyMCUyMCUyMCUyMHByaW50KGYlMjIlN0JrZXklN0QlM0ElMjAlN0JwYWlyX2ZyZXFzJTVCa2V5JTVEJTdEJTIyKSUwQSUyMCUyMCUyMCUyMGlmJTIwaSUyMCUzRSUzRCUyMDUlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBicmVhaw==",highlighted:`pair_freqs = compute_pair_freqs(splits) | |
| <span class="hljs-keyword">for</span> i, key <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(pair_freqs.keys()): | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"<span class="hljs-subst">{key}</span>: <span class="hljs-subst">{pair_freqs[key]}</span>"</span>) | |
| <span class="hljs-keyword">if</span> i >= <span class="hljs-number">5</span>: | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),ds=new m({props:{code:"KCdUJyUyQyUyMCdoJyklM0ElMjAzJTBBKCdoJyUyQyUyMCdpJyklM0ElMjAzJTBBKCdpJyUyQyUyMCdzJyklM0ElMjA1JTBBKCclQzQlQTAnJTJDJTIwJ2knKSUzQSUyMDIlMEEoJyVDNCVBMCclMkMlMjAndCcpJTNBJTIwNyUwQSgndCclMkMlMjAnaCcpJTNBJTIwMw==",highlighted:`(<span class="hljs-string">'T'</span>, <span class="hljs-string">'h'</span>): <span class="hljs-number">3</span> | |
| (<span class="hljs-string">'h'</span>, <span class="hljs-string">'i'</span>): <span class="hljs-number">3</span> | |
| (<span class="hljs-string">'i'</span>, <span class="hljs-string">'s'</span>): <span class="hljs-number">5</span> | |
| (<span class="hljs-string">'Ġ'</span>, <span class="hljs-string">'i'</span>): <span class="hljs-number">2</span> | |
| (<span class="hljs-string">'Ġ'</span>, <span class="hljs-string">'t'</span>): <span class="hljs-number">7</span> | |
| (<span class="hljs-string">'t'</span>, <span class="hljs-string">'h'</span>): <span class="hljs-number">3</span>`,wrap:!1}}),xs=new m({props:{code:"YmVzdF9wYWlyJTIwJTNEJTIwJTIyJTIyJTBBbWF4X2ZyZXElMjAlM0QlMjBOb25lJTBBJTBBZm9yJTIwcGFpciUyQyUyMGZyZXElMjBpbiUyMHBhaXJfZnJlcXMuaXRlbXMoKSUzQSUwQSUyMCUyMCUyMCUyMGlmJTIwbWF4X2ZyZXElMjBpcyUyME5vbmUlMjBvciUyMG1heF9mcmVxJTIwJTNDJTIwZnJlcSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGJlc3RfcGFpciUyMCUzRCUyMHBhaXIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBtYXhfZnJlcSUyMCUzRCUyMGZyZXElMEElMEFwcmludChiZXN0X3BhaXIlMkMlMjBtYXhfZnJlcSk=",highlighted:`best_pair = <span class="hljs-string">""</span> | |
| max_freq = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">for</span> pair, freq <span class="hljs-keyword">in</span> pair_freqs.items(): | |
| <span class="hljs-keyword">if</span> max_freq <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_freq < freq: | |
| best_pair = pair | |
| max_freq = freq | |
| <span class="hljs-built_in">print</span>(best_pair, max_freq)`,wrap:!1}}),gs=new m({props:{code:"KCclQzQlQTAnJTJDJTIwJ3QnKSUyMDc=",highlighted:'(<span class="hljs-string">'Ġ'</span>, <span class="hljs-string">'t'</span>) <span class="hljs-number">7</span>',wrap:!1}}),fs=new m({props:{code:"bWVyZ2VzJTIwJTNEJTIwJTdCKCUyMiVDNCVBMCUyMiUyQyUyMCUyMnQlMjIpJTNBJTIwJTIyJUM0JUEwdCUyMiU3RCUwQXZvY2FiLmFwcGVuZCglMjIlQzQlQTB0JTIyKQ==",highlighted:`merges = {(<span class="hljs-string">"Ġ"</span>, <span class="hljs-string">"t"</span>): <span class="hljs-string">"Ġt"</span>} | |
| vocab.append(<span class="hljs-string">"Ġt"</span>)`,wrap:!1}}),qs=new m({props:{code:"ZGVmJTIwbWVyZ2VfcGFpcihhJTJDJTIwYiUyQyUyMHNwbGl0cyklM0ElMEElMjAlMjAlMjAlMjBmb3IlMjB3b3JkJTIwaW4lMjB3b3JkX2ZyZXFzJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc3BsaXQlMjAlM0QlMjBzcGxpdHMlNUJ3b3JkJTVEJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaWYlMjBsZW4oc3BsaXQpJTIwJTNEJTNEJTIwMSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGNvbnRpbnVlJTBBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaSUyMCUzRCUyMDAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3aGlsZSUyMGklMjAlM0MlMjBsZW4oc3BsaXQpJTIwLSUyMDElM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMHNwbGl0JTVCaSU1RCUyMCUzRCUzRCUyMGElMjBhbmQlMjBzcGxpdCU1QmklMjAlMkIlMjAxJTVEJTIwJTNEJTNEJTIwYiUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNwbGl0JTIwJTNEJTIwc3BsaXQlNUIlM0FpJTVEJTIwJTJCJTIwJTVCYSUyMCUyQiUyMGIlNUQlMjAlMkIlMjBzcGxpdCU1QmklMjAlMkIlMjAyJTIwJTNBJTVEJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGklMjAlMkIlM0QlMjAxJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc3BsaXRzJTVCd29yZCU1RCUyMCUzRCUyMHNwbGl0JTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwc3BsaXRz",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">merge_pair</span>(<span class="hljs-params">a, b, splits</span>): | |
| <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> word_freqs: | |
| split = splits[word] | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(split) == <span class="hljs-number">1</span>: | |
| <span class="hljs-keyword">continue</span> | |
| i = <span class="hljs-number">0</span> | |
| <span class="hljs-keyword">while</span> i < <span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>: | |
| <span class="hljs-keyword">if</span> split[i] == a <span class="hljs-keyword">and</span> split[i + <span class="hljs-number">1</span>] == b: | |
| split = split[:i] + [a + b] + split[i + <span class="hljs-number">2</span> :] | |
| <span class="hljs-keyword">else</span>: | |
| i += <span class="hljs-number">1</span> | |
| splits[word] = split | |
| <span class="hljs-keyword">return</span> splits`,wrap:!1}}),ks=new m({props:{code:"c3BsaXRzJTIwJTNEJTIwbWVyZ2VfcGFpciglMjIlQzQlQTAlMjIlMkMlMjAlMjJ0JTIyJTJDJTIwc3BsaXRzKSUwQXByaW50KHNwbGl0cyU1QiUyMiVDNCVBMHRyYWluZWQlMjIlNUQp",highlighted:`splits = merge_pair(<span class="hljs-string">"Ġ"</span>, <span class="hljs-string">"t"</span>, splits) | |
| <span class="hljs-built_in">print</span>(splits[<span class="hljs-string">"Ġtrained"</span>])`,wrap:!1}}),Qs=new m({props:{code:"JTVCJyVDNCVBMHQnJTJDJTIwJ3InJTJDJTIwJ2EnJTJDJTIwJ2knJTJDJTIwJ24nJTJDJTIwJ2UnJTJDJTIwJ2QnJTVE",highlighted:'[<span class="hljs-string">'Ġt'</span>, <span class="hljs-string">'r'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'i'</span>, <span class="hljs-string">'n'</span>, <span class="hljs-string">'e'</span>, <span class="hljs-string">'d'</span>]',wrap:!1}}),Bs=new m({props:{code:"dm9jYWJfc2l6ZSUyMCUzRCUyMDUwJTBBJTBBd2hpbGUlMjBsZW4odm9jYWIpJTIwJTNDJTIwdm9jYWJfc2l6ZSUzQSUwQSUyMCUyMCUyMCUyMHBhaXJfZnJlcXMlMjAlM0QlMjBjb21wdXRlX3BhaXJfZnJlcXMoc3BsaXRzKSUwQSUyMCUyMCUyMCUyMGJlc3RfcGFpciUyMCUzRCUyMCUyMiUyMiUwQSUyMCUyMCUyMCUyMG1heF9mcmVxJTIwJTNEJTIwTm9uZSUwQSUyMCUyMCUyMCUyMGZvciUyMHBhaXIlMkMlMjBmcmVxJTIwaW4lMjBwYWlyX2ZyZXFzLml0ZW1zKCklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMG1heF9mcmVxJTIwaXMlMjBOb25lJTIwb3IlMjBtYXhfZnJlcSUyMCUzQyUyMGZyZXElM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBiZXN0X3BhaXIlMjAlM0QlMjBwYWlyJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwbWF4X2ZyZXElMjAlM0QlMjBmcmVxJTBBJTIwJTIwJTIwJTIwc3BsaXRzJTIwJTNEJTIwbWVyZ2VfcGFpcigqYmVzdF9wYWlyJTJDJTIwc3BsaXRzKSUwQSUyMCUyMCUyMCUyMG1lcmdlcyU1QmJlc3RfcGFpciU1RCUyMCUzRCUyMGJlc3RfcGFpciU1QjAlNUQlMjAlMkIlMjBiZXN0X3BhaXIlNUIxJTVEJTBBJTIwJTIwJTIwJTIwdm9jYWIuYXBwZW5kKGJlc3RfcGFpciU1QjAlNUQlMjAlMkIlMjBiZXN0X3BhaXIlNUIxJTVEKQ==",highlighted:`vocab_size = <span class="hljs-number">50</span> | |
| <span class="hljs-keyword">while</span> <span class="hljs-built_in">len</span>(vocab) < vocab_size: | |
| pair_freqs = compute_pair_freqs(splits) | |
| best_pair = <span class="hljs-string">""</span> | |
| max_freq = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">for</span> pair, freq <span class="hljs-keyword">in</span> pair_freqs.items(): | |
| <span class="hljs-keyword">if</span> max_freq <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> max_freq < freq: | |
| best_pair = pair | |
| max_freq = freq | |
| splits = merge_pair(*best_pair, splits) | |
| merges[best_pair] = best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>] | |
| vocab.append(best_pair[<span class="hljs-number">0</span>] + best_pair[<span class="hljs-number">1</span>])`,wrap:!1}}),Vs=new m({props:{code:"cHJpbnQobWVyZ2VzKQ==",highlighted:'<span class="hljs-built_in">print</span>(merges)',wrap:!1}}),Es=new m({props:{code:"JTdCKCclQzQlQTAnJTJDJTIwJ3QnKSUzQSUyMCclQzQlQTB0JyUyQyUyMCgnaSclMkMlMjAncycpJTNBJTIwJ2lzJyUyQyUyMCgnZSclMkMlMjAncicpJTNBJTIwJ2VyJyUyQyUyMCgnJUM0JUEwJyUyQyUyMCdhJyklM0ElMjAnJUM0JUEwYSclMkMlMjAoJyVDNCVBMHQnJTJDJTIwJ28nKSUzQSUyMCclQzQlQTB0byclMkMlMjAoJ2UnJTJDJTIwJ24nKSUzQSUyMCdlbiclMkMlMEElMjAoJ1QnJTJDJTIwJ2gnKSUzQSUyMCdUaCclMkMlMjAoJ1RoJyUyQyUyMCdpcycpJTNBJTIwJ1RoaXMnJTJDJTIwKCdvJyUyQyUyMCd1JyklM0ElMjAnb3UnJTJDJTIwKCdzJyUyQyUyMCdlJyklM0ElMjAnc2UnJTJDJTIwKCclQzQlQTB0byclMkMlMjAnaycpJTNBJTIwJyVDNCVBMHRvayclMkMlMEElMjAoJyVDNCVBMHRvayclMkMlMjAnZW4nKSUzQSUyMCclQzQlQTB0b2tlbiclMkMlMjAoJ24nJTJDJTIwJ2QnKSUzQSUyMCduZCclMkMlMjAoJyVDNCVBMCclMkMlMjAnaXMnKSUzQSUyMCclQzQlQTBpcyclMkMlMjAoJyVDNCVBMHQnJTJDJTIwJ2gnKSUzQSUyMCclQzQlQTB0aCclMkMlMjAoJyVDNCVBMHRoJyUyQyUyMCdlJyklM0ElMjAnJUM0JUEwdGhlJyUyQyUwQSUyMCgnaSclMkMlMjAnbicpJTNBJTIwJ2luJyUyQyUyMCgnJUM0JUEwYSclMkMlMjAnYicpJTNBJTIwJyVDNCVBMGFiJyUyQyUyMCgnJUM0JUEwdG9rZW4nJTJDJTIwJ2knKSUzQSUyMCclQzQlQTB0b2tlbmknJTdE",highlighted:`{(<span class="hljs-string">'Ġ'</span>, <span class="hljs-string">'t'</span>): <span class="hljs-string">'Ġt'</span>, (<span class="hljs-string">'i'</span>, <span class="hljs-string">'s'</span>): <span class="hljs-string">'is'</span>, (<span class="hljs-string">'e'</span>, <span class="hljs-string">'r'</span>): <span class="hljs-string">'er'</span>, (<span class="hljs-string">'Ġ'</span>, <span class="hljs-string">'a'</span>): <span class="hljs-string">'Ġa'</span>, (<span class="hljs-string">'Ġt'</span>, <span class="hljs-string">'o'</span>): <span class="hljs-string">'Ġto'</span>, (<span class="hljs-string">'e'</span>, <span class="hljs-string">'n'</span>): <span class="hljs-string">'en'</span>, | |
| (<span class="hljs-string">'T'</span>, <span class="hljs-string">'h'</span>): <span class="hljs-string">'Th'</span>, (<span class="hljs-string">'Th'</span>, <span class="hljs-string">'is'</span>): <span class="hljs-string">'This'</span>, (<span class="hljs-string">'o'</span>, <span class="hljs-string">'u'</span>): <span class="hljs-string">'ou'</span>, (<span class="hljs-string">'s'</span>, <span class="hljs-string">'e'</span>): <span class="hljs-string">'se'</span>, (<span class="hljs-string">'Ġto'</span>, <span class="hljs-string">'k'</span>): <span class="hljs-string">'Ġtok'</span>, | |
| (<span class="hljs-string">'Ġtok'</span>, <span class="hljs-string">'en'</span>): <span class="hljs-string">'Ġtoken'</span>, (<span class="hljs-string">'n'</span>, <span class="hljs-string">'d'</span>): <span class="hljs-string">'nd'</span>, (<span class="hljs-string">'Ġ'</span>, <span class="hljs-string">'is'</span>): <span class="hljs-string">'Ġis'</span>, (<span class="hljs-string">'Ġt'</span>, <span class="hljs-string">'h'</span>): <span class="hljs-string">'Ġth'</span>, (<span class="hljs-string">'Ġth'</span>, <span class="hljs-string">'e'</span>): <span class="hljs-string">'Ġthe'</span>, | |
| (<span class="hljs-string">'i'</span>, <span class="hljs-string">'n'</span>): <span class="hljs-string">'in'</span>, (<span class="hljs-string">'Ġa'</span>, <span class="hljs-string">'b'</span>): <span class="hljs-string">'Ġab'</span>, (<span class="hljs-string">'Ġtoken'</span>, <span class="hljs-string">'i'</span>): <span class="hljs-string">'Ġtokeni'</span>}`,wrap:!1}}),Zs=new m({props:{code:"cHJpbnQodm9jYWIp",highlighted:'<span class="hljs-built_in">print</span>(vocab)',wrap:!1}}),Gs=new m({props:{code:"JTVCJyUzQyU3Q2VuZG9mdGV4dCU3QyUzRSclMkMlMjAnJTJDJyUyQyUyMCcuJyUyQyUyMCdDJyUyQyUyMCdGJyUyQyUyMCdIJyUyQyUyMCdUJyUyQyUyMCdhJyUyQyUyMCdiJyUyQyUyMCdjJyUyQyUyMCdkJyUyQyUyMCdlJyUyQyUyMCdmJyUyQyUyMCdnJyUyQyUyMCdoJyUyQyUyMCdpJyUyQyUyMCdrJyUyQyUyMCdsJyUyQyUyMCdtJyUyQyUyMCduJyUyQyUyMCdvJyUyQyUwQSUyMCdwJyUyQyUyMCdyJyUyQyUyMCdzJyUyQyUyMCd0JyUyQyUyMCd1JyUyQyUyMCd2JyUyQyUyMCd3JyUyQyUyMCd5JyUyQyUyMCd6JyUyQyUyMCclQzQlQTAnJTJDJTIwJyVDNCVBMHQnJTJDJTIwJ2lzJyUyQyUyMCdlciclMkMlMjAnJUM0JUEwYSclMkMlMjAnJUM0JUEwdG8nJTJDJTIwJ2VuJyUyQyUyMCdUaCclMkMlMjAnVGhpcyclMkMlMjAnb3UnJTJDJTIwJ3NlJyUyQyUwQSUyMCclQzQlQTB0b2snJTJDJTIwJyVDNCVBMHRva2VuJyUyQyUyMCduZCclMkMlMjAnJUM0JUEwaXMnJTJDJTIwJyVDNCVBMHRoJyUyQyUyMCclQzQlQTB0aGUnJTJDJTIwJ2luJyUyQyUyMCclQzQlQTBhYiclMkMlMjAnJUM0JUEwdG9rZW5pJyU1RA==",highlighted:`[<span class="hljs-string">'<|endoftext|>'</span>, <span class="hljs-string">','</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'C'</span>, <span class="hljs-string">'F'</span>, <span class="hljs-string">'H'</span>, <span class="hljs-string">'T'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'b'</span>, <span class="hljs-string">'c'</span>, <span class="hljs-string">'d'</span>, <span class="hljs-string">'e'</span>, <span class="hljs-string">'f'</span>, <span class="hljs-string">'g'</span>, <span class="hljs-string">'h'</span>, <span class="hljs-string">'i'</span>, <span class="hljs-string">'k'</span>, <span class="hljs-string">'l'</span>, <span 
class="hljs-string">'m'</span>, <span class="hljs-string">'n'</span>, <span class="hljs-string">'o'</span>, | |
| <span class="hljs-string">'p'</span>, <span class="hljs-string">'r'</span>, <span class="hljs-string">'s'</span>, <span class="hljs-string">'t'</span>, <span class="hljs-string">'u'</span>, <span class="hljs-string">'v'</span>, <span class="hljs-string">'w'</span>, <span class="hljs-string">'y'</span>, <span class="hljs-string">'z'</span>, <span class="hljs-string">'Ġ'</span>, <span class="hljs-string">'Ġt'</span>, <span class="hljs-string">'is'</span>, <span class="hljs-string">'er'</span>, <span class="hljs-string">'Ġa'</span>, <span class="hljs-string">'Ġto'</span>, <span class="hljs-string">'en'</span>, <span class="hljs-string">'Th'</span>, <span class="hljs-string">'This'</span>, <span class="hljs-string">'ou'</span>, <span class="hljs-string">'se'</span>, | |
| <span class="hljs-string">'Ġtok'</span>, <span class="hljs-string">'Ġtoken'</span>, <span class="hljs-string">'nd'</span>, <span class="hljs-string">'Ġis'</span>, <span class="hljs-string">'Ġth'</span>, <span class="hljs-string">'Ġthe'</span>, <span class="hljs-string">'in'</span>, <span class="hljs-string">'Ġab'</span>, <span class="hljs-string">'Ġtokeni'</span>]`,wrap:!1}}),I=new Rs({props:{$$slots:{default:[ba]},$$scope:{ctx:w}}}),Ns=new m({props:{code:"ZGVmJTIwdG9rZW5pemUodGV4dCklM0ElMEElMjAlMjAlMjAlMjBwcmVfdG9rZW5pemVfcmVzdWx0JTIwJTNEJTIwdG9rZW5pemVyLl90b2tlbml6ZXIucHJlX3Rva2VuaXplci5wcmVfdG9rZW5pemVfc3RyKHRleHQpJTBBJTIwJTIwJTIwJTIwcHJlX3Rva2VuaXplZF90ZXh0JTIwJTNEJTIwJTVCd29yZCUyMGZvciUyMHdvcmQlMkMlMjBvZmZzZXQlMjBpbiUyMHByZV90b2tlbml6ZV9yZXN1bHQlNUQlMEElMjAlMjAlMjAlMjBzcGxpdHMlMjAlM0QlMjAlNUIlNUJsJTIwZm9yJTIwbCUyMGluJTIwd29yZCU1RCUyMGZvciUyMHdvcmQlMjBpbiUyMHByZV90b2tlbml6ZWRfdGV4dCU1RCUwQSUyMCUyMCUyMCUyMGZvciUyMHBhaXIlMkMlMjBtZXJnZSUyMGluJTIwbWVyZ2VzLml0ZW1zKCklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBmb3IlMjBpZHglMkMlMjBzcGxpdCUyMGluJTIwZW51bWVyYXRlKHNwbGl0cyklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpJTIwJTNEJTIwMCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHdoaWxlJTIwaSUyMCUzQyUyMGxlbihzcGxpdCklMjAtJTIwMSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwc3BsaXQlNUJpJTVEJTIwJTNEJTNEJTIwcGFpciU1QjAlNUQlMjBhbmQlMjBzcGxpdCU1QmklMjAlMkIlMjAxJTVEJTIwJTNEJTNEJTIwcGFpciU1QjElNUQlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzcGxpdCUyMCUzRCUyMHNwbGl0JTVCJTNBaSU1RCUyMCUyQiUyMCU1Qm1lcmdlJTVEJTIwJTJCJTIwc3BsaXQlNUJpJTIwJTJCJTIwMiUyMCUzQSU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGVsc2UlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpJTIwJTJCJTNEJTIwMSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHNwbGl0cyU1QmlkeCU1RCUyMCUzRCUyMHNwbGl0JTBBJTBBJTIwJTIwJTIwJTIwcmV0dXJuJTIwc3VtKHNwbGl0cyUyQyUyMCU1QiU1RCk=",highlig
hted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize</span>(<span class="hljs-params">text</span>): | |
| pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text) | |
| pre_tokenized_text = [word <span class="hljs-keyword">for</span> word, offset <span class="hljs-keyword">in</span> pre_tokenize_result] | |
| splits = [[l <span class="hljs-keyword">for</span> l <span class="hljs-keyword">in</span> word] <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> pre_tokenized_text] | |
| <span class="hljs-keyword">for</span> pair, merge <span class="hljs-keyword">in</span> merges.items(): | |
| <span class="hljs-keyword">for</span> idx, split <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(splits): | |
| i = <span class="hljs-number">0</span> | |
| <span class="hljs-keyword">while</span> i < <span class="hljs-built_in">len</span>(split) - <span class="hljs-number">1</span>: | |
| <span class="hljs-keyword">if</span> split[i] == pair[<span class="hljs-number">0</span>] <span class="hljs-keyword">and</span> split[i + <span class="hljs-number">1</span>] == pair[<span class="hljs-number">1</span>]: | |
| split = split[:i] + [merge] + split[i + <span class="hljs-number">2</span> :] | |
| <span class="hljs-keyword">else</span>: | |
| i += <span class="hljs-number">1</span> | |
| splits[idx] = split | |
| <span class="hljs-keyword">return</span> <span class="hljs-built_in">sum</span>(splits, [])`,wrap:!1}}),Hs=new m({props:{code:"dG9rZW5pemUoJTIyVGhpcyUyMGlzJTIwbm90JTIwYSUyMHRva2VuLiUyMik=",highlighted:'tokenize(<span class="hljs-string">"This is not a token."</span>)',wrap:!1}}),Xs=new m({props:{code:"JTVCJ1RoaXMnJTJDJTIwJyVDNCVBMGlzJyUyQyUyMCclQzQlQTAnJTJDJTIwJ24nJTJDJTIwJ28nJTJDJTIwJ3QnJTJDJTIwJyVDNCVBMGEnJTJDJTIwJyVDNCVBMHRva2VuJyUyQyUyMCcuJyU1RA==",highlighted:'[<span class="hljs-string">'This'</span>, <span class="hljs-string">'Ġis'</span>, <span class="hljs-string">'Ġ'</span>, <span class="hljs-string">'n'</span>, <span class="hljs-string">'o'</span>, <span class="hljs-string">'t'</span>, <span class="hljs-string">'Ġa'</span>, <span class="hljs-string">'Ġtoken'</span>, <span class="hljs-string">'.'</span>]',wrap:!1}}),f=new Rs({props:{warning:!0,$$slots:{default:[qa]},$$scope:{ctx:w}}}),Ws=new Ca({props:{source:"https://github.com/huggingface/course/blob/main/chapters/fr/chapter6/5.mdx"}}),{c(){p=c("meta"),T=e(),J=c("p"),h=e(),M(b.$$.fragment),Ls=e(),M(q.$$.fragment),Fs=e(),$=c("p"),$.innerHTML=In,Ks=e(),M(k.$$.fragment),Os=e(),M(d.$$.fragment),sl=e(),M(Q.$$.fragment),ll=e(),A=c("p"),A.textContent=fn,nl=e(),M(B.$$.fragment),al=e(),v=c("p"),v.innerHTML=bn,el=e(),M(C.$$.fragment),tl=e(),V=c("p"),V.innerHTML=qn,pl=e(),E=c("p"),E.innerHTML=$n,Ml=e(),z=c("p"),z.textContent=kn,il=e(),M(Z.$$.fragment),rl=e(),G=c("p"),G.innerHTML=Qn,ul=e(),M(S.$$.fragment),yl=e(),N=c("p"),N.innerHTML=An,ol=e(),_=c("p"),_.innerHTML=Bn,cl=e(),M(H.$$.fragment),jl=e(),X=c("p"),X.innerHTML=vn,Jl=e(),M(D.$$.fragment),Ul=e(),W=c("p"),W.innerHTML=Vn,ml=e(),M(R.$$.fragment),Tl=e(),Y=c("p"),Y.textContent=En,hl=e(),M(x.$$.fragment),wl=e(),M(P.$$.fragment),dl=e(),L=c("p"),L.textContent=zn,Cl=e(),F=c("ol"),F.innerHTML=Zn,xl=e(),K=c("p"),K.textContent=Gn,gl=e(),M(O.$$.fragment),Il=e(),ss=c("p"),ss.textContent=Sn,fl=e(),M(g.$$.fragment),bl=e(),M(ls.$$.fragment),ql=e(),ns=c("p"),ns.textContent=Nn,$
l=e(),as=c("p"),as.textContent=_n,kl=e(),M(es.$$.fragment),Ql=e(),ts=c("p"),ts.innerHTML=Hn,Al=e(),M(ps.$$.fragment),Bl=e(),Ms=c("p"),Ms.textContent=Xn,vl=e(),M(is.$$.fragment),Vl=e(),M(rs.$$.fragment),El=e(),us=c("p"),us.textContent=Dn,zl=e(),M(ys.$$.fragment),Zl=e(),M(os.$$.fragment),Gl=e(),cs=c("p"),cs.innerHTML=Wn,Sl=e(),M(js.$$.fragment),Nl=e(),Js=c("p"),Js.textContent=Rn,_l=e(),M(Us.$$.fragment),Hl=e(),ms=c("p"),ms.textContent=Yn,Xl=e(),M(Ts.$$.fragment),Dl=e(),hs=c("p"),hs.textContent=Pn,Wl=e(),M(ws.$$.fragment),Rl=e(),M(ds.$$.fragment),Yl=e(),Cs=c("p"),Cs.textContent=Ln,Pl=e(),M(xs.$$.fragment),Ll=e(),M(gs.$$.fragment),Fl=e(),Is=c("p"),Is.innerHTML=Fn,Kl=e(),M(fs.$$.fragment),Ol=e(),bs=c("p"),bs.innerHTML=Kn,sn=e(),M(qs.$$.fragment),ln=e(),$s=c("p"),$s.textContent=On,nn=e(),M(ks.$$.fragment),an=e(),M(Qs.$$.fragment),en=e(),As=c("p"),As.textContent=sa,tn=e(),M(Bs.$$.fragment),pn=e(),vs=c("p"),vs.innerHTML=la,Mn=e(),M(Vs.$$.fragment),rn=e(),M(Es.$$.fragment),un=e(),zs=c("p"),zs.innerHTML=na,yn=e(),M(Zs.$$.fragment),on=e(),M(Gs.$$.fragment),cn=e(),M(I.$$.fragment),jn=e(),Ss=c("p"),Ss.textContent=aa,Jn=e(),M(Ns.$$.fragment),Un=e(),_s=c("p"),_s.textContent=ea,mn=e(),M(Hs.$$.fragment),Tn=e(),M(Xs.$$.fragment),hn=e(),M(f.$$.fragment),wn=e(),Ds=c("p"),Ds.textContent=ta,dn=e(),M(Ws.$$.fragment),Cn=e(),Ps=c("p"),this.h()},l(s){const 
l=Ta("svelte-u9bgzb",document.head);p=j(l,"META",{name:!0,content:!0}),l.forEach(n),T=t(s),J=j(s,"P",{}),oa(J).forEach(n),h=t(s),i(b.$$.fragment,s),Ls=t(s),i(q.$$.fragment,s),Fs=t(s),$=j(s,"P",{"data-svelte-h":!0}),U($)!=="svelte-dc4rw6"&&($.innerHTML=In),Ks=t(s),i(k.$$.fragment,s),Os=t(s),i(d.$$.fragment,s),sl=t(s),i(Q.$$.fragment,s),ll=t(s),A=j(s,"P",{"data-svelte-h":!0}),U(A)!=="svelte-1h87em1"&&(A.textContent=fn),nl=t(s),i(B.$$.fragment,s),al=t(s),v=j(s,"P",{"data-svelte-h":!0}),U(v)!=="svelte-1uj0emc"&&(v.innerHTML=bn),el=t(s),i(C.$$.fragment,s),tl=t(s),V=j(s,"P",{"data-svelte-h":!0}),U(V)!=="svelte-1qcfng4"&&(V.innerHTML=qn),pl=t(s),E=j(s,"P",{"data-svelte-h":!0}),U(E)!=="svelte-34wkn5"&&(E.innerHTML=$n),Ml=t(s),z=j(s,"P",{"data-svelte-h":!0}),U(z)!=="svelte-1xtdwct"&&(z.textContent=kn),il=t(s),i(Z.$$.fragment,s),rl=t(s),G=j(s,"P",{"data-svelte-h":!0}),U(G)!=="svelte-8gs7d7"&&(G.innerHTML=Qn),ul=t(s),i(S.$$.fragment,s),yl=t(s),N=j(s,"P",{"data-svelte-h":!0}),U(N)!=="svelte-1eoh852"&&(N.innerHTML=An),ol=t(s),_=j(s,"P",{"data-svelte-h":!0}),U(_)!=="svelte-30qtcv"&&(_.innerHTML=Bn),cl=t(s),i(H.$$.fragment,s),jl=t(s),X=j(s,"P",{"data-svelte-h":!0}),U(X)!=="svelte-cosm6m"&&(X.innerHTML=vn),Jl=t(s),i(D.$$.fragment,s),Ul=t(s),W=j(s,"P",{"data-svelte-h":!0}),U(W)!=="svelte-1hj74ey"&&(W.innerHTML=Vn),ml=t(s),i(R.$$.fragment,s),Tl=t(s),Y=j(s,"P",{"data-svelte-h":!0}),U(Y)!=="svelte-1phz38r"&&(Y.textContent=En),hl=t(s),i(x.$$.fragment,s),wl=t(s),i(P.$$.fragment,s),dl=t(s),L=j(s,"P",{"data-svelte-h":!0}),U(L)!=="svelte-h50xo1"&&(L.textContent=zn),Cl=t(s),F=j(s,"OL",{"data-svelte-h":!0}),U(F)!=="svelte-19lg3ll"&&(F.innerHTML=Zn),xl=t(s),K=j(s,"P",{"data-svelte-h":!0}),U(K)!=="svelte-11oyli9"&&(K.textContent=Gn),gl=t(s),i(O.$$.fragment,s),Il=t(s),ss=j(s,"P",{"data-svelte-h":!0}),U(ss)!=="svelte-ciih5f"&&(ss.textContent=Sn),fl=t(s),i(g.$$.fragment,s),bl=t(s),i(ls.$$.fragment,s),ql=t(s),ns=j(s,"P",{"data-svelte-h":!0}),U(ns)!=="svelte-hkuo77"&&(ns.textContent=Nn),$l=t(s),as=j
(s,"P",{"data-svelte-h":!0}),U(as)!=="svelte-1gfdk4v"&&(as.textContent=_n),kl=t(s),i(es.$$.fragment,s),Ql=t(s),ts=j(s,"P",{"data-svelte-h":!0}),U(ts)!=="svelte-1vs4829"&&(ts.innerHTML=Hn),Al=t(s),i(ps.$$.fragment,s),Bl=t(s),Ms=j(s,"P",{"data-svelte-h":!0}),U(Ms)!=="svelte-1nr0ogy"&&(Ms.textContent=Xn),vl=t(s),i(is.$$.fragment,s),Vl=t(s),i(rs.$$.fragment,s),El=t(s),us=j(s,"P",{"data-svelte-h":!0}),U(us)!=="svelte-6zl3q0"&&(us.textContent=Dn),zl=t(s),i(ys.$$.fragment,s),Zl=t(s),i(os.$$.fragment,s),Gl=t(s),cs=j(s,"P",{"data-svelte-h":!0}),U(cs)!=="svelte-zf5wvt"&&(cs.innerHTML=Wn),Sl=t(s),i(js.$$.fragment,s),Nl=t(s),Js=j(s,"P",{"data-svelte-h":!0}),U(Js)!=="svelte-189wuny"&&(Js.textContent=Rn),_l=t(s),i(Us.$$.fragment,s),Hl=t(s),ms=j(s,"P",{"data-svelte-h":!0}),U(ms)!=="svelte-1k3lfsg"&&(ms.textContent=Yn),Xl=t(s),i(Ts.$$.fragment,s),Dl=t(s),hs=j(s,"P",{"data-svelte-h":!0}),U(hs)!=="svelte-jx1j1k"&&(hs.textContent=Pn),Wl=t(s),i(ws.$$.fragment,s),Rl=t(s),i(ds.$$.fragment,s),Yl=t(s),Cs=j(s,"P",{"data-svelte-h":!0}),U(Cs)!=="svelte-1fn9ojv"&&(Cs.textContent=Ln),Pl=t(s),i(xs.$$.fragment,s),Ll=t(s),i(gs.$$.fragment,s),Fl=t(s),Is=j(s,"P",{"data-svelte-h":!0}),U(Is)!=="svelte-ctr4pf"&&(Is.innerHTML=Fn),Kl=t(s),i(fs.$$.fragment,s),Ol=t(s),bs=j(s,"P",{"data-svelte-h":!0}),U(bs)!=="svelte-1he3dod"&&(bs.innerHTML=Kn),sn=t(s),i(qs.$$.fragment,s),ln=t(s),$s=j(s,"P",{"data-svelte-h":!0}),U($s)!=="svelte-1nfafzn"&&($s.textContent=On),nn=t(s),i(ks.$$.fragment,s),an=t(s),i(Qs.$$.fragment,s),en=t(s),As=j(s,"P",{"data-svelte-h":!0}),U(As)!=="svelte-wy0zl0"&&(As.textContent=sa),tn=t(s),i(Bs.$$.fragment,s),pn=t(s),vs=j(s,"P",{"data-svelte-h":!0}),U(vs)!=="svelte-1wab3fs"&&(vs.innerHTML=la),Mn=t(s),i(Vs.$$.fragment,s),rn=t(s),i(Es.$$.fragment,s),un=t(s),zs=j(s,"P",{"data-svelte-h":!0}),U(zs)!=="svelte-kmb1d9"&&(zs.innerHTML=na),yn=t(s),i(Zs.$$.fragment,s),on=t(s),i(Gs.$$.fragment,s),cn=t(s),i(I.$$.fragment,s),jn=t(s),Ss=j(s,"P",{"data-svelte-h":!0}),U(Ss)!=="svelte-1dqmvm3"&&(Ss.textContent
=aa),Jn=t(s),i(Ns.$$.fragment,s),Un=t(s),_s=j(s,"P",{"data-svelte-h":!0}),U(_s)!=="svelte-15d4qki"&&(_s.textContent=ea),mn=t(s),i(Hs.$$.fragment,s),Tn=t(s),i(Xs.$$.fragment,s),hn=t(s),i(f.$$.fragment,s),wn=t(s),Ds=j(s,"P",{"data-svelte-h":!0}),U(Ds)!=="svelte-e7tmzh"&&(Ds.textContent=ta),dn=t(s),i(Ws.$$.fragment,s),Cn=t(s),Ps=j(s,"P",{}),oa(Ps).forEach(n),this.h()},h(){ca(p,"name","hf:doc:metadata"),ca(p,"content",ka)},m(s,l){ha(document.head,p),a(s,T,l),a(s,J,l),a(s,h,l),r(b,s,l),a(s,Ls,l),r(q,s,l),a(s,Fs,l),a(s,$,l),a(s,Ks,l),r(k,s,l),a(s,Os,l),r(d,s,l),a(s,sl,l),r(Q,s,l),a(s,ll,l),a(s,A,l),a(s,nl,l),r(B,s,l),a(s,al,l),a(s,v,l),a(s,el,l),r(C,s,l),a(s,tl,l),a(s,V,l),a(s,pl,l),a(s,E,l),a(s,Ml,l),a(s,z,l),a(s,il,l),r(Z,s,l),a(s,rl,l),a(s,G,l),a(s,ul,l),r(S,s,l),a(s,yl,l),a(s,N,l),a(s,ol,l),a(s,_,l),a(s,cl,l),r(H,s,l),a(s,jl,l),a(s,X,l),a(s,Jl,l),r(D,s,l),a(s,Ul,l),a(s,W,l),a(s,ml,l),r(R,s,l),a(s,Tl,l),a(s,Y,l),a(s,hl,l),r(x,s,l),a(s,wl,l),r(P,s,l),a(s,dl,l),a(s,L,l),a(s,Cl,l),a(s,F,l),a(s,xl,l),a(s,K,l),a(s,gl,l),r(O,s,l),a(s,Il,l),a(s,ss,l),a(s,fl,l),r(g,s,l),a(s,bl,l),r(ls,s,l),a(s,ql,l),a(s,ns,l),a(s,$l,l),a(s,as,l),a(s,kl,l),r(es,s,l),a(s,Ql,l),a(s,ts,l),a(s,Al,l),r(ps,s,l),a(s,Bl,l),a(s,Ms,l),a(s,vl,l),r(is,s,l),a(s,Vl,l),r(rs,s,l),a(s,El,l),a(s,us,l),a(s,zl,l),r(ys,s,l),a(s,Zl,l),r(os,s,l),a(s,Gl,l),a(s,cs,l),a(s,Sl,l),r(js,s,l),a(s,Nl,l),a(s,Js,l),a(s,_l,l),r(Us,s,l),a(s,Hl,l),a(s,ms,l),a(s,Xl,l),r(Ts,s,l),a(s,Dl,l),a(s,hs,l),a(s,Wl,l),r(ws,s,l),a(s,Rl,l),r(ds,s,l),a(s,Yl,l),a(s,Cs,l),a(s,Pl,l),r(xs,s,l),a(s,Ll,l),r(gs,s,l),a(s,Fl,l),a(s,Is,l),a(s,Kl,l),r(fs,s,l),a(s,Ol,l),a(s,bs,l),a(s,sn,l),r(qs,s,l),a(s,ln,l),a(s,$s,l),a(s,nn,l),r(ks,s,l),a(s,an,l),r(Qs,s,l),a(s,en,l),a(s,As,l),a(s,tn,l),r(Bs,s,l),a(s,pn,l),a(s,vs,l),a(s,Mn,l),r(Vs,s,l),a(s,rn,l),r(Es,s,l),a(s,un,l),a(s,zs,l),a(s,yn,l),r(Zs,s,l),a(s,on,l),r(Gs,s,l),a(s,cn,l),r(I,s,l),a(s,jn,l),a(s,Ss,l),a(s,Jn,l),r(Ns,s,l),a(s,Un,l),a(s,_s,l),a(s,mn,l),r(Hs,s,l),a(s,Tn,l),r(Xs,s,l),a(s,hn,l),r(f,s,l),a(s,wn
,l),a(s,Ds,l),a(s,dn,l),r(Ws,s,l),a(s,Cn,l),a(s,Ps,l),xn=!0},p(s,[l]){const pa={};l&2&&(pa.$$scope={dirty:l,ctx:s}),d.$set(pa);const Ma={};l&2&&(Ma.$$scope={dirty:l,ctx:s}),C.$set(Ma);const ia={};l&2&&(ia.$$scope={dirty:l,ctx:s}),x.$set(ia);const ra={};l&2&&(ra.$$scope={dirty:l,ctx:s}),g.$set(ra);const ua={};l&2&&(ua.$$scope={dirty:l,ctx:s}),I.$set(ua);const ya={};l&2&&(ya.$$scope={dirty:l,ctx:s}),f.$set(ya)},i(s){xn||(u(b.$$.fragment,s),u(q.$$.fragment,s),u(k.$$.fragment,s),u(d.$$.fragment,s),u(Q.$$.fragment,s),u(B.$$.fragment,s),u(C.$$.fragment,s),u(Z.$$.fragment,s),u(S.$$.fragment,s),u(H.$$.fragment,s),u(D.$$.fragment,s),u(R.$$.fragment,s),u(x.$$.fragment,s),u(P.$$.fragment,s),u(O.$$.fragment,s),u(g.$$.fragment,s),u(ls.$$.fragment,s),u(es.$$.fragment,s),u(ps.$$.fragment,s),u(is.$$.fragment,s),u(rs.$$.fragment,s),u(ys.$$.fragment,s),u(os.$$.fragment,s),u(js.$$.fragment,s),u(Us.$$.fragment,s),u(Ts.$$.fragment,s),u(ws.$$.fragment,s),u(ds.$$.fragment,s),u(xs.$$.fragment,s),u(gs.$$.fragment,s),u(fs.$$.fragment,s),u(qs.$$.fragment,s),u(ks.$$.fragment,s),u(Qs.$$.fragment,s),u(Bs.$$.fragment,s),u(Vs.$$.fragment,s),u(Es.$$.fragment,s),u(Zs.$$.fragment,s),u(Gs.$$.fragment,s),u(I.$$.fragment,s),u(Ns.$$.fragment,s),u(Hs.$$.fragment,s),u(Xs.$$.fragment,s),u(f.$$.fragment,s),u(Ws.$$.fragment,s),xn=!0)},o(s){y(b.$$.fragment,s),y(q.$$.fragment,s),y(k.$$.fragment,s),y(d.$$.fragment,s),y(Q.$$.fragment,s),y(B.$$.fragment,s),y(C.$$.fragment,s),y(Z.$$.fragment,s),y(S.$$.fragment,s),y(H.$$.fragment,s),y(D.$$.fragment,s),y(R.$$.fragment,s),y(x.$$.fragment,s),y(P.$$.fragment,s),y(O.$$.fragment,s),y(g.$$.fragment,s),y(ls.$$.fragment,s),y(es.$$.fragment,s),y(ps.$$.fragment,s),y(is.$$.fragment,s),y(rs.$$.fragment,s),y(ys.$$.fragment,s),y(os.$$.fragment,s),y(js.$$.fragment,s),y(Us.$$.fragment,s),y(Ts.$$.fragment,s),y(ws.$$.fragment,s),y(ds.$$.fragment,s),y(xs.$$.fragment,s),y(gs.$$.fragment,s),y(fs.$$.fragment,s),y(qs.$$.fragment,s),y(ks.$$.fragment,s),y(Qs.$$.fragment,s),y(Bs.$$.fragment,s
),y(Vs.$$.fragment,s),y(Es.$$.fragment,s),y(Zs.$$.fragment,s),y(Gs.$$.fragment,s),y(I.$$.fragment,s),y(Ns.$$.fragment,s),y(Hs.$$.fragment,s),y(Xs.$$.fragment,s),y(f.$$.fragment,s),y(Ws.$$.fragment,s),xn=!1},d(s){s&&(n(T),n(J),n(h),n(Ls),n(Fs),n($),n(Ks),n(Os),n(sl),n(ll),n(A),n(nl),n(al),n(v),n(el),n(tl),n(V),n(pl),n(E),n(Ml),n(z),n(il),n(rl),n(G),n(ul),n(yl),n(N),n(ol),n(_),n(cl),n(jl),n(X),n(Jl),n(Ul),n(W),n(ml),n(Tl),n(Y),n(hl),n(wl),n(dl),n(L),n(Cl),n(F),n(xl),n(K),n(gl),n(Il),n(ss),n(fl),n(bl),n(ql),n(ns),n($l),n(as),n(kl),n(Ql),n(ts),n(Al),n(Bl),n(Ms),n(vl),n(Vl),n(El),n(us),n(zl),n(Zl),n(Gl),n(cs),n(Sl),n(Nl),n(Js),n(_l),n(Hl),n(ms),n(Xl),n(Dl),n(hs),n(Wl),n(Rl),n(Yl),n(Cs),n(Pl),n(Ll),n(Fl),n(Is),n(Kl),n(Ol),n(bs),n(sn),n(ln),n($s),n(nn),n(an),n(en),n(As),n(tn),n(pn),n(vs),n(Mn),n(rn),n(un),n(zs),n(yn),n(on),n(cn),n(jn),n(Ss),n(Jn),n(Un),n(_s),n(mn),n(Tn),n(hn),n(wn),n(Ds),n(dn),n(Cn),n(Ps)),n(p),o(b,s),o(q,s),o(k,s),o(d,s),o(Q,s),o(B,s),o(C,s),o(Z,s),o(S,s),o(H,s),o(D,s),o(R,s),o(x,s),o(P,s),o(O,s),o(g,s),o(ls,s),o(es,s),o(ps,s),o(is,s),o(rs,s),o(ys,s),o(os,s),o(js,s),o(Us,s),o(Ts,s),o(ws,s),o(ds,s),o(xs,s),o(gs,s),o(fs,s),o(qs,s),o(ks,s),o(Qs,s),o(Bs,s),o(Vs,s),o(Es,s),o(Zs,s),o(Gs,s),o(I,s),o(Ns,s),o(Hs,s),o(Xs,s),o(f,s),o(Ws,s)}}}/* ka: JSON text describing this page (title, anchor, and three depth-2 sections); elsewhere in this file it is passed as the "content" value for the element whose "name" is "hf:doc:metadata". */const ka='{"title":"Tokénisation <i> Byte-Pair Encoding </i>","local":"tokénisation-i-byte-pair-encoding-i","sections":[{"title":"Algorithme d’entraînement","local":"algorithme-dentraînement","sections":[],"depth":2},{"title":"Algorithme de tokenisation","local":"algorithme-de-tokenisation","sections":[],"depth":2},{"title":"Implémentation du BPE","local":"implémentation-du-bpe","sections":[],"depth":2}],"depth":1}';/* Qa: hands Ja a callback that reads the "fw" query parameter from window.location.search and discards the value, then returns an empty array; the argument w is unused. NOTE(review): Ja comes from the scheduler chunk and presumably is an on-mount hook - confirm. */function Qa(w){return Ja(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}/* Ga: class exported under the name "component"; its constructor calls super() and then ma(this, p, Qa, $a, ja, {}). NOTE(review): ma presumably is the framework's component-init helper and Ua its component base class - confirm against the index chunk. */class Ga extends Ua{constructor(p){super(),ma(this,p,Qa,$a,ja,{})}}export{Ga as component}; | |
Xet Storage Details
- Size: 68 kB
- Xet hash: 66c6d05cd568b0d52a95af33a66ed7d73d01a9e22a35223d57c481371c8ccf53
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.