Buckets:

hf-doc-build/doc / transformers /main /ro /tokenizer_summary.html
HuggingFaceDocBuilder's picture
download
raw
42.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Algoritmi de tokenization&quot;,&quot;local&quot;:&quot;algoritmi-de-tokenization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Byte pair encoding (BPE)&quot;,&quot;local&quot;:&quot;byte-pair-encoding-bpe&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;BPE la nivel de byte&quot;,&quot;local&quot;:&quot;bpe-la-nivel-de-byte&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Unigram&quot;,&quot;local&quot;:&quot;unigram&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;SentencePiece&quot;,&quot;local&quot;:&quot;sentencepiece&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;WordPiece&quot;,&quot;local&quot;:&quot;wordpiece&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Tokenization la nivel de cuvânt&quot;,&quot;local&quot;:&quot;tokenization-la-nivel-de-cuvânt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Tokenization la nivel de caracter&quot;,&quot;local&quot;:&quot;tokenization-la-nivel-de-caracter&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Resurse&quot;,&quot;local&quot;:&quot;resurse&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"/>
<link href="/docs/transformers/main/ro/_app/immutable/entry/start.C4V7Noit.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/BXj7IdsT.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/Cl6kX2N0.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/entry/app.BwPDdXcs.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/BzzeShVL.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/DWwSmaYb.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/DM0RASQS.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/DsnmJJEf.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/nodes/0.Cz_DtHj_.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/C8aKV6he.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/nodes/33.DxDxsiJv.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/CjTxHwdC.js" rel="modulepreload">
<link href="/docs/transformers/main/ro/_app/immutable/chunks/BbRxJ3mC.js" rel="modulepreload">
<!--klul1d--><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Algoritmi de tokenization&quot;,&quot;local&quot;:&quot;algoritmi-de-tokenization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Byte pair encoding (BPE)&quot;,&quot;local&quot;:&quot;byte-pair-encoding-bpe&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;BPE la nivel de byte&quot;,&quot;local&quot;:&quot;bpe-la-nivel-de-byte&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Unigram&quot;,&quot;local&quot;:&quot;unigram&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;SentencePiece&quot;,&quot;local&quot;:&quot;sentencepiece&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;WordPiece&quot;,&quot;local&quot;:&quot;wordpiece&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Tokenization la nivel de cuvânt&quot;,&quot;local&quot;:&quot;tokenization-la-nivel-de-cuvânt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Tokenization la nivel de caracter&quot;,&quot;local&quot;:&quot;tokenization-la-nivel-de-caracter&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Resurse&quot;,&quot;local&quot;:&quot;resurse&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"/><!---->
<link href="/docs/transformers/main/ro/_app/immutable/assets/0.tn0RQdqM.css" rel="modulepreload"> <!--[--><!--[0--><!--[--><!--[0--><!--[--><p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" 
stroke="currentColor"></path></svg><!----></button></div> <!--[-1--><!--]--></div><!----> <!--[0--><h1 class="relative group"><a id="algoritmi-de-tokenization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#algoritmi-de-tokenization"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg><!----></span></a> <span>Algoritmi de tokenization</span></h1><!--]--><!----> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/zHvTiHr506c" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen=""></iframe><!----> <p>Transformers suportă trei algoritmi de tokenizare la nivel de subword: Byte pair encoding (BPE), Unigram și WordPiece. Aceștia împart textul în unități între cuvinte și caractere, menținând vocabularul compact, captând în același timp bucăți cu sens. Cuvintele comune rămân intacte ca token-uri unice, iar cuvintele rare sau necunoscute se descompun în subwords.</p> <p>De exemplu, <code>annoyingly</code> ar putea fi împărțit în <code>["annoying", "ly"]</code> sau <code>["annoy", "ing", "ly"]</code> în funcție de vocabular. 
Împărțirea în subwords permite modelului să reprezinte cuvinte nevăzute din subwords cunoscute.</p> <blockquote class="tip"><p>Tokenizarea la nivel de subword este deosebit de utilă pentru limbi ca turca, unde poți forma cuvinte lungi și complexe înlănțuind subword-uri.</p></blockquote> <!--[1--><h2 class="relative group"><a id="byte-pair-encoding-bpe" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#byte-pair-encoding-bpe"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg><!----></span></a> <span>Byte pair encoding (BPE)</span></h2><!--]--><!----> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/HEikzVL-lZU" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen=""></iframe><!----> <p><a href="https://huggingface.co/papers/1508.07909" rel="nofollow">Byte pair encoding</a> (BPE) este cel mai popular algoritm de tokenizare din Transformers, folosit de modele ca [Llama], [Gemma], [Qwen2] și altele.</p> <ol><li>Un pre-tokenizator împarte textul pe spații sau alte reguli, producând un set de cuvinte unice și frecvențele 
lor.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->(&quot;hug&quot;, 10), (&quot;pug&quot;, 5), (&quot;pun&quot;, 12), (&quot;bun&quot;, 4), (&quot;hugs&quot;, 5)<!----></pre></div><!----> <ol start="2"><li>Algoritmul BPE creează un vocabular de bază, <code>["b", "g", "h", "n", "p", "s", "u"]</code>, din toate caracterele.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->(&quot;h&quot; &quot;u&quot; &quot;g&quot;, 10), (&quot;p&quot; &quot;u&quot; &quot;g&quot;, 5), (&quot;p&quot; &quot;u&quot; &quot;n&quot;, 12), (&quot;b&quot; &quot;u&quot; &quot;n&quot;, 4), (&quot;h&quot; &quot;u&quot; &quot;g&quot; &quot;s&quot;, 5)<!----></pre></div><!----> <ol start="3"><li>BPE pornește cu caractere individuale și îmbină iterativ perechea adiacentă cea mai frecventă. 
<code>"u"</code> și <code>"g"</code> apar împreună cel mai des în <code>"hug"</code>, <code>"pug"</code> și <code>"hugs"</code>, deci BPE le îmbină în <code>"ug"</code> și îl adaugă în vocabular.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->(&quot;h&quot; &quot;ug&quot;, 10), (&quot;p&quot; &quot;ug&quot;, 5), (&quot;p&quot; &quot;u&quot; &quot;n&quot;, 12), (&quot;b&quot; &quot;u&quot; &quot;n&quot;, 4), (&quot;h&quot; &quot;ug&quot; &quot;s&quot;, 5)<!----></pre></div><!----> <ol start="4"><li>Următoarea pereche cea mai comună este <code>"u"</code> și <code>"n"</code>, care apar în <code>"pun"</code> și <code>"bun"</code>, deci se îmbină în <code>"un"</code>.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm 
focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->(&quot;h&quot; &quot;ug&quot;, 10), (&quot;p&quot; &quot;ug&quot;, 5), (&quot;p&quot; &quot;un&quot;, 12), (&quot;b&quot; &quot;un&quot;, 4), (&quot;h&quot; &quot;ug&quot; &quot;s&quot;, 5)<!----></pre></div><!----> <ol start="5"><li>Vocabularul este acum <code>["b", "g", "h", "n", "p", "s", "u", "ug", "un"]</code>. BPE continuă să învețe reguli de îmbinare până atinge dimensiunea țintă a vocabularului, egală cu dimensiunea vocabularului de bază plus numărul de îmbinări. [GPT] folosește BPE cu un vocabular de 40.478 (478 token-uri de bază + 40.000 de îmbinări).</li></ol> <p>Orice caracter care nu se află în vocabularul de bază se mapează la un token necunoscut ca <code>"&lt;unk>"</code>. 
În practică, vocabularul de bază acoperă toate caracterele văzute în antrenare, deci token-urile necunoscute sunt rare.</p> <!--[2--><h3 class="relative group"><a id="bpe-la-nivel-de-byte" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#bpe-la-nivel-de-byte"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg><!----></span></a> <span>BPE la nivel de byte</span></h3><!--]--><!----> <p>Includerea tuturor caracterelor Unicode ar face vocabularul de bază enorm. BPE la nivel de byte folosește în schimb 256 de valori de byte ca vocabular de bază, asigurând că orice cuvânt poate fi tokenizat fără token-ul <code>"&lt;unk>"</code>. 
[GPT-2] folosește BPE la nivel de byte cu un vocabular de 50.257 (256 token-uri de byte + 50.000 de îmbinări + un token special de sfârșit de text).</p> <!--[1--><h2 class="relative group"><a id="unigram" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#unigram"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg><!----></span></a> <span>Unigram</span></h2><!--]--><!----> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/TGZfZVuF9Yc" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen=""></iframe><!----> <p><a href="https://huggingface.co/papers/1804.10959" rel="nofollow">Unigram</a> este al doilea algoritm de tokenizare ca popularitate din Transformers, folosit de modele precum [T5], [BigBird], [Pegasus] și altele.</p> <ol><li>Unigram pornește cu un set mare de subword-uri candidate, iar fiecare candidat primește un scor de probabilitate bazat pe frecvența sa.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 
cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->(&quot;hug&quot;, 10), (&quot;pug&quot;, 5), (&quot;pun&quot;, 12), (&quot;bun&quot;, 4), (&quot;hugs&quot;, 5)
[&quot;b&quot;, &quot;g&quot;, &quot;h&quot;, &quot;n&quot;, &quot;p&quot;, &quot;s&quot;, &quot;u&quot;, &quot;hu&quot;, &quot;ug&quot;, &quot;un&quot;, &quot;pu&quot;, &quot;bu&quot;, &quot;gs&quot;, &quot;hug&quot;, &quot;pug&quot;, &quot;pun&quot;, &quot;bun&quot;, &quot;ugs&quot;, &quot;hugs&quot;]<!----></pre></div><!----> <ol start="2"><li><p>Unigram evaluează cât de bine tokenizează vocabularul curent datele de antrenare la fiecare pas.</p></li> <li><p>Pentru fiecare token, Unigram măsoară cât de mult ar crește pierderea globală dacă token-ul ar fi eliminat. De exemplu, eliminarea <code>"pu"</code> afectează puțin pierderea pentru că <code>"pug"</code> și <code>"pun"</code> pot fi tokenizate în continuare ca <code>["p", "ug"]</code> și <code>["p", "un"]</code>.</p> <p>Dar eliminarea <code>"ug"</code> ar crește semnificativ pierderea pentru că <code>"hug"</code>, <code>"pug"</code> și <code>"hugs"</code> depind toate de el.</p></li> <li><p>Unigram elimină token-urile cu cea mai mică creștere a pierderii, de obicei ultimele 10-20% din clasament. Caracterele de bază rămân mereu, astfel încât orice cuvânt să poată fi tokenizat. 
Token-uri ca <code>"bu"</code>, <code>"pu"</code>, <code>"gs"</code>, <code>"pug"</code> și <code>"bun"</code> sunt eliminate pentru că au contribuit cel mai puțin la probabilitatea globală.</p></li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->[&quot;b&quot;, &quot;g&quot;, &quot;h&quot;, &quot;n&quot;, &quot;p&quot;, &quot;s&quot;, &quot;u&quot;, &quot;hu&quot;, &quot;ug&quot;, &quot;un&quot;, &quot;hug&quot;, &quot;pun&quot;, &quot;ugs&quot;, &quot;hugs&quot;]<!----></pre></div><!----> <ol start="5"><li>Pașii 2-4 se repetă până vocabularul atinge dimensiunea țintă.</li></ol> <p>În inferență, Unigram poate tokeniza un cuvânt în mai multe moduri. <code>"hugs"</code> ar putea deveni <code>["hug", "s"]</code>, <code>["h", "ug", "s"]</code> sau <code>["h", "u", "g", "s"]</code>. 
Unigram alege tokenizarea cu cea mai mare probabilitate. Spre deosebire de BPE, care este determinist și bazat pe reguli de îmbinare, Unigram este probabilistic și poate eșantiona tokenizări diferite în antrenare.</p> <!--[1--><h2 class="relative group"><a id="sentencepiece" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sentencepiece"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg><!----></span></a> <span>SentencePiece</span></h2><!--]--><!----> <p><a href="https://huggingface.co/papers/1808.06226" rel="nofollow">SentencePiece</a> este o bibliotecă de tokenizare care aplică BPE sau Unigram direct pe text brut. 
BPE și Unigram standard presupun că spațiile separă cuvintele, ceea ce nu funcționează pentru limbi precum chineza și japoneza, care nu folosesc spații.</p> <ol><li>SentencePiece tratează textul de intrare ca un flux brut de bytes sau caractere și include caracterul spațiu, reprezentat ca <code>"▁"</code>, în vocabular.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->(&quot;▁hug&quot;, 10), (&quot;▁pug&quot;, 5), (&quot;▁pun&quot;, 12), (&quot;▁bun&quot;, 4), (&quot;▁hugs&quot;, 5)<!----></pre></div><!----> <ol start="2"><li>SentencePiece aplică apoi BPE sau Unigram pe text.</li></ol> <p>La decodare, SentencePiece concatenează toate token-urile și înlocuiește <code>"▁"</code> cu un spațiu.</p> <!--[1--><h2 class="relative group"><a id="wordpiece" class="header-link block pr-1.5 text-lg no-hover:hidden 
with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#wordpiece"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg><!----></span></a> <span>WordPiece</span></h2><!--]--><!----> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/qpv6ms_t_1A" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen=""></iframe><!----> <p><a href="https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37842.pdf" rel="nofollow">WordPiece</a> este algoritmul de tokenizare pentru modelele din familia BERT, cum ar fi [DistilBERT] și [Electra].</p> <p>Este similar cu <a href="#byte-pair-encoding-bpe">BPE</a> și îmbină iterativ perechi de jos în sus, dar diferă în modul în care selectează perechile.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" 
role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->(&quot;h&quot; &quot;u&quot; &quot;g&quot;, 10), (&quot;p&quot; &quot;u&quot; &quot;g&quot;, 5), (&quot;p&quot; &quot;u&quot; &quot;n&quot;, 12), (&quot;b&quot; &quot;u&quot; &quot;n&quot;, 4), (&quot;h&quot; &quot;u&quot; &quot;g&quot; &quot;s&quot;, 5)<!----></pre></div><!----> <p>WordPiece îmbină perechile care maximizează probabilitatea datelor de antrenare.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none 
transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->score(&quot;u&quot;, &quot;g&quot;) = frequency(&quot;ug&quot;) / (frequency(&quot;u&quot;) × frequency(&quot;g&quot;))<!----></pre></div><!----> <table><thead><tr><th>pereche</th><th>frecvență</th><th>scor</th></tr></thead><tbody><tr><td><code>"u"</code> + <code>"g"</code></td><td>20</td><td>20 / (36 × 20) = 0.028</td></tr><tr><td><code>"u"</code> + <code>"n"</code></td><td>16</td><td>16 / (36 × 16) = 0.028</td></tr><tr><td><code>"h"</code> + <code>"u"</code></td><td>15</td><td>15 / (15 × 36) = 0.028</td></tr><tr><td><code>"g"</code> + <code>"s"</code></td><td>5</td><td>5 / (20 × 5) = 0.050</td></tr></tbody></table> <p>Scorul favorizează îmbinarea <code>"g"</code> și <code>"s"</code> unde token-ul combinat apare mai des decât s-ar aștepta din frecvențele individuale ale token-urilor. BPE îmbină pur și simplu perechea care apare cel mai des. WordPiece măsoară cât de <em>informativă</em> este fiecare îmbinare. 
Două token-uri care apar împreună mult mai des decât ar fi de așteptat din întâmplare sunt îmbinate primele.</p> <!--[1--><h2 class="relative group"><a id="tokenization-la-nivel-de-cuvânt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tokenization-la-nivel-de-cuvânt"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg><!----></span></a> <span>Tokenization la nivel de cuvânt</span></h2><!--]--><!----> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/nhJxYji1aho" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen=""></iframe><!----> <p>Tokenization-ul la nivel de cuvânt împarte textul în token-uri după spații, punctuație sau reguli specifice limbii.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->[&quot;Do&quot;, &quot;n&#x27;t&quot;, &quot;you&quot;, &quot;love&quot;, &quot;🤗&quot;, &quot;Transformers&quot;, &quot;?&quot;, &quot;We&quot;, &quot;sure&quot;, &quot;do&quot;, &quot;.&quot;]<!----></pre></div><!----> <p>Dimensiunea vocabularului devine extrem de mare pentru că fiecare cuvânt unic necesită propriul token, inclusiv toate variantele (<code>"love"</code>, <code>"loving"</code>, <code>"loved"</code>, <code>"lovingly"</code>). Matricea de embeddings rezultată este enormă, ceea ce crește consumul de memorie și de calcul. 
Cuvintele care nu se află în vocabular se mapează la un token <code>"&lt;unk>"</code>, deci modelul nu poate gestiona cuvinte noi.</p> <!--[1--><h2 class="relative group"><a id="tokenization-la-nivel-de-caracter" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tokenization-la-nivel-de-caracter"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg><!----></span></a> <span>Tokenization la nivel de caracter</span></h2><!--]--><!----> <p>Tokenization-ul la nivel de caracter împarte textul în caractere individuale.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" 
transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg><!----> <div class=" absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0 "><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent;"></div> Copied</div><!----></button><!----></div> <pre class="language-text "><!---->[&quot;D&quot;, &quot;o&quot;, &quot;n&quot;, &quot;&#x27;&quot;, &quot;t&quot;, &quot;y&quot;, &quot;o&quot;, &quot;u&quot;, &quot;l&quot;, &quot;o&quot;, &quot;v&quot;, &quot;e&quot;]<!----></pre></div><!----> <p>Vocabularul este mic și orice cuvânt poate fi reprezentat, deci nu există problema <code>"&lt;unk>"</code>. Dar secvențele devin mult mai lungi. Un caracter ca <code>"l"</code> poartă mult mai puțin sens decât <code>"love"</code>, deci performanța scade.</p> <!--[1--><h2 class="relative group"><a id="resurse" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#resurse"><span><svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg><!----></span></a> <span>Resurse</span></h2><!--]--><!----> <ul><li><a href="https://huggingface.co/learn/llm-course/chapter6/1" rel="nofollow">Capitolul 6</a> din cursul LLM te învață cum să antrenezi un tokenizer de la zero și explică diferențele dintre algoritmii BPE, Unigram și WordPiece.</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/ro/tokenizer_summary.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg><!----> <span><span class="underline">Update</span> on GitHub</span></a><!----> <p></p><!--]--><!----><!--]--><!--]--><!--]--> <!--[-1--><!--]--><!--]-->
<script>
// Client bootstrap for this pre-rendered page: publishes the path configuration,
// loads the two client entry modules, and calls start() on this script's parent element.
// The surrounding braces form a block so that `element` (a const) stays block-scoped.
{
// Assigned without var/let/const inside a classic <script>, so this becomes a
// global. Presumably read by the imported client chunks — verify against the
// generator that emitted this name.
__sveltekit_1kj9n5j = {
// Both paths point at the same prefix for this language build of the docs.
base: "/docs/transformers/main/ro",
assets: "/docs/transformers/main/ro"
};
// Must be captured synchronously: document.currentScript is only set while the
// script is executing, so it would not be available inside the .then() callback.
const element = document.currentScript.parentElement;
// Load both entry modules in parallel; these are the same URLs that the page's
// <head> declares as modulepreload links.
Promise.all([
import("/docs/transformers/main/ro/_app/immutable/entry/start.C4V7Noit.js"),
import("/docs/transformers/main/ro/_app/immutable/entry/app.BwPDdXcs.js")
]).then(([kit, app]) => {
// Hands the app module and the target element to the start module.
// NOTE(review): the option semantics below are defined by the imported start
// module, not by this file — confirm against the SvelteKit version in use.
kit.start(app, element, {
// presumably the layout/page node indices to hydrate — TODO confirm
node_ids: [0, 33],
// one entry per node id above; both are null here
data: [null,null],
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
42.8 kB
·
Xet hash:
3d4efe7bd35b4928006aa1a53d3824b6dbe259ffa1d8f687511a89c297c3b84c

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.