Buckets:

hf-doc-build/doc-dev / transformers /pr_26617 /en /custom_tokenizers.html
rtrm's picture
download
raw
29.9 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Customizing tokenizers&quot;,&quot;local&quot;:&quot;customizing-tokenizers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Training a tokenizer&quot;,&quot;local&quot;:&quot;training-a-tokenizer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Custom vocabulary&quot;,&quot;local&quot;:&quot;custom-vocabulary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Subclassing TokenizersBackend&quot;,&quot;local&quot;:&quot;subclassing-tokenizersbackend&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link rel="preload" as="style" href="/docs/transformers/pr_26617/en/_app/immutable/assets/0.e3b0c442.css">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/entry/start.b5ae2c21.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/scheduler.31fdf58d.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/singletons.512cdb48.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/index.252883d5.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/paths.81255c3b.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/entry/app.9acf2c3e.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/preload-helper.bb442aeb.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/index.2f76fdf0.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/nodes/0.da6b3909.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/nodes/34.9882e5e3.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/CopyLLMTxtMenu.a69e059a.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.e4c7f916.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/IconCopy.ac192424.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/CodeBlock.ab12f8e1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Customizing tokenizers&quot;,&quot;local&quot;:&quot;customizing-tokenizers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Training a tokenizer&quot;,&quot;local&quot;:&quot;training-a-tokenizer&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Custom vocabulary&quot;,&quot;local&quot;:&quot;custom-vocabulary&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Subclassing TokenizersBackend&quot;,&quot;local&quot;:&quot;subclassing-tokenizersbackend&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> 
<span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="customizing-tokenizers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#customizing-tokenizers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Customizing tokenizers</span></h1> <p data-svelte-h="svelte-1bxdeu2">Tokenizers are decoupled from their learned vocabularies. 
This allows you to initialize an empty tokenizer for training or create one directly with your own vocabulary. The underlying tokenization pipeline remains the same (normalizer, pre-tokenizer, tokenization algorithm) so you don’t need to recreate it from scratch.</p> <p data-svelte-h="svelte-1h6lkpz">This guide shows how to train and create a custom tokenizer.</p> <h2 class="relative group"><a id="training-a-tokenizer" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-a-tokenizer"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training a tokenizer</span></h2> <p data-svelte-h="svelte-11ou8ab">An empty trainable tokenizer replaces the vocabulary with a new target vocabulary. 
This is useful for adapting to a new domain like finance, a low-resource language, or code.</p> <p data-svelte-h="svelte-1tgyxu9">Create an empty tokenizer and load a dataset.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> GemmaTokenizer
tokenizer = GemmaTokenizer()
dataset = load_dataset(<span class="hljs-string">&quot;Josephgflowers/Finance-Instruct-500k&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1w3iril">Use the <a href="/docs/transformers/pr_26617/en/main_classes/tokenizer#transformers.TokenizersBackend.train_new_from_iterator">TokenizersBackend.train_new_from_iterator()</a> method to train the tokenizer. This method accepts a generator function to return chunks of text from the dataset instead of loading everything into memory at once. The <code>vocab_size</code> argument sets the tokenizer's vocabulary size.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">batch_iterator</span>(<span class="hljs-params">batch_size=<span 
class="hljs-number">1000</span></span>):
    <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(dataset), batch_size):
        <span class="hljs-keyword">yield</span> dataset[i : i + batch_size][<span class="hljs-string">&quot;assistant&quot;</span>]
trained_tokenizer = tokenizer.train_new_from_iterator(
    batch_iterator(),
    vocab_size=<span class="hljs-number">32000</span>,
)
encoded = trained_tokenizer(<span class="hljs-string">&quot;The stock market rallied today.&quot;</span>)
<span class="hljs-built_in">print</span>(encoded[<span class="hljs-string">&quot;input_ids&quot;</span>])
[<span class="hljs-number">5866</span>, <span class="hljs-number">11503</span>, <span class="hljs-number">98</span>, <span class="hljs-number">5885</span>, <span class="hljs-number">8617</span>, <span class="hljs-number">13381</span>, <span class="hljs-number">30</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-psyjbb">Add new special tokens with the <code>new_special_tokens</code> argument or use <code>special_tokens_map</code> to rename the old special tokens to the new special tokens.</p> <p data-svelte-h="svelte-4qu3i8">Save the new finance tokenizer with <a href="/docs/transformers/pr_26617/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.save_pretrained">save_pretrained()</a> or save and upload it to the Hub with <a href="/docs/transformers/pr_26617/en/main_classes/model#transformers.utils.PushToHubMixin.push_to_hub">push_to_hub()</a>. This creates a <code>tokenizer.json</code> file that captures the newly trained vocabulary, merge rules, and full pipeline configuration.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div 
class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trained_tokenizer.save_pretrained(<span class="hljs-string">&quot;./finance-gemma-tokenizer&quot;</span>)
trained_tokenizer.push_to_hub(<span class="hljs-string">&quot;finance-gemma-tokenizer&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="custom-vocabulary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#custom-vocabulary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Custom vocabulary</span></h2> <p data-svelte-h="svelte-ankuo6">An empty tokenizer supports custom vocabulary with the <code>vocab</code> and <code>merges</code> arguments.</p> <ul data-svelte-h="svelte-imk7fb"><li><code>vocab</code> is the complete set of tokens a tokenizer knows and each entry maps a token to its input id.</li> <li><code>merges</code> defines how the BPE algorithm should combine adjacent tokens.</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" 
width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> GemmaTokenizer
vocab={
<span class="hljs-string">&quot;&lt;pad&gt;&quot;</span>: <span class="hljs-number">0</span>,
<span class="hljs-string">&quot;&lt;/s&gt;&quot;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&quot;&lt;s&gt;&quot;</span>: <span class="hljs-number">2</span>,
<span class="hljs-string">&quot;&lt;unk&gt;&quot;</span>: <span class="hljs-number">3</span>,
<span class="hljs-string">&quot;&lt;mask&gt;&quot;</span>: <span class="hljs-number">4</span>,
<span class="hljs-string">&quot;▁the&quot;</span>: <span class="hljs-number">5</span>,
<span class="hljs-string">&quot;▁stock&quot;</span>: <span class="hljs-number">6</span>,
<span class="hljs-string">&quot;▁market&quot;</span>: <span class="hljs-number">7</span>,
<span class="hljs-string">&quot;▁&quot;</span>: <span class="hljs-number">8</span>,
<span class="hljs-string">&quot;r&quot;</span>: <span class="hljs-number">9</span>,
<span class="hljs-string">&quot;a&quot;</span>: <span class="hljs-number">10</span>,
<span class="hljs-string">&quot;l&quot;</span>: <span class="hljs-number">11</span>,
<span class="hljs-string">&quot;i&quot;</span>: <span class="hljs-number">12</span>,
<span class="hljs-string">&quot;e&quot;</span>: <span class="hljs-number">13</span>,
<span class="hljs-string">&quot;d&quot;</span>: <span class="hljs-number">14</span>,
<span class="hljs-string">&quot;ra&quot;</span>: <span class="hljs-number">15</span>,
<span class="hljs-string">&quot;li&quot;</span>: <span class="hljs-number">16</span>,
<span class="hljs-string">&quot;lie&quot;</span>: <span class="hljs-number">17</span>,
<span class="hljs-string">&quot;lied&quot;</span>: <span class="hljs-number">18</span>,
<span class="hljs-string">&quot;ral&quot;</span>: <span class="hljs-number">19</span>,
<span class="hljs-string">&quot;ralli&quot;</span>: <span class="hljs-number">20</span>,
<span class="hljs-string">&quot;rallie&quot;</span>: <span class="hljs-number">21</span>,
<span class="hljs-string">&quot;rallied&quot;</span>: <span class="hljs-number">22</span>,
}
merges=[
(<span class="hljs-string">&quot;r&quot;</span>, <span class="hljs-string">&quot;a&quot;</span>), <span class="hljs-comment"># r + a → ra</span>
(<span class="hljs-string">&quot;l&quot;</span>, <span class="hljs-string">&quot;i&quot;</span>), <span class="hljs-comment"># l + i → li</span>
(<span class="hljs-string">&quot;li&quot;</span>, <span class="hljs-string">&quot;e&quot;</span>), <span class="hljs-comment"># li + e → lie</span>
(<span class="hljs-string">&quot;lie&quot;</span>, <span class="hljs-string">&quot;d&quot;</span>), <span class="hljs-comment"># lie + d → lied</span>
(<span class="hljs-string">&quot;ra&quot;</span>, <span class="hljs-string">&quot;l&quot;</span>), <span class="hljs-comment"># ra + l → ral</span>
(<span class="hljs-string">&quot;ral&quot;</span>, <span class="hljs-string">&quot;li&quot;</span>), <span class="hljs-comment"># ral + li → ralli</span>
(<span class="hljs-string">&quot;ralli&quot;</span>, <span class="hljs-string">&quot;e&quot;</span>), <span class="hljs-comment"># ralli + e → rallie</span>
(<span class="hljs-string">&quot;rallie&quot;</span>, <span class="hljs-string">&quot;d&quot;</span>), <span class="hljs-comment"># rallie + d → rallied</span>
]
tokenizer = GemmaTokenizer(vocab=vocab, merges=merges)
encoded = tokenizer(<span class="hljs-string">&quot;the stock market rallied&quot;</span>)
<span class="hljs-built_in">print</span>(encoded[<span class="hljs-string">&quot;input_ids&quot;</span>])<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="subclassing-tokenizersbackend" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#subclassing-tokenizersbackend"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Subclassing TokenizersBackend</span></h2> <p data-svelte-h="svelte-64cic4">Tokenizers support four different <a href="./fast_tokenizers#backends">backends</a>. Generally, you should use the <a href="/docs/transformers/pr_26617/en/main_classes/tokenizer#transformers.TokenizersBackend">TokenizersBackend</a> to define a new tokenizer because it’s faster.</p> <blockquote class="tip" data-svelte-h="svelte-158z244"><p>The <a href="/docs/transformers/pr_26617/en/main_classes/tokenizer#transformers.PythonBackend">PythonBackend</a> is a pure Python tokenizer that does not rely on backends like Rust, SentencePiece, or mistral-common. 
You should only use <a href="/docs/transformers/pr_26617/en/main_classes/tokenizer#transformers.PythonBackend">PythonBackend</a> if you’re building a very specialized tokenizer that can’t be expressed by the Rust backend.</p></blockquote> <ol data-svelte-h="svelte-1kzkrob"><li>Subclass the <a href="/docs/transformers/pr_26617/en/main_classes/tokenizer#transformers.TokenizersBackend">TokenizersBackend</a> with class attributes like padding side and the tokenization algorithm to use.</li> <li>Define the tokenization pipeline in the <code>__init__</code>. This includes the tokenization algorithm to use, how to split the raw text before the algorithm, and how to decode the tokens back to text.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> tokenizers <span 
class="hljs-keyword">import</span> Tokenizer, decoders, pre_tokenizers
<span class="hljs-keyword">from</span> tokenizers.models <span class="hljs-keyword">import</span> BPE
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TokenizersBackend
<span class="hljs-keyword">class</span> <span class="hljs-title class_">NewTokenizer</span>(<span class="hljs-title class_ inherited__">TokenizersBackend</span>):
    padding_side = <span class="hljs-string">&quot;left&quot;</span>
    model = BPE
    <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">
        self,
        vocab=<span class="hljs-literal">None</span>,
        merges=<span class="hljs-literal">None</span>,
        unk_token=<span class="hljs-string">&quot;&lt;unk&gt;&quot;</span>,
        bos_token=<span class="hljs-string">&quot;&lt;s&gt;&quot;</span>,
        eos_token=<span class="hljs-string">&quot;&lt;/s&gt;&quot;</span>,
        pad_token=<span class="hljs-string">&quot;&lt;pad&gt;&quot;</span>,
    </span>):
        self._vocab = vocab <span class="hljs-keyword">or</span> {
            <span class="hljs-built_in">str</span>(unk_token): <span class="hljs-number">0</span>,
            <span class="hljs-built_in">str</span>(bos_token): <span class="hljs-number">1</span>,
            <span class="hljs-built_in">str</span>(eos_token): <span class="hljs-number">2</span>,
            <span class="hljs-built_in">str</span>(pad_token): <span class="hljs-number">3</span>,
        }
        self._merges = merges <span class="hljs-keyword">or</span> []
        self._tokenizer = Tokenizer(
            BPE(vocab=self._vocab, merges=self._merges, fuse_unk=<span class="hljs-literal">True</span>)
        )
        self._tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=<span class="hljs-literal">False</span>)
        self._tokenizer.decoder = decoders.ByteLevel()
        <span class="hljs-built_in">super</span>().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-skf17z">Train or save the new empty tokenizer.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer = NewTokenizer()
<span class="hljs-comment"># train on new corpus</span>
tokenizer.train_new_from_iterator()
<span class="hljs-comment"># save tokenizer</span>
tokenizer.save_pretrained(<span class="hljs-string">&quot;./new-tokenizer&quot;</span>)<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/custom_tokenizers.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Client bootstrap: publishes the path configuration on a global, then loads
// the two entry modules in parallel and starts the app on this script's parent.
{
// Assigned without a declaration, so in this classic (non-module, non-strict)
// script it becomes a property of the global object.
// NOTE(review): presumably read by the start entry module — confirm.
__sveltekit_1x0t0ja = {
assets: "/docs/transformers/pr_26617/en",
base: "/docs/transformers/pr_26617/en",
env: {}
};
// Captured synchronously: document.currentScript is only available while the
// script is executing, not later inside the promise callback below.
const element = document.currentScript.parentElement;
// Two null entries, the same count as node_ids below.
// NOTE(review): presumably one data slot per node — confirm.
const data = [null,null];
// Both entry modules are fetched concurrently; start runs only once both resolve.
Promise.all([
import("/docs/transformers/pr_26617/en/_app/immutable/entry/start.b5ae2c21.js"),
import("/docs/transformers/pr_26617/en/_app/immutable/entry/app.9acf2c3e.js")
]).then(([kit, app]) => {
// Hand the app module, the mount element and the initial page state to start().
kit.start(app, element, {
// The ids match the nodes/0 and nodes/34 modules preloaded earlier in this document.
node_ids: [0, 34],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
29.9 kB
·
Xet hash:
b903aadaaaba254fa67606d46ce18e4809fa5d302768282be836f6e43d41e12b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.