Buckets:

rtrm's picture
download
raw
38.7 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Components&quot;,&quot;local&quot;:&quot;components&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Normalizers&quot;,&quot;local&quot;:&quot;normalizers&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Pre-tokenizers&quot;,&quot;local&quot;:&quot;pre-tokenizers&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Models&quot;,&quot;local&quot;:&quot;models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Post-Processors&quot;,&quot;local&quot;:&quot;post-processors&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Decoders&quot;,&quot;local&quot;:&quot;decoders&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/tokenizers/pr_2012/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/entry/start.82c81cf3.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/scheduler.7c59faff.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/singletons.48f81268.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/index.adabef24.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/paths.7e5a9ab3.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/entry/app.cac551ee.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/preload-helper.cd69f041.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/index.09bb5655.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/nodes/0.5c29e5da.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/nodes/14.b10e0726.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.0ba496c3.js">
<link rel="modulepreload" href="/docs/tokenizers/pr_2012/en/_app/immutable/chunks/TokenizersLanguageContent.0fc17a7a.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Components&quot;,&quot;local&quot;:&quot;components&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Normalizers&quot;,&quot;local&quot;:&quot;normalizers&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Pre-tokenizers&quot;,&quot;local&quot;:&quot;pre-tokenizers&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Models&quot;,&quot;local&quot;:&quot;models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Post-Processors&quot;,&quot;local&quot;:&quot;post-processors&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Decoders&quot;,&quot;local&quot;:&quot;decoders&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="components" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#components"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>Components</span></h1> <p data-svelte-h="svelte-mynvre">When building a Tokenizer, you can attach various types of components to
this Tokenizer in order to customize its behavior. This page lists most
provided components.</p> <h2 class="relative group"><a id="normalizers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#normalizers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Normalizers</span></h2> <p data-svelte-h="svelte-f3xmjh">A <code>Normalizer</code> is in charge of pre-processing the input string in order
to normalize it as relevant for a given use case. Some common examples
of normalization are the Unicode normalization algorithms (NFD, NFKD,
NFC &amp; NFKC), lowercasing etc… The specificity of <code>tokenizers</code> is that
we keep track of the alignment while normalizing. This is essential to
allow mapping from the generated tokens back to the input text.</p> <p data-svelte-h="svelte-1ipyvu9">The <code>Normalizer</code> is optional.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M15.84.5a16.4,16.4,0,0,0-3.57.32C9.1,1.39,8.53,2.53,8.53,4.64V7.48H16v1H5.77a4.73,4.73,0,0,0-4.7,3.74,14.82,14.82,0,0,0,0,7.54c.57,2.28,1.86,3.82,4,3.82h2.6V20.14a4.73,4.73,0,0,1,4.63-4.63h7.38a3.72,3.72,0,0,0,3.73-3.73V4.64A4.16,4.16,0,0,0,19.65.82,20.49,20.49,0,0,0,15.84.5ZM11.78,2.77a1.39,1.39,0,0,1,1.38,1.46,1.37,1.37,0,0,1-1.38,1.38A1.42,1.42,0,0,1,10.4,4.23,1.44,1.44,0,0,1,11.78,2.77Z" fill="#5a9fd4"></path><path d="M16.16,31.5a16.4,16.4,0,0,0,3.57-.32c3.17-.57,3.74-1.71,3.74-3.82V24.52H16v-1H26.23a4.73,4.73,0,0,0,4.7-3.74,14.82,14.82,0,0,0,0-7.54c-.57-2.28-1.86-3.82-4-3.82h-2.6v3.41a4.73,4.73,0,0,1-4.63,4.63H12.35a3.72,3.72,0,0,0-3.73,3.73v7.14a4.16,4.16,0,0,0,3.73,3.82A20.49,20.49,0,0,0,16.16,31.5Zm4.06-2.27a1.39,1.39,0,0,1-1.38-1.46,1.37,1.37,0,0,1,1.38-1.38,1.42,1.42,0,0,1,1.38,1.38A1.44,1.44,0,0,1,20.22,29.23Z" fill="#ffd43b"></path></svg> Python </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" preserveAspectRatio="xMidYMid meet" width="1em" height="1em" viewBox="0 0 32 32"><path 
d="M31.77,15.61l-1.34-.83c0-.13,0-.26,0-.39l1.16-1.08a.46.46,0,0,0,.14-.43.44.44,0,0,0-.29-.34L29.92,12l-.12-.38.92-1.28a.46.46,0,0,0,.06-.45.47.47,0,0,0-.36-.28l-1.55-.25L28.68,9l.66-1.44a.48.48,0,0,0,0-.45.46.46,0,0,0-.4-.2L27.32,7l-.25-.3.36-1.54a.46.46,0,0,0-.12-.43.46.46,0,0,0-.43-.13l-1.54.37L25,4.68l.06-1.58a.44.44,0,0,0-.21-.4.45.45,0,0,0-.45,0L23,3.32l-.35-.19L22.4,1.57a.46.46,0,0,0-.28-.35.48.48,0,0,0-.45.05l-1.28.92L20,2.08,19.46.6a.44.44,0,0,0-.34-.29.46.46,0,0,0-.43.14L17.62,1.6l-.39,0L16.39.22a.46.46,0,0,0-.78,0l-.83,1.34-.39,0L13.31.45a.46.46,0,0,0-.43-.14.44.44,0,0,0-.34.29L12,2.08l-.38.11-1.28-.92a.48.48,0,0,0-.45-.05.5.5,0,0,0-.28.35L9.35,3.13,9,3.32,7.57,2.66a.45.45,0,0,0-.45,0,.49.49,0,0,0-.21.4L7,4.68l-.31.25L5.13,4.56a.48.48,0,0,0-.44.13.46.46,0,0,0-.12.43l.36,1.54L4.68,7l-1.58,0a.46.46,0,0,0-.4.2.48.48,0,0,0,0,.45L3.32,9l-.19.35L1.57,9.6a.47.47,0,0,0-.35.28.48.48,0,0,0,.05.45l.92,1.28c0,.12-.07.25-.11.38L.6,12.54a.44.44,0,0,0-.29.34.46.46,0,0,0,.14.43L1.6,14.39l0,.39-1.35.83a.47.47,0,0,0,0,.78l1.35.84,0,.39L.45,18.69a.46.46,0,0,0-.14.43.44.44,0,0,0,.29.34L2.08,20c0,.13.07.26.11.39l-.92,1.28a.46.46,0,0,0-.05.44.45.45,0,0,0,.36.28l1.55.25.19.35-.65,1.44a.45.45,0,0,0,.43.65L4.68,25l.25.3-.36,1.54a.46.46,0,0,0,.12.43.48.48,0,0,0,.44.12l1.54-.36.3.25L6.91,28.9a.49.49,0,0,0,.21.4.48.48,0,0,0,.45,0L9,28.68l.35.19.26,1.56a.46.46,0,0,0,.27.35.48.48,0,0,0,.45-.05l1.28-.92.38.12.55,1.47a.47.47,0,0,0,.34.29.46.46,0,0,0,.43-.13l1.08-1.16.39,0,.83,1.34A.46.46,0,0,0,16,32a.47.47,0,0,0,.4-.22l.83-1.34.39,0,1.08,1.16a.46.46,0,0,0,.43.13.47.47,0,0,0,.34-.29L20,29.93l.38-.12,1.28.92a.48.48,0,0,0,.45.05.45.45,0,0,0,.27-.35l.26-1.56.35-.19,1.43.66a.48.48,0,0,0,.45,0,.49.49,0,0,0,.21-.4L25,27.32l.3-.25,1.54.36a.48.48,0,0,0,.44-.12.46.46,0,0,0,.12-.43l-.36-1.54.25-.3,1.58.05a.45.45,0,0,0,.43-.65L28.69,23l.19-.35,1.55-.25a.45.45,0,0,0,.36-.28.43.43,0,0,0-.06-.44l-.92-1.28.12-.39,1.48-.55a.44.44,0,0,0,.29-.34.46.46,0,0,0-.14-.43L30.4,17.62c0-.13,0-.26,0-.39l1.34-.84a.
46.46,0,0,0,0-.78Zm-9,11.16A1,1,0,1,1,23.92,26a.95.95,0,0,1-1.14.73Zm-.45-3.09a.87.87,0,0,0-1,.67l-.48,2.22a11.74,11.74,0,0,1-9.75,0l-.48-2.23a.85.85,0,0,0-1-.66l-2,.42a14.67,14.67,0,0,1-1-1.2h9.58c.1,0,.18,0,.18-.12V19.35c0-.1-.08-.12-.18-.12h-2.8V17.08h3a1.9,1.9,0,0,1,1.86,1.62c.12.47.39,2,.57,2.5s.91,1.65,1.69,1.65h4.77l.17,0a11,11,0,0,1-1.08,1.27l-2-.43Zm-13.24,3A.94.94,0,0,1,8,26a1,1,0,1,1,1.13.73ZM5.45,12a1,1,0,0,1-1.74.77,1,1,0,0,1,.49-1.26A1,1,0,0,1,5.45,12ZM4.33,14.66l2.05-.91a.87.87,0,0,0,.44-1.15l-.42-.95H8.06v7.46H4.73a11.37,11.37,0,0,1-.45-3.21,10.41,10.41,0,0,1,.07-1.26Zm9-.73v-2.2h3.95c.2,0,1.44.24,1.44,1.16,0,.77-.95,1-1.73,1H13.32Zm14.34,2q0,.45,0,.87h-1.2c-.12,0-.17.08-.17.2v.55c0,1.3-.73,1.58-1.37,1.65s-1.29-.25-1.37-.63a4.13,4.13,0,0,0-1.91-3.21C22.79,14.59,24,13.49,24,12a3.76,3.76,0,0,0-1.83-3.09,5.22,5.22,0,0,0-2.52-.83H7.25a11.79,11.79,0,0,1,6.54-3.7l1.47,1.54a.87.87,0,0,0,1.22,0l1.64-1.57a11.69,11.69,0,0,1,8,5.72L25,12.64a.87.87,0,0,0,.44,1.14l2.16,1a11.46,11.46,0,0,1,.06,1.17ZM15.25,3.1a1,1,0,0,1,1.34,0,1,1,0,0,1,0,1.35,1,1,0,0,1-1.34,0,1,1,0,0,1,0-1.35Zm11.13,9a.94.94,0,0,1,1.25-.48,1,1,0,1,1-1.25.48Z" fill="currentColor"></path></svg> Rust </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 289"><path fill="#539E43" d="M128 288.464c-3.975 0-7.685-1.06-11.13-2.915l-35.247-20.936c-5.3-2.915-2.65-3.975-1.06-4.505c7.155-2.385 8.48-2.915 15.9-7.156c.796-.53 1.856-.265 2.65.265l27.032 16.166c1.06.53 2.385.53 3.18 0l105.74-61.217c1.06-.53 1.59-1.59 1.59-2.915V83.08c0-1.325-.53-2.385-1.59-2.915l-105.74-60.953c-1.06-.53-2.385-.53-3.18 0L20.405 80.166c-1.06.53-1.59 1.855-1.59 2.915v122.17c0 1.06.53 
2.385 1.59 2.915l28.887 16.695c15.636 7.95 25.44-1.325 25.44-10.6V93.68c0-1.59 1.326-3.18 3.181-3.18h13.516c1.59 0 3.18 1.325 3.18 3.18v120.58c0 20.936-11.396 33.126-31.272 33.126c-6.095 0-10.865 0-24.38-6.625l-27.827-15.9C4.24 220.885 0 213.465 0 205.515V83.346C0 75.396 4.24 67.976 11.13 64L116.87 2.783c6.625-3.71 15.635-3.71 22.26 0L244.87 64C251.76 67.975 256 75.395 256 83.346v122.17c0 7.95-4.24 15.37-11.13 19.345L139.13 286.08c-3.445 1.59-7.42 2.385-11.13 2.385Zm32.596-84.009c-46.377 0-55.917-21.2-55.917-39.221c0-1.59 1.325-3.18 3.18-3.18h13.78c1.59 0 2.916 1.06 2.916 2.65c2.12 14.045 8.215 20.936 36.306 20.936c22.261 0 31.802-5.035 31.802-16.96c0-6.891-2.65-11.926-37.367-15.372c-28.886-2.915-46.907-9.275-46.907-32.33c0-21.467 18.02-34.187 48.232-34.187c33.921 0 50.617 11.66 52.737 37.101c0 .795-.265 1.59-.795 2.385c-.53.53-1.325 1.06-2.12 1.06h-13.78c-1.326 0-2.65-1.06-2.916-2.385c-3.18-14.575-11.395-19.345-33.126-19.345c-24.38 0-27.296 8.48-27.296 14.84c0 7.686 3.445 10.07 36.306 14.31c32.597 4.24 47.967 10.336 47.967 33.127c-.265 23.321-19.345 36.571-53.002 36.571Z"></path></svg> Node </div></div> <div class="language-select"> <table data-svelte-h="svelte-5tel5o"><thead><tr><th align="left">Name</th> <th align="left">Description</th> <th align="left">Example</th></tr></thead> <tbody><tr><td align="left">NFD</td> <td align="left">NFD unicode normalization</td> <td align="left"></td></tr> <tr><td align="left">NFKD</td> <td align="left">NFKD unicode normalization</td> <td align="left"></td></tr> <tr><td align="left">NFC</td> <td align="left">NFC unicode normalization</td> <td align="left"></td></tr> <tr><td align="left">NFKC</td> <td align="left">NFKC unicode normalization</td> <td align="left"></td></tr> <tr><td align="left">Lowercase</td> <td align="left">Replaces all uppercase characters with their lowercase equivalents</td> <td align="left">Input: <code>HELLO ὈΔΥΣΣΕΎΣ</code> <br> Output: <code>hello ὀδυσσεύς</code></td></tr> <tr><td align="left">Strip</td> <td align="left">Removes all 
whitespace characters on the specified sides (left, right or both) of the input</td> <td align="left">Input: <code>&quot; hi &quot;</code> <br> Output: <code>&quot;hi&quot;</code></td></tr> <tr><td align="left">StripAccents</td> <td align="left">Removes all accent symbols in unicode (to be used with NFD for consistency)</td> <td align="left">Input: <code>é</code> <br> Output: <code>e</code></td></tr> <tr><td align="left">Replace</td> <td align="left">Replaces every match of a custom string or regexp with the given content</td> <td align="left"><code>Replace(&quot;a&quot;, &quot;e&quot;)</code> will behave like this: <br> Input: <code>&quot;banana&quot;</code> <br> Output: <code>&quot;benene&quot;</code></td></tr> <tr><td align="left">BertNormalizer</td> <td align="left">Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul><li>clean_text</li> <li>handle_chinese_chars</li> <li>strip_accents</li> <li>lowercase</li></ul></td> <td align="left"></td></tr> <tr><td align="left">Sequence</td> <td align="left">Composes multiple normalizers that will run in the provided order</td> <td align="left"><code>Sequence([NFKC(), Lowercase()])</code></td></tr></tbody></table> </div> <h2 class="relative group"><a id="pre-tokenizers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pre-tokenizers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 
0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Pre-tokenizers</span></h2> <p data-svelte-h="svelte-j4kxjn">The <code>PreTokenizer</code> takes care of splitting the input according to a set
of rules. This pre-processing lets you ensure that the underlying
<code>Model</code> does not build tokens across multiple “splits”. For example if
you don’t want to have whitespaces inside a token, then you can have a
<code>PreTokenizer</code> that splits on these whitespaces.</p> <p data-svelte-h="svelte-jacdwl">You can easily combine multiple <code>PreTokenizer</code> together using a
<code>Sequence</code> (see below). The <code>PreTokenizer</code> is also allowed to modify the
string, just like a <code>Normalizer</code> does. This is necessary to allow some
complicated algorithms that require splitting before normalizing (e.g.
the ByteLevel)</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M15.84.5a16.4,16.4,0,0,0-3.57.32C9.1,1.39,8.53,2.53,8.53,4.64V7.48H16v1H5.77a4.73,4.73,0,0,0-4.7,3.74,14.82,14.82,0,0,0,0,7.54c.57,2.28,1.86,3.82,4,3.82h2.6V20.14a4.73,4.73,0,0,1,4.63-4.63h7.38a3.72,3.72,0,0,0,3.73-3.73V4.64A4.16,4.16,0,0,0,19.65.82,20.49,20.49,0,0,0,15.84.5ZM11.78,2.77a1.39,1.39,0,0,1,1.38,1.46,1.37,1.37,0,0,1-1.38,1.38A1.42,1.42,0,0,1,10.4,4.23,1.44,1.44,0,0,1,11.78,2.77Z" fill="#5a9fd4"></path><path d="M16.16,31.5a16.4,16.4,0,0,0,3.57-.32c3.17-.57,3.74-1.71,3.74-3.82V24.52H16v-1H26.23a4.73,4.73,0,0,0,4.7-3.74,14.82,14.82,0,0,0,0-7.54c-.57-2.28-1.86-3.82-4-3.82h-2.6v3.41a4.73,4.73,0,0,1-4.63,4.63H12.35a3.72,3.72,0,0,0-3.73,3.73v7.14a4.16,4.16,0,0,0,3.73,3.82A20.49,20.49,0,0,0,16.16,31.5Zm4.06-2.27a1.39,1.39,0,0,1-1.38-1.46,1.37,1.37,0,0,1,1.38-1.38,1.42,1.42,0,0,1,1.38,1.38A1.44,1.44,0,0,1,20.22,29.23Z" fill="#ffd43b"></path></svg> Python </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" preserveAspectRatio="xMidYMid meet" width="1em" height="1em" viewBox="0 0 32 32"><path 
d="M31.77,15.61l-1.34-.83c0-.13,0-.26,0-.39l1.16-1.08a.46.46,0,0,0,.14-.43.44.44,0,0,0-.29-.34L29.92,12l-.12-.38.92-1.28a.46.46,0,0,0,.06-.45.47.47,0,0,0-.36-.28l-1.55-.25L28.68,9l.66-1.44a.48.48,0,0,0,0-.45.46.46,0,0,0-.4-.2L27.32,7l-.25-.3.36-1.54a.46.46,0,0,0-.12-.43.46.46,0,0,0-.43-.13l-1.54.37L25,4.68l.06-1.58a.44.44,0,0,0-.21-.4.45.45,0,0,0-.45,0L23,3.32l-.35-.19L22.4,1.57a.46.46,0,0,0-.28-.35.48.48,0,0,0-.45.05l-1.28.92L20,2.08,19.46.6a.44.44,0,0,0-.34-.29.46.46,0,0,0-.43.14L17.62,1.6l-.39,0L16.39.22a.46.46,0,0,0-.78,0l-.83,1.34-.39,0L13.31.45a.46.46,0,0,0-.43-.14.44.44,0,0,0-.34.29L12,2.08l-.38.11-1.28-.92a.48.48,0,0,0-.45-.05.5.5,0,0,0-.28.35L9.35,3.13,9,3.32,7.57,2.66a.45.45,0,0,0-.45,0,.49.49,0,0,0-.21.4L7,4.68l-.31.25L5.13,4.56a.48.48,0,0,0-.44.13.46.46,0,0,0-.12.43l.36,1.54L4.68,7l-1.58,0a.46.46,0,0,0-.4.2.48.48,0,0,0,0,.45L3.32,9l-.19.35L1.57,9.6a.47.47,0,0,0-.35.28.48.48,0,0,0,.05.45l.92,1.28c0,.12-.07.25-.11.38L.6,12.54a.44.44,0,0,0-.29.34.46.46,0,0,0,.14.43L1.6,14.39l0,.39-1.35.83a.47.47,0,0,0,0,.78l1.35.84,0,.39L.45,18.69a.46.46,0,0,0-.14.43.44.44,0,0,0,.29.34L2.08,20c0,.13.07.26.11.39l-.92,1.28a.46.46,0,0,0-.05.44.45.45,0,0,0,.36.28l1.55.25.19.35-.65,1.44a.45.45,0,0,0,.43.65L4.68,25l.25.3-.36,1.54a.46.46,0,0,0,.12.43.48.48,0,0,0,.44.12l1.54-.36.3.25L6.91,28.9a.49.49,0,0,0,.21.4.48.48,0,0,0,.45,0L9,28.68l.35.19.26,1.56a.46.46,0,0,0,.27.35.48.48,0,0,0,.45-.05l1.28-.92.38.12.55,1.47a.47.47,0,0,0,.34.29.46.46,0,0,0,.43-.13l1.08-1.16.39,0,.83,1.34A.46.46,0,0,0,16,32a.47.47,0,0,0,.4-.22l.83-1.34.39,0,1.08,1.16a.46.46,0,0,0,.43.13.47.47,0,0,0,.34-.29L20,29.93l.38-.12,1.28.92a.48.48,0,0,0,.45.05.45.45,0,0,0,.27-.35l.26-1.56.35-.19,1.43.66a.48.48,0,0,0,.45,0,.49.49,0,0,0,.21-.4L25,27.32l.3-.25,1.54.36a.48.48,0,0,0,.44-.12.46.46,0,0,0,.12-.43l-.36-1.54.25-.3,1.58.05a.45.45,0,0,0,.43-.65L28.69,23l.19-.35,1.55-.25a.45.45,0,0,0,.36-.28.43.43,0,0,0-.06-.44l-.92-1.28.12-.39,1.48-.55a.44.44,0,0,0,.29-.34.46.46,0,0,0-.14-.43L30.4,17.62c0-.13,0-.26,0-.39l1.34-.84a.
46.46,0,0,0,0-.78Zm-9,11.16A1,1,0,1,1,23.92,26a.95.95,0,0,1-1.14.73Zm-.45-3.09a.87.87,0,0,0-1,.67l-.48,2.22a11.74,11.74,0,0,1-9.75,0l-.48-2.23a.85.85,0,0,0-1-.66l-2,.42a14.67,14.67,0,0,1-1-1.2h9.58c.1,0,.18,0,.18-.12V19.35c0-.1-.08-.12-.18-.12h-2.8V17.08h3a1.9,1.9,0,0,1,1.86,1.62c.12.47.39,2,.57,2.5s.91,1.65,1.69,1.65h4.77l.17,0a11,11,0,0,1-1.08,1.27l-2-.43Zm-13.24,3A.94.94,0,0,1,8,26a1,1,0,1,1,1.13.73ZM5.45,12a1,1,0,0,1-1.74.77,1,1,0,0,1,.49-1.26A1,1,0,0,1,5.45,12ZM4.33,14.66l2.05-.91a.87.87,0,0,0,.44-1.15l-.42-.95H8.06v7.46H4.73a11.37,11.37,0,0,1-.45-3.21,10.41,10.41,0,0,1,.07-1.26Zm9-.73v-2.2h3.95c.2,0,1.44.24,1.44,1.16,0,.77-.95,1-1.73,1H13.32Zm14.34,2q0,.45,0,.87h-1.2c-.12,0-.17.08-.17.2v.55c0,1.3-.73,1.58-1.37,1.65s-1.29-.25-1.37-.63a4.13,4.13,0,0,0-1.91-3.21C22.79,14.59,24,13.49,24,12a3.76,3.76,0,0,0-1.83-3.09,5.22,5.22,0,0,0-2.52-.83H7.25a11.79,11.79,0,0,1,6.54-3.7l1.47,1.54a.87.87,0,0,0,1.22,0l1.64-1.57a11.69,11.69,0,0,1,8,5.72L25,12.64a.87.87,0,0,0,.44,1.14l2.16,1a11.46,11.46,0,0,1,.06,1.17ZM15.25,3.1a1,1,0,0,1,1.34,0,1,1,0,0,1,0,1.35,1,1,0,0,1-1.34,0,1,1,0,0,1,0-1.35Zm11.13,9a.94.94,0,0,1,1.25-.48,1,1,0,1,1-1.25.48Z" fill="currentColor"></path></svg> Rust </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm"><svg class="mr-1.5" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 289"><path fill="#539E43" d="M128 288.464c-3.975 0-7.685-1.06-11.13-2.915l-35.247-20.936c-5.3-2.915-2.65-3.975-1.06-4.505c7.155-2.385 8.48-2.915 15.9-7.156c.796-.53 1.856-.265 2.65.265l27.032 16.166c1.06.53 2.385.53 3.18 0l105.74-61.217c1.06-.53 1.59-1.59 1.59-2.915V83.08c0-1.325-.53-2.385-1.59-2.915l-105.74-60.953c-1.06-.53-2.385-.53-3.18 0L20.405 80.166c-1.06.53-1.59 1.855-1.59 2.915v122.17c0 1.06.53 
2.385 1.59 2.915l28.887 16.695c15.636 7.95 25.44-1.325 25.44-10.6V93.68c0-1.59 1.326-3.18 3.181-3.18h13.516c1.59 0 3.18 1.325 3.18 3.18v120.58c0 20.936-11.396 33.126-31.272 33.126c-6.095 0-10.865 0-24.38-6.625l-27.827-15.9C4.24 220.885 0 213.465 0 205.515V83.346C0 75.396 4.24 67.976 11.13 64L116.87 2.783c6.625-3.71 15.635-3.71 22.26 0L244.87 64C251.76 67.975 256 75.395 256 83.346v122.17c0 7.95-4.24 15.37-11.13 19.345L139.13 286.08c-3.445 1.59-7.42 2.385-11.13 2.385Zm32.596-84.009c-46.377 0-55.917-21.2-55.917-39.221c0-1.59 1.325-3.18 3.18-3.18h13.78c1.59 0 2.916 1.06 2.916 2.65c2.12 14.045 8.215 20.936 36.306 20.936c22.261 0 31.802-5.035 31.802-16.96c0-6.891-2.65-11.926-37.367-15.372c-28.886-2.915-46.907-9.275-46.907-32.33c0-21.467 18.02-34.187 48.232-34.187c33.921 0 50.617 11.66 52.737 37.101c0 .795-.265 1.59-.795 2.385c-.53.53-1.325 1.06-2.12 1.06h-13.78c-1.326 0-2.65-1.06-2.916-2.385c-3.18-14.575-11.395-19.345-33.126-19.345c-24.38 0-27.296 8.48-27.296 14.84c0 7.686 3.445 10.07 36.306 14.31c32.597 4.24 47.967 10.336 47.967 33.127c-.265 23.321-19.345 36.571-53.002 36.571Z"></path></svg> Node </div></div> <div class="language-select"> <table data-svelte-h="svelte-1p93mxq"><thead><tr><th align="left">Name</th> <th align="left">Description</th> <th align="left">Example</th></tr></thead> <tbody><tr><td align="left">ByteLevel</td> <td align="left">Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique has been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul><li>Since it maps on bytes, a tokenizer using this only requires <strong>256</strong> characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 
🎉🎉)</li> <li>For non-ASCII characters, it gets completely unreadable, but it works nonetheless!</li></ul></td> <td align="left">Input: <code>&quot;Hello my friend, how are you?&quot;</code> <br> Output: <code>&quot;Hello&quot;, &quot;Ġmy&quot;, &quot;Ġfriend&quot;, &quot;,&quot;, &quot;Ġhow&quot;, &quot;Ġare&quot;, &quot;Ġyou&quot;, &quot;?&quot;</code></td></tr> <tr><td align="left">Whitespace</td> <td align="left">Splits on word boundaries (using the following regular expression: <code>\w+|[^\w\s]+</code>)</td> <td align="left">Input: <code>&quot;Hello there!&quot;</code> <br> Output: <code>&quot;Hello&quot;, &quot;there&quot;, &quot;!&quot;</code></td></tr> <tr><td align="left">WhitespaceSplit</td> <td align="left">Splits on any whitespace character</td> <td align="left">Input: <code>&quot;Hello there!&quot;</code> <br> Output: <code>&quot;Hello&quot;, &quot;there!&quot;</code></td></tr> <tr><td align="left">Punctuation</td> <td align="left">Will isolate all punctuation characters</td> <td align="left">Input: <code>&quot;Hello?&quot;</code> <br> Output: <code>&quot;Hello&quot;, &quot;?&quot;</code></td></tr> <tr><td align="left">Metaspace</td> <td align="left">Splits on whitespaces and replaces them with a special char “▁” (U+2581)</td> <td align="left">Input: <code>&quot;Hello there&quot;</code> <br> Output: <code>&quot;Hello&quot;, &quot;▁there&quot;</code></td></tr> <tr><td align="left">CharDelimiterSplit</td> <td align="left">Splits on a given character</td> <td align="left">Example with <code>x</code>: <br> Input: <code>&quot;Helloxthere&quot;</code> <br> Output: <code>&quot;Hello&quot;, &quot;there&quot;</code></td></tr> <tr><td align="left">Digits</td> <td align="left">Splits the numbers from any other characters.</td> <td align="left">Input: <code>&quot;Hello123there&quot;</code> <br> Output: <code>&quot;Hello&quot;, &quot;123&quot;, &quot;there&quot;</code></td></tr> <tr><td align="left">Split</td> <td align="left">Versatile pre-tokenizer that splits 
on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul><li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>removed</li><li>isolated</li><li>merged_with_previous</li><li>merged_with_next</li><li>contiguous</li></ul></li> <li>invert should be a boolean flag.</li></ul></td> <td align="left">Example with pattern = <code>&quot; &quot;</code>, behavior = <code>&quot;isolated&quot;</code>, invert = <code>False</code>: <br> Input: <code>&quot;Hello, how are you?&quot;</code> <br> Output: <code>&quot;Hello,&quot;, &quot; &quot;, &quot;how&quot;, &quot; &quot;, &quot;are&quot;, &quot; &quot;, &quot;you?&quot;</code></td></tr> <tr><td align="left">Sequence</td> <td align="left">Lets you compose multiple <code>PreTokenizer</code> that will be run in the given order</td> <td align="left"><code>Sequence([Punctuation(), WhitespaceSplit()])</code></td></tr></tbody></table> </div> <h2 class="relative group"><a id="models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Models</span></h2> <p 
data-svelte-h="svelte-1x9obin">Models are the core algorithms used to actually tokenize, and therefore,
they are the only mandatory component of a Tokenizer.</p> <table data-svelte-h="svelte-1e1q21g"><thead><tr><th align="left">Name</th> <th align="left">Description</th></tr></thead> <tbody><tr><td align="left">WordLevel</td> <td align="left">This is the “classic” tokenization algorithm. It lets you simply map words to IDs without anything fancy. This has the advantage of being really simple to use and understand, but it requires extremely large vocabularies for good coverage. Using this <code>Model</code> requires the use of a <code>PreTokenizer</code>. No choice will be made by this model directly, it simply maps input tokens to IDs.</td></tr> <tr><td align="left">BPE</td> <td align="left">One of the most popular subword tokenization algorithms. The Byte-Pair-Encoding works by starting with characters, while merging those that are the most frequently seen together, thus creating new tokens. It then works iteratively to build new tokens out of the most frequent pairs it sees in a corpus. BPE is able to build words it has never seen by using multiple subword tokens, and thus requires smaller vocabularies, with fewer chances of having “unk” (unknown) tokens.</td></tr> <tr><td align="left">WordPiece</td> <td align="left">This is a subword tokenization algorithm quite similar to BPE, used mainly by Google in models like BERT. It uses a greedy algorithm that tries to build long words first, splitting into multiple tokens when entire words don’t exist in the vocabulary. This is different from BPE, which starts from characters, building bigger tokens where possible. It uses the famous <code>##</code> prefix to identify tokens that are part of a word (i.e. not starting a word).</td></tr> <tr><td align="left">Unigram</td> <td align="left">Unigram is also a subword tokenization algorithm, and works by trying to identify the best set of subword tokens to maximize the probability for a given sentence. 
This is different from BPE in the way that this is not deterministic based on a set of rules applied sequentially. Instead Unigram will be able to compute multiple ways of tokenizing, while choosing the most probable one.</td></tr></tbody></table> <h2 class="relative group"><a id="post-processors" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#post-processors"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Post-Processors</span></h2> <p data-svelte-h="svelte-w22x26">After the whole pipeline, we sometimes want to insert some special
tokens before feeding a tokenized string into a model like “[CLS] My
horse is amazing [SEP]”. The <code>PostProcessor</code> is the component doing
just that.</p> <table data-svelte-h="svelte-1gcblfi"><thead><tr><th align="left">Name</th> <th align="left">Description</th> <th align="left">Example</th></tr></thead> <tbody><tr><td align="left">TemplateProcessing</td> <td align="left">Lets you easily template the post processing, adding special tokens, and specifying the <code>type_id</code> for each sequence/special token. The template is given two strings representing the single sequence and the pair of sequences, as well as a set of special tokens to use.</td> <td align="left">Example, when specifying a template with these values:<br> <ul><li>single: <code>&quot;[CLS] $A [SEP]&quot;</code></li> <li>pair: <code>&quot;[CLS] $A [SEP] $B [SEP]&quot;</code></li> <li>special tokens: <ul><li><code>&quot;[CLS]&quot;</code></li> <li><code>&quot;[SEP]&quot;</code></li></ul></li></ul> <br> Input: <code>(&quot;I like this&quot;, &quot;but not this&quot;)</code> <br> Output: <code>&quot;[CLS] I like this [SEP] but not this [SEP]&quot;</code></td></tr></tbody></table> <h2 class="relative group"><a id="decoders" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#decoders"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>Decoders</span></h2> <p data-svelte-h="svelte-fbbe46">The Decoder knows how to go from the IDs used by the Tokenizer, back to
a readable piece of text. Some <code>Normalizer</code> and <code>PreTokenizer</code> use
special characters or identifiers that need to be reverted for example.</p> <table data-svelte-h="svelte-1ccloes"><thead><tr><th align="left">Name</th> <th align="left">Description</th></tr></thead> <tbody><tr><td align="left">ByteLevel</td> <td align="left">Reverts the ByteLevel PreTokenizer. This PreTokenizer encodes at the byte-level, using a set of visible Unicode characters to represent each byte, so we need a Decoder to revert this process and get something readable again.</td></tr> <tr><td align="left">Metaspace</td> <td align="left">Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifier <code>▁</code> to identify whitespaces, and so this Decoder helps with decoding these.</td></tr> <tr><td align="left">WordPiece</td> <td align="left">Reverts the WordPiece Model. This model uses a special identifier <code>##</code> for continuing subwords, and so this Decoder helps with decoding these.</td></tr></tbody></table> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/tokenizers/blob/main/docs/source-doc-builder/components.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Page bootstrap: publishes the path settings, then loads the two entry
// bundles and starts the client-side app on this script's parent element.
// The surrounding braces keep the `const` bindings below block-scoped.
{
// Assigned without a declaration inside a classic (non-module) script, so this
// becomes a property of the global object rather than a local binding.
// NOTE(review): presumably looked up by the imported bundles under this exact
// name — confirm before renaming or declaring it.
__sveltekit_1sdq0lx = {
// `assets` and `base` carry the same deployment prefix for this docs build.
assets: "/docs/tokenizers/pr_2012/en",
base: "/docs/tokenizers/pr_2012/en",
env: {}
};
// Captured synchronously, before the promise callback below runs:
// `document.currentScript` only refers to this <script> while it is executing.
const element = document.currentScript.parentElement;
// Two null placeholders; NOTE(review): looks like one entry per id in
// `node_ids` below — confirm against the start() contract.
const data = [null,null];
// Load both entry modules in parallel. No rejection handler is attached, so a
// failed import surfaces as an unhandled promise rejection.
Promise.all([
import("/docs/tokenizers/pr_2012/en/_app/immutable/entry/start.82c81cf3.js"),
import("/docs/tokenizers/pr_2012/en/_app/immutable/entry/app.cac551ee.js")
]).then(([kit, app]) => {
// Hand the app module, the mount element, and the initial state to the
// starter module's `start` export.
kit.start(app, element, {
node_ids: [0, 14],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
38.7 kB
·
Xet hash:
13a45cda459f232f7fb1c766bbc58a1b401e27212286582d4f4f2f64fe109ddb

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.