Buckets:
| import{s as Cl,o as zl,n as U}from"../chunks/scheduler.7c59faff.js";import{S as Vl,i as Ql,e as j,s as f,c as p,h as Zl,a as w,d as y,b as J,f as Il,g as i,p as h,j as bl,k as Nl,l as m,m as c,t as u,n as M,o as $}from"../chunks/index.09bb5655.js";import{C as Sl,H as Ie,E as Xl}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.2f6febbe.js";import{C as d}from"../chunks/CodeBlock.c2eb9f43.js";import{T as k,M as T}from"../chunks/TokenizersLanguageContent.0fc17a7a.js";function vl(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBUb2tlbml6ZXIlMEF0b2tlbml6ZXIlMjAlM0QlMjBUb2tlbml6ZXIuZnJvbV9maWxlKCUyMmRhdGElMkZ0b2tlbml6ZXItd2lraS5qc29uJTIyKQ==",highlighted:`<span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Tokenizer | |
| tokenizer = Tokenizer.from_file(<span class="hljs-string">"data/tokenizer-wiki.json"</span>)`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ql(a){let t,l;return t=new T({props:{$$slots:{default:[vl]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Bl(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQVRva2VuaXplciUzQiUwQWxldCUyMG11dCUyMHRva2VuaXplciUyMCUzRCUyMFRva2VuaXplciUzQSUzQWZyb21fZmlsZSglMjJkYXRhJTJGdG9rZW5pemVyLXdpa2kuanNvbiUyMiklM0YlM0I=",highlighted:`<span class="hljs-keyword">use</span> tokenizers::Tokenizer; | |
| <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut </span><span class="hljs-variable">tokenizer</span> = Tokenizer::<span class="hljs-title function_ invoke__">from_file</span>(<span class="hljs-string">"data/tokenizer-wiki.json"</span>)?;`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Rl(a){let t,l;return t=new T({props:{$$slots:{default:[Bl]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Wl(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwVG9rZW5pemVyJTIwJTdEJTIwJTNEJTIwcmVxdWlyZSglMjJ0b2tlbml6ZXJzJTIyKSUzQiUwQWxldCUyMHRva2VuaXplciUyMCUzRCUyMFRva2VuaXplci5mcm9tRmlsZSglMjJkYXRhJTJGdG9rZW5pemVyLXdpa2kuanNvbiUyMiklM0I=",highlighted:`<span class="hljs-keyword">let</span> { <span class="hljs-title class_">Tokenizer</span> } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| <span class="hljs-keyword">let</span> tokenizer = <span class="hljs-title class_">Tokenizer</span>.<span class="hljs-title function_">fromFile</span>(<span class="hljs-string">"data/tokenizer-wiki.json"</span>);`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Gl(a){let t,l;return t=new T({props:{$$slots:{default:[Wl]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Al(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBub3JtYWxpemVycyUwQWZyb20lMjB0b2tlbml6ZXJzLm5vcm1hbGl6ZXJzJTIwaW1wb3J0JTIwTkZEJTJDJTIwU3RyaXBBY2NlbnRzJTBBbm9ybWFsaXplciUyMCUzRCUyMG5vcm1hbGl6ZXJzLlNlcXVlbmNlKCU1Qk5GRCgpJTJDJTIwU3RyaXBBY2NlbnRzKCklNUQp",highlighted:`<span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> normalizers | |
| <span class="hljs-keyword">from</span> tokenizers.normalizers <span class="hljs-keyword">import</span> NFD, StripAccents | |
| normalizer = normalizers.<span class="hljs-type">Sequence</span>([NFD(), StripAccents()])`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function El(a){let t,l;return t=new T({props:{$$slots:{default:[Al]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Hl(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQW5vcm1hbGl6ZXJzJTNBJTNBJTdCJTBBJTIwJTIwJTIwJTIwc3RyaXAlM0ElM0FTdHJpcEFjY2VudHMlMkMlMjB1bmljb2RlJTNBJTNBTkZEJTJDJTIwdXRpbHMlM0ElM0FTZXF1ZW5jZSUyMGFzJTIwTm9ybWFsaXplclNlcXVlbmNlJTJDJTBBJTdEJTNCJTBBbGV0JTIwbm9ybWFsaXplciUyMCUzRCUyME5vcm1hbGl6ZXJTZXF1ZW5jZSUzQSUzQW5ldyh2ZWMhJTVCTkZELmludG8oKSUyQyUyMFN0cmlwQWNjZW50cy5pbnRvKCklNUQpJTNC",highlighted:`<span class="hljs-keyword">use</span> tokenizers::normalizers::{ | |
| strip::StripAccents, unicode::NFD, utils::Sequence <span class="hljs-keyword">as</span> NormalizerSequence, | |
| }; | |
| <span class="hljs-keyword">let</span> <span class="hljs-variable">normalizer</span> = NormalizerSequence::<span class="hljs-title function_ invoke__">new</span>(<span class="hljs-built_in">vec!</span>[NFD.<span class="hljs-title function_ invoke__">into</span>(), StripAccents.<span class="hljs-title function_ invoke__">into</span>()]);`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function xl(a){let t,l;return t=new T({props:{$$slots:{default:[Hl]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Fl(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwc2VxdWVuY2VOb3JtYWxpemVyJTJDJTIwbmZkTm9ybWFsaXplciUyQyUyMHN0cmlwQWNjZW50c05vcm1hbGl6ZXIlMjAlN0QlMjAlM0QlMjByZXF1aXJlKCUyMnRva2VuaXplcnMlMjIpJTNCJTBBbGV0JTIwbm9ybWFsaXplciUyMCUzRCUyMHNlcXVlbmNlTm9ybWFsaXplciglNUJuZmROb3JtYWxpemVyKCklMkMlMjBzdHJpcEFjY2VudHNOb3JtYWxpemVyKCklNUQpJTNC",highlighted:`<span class="hljs-keyword">let</span> { sequenceNormalizer, nfdNormalizer, stripAccentsNormalizer } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| <span class="hljs-keyword">let</span> normalizer = <span class="hljs-title function_">sequenceNormalizer</span>([<span class="hljs-title function_">nfdNormalizer</span>(), <span class="hljs-title function_">stripAccentsNormalizer</span>()]);`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Dl(a){let t,l;return t=new T({props:{$$slots:{default:[Fl]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Yl(a){let t,l;return t=new d({props:{code:"bm9ybWFsaXplci5ub3JtYWxpemVfc3RyKCUyMkglQzMlQTlsbCVDMyVCMiUyMGglQzMlQjR3JTIwYXJlJTIwJUMzJUJDJTNGJTIyKSUwQSUyMyUyMCUyMkhlbGxvJTIwaG93JTIwYXJlJTIwdSUzRiUyMg==",highlighted:`normalizer.normalize_str(<span class="hljs-string">"Héllò hôw are ü?"</span>) | |
| <span class="hljs-comment"># "Hello how are u?"</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Kl(a){let t,l;return t=new T({props:{$$slots:{default:[Yl]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ll(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQSU3Qk5vcm1hbGl6ZWRTdHJpbmclMkMlMjBOb3JtYWxpemVyJTdEJTNCJTBBbGV0JTIwbXV0JTIwbm9ybWFsaXplZCUyMCUzRCUyME5vcm1hbGl6ZWRTdHJpbmclM0ElM0Fmcm9tKCUyMkglQzMlQTlsbCVDMyVCMiUyMGglQzMlQjR3JTIwYXJlJTIwJUMzJUJDJTNGJTIyKSUzQiUwQW5vcm1hbGl6ZXIubm9ybWFsaXplKCUyNm11dCUyMG5vcm1hbGl6ZWQpJTNGJTNCJTBBcHJpbnRsbiEoJTIyJTdCJTdEJTIyJTJDJTIwbm9ybWFsaXplZC5nZXQoKSklM0IlMEElMkYlMkYlMjAlMjJIZWxsbyUyMGhvdyUyMGFyZSUyMHUlM0YlMjI=",highlighted:`<span class="hljs-keyword">use</span> tokenizers::{NormalizedString, Normalizer}; | |
| <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut </span><span class="hljs-variable">normalized</span> = NormalizedString::<span class="hljs-title function_ invoke__">from</span>(<span class="hljs-string">"Héllò hôw are ü?"</span>); | |
| normalizer.<span class="hljs-title function_ invoke__">normalize</span>(&<span class="hljs-keyword">mut</span> normalized)?; | |
| <span class="hljs-built_in">println!</span>(<span class="hljs-string">"{}"</span>, normalized.<span class="hljs-title function_ invoke__">get</span>()); | |
| <span class="hljs-comment">// "Hello how are u?"</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Pl(a){let t,l;return t=new T({props:{$$slots:{default:[Ll]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ol(a){let t,l;return t=new d({props:{code:"bGV0JTIwbm9ybWFsaXplZCUyMCUzRCUyMG5vcm1hbGl6ZXIubm9ybWFsaXplU3RyaW5nKCUyMkglQzMlQTlsbCVDMyVCMiUyMGglQzMlQjR3JTIwYXJlJTIwJUMzJUJDJTNGJTIyKSUwQSUyRiUyRiUyMCUyMkhlbGxvJTIwaG93JTIwYXJlJTIwdSUzRiUyMg==",highlighted:`<span class="hljs-keyword">let</span> normalized = normalizer.<span class="hljs-title function_">normalizeString</span>(<span class="hljs-string">"Héllò hôw are ü?"</span>) | |
| <span class="hljs-comment">// "Hello how are u?"</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function es(a){let t,l;return t=new T({props:{$$slots:{default:[Ol]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ts(a){let t,l;return t=new d({props:{code:"dG9rZW5pemVyLm5vcm1hbGl6ZXIlMjAlM0QlMjBub3JtYWxpemVy",highlighted:"tokenizer.normalizer = normalizer",wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ls(a){let t,l;return t=new T({props:{$$slots:{default:[ts]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ss(a){let t,l;return t=new d({props:{code:"dG9rZW5pemVyLndpdGhfbm9ybWFsaXplcihTb21lKG5vcm1hbGl6ZXIpKS51bndyYXAoKSUzQg==",highlighted:'tokenizer.<span class="hljs-title function_ invoke__">with_normalizer</span>(<span class="hljs-title function_ invoke__">Some</span>(normalizer)).<span class="hljs-title function_ invoke__">unwrap</span>();',wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ns(a){let t,l;return t=new T({props:{$$slots:{default:[ss]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function as(a){let t,l;return t=new d({props:{code:"dG9rZW5pemVyLnNldE5vcm1hbGl6ZXIobm9ybWFsaXplcik=",highlighted:'tokenizer.<span class="hljs-title function_">setNormalizer</span>(normalizer)',wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function rs(a){let t,l;return t=new T({props:{$$slots:{default:[as]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function os(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMucHJlX3Rva2VuaXplcnMlMjBpbXBvcnQlMjBXaGl0ZXNwYWNlJTBBcHJlX3Rva2VuaXplciUyMCUzRCUyMFdoaXRlc3BhY2UoKSUwQXByZV90b2tlbml6ZXIucHJlX3Rva2VuaXplX3N0ciglMjJIZWxsbyElMjBIb3clMjBhcmUlMjB5b3UlM0YlMjBJJ20lMjBmaW5lJTJDJTIwdGhhbmslMjB5b3UuJTIyKSUwQSUyMyUyMCU1QiglMjJIZWxsbyUyMiUyQyUyMCgwJTJDJTIwNSkpJTJDJTIwKCUyMiElMjIlMkMlMjAoNSUyQyUyMDYpKSUyQyUyMCglMjJIb3clMjIlMkMlMjAoNyUyQyUyMDEwKSklMkMlMjAoJTIyYXJlJTIyJTJDJTIwKDExJTJDJTIwMTQpKSUyQyUyMCglMjJ5b3UlMjIlMkMlMjAoMTUlMkMlMjAxOCkpJTJDJTBBJTIzJTIwJTIwKCUyMiUzRiUyMiUyQyUyMCgxOCUyQyUyMDE5KSklMkMlMjAoJTIySSUyMiUyQyUyMCgyMCUyQyUyMDIxKSklMkMlMjAoJTIyJyUyMiUyQyUyMCgyMSUyQyUyMDIyKSklMkMlMjAoJ20nJTJDJTIwKDIyJTJDJTIwMjMpKSUyQyUyMCglMjJmaW5lJTIyJTJDJTIwKDI0JTJDJTIwMjgpKSUyQyUwQSUyMyUyMCUyMCglMjIlMkMlMjIlMkMlMjAoMjglMkMlMjAyOSkpJTJDJTIwKCUyMnRoYW5rJTIyJTJDJTIwKDMwJTJDJTIwMzUpKSUyQyUyMCglMjJ5b3UlMjIlMkMlMjAoMzYlMkMlMjAzOSkpJTJDJTIwKCUyMi4lMjIlMkMlMjAoMzklMkMlMjA0MCkpJTVE",highlighted:`<span class="hljs-keyword">from</span> tokenizers.pre_tokenizers <span class="hljs-keyword">import</span> Whitespace | |
| pre_tokenizer = Whitespace() | |
| pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Hello! How are you? I'm fine, thank you."</span>) | |
| <span class="hljs-comment"># [("Hello", (0, 5)), ("!", (5, 6)), ("How", (7, 10)), ("are", (11, 14)), ("you", (15, 18)),</span> | |
| <span class="hljs-comment"># ("?", (18, 19)), ("I", (20, 21)), ("'", (21, 22)), ('m', (22, 23)), ("fine", (24, 28)),</span> | |
| <span class="hljs-comment"># (",", (28, 29)), ("thank", (30, 35)), ("you", (36, 39)), (".", (39, 40))]</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ps(a){let t,l;return t=new T({props:{$$slots:{default:[os]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function is(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQXByZV90b2tlbml6ZXJzJTNBJTNBd2hpdGVzcGFjZSUzQSUzQVdoaXRlc3BhY2UlM0IlMEF1c2UlMjB0b2tlbml6ZXJzJTNBJTNBJTdCT2Zmc2V0UmVmZXJlbnRpYWwlMkMlMjBPZmZzZXRUeXBlJTJDJTIwUHJlVG9rZW5pemVkU3RyaW5nJTJDJTIwUHJlVG9rZW5pemVyJTdEJTNCJTBBbGV0JTIwcHJlX3Rva2VuaXplciUyMCUzRCUyMFdoaXRlc3BhY2UlMjAlN0IlN0QlM0IlMEFsZXQlMjBtdXQlMjBwcmVfdG9rZW5pemVkJTIwJTNEJTIwUHJlVG9rZW5pemVkU3RyaW5nJTNBJTNBZnJvbSglMjJIZWxsbyElMjBIb3clMjBhcmUlMjB5b3UlM0YlMjBJJ20lMjBmaW5lJTJDJTIwdGhhbmslMjB5b3UuJTIyKSUzQiUwQXByZV90b2tlbml6ZXIucHJlX3Rva2VuaXplKCUyNm11dCUyMHByZV90b2tlbml6ZWQpJTNGJTNCJTBBcHJpbnRsbiEoJTBBJTIwJTIwJTIwJTIwJTIyJTdCJTNBJTNGJTdEJTIyJTJDJTBBJTIwJTIwJTIwJTIwcHJlX3Rva2VuaXplZC5nZXRfc3BsaXRzKE9mZnNldFJlZmVyZW50aWFsJTNBJTNBT3JpZ2luYWwlMkMlMjBPZmZzZXRUeXBlJTNBJTNBQnl0ZSklMEEpJTNCJTBBJTJGJTJGJTIwJTVCKCUyMkhlbGxvJTIyJTJDJTIwKDAlMkMlMjA1KSUyQyUyME5vbmUpJTJDJTIwKCUyMiElMjIlMkMlMjAoNSUyQyUyMDYpJTJDJTIwTm9uZSklMkMlMjAoJTIySG93JTIyJTJDJTIwKDclMkMlMjAxMCklMkMlMjBOb25lKSUyQyUwQSUyRiUyRiUyMCUyMCglMjJhcmUlMjIlMkMlMjAoMTElMkMlMjAxNCklMkMlMjBOb25lKSUyQyUyMCglMjJ5b3UlMjIlMkMlMjAoMTUlMkMlMjAxOCklMkMlMjBOb25lKSUyQyUyMCglMjIlM0YlMjIlMkMlMjAoMTglMkMlMjAxOSklMkMlMjBOb25lKSUyQyUwQSUyRiUyRiUyMCUyMCglMjJJJTIyJTJDJTIwKDIwJTJDJTIwMjEpJTJDJTIwTm9uZSklMkMlMjAoJTIyJTVDJyUyMiUyQyUyMCgyMSUyQyUyMDIyKSUyQyUyME5vbmUpJTJDJTIwKCUyMm0lMjIlMkMlMjAoMjIlMkMlMjAyMyklMkMlMjBOb25lKSUyQyUwQSUyRiUyRiUyMCUyMCglMjJmaW5lJTIyJTJDJTIwKDI0JTJDJTIwMjgpJTJDJTIwTm9uZSklMkMlMjAoJTIyJTJDJTIyJTJDJTIwKDI4JTJDJTIwMjkpJTJDJTIwTm9uZSklMkMlMjAoJTIydGhhbmslMjIlMkMlMjAoMzAlMkMlMjAzNSklMkMlMjBOb25lKSUyQyUwQSUyRiUyRiUyMCUyMCglMjJ5b3UlMjIlMkMlMjAoMzYlMkMlMjAzOSklMkMlMjBOb25lKSUyQyUyMCglMjIuJTIyJTJDJTIwKDM5JTJDJTIwNDApJTJDJTIwTm9uZSklNUQ=",highlighted:`<span class="hljs-keyword">use</span> tokenizers::pre_tokenizers::whitespace::Whitespace; | |
| <span class="hljs-keyword">use</span> tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer}; | |
| <span class="hljs-keyword">let</span> <span class="hljs-variable">pre_tokenizer</span> = Whitespace {}; | |
| <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut </span><span class="hljs-variable">pre_tokenized</span> = PreTokenizedString::<span class="hljs-title function_ invoke__">from</span>(<span class="hljs-string">"Hello! How are you? I'm fine, thank you."</span>); | |
| pre_tokenizer.<span class="hljs-title function_ invoke__">pre_tokenize</span>(&<span class="hljs-keyword">mut</span> pre_tokenized)?; | |
| <span class="hljs-built_in">println!</span>( | |
| <span class="hljs-string">"{:?}"</span>, | |
| pre_tokenized.<span class="hljs-title function_ invoke__">get_splits</span>(OffsetReferential::Original, OffsetType::Byte) | |
| ); | |
| <span class="hljs-comment">// [("Hello", (0, 5), None), ("!", (5, 6), None), ("How", (7, 10), None),</span> | |
| <span class="hljs-comment">// ("are", (11, 14), None), ("you", (15, 18), None), ("?", (18, 19), None),</span> | |
| <span class="hljs-comment">// ("I", (20, 21), None), ("\\'", (21, 22), None), ("m", (22, 23), None),</span> | |
| <span class="hljs-comment">// ("fine", (24, 28), None), (",", (28, 29), None), ("thank", (30, 35), None),</span> | |
| <span class="hljs-comment">// ("you", (36, 39), None), (".", (39, 40), None)]</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function cs(a){let t,l;return t=new T({props:{$$slots:{default:[is]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function us(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwd2hpdGVzcGFjZVByZVRva2VuaXplciUyMCU3RCUyMCUzRCUyMHJlcXVpcmUoJTIydG9rZW5pemVycyUyMiklM0IlMEF2YXIlMjBwcmVUb2tlbml6ZXIlMjAlM0QlMjB3aGl0ZXNwYWNlUHJlVG9rZW5pemVyKCklM0IlMEF2YXIlMjBwcmVUb2tlbml6ZWQlMjAlM0QlMjBwcmVUb2tlbml6ZXIucHJlVG9rZW5pemVTdHJpbmcoJTIySGVsbG8hJTIwSG93JTIwYXJlJTIweW91JTNGJTIwSSdtJTIwZmluZSUyQyUyMHRoYW5rJTIweW91LiUyMiklM0I=",highlighted:`<span class="hljs-keyword">let</span> { whitespacePreTokenizer } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| <span class="hljs-keyword">var</span> preTokenizer = <span class="hljs-title function_">whitespacePreTokenizer</span>(); | |
| <span class="hljs-keyword">var</span> preTokenized = preTokenizer.<span class="hljs-title function_">preTokenizeString</span>(<span class="hljs-string">"Hello! How are you? I'm fine, thank you."</span>);`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ms(a){let t,l;return t=new T({props:{$$slots:{default:[us]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function $s(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBwcmVfdG9rZW5pemVycyUwQWZyb20lMjB0b2tlbml6ZXJzLnByZV90b2tlbml6ZXJzJTIwaW1wb3J0JTIwRGlnaXRzJTBBcHJlX3Rva2VuaXplciUyMCUzRCUyMHByZV90b2tlbml6ZXJzLlNlcXVlbmNlKCU1QldoaXRlc3BhY2UoKSUyQyUyMERpZ2l0cyhpbmRpdmlkdWFsX2RpZ2l0cyUzRFRydWUpJTVEKSUwQXByZV90b2tlbml6ZXIucHJlX3Rva2VuaXplX3N0ciglMjJDYWxsJTIwOTExISUyMiklMEElMjMlMjAlNUIoJTIyQ2FsbCUyMiUyQyUyMCgwJTJDJTIwNCkpJTJDJTIwKCUyMjklMjIlMkMlMjAoNSUyQyUyMDYpKSUyQyUyMCglMjIxJTIyJTJDJTIwKDYlMkMlMjA3KSklMkMlMjAoJTIyMSUyMiUyQyUyMCg3JTJDJTIwOCkpJTJDJTIwKCUyMiElMjIlMkMlMjAoOCUyQyUyMDkpKSU1RA==",highlighted:`<span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> pre_tokenizers | |
| <span class="hljs-keyword">from</span> tokenizers.pre_tokenizers <span class="hljs-keyword">import</span> Digits | |
| pre_tokenizer = pre_tokenizers.<span class="hljs-type">Sequence</span>([Whitespace(), Digits(individual_digits=<span class="hljs-literal">True</span>)]) | |
| pre_tokenizer.pre_tokenize_str(<span class="hljs-string">"Call 911!"</span>) | |
| <span class="hljs-comment"># [("Call", (0, 4)), ("9", (5, 6)), ("1", (6, 7)), ("1", (7, 8)), ("!", (8, 9))]</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ys(a){let t,l;return t=new T({props:{$$slots:{default:[$s]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ms(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQXByZV90b2tlbml6ZXJzJTNBJTNBJTdCZGlnaXRzJTNBJTNBRGlnaXRzJTJDJTIwc2VxdWVuY2UlM0ElM0FTZXF1ZW5jZSU3RCUzQiUwQWxldCUyMHByZV90b2tlbml6ZXIlMjAlM0QlMjBTZXF1ZW5jZSUzQSUzQW5ldyh2ZWMhJTVCV2hpdGVzcGFjZSUyMCU3QiU3RC5pbnRvKCklMkMlMjBEaWdpdHMlM0ElM0FuZXcodHJ1ZSkuaW50bygpJTVEKSUzQiUwQWxldCUyMG11dCUyMHByZV90b2tlbml6ZWQlMjAlM0QlMjBQcmVUb2tlbml6ZWRTdHJpbmclM0ElM0Fmcm9tKCUyMkNhbGwlMjA5MTEhJTIyKSUzQiUwQXByZV90b2tlbml6ZXIucHJlX3Rva2VuaXplKCUyNm11dCUyMHByZV90b2tlbml6ZWQpJTNGJTNCJTBBcHJpbnRsbiEoJTBBJTIwJTIwJTIwJTIwJTIyJTdCJTNBJTNGJTdEJTIyJTJDJTBBJTIwJTIwJTIwJTIwcHJlX3Rva2VuaXplZC5nZXRfc3BsaXRzKE9mZnNldFJlZmVyZW50aWFsJTNBJTNBT3JpZ2luYWwlMkMlMjBPZmZzZXRUeXBlJTNBJTNBQnl0ZSklMEEpJTNC",highlighted:`<span class="hljs-keyword">use</span> tokenizers::pre_tokenizers::{digits::Digits, sequence::Sequence}; | |
| <span class="hljs-keyword">let</span> <span class="hljs-variable">pre_tokenizer</span> = Sequence::<span class="hljs-title function_ invoke__">new</span>(<span class="hljs-built_in">vec!</span>[Whitespace {}.<span class="hljs-title function_ invoke__">into</span>(), Digits::<span class="hljs-title function_ invoke__">new</span>(<span class="hljs-literal">true</span>).<span class="hljs-title function_ invoke__">into</span>()]); | |
| <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut </span><span class="hljs-variable">pre_tokenized</span> = PreTokenizedString::<span class="hljs-title function_ invoke__">from</span>(<span class="hljs-string">"Call 911!"</span>); | |
| pre_tokenizer.<span class="hljs-title function_ invoke__">pre_tokenize</span>(&<span class="hljs-keyword">mut</span> pre_tokenized)?; | |
| <span class="hljs-built_in">println!</span>( | |
| <span class="hljs-string">"{:?}"</span>, | |
| pre_tokenized.<span class="hljs-title function_ invoke__">get_splits</span>(OffsetReferential::Original, OffsetType::Byte) | |
| );`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function fs(a){let t,l;return t=new T({props:{$$slots:{default:[ms]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Js(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwc2VxdWVuY2VQcmVUb2tlbml6ZXIlMkMlMjBkaWdpdHNQcmVUb2tlbml6ZXIlMjAlN0QlMjAlM0QlMjByZXF1aXJlKCUyMnRva2VuaXplcnMlMjIpJTNCJTBBdmFyJTIwcHJlVG9rZW5pemVyJTIwJTNEJTIwc2VxdWVuY2VQcmVUb2tlbml6ZXIoJTVCd2hpdGVzcGFjZVByZVRva2VuaXplcigpJTJDJTIwZGlnaXRzUHJlVG9rZW5pemVyKHRydWUpJTVEKSUzQiUwQXZhciUyMHByZVRva2VuaXplZCUyMCUzRCUyMHByZVRva2VuaXplci5wcmVUb2tlbml6ZVN0cmluZyglMjJDYWxsJTIwOTExISUyMiklM0I=",highlighted:`<span class="hljs-keyword">let</span> { sequencePreTokenizer, digitsPreTokenizer } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| <span class="hljs-keyword">var</span> preTokenizer = <span class="hljs-title function_">sequencePreTokenizer</span>([<span class="hljs-title function_">whitespacePreTokenizer</span>(), <span class="hljs-title function_">digitsPreTokenizer</span>(<span class="hljs-literal">true</span>)]); | |
| <span class="hljs-keyword">var</span> preTokenized = preTokenizer.<span class="hljs-title function_">preTokenizeString</span>(<span class="hljs-string">"Call 911!"</span>);`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Us(a){let t,l;return t=new T({props:{$$slots:{default:[Js]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ds(a){let t,l;return t=new d({props:{code:"dG9rZW5pemVyLnByZV90b2tlbml6ZXIlMjAlM0QlMjBwcmVfdG9rZW5pemVy",highlighted:"tokenizer.pre_tokenizer = pre_tokenizer",wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ts(a){let t,l;return t=new T({props:{$$slots:{default:[ds]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function js(a){let t,l;return t=new d({props:{code:"dG9rZW5pemVyLndpdGhfcHJlX3Rva2VuaXplcihTb21lKHByZV90b2tlbml6ZXIpKSUzQg==",highlighted:'tokenizer.<span class="hljs-title function_ invoke__">with_pre_tokenizer</span>(<span class="hljs-title function_ invoke__">Some</span>(pre_tokenizer));',wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ws(a){let t,l;return t=new T({props:{$$slots:{default:[js]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function hs(a){let t,l;return t=new d({props:{code:"dG9rZW5pemVyLnNldFByZVRva2VuaXplcihwcmVUb2tlbml6ZXIp",highlighted:'tokenizer.<span class="hljs-title function_">setPreTokenizer</span>(preTokenizer)',wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ks(a){let t,l;return t=new T({props:{$$slots:{default:[hs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function gs(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMucHJvY2Vzc29ycyUyMGltcG9ydCUyMFRlbXBsYXRlUHJvY2Vzc2luZyUwQXRva2VuaXplci5wb3N0X3Byb2Nlc3NvciUyMCUzRCUyMFRlbXBsYXRlUHJvY2Vzc2luZyglMEElMjAlMjAlMjAlMjBzaW5nbGUlM0QlMjIlNUJDTFMlNUQlMjAlMjRBJTIwJTVCU0VQJTVEJTIyJTJDJTBBJTIwJTIwJTIwJTIwcGFpciUzRCUyMiU1QkNMUyU1RCUyMCUyNEElMjAlNUJTRVAlNUQlMjAlMjRCJTNBMSUyMCU1QlNFUCU1RCUzQTElMjIlMkMlMEElMjAlMjAlMjAlMjBzcGVjaWFsX3Rva2VucyUzRCU1QiglMjIlNUJDTFMlNUQlMjIlMkMlMjAxKSUyQyUyMCglMjIlNUJTRVAlNUQlMjIlMkMlMjAyKSU1RCUyQyUwQSk=",highlighted:`<span class="hljs-keyword">from</span> tokenizers.processors <span class="hljs-keyword">import</span> TemplateProcessing | |
| tokenizer.post_processor = TemplateProcessing( | |
| single=<span class="hljs-string">"[CLS] $A [SEP]"</span>, | |
| pair=<span class="hljs-string">"[CLS] $A [SEP] $B:1 [SEP]:1"</span>, | |
| special_tokens=[(<span class="hljs-string">"[CLS]"</span>, <span class="hljs-number">1</span>), (<span class="hljs-string">"[SEP]"</span>, <span class="hljs-number">2</span>)], | |
| )`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function _s(a){let t,l;return t=new T({props:{$$slots:{default:[gs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Is(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQXByb2Nlc3NvcnMlM0ElM0F0ZW1wbGF0ZSUzQSUzQVRlbXBsYXRlUHJvY2Vzc2luZyUzQiUwQXRva2VuaXplci53aXRoX3Bvc3RfcHJvY2Vzc29yKFNvbWUoJTBBJTIwJTIwJTIwJTIwVGVtcGxhdGVQcm9jZXNzaW5nJTNBJTNBYnVpbGRlcigpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwLnRyeV9zaW5nbGUoJTIyJTVCQ0xTJTVEJTIwJTI0QSUyMCU1QlNFUCU1RCUyMiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAudW53cmFwKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAudHJ5X3BhaXIoJTIyJTVCQ0xTJTVEJTIwJTI0QSUyMCU1QlNFUCU1RCUyMCUyNEIlM0ExJTIwJTVCU0VQJTVEJTNBMSUyMiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAudW53cmFwKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAuc3BlY2lhbF90b2tlbnModmVjISU1QiglMjIlNUJDTFMlNUQlMjIlMkMlMjAxKSUyQyUyMCglMjIlNUJTRVAlNUQlMjIlMkMlMjAyKSU1RCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAuYnVpbGQoKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMC51bndyYXAoKSUyQyUwQSkpJTNC",highlighted:`<span class="hljs-keyword">use</span> tokenizers::processors::template::TemplateProcessing; | |
| tokenizer.<span class="hljs-title function_ invoke__">with_post_processor</span>(<span class="hljs-title function_ invoke__">Some</span>( | |
| TemplateProcessing::<span class="hljs-title function_ invoke__">builder</span>() | |
| .<span class="hljs-title function_ invoke__">try_single</span>(<span class="hljs-string">"[CLS] $A [SEP]"</span>) | |
| .<span class="hljs-title function_ invoke__">unwrap</span>() | |
| .<span class="hljs-title function_ invoke__">try_pair</span>(<span class="hljs-string">"[CLS] $A [SEP] $B:1 [SEP]:1"</span>) | |
| .<span class="hljs-title function_ invoke__">unwrap</span>() | |
| .<span class="hljs-title function_ invoke__">special_tokens</span>(<span class="hljs-built_in">vec!</span>[(<span class="hljs-string">"[CLS]"</span>, <span class="hljs-number">1</span>), (<span class="hljs-string">"[SEP]"</span>, <span class="hljs-number">2</span>)]) | |
| .<span class="hljs-title function_ invoke__">build</span>() | |
| .<span class="hljs-title function_ invoke__">unwrap</span>(), | |
| ));`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function bs(a){let t,l;return t=new T({props:{$$slots:{default:[Is]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Cs(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwdGVtcGxhdGVQcm9jZXNzaW5nJTIwJTdEJTIwJTNEJTIwcmVxdWlyZSglMjJ0b2tlbml6ZXJzJTIyKSUzQiUwQXRva2VuaXplci5zZXRQb3N0UHJvY2Vzc29yKHRlbXBsYXRlUHJvY2Vzc2luZyglMEElMjAlMjAlMjAlMjAlMjIlNUJDTFMlNUQlMjAlMjRBJTIwJTVCU0VQJTVEJTIyJTJDJTBBJTIwJTIwJTIwJTIwJTIyJTVCQ0xTJTVEJTIwJTI0QSUyMCU1QlNFUCU1RCUyMCUyNEIlM0ExJTIwJTVCU0VQJTVEJTNBMSUyMiUyQyUwQSUyMCUyMCUyMCUyMCU1QiU1QiUyMiU1QkNMUyU1RCUyMiUyQyUyMDElNUQlMkMlMjAlNUIlMjIlNUJTRVAlNUQlMjIlMkMlMjAyJTVEJTVEJTBBKSklM0I=",highlighted:`<span class="hljs-keyword">let</span> { templateProcessing } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| tokenizer.<span class="hljs-title function_">setPostProcessor</span>(<span class="hljs-title function_">templateProcessing</span>( | |
| <span class="hljs-string">"[CLS] $A [SEP]"</span>, | |
| <span class="hljs-string">"[CLS] $A [SEP] $B:1 [SEP]:1"</span>, | |
| [[<span class="hljs-string">"[CLS]"</span>, <span class="hljs-number">1</span>], [<span class="hljs-string">"[SEP]"</span>, <span class="hljs-number">2</span>]] | |
| ));`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function zs(a){let t,l;return t=new T({props:{$$slots:{default:[Cs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Vs(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBUb2tlbml6ZXIlMEFmcm9tJTIwdG9rZW5pemVycy5tb2RlbHMlMjBpbXBvcnQlMjBXb3JkUGllY2UlMEFiZXJ0X3Rva2VuaXplciUyMCUzRCUyMFRva2VuaXplcihXb3JkUGllY2UodW5rX3Rva2VuJTNEJTIyJTVCVU5LJTVEJTIyKSk=",highlighted:`<span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Tokenizer | |
| <span class="hljs-keyword">from</span> tokenizers.models <span class="hljs-keyword">import</span> WordPiece | |
| bert_tokenizer = Tokenizer(WordPiece(unk_token=<span class="hljs-string">"[UNK]"</span>))`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Qs(a){let t,l;return t=new T({props:{$$slots:{default:[Vs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Zs(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQW1vZGVscyUzQSUzQXdvcmRwaWVjZSUzQSUzQVdvcmRQaWVjZSUzQiUwQXVzZSUyMHRva2VuaXplcnMlM0ElM0FUb2tlbml6ZXIlM0IlMEFsZXQlMjBtdXQlMjBiZXJ0X3Rva2VuaXplciUyMCUzRCUyMFRva2VuaXplciUzQSUzQW5ldyglMEElMjAlMjAlMjAlMjBXb3JkUGllY2UlM0ElM0FidWlsZGVyKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAudW5rX3Rva2VuKCUyMiU1QlVOSyU1RCUyMi50b19zdHJpbmcoKSklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAuYnVpbGQoKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMC51bndyYXAoKSUyQyUwQSklM0I=",highlighted:`<span class="hljs-keyword">use</span> tokenizers::models::wordpiece::WordPiece; | |
| <span class="hljs-keyword">use</span> tokenizers::Tokenizer; | |
| <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut </span><span class="hljs-variable">bert_tokenizer</span> = Tokenizer::<span class="hljs-title function_ invoke__">new</span>( | |
| WordPiece::<span class="hljs-title function_ invoke__">builder</span>() | |
| .<span class="hljs-title function_ invoke__">unk_token</span>(<span class="hljs-string">"[UNK]"</span>.<span class="hljs-title function_ invoke__">to_string</span>()) | |
| .<span class="hljs-title function_ invoke__">build</span>() | |
| .<span class="hljs-title function_ invoke__">unwrap</span>(), | |
| );`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ns(a){let t,l;return t=new T({props:{$$slots:{default:[Zs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ss(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwVG9rZW5pemVyJTIwJTdEJTIwJTNEJTIwcmVxdWlyZSglMjJ0b2tlbml6ZXJzJTIyKSUzQiUwQWxldCUyMCU3QiUyMFdvcmRQaWVjZSUyMCU3RCUyMCUzRCUyMHJlcXVpcmUoJTIydG9rZW5pemVycyUyMiklM0IlMEFsZXQlMjBiZXJ0VG9rZW5pemVyJTIwJTNEJTIwbmV3JTIwVG9rZW5pemVyKFdvcmRQaWVjZS5pbml0KCU3QiU3RCUyQyUyMCU3QiUyMHVua1Rva2VuJTNBJTIwJTIyJTVCVU5LJTVEJTIyJTIwJTdEKSklM0I=",highlighted:`<span class="hljs-keyword">let</span> { <span class="hljs-title class_">Tokenizer</span> } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| <span class="hljs-keyword">let</span> { <span class="hljs-title class_">WordPiece</span> } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| <span class="hljs-keyword">let</span> bertTokenizer = <span class="hljs-keyword">new</span> <span class="hljs-title class_">Tokenizer</span>(<span class="hljs-title class_">WordPiece</span>.<span class="hljs-title function_">init</span>({}, { <span class="hljs-attr">unkToken</span>: <span class="hljs-string">"[UNK]"</span> }));`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Xs(a){let t,l;return t=new T({props:{$$slots:{default:[Ss]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function vs(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBub3JtYWxpemVycyUwQWZyb20lMjB0b2tlbml6ZXJzLm5vcm1hbGl6ZXJzJTIwaW1wb3J0JTIwTkZEJTJDJTIwTG93ZXJjYXNlJTJDJTIwU3RyaXBBY2NlbnRzJTBBYmVydF90b2tlbml6ZXIubm9ybWFsaXplciUyMCUzRCUyMG5vcm1hbGl6ZXJzLlNlcXVlbmNlKCU1Qk5GRCgpJTJDJTIwTG93ZXJjYXNlKCklMkMlMjBTdHJpcEFjY2VudHMoKSU1RCk=",highlighted:`<span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> normalizers | |
| <span class="hljs-keyword">from</span> tokenizers.normalizers <span class="hljs-keyword">import</span> NFD, Lowercase, StripAccents | |
| bert_tokenizer.normalizer = normalizers.<span class="hljs-type">Sequence</span>([NFD(), Lowercase(), StripAccents()])`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function qs(a){let t,l;return t=new T({props:{$$slots:{default:[vs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Bs(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQW5vcm1hbGl6ZXJzJTNBJTNBdXRpbHMlM0ElM0FTZXF1ZW5jZSUyMGFzJTIwTm9ybWFsaXplclNlcXVlbmNlJTNCJTBBdXNlJTIwdG9rZW5pemVycyUzQSUzQW5vcm1hbGl6ZXJzJTNBJTNBJTdCc3RyaXAlM0ElM0FTdHJpcEFjY2VudHMlMkMlMjB1bmljb2RlJTNBJTNBTkZEJTJDJTIwdXRpbHMlM0ElM0FMb3dlcmNhc2UlN0QlM0IlMEFiZXJ0X3Rva2VuaXplciUwQSUyMCUyMCUyMCUyMC53aXRoX25vcm1hbGl6ZXIoU29tZShOb3JtYWxpemVyU2VxdWVuY2UlM0ElM0FuZXcodmVjISU1QiUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyME5GRC5pbnRvKCklMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBMb3dlcmNhc2UuaW50bygpJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwU3RyaXBBY2NlbnRzLmludG8oKSUyQyUwQSUyMCUyMCUyMCUyMCU1RCkpKSUwQSUyMCUyMCUyMCUyMC51bndyYXAoKSUzQg==",highlighted:`<span class="hljs-keyword">use</span> tokenizers::normalizers::utils::Sequence <span class="hljs-keyword">as</span> NormalizerSequence; | |
| <span class="hljs-keyword">use</span> tokenizers::normalizers::{strip::StripAccents, unicode::NFD, utils::Lowercase}; | |
| bert_tokenizer | |
| .<span class="hljs-title function_ invoke__">with_normalizer</span>(<span class="hljs-title function_ invoke__">Some</span>(NormalizerSequence::<span class="hljs-title function_ invoke__">new</span>(<span class="hljs-built_in">vec!</span>[ | |
| NFD.<span class="hljs-title function_ invoke__">into</span>(), | |
| Lowercase.<span class="hljs-title function_ invoke__">into</span>(), | |
| StripAccents.<span class="hljs-title function_ invoke__">into</span>(), | |
| ]))) | |
| .<span class="hljs-title function_ invoke__">unwrap</span>();`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Rs(a){let t,l;return t=new T({props:{$$slots:{default:[Bs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ws(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwc2VxdWVuY2VOb3JtYWxpemVyJTJDJTIwbG93ZXJjYXNlTm9ybWFsaXplciUyQyUyMG5mZE5vcm1hbGl6ZXIlMkMlMjBzdHJpcEFjY2VudHNOb3JtYWxpemVyJTIwJTdEJTBBJTIwJTIwJTIwJTIwJTNEJTIwcmVxdWlyZSglMjJ0b2tlbml6ZXJzJTIyKSUzQiUwQWJlcnRUb2tlbml6ZXIuc2V0Tm9ybWFsaXplcihzZXF1ZW5jZU5vcm1hbGl6ZXIoJTVCJTBBJTIwJTIwJTIwJTIwbmZkTm9ybWFsaXplcigpJTJDJTIwbG93ZXJjYXNlTm9ybWFsaXplcigpJTJDJTIwc3RyaXBBY2NlbnRzTm9ybWFsaXplcigpJTBBJTVEKSk=",highlighted:`<span class="hljs-keyword">let</span> { sequenceNormalizer, lowercaseNormalizer, nfdNormalizer, stripAccentsNormalizer } | |
| = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| bertTokenizer.<span class="hljs-title function_">setNormalizer</span>(<span class="hljs-title function_">sequenceNormalizer</span>([ | |
| <span class="hljs-title function_">nfdNormalizer</span>(), <span class="hljs-title function_">lowercaseNormalizer</span>(), <span class="hljs-title function_">stripAccentsNormalizer</span>() | |
| ]))`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Gs(a){let t,l;return t=new T({props:{$$slots:{default:[Ws]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function As(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMucHJlX3Rva2VuaXplcnMlMjBpbXBvcnQlMjBXaGl0ZXNwYWNlJTBBYmVydF90b2tlbml6ZXIucHJlX3Rva2VuaXplciUyMCUzRCUyMFdoaXRlc3BhY2UoKQ==",highlighted:`<span class="hljs-keyword">from</span> tokenizers.pre_tokenizers <span class="hljs-keyword">import</span> Whitespace | |
| bert_tokenizer.pre_tokenizer = Whitespace()`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Es(a){let t,l;return t=new T({props:{$$slots:{default:[As]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Hs(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQXByZV90b2tlbml6ZXJzJTNBJTNBd2hpdGVzcGFjZSUzQSUzQVdoaXRlc3BhY2UlM0IlMEFiZXJ0X3Rva2VuaXplci53aXRoX3ByZV90b2tlbml6ZXIoU29tZShXaGl0ZXNwYWNlJTIwJTdCJTdEKSklM0I=",highlighted:`<span class="hljs-keyword">use</span> tokenizers::pre_tokenizers::whitespace::Whitespace; | |
| bert_tokenizer.<span class="hljs-title function_ invoke__">with_pre_tokenizer</span>(<span class="hljs-title function_ invoke__">Some</span>(Whitespace {}));`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function xs(a){let t,l;return t=new T({props:{$$slots:{default:[Hs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Fs(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwd2hpdGVzcGFjZVByZVRva2VuaXplciUyMCU3RCUyMCUzRCUyMHJlcXVpcmUoJTIydG9rZW5pemVycyUyMiklM0IlMEFiZXJ0VG9rZW5pemVyLnNldFByZVRva2VuaXplcih3aGl0ZXNwYWNlUHJlVG9rZW5pemVyKCkpJTNC",highlighted:`<span class="hljs-keyword">let</span> { whitespacePreTokenizer } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| bertTokenizer.<span class="hljs-title function_">setPreTokenizer</span>(<span class="hljs-title function_">whitespacePreTokenizer</span>());`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ds(a){let t,l;return t=new T({props:{$$slots:{default:[Fs]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ys(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMucHJvY2Vzc29ycyUyMGltcG9ydCUyMFRlbXBsYXRlUHJvY2Vzc2luZyUwQWJlcnRfdG9rZW5pemVyLnBvc3RfcHJvY2Vzc29yJTIwJTNEJTIwVGVtcGxhdGVQcm9jZXNzaW5nKCUwQSUyMCUyMCUyMCUyMHNpbmdsZSUzRCUyMiU1QkNMUyU1RCUyMCUyNEElMjAlNUJTRVAlNUQlMjIlMkMlMEElMjAlMjAlMjAlMjBwYWlyJTNEJTIyJTVCQ0xTJTVEJTIwJTI0QSUyMCU1QlNFUCU1RCUyMCUyNEIlM0ExJTIwJTVCU0VQJTVEJTNBMSUyMiUyQyUwQSUyMCUyMCUyMCUyMHNwZWNpYWxfdG9rZW5zJTNEJTVCJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwKCUyMiU1QkNMUyU1RCUyMiUyQyUyMDEpJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwKCUyMiU1QlNFUCU1RCUyMiUyQyUyMDIpJTJDJTBBJTIwJTIwJTIwJTIwJTVEJTJDJTBBKQ==",highlighted:`<span class="hljs-keyword">from</span> tokenizers.processors <span class="hljs-keyword">import</span> TemplateProcessing | |
| bert_tokenizer.post_processor = TemplateProcessing( | |
| single=<span class="hljs-string">"[CLS] $A [SEP]"</span>, | |
| pair=<span class="hljs-string">"[CLS] $A [SEP] $B:1 [SEP]:1"</span>, | |
| special_tokens=[ | |
| (<span class="hljs-string">"[CLS]"</span>, <span class="hljs-number">1</span>), | |
| (<span class="hljs-string">"[SEP]"</span>, <span class="hljs-number">2</span>), | |
| ], | |
| )`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ks(a){let t,l;return t=new T({props:{$$slots:{default:[Ys]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ls(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQXByb2Nlc3NvcnMlM0ElM0F0ZW1wbGF0ZSUzQSUzQVRlbXBsYXRlUHJvY2Vzc2luZyUzQiUwQWJlcnRfdG9rZW5pemVyLndpdGhfcG9zdF9wcm9jZXNzb3IoU29tZSglMEElMjAlMjAlMjAlMjBUZW1wbGF0ZVByb2Nlc3NpbmclM0ElM0FidWlsZGVyKCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAudHJ5X3NpbmdsZSglMjIlNUJDTFMlNUQlMjAlMjRBJTIwJTVCU0VQJTVEJTIyKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMC51bndyYXAoKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMC50cnlfcGFpciglMjIlNUJDTFMlNUQlMjAlMjRBJTIwJTVCU0VQJTVEJTIwJTI0QiUzQTElMjAlNUJTRVAlNUQlM0ExJTIyKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMC51bndyYXAoKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMC5zcGVjaWFsX3Rva2Vucyh2ZWMhJTVCKCUyMiU1QkNMUyU1RCUyMiUyQyUyMDEpJTJDJTIwKCUyMiU1QlNFUCU1RCUyMiUyQyUyMDIpJTVEKSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMC5idWlsZCgpJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwLnVud3JhcCgpJTJDJTBBKSklM0I=",highlighted:`<span class="hljs-keyword">use</span> tokenizers::processors::template::TemplateProcessing; | |
| bert_tokenizer.<span class="hljs-title function_ invoke__">with_post_processor</span>(<span class="hljs-title function_ invoke__">Some</span>( | |
| TemplateProcessing::<span class="hljs-title function_ invoke__">builder</span>() | |
| .<span class="hljs-title function_ invoke__">try_single</span>(<span class="hljs-string">"[CLS] $A [SEP]"</span>) | |
| .<span class="hljs-title function_ invoke__">unwrap</span>() | |
| .<span class="hljs-title function_ invoke__">try_pair</span>(<span class="hljs-string">"[CLS] $A [SEP] $B:1 [SEP]:1"</span>) | |
| .<span class="hljs-title function_ invoke__">unwrap</span>() | |
| .<span class="hljs-title function_ invoke__">special_tokens</span>(<span class="hljs-built_in">vec!</span>[(<span class="hljs-string">"[CLS]"</span>, <span class="hljs-number">1</span>), (<span class="hljs-string">"[SEP]"</span>, <span class="hljs-number">2</span>)]) | |
| .<span class="hljs-title function_ invoke__">build</span>() | |
| .<span class="hljs-title function_ invoke__">unwrap</span>(), | |
| ));`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Ps(a){let t,l;return t=new T({props:{$$slots:{default:[Ls]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Os(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwdGVtcGxhdGVQcm9jZXNzaW5nJTIwJTdEJTIwJTNEJTIwcmVxdWlyZSglMjJ0b2tlbml6ZXJzJTIyKSUzQiUwQWJlcnRUb2tlbml6ZXIuc2V0UG9zdFByb2Nlc3Nvcih0ZW1wbGF0ZVByb2Nlc3NpbmcoJTBBJTIwJTIwJTIwJTIwJTIyJTVCQ0xTJTVEJTIwJTI0QSUyMCU1QlNFUCU1RCUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMiU1QkNMUyU1RCUyMCUyNEElMjAlNUJTRVAlNUQlMjAlMjRCJTNBMSUyMCU1QlNFUCU1RCUzQTElMjIlMkMlMEElMjAlMjAlMjAlMjAlNUIlNUIlMjIlNUJDTFMlNUQlMjIlMkMlMjAxJTVEJTJDJTIwJTVCJTIyJTVCU0VQJTVEJTIyJTJDJTIwMiU1RCU1RCUwQSkpJTNC",highlighted:`<span class="hljs-keyword">let</span> { templateProcessing } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| bertTokenizer.<span class="hljs-title function_">setPostProcessor</span>(<span class="hljs-title function_">templateProcessing</span>( | |
| <span class="hljs-string">"[CLS] $A [SEP]"</span>, | |
| <span class="hljs-string">"[CLS] $A [SEP] $B:1 [SEP]:1"</span>, | |
| [[<span class="hljs-string">"[CLS]"</span>, <span class="hljs-number">1</span>], [<span class="hljs-string">"[SEP]"</span>, <span class="hljs-number">2</span>]] | |
| ));`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function en(a){let t,l;return t=new T({props:{$$slots:{default:[Os]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function tn(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMudHJhaW5lcnMlMjBpbXBvcnQlMjBXb3JkUGllY2VUcmFpbmVyJTBBdHJhaW5lciUyMCUzRCUyMFdvcmRQaWVjZVRyYWluZXIodm9jYWJfc2l6ZSUzRDMwNTIyJTJDJTIwc3BlY2lhbF90b2tlbnMlM0QlNUIlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlMkMlMjAlMjIlNUJQQUQlNUQlMjIlMkMlMjAlMjIlNUJNQVNLJTVEJTIyJTVEKSUwQWZpbGVzJTIwJTNEJTIwJTVCZiUyMmRhdGElMkZ3aWtpdGV4dC0xMDMtcmF3JTJGd2lraS4lN0JzcGxpdCU3RC5yYXclMjIlMjBmb3IlMjBzcGxpdCUyMGluJTIwJTVCJTIydGVzdCUyMiUyQyUyMCUyMnRyYWluJTIyJTJDJTIwJTIydmFsaWQlMjIlNUQlNUQlMEFiZXJ0X3Rva2VuaXplci50cmFpbihmaWxlcyUyQyUyMHRyYWluZXIpJTBBYmVydF90b2tlbml6ZXIuc2F2ZSglMjJkYXRhJTJGYmVydC13aWtpLmpzb24lMjIp",highlighted:`<span class="hljs-keyword">from</span> tokenizers.trainers <span class="hljs-keyword">import</span> WordPieceTrainer | |
| trainer = WordPieceTrainer(vocab_size=<span class="hljs-number">30522</span>, special_tokens=[<span class="hljs-string">"[UNK]"</span>, <span class="hljs-string">"[CLS]"</span>, <span class="hljs-string">"[SEP]"</span>, <span class="hljs-string">"[PAD]"</span>, <span class="hljs-string">"[MASK]"</span>]) | |
| files = [<span class="hljs-string">f"data/wikitext-103-raw/wiki.<span class="hljs-subst">{split}</span>.raw"</span> <span class="hljs-keyword">for</span> split <span class="hljs-keyword">in</span> [<span class="hljs-string">"test"</span>, <span class="hljs-string">"train"</span>, <span class="hljs-string">"valid"</span>]] | |
| bert_tokenizer.train(files, trainer) | |
| bert_tokenizer.save(<span class="hljs-string">"data/bert-wiki.json"</span>)`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function ln(a){let t,l;return t=new T({props:{$$slots:{default:[tn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function sn(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQW1vZGVscyUzQSUzQSU3QndvcmRwaWVjZSUzQSUzQVdvcmRQaWVjZVRyYWluZXIlMkMlMjBUcmFpbmVyV3JhcHBlciU3RCUzQiUwQWxldCUyMG11dCUyMHRyYWluZXIlM0ElMjBUcmFpbmVyV3JhcHBlciUyMCUzRCUyMFdvcmRQaWVjZVRyYWluZXIlM0ElM0FidWlsZGVyKCklMEElMjAlMjAlMjAlMjAudm9jYWJfc2l6ZSgzMF81MjIpJTBBJTIwJTIwJTIwJTIwLnNwZWNpYWxfdG9rZW5zKHZlYyElNUIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBBZGRlZFRva2VuJTNBJTNBZnJvbSglMjIlNUJVTkslNUQlMjIlMkMlMjB0cnVlKSUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMEFkZGVkVG9rZW4lM0ElM0Fmcm9tKCUyMiU1QkNMUyU1RCUyMiUyQyUyMHRydWUpJTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwQWRkZWRUb2tlbiUzQSUzQWZyb20oJTIyJTVCU0VQJTVEJTIyJTJDJTIwdHJ1ZSklMkMlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBBZGRlZFRva2VuJTNBJTNBZnJvbSglMjIlNUJQQUQlNUQlMjIlMkMlMjB0cnVlKSUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMEFkZGVkVG9rZW4lM0ElM0Fmcm9tKCUyMiU1Qk1BU0slNUQlMjIlMkMlMjB0cnVlKSUyQyUwQSUyMCUyMCUyMCUyMCU1RCklMEElMjAlMjAlMjAlMjAuYnVpbGQoKSUwQSUyMCUyMCUyMCUyMC5pbnRvKCklM0IlMEFsZXQlMjBmaWxlcyUyMCUzRCUyMHZlYyElNUIlMEElMjAlMjAlMjAlMjAlMjJkYXRhJTJGd2lraXRleHQtMTAzLXJhdyUyRndpa2kudHJhaW4ucmF3JTIyLmludG8oKSUyQyUwQSUyMCUyMCUyMCUyMCUyMmRhdGElMkZ3aWtpdGV4dC0xMDMtcmF3JTJGd2lraS50ZXN0LnJhdyUyMi5pbnRvKCklMkMlMEElMjAlMjAlMjAlMjAlMjJkYXRhJTJGd2lraXRleHQtMTAzLXJhdyUyRndpa2kudmFsaWQucmF3JTIyLmludG8oKSUyQyUwQSU1RCUzQiUwQWJlcnRfdG9rZW5pemVyLnRyYWluX2Zyb21fZmlsZXMoJTI2bXV0JTIwdHJhaW5lciUyQyUyMGZpbGVzKSUzRiUzQiUwQWJlcnRfdG9rZW5pemVyLnNhdmUoJTIyZGF0YSUyRmJlcnQtd2lraS5qc29uJTIyJTJDJTIwZmFsc2UpJTNGJTNC",highlighted:`<span class="hljs-keyword">use</span> tokenizers::models::{wordpiece::WordPieceTrainer, TrainerWrapper}; | |
| <span class="hljs-keyword">let</span> <span class="hljs-keyword">mut </span><span class="hljs-variable">trainer</span>: TrainerWrapper = WordPieceTrainer::<span class="hljs-title function_ invoke__">builder</span>() | |
| .<span class="hljs-title function_ invoke__">vocab_size</span>(<span class="hljs-number">30_522</span>) | |
| .<span class="hljs-title function_ invoke__">special_tokens</span>(<span class="hljs-built_in">vec!</span>[ | |
| AddedToken::<span class="hljs-title function_ invoke__">from</span>(<span class="hljs-string">"[UNK]"</span>, <span class="hljs-literal">true</span>), | |
| AddedToken::<span class="hljs-title function_ invoke__">from</span>(<span class="hljs-string">"[CLS]"</span>, <span class="hljs-literal">true</span>), | |
| AddedToken::<span class="hljs-title function_ invoke__">from</span>(<span class="hljs-string">"[SEP]"</span>, <span class="hljs-literal">true</span>), | |
| AddedToken::<span class="hljs-title function_ invoke__">from</span>(<span class="hljs-string">"[PAD]"</span>, <span class="hljs-literal">true</span>), | |
| AddedToken::<span class="hljs-title function_ invoke__">from</span>(<span class="hljs-string">"[MASK]"</span>, <span class="hljs-literal">true</span>), | |
| ]) | |
| .<span class="hljs-title function_ invoke__">build</span>() | |
| .<span class="hljs-title function_ invoke__">into</span>(); | |
| <span class="hljs-keyword">let</span> <span class="hljs-variable">files</span> = <span class="hljs-built_in">vec!</span>[ | |
| <span class="hljs-string">"data/wikitext-103-raw/wiki.train.raw"</span>.<span class="hljs-title function_ invoke__">into</span>(), | |
| <span class="hljs-string">"data/wikitext-103-raw/wiki.test.raw"</span>.<span class="hljs-title function_ invoke__">into</span>(), | |
| <span class="hljs-string">"data/wikitext-103-raw/wiki.valid.raw"</span>.<span class="hljs-title function_ invoke__">into</span>(), | |
| ]; | |
| bert_tokenizer.<span class="hljs-title function_ invoke__">train_from_files</span>(&<span class="hljs-keyword">mut</span> trainer, files)?; | |
| bert_tokenizer.<span class="hljs-title function_ invoke__">save</span>(<span class="hljs-string">"data/bert-wiki.json"</span>, <span class="hljs-literal">false</span>)?;`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function nn(a){let t,l;return t=new T({props:{$$slots:{default:[sn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function an(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwd29yZFBpZWNlVHJhaW5lciUyMCU3RCUyMCUzRCUyMHJlcXVpcmUoJTIydG9rZW5pemVycyUyMiklM0IlMEFsZXQlMjB0cmFpbmVyJTIwJTNEJTIwd29yZFBpZWNlVHJhaW5lciglN0IlMEElMjAlMjAlMjAlMjB2b2NhYlNpemUlM0ElMjAzMDUyMiUyQyUwQSUyMCUyMCUyMCUyMHNwZWNpYWxUb2tlbnMlM0ElMjAlNUIlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlMkMlMjAlMjIlNUJQQUQlNUQlMjIlMkMlMjAlMjIlNUJNQVNLJTVEJTIyJTVEJTBBJTdEKSUzQiUwQWxldCUyMGZpbGVzJTIwJTNEJTIwJTVCJTIydGVzdCUyMiUyQyUyMCUyMnRyYWluJTIyJTJDJTIwJTIydmFsaWQlMjIlNUQubWFwKHNwbGl0JTIwJTNEJTNFJTIwJTYwZGF0YSUyRndpa2l0ZXh0LTEwMy1yYXclMkZ3aWtpLiUyNCU3QnNwbGl0JTdELnJhdyU2MCklM0IlMEFiZXJ0VG9rZW5pemVyLnRyYWluKGZpbGVzJTJDJTIwdHJhaW5lciklM0IlMEFiZXJ0VG9rZW5pemVyLnNhdmUoJTIyZGF0YSUyRmJlcnQtd2lraS5qc29uJTIyKQ==",highlighted:`<span class="hljs-keyword">let</span> { wordPieceTrainer } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| <span class="hljs-keyword">let</span> trainer = <span class="hljs-title function_">wordPieceTrainer</span>({ | |
| <span class="hljs-attr">vocabSize</span>: <span class="hljs-number">30522</span>, | |
| <span class="hljs-attr">specialTokens</span>: [<span class="hljs-string">"[UNK]"</span>, <span class="hljs-string">"[CLS]"</span>, <span class="hljs-string">"[SEP]"</span>, <span class="hljs-string">"[PAD]"</span>, <span class="hljs-string">"[MASK]"</span>] | |
| }); | |
| <span class="hljs-keyword">let</span> files = [<span class="hljs-string">"test"</span>, <span class="hljs-string">"train"</span>, <span class="hljs-string">"valid"</span>].<span class="hljs-title function_">map</span>(<span class="hljs-function"><span class="hljs-params">split</span> =></span> <span class="hljs-string">\`data/wikitext-103-raw/wiki.<span class="hljs-subst">\${split}</span>.raw\`</span>); | |
| bertTokenizer.<span class="hljs-title function_">train</span>(files, trainer); | |
| bertTokenizer.<span class="hljs-title function_">save</span>(<span class="hljs-string">"data/bert-wiki.json"</span>)`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function rn(a){let t,l;return t=new T({props:{$$slots:{default:[an]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function on(a){let t,l;return t=new d({props:{code:"b3V0cHV0JTIwJTNEJTIwdG9rZW5pemVyLmVuY29kZSglMjJIZWxsbyUyQyUyMHknYWxsISUyMEhvdyUyMGFyZSUyMHlvdSUyMCVGMCU5RiU5OCU4MSUyMCUzRiUyMiklMEFwcmludChvdXRwdXQuaWRzKSUwQSUyMyUyMCU1QjElMkMlMjAyNzI1MyUyQyUyMDE2JTJDJTIwOTMlMkMlMjAxMSUyQyUyMDUwOTclMkMlMjA1JTJDJTIwNzk2MSUyQyUyMDUxMTIlMkMlMjA2MjE4JTJDJTIwMCUyQyUyMDM1JTJDJTIwMiU1RCUwQXRva2VuaXplci5kZWNvZGUoJTVCMSUyQyUyMDI3MjUzJTJDJTIwMTYlMkMlMjA5MyUyQyUyMDExJTJDJTIwNTA5NyUyQyUyMDUlMkMlMjA3OTYxJTJDJTIwNTExMiUyQyUyMDYyMTglMkMlMjAwJTJDJTIwMzUlMkMlMjAyJTVEKSUwQSUyMyUyMCUyMkhlbGxvJTIwJTJDJTIweSUyMCclMjBhbGwlMjAhJTIwSG93JTIwYXJlJTIweW91JTIwJTNGJTIy",highlighted:`output = tokenizer.encode(<span class="hljs-string">"Hello, y'all! How are you 😁 ?"</span>) | |
| <span class="hljs-built_in">print</span>(output.ids) | |
| <span class="hljs-comment"># [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]</span> | |
| tokenizer.decode([<span class="hljs-number">1</span>, <span class="hljs-number">27253</span>, <span class="hljs-number">16</span>, <span class="hljs-number">93</span>, <span class="hljs-number">11</span>, <span class="hljs-number">5097</span>, <span class="hljs-number">5</span>, <span class="hljs-number">7961</span>, <span class="hljs-number">5112</span>, <span class="hljs-number">6218</span>, <span class="hljs-number">0</span>, <span class="hljs-number">35</span>, <span class="hljs-number">2</span>]) | |
| <span class="hljs-comment"># "Hello , y ' all ! How are you ?"</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function pn(a){let t,l;return t=new T({props:{$$slots:{default:[on]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function cn(a){let t,l;return t=new d({props:{code:"bGV0JTIwb3V0cHV0JTIwJTNEJTIwdG9rZW5pemVyLmVuY29kZSglMjJIZWxsbyUyQyUyMHknYWxsISUyMEhvdyUyMGFyZSUyMHlvdSUyMCVGMCU5RiU5OCU4MSUyMCUzRiUyMiUyQyUyMHRydWUpJTNGJTNCJTBBcHJpbnRsbiEoJTIyJTdCJTNBJTNGJTdEJTIyJTJDJTIwb3V0cHV0LmdldF9pZHMoKSklM0IlMEElMkYlMkYlMjAlNUIxJTJDJTIwMjcyNTMlMkMlMjAxNiUyQyUyMDkzJTJDJTIwMTElMkMlMjA1MDk3JTJDJTIwNSUyQyUyMDc5NjElMkMlMjA1MTEyJTJDJTIwNjIxOCUyQyUyMDAlMkMlMjAzNSUyQyUyMDIlNUQlMEFsZXQlMjBkZWNvZGVkJTIwJTNEJTIwdG9rZW5pemVyLmRlY29kZSglMEElMjAlMjAlMjAlMjAlMjYlNUIxJTJDJTIwMjcyNTMlMkMlMjAxNiUyQyUyMDkzJTJDJTIwMTElMkMlMjA1MDk3JTJDJTIwNSUyQyUyMDc5NjElMkMlMjA1MTEyJTJDJTIwNjIxOCUyQyUyMDAlMkMlMjAzNSUyQyUyMDIlNUQlMkMlMEElMjAlMjAlMjAlMjB0cnVlJTJDJTBBKSUzRiUzQiUwQXByaW50bG4hKCUyMiU3QmRlY29kZWQlN0QlMjIpJTNCJTBBJTJGJTJGJTIwJTIySGVsbG8lMjAlMkMlMjB5JTIwJyUyMGFsbCUyMCElMjBIb3clMjBhcmUlMjB5b3UlMjAlM0YlMjI=",highlighted:`<span class="hljs-keyword">let</span> <span class="hljs-variable">output</span> = tokenizer.<span class="hljs-title function_ invoke__">encode</span>(<span class="hljs-string">"Hello, y'all! How are you 😁 ?"</span>, <span class="hljs-literal">true</span>)?; | |
| <span class="hljs-built_in">println!</span>(<span class="hljs-string">"{:?}"</span>, output.<span class="hljs-title function_ invoke__">get_ids</span>()); | |
| <span class="hljs-comment">// [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]</span> | |
| <span class="hljs-keyword">let</span> <span class="hljs-variable">decoded</span> = tokenizer.<span class="hljs-title function_ invoke__">decode</span>( | |
| &[<span class="hljs-number">1</span>, <span class="hljs-number">27253</span>, <span class="hljs-number">16</span>, <span class="hljs-number">93</span>, <span class="hljs-number">11</span>, <span class="hljs-number">5097</span>, <span class="hljs-number">5</span>, <span class="hljs-number">7961</span>, <span class="hljs-number">5112</span>, <span class="hljs-number">6218</span>, <span class="hljs-number">0</span>, <span class="hljs-number">35</span>, <span class="hljs-number">2</span>], | |
| <span class="hljs-literal">true</span>, | |
| )?; | |
| <span class="hljs-built_in">println!</span>(<span class="hljs-string">"{decoded}"</span>); | |
| <span class="hljs-comment">// "Hello , y ' all ! How are you ?"</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function un(a){let t,l;return t=new T({props:{$$slots:{default:[cn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Mn(a){let t,l;return t=new d({props:{code:"bGV0JTIwb3V0cHV0JTIwJTNEJTIwYXdhaXQlMjB0b2tlbml6ZXIuZW5jb2RlKCUyMkhlbGxvJTJDJTIweSdhbGwhJTIwSG93JTIwYXJlJTIweW91JTIwJUYwJTlGJTk4JTgxJTIwJTNGJTIyKSUzQiUwQWNvbnNvbGUubG9nKG91dHB1dC5nZXRJZHMoKSklM0IlMEElMkYlMkYlMjAlNUIxJTJDJTIwMjcyNTMlMkMlMjAxNiUyQyUyMDkzJTJDJTIwMTElMkMlMjA1MDk3JTJDJTIwNSUyQyUyMDc5NjElMkMlMjA1MTEyJTJDJTIwNjIxOCUyQyUyMDAlMkMlMjAzNSUyQyUyMDIlNUQlMEFsZXQlMjBkZWNvZGVkJTIwJTNEJTIwYXdhaXQlMjB0b2tlbml6ZXIuZGVjb2RlKCU1QjElMkMlMjAyNzI1MyUyQyUyMDE2JTJDJTIwOTMlMkMlMjAxMSUyQyUyMDUwOTclMkMlMjA1JTJDJTIwNzk2MSUyQyUyMDUxMTIlMkMlMjA2MjE4JTJDJTIwMCUyQyUyMDM1JTJDJTIwMiU1RCUyQyUyMHRydWUpJTNCJTBBJTJGJTJGJTIwJTIySGVsbG8lMjAlMkMlMjB5JTIwJyUyMGFsbCUyMCElMjBIb3clMjBhcmUlMjB5b3UlMjAlM0YlMjI=",highlighted:`<span class="hljs-keyword">let</span> output = <span class="hljs-keyword">await</span> tokenizer.<span class="hljs-title function_">encode</span>(<span class="hljs-string">"Hello, y'all! How are you 😁 ?"</span>); | |
| <span class="hljs-variable language_">console</span>.<span class="hljs-title function_">log</span>(output.<span class="hljs-title function_">getIds</span>()); | |
| <span class="hljs-comment">// [1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2]</span> | |
| <span class="hljs-keyword">let</span> decoded = <span class="hljs-keyword">await</span> tokenizer.<span class="hljs-title function_">decode</span>([<span class="hljs-number">1</span>, <span class="hljs-number">27253</span>, <span class="hljs-number">16</span>, <span class="hljs-number">93</span>, <span class="hljs-number">11</span>, <span class="hljs-number">5097</span>, <span class="hljs-number">5</span>, <span class="hljs-number">7961</span>, <span class="hljs-number">5112</span>, <span class="hljs-number">6218</span>, <span class="hljs-number">0</span>, <span class="hljs-number">35</span>, <span class="hljs-number">2</span>], <span class="hljs-literal">true</span>); | |
| <span class="hljs-comment">// "Hello , y ' all ! How are you ?"</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function $n(a){let t,l;return t=new T({props:{$$slots:{default:[Mn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function yn(a){let t,l;return t=new d({props:{code:"b3V0cHV0JTIwJTNEJTIwYmVydF90b2tlbml6ZXIuZW5jb2RlKCUyMldlbGNvbWUlMjB0byUyMHRoZSUyMCVGMCU5RiVBNCU5NyUyMFRva2VuaXplcnMlMjBsaWJyYXJ5LiUyMiklMEFwcmludChvdXRwdXQudG9rZW5zKSUwQSUyMyUyMCU1QiUyMiU1QkNMUyU1RCUyMiUyQyUyMCUyMndlbGNvbWUlMjIlMkMlMjAlMjJ0byUyMiUyQyUyMCUyMnRoZSUyMiUyQyUyMCUyMiU1QlVOSyU1RCUyMiUyQyUyMCUyMnRvayUyMiUyQyUyMCUyMiUyMyUyM2VuaSUyMiUyQyUyMCUyMiUyMyUyM3plciUyMiUyQyUyMCUyMiUyMyUyM3MlMjIlMkMlMjAlMjJsaWJyYXJ5JTIyJTJDJTIwJTIyLiUyMiUyQyUyMCUyMiU1QlNFUCU1RCUyMiU1RCUwQWJlcnRfdG9rZW5pemVyLmRlY29kZShvdXRwdXQuaWRzKSUwQSUyMyUyMCUyMndlbGNvbWUlMjB0byUyMHRoZSUyMHRvayUyMCUyMyUyM2VuaSUyMCUyMyUyM3plciUyMCUyMyUyM3MlMjBsaWJyYXJ5JTIwLiUyMg==",highlighted:`output = bert_tokenizer.encode(<span class="hljs-string">"Welcome to the 🤗 Tokenizers library."</span>) | |
| <span class="hljs-built_in">print</span>(output.tokens) | |
| <span class="hljs-comment"># ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]</span> | |
| bert_tokenizer.decode(output.ids) | |
| <span class="hljs-comment"># "welcome to the tok ##eni ##zer ##s library ."</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function mn(a){let t,l;return t=new T({props:{$$slots:{default:[yn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function fn(a){let t,l;return t=new d({props:{code:"bGV0JTIwb3V0cHV0JTIwJTNEJTIwYmVydF90b2tlbml6ZXIuZW5jb2RlKCUyMldlbGNvbWUlMjB0byUyMHRoZSUyMCVGMCU5RiVBNCU5NyUyMFRva2VuaXplcnMlMjBsaWJyYXJ5LiUyMiUyQyUyMHRydWUpJTNGJTNCJTBBcHJpbnRsbiEoJTIyJTdCJTNBJTNGJTdEJTIyJTJDJTIwb3V0cHV0LmdldF90b2tlbnMoKSklM0IlMEElMkYlMkYlMjAlNUIlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjJ3ZWxjb21lJTIyJTJDJTIwJTIydG8lMjIlMkMlMjAlMjJ0aGUlMjIlMkMlMjAlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjJ0b2slMjIlMkMlMjAlMjIlMjMlMjNlbmklMjIlMkMlMjAlMjIlMjMlMjN6ZXIlMjIlMkMlMjAlMjIlMjMlMjNzJTIyJTJDJTIwJTIybGlicmFyeSUyMiUyQyUyMCUyMi4lMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlNUQlMEFsZXQlMjBkZWNvZGVkJTIwJTNEJTIwYmVydF90b2tlbml6ZXIuZGVjb2RlKG91dHB1dC5nZXRfaWRzKCklMkMlMjB0cnVlKSUzRiUzQiUwQXByaW50bG4hKCUyMiU3QmRlY29kZWQlN0QlMjIpJTNCJTBBJTJGJTJGJTIwJTIyd2VsY29tZSUyMHRvJTIwdGhlJTIwdG9rJTIwJTIzJTIzZW5pJTIwJTIzJTIzemVyJTIwJTIzJTIzcyUyMGxpYnJhcnklMjAuJTIy",highlighted:`<span class="hljs-keyword">let</span> <span class="hljs-variable">output</span> = bert_tokenizer.<span class="hljs-title function_ invoke__">encode</span>(<span class="hljs-string">"Welcome to the 🤗 Tokenizers library."</span>, <span class="hljs-literal">true</span>)?; | |
| <span class="hljs-built_in">println!</span>(<span class="hljs-string">"{:?}"</span>, output.<span class="hljs-title function_ invoke__">get_tokens</span>()); | |
| <span class="hljs-comment">// ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]</span> | |
| <span class="hljs-keyword">let</span> <span class="hljs-variable">decoded</span> = bert_tokenizer.<span class="hljs-title function_ invoke__">decode</span>(output.<span class="hljs-title function_ invoke__">get_ids</span>(), <span class="hljs-literal">true</span>)?; | |
| <span class="hljs-built_in">println!</span>(<span class="hljs-string">"{decoded}"</span>); | |
| <span class="hljs-comment">// "welcome to the tok ##eni ##zer ##s library ."</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Jn(a){let t,l;return t=new T({props:{$$slots:{default:[fn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Un(a){let t,l;return t=new d({props:{code:"bGV0JTIwb3V0cHV0JTIwJTNEJTIwYXdhaXQlMjBiZXJ0VG9rZW5pemVyLmVuY29kZSglMjJXZWxjb21lJTIwdG8lMjB0aGUlMjAlRjAlOUYlQTQlOTclMjBUb2tlbml6ZXJzJTIwbGlicmFyeS4lMjIpJTNCJTBBY29uc29sZS5sb2cob3V0cHV0LmdldFRva2VucygpKSUzQiUwQSUyRiUyRiUyMCU1QiUyMiU1QkNMUyU1RCUyMiUyQyUyMCUyMndlbGNvbWUlMjIlMkMlMjAlMjJ0byUyMiUyQyUyMCUyMnRoZSUyMiUyQyUyMCUyMiU1QlVOSyU1RCUyMiUyQyUyMCUyMnRvayUyMiUyQyUyMCUyMiUyMyUyM2VuaSUyMiUyQyUyMCUyMiUyMyUyM3plciUyMiUyQyUyMCUyMiUyMyUyM3MlMjIlMkMlMjAlMjJsaWJyYXJ5JTIyJTJDJTIwJTIyLiUyMiUyQyUyMCUyMiU1QlNFUCU1RCUyMiU1RCUwQXZhciUyMGRlY29kZWQlMjAlM0QlMjBhd2FpdCUyMGJlcnRUb2tlbml6ZXIuZGVjb2RlKG91dHB1dC5nZXRJZHMoKSUyQyUyMHRydWUpJTNCJTBBJTJGJTJGJTIwJTIyd2VsY29tZSUyMHRvJTIwdGhlJTIwdG9rJTIwJTIzJTIzZW5pJTIwJTIzJTIzemVyJTIwJTIzJTIzcyUyMGxpYnJhcnklMjAuJTIy",highlighted:`<span class="hljs-keyword">let</span> output = <span class="hljs-keyword">await</span> bertTokenizer.<span class="hljs-title function_">encode</span>(<span class="hljs-string">"Welcome to the 🤗 Tokenizers library."</span>); | |
| <span class="hljs-variable language_">console</span>.<span class="hljs-title function_">log</span>(output.<span class="hljs-title function_">getTokens</span>()); | |
| <span class="hljs-comment">// ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]</span> | |
| <span class="hljs-keyword">var</span> decoded = <span class="hljs-keyword">await</span> bertTokenizer.<span class="hljs-title function_">decode</span>(output.<span class="hljs-title function_">getIds</span>(), <span class="hljs-literal">true</span>); | |
| <span class="hljs-comment">// "welcome to the tok ##eni ##zer ##s library ."</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function dn(a){let t,l;return t=new T({props:{$$slots:{default:[Un]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function Tn(a){let t,l;return t=new d({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBkZWNvZGVycyUwQWJlcnRfdG9rZW5pemVyLmRlY29kZXIlMjAlM0QlMjBkZWNvZGVycy5Xb3JkUGllY2UoKSUwQWJlcnRfdG9rZW5pemVyLmRlY29kZShvdXRwdXQuaWRzKSUwQSUyMyUyMCUyMndlbGNvbWUlMjB0byUyMHRoZSUyMHRva2VuaXplcnMlMjBsaWJyYXJ5LiUyMg==",highlighted:`<span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> decoders | |
| bert_tokenizer.decoder = decoders.WordPiece() | |
| bert_tokenizer.decode(output.ids) | |
| <span class="hljs-comment"># "welcome to the tokenizers library."</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function jn(a){let t,l;return t=new T({props:{$$slots:{default:[Tn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function wn(a){let t,l;return t=new d({props:{code:"dXNlJTIwdG9rZW5pemVycyUzQSUzQWRlY29kZXJzJTNBJTNBd29yZHBpZWNlJTNBJTNBV29yZFBpZWNlJTIwYXMlMjBXb3JkUGllY2VEZWNvZGVyJTNCJTBBYmVydF90b2tlbml6ZXIud2l0aF9kZWNvZGVyKFNvbWUoV29yZFBpZWNlRGVjb2RlciUzQSUzQWRlZmF1bHQoKSkpJTNCJTBBbGV0JTIwZGVjb2RlZCUyMCUzRCUyMGJlcnRfdG9rZW5pemVyLmRlY29kZShvdXRwdXQuZ2V0X2lkcygpJTJDJTIwdHJ1ZSklM0YlM0IlMEElMkYlMkYlMjAlMjJ3ZWxjb21lJTIwdG8lMjB0aGUlMjB0b2tlbml6ZXJzJTIwbGlicmFyeS4lMjI=",highlighted:`<span class="hljs-keyword">use</span> tokenizers::decoders::wordpiece::WordPiece <span class="hljs-keyword">as</span> WordPieceDecoder; | |
| bert_tokenizer.<span class="hljs-title function_ invoke__">with_decoder</span>(<span class="hljs-title function_ invoke__">Some</span>(WordPieceDecoder::<span class="hljs-title function_ invoke__">default</span>())); | |
| <span class="hljs-keyword">let</span> <span class="hljs-variable">decoded</span> = bert_tokenizer.<span class="hljs-title function_ invoke__">decode</span>(output.<span class="hljs-title function_ invoke__">get_ids</span>(), <span class="hljs-literal">true</span>)?; | |
| <span class="hljs-comment">// "welcome to the tokenizers library."</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function hn(a){let t,l;return t=new T({props:{$$slots:{default:[wn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function kn(a){let t,l;return t=new d({props:{code:"bGV0JTIwJTdCJTIwd29yZFBpZWNlRGVjb2RlciUyMCU3RCUyMCUzRCUyMHJlcXVpcmUoJTIydG9rZW5pemVycyUyMiklM0IlMEFiZXJ0VG9rZW5pemVyLnNldERlY29kZXIod29yZFBpZWNlRGVjb2RlcigpKSUzQiUwQXZhciUyMGRlY29kZWQlMjAlM0QlMjBhd2FpdCUyMGJlcnRUb2tlbml6ZXIuZGVjb2RlKG91dHB1dC5nZXRJZHMoKSUyQyUyMHRydWUpJTNCJTBBJTJGJTJGJTIwJTIyd2VsY29tZSUyMHRvJTIwdGhlJTIwdG9rZW5pemVycyUyMGxpYnJhcnkuJTIy",highlighted:`<span class="hljs-keyword">let</span> { wordPieceDecoder } = <span class="hljs-built_in">require</span>(<span class="hljs-string">"tokenizers"</span>); | |
| bertTokenizer.<span class="hljs-title function_">setDecoder</span>(<span class="hljs-title function_">wordPieceDecoder</span>()); | |
| <span class="hljs-keyword">var</span> decoded = <span class="hljs-keyword">await</span> bertTokenizer.<span class="hljs-title function_">decode</span>(output.<span class="hljs-title function_">getIds</span>(), <span class="hljs-literal">true</span>); | |
| <span class="hljs-comment">// "welcome to the tokenizers library."</span>`,wrap:!1}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p:U,i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function gn(a){let t,l;return t=new T({props:{$$slots:{default:[kn]},$$scope:{ctx:a}}}),{c(){p(t.$$.fragment)},l(e){i(t.$$.fragment,e)},m(e,s){c(t,e,s),l=!0},p(e,s){const o={};s&2&&(o.$$scope={dirty:s,ctx:e}),t.$set(o)},i(e){l||(u(t.$$.fragment,e),l=!0)},o(e){M(t.$$.fragment,e),l=!1},d(e){$(t,e)}}}function _n(a){let t,l,e,s,o,Ce,W,ze,G,Zt=`When calling <code>Tokenizer.encode</code> or | |
| <code>Tokenizer.encode_batch</code>, the input | |
| text(s) go through the following pipeline:`,Ve,A,Nt="<li><code>normalization</code></li> <li><code>pre-tokenization</code></li> <li><code>model</code></li> <li><code>post-processing</code></li>",Qe,E,St=`We’ll see in details what happens during each of those steps in detail, | |
| as well as when you want to <code>decode <decoding></code> some token ids, and how the 🤗 Tokenizers library allows you | |
| to customize each of those steps to your needs. If you’re already | |
| familiar with those steps and want to learn by seeing some code, jump to | |
| <code>our BERT from scratch example <example></code>.`,Ze,H,Xt=`For the examples that require a <code>Tokenizer</code> we will use the tokenizer we trained in the | |
| <code>quicktour</code>, which you can load with:`,Ne,g,Se,x,Xe,F,vt=`Normalization is, in a nutshell, a set of operations you apply to a raw | |
| string to make it less random or “cleaner”. Common operations include | |
| stripping whitespace, removing accented characters or lowercasing all | |
| text. If you’re familiar with <a href="https://unicode.org/reports/tr15" rel="nofollow">Unicode | |
| normalization</a>, it is also a very | |
| common normalization operation applied in most tokenizers.`,ve,D,qt=`Each normalization operation is represented in the 🤗 Tokenizers library | |
| by a <code>Normalizer</code>, and you can combine | |
| several of those by using a <code>normalizers.Sequence</code>. Here is a normalizer applying NFD Unicode normalization | |
| and removing accents as an example:`,qe,_,Be,Y,Bt="You can manually test that normalizer by applying it to any string:",Re,I,We,K,Rt=`When building a <code>Tokenizer</code>, you can | |
| customize its normalizer by just changing the corresponding attribute:`,Ge,b,Ae,L,Wt=`Of course, if you change the way a tokenizer applies normalization, you | |
| should probably retrain it from scratch afterward.`,Ee,P,He,O,Gt=`Pre-tokenization is the act of splitting a text into smaller objects | |
| that give an upper bound to what your tokens will be at the end of | |
| training. A good way to think of this is that the pre-tokenizer will | |
| split your text into “words” and then, your final tokens will be parts | |
| of those words.`,xe,ee,At=`An easy way to pre-tokenize inputs is to split on spaces and | |
| punctuations, which is done by the | |
| <code>pre_tokenizers.Whitespace</code> | |
| pre-tokenizer:`,Fe,C,De,te,Et=`The output is a list of tuples, with each tuple containing one word and | |
| its span in the original sentence (which is used to determine the final | |
| <code>offsets</code> of our <code>Encoding</code>). Note that splitting on | |
| punctuation will split contractions like <code>"I'm"</code> in this example.`,Ye,le,Ht=`You can combine together any <code>PreTokenizer</code> together. For instance, here is a pre-tokenizer that will | |
| split on space, punctuation and digits, separating numbers in their | |
| individual digits:`,Ke,z,Le,se,xt=`As we saw in the <code>quicktour</code>, you can | |
| customize the pre-tokenizer of a <code>Tokenizer</code> by just changing the corresponding attribute:`,Pe,V,Oe,ne,Ft=`Of course, if you change the way the pre-tokenizer, you should probably | |
| retrain your tokenizer from scratch afterward.`,et,ae,tt,re,Dt=`Once the input texts are normalized and pre-tokenized, the | |
| <code>Tokenizer</code> applies the model on the | |
| pre-tokens. This is the part of the pipeline that needs training on your | |
| corpus (or that has been trained if you are using a pretrained | |
| tokenizer).`,lt,oe,Yt=`The role of the model is to split your “words” into tokens, using the | |
| rules it has learned. It’s also responsible for mapping those tokens to | |
| their corresponding IDs in the vocabulary of the model.`,st,pe,Kt=`This model is passed along when initializing the | |
| <code>Tokenizer</code> so you already know how to | |
| customize this part. Currently, the 🤗 Tokenizers library supports:`,nt,ie,Lt="<li><code>models.BPE</code></li> <li><code>models.Unigram</code></li> <li><code>models.WordLevel</code></li> <li><code>models.WordPiece</code></li>",at,ce,Pt=`For more details about each model and its behavior, you can check | |
| <a href="components#models">here</a>`,rt,ue,ot,Me,Ot=`Post-processing is the last step of the tokenization pipeline, to | |
| perform any additional transformation to the | |
| <code>Encoding</code> before it’s returned, like | |
| adding potential special tokens.`,pt,$e,el=`As we saw in the quick tour, we can customize the post processor of a | |
| <code>Tokenizer</code> by setting the | |
| corresponding attribute. For instance, here is how we can post-process | |
| to make the inputs suitable for the BERT model:`,it,Q,ct,ye,tl=`Note that contrarily to the pre-tokenizer or the normalizer, you don’t | |
| need to retrain a tokenizer after changing its post-processor.`,ut,me,Mt,fe,ll=`Let’s put all those pieces together to build a BERT tokenizer. First, | |
| BERT relies on WordPiece, so we instantiate a new | |
| <code>Tokenizer</code> with this model:`,$t,Z,yt,Je,sl=`Then we know that BERT preprocesses texts by removing accents and | |
| lowercasing. We also use a unicode normalizer:`,mt,N,ft,Ue,nl="The pre-tokenizer is just splitting on whitespace and punctuation:",Jt,S,Ut,de,al=`And the post-processing uses the template we saw in the previous | |
| section:`,dt,X,Tt,Te,rl=`We can use this tokenizer and train on it on wikitext like in the | |
| <code>quicktour</code>:`,jt,v,wt,je,ht,we,ol=`On top of encoding the input texts, a <code>Tokenizer</code> also has an API for decoding, that is converting IDs | |
| generated by your model back to a text. This is done by the methods | |
| <code>Tokenizer.decode</code> (for one predicted text) and <code>Tokenizer.decode_batch</code> (for a batch of predictions).`,kt,he,pl=`The <code>decoder</code> will first convert the IDs back to tokens | |
| (using the tokenizer’s vocabulary) and remove all special tokens, then | |
| join those tokens with spaces:`,gt,q,_t,ke,il=`If you used a model that added special characters to represent subtokens | |
| of a given “word” (like the <code>"##"</code> in | |
| WordPiece) you will need to customize the <code>decoder</code> to treat | |
| them properly. If we take our previous <code>bert_tokenizer</code> for instance the | |
| default decoding will give:`,It,B,bt,ge,cl="But by changing it to a proper decoder, we get:",Ct,R,zt,_e,Vt,be,Qt;return o=new Sl({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),W=new Ie({props:{title:"The tokenization pipeline",local:"the-tokenization-pipeline",headingTag:"h1"}}),g=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[Gl],rust:[Rl],python:[ql]},$$scope:{ctx:a}}}),x=new Ie({props:{title:"Normalization",local:"normalization",headingTag:"h2"}}),_=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[Dl],rust:[xl],python:[El]},$$scope:{ctx:a}}}),I=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[es],rust:[Pl],python:[Kl]},$$scope:{ctx:a}}}),b=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[rs],rust:[ns],python:[ls]},$$scope:{ctx:a}}}),P=new Ie({props:{title:"Pre-Tokenization",local:"pre-tokenization",headingTag:"h2"}}),C=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[Ms],rust:[cs],python:[ps]},$$scope:{ctx:a}}}),z=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[Us],rust:[fs],python:[ys]},$$scope:{ctx:a}}}),V=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[ks],rust:[ws],python:[Ts]},$$scope:{ctx:a}}}),ae=new Ie({props:{title:"Model",local:"model",headingTag:"h2"}}),ue=new Ie({props:{title:"Post-Processing",local:"post-processing",headingTag:"h2"}}),Q=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[zs],rust:[bs],python:[_s]},$$scope:{ctx:a}}}),me=new Ie({props:{title:"All together: a BERT tokenizer from scratch",local:"all-together-a-bert-tokenizer-from-scratch",headingTag:"h2"}}),Z=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[Xs],rust:[Ns],python:[Qs]},$$scope:{ctx:a}}}),N=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[Gs],rust:[Rs],python:[qs]},$$scope:{ctx:a}}}),S=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[Ds],rust:[xs],python:[Es]},$$scope:{ctx:a}}}),X=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[en],rust:[Ps],python:[Ks]},$$scope:{ctx:a}}}),v=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[rn],rust:[nn],python:[ln]},$$scope:{ctx:a}}}),je=new Ie({props:{title:"Decoding",local:"decoding",headingTag:"h2"}}),q=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[$n],rust:[un],python:[pn]},$$scope:{ctx:a}}}),B=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[dn],rust:[Jn],python:[mn]},$$scope:{ctx:a}}}),R=new k({props:{python:!0,rust:!0,node:!0,$$slots:{node:[gn],rust:[hn],python:[jn]},$$scope:{ctx:a}}}),_e=new Xl({props:{source:"https://github.com/huggingface/tokenizers/blob/main/docs/source-doc-builder/pipeline.mdx"}}),{c(){t=j("meta"),l=f(),e=j("p"),s=f(),p(o.$$.fragment),Ce=f(),p(W.$$.fragment),ze=f(),G=j("p"),G.innerHTML=Zt,Ve=f(),A=j("ul"),A.innerHTML=Nt,Qe=f(),E=j("p"),E.innerHTML=St,Ze=f(),H=j("p"),H.innerHTML=Xt,Ne=f(),p(g.$$.fragment),Se=f(),p(x.$$.fragment),Xe=f(),F=j("p"),F.innerHTML=vt,ve=f(),D=j("p"),D.innerHTML=qt,qe=f(),p(_.$$.fragment),Be=f(),Y=j("p"),Y.textContent=Bt,Re=f(),p(I.$$.fragment),We=f(),K=j("p"),K.innerHTML=Rt,Ge=f(),p(b.$$.fragment),Ae=f(),L=j("p"),L.textContent=Wt,Ee=f(),p(P.$$.fragment),He=f(),O=j("p"),O.textContent=Gt,xe=f(),ee=j("p"),ee.innerHTML=At,Fe=f(),p(C.$$.fragment),De=f(),te=j("p"),te.innerHTML=Et,Ye=f(),le=j("p"),le.innerHTML=Ht,Ke=f(),p(z.$$.fragment),Le=f(),se=j("p"),se.innerHTML=xt,Pe=f(),p(V.$$.fragment),Oe=f(),ne=j("p"),ne.textContent=Ft,et=f(),p(ae.$$.fragment),tt=f(),re=j("p"),re.innerHTML=Dt,lt=f(),oe=j("p"),oe.textContent=Yt,st=f(),pe=j("p"),pe.innerHTML=Kt,nt=f(),ie=j("ul"),ie.innerHTML=Lt,at=f(),ce=j("p"),ce.innerHTML=Pt,rt=f(),p(ue.$$.fragment),ot=f(),Me=j("p"),Me.innerHTML=Ot,pt=f(),$e=j("p"),$e.innerHTML=el,it=f(),p(Q.$$.fragment),ct=f(),ye=j("p"),ye.textContent=tl,ut=f(),p(me.$$.fragment),Mt=f(),fe=j("p"),fe.innerHTML=ll,$t=f(),p(Z.$$.fragment),yt=f(),Je=j("p"),Je.textContent=sl,mt=f(),p(N.$$.fragment),ft=f(),Ue=j("p"),Ue.textContent=nl,Jt=f(),p(S.$$.fragment),Ut=f(),de=j("p"),de.textContent=al,dt=f(),p(X.$$.fragment),Tt=f(),Te=j("p"),Te.innerHTML=rl,jt=f(),p(v.$$.fragment),wt=f(),p(je.$$.fragment),ht=f(),we=j("p"),we.innerHTML=ol,kt=f(),he=j("p"),he.innerHTML=pl,gt=f(),p(q.$$.fragment),_t=f(),ke=j("p"),ke.innerHTML=il,It=f(),p(B.$$.fragment),bt=f(),ge=j("p"),ge.textContent=cl,Ct=f(),p(R.$$.fragment),zt=f(),p(_e.$$.fragment),Vt=f(),be=j("p"),this.h()},l(n){const r=Zl("svelte-u9bgzb",document.head);t=w(r,"META",{name:!0,content:!0}),r.forEach(y),l=J(n),e=w(n,"P",{}),Il(e).forEach(y),s=J(n),i(o.$$.fragment,n),Ce=J(n),i(W.$$.fragment,n),ze=J(n),G=w(n,"P",{"data-svelte-h":!0}),h(G)!=="svelte-jozml5"&&(G.innerHTML=Zt),Ve=J(n),A=w(n,"UL",{"data-svelte-h":!0}),h(A)!=="svelte-1rz3n8j"&&(A.innerHTML=Nt),Qe=J(n),E=w(n,"P",{"data-svelte-h":!0}),h(E)!=="svelte-1rtvet7"&&(E.innerHTML=St),Ze=J(n),H=w(n,"P",{"data-svelte-h":!0}),h(H)!=="svelte-8zxp30"&&(H.innerHTML=Xt),Ne=J(n),i(g.$$.fragment,n),Se=J(n),i(x.$$.fragment,n),Xe=J(n),F=w(n,"P",{"data-svelte-h":!0}),h(F)!=="svelte-qddrd9"&&(F.innerHTML=vt),ve=J(n),D=w(n,"P",{"data-svelte-h":!0}),h(D)!=="svelte-u21ffr"&&(D.innerHTML=qt),qe=J(n),i(_.$$.fragment,n),Be=J(n),Y=w(n,"P",{"data-svelte-h":!0}),h(Y)!=="svelte-11fxjh0"&&(Y.textContent=Bt),Re=J(n),i(I.$$.fragment,n),We=J(n),K=w(n,"P",{"data-svelte-h":!0}),h(K)!=="svelte-ph5t46"&&(K.innerHTML=Rt),Ge=J(n),i(b.$$.fragment,n),Ae=J(n),L=w(n,"P",{"data-svelte-h":!0}),h(L)!=="svelte-c5if5y"&&(L.textContent=Wt),Ee=J(n),i(P.$$.fragment,n),He=J(n),O=w(n,"P",{"data-svelte-h":!0}),h(O)!=="svelte-17w2pja"&&(O.textContent=Gt),xe=J(n),ee=w(n,"P",{"data-svelte-h":!0}),h(ee)!=="svelte-8vbxhw"&&(ee.innerHTML=At),Fe=J(n),i(C.$$.fragment,n),De=J(n),te=w(n,"P",{"data-svelte-h":!0}),h(te)!=="svelte-rrvwxs"&&(te.innerHTML=Et),Ye=J(n),le=w(n,"P",{"data-svelte-h":!0}),h(le)!=="svelte-1sxj246"&&(le.innerHTML=Ht),Ke=J(n),i(z.$$.fragment,n),Le=J(n),se=w(n,"P",{"data-svelte-h":!0}),h(se)!=="svelte-1xkybhv"&&(se.innerHTML=xt),Pe=J(n),i(V.$$.fragment,n),Oe=J(n),ne=w(n,"P",{"data-svelte-h":!0}),h(ne)!=="svelte-1hbd718"&&(ne.textContent=Ft),et=J(n),i(ae.$$.fragment,n),tt=J(n),re=w(n,"P",{"data-svelte-h":!0}),h(re)!=="svelte-1d5s5ai"&&(re.innerHTML=Dt),lt=J(n),oe=w(n,"P",{"data-svelte-h":!0}),h(oe)!=="svelte-1776i4x"&&(oe.textContent=Yt),st=J(n),pe=w(n,"P",{"data-svelte-h":!0}),h(pe)!=="svelte-1ssp78q"&&(pe.innerHTML=Kt),nt=J(n),ie=w(n,"UL",{"data-svelte-h":!0}),h(ie)!=="svelte-17p2ane"&&(ie.innerHTML=Lt),at=J(n),ce=w(n,"P",{"data-svelte-h":!0}),h(ce)!=="svelte-vo0orx"&&(ce.innerHTML=Pt),rt=J(n),i(ue.$$.fragment,n),ot=J(n),Me=w(n,"P",{"data-svelte-h":!0}),h(Me)!=="svelte-bq6cd6"&&(Me.innerHTML=Ot),pt=J(n),$e=w(n,"P",{"data-svelte-h":!0}),h($e)!=="svelte-1rbxjab"&&($e.innerHTML=el),it=J(n),i(Q.$$.fragment,n),ct=J(n),ye=w(n,"P",{"data-svelte-h":!0}),h(ye)!=="svelte-1k7gzcb"&&(ye.textContent=tl),ut=J(n),i(me.$$.fragment,n),Mt=J(n),fe=w(n,"P",{"data-svelte-h":!0}),h(fe)!=="svelte-xj0mrt"&&(fe.innerHTML=ll),$t=J(n),i(Z.$$.fragment,n),yt=J(n),Je=w(n,"P",{"data-svelte-h":!0}),h(Je)!=="svelte-15y7mhn"&&(Je.textContent=sl),mt=J(n),i(N.$$.fragment,n),ft=J(n),Ue=w(n,"P",{"data-svelte-h":!0}),h(Ue)!=="svelte-ifp6a5"&&(Ue.textContent=nl),Jt=J(n),i(S.$$.fragment,n),Ut=J(n),de=w(n,"P",{"data-svelte-h":!0}),h(de)!=="svelte-1jpbo92"&&(de.textContent=al),dt=J(n),i(X.$$.fragment,n),Tt=J(n),Te=w(n,"P",{"data-svelte-h":!0}),h(Te)!=="svelte-v2snrq"&&(Te.innerHTML=rl),jt=J(n),i(v.$$.fragment,n),wt=J(n),i(je.$$.fragment,n),ht=J(n),we=w(n,"P",{"data-svelte-h":!0}),h(we)!=="svelte-ppu6m1"&&(we.innerHTML=ol),kt=J(n),he=w(n,"P",{"data-svelte-h":!0}),h(he)!=="svelte-16ah0w1"&&(he.innerHTML=pl),gt=J(n),i(q.$$.fragment,n),_t=J(n),ke=w(n,"P",{"data-svelte-h":!0}),h(ke)!=="svelte-19gaglf"&&(ke.innerHTML=il),It=J(n),i(B.$$.fragment,n),bt=J(n),ge=w(n,"P",{"data-svelte-h":!0}),h(ge)!=="svelte-la4qmm"&&(ge.textContent=cl),Ct=J(n),i(R.$$.fragment,n),zt=J(n),i(_e.$$.fragment,n),Vt=J(n),be=w(n,"P",{}),Il(be).forEach(y),this.h()},h(){bl(t,"name","hf:doc:metadata"),bl(t,"content",In)},m(n,r){Nl(document.head,t),m(n,l,r),m(n,e,r),m(n,s,r),c(o,n,r),m(n,Ce,r),c(W,n,r),m(n,ze,r),m(n,G,r),m(n,Ve,r),m(n,A,r),m(n,Qe,r),m(n,E,r),m(n,Ze,r),m(n,H,r),m(n,Ne,r),c(g,n,r),m(n,Se,r),c(x,n,r),m(n,Xe,r),m(n,F,r),m(n,ve,r),m(n,D,r),m(n,qe,r),c(_,n,r),m(n,Be,r),m(n,Y,r),m(n,Re,r),c(I,n,r),m(n,We,r),m(n,K,r),m(n,Ge,r),c(b,n,r),m(n,Ae,r),m(n,L,r),m(n,Ee,r),c(P,n,r),m(n,He,r),m(n,O,r),m(n,xe,r),m(n,ee,r),m(n,Fe,r),c(C,n,r),m(n,De,r),m(n,te,r),m(n,Ye,r),m(n,le,r),m(n,Ke,r),c(z,n,r),m(n,Le,r),m(n,se,r),m(n,Pe,r),c(V,n,r),m(n,Oe,r),m(n,ne,r),m(n,et,r),c(ae,n,r),m(n,tt,r),m(n,re,r),m(n,lt,r),m(n,oe,r),m(n,st,r),m(n,pe,r),m(n,nt,r),m(n,ie,r),m(n,at,r),m(n,ce,r),m(n,rt,r),c(ue,n,r),m(n,ot,r),m(n,Me,r),m(n,pt,r),m(n,$e,r),m(n,it,r),c(Q,n,r),m(n,ct,r),m(n,ye,r),m(n,ut,r),c(me,n,r),m(n,Mt,r),m(n,fe,r),m(n,$t,r),c(Z,n,r),m(n,yt,r),m(n,Je,r),m(n,mt,r),c(N,n,r),m(n,ft,r),m(n,Ue,r),m(n,Jt,r),c(S,n,r),m(n,Ut,r),m(n,de,r),m(n,dt,r),c(X,n,r),m(n,Tt,r),m(n,Te,r),m(n,jt,r),c(v,n,r),m(n,wt,r),c(je,n,r),m(n,ht,r),m(n,we,r),m(n,kt,r),m(n,he,r),m(n,gt,r),c(q,n,r),m(n,_t,r),m(n,ke,r),m(n,It,r),c(B,n,r),m(n,bt,r),m(n,ge,r),m(n,Ct,r),c(R,n,r),m(n,zt,r),c(_e,n,r),m(n,Vt,r),m(n,be,r),Qt=!0},p(n,[r]){const ul={};r&2&&(ul.$$scope={dirty:r,ctx:n}),g.$set(ul);const Ml={};r&2&&(Ml.$$scope={dirty:r,ctx:n}),_.$set(Ml);const $l={};r&2&&($l.$$scope={dirty:r,ctx:n}),I.$set($l);const yl={};r&2&&(yl.$$scope={dirty:r,ctx:n}),b.$set(yl);const ml={};r&2&&(ml.$$scope={dirty:r,ctx:n}),C.$set(ml);const fl={};r&2&&(fl.$$scope={dirty:r,ctx:n}),z.$set(fl);const Jl={};r&2&&(Jl.$$scope={dirty:r,ctx:n}),V.$set(Jl);const Ul={};r&2&&(Ul.$$scope={dirty:r,ctx:n}),Q.$set(Ul);const dl={};r&2&&(dl.$$scope={dirty:r,ctx:n}),Z.$set(dl);const Tl={};r&2&&(Tl.$$scope={dirty:r,ctx:n}),N.$set(Tl);const jl={};r&2&&(jl.$$scope={dirty:r,ctx:n}),S.$set(jl);const wl={};r&2&&(wl.$$scope={dirty:r,ctx:n}),X.$set(wl);const hl={};r&2&&(hl.$$scope={dirty:r,ctx:n}),v.$set(hl);const kl={};r&2&&(kl.$$scope={dirty:r,ctx:n}),q.$set(kl);const gl={};r&2&&(gl.$$scope={dirty:r,ctx:n}),B.$set(gl);const _l={};r&2&&(_l.$$scope={dirty:r,ctx:n}),R.$set(_l)},i(n){Qt||(u(o.$$.fragment,n),u(W.$$.fragment,n),u(g.$$.fragment,n),u(x.$$.fragment,n),u(_.$$.fragment,n),u(I.$$.fragment,n),u(b.$$.fragment,n),u(P.$$.fragment,n),u(C.$$.fragment,n),u(z.$$.fragment,n),u(V.$$.fragment,n),u(ae.$$.fragment,n),u(ue.$$.fragment,n),u(Q.$$.fragment,n),u(me.$$.fragment,n),u(Z.$$.fragment,n),u(N.$$.fragment,n),u(S.$$.fragment,n),u(X.$$.fragment,n),u(v.$$.fragment,n),u(je.$$.fragment,n),u(q.$$.fragment,n),u(B.$$.fragment,n),u(R.$$.fragment,n),u(_e.$$.fragment,n),Qt=!0)},o(n){M(o.$$.fragment,n),M(W.$$.fragment,n),M(g.$$.fragment,n),M(x.$$.fragment,n),M(_.$$.fragment,n),M(I.$$.fragment,n),M(b.$$.fragment,n),M(P.$$.fragment,n),M(C.$$.fragment,n),M(z.$$.fragment,n),M(V.$$.fragment,n),M(ae.$$.fragment,n),M(ue.$$.fragment,n),M(Q.$$.fragment,n),M(me.$$.fragment,n),M(Z.$$.fragment,n),M(N.$$.fragment,n),M(S.$$.fragment,n),M(X.$$.fragment,n),M(v.$$.fragment,n),M(je.$$.fragment,n),M(q.$$.fragment,n),M(B.$$.fragment,n),M(R.$$.fragment,n),M(_e.$$.fragment,n),Qt=!1},d(n){n&&(y(l),y(e),y(s),y(Ce),y(ze),y(G),y(Ve),y(A),y(Qe),y(E),y(Ze),y(H),y(Ne),y(Se),y(Xe),y(F),y(ve),y(D),y(qe),y(Be),y(Y),y(Re),y(We),y(K),y(Ge),y(Ae),y(L),y(Ee),y(He),y(O),y(xe),y(ee),y(Fe),y(De),y(te),y(Ye),y(le),y(Ke),y(Le),y(se),y(Pe),y(Oe),y(ne),y(et),y(tt),y(re),y(lt),y(oe),y(st),y(pe),y(nt),y(ie),y(at),y(ce),y(rt),y(ot),y(Me),y(pt),y($e),y(it),y(ct),y(ye),y(ut),y(Mt),y(fe),y($t),y(yt),y(Je),y(mt),y(ft),y(Ue),y(Jt),y(Ut),y(de),y(dt),y(Tt),y(Te),y(jt),y(wt),y(ht),y(we),y(kt),y(he),y(gt),y(_t),y(ke),y(It),y(bt),y(ge),y(Ct),y(zt),y(Vt),y(be)),y(t),$(o,n),$(W,n),$(g,n),$(x,n),$(_,n),$(I,n),$(b,n),$(P,n),$(C,n),$(z,n),$(V,n),$(ae,n),$(ue,n),$(Q,n),$(me,n),$(Z,n),$(N,n),$(S,n),$(X,n),$(v,n),$(je,n),$(q,n),$(B,n),$(R,n),$(_e,n)}}}const In='{"title":"The tokenization pipeline","local":"the-tokenization-pipeline","sections":[{"title":"Normalization","local":"normalization","sections":[],"depth":2},{"title":"Pre-Tokenization","local":"pre-tokenization","sections":[],"depth":2},{"title":"Model","local":"model","sections":[],"depth":2},{"title":"Post-Processing","local":"post-processing","sections":[],"depth":2},{"title":"All together: a BERT tokenizer from scratch","local":"all-together-a-bert-tokenizer-from-scratch","sections":[],"depth":2},{"title":"Decoding","local":"decoding","sections":[],"depth":2}],"depth":1}';function bn(a){return zl(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Nn extends Vl{constructor(t){super(),Ql(this,t,bn,_n,Cl,{})}}export{Nn as component}; | |
Xet Storage Details
- Size:
- 97.3 kB
- Xet hash:
- 7824c15ba1e1a358efa525e830fd68899f499c9ee79bf3407224d6fea6743489
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.