Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / tokenizers /pr_2011 /en /_app /immutable /nodes /2.da15eae0.js

rtrm's picture

about 2 months ago

10.4 kB

	/*
	 * Compiled Svelte page module for the "Added Tokens" API page of the
	 * tokenizers documentation.
	 *
	 * NOTE(review): every helper imported from "../chunks/*" is a minified alias
	 * whose implementation is not visible from this file. Comments that describe
	 * what a helper does are therefore hedged and based only on how it is called
	 * here (they look like the usual Svelte runtime helpers — confirm against the
	 * chunk sources).
	 *
	 * Each fragment factory below returns an object with the same set of method
	 * names: c, l, h (optional), m, p, i, o, d.
	 */
	import { s as pe, o as fe, n as se } from "../chunks/scheduler.7c59faff.js";
	import {
		S as ue,
		i as he,
		e as u,
		s as m,
		c as $,
		h as $e,
		a as h,
		d as i,
		b as p,
		f as F,
		g,
		j as E,
		k as d,
		l as x,
		m as k,
		t as w,
		n as _,
		o as v,
		p as S
	} from "../chunks/index.09bb5655.js";
	import { C as ge, H as me, E as ke } from "../chunks/MermaidChart.svelte_svelte_type_style_lang.6b0e673b.js";
	import { D as U } from "../chunks/Docstring.9d82a444.js";
	import { T as we, M as ae } from "../chunks/TokenizersLanguageContent.0fc17a7a.js";

	/**
	 * Fragment for the Python tab: the `AddedToken` heading, its class docstring
	 * and one nested docstring box per property (content, lstrip, normalized,
	 * rstrip, single_word).
	 *
	 * @param {*} T - Context array handed over by the caller; not read in this fragment.
	 * @returns {object} Fragment object exposing c/l/h/m/p/i/o/d.
	 */
	function _e(T) {
		let t, r, e, o, l, z,
			M = `Represents a token that can be be added to a <a href="/docs/tokenizers/pr_2011/en/api/tokenizer#tokenizers.Tokenizer">Tokenizer</a>.
	It can have special options that defines the way it should behave.`,
			D, f, b, y, A,
			L = "Get the content of this <code>AddedToken</code>",
			H, n, s, R, j,
			de = "Get the value of the <code>lstrip</code> option",
			Z, P, G, ee, K,
			ie = "Get the value of the <code>normalized</code> option",
			te, I, q, ne, N,
			le = "Get the value of the <code>rstrip</code> option",
			oe, C, V, re, O,
			ce = "Get the value of the <code>single_word</code> option",
			Y;

		// The same class list is applied to the outer box and to every nested box.
		const docstringClass = "docstring border-l-2 border-t-2 pl-4 pt-3.5 border-gray-100 rounded-tl-xl mb-6 mt-8";

		// Section heading.
		t = new me({
			props: { title: "AddedToken", local: "tokenizers.AddedToken", headingTag: "h2" }
		});

		// Class signature plus per-parameter descriptions.
		o = new U({
			props: {
				name: "class tokenizers.AddedToken",
				anchor: "tokenizers.AddedToken",
				parameters: [
					{ name: "content", val: " = None" },
					{ name: "single_word", val: " = False" },
					{ name: "lstrip", val: " = False" },
					{ name: "rstrip", val: " = False" },
					{ name: "normalized", val: " = True" },
					{ name: "special", val: " = False" }
				],
				parametersDescription: [
					{
						anchor: "tokenizers.AddedToken.content",
						description: "<strong>content</strong> (<code>str</code>) — The content of the token",
						name: "content"
					},
					{
						anchor: "tokenizers.AddedToken.single_word",
						description: `<strong>single_word</strong> (<code>bool</code>, defaults to <code>False</code>) —
	Defines whether this token should only match single words. If <code>True</code>, this
	token will never match inside of a word. For example the token <code>ing</code> would match
	on <code>tokenizing</code> if this option is <code>False</code>, but not if it is <code>True</code>.
	The notion of ”<em>inside of a word</em>” is defined by the word boundaries pattern in
	regular expressions (ie. the token should start and end with word boundaries).`,
						name: "single_word"
					},
					{
						anchor: "tokenizers.AddedToken.lstrip",
						description: `<strong>lstrip</strong> (<code>bool</code>, defaults to <code>False</code>) —
	Defines whether this token should strip all potential whitespaces on its left side.
	If <code>True</code>, this token will greedily match any whitespace on its left. For
	example if we try to match the token <code>[MASK]</code> with <code>lstrip=True</code>, in the text
	<code>"I saw a [MASK]"</code>, we would match on <code>" [MASK]"</code>. (Note the space on the left).`,
						name: "lstrip"
					},
					{
						anchor: "tokenizers.AddedToken.rstrip",
						description: `<strong>rstrip</strong> (<code>bool</code>, defaults to <code>False</code>) —
	Defines whether this token should strip all potential whitespaces on its right
	side. If <code>True</code>, this token will greedily match any whitespace on its right.
	It works just like <code>lstrip</code> but on the right.`,
						name: "rstrip"
					},
					{
						anchor: "tokenizers.AddedToken.normalized",
						description: `<strong>normalized</strong> (<code>bool</code>, defaults to <code>True</code> with —meth:<em>~tokenizers.Tokenizer.add_tokens</em> and <code>False</code> with <code>add_special_tokens()</code>):
	Defines whether this token should match against the normalized version of the input
	text. For example, with the added token <code>"yesterday"</code>, and a normalizer in charge of
	lowercasing the text, the token could be extract from the input <code>"I saw a lion Yesterday"</code>.`,
						name: "normalized"
					},
					{
						anchor: "tokenizers.AddedToken.special",
						description: `<strong>special</strong> (<code>bool</code>, defaults to <code>False</code> with —meth:<em>~tokenizers.Tokenizer.add_tokens</em> and <code>False</code> with <code>add_special_tokens()</code>):
	Defines whether this token should be skipped when decoding.`,
						name: "special"
					}
				]
			}
		});

		// One get/set descriptor entry per property.
		b = new U({
			props: { name: "content", anchor: "tokenizers.AddedToken.content", parameters: [], isGetSetDescriptor: true }
		});
		s = new U({
			props: { name: "lstrip", anchor: "tokenizers.AddedToken.lstrip", parameters: [], isGetSetDescriptor: true }
		});
		G = new U({
			props: { name: "normalized", anchor: "tokenizers.AddedToken.normalized", parameters: [], isGetSetDescriptor: true }
		});
		q = new U({
			props: { name: "rstrip", anchor: "tokenizers.AddedToken.rstrip", parameters: [], isGetSetDescriptor: true }
		});
		V = new U({
			props: { name: "single_word", anchor: "tokenizers.AddedToken.single_word", parameters: [], isGetSetDescriptor: true }
		});

		return {
			// Builds all nodes and child fragments from scratch, then applies attributes.
			c() {
				$(t.$$.fragment);
				r = m();
				e = u("div");
				$(o.$$.fragment);
				l = m();
				z = u("p");
				z.innerHTML = M;
				D = m();
				f = u("div");
				$(b.$$.fragment);
				y = m();
				A = u("p");
				A.innerHTML = L;
				H = m();
				n = u("div");
				$(s.$$.fragment);
				R = m();
				j = u("p");
				j.innerHTML = de;
				Z = m();
				P = u("div");
				$(G.$$.fragment);
				ee = m();
				K = u("p");
				K.innerHTML = ie;
				te = m();
				I = u("div");
				$(q.$$.fragment);
				ne = m();
				N = u("p");
				N.innerHTML = le;
				oe = m();
				C = u("div");
				$(V.$$.fragment);
				re = m();
				O = u("p");
				O.innerHTML = ce;
				this.h();
			},
			// Same tree as c(), but built from the node list `a` (presumably the
			// hydration/claim path — confirm against the runtime chunk). A paragraph's
			// HTML is only rewritten when S(node) does not return the expected hash.
			l(a) {
				g(t.$$.fragment, a);
				r = p(a);
				e = h(a, "DIV", { class: true });
				const c = F(e);
				g(o.$$.fragment, c);
				l = p(c);
				z = h(c, "P", { "data-svelte-h": true });
				if (S(z) !== "svelte-18q5wc") {
					z.innerHTML = M;
				}
				D = p(c);
				f = h(c, "DIV", { class: true });
				const B = F(f);
				g(b.$$.fragment, B);
				y = p(B);
				A = h(B, "P", { "data-svelte-h": true });
				if (S(A) !== "svelte-11ic05p") {
					A.innerHTML = L;
				}
				B.forEach(i);
				H = p(c);
				n = h(c, "DIV", { class: true });
				const J = F(n);
				g(s.$$.fragment, J);
				R = p(J);
				j = h(J, "P", { "data-svelte-h": true });
				if (S(j) !== "svelte-1ysl9wg") {
					j.innerHTML = de;
				}
				J.forEach(i);
				Z = p(c);
				P = h(c, "DIV", { class: true });
				const Q = F(P);
				g(G.$$.fragment, Q);
				ee = p(Q);
				K = h(Q, "P", { "data-svelte-h": true });
				if (S(K) !== "svelte-1rc21rb") {
					K.innerHTML = ie;
				}
				Q.forEach(i);
				te = p(c);
				I = h(c, "DIV", { class: true });
				const W = F(I);
				g(q.$$.fragment, W);
				ne = p(W);
				N = h(W, "P", { "data-svelte-h": true });
				if (S(N) !== "svelte-wndfu2") {
					N.innerHTML = le;
				}
				W.forEach(i);
				oe = p(c);
				C = h(c, "DIV", { class: true });
				const X = F(C);
				g(V.$$.fragment, X);
				re = p(X);
				O = h(X, "P", { "data-svelte-h": true });
				if (S(O) !== "svelte-2k02jz") {
					O.innerHTML = ce;
				}
				X.forEach(i);
				c.forEach(i);
				this.h();
			},
			// Applies the shared class list; order kept as in the compiled output.
			h() {
				for (const box of [f, n, P, I, C, e]) {
					E(box, "class", docstringClass);
				}
			},
			// Places the heading and the outer box at (a, c), then fills the box.
			m(a, c) {
				k(t, a, c);
				x(a, r, c);
				x(a, e, c);
				k(o, e, null);
				d(e, l);
				d(e, z);
				d(e, D);
				d(e, f);
				k(b, f, null);
				d(f, y);
				d(f, A);
				d(e, H);
				d(e, n);
				k(s, n, null);
				d(n, R);
				d(n, j);
				d(e, Z);
				d(e, P);
				k(G, P, null);
				d(P, ee);
				d(P, K);
				d(e, te);
				d(e, I);
				k(q, I, null);
				d(I, ne);
				d(I, N);
				d(e, oe);
				d(e, C);
				k(V, C, null);
				d(C, re);
				d(C, O);
				Y = true;
			},
			p: se,
			// Runs once per "in" cycle; the Y flag guards against repeated calls.
			i(a) {
				if (Y) {
					return;
				}
				w(t.$$.fragment, a);
				w(o.$$.fragment, a);
				w(b.$$.fragment, a);
				w(s.$$.fragment, a);
				w(G.$$.fragment, a);
				w(q.$$.fragment, a);
				w(V.$$.fragment, a);
				Y = true;
			},
			o(a) {
				_(t.$$.fragment, a);
				_(o.$$.fragment, a);
				_(b.$$.fragment, a);
				_(s.$$.fragment, a);
				_(G.$$.fragment, a);
				_(q.$$.fragment, a);
				_(V.$$.fragment, a);
				Y = false;
			},
			// Top-level nodes are only removed when `a` is truthy; child components
			// are always torn down.
			d(a) {
				if (a) {
					i(r);
					i(e);
				}
				v(t, a);
				v(o);
				v(b);
				v(s);
				v(G);
				v(q);
				v(V);
			}
		};
	}

	/**
	 * Shared implementation of the three wrapper fragments (ve, be, ze): each one
	 * instantiates the `ae` component with a single default slot.
	 *
	 * @param {*} T - Context forwarded as `$$scope.ctx`.
	 * @param {Function} slotFactory - Fragment factory used for the default slot.
	 * @returns {object} Fragment object exposing c/l/m/p/i/o/d.
	 */
	function createSlotHost(T, slotFactory) {
		let r;
		const t = new ae({
			props: { $$slots: { default: [slotFactory] }, $$scope: { ctx: T } }
		});

		return {
			c() {
				$(t.$$.fragment);
			},
			l(e) {
				g(t.$$.fragment, e);
			},
			m(e, o) {
				k(t, e, o);
				r = true;
			},
			// Only forwards a new $$scope when bit 1 of the dirty mask `o` is set.
			p(e, o) {
				const l = {};
				if (o & 2) {
					l.$$scope = { dirty: o, ctx: e };
				}
				t.$set(l);
			},
			i(e) {
				if (r) {
					return;
				}
				w(t.$$.fragment, e);
				r = true;
			},
			o(e) {
				_(t.$$.fragment, e);
				r = false;
			},
			d(e) {
				v(t, e);
			}
		};
	}

	/** Python slot: hosts the AddedToken documentation fragment. */
	function ve(T) {
		return createSlotHost(T, _e);
	}

	/**
	 * Static paragraph pointing to the Rust API reference on Docs.rs.
	 *
	 * @param {*} T - Context; not read in this fragment.
	 * @returns {object} Fragment object exposing c/l/m/p/d.
	 */
	function Te(T) {
		let t;
		// The scraped source had a raw line break inside this literal; it is one line.
		const r = 'The Rust API Reference is available directly on the <a href="https://docs.rs/tokenizers/latest/tokenizers/" rel="nofollow">Docs.rs</a> website.';

		return {
			c() {
				t = u("p");
				t.innerHTML = r;
			},
			l(e) {
				t = h(e, "P", { "data-svelte-h": true });
				if (S(t) !== "svelte-4ytcyb") {
					t.innerHTML = r;
				}
			},
			m(e, o) {
				x(e, t, o);
			},
			p: se,
			d(e) {
				if (e) {
					i(t);
				}
			}
		};
	}

	/** Rust slot: hosts the Docs.rs pointer paragraph. */
	function be(T) {
		return createSlotHost(T, Te);
	}

	/**
	 * Static paragraph stating that the node API is undocumented.
	 *
	 * @param {*} T - Context; not read in this fragment.
	 * @returns {object} Fragment object exposing c/l/m/p/d.
	 */
	function xe(T) {
		let t;
		const r = "The node API has not been documented yet.";

		return {
			c() {
				t = u("p");
				t.textContent = r;
			},
			l(e) {
				t = h(e, "P", { "data-svelte-h": true });
				if (S(t) !== "svelte-1mrchm6") {
					t.textContent = r;
				}
			},
			m(e, o) {
				x(e, t, o);
			},
			p: se,
			d(e) {
				if (e) {
					i(t);
				}
			}
		};
	}

	/** Node slot: hosts the "not documented yet" paragraph. */
	function ze(T) {
		return createSlotHost(T, xe);
	}

	/**
	 * Root fragment of the page: a metadata <meta> tag placed in document.head,
	 * followed by the `ge` component, the "Added Tokens" h1 heading, the
	 * per-language content switcher and the `ke` component that receives the
	 * source .mdx URL.
	 *
	 * @param {*} T - Component context forwarded to the slots as `$$scope.ctx`.
	 * @returns {object} Fragment object exposing c/l/h/m/p/i/o/d.
	 */
	function ye(T) {
		let t, r, e, o, l, z, M, D, f, b, y, A, L, H;

		l = new ge({
			props: {
				containerStyle: "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"
			}
		});
		M = new me({
			props: { title: "Added Tokens", local: "added-tokens", headingTag: "h1" }
		});
		f = new we({
			props: {
				python: true,
				rust: true,
				node: true,
				$$slots: { node: [ze], rust: [be], python: [ve] },
				$$scope: { ctx: T }
			}
		});
		y = new ke({
			props: {
				source: "https://github.com/huggingface/tokenizers/blob/main/docs/source-doc-builder/api/added-tokens.mdx"
			}
		});

		return {
			c() {
				t = u("meta");
				r = m();
				e = u("p");
				o = m();
				$(l.$$.fragment);
				z = m();
				$(M.$$.fragment);
				D = m();
				$(f.$$.fragment);
				b = m();
				$(y.$$.fragment);
				A = m();
				L = u("p");
				this.h();
			},
			l(n) {
				// The meta tag is looked up through $e(..., document.head) rather than in `n`.
				const s = $e("svelte-u9bgzb", document.head);
				t = h(s, "META", { name: true, content: true });
				s.forEach(i);
				r = p(n);
				e = h(n, "P", {});
				F(e).forEach(i);
				o = p(n);
				g(l.$$.fragment, n);
				z = p(n);
				g(M.$$.fragment, n);
				D = p(n);
				g(f.$$.fragment, n);
				b = p(n);
				g(y.$$.fragment, n);
				A = p(n);
				L = h(n, "P", {});
				F(L).forEach(i);
				this.h();
			},
			h() {
				E(t, "name", "hf:doc:metadata");
				E(t, "content", Ae);
			},
			m(n, s) {
				d(document.head, t);
				x(n, r, s);
				x(n, e, s);
				x(n, o, s);
				k(l, n, s);
				x(n, z, s);
				k(M, n, s);
				x(n, D, s);
				k(f, n, s);
				x(n, b, s);
				k(y, n, s);
				x(n, A, s);
				x(n, L, s);
				H = true;
			},
			// Only the language switcher receives scope updates (dirty bit 1).
			p(n, [s]) {
				const R = {};
				if (s & 2) {
					R.$$scope = { dirty: s, ctx: n };
				}
				f.$set(R);
			},
			i(n) {
				if (H) {
					return;
				}
				w(l.$$.fragment, n);
				w(M.$$.fragment, n);
				w(f.$$.fragment, n);
				w(y.$$.fragment, n);
				H = true;
			},
			o(n) {
				_(l.$$.fragment, n);
				_(M.$$.fragment, n);
				_(f.$$.fragment, n);
				_(y.$$.fragment, n);
				H = false;
			},
			// The head <meta> is removed unconditionally; body nodes only when `n` is truthy.
			d(n) {
				if (n) {
					i(r);
					i(e);
					i(o);
					i(z);
					i(D);
					i(b);
					i(A);
					i(L);
				}
				i(t);
				v(l, n);
				v(M, n);
				v(f, n);
				v(y, n);
			}
		};
	}

	// JSON written verbatim into the "hf:doc:metadata" meta tag by ye().h().
	const Ae = '{"title":"Added Tokens","local":"added-tokens","sections":[{"title":"AddedToken","local":"tokenizers.AddedToken","sections":[],"depth":2}],"depth":1}';

	/**
	 * Instance function passed to `he`. Registers a callback through `fe` that
	 * reads the "fw" query parameter and discards the result, then returns an
	 * empty context array.
	 *
	 * @param {*} T - Unused.
	 * @returns {Array} Always an empty array.
	 */
	function Me(T) {
		fe(() => {
			new URLSearchParams(window.location.search).get("fw");
		});
		return [];
	}

	/** Page component wiring Me (instance) and ye (fragment) together via `he`. */
	class Ce extends ue {
		constructor(t) {
			super();
			he(this, t, Me, ye, pe, {});
		}
	}

	export { Ce as component };

Xet Storage Details

Size: 10.4 kB
Xet hash: c863ae473f7b4d253790211abbcd6cf1771afd01145aedefccec3cff0d53fe40

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.