Buckets:

rtrm's picture
download
raw
9.45 kB
/**
 * Page module for the Japanese "Use tokenizers from 🤗 Tokenizers" documentation page.
 *
 * NOTE(review): the `svelte-…` hashes and `data-svelte-h` attributes suggest this is
 * minified Svelte compiler output. Presumably it should be regenerated from the Markdown
 * source referenced in the `ue` component props (docs/source/ja/fast_tokenizers.md)
 * rather than edited by hand — confirm against the docs build.
 *
 * What the module contains:
 * - `he(te)` returns the fragment object for the page. It instantiates three heading
 *   components (`ee`), four code-block components (`D`) and one `ue` component carrying
 *   the GitHub source URL, and exposes these methods:
 *     `c()`    builds the nodes with `r(...)` / `n()` and assigns the static
 *              innerHTML / textContent strings;
 *     `l(e)`   builds the same structure from an existing parent via `i(...)` / `a(...)`,
 *              re-assigning a paragraph's content only when `f(node)` differs from the
 *              expected `svelte-…` hash;
 *     `h()`    passes "hf:doc:metadata" and `$e` to `fe` for the meta element's
 *              `name` / `content`;
 *     `m(e,t)` adds the meta element to `document.head` via `ye` and inserts the
 *              remaining nodes/components with `l(e, node, t)` / `c(component, e, t)`;
 *     `p`      is the imported `Me`;
 *     `i(e)` / `o(e)` call `M` / `g` on each child component's fragment, with the local
 *              flag `O` tracking the current state;
 *     `d(e)`   calls `s(node)` on the created nodes (most only when `e` is truthy) and
 *              `k(component, e)` on each child component.
 *   The helper semantics (`r`, `n`, `i`, `a`, `s`, `l`, ...) live in ../chunks/index and
 *   are not visible here; presumably these are the create / claim / mount / destroy phases.
 * - `$e` is a JSON string holding the page title, its anchor, and the two depth-2 sections.
 * - `Te(te)` registers a callback through `ge` that reads the `fw` query parameter from
 *   `window.location.search` and discards the value; it returns an empty array.
 * - `Ze` extends `ke`; its constructor calls `de(this, m, Te, he, ce, {})`. It is exported
 *   under the name `component`.
 *
 * The `code` props look like base64 of URL-encoded snippet text and the `highlighted`
 * props hold pre-rendered HTML. Both, together with the Japanese paragraph strings, are
 * runtime data: the newlines inside the template literals are part of the strings, so
 * do not reflow these lines.
 */
import{s as ce,n as Me,o as ge}from"../chunks/scheduler.9bc65507.js";import{S as ke,i as de,g as r,s as n,r as p,A as je,h as i,f as s,c as a,j as oe,u as o,x as f,k as fe,y as ye,a as l,v as c,d as M,t as g,w as k}from"../chunks/index.707bf1b6.js";import{C as D}from"../chunks/CodeBlock.54a9f38d.js";import{H as ee,E as ue}from"../chunks/EditOnGithub.922df6ba.js";function he(te){let m,C,F,I,d,P,j,se='<a href="/docs/transformers/pr_33956/ja/main_classes/tokenizer#transformers.PreTrainedTokenizerFast">PreTrainedTokenizerFast</a>は<a href="https://huggingface.co/docs/tokenizers" rel="nofollow">🤗 Tokenizers</a>ライブラリに依存しています。🤗 Tokenizersライブラリから取得したトークナイザーは、非常に簡単に🤗 Transformersにロードできます。',N,y,le="具体的な内容に入る前に、まずはいくつかの行でダミーのトークナイザーを作成することから始めましょう:",Q,u,W,h,ne=`私たちは今、定義したファイルにトレーニングされたトークナイザーを持っています。これをランタイムで引き続き使用するか、
将来の再利用のためにJSONファイルに保存することができます。`,E,$,G,T,ae=`🤗 Transformersライブラリでこのトークナイザーオブジェクトをどのように活用できるかを見てみましょう。<a href="/docs/transformers/pr_33956/ja/main_classes/tokenizer#transformers.PreTrainedTokenizerFast">PreTrainedTokenizerFast</a>クラスは、
<em>tokenizer</em>オブジェクトを引数として受け入れ、簡単にインスタンス化できるようにします。`,X,z,q,b,re='このオブジェクトは、🤗 Transformers トークナイザーが共有するすべてのメソッドと一緒に使用できます!詳細については、<a href="main_classes/tokenizer">トークナイザーページ</a>をご覧ください。',R,U,x,w,ie="JSONファイルからトークナイザーを読み込むには、まずトークナイザーを保存することから始めましょう:",L,Z,H,_,me="このファイルを保存したパスは、<code>PreTrainedTokenizerFast</code> の初期化メソッドに <code>tokenizer_file</code> パラメータを使用して渡すことができます:",S,J,A,V,pe='このオブジェクトは、🤗 Transformers トークナイザーが共有するすべてのメソッドと一緒に使用できるようになりました!詳細については、<a href="main_classes/tokenizer">トークナイザーページ</a>をご覧ください。',Y,B,K,v,O;return d=new ee({props:{title:"Use tokenizers from 🤗 Tokenizers",local:"use-tokenizers-from--tokenizers",headingTag:"h1"}}),u=new D({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBUb2tlbml6ZXIlMEFmcm9tJTIwdG9rZW5pemVycy5tb2RlbHMlMjBpbXBvcnQlMjBCUEUlMEFmcm9tJTIwdG9rZW5pemVycy50cmFpbmVycyUyMGltcG9ydCUyMEJwZVRyYWluZXIlMEFmcm9tJTIwdG9rZW5pemVycy5wcmVfdG9rZW5pemVycyUyMGltcG9ydCUyMFdoaXRlc3BhY2UlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBUb2tlbml6ZXIoQlBFKHVua190b2tlbiUzRCUyMiU1QlVOSyU1RCUyMikpJTBBdHJhaW5lciUyMCUzRCUyMEJwZVRyYWluZXIoc3BlY2lhbF90b2tlbnMlM0QlNUIlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlMkMlMjAlMjIlNUJQQUQlNUQlMjIlMkMlMjAlMjIlNUJNQVNLJTVEJTIyJTVEKSUwQSUwQXRva2VuaXplci5wcmVfdG9rZW5pemVyJTIwJTNEJTIwV2hpdGVzcGFjZSgpJTBBZmlsZXMlMjAlM0QlMjAlNUIuLi4lNUQlMEF0b2tlbml6ZXIudHJhaW4oZmlsZXMlMkMlMjB0cmFpbmVyKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Tokenizer
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> tokenizers.models <span class="hljs-keyword">import</span> BPE
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> tokenizers.trainers <span class="hljs-keyword">import</span> BpeTrainer
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> tokenizers.pre_tokenizers <span class="hljs-keyword">import</span> Whitespace
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer = Tokenizer(BPE(unk_token=<span class="hljs-string">&quot;[UNK]&quot;</span>))
<span class="hljs-meta">&gt;&gt;&gt; </span>trainer = BpeTrainer(special_tokens=[<span class="hljs-string">&quot;[UNK]&quot;</span>, <span class="hljs-string">&quot;[CLS]&quot;</span>, <span class="hljs-string">&quot;[SEP]&quot;</span>, <span class="hljs-string">&quot;[PAD]&quot;</span>, <span class="hljs-string">&quot;[MASK]&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer.pre_tokenizer = Whitespace()
<span class="hljs-meta">&gt;&gt;&gt; </span>files = [...]
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer.train(files, trainer)`,wrap:!1}}),$=new ee({props:{title:"Loading directly from the tokenizer object",local:"loading-directly-from-the-tokenizer-object",headingTag:"h2"}}),z=new D({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBZmFzdF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast
<span class="hljs-meta">&gt;&gt;&gt; </span>fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)`,wrap:!1}}),U=new ee({props:{title:"Loading from a JSON file",local:"loading-from-a-json-file",headingTag:"h2"}}),Z=new D({props:{code:"dG9rZW5pemVyLnNhdmUoJTIydG9rZW5pemVyLmpzb24lMjIp",highlighted:'<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer.save(<span class="hljs-string">&quot;tokenizer.json&quot;</span>)',wrap:!1}}),J=new D({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBZmFzdF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfZmlsZSUzRCUyMnRva2VuaXplci5qc29uJTIyKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast
<span class="hljs-meta">&gt;&gt;&gt; </span>fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=<span class="hljs-string">&quot;tokenizer.json&quot;</span>)`,wrap:!1}}),B=new ue({props:{source:"https://github.com/huggingface/transformers/blob/main/docs/source/ja/fast_tokenizers.md"}}),{c(){m=r("meta"),C=n(),F=r("p"),I=n(),p(d.$$.fragment),P=n(),j=r("p"),j.innerHTML=se,N=n(),y=r("p"),y.textContent=le,Q=n(),p(u.$$.fragment),W=n(),h=r("p"),h.textContent=ne,E=n(),p($.$$.fragment),G=n(),T=r("p"),T.innerHTML=ae,X=n(),p(z.$$.fragment),q=n(),b=r("p"),b.innerHTML=re,R=n(),p(U.$$.fragment),x=n(),w=r("p"),w.textContent=ie,L=n(),p(Z.$$.fragment),H=n(),_=r("p"),_.innerHTML=me,S=n(),p(J.$$.fragment),A=n(),V=r("p"),V.innerHTML=pe,Y=n(),p(B.$$.fragment),K=n(),v=r("p"),this.h()},l(e){const t=je("svelte-u9bgzb",document.head);m=i(t,"META",{name:!0,content:!0}),t.forEach(s),C=a(e),F=i(e,"P",{}),oe(F).forEach(s),I=a(e),o(d.$$.fragment,e),P=a(e),j=i(e,"P",{"data-svelte-h":!0}),f(j)!=="svelte-1cxd6wu"&&(j.innerHTML=se),N=a(e),y=i(e,"P",{"data-svelte-h":!0}),f(y)!=="svelte-ut5c9x"&&(y.textContent=le),Q=a(e),o(u.$$.fragment,e),W=a(e),h=i(e,"P",{"data-svelte-h":!0}),f(h)!=="svelte-invdsu"&&(h.textContent=ne),E=a(e),o($.$$.fragment,e),G=a(e),T=i(e,"P",{"data-svelte-h":!0}),f(T)!=="svelte-1uq8w3b"&&(T.innerHTML=ae),X=a(e),o(z.$$.fragment,e),q=a(e),b=i(e,"P",{"data-svelte-h":!0}),f(b)!=="svelte-3q8kj6"&&(b.innerHTML=re),R=a(e),o(U.$$.fragment,e),x=a(e),w=i(e,"P",{"data-svelte-h":!0}),f(w)!=="svelte-132sdzq"&&(w.textContent=ie),L=a(e),o(Z.$$.fragment,e),H=a(e),_=i(e,"P",{"data-svelte-h":!0}),f(_)!=="svelte-1djco1p"&&(_.innerHTML=me),S=a(e),o(J.$$.fragment,e),A=a(e),V=i(e,"P",{"data-svelte-h":!0}),f(V)!=="svelte-1ipwbgd"&&(V.innerHTML=pe),Y=a(e),o(B.$$.fragment,e),K=a(e),v=i(e,"P",{}),oe(v).forEach(s),this.h()},h(){fe(m,"name","hf:doc:metadata"),fe(m,"content",$e)},m(e,t){ye(document.head,m),l(e,C,t),l(e,F,t),l(e,I,t),c(d,e,t),l(e,P,t),l(e,j,t),l(e,N,t),l(e,y,t),l(e,Q,t),c(u,e,t),l(e,W,t),l(
e,h,t),l(e,E,t),c($,e,t),l(e,G,t),l(e,T,t),l(e,X,t),c(z,e,t),l(e,q,t),l(e,b,t),l(e,R,t),c(U,e,t),l(e,x,t),l(e,w,t),l(e,L,t),c(Z,e,t),l(e,H,t),l(e,_,t),l(e,S,t),c(J,e,t),l(e,A,t),l(e,V,t),l(e,Y,t),c(B,e,t),l(e,K,t),l(e,v,t),O=!0},p:Me,i(e){O||(M(d.$$.fragment,e),M(u.$$.fragment,e),M($.$$.fragment,e),M(z.$$.fragment,e),M(U.$$.fragment,e),M(Z.$$.fragment,e),M(J.$$.fragment,e),M(B.$$.fragment,e),O=!0)},o(e){g(d.$$.fragment,e),g(u.$$.fragment,e),g($.$$.fragment,e),g(z.$$.fragment,e),g(U.$$.fragment,e),g(Z.$$.fragment,e),g(J.$$.fragment,e),g(B.$$.fragment,e),O=!1},d(e){e&&(s(C),s(F),s(I),s(P),s(j),s(N),s(y),s(Q),s(W),s(h),s(E),s(G),s(T),s(X),s(q),s(b),s(R),s(x),s(w),s(L),s(H),s(_),s(S),s(A),s(V),s(Y),s(K),s(v)),s(m),k(d,e),k(u,e),k($,e),k(z,e),k(U,e),k(Z,e),k(J,e),k(B,e)}}}const $e='{"title":"Use tokenizers from 🤗 Tokenizers","local":"use-tokenizers-from--tokenizers","sections":[{"title":"Loading directly from the tokenizer object","local":"loading-directly-from-the-tokenizer-object","sections":[],"depth":2},{"title":"Loading from a JSON file","local":"loading-from-a-json-file","sections":[],"depth":2}],"depth":1}';function Te(te){return ge(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Ze extends ke{constructor(m){super(),de(this,m,Te,he,ce,{})}}export{Ze as component};

Xet Storage Details

Size:
9.45 kB
·
Xet hash:
ace272dc1d75f50b6ad4b2b39e60242d4689770fbb3dc698b2d34878d85f9ab9

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.