Buckets:
| import{s as ce,n as ke,o as Me}from"../chunks/scheduler.9bc65507.js";import{S as ge,i as de,g as r,s as n,r as o,A as je,h as i,f as s,c as a,j as pe,u as p,x as f,k as fe,y as ye,a as l,v as c,d as k,t as M,w as g}from"../chunks/index.707bf1b6.js";import{C as D}from"../chunks/CodeBlock.54a9f38d.js";import{H as ee,E as he}from"../chunks/EditOnGithub.922df6ba.js";function ue(te){let m,C,F,I,d,P,j,se='<a href="/docs/transformers/main/ja/main_classes/tokenizer#transformers.PreTrainedTokenizerFast">PreTrainedTokenizerFast</a>は<a href="https://huggingface.co/docs/tokenizers" rel="nofollow">🤗 Tokenizers</a>ライブラリに依存しています。🤗 Tokenizersライブラリから取得したトークナイザーは、非常に簡単に🤗 Transformersにロードできます。',N,y,le="具体的な内容に入る前に、まずはいくつかの行でダミーのトークナイザーを作成することから始めましょう:",Q,h,W,u,ne=`私たちは今、定義したファイルにトレーニングされたトークナイザーを持っています。これをランタイムで引き続き使用するか、 | |
| 将来の再利用のためにJSONファイルに保存することができます。`,E,$,G,T,ae=`🤗 Transformersライブラリでこのトークナイザーオブジェクトをどのように活用できるかを見てみましょう。<a href="/docs/transformers/main/ja/main_classes/tokenizer#transformers.PreTrainedTokenizerFast">PreTrainedTokenizerFast</a>クラスは、 | |
| <em>tokenizer</em>オブジェクトを引数として受け入れ、簡単にインスタンス化できるようにします。`,X,z,q,b,re='このオブジェクトは、🤗 Transformers トークナイザーが共有するすべてのメソッドと一緒に使用できます!詳細については、<a href="main_classes/tokenizer">トークナイザーページ</a>をご覧ください。',R,U,L,Z,ie="JSONファイルからトークナイザーを読み込むには、まずトークナイザーを保存することから始めましょう:",x,w,H,J,me="このファイルを保存したパスは、<code>PreTrainedTokenizerFast</code> の初期化メソッドに <code>tokenizer_file</code> パラメータを使用して渡すことができます:",S,V,A,_,oe='このオブジェクトは、🤗 Transformers トークナイザーが共有するすべてのメソッドと一緒に使用できるようになりました!詳細については、<a href="main_classes/tokenizer">トークナイザーページ</a>をご覧ください。',Y,B,K,v,O;return d=new ee({props:{title:"Use tokenizers from 🤗 Tokenizers",local:"use-tokenizers-from--tokenizers",headingTag:"h1"}}),h=new D({props:{code:"ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBUb2tlbml6ZXIlMEFmcm9tJTIwdG9rZW5pemVycy5tb2RlbHMlMjBpbXBvcnQlMjBCUEUlMEFmcm9tJTIwdG9rZW5pemVycy50cmFpbmVycyUyMGltcG9ydCUyMEJwZVRyYWluZXIlMEFmcm9tJTIwdG9rZW5pemVycy5wcmVfdG9rZW5pemVycyUyMGltcG9ydCUyMFdoaXRlc3BhY2UlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBUb2tlbml6ZXIoQlBFKHVua190b2tlbiUzRCUyMiU1QlVOSyU1RCUyMikpJTBBdHJhaW5lciUyMCUzRCUyMEJwZVRyYWluZXIoc3BlY2lhbF90b2tlbnMlM0QlNUIlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlMkMlMjAlMjIlNUJQQUQlNUQlMjIlMkMlMjAlMjIlNUJNQVNLJTVEJTIyJTVEKSUwQSUwQXRva2VuaXplci5wcmVfdG9rZW5pemVyJTIwJTNEJTIwV2hpdGVzcGFjZSgpJTBBZmlsZXMlMjAlM0QlMjAlNUIuLi4lNUQlMEF0b2tlbml6ZXIudHJhaW4oZmlsZXMlMkMlMjB0cmFpbmVyKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Tokenizer | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tokenizers.models <span class="hljs-keyword">import</span> BPE | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tokenizers.trainers <span class="hljs-keyword">import</span> BpeTrainer | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tokenizers.pre_tokenizers <span class="hljs-keyword">import</span> Whitespace | |
| <span class="hljs-meta">>>> </span>tokenizer = Tokenizer(BPE(unk_token=<span class="hljs-string">"[UNK]"</span>)) | |
| <span class="hljs-meta">>>> </span>trainer = BpeTrainer(special_tokens=[<span class="hljs-string">"[UNK]"</span>, <span class="hljs-string">"[CLS]"</span>, <span class="hljs-string">"[SEP]"</span>, <span class="hljs-string">"[PAD]"</span>, <span class="hljs-string">"[MASK]"</span>]) | |
| <span class="hljs-meta">>>> </span>tokenizer.pre_tokenizer = Whitespace() | |
| <span class="hljs-meta">>>> </span>files = [...] | |
| <span class="hljs-meta">>>> </span>tokenizer.train(files, trainer)`,wrap:!1}}),$=new ee({props:{title:"Loading directly from the tokenizer object",local:"loading-directly-from-the-tokenizer-object",headingTag:"h2"}}),z=new D({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBZmFzdF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast | |
| <span class="hljs-meta">>>> </span>fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)`,wrap:!1}}),U=new ee({props:{title:"Loading from a JSON file",local:"loading-from-a-json-file",headingTag:"h2"}}),w=new D({props:{code:"dG9rZW5pemVyLnNhdmUoJTIydG9rZW5pemVyLmpzb24lMjIp",highlighted:'<span class="hljs-meta">>>> </span>tokenizer.save(<span class="hljs-string">"tokenizer.json"</span>)',wrap:!1}}),V=new D({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBZmFzdF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfZmlsZSUzRCUyMnRva2VuaXplci5qc29uJTIyKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast | |
| <span class="hljs-meta">>>> </span>fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=<span class="hljs-string">"tokenizer.json"</span>)`,wrap:!1}}),B=new he({props:{source:"https://github.com/huggingface/transformers/blob/main/docs/source/ja/fast_tokenizers.md"}}),{c(){m=r("meta"),C=n(),F=r("p"),I=n(),o(d.$$.fragment),P=n(),j=r("p"),j.innerHTML=se,N=n(),y=r("p"),y.textContent=le,Q=n(),o(h.$$.fragment),W=n(),u=r("p"),u.textContent=ne,E=n(),o($.$$.fragment),G=n(),T=r("p"),T.innerHTML=ae,X=n(),o(z.$$.fragment),q=n(),b=r("p"),b.innerHTML=re,R=n(),o(U.$$.fragment),L=n(),Z=r("p"),Z.textContent=ie,x=n(),o(w.$$.fragment),H=n(),J=r("p"),J.innerHTML=me,S=n(),o(V.$$.fragment),A=n(),_=r("p"),_.innerHTML=oe,Y=n(),o(B.$$.fragment),K=n(),v=r("p"),this.h()},l(e){const t=je("svelte-u9bgzb",document.head);m=i(t,"META",{name:!0,content:!0}),t.forEach(s),C=a(e),F=i(e,"P",{}),pe(F).forEach(s),I=a(e),p(d.$$.fragment,e),P=a(e),j=i(e,"P",{"data-svelte-h":!0}),f(j)!=="svelte-19yjkfk"&&(j.innerHTML=se),N=a(e),y=i(e,"P",{"data-svelte-h":!0}),f(y)!=="svelte-ut5c9x"&&(y.textContent=le),Q=a(e),p(h.$$.fragment,e),W=a(e),u=i(e,"P",{"data-svelte-h":!0}),f(u)!=="svelte-invdsu"&&(u.textContent=ne),E=a(e),p($.$$.fragment,e),G=a(e),T=i(e,"P",{"data-svelte-h":!0}),f(T)!=="svelte-myekfd"&&(T.innerHTML=ae),X=a(e),p(z.$$.fragment,e),q=a(e),b=i(e,"P",{"data-svelte-h":!0}),f(b)!=="svelte-3q8kj6"&&(b.innerHTML=re),R=a(e),p(U.$$.fragment,e),L=a(e),Z=i(e,"P",{"data-svelte-h":!0}),f(Z)!=="svelte-132sdzq"&&(Z.textContent=ie),x=a(e),p(w.$$.fragment,e),H=a(e),J=i(e,"P",{"data-svelte-h":!0}),f(J)!=="svelte-1djco1p"&&(J.innerHTML=me),S=a(e),p(V.$$.fragment,e),A=a(e),_=i(e,"P",{"data-svelte-h":!0}),f(_)!=="svelte-1ipwbgd"&&(_.innerHTML=oe),Y=a(e),p(B.$$.fragment,e),K=a(e),v=i(e,"P",{}),pe(v).forEach(s),this.h()},h(){fe(m,"name","hf:doc:metadata"),fe(m,"content",$e)},m(e,t){ye(document.head,m),l(e,C,t),l(e,F,t),l(e,I,t),c(d,e,t),l(e,P,t),l(e,j,t),l(e,N,t),l(e,y,t),l(e,Q,t),c(h,e,t),l(e,W,t),l(e,u,t),l(e,E,t),c($,e,t),l(e,G,t),l(e,T,t),l(e,X,t),c(z,e,t),l(e,q,t),l(e,b,t),l(e,R,t),c(U,e,t),l(e,L,t),l(e,Z,t),l(e,x,t),c(w,e,t),l(e,H,t),l(e,J,t),l(e,S,t),c(V,e,t),l(e,A,t),l(e,_,t),l(e,Y,t),c(B,e,t),l(e,K,t),l(e,v,t),O=!0},p:ke,i(e){O||(k(d.$$.fragment,e),k(h.$$.fragment,e),k($.$$.fragment,e),k(z.$$.fragment,e),k(U.$$.fragment,e),k(w.$$.fragment,e),k(V.$$.fragment,e),k(B.$$.fragment,e),O=!0)},o(e){M(d.$$.fragment,e),M(h.$$.fragment,e),M($.$$.fragment,e),M(z.$$.fragment,e),M(U.$$.fragment,e),M(w.$$.fragment,e),M(V.$$.fragment,e),M(B.$$.fragment,e),O=!1},d(e){e&&(s(C),s(F),s(I),s(P),s(j),s(N),s(y),s(Q),s(W),s(u),s(E),s(G),s(T),s(X),s(q),s(b),s(R),s(L),s(Z),s(x),s(H),s(J),s(S),s(A),s(_),s(Y),s(K),s(v)),s(m),g(d,e),g(h,e),g($,e),g(z,e),g(U,e),g(w,e),g(V,e),g(B,e)}}}const $e='{"title":"Use tokenizers from 🤗 Tokenizers","local":"use-tokenizers-from--tokenizers","sections":[{"title":"Loading directly from the tokenizer object","local":"loading-directly-from-the-tokenizer-object","sections":[],"depth":2},{"title":"Loading from a JSON file","local":"loading-from-a-json-file","sections":[],"depth":2}],"depth":1}';function Te(te){return Me(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class we extends ge{constructor(m){super(),de(this,m,Te,ue,ce,{})}}export{we as component}; | |
Xet Storage Details
- Size:
- 9.44 kB
- Xet hash:
- a100be9dc175168d338c5e5a9e6053468c5882cc30644abd48749f1eaa4a4deb
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.