Buckets:

rtrm's picture
download
raw
9.13 kB
// NOTE: this module is compiler output (Svelte component chunk) for the Korean
// "fast tokenizers" documentation page. The authoritative source is the markdown
// file referenced by the edit link below (docs/source/ko/fast_tokenizers.md);
// prefer changing that file and rebuilding over hand-editing this chunk.
//
// All imported helper names are minified. Comments that say what a helper
// "presumably" is are assumptions based on call shape only -- TODO confirm
// against ../chunks/index.*.js and ../chunks/scheduler.*.js.
import { s as ce, n as Me, o as de } from "../chunks/scheduler.9bc65507.js";
import { S as ge, i as ke, g as r, s as n, r as m, A as ye, h as p, f as s, c as a, j as oe, u as o, x as f, k as fe, y as ue, a as l, v as c, d as M, t as d, w as g } from "../chunks/index.707bf1b6.js";
import { C as D } from "../chunks/CodeBlock.54a9f38d.js";
import { H as ee, E as je } from "../chunks/EditOnGithub.922df6ba.js";

/**
 * Builds the fragment object for the page: a <meta> tag placed in
 * document.head, three headings, four code blocks, the paragraphs between
 * them and an "edit on GitHub" footer component.
 *
 * The string literals below were previously mojibake (UTF-8 bytes decoded
 * with a Greek single-byte code page); they are restored to the intended
 * Korean text and emoji so the page renders readable content.
 *
 * @param {*} te - unused by this fragment (kept for the expected call signature).
 * @returns {object} fragment with the methods c, l, h, m, p, i, o and d.
 */
function he(te) {
  // DOM nodes (assigned in c() or l()).
  let i, F, v, N, I, y, P, u, Q, W, h, E, G, T, X, x, z, R, S, Z, q, H, J, L, A, _, Y, O, C;
  // Child component instances.
  let k, j, $, b, U, w, V, B;
  // Guard so i() only runs the child intro calls once until o() resets it.
  let K;

  // Paragraph contents. se/ae/re/ie/me are assigned through innerHTML (they
  // contain markup); le/ne/pe are assigned through textContent.
  const se = `<code>PreTrainedTokenizerFast</code>는 <a href="https://huggingface.co/docs/tokenizers" rel="nofollow">🤗 Tokenizers</a> 라이브러리에 기반합니다. 🤗 Tokenizers 라이브러리의 토크나이저는
🤗 Transformers로 매우 간단하게 불러올 수 있습니다.`;
  const le = "구체적인 내용에 들어가기 전에, 몇 줄의 코드로 더미 토크나이저를 만들어 보겠습니다:";
  const ne = "우리가 정의한 파일을 통해 이제 학습된 토크나이저를 갖게 되었습니다. 이 런타임에서 계속 사용하거나 JSON 파일로 저장하여 나중에 사용할 수 있습니다.";
  const ae = `🤗 Transformers 라이브러리에서 이 토크나이저 객체를 활용하는 방법을 살펴보겠습니다.
<code>PreTrainedTokenizerFast</code> 클래스는 인스턴스화된 <em>토크나이저</em> 객체를 인수로 받아 쉽게 인스턴스화할 수 있습니다:`;
  const re = '이제 <code>fast_tokenizer</code> 객체는 🤗 Transformers 토크나이저에서 공유하는 모든 메소드와 함께 사용할 수 있습니다! 자세한 내용은 <a href="main_classes/tokenizer">토크나이저 페이지</a>를 참조하세요.';
  const pe = "JSON 파일에서 토크나이저를 불러오기 위해, 먼저 토크나이저를 저장해 보겠습니다:";
  const ie = "JSON 파일을 저장한 경로는 <code>tokenizer_file</code> 매개변수를 사용하여 <code>PreTrainedTokenizerFast</code> 초기화 메소드에 전달할 수 있습니다:";
  const me = '이제 <code>fast_tokenizer</code> 객체는 🤗 Transformers 토크나이저에서 공유하는 모든 메소드와 함께 사용할 수 있습니다! 자세한 내용은 <a href="main_classes/tokenizer">토크나이저 페이지</a>를 참조하세요.';

  // Child components, created in document order. The `code` props look like
  // base64-encoded, URL-escaped sample source -- assumption, not verified here.
  k = new ee({
    props: {
      title: "🤗 Tokenizers 라이브러리의 토크나이저 사용하기",
      local: "use-tokenizers-from-tokenizers",
      headingTag: "h1",
    },
  });
  j = new D({
    props: {
      code: "ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBUb2tlbml6ZXIlMEFmcm9tJTIwdG9rZW5pemVycy5tb2RlbHMlMjBpbXBvcnQlMjBCUEUlMEFmcm9tJTIwdG9rZW5pemVycy50cmFpbmVycyUyMGltcG9ydCUyMEJwZVRyYWluZXIlMEFmcm9tJTIwdG9rZW5pemVycy5wcmVfdG9rZW5pemVycyUyMGltcG9ydCUyMFdoaXRlc3BhY2UlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBUb2tlbml6ZXIoQlBFKHVua190b2tlbiUzRCUyMiU1QlVOSyU1RCUyMikpJTBBdHJhaW5lciUyMCUzRCUyMEJwZVRyYWluZXIoc3BlY2lhbF90b2tlbnMlM0QlNUIlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlMkMlMjAlMjIlNUJQQUQlNUQlMjIlMkMlMjAlMjIlNUJNQVNLJTVEJTIyJTVEKSUwQSUwQXRva2VuaXplci5wcmVfdG9rZW5pemVyJTIwJTNEJTIwV2hpdGVzcGFjZSgpJTBBZmlsZXMlMjAlM0QlMjAlNUIuLi4lNUQlMEF0b2tlbml6ZXIudHJhaW4oZmlsZXMlMkMlMjB0cmFpbmVyKQ==",
      highlighted: `<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Tokenizer
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> tokenizers.models <span class="hljs-keyword">import</span> BPE
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> tokenizers.trainers <span class="hljs-keyword">import</span> BpeTrainer
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> tokenizers.pre_tokenizers <span class="hljs-keyword">import</span> Whitespace
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer = Tokenizer(BPE(unk_token=<span class="hljs-string">&quot;[UNK]&quot;</span>))
<span class="hljs-meta">&gt;&gt;&gt; </span>trainer = BpeTrainer(special_tokens=[<span class="hljs-string">&quot;[UNK]&quot;</span>, <span class="hljs-string">&quot;[CLS]&quot;</span>, <span class="hljs-string">&quot;[SEP]&quot;</span>, <span class="hljs-string">&quot;[PAD]&quot;</span>, <span class="hljs-string">&quot;[MASK]&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer.pre_tokenizer = Whitespace()
<span class="hljs-meta">&gt;&gt;&gt; </span>files = [...]
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer.train(files, trainer)`,
      wrap: !1,
    },
  });
  $ = new ee({
    props: {
      title: "토크나이저 객체로부터 직접 불러오기",
      local: "loading-directly-from-the-tokenizer-object",
      headingTag: "h2",
    },
  });
  b = new D({
    props: {
      code: "ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBZmFzdF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyKQ==",
      highlighted: `<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast
<span class="hljs-meta">&gt;&gt;&gt; </span>fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)`,
      wrap: !1,
    },
  });
  U = new ee({
    props: {
      title: "JSON 파일에서 불러오기",
      local: "loading-from-a-JSON-file",
      headingTag: "h2",
    },
  });
  w = new D({
    props: {
      code: "dG9rZW5pemVyLnNhdmUoJTIydG9rZW5pemVyLmpzb24lMjIp",
      highlighted: '<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer.save(<span class="hljs-string">&quot;tokenizer.json&quot;</span>)',
      wrap: !1,
    },
  });
  V = new D({
    props: {
      code: "ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBZmFzdF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfZmlsZSUzRCUyMnRva2VuaXplci5qc29uJTIyKQ==",
      highlighted: `<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast
<span class="hljs-meta">&gt;&gt;&gt; </span>fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=<span class="hljs-string">&quot;tokenizer.json&quot;</span>)`,
      wrap: !1,
    },
  });
  B = new je({
    props: {
      source: "https://github.com/huggingface/transformers/blob/main/docs/source/ko/fast_tokenizers.md",
    },
  });

  return {
    // Build every node from scratch, in document order, then apply attributes.
    c() {
      i = r("meta");
      F = n();
      v = r("p");
      N = n();
      m(k.$$.fragment);
      I = n();
      y = r("p");
      y.innerHTML = se;
      P = n();
      u = r("p");
      u.textContent = le;
      Q = n();
      m(j.$$.fragment);
      W = n();
      h = r("p");
      h.textContent = ne;
      E = n();
      m($.$$.fragment);
      G = n();
      T = r("p");
      T.innerHTML = ae;
      X = n();
      m(b.$$.fragment);
      x = n();
      z = r("p");
      z.innerHTML = re;
      R = n();
      m(U.$$.fragment);
      S = n();
      Z = r("p");
      Z.textContent = pe;
      q = n();
      m(w.$$.fragment);
      H = n();
      J = r("p");
      J.innerHTML = ie;
      L = n();
      m(V.$$.fragment);
      A = n();
      _ = r("p");
      _.innerHTML = me;
      Y = n();
      m(B.$$.fragment);
      O = n();
      C = r("p");
      this.h();
    },

    // Same node sequence as c(), but obtained from the node list `e`
    // (presumably claiming pre-rendered DOM during hydration -- TODO confirm).
    // A paragraph's content is only rewritten when f(node) differs from the
    // expected "svelte-..." marker.
    l(e) {
      const t = ye("svelte-u9bgzb", document.head);
      i = p(t, "META", { name: !0, content: !0 });
      t.forEach(s);
      F = a(e);
      v = p(e, "P", {});
      oe(v).forEach(s);
      N = a(e);
      o(k.$$.fragment, e);
      I = a(e);
      y = p(e, "P", { "data-svelte-h": !0 });
      if (f(y) !== "svelte-1ilwdv2") y.innerHTML = se;
      P = a(e);
      u = p(e, "P", { "data-svelte-h": !0 });
      if (f(u) !== "svelte-nobjxu") u.textContent = le;
      Q = a(e);
      o(j.$$.fragment, e);
      W = a(e);
      h = p(e, "P", { "data-svelte-h": !0 });
      if (f(h) !== "svelte-vmdasx") h.textContent = ne;
      E = a(e);
      o($.$$.fragment, e);
      G = a(e);
      T = p(e, "P", { "data-svelte-h": !0 });
      if (f(T) !== "svelte-10u78cn") T.innerHTML = ae;
      X = a(e);
      o(b.$$.fragment, e);
      x = a(e);
      z = p(e, "P", { "data-svelte-h": !0 });
      if (f(z) !== "svelte-tdf1x7") z.innerHTML = re;
      R = a(e);
      o(U.$$.fragment, e);
      S = a(e);
      Z = p(e, "P", { "data-svelte-h": !0 });
      if (f(Z) !== "svelte-16yucd6") Z.textContent = pe;
      q = a(e);
      o(w.$$.fragment, e);
      H = a(e);
      J = p(e, "P", { "data-svelte-h": !0 });
      if (f(J) !== "svelte-126md19") J.innerHTML = ie;
      L = a(e);
      o(V.$$.fragment, e);
      A = a(e);
      _ = p(e, "P", { "data-svelte-h": !0 });
      if (f(_) !== "svelte-tdf1x7") _.innerHTML = me;
      Y = a(e);
      o(B.$$.fragment, e);
      O = a(e);
      C = p(e, "P", {});
      oe(C).forEach(s);
      this.h();
    },

    // Attributes of the <meta> node; `content` is the JSON string $e.
    h() {
      fe(i, "name", "hf:doc:metadata");
      fe(i, "content", $e);
    },

    // Attach the <meta> node to document.head and everything else to target
    // `e` before anchor `t`, in document order.
    m(e, t) {
      ue(document.head, i);
      l(e, F, t);
      l(e, v, t);
      l(e, N, t);
      c(k, e, t);
      l(e, I, t);
      l(e, y, t);
      l(e, P, t);
      l(e, u, t);
      l(e, Q, t);
      c(j, e, t);
      l(e, W, t);
      l(e, h, t);
      l(e, E, t);
      c($, e, t);
      l(e, G, t);
      l(e, T, t);
      l(e, X, t);
      c(b, e, t);
      l(e, x, t);
      l(e, z, t);
      l(e, R, t);
      c(U, e, t);
      l(e, S, t);
      l(e, Z, t);
      l(e, q, t);
      c(w, e, t);
      l(e, H, t);
      l(e, J, t);
      l(e, L, t);
      c(V, e, t);
      l(e, A, t);
      l(e, _, t);
      l(e, Y, t);
      c(B, e, t);
      l(e, O, t);
      l(e, C, t);
      K = !0;
    },

    // The fragment has no update logic of its own; Me is used as-is.
    p: Me,

    // Forward to each child fragment via M(), skipped when K is already set.
    i(e) {
      if (K) return;
      for (const child of [k, j, $, b, U, w, V, B]) {
        M(child.$$.fragment, e);
      }
      K = !0;
    },

    // Forward to each child fragment via d() and clear the K guard.
    o(e) {
      for (const child of [k, j, $, b, U, w, V, B]) {
        d(child.$$.fragment, e);
      }
      K = !1;
    },

    // Teardown: body nodes are passed to s() only when `e` is truthy, the
    // <meta> node always is, then every child component goes through g().
    d(e) {
      if (e) {
        const bodyNodes = [F, v, N, I, y, P, u, Q, W, h, E, G, T, X, x, z, R, S, Z, q, H, J, L, A, _, Y, O, C];
        for (const node of bodyNodes) {
          s(node);
        }
      }
      s(i);
      for (const child of [k, j, $, b, U, w, V, B]) {
        g(child, e);
      }
    },
  };
}

// Page metadata (title plus section outline) as a JSON string; written to the
// `content` attribute of the "hf:doc:metadata" <meta> tag in h().
const $e = '{"title":"🤗 Tokenizers 라이브러리의 토크나이저 사용하기","local":"use-tokenizers-from-tokenizers","sections":[{"title":"토크나이저 객체로부터 직접 불러오기","local":"loading-directly-from-the-tokenizer-object","sections":[],"depth":2},{"title":"JSON 파일에서 불러오기","local":"loading-from-a-JSON-file","sections":[],"depth":2}],"depth":1}';

/**
 * Instance function handed to ke() by the component class below.
 *
 * Registers a callback with de() that reads the "fw" query parameter from the
 * current URL and discards the value; returns an empty array.
 * NOTE(review): the read has no visible effect in this chunk -- presumably a
 * leftover of the doc builder's framework toggle; confirm before removing.
 *
 * @param {*} te - unused.
 * @returns {Array} always an empty array.
 */
function Te(te) {
  de(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/**
 * Page component: extends ge and wires Te and he together through ke().
 */
class we extends ge {
  constructor(i) {
    super();
    ke(this, i, Te, he, ce, {});
  }
}

export { we as component };

Xet Storage Details

Size:
9.13 kB
·
Xet hash:
eb79c2dd903c6293346ce479ba75661f358b39acbf2122854ec80dfff5302cac

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.