Buckets:
// Compiled Svelte page chunk for the Portuguese "fast tokenizers" documentation page.
// NOTE(review): this is build output; the authoritative source appears to be the
// Markdown file referenced by the EditOnGithub component below. The imported helper
// names are minified aliases whose exact semantics live in ../chunks/* — the role
// descriptions in the comments below follow the usual Svelte compiled-fragment
// layout and should be confirmed against the Svelte version in use.
import { s as ce, n as ue, o as fe } from "../chunks/scheduler.d586627e.js";
import {
  S as ge,
  i as ke,
  g as o,
  s as n,
  r as m,
  A as Me,
  h as r,
  f as s,
  c as l,
  j as pe,
  u as p,
  x as d,
  k as de,
  y as je,
  a,
  v as c,
  d as u,
  t as f,
  w as g,
} from "../chunks/index.8589a59c.js";
import { C as D } from "../chunks/CodeBlock.47c46d2c.js";
import { H as ee, E as ze } from "../chunks/EditOnGithub.073dfa26.js";

// JSON table of contents written into the <meta name="hf:doc:metadata"> tag by h().
const be =
  '{"title":"Usando os Tokenizers do 🤗 Tokenizers","local":"usando-os-tokenizers-do--tokenizers","sections":[{"title":"Carregando diretamente de um objeto tokenizer","local":"carregando-diretamente-de-um-objeto-tokenizer","sections":[],"depth":2},{"title":"Carregando de um arquivo JSON","local":"carregando-de-um-arquivo-json","sections":[],"depth":2}],"depth":1}';

/**
 * Fragment factory for the page: builds the headings, paragraphs and code
 * blocks and returns the object of lifecycle methods handed to `ke` by `Ze`.
 *
 * @param {*} te - Unused by this fragment (the page has no reactive state).
 * @returns {object} Fragment with methods c, l, h, m, p, i, o, d.
 */
function ye(te) {
  // Static paragraph contents. Strings containing markup are assigned through
  // innerHTML, plain ones through textContent (see c() and l()).
  const se =
    'O <code>PreTrainedTokenizerFast</code> depende da biblioteca <a href="https://huggingface.co/docs/tokenizers" rel="nofollow">🤗 Tokenizers</a>. O Tokenizer obtido da biblioteca 🤗 Tokenizers pode ser carregado facilmente pelo 🤗 Transformers.';
  const ae =
    "Antes de entrar nos detalhes, vamos começar criando um tokenizer fictício em algumas linhas:";
  const ne =
    "Agora temos um tokenizer treinado nos arquivos que foram definidos. Nós podemos continuar usando nessa execução ou salvar em um arquivo JSON para re-utilizar no futuro.";
  const le =
    'Vamos ver como aproveitar esse objeto tokenizer na biblioteca 🤗 Transformers. A classe <code>PreTrainedTokenizerFast</code> permite uma instanciação fácil, aceitando o objeto <em>tokenizer</em> instanciado como um argumento:';
  const oe =
    'Esse objeto pode ser utilizado com todos os métodos compartilhados pelos tokenizers dos 🤗 Transformers! Vá para <a href="main_classes/tokenizer">a página do tokenizer</a> para mais informações.';
  const re =
    "Para carregar um tokenizer de um arquivo JSON vamos primeiro começar salvando nosso tokenizer:";
  const ie =
    'A pasta para qual salvamos esse arquivo pode ser passada para o método de inicialização do <code>PreTrainedTokenizerFast</code> usando o <code>tokenizer_file</code> parâmetro:';
  // Same text as `oe`; it must stay on one line because the hydration check in
  // l() compares both paragraphs against the same hash ("svelte-bzpmh").
  const me =
    'Esse objeto pode ser utilizado com todos os métodos compartilhados pelos tokenizers dos 🤗 Transformers! Vá para <a href="main_classes/tokenizer">a página do tokenizer</a> para mais informações.';

  // DOM nodes; all are assigned in c() or l() before any other method reads them.
  let i; // <meta> element placed in document.head
  let q, C, F, I, M, N, j, P, E, y, Q, W, h, G, X, T, R, x, v, H, A, w, S, L, J, Y, K, B;
  // True while the child components' intro transitions have been started.
  let O;

  // Child components, created eagerly in document order.
  const k = new ee({
    props: {
      title: "Usando os Tokenizers do 🤗 Tokenizers",
      local: "usando-os-tokenizers-do--tokenizers",
      headingTag: "h1",
    },
  });

  // `code` is the base64 of the URL-encoded snippet; `highlighted` is the same
  // snippet as pre-rendered HTML. The blank lines inside `highlighted` mirror
  // the doubled %0A in the encoded `code` payload. The template-literal lines
  // are intentionally flush-left: any indentation would become part of the
  // rendered code sample.
  const z = new D({
    props: {
      code: "ZnJvbSUyMHRva2VuaXplcnMlMjBpbXBvcnQlMjBUb2tlbml6ZXIlMEFmcm9tJTIwdG9rZW5pemVycy5tb2RlbHMlMjBpbXBvcnQlMjBCUEUlMEFmcm9tJTIwdG9rZW5pemVycy50cmFpbmVycyUyMGltcG9ydCUyMEJwZVRyYWluZXIlMEFmcm9tJTIwdG9rZW5pemVycy5wcmVfdG9rZW5pemVycyUyMGltcG9ydCUyMFdoaXRlc3BhY2UlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBUb2tlbml6ZXIoQlBFKHVua190b2tlbiUzRCUyMiU1QlVOSyU1RCUyMikpJTBBdHJhaW5lciUyMCUzRCUyMEJwZVRyYWluZXIoc3BlY2lhbF90b2tlbnMlM0QlNUIlMjIlNUJVTkslNUQlMjIlMkMlMjAlMjIlNUJDTFMlNUQlMjIlMkMlMjAlMjIlNUJTRVAlNUQlMjIlMkMlMjAlMjIlNUJQQUQlNUQlMjIlMkMlMjAlMjIlNUJNQVNLJTVEJTIyJTVEKSUwQSUwQXRva2VuaXplci5wcmVfdG9rZW5pemVyJTIwJTNEJTIwV2hpdGVzcGFjZSgpJTBBZmlsZXMlMjAlM0QlMjAlNUIuLi4lNUQlMEF0b2tlbml6ZXIudHJhaW4oZmlsZXMlMkMlMjB0cmFpbmVyKQ==",
      highlighted: `<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tokenizers <span class="hljs-keyword">import</span> Tokenizer
<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tokenizers.models <span class="hljs-keyword">import</span> BPE
<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tokenizers.trainers <span class="hljs-keyword">import</span> BpeTrainer
<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tokenizers.pre_tokenizers <span class="hljs-keyword">import</span> Whitespace

<span class="hljs-meta">>>> </span>tokenizer = Tokenizer(BPE(unk_token=<span class="hljs-string">"[UNK]"</span>))
<span class="hljs-meta">>>> </span>trainer = BpeTrainer(special_tokens=[<span class="hljs-string">"[UNK]"</span>, <span class="hljs-string">"[CLS]"</span>, <span class="hljs-string">"[SEP]"</span>, <span class="hljs-string">"[PAD]"</span>, <span class="hljs-string">"[MASK]"</span>])

<span class="hljs-meta">>>> </span>tokenizer.pre_tokenizer = Whitespace()
<span class="hljs-meta">>>> </span>files = [...]
<span class="hljs-meta">>>> </span>tokenizer.train(files, trainer)`,
      wrap: false,
    },
  });

  const b = new ee({
    props: {
      title: "Carregando diretamente de um objeto tokenizer",
      local: "carregando-diretamente-de-um-objeto-tokenizer",
      headingTag: "h2",
    },
  });

  const $ = new D({
    props: {
      code: "ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBZmFzdF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfb2JqZWN0JTNEdG9rZW5pemVyKQ==",
      highlighted: `<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast

<span class="hljs-meta">>>> </span>fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)`,
      wrap: false,
    },
  });

  const U = new ee({
    props: {
      title: "Carregando de um arquivo JSON",
      local: "carregando-de-um-arquivo-json",
      headingTag: "h2",
    },
  });

  const Z = new D({
    props: {
      code: "dG9rZW5pemVyLnNhdmUoJTIydG9rZW5pemVyLmpzb24lMjIp",
      highlighted:
        '<span class="hljs-meta">>>> </span>tokenizer.save(<span class="hljs-string">"tokenizer.json"</span>)',
      wrap: false,
    },
  });

  const V = new D({
    props: {
      code: "ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFByZVRyYWluZWRUb2tlbml6ZXJGYXN0JTBBJTBBZmFzdF90b2tlbml6ZXIlMjAlM0QlMjBQcmVUcmFpbmVkVG9rZW5pemVyRmFzdCh0b2tlbml6ZXJfZmlsZSUzRCUyMnRva2VuaXplci5qc29uJTIyKQ==",
      highlighted: `<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> PreTrainedTokenizerFast

<span class="hljs-meta">>>> </span>fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=<span class="hljs-string">"tokenizer.json"</span>)`,
      wrap: false,
    },
  });

  const _ = new ze({
    props: {
      source:
        "https://github.com/huggingface/transformers/blob/main/docs/source/pt/fast_tokenizers.md",
    },
  });

  // Child components in document order; i(), o() and d() walk this list.
  const components = [k, z, b, $, U, Z, V, _];

  // The methods below MUST stay in object-method shorthand: shorthand does not
  // bind the method's own name inside its body, so calls such as c(...), l(...),
  // m(...) and o(...) keep resolving to the imported helpers of the same name.
  return {
    // Builds every node from scratch (presumably the client-side "create" step).
    c() {
      i = o("meta");
      q = n();
      C = o("p");
      F = n();
      m(k.$$.fragment);
      I = n();
      M = o("p");
      M.innerHTML = se;
      N = n();
      j = o("p");
      j.textContent = ae;
      P = n();
      m(z.$$.fragment);
      E = n();
      y = o("p");
      y.textContent = ne;
      Q = n();
      m(b.$$.fragment);
      W = n();
      h = o("p");
      h.innerHTML = le;
      G = n();
      m($.$$.fragment);
      X = n();
      T = o("p");
      T.innerHTML = oe;
      R = n();
      m(U.$$.fragment);
      x = n();
      v = o("p");
      v.textContent = re;
      H = n();
      m(Z.$$.fragment);
      A = n();
      w = o("p");
      w.innerHTML = ie;
      S = n();
      m(V.$$.fragment);
      L = n();
      J = o("p");
      J.innerHTML = me;
      Y = n();
      m(_.$$.fragment);
      K = n();
      B = o("p");
      this.h();
    },

    // Same node sequence as c(), but obtained from existing markup under `e`
    // (presumably the hydration "claim" step). A paragraph's content is only
    // rewritten when d(node) does not return the expected hash.
    l(e) {
      const t = Me("svelte-u9bgzb", document.head);
      i = r(t, "META", { name: true, content: true });
      t.forEach(s);
      q = l(e);
      C = r(e, "P", {});
      pe(C).forEach(s);
      F = l(e);
      p(k.$$.fragment, e);
      I = l(e);
      M = r(e, "P", { "data-svelte-h": true });
      if (d(M) !== "svelte-borl69") M.innerHTML = se;
      N = l(e);
      j = r(e, "P", { "data-svelte-h": true });
      if (d(j) !== "svelte-1q50rcm") j.textContent = ae;
      P = l(e);
      p(z.$$.fragment, e);
      E = l(e);
      y = r(e, "P", { "data-svelte-h": true });
      if (d(y) !== "svelte-17y7lov") y.textContent = ne;
      Q = l(e);
      p(b.$$.fragment, e);
      W = l(e);
      h = r(e, "P", { "data-svelte-h": true });
      if (d(h) !== "svelte-11rm4tu") h.innerHTML = le;
      G = l(e);
      p($.$$.fragment, e);
      X = l(e);
      T = r(e, "P", { "data-svelte-h": true });
      if (d(T) !== "svelte-bzpmh") T.innerHTML = oe;
      R = l(e);
      p(U.$$.fragment, e);
      x = l(e);
      v = r(e, "P", { "data-svelte-h": true });
      if (d(v) !== "svelte-vycu7g") v.textContent = re;
      H = l(e);
      p(Z.$$.fragment, e);
      A = l(e);
      w = r(e, "P", { "data-svelte-h": true });
      if (d(w) !== "svelte-15so4os") w.innerHTML = ie;
      S = l(e);
      p(V.$$.fragment, e);
      L = l(e);
      J = r(e, "P", { "data-svelte-h": true });
      if (d(J) !== "svelte-bzpmh") J.innerHTML = me;
      Y = l(e);
      p(_.$$.fragment, e);
      K = l(e);
      B = r(e, "P", {});
      pe(B).forEach(s);
      this.h();
    },

    // Sets the attributes of the metadata <meta> tag; called at the end of c() and l().
    h() {
      de(i, "name", "hf:doc:metadata");
      de(i, "content", be);
    },

    // Inserts everything into target `e` before anchor `t`, in document order.
    m(e, t) {
      je(document.head, i);
      a(e, q, t);
      a(e, C, t);
      a(e, F, t);
      c(k, e, t);
      a(e, I, t);
      a(e, M, t);
      a(e, N, t);
      a(e, j, t);
      a(e, P, t);
      c(z, e, t);
      a(e, E, t);
      a(e, y, t);
      a(e, Q, t);
      c(b, e, t);
      a(e, W, t);
      a(e, h, t);
      a(e, G, t);
      c($, e, t);
      a(e, X, t);
      a(e, T, t);
      a(e, R, t);
      c(U, e, t);
      a(e, x, t);
      a(e, v, t);
      a(e, H, t);
      c(Z, e, t);
      a(e, A, t);
      a(e, w, t);
      a(e, S, t);
      c(V, e, t);
      a(e, L, t);
      a(e, J, t);
      a(e, Y, t);
      c(_, e, t);
      a(e, K, t);
      a(e, B, t);
      O = true;
    },

    // Update hook: delegated to the imported `ue` since nothing here is reactive.
    p: ue,

    // Starts the child components' intros once; guarded by O.
    i(e) {
      if (O) return;
      for (const component of components) {
        u(component.$$.fragment, e);
      }
      O = true;
    },

    // Starts the child components' outros and re-arms i().
    o(e) {
      for (const component of components) {
        f(component.$$.fragment, e);
      }
      O = false;
    },

    // Teardown. Body nodes are removed only when `e` is truthy; the <meta> tag
    // and the child components are always cleaned up.
    d(e) {
      if (e) {
        const bodyNodes = [
          q, C, F, I, M, N, j, P, E, y, Q, W, h, G,
          X, T, R, x, v, H, A, w, S, L, J, Y, K, B,
        ];
        for (const node of bodyNodes) {
          s(node);
        }
      }
      s(i);
      for (const component of components) {
        g(component, e);
      }
    },
  };
}

/**
 * Instance function passed to `ke`. Registers a callback with `fe` and exposes
 * no instance context (empty array).
 *
 * @param {*} te - Unused.
 * @returns {Array} Always an empty array.
 */
function he(te) {
  fe(() => {
    // The "fw" query parameter is read and the value discarded, exactly as in
    // the generated output. NOTE(review): looks like a leftover of a framework
    // switcher — confirm before removing.
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component; wires the instance function and fragment factory into `ke`. */
class Ze extends ge {
  constructor(i) {
    super();
    ke(this, i, he, ye, ce, {});
  }
}

export { Ze as component };
Xet Storage Details
- Size: 8.82 kB
- Xet hash: 1a33297298d3e11eae33b276cdf0e6508ce8afbf72f2f4e3182f923b11c01876
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.