Buckets:
// Compiled page module for the "Introduction" page (chapters/en/chapter6/1.mdx).
// The runtime helpers below are imported under minified aliases; notes about what
// each helper does are inferred from how it is called here — confirm against
// ../chunks/index.* and ../chunks/scheduler.* before relying on them.
import { s as O, n as R, o as D } from "../chunks/scheduler.37c15a92.js";
import {
  S as K,
  i as Q,
  g as s,
  s as o,
  r as M,
  A as V,
  h as l,
  f as a,
  c as i,
  j as G,
  u as E,
  x as $,
  k as J,
  y as X,
  a as n,
  v as I,
  d as S,
  t as q,
  w as A,
} from "../chunks/index.7cb9c9b8.js";
import { C as Y } from "../chunks/CourseFloatingBanner.df82c153.js";
import { H as Z, E as ee } from "../chunks/getInferenceSnippets.f9350a3f.js";

/**
 * Builds the fragment object for this page: a set of lifecycle callbacks
 * (`c`, `l`, `h`, `m`, `p`, `i`, `o`, `d`) closed over the nodes and child
 * components they manage.
 *
 * @param {*} F - Not read anywhere in this function.
 * @returns {object} The fragment with its lifecycle callbacks.
 */
function te(F) {
  // Static markup/text assigned to the page's elements.
  const introHtml =
    'In <a href="/course/chapter3">Chapter 3</a>, we looked at how to fine-tune a model on a given task. When we do that, we use the same tokenizer that the model was pretrained with — but what do we do when we want to train a model from scratch? In these cases, using a tokenizer that was pretrained on a corpus from another domain or language is typically suboptimal. For example, a tokenizer that’s trained on an English corpus will perform poorly on a corpus of Japanese texts because the use of spaces and punctuation is very different in the two languages.';
  // The line break inside this paragraph is spelled as an escape sequence.
  const overviewHtml =
    'In this chapter, you will learn how to train a brand new tokenizer on a corpus of texts, so it can then be used to pretrain a language model. This will all be done with the help of the <a href="https://github.com/huggingface/tokenizers" rel="nofollow">🤗 Tokenizers</a> library, which provides the “fast” tokenizers in the <a href="https://github.com/huggingface/transformers" rel="nofollow">🤗 Transformers</a> library. \nWe’ll take a close look at the features that this library provides, and explore how the fast tokenizers differ from the “slow” versions.';
  const topicsLeadText = "Topics we will cover include:";
  const topicsHtml =
    "<li>How to train a new tokenizer similar to the one used by a given checkpoint on a new corpus of texts</li> <li>The special features of fast tokenizers</li> <li>The differences between the three main subword tokenization algorithms used in NLP today</li> <li>How to build a tokenizer from scratch with the 🤗 Tokenizers library and train it on some data</li>";
  const outlookHtml =
    'The techniques introduced in this chapter will prepare you for the section in <a href="/course/chapter7/6">Chapter 7</a> where we look at creating a language model for Python source code. Let’s start by looking at what it means to “train” a tokenizer in the first place.';

  // Nodes created in `c()` or claimed in `l()`. The `sep*` values come from
  // `o()` / `i(nodes)` — presumably separator text nodes; verify in the runtime.
  let metaTag;
  let sepAfterMeta;
  let leadingP;
  let sepAfterLeadingP;
  let sepAfterHeading;
  let sepAfterBanner;
  let introP;
  let sepAfterIntro;
  let overviewP;
  let sepAfterOverview;
  let topicsLeadP;
  let sepAfterTopicsLead;
  let topicsList;
  let sepAfterTopicsList;
  let outlookP;
  let sepAfterOutlook;
  let sepAfterSourceLink;
  let trailingP;
  // Set to true by `m`/`i` and to false by `o`; guards `i` against repeat work.
  let isActive;

  // Child components, constructed once in the same order as the original.
  const heading = new Z({
    props: { title: "Introduction", local: "introduction", headingTag: "h1" },
  });
  const banner = new Y({
    props: { chapter: 6, classNames: "absolute z-10 right-0 top-0" },
  });
  const sourceLink = new ee({
    props: {
      source:
        "https://github.com/huggingface/course/blob/main/chapters/en/chapter6/1.mdx",
    },
  });

  return {
    // Create every node from scratch, then apply attributes via `h()`.
    c() {
      metaTag = s("meta");
      sepAfterMeta = o();
      leadingP = s("p");
      sepAfterLeadingP = o();
      M(heading.$$.fragment);
      sepAfterHeading = o();
      M(banner.$$.fragment);
      sepAfterBanner = o();
      introP = s("p");
      introP.innerHTML = introHtml;
      sepAfterIntro = o();
      overviewP = s("p");
      overviewP.innerHTML = overviewHtml;
      sepAfterOverview = o();
      topicsLeadP = s("p");
      topicsLeadP.textContent = topicsLeadText;
      sepAfterTopicsLead = o();
      topicsList = s("ul");
      topicsList.innerHTML = topicsHtml;
      sepAfterTopicsList = o();
      outlookP = s("p");
      outlookP.innerHTML = outlookHtml;
      sepAfterOutlook = o();
      M(sourceLink.$$.fragment);
      sepAfterSourceLink = o();
      trailingP = s("p");
      this.h();
    },

    // Same node sequence as `c()`, but obtained from the passed-in `nodes`
    // (and from `document.head` for the meta tag) via the claim helpers.
    l(nodes) {
      const headNodes = V("svelte-u9bgzb", document.head);
      metaTag = l(headNodes, "META", { name: true, content: true });
      headNodes.forEach(a);
      sepAfterMeta = i(nodes);
      leadingP = l(nodes, "P", {});
      G(leadingP).forEach(a);
      sepAfterLeadingP = i(nodes);
      E(heading.$$.fragment, nodes);
      sepAfterHeading = i(nodes);
      E(banner.$$.fragment, nodes);
      sepAfterBanner = i(nodes);

      // Static content is rewritten only when the value returned by `$()`
      // differs from the expected tag for that element.
      introP = l(nodes, "P", { "data-svelte-h": true });
      if ($(introP) !== "svelte-1bhpfou") {
        introP.innerHTML = introHtml;
      }
      sepAfterIntro = i(nodes);

      overviewP = l(nodes, "P", { "data-svelte-h": true });
      if ($(overviewP) !== "svelte-cst5sb") {
        overviewP.innerHTML = overviewHtml;
      }
      sepAfterOverview = i(nodes);

      topicsLeadP = l(nodes, "P", { "data-svelte-h": true });
      if ($(topicsLeadP) !== "svelte-1v9fbl3") {
        topicsLeadP.textContent = topicsLeadText;
      }
      sepAfterTopicsLead = i(nodes);

      topicsList = l(nodes, "UL", { "data-svelte-h": true });
      if ($(topicsList) !== "svelte-1mhak9") {
        topicsList.innerHTML = topicsHtml;
      }
      sepAfterTopicsList = i(nodes);

      outlookP = l(nodes, "P", { "data-svelte-h": true });
      if ($(outlookP) !== "svelte-l9t9z5") {
        outlookP.innerHTML = outlookHtml;
      }
      sepAfterOutlook = i(nodes);

      E(sourceLink.$$.fragment, nodes);
      sepAfterSourceLink = i(nodes);
      trailingP = l(nodes, "P", {});
      G(trailingP).forEach(a);
      this.h();
    },

    // Attribute setup shared by `c()` and `l()`.
    h() {
      J(metaTag, "name", "hf:doc:metadata");
      J(metaTag, "content", ae);
    },

    // Place the meta tag in `document.head` and everything else into `target`
    // relative to `anchor`, in document order.
    m(target, anchor) {
      X(document.head, metaTag);
      n(target, sepAfterMeta, anchor);
      n(target, leadingP, anchor);
      n(target, sepAfterLeadingP, anchor);
      I(heading, target, anchor);
      n(target, sepAfterHeading, anchor);
      I(banner, target, anchor);
      n(target, sepAfterBanner, anchor);
      n(target, introP, anchor);
      n(target, sepAfterIntro, anchor);
      n(target, overviewP, anchor);
      n(target, sepAfterOverview, anchor);
      n(target, topicsLeadP, anchor);
      n(target, sepAfterTopicsLead, anchor);
      n(target, topicsList, anchor);
      n(target, sepAfterTopicsList, anchor);
      n(target, outlookP, anchor);
      n(target, sepAfterOutlook, anchor);
      I(sourceLink, target, anchor);
      n(target, sepAfterSourceLink, anchor);
      n(target, trailingP, anchor);
      isActive = true;
    },

    // Nothing on this page is updated after creation.
    p: R,

    i(local) {
      if (isActive) {
        return;
      }
      S(heading.$$.fragment, local);
      S(banner.$$.fragment, local);
      S(sourceLink.$$.fragment, local);
      isActive = true;
    },

    o(local) {
      q(heading.$$.fragment, local);
      q(banner.$$.fragment, local);
      q(sourceLink.$$.fragment, local);
      isActive = false;
    },

    // Body nodes are removed only when `detaching` is truthy; the meta tag and
    // the child components are always torn down.
    d(detaching) {
      if (detaching) {
        a(sepAfterMeta);
        a(leadingP);
        a(sepAfterLeadingP);
        a(sepAfterHeading);
        a(sepAfterBanner);
        a(introP);
        a(sepAfterIntro);
        a(overviewP);
        a(sepAfterOverview);
        a(topicsLeadP);
        a(sepAfterTopicsLead);
        a(topicsList);
        a(sepAfterTopicsList);
        a(outlookP);
        a(sepAfterOutlook);
        a(sepAfterSourceLink);
        a(trailingP);
      }
      a(metaTag);
      A(heading, detaching);
      A(banner, detaching);
      A(sourceLink, detaching);
    },
  };
}

// JSON written into the `content` attribute of the "hf:doc:metadata" meta tag.
const ae =
  '{"title":"Introduction","local":"introduction","sections":[],"depth":1}';

/**
 * Instance setup passed to `Q` by the component constructor. Registers a
 * callback through `D` that reads the `fw` query parameter from the current
 * URL; the value read is not stored or used.
 *
 * @param {*} F - Not read.
 * @returns {Array} Always a new empty array.
 */
function ne(F) {
  D(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component; all wiring is delegated to `Q`. */
class le extends K {
  constructor(options) {
    super();
    Q(this, options, ne, te, O, {});
  }
}

export { le as component };
Xet Storage Details
- Size:
- 4.24 kB
- Xet hash:
- c6219c8848e2e0488908e3acc5ea8e7f1d97f5ed7ed0e6de3e9b8c20caa57701
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.