Buckets:

rtrm's picture
download
raw
17.4 kB
// Compiled page component for the "End-of-chapter quiz" page (chapter 6).
//
// The helpers imported below come from minified chunks that are not visible
// from this file; their aliases are kept exactly as generated. Roles noted in
// comments are inferred from how this file calls them (they look like the
// Svelte internal runtime) -- NOTE(review): confirm against the chunk sources.
import { s as ye, n as ke, o as be } from "../chunks/scheduler.37c15a92.js";
import {
  S as ze,
  i as ve,
  g as ue,
  s as i,
  r as s,
  A as Te,
  h as de,
  f as a,
  c as o,
  j as we,
  u as r,
  x as qe,
  k as xe,
  y as We,
  a as n,
  v as l,
  d as h,
  t as p,
  w as m,
} from "../chunks/index.7cb9c9b8.js";
import { C as _e } from "../chunks/CourseFloatingBanner.df82c153.js";
import { Q as g } from "../chunks/Question.7e41e492.js";
import { H as f, E as Pe } from "../chunks/getInferenceSnippets.b6a8c7d2.js";

/**
 * Builds the fragment object for the quiz page: a <meta> tag appended to
 * document.head, a page heading, a banner, an intro paragraph, ten
 * heading/question pairs and a trailing source-link component.
 *
 * The returned object exposes the lifecycle hooks `c`, `l`, `h`, `m`, `p`,
 * `i`, `o` and `d`. Child components and DOM nodes are always processed in
 * document order.
 *
 * @param {*} ce - Unused by this fragment.
 * @returns {object} The fragment object consumed by `ve` in the `He` constructor.
 */
function Ie(ce) {
  // DOM nodes, assigned by c() (fresh creation) or l() (claimed from existing nodes).
  let meta;
  let leadSpace;
  let leadParagraph;
  let afterLeadSpace;
  let afterHeadingSpace;
  let afterBannerSpace;
  let intro;
  let afterIntroSpace;
  let gaps = []; // one separator node after each entry of `body`
  let tailParagraph;
  // Set by m() and i(), cleared by o(); makes i() a no-op while it is set.
  let isActive;

  const introText = "Let’s test what you learned in this chapter!";

  // Builds one quiz entry: an h3 heading component followed by its question component.
  const quizSection = (title, local, choices) => [
    new f({ props: { title, local, headingTag: "h3" } }),
    new g({ props: { choices } }),
  ];

  const pageHeading = new f({
    props: { title: "End-of-chapter quiz", local: "end-of-chapter-quiz", headingTag: "h1" },
  });
  const banner = new _e({ props: { chapter: 6, classNames: "absolute z-10 right-0 top-0" } });

  // Everything rendered after the intro paragraph, in document order.
  const body = [
    ...quizSection("1. When should you train a new tokenizer?", "1-when-should-you-train-a-new-tokenizer", [
      {
        text: "When your dataset is similar to that used by an existing pretrained model, and you want to pretrain a new model",
        explain: "In this case, to save time and compute resources, a better choice would be to use the same tokenizer as the pretrained model and fine-tune that model instead.",
      },
      {
        text: "When your dataset is similar to that used by an existing pretrained model, and you want to fine-tune a new model using this pretrained model",
        explain: "To fine-tune a model from a pretrained model, you should always use the same tokenizer.",
      },
      {
        text: "When your dataset is different from the one used by an existing pretrained model, and you want to pretrain a new model",
        explain: "Correct! In this case there's no advantage to using the same tokenizer.",
        correct: true,
      },
      {
        text: "When your dataset is different from the one used by an existing pretrained model, but you want to fine-tune a new model using this pretrained model",
        explain: "To fine-tune a model from a pretrained model, you should always use the same tokenizer.",
      },
    ]),
    ...quizSection(
      "2. What is the advantage of using a generator of lists of texts compared to a list of lists of texts when using train_new_from_iterator() ?",
      "2-what-is-the-advantage-of-using-a-generator-of-lists-of-texts-compared-to-a-list-of-lists-of-texts-when-using-trainnewfromiterator-",
      [
        {
          text: "That's the only type the method <code>train_new_from_iterator()</code> accepts.",
          explain: "A list of lists of texts is a particular kind of generator of lists of texts, so the method will accept this too. Try again!",
        },
        {
          text: "You will avoid loading the whole dataset into memory at once.",
          explain: "Right! Each batch of texts will be released from memory when you iterate, and the gain will be especially visible if you use 🤗 Datasets to store your texts.",
          correct: true,
        },
        {
          text: "This will allow the 🤗 Tokenizers library to use multiprocessing.",
          explain: "No, it will use multiprocessing either way.",
        },
        {
          text: "The tokenizer you train will generate better texts.",
          explain: "The tokenizer does not generate text -- are you confusing it with a language model?",
        },
      ],
    ),
    ...quizSection("3. What are the advantages of using a “fast” tokenizer?", "3-what-are-the-advantages-of-using-a-fast-tokenizer", [
      {
        text: "It can process inputs faster than a slow tokenizer when you batch lots of inputs together.",
        explain: "Correct! Thanks to parallelism implemented in Rust, it will be faster on batches of inputs. What other benefit can you think of?",
        correct: true,
      },
      {
        text: "Fast tokenizers always tokenize faster than their slow counterparts.",
        explain: "A fast tokenizer can actually be slower when you only give it one or very few texts, since it can't use parallelism.",
      },
      {
        text: "It can apply padding and truncation.",
        explain: "True, but slow tokenizers also do that.",
      },
      {
        text: "It has some additional features allowing you to map tokens to the span of text that created them.",
        explain: "Indeed -- those are called offset mappings. That's not the only advantage, though.",
        correct: true,
      },
    ]),
    ...quizSection(
      "4. How does the token-classification pipeline handle entities that span over several tokens?",
      "4-how-does-the-token-classification-pipeline-handle-entities-that-span-over-several-tokens",
      [
        {
          text: "The entities with the same label are merged into one entity.",
          explain: "That's oversimplifying things a little. Try again!",
        },
        {
          text: "There is a label for the beginning of an entity and a label for the continuation of an entity.",
          explain: "Correct!",
          correct: true,
        },
        {
          text: "In a given word, as long as the first token has the label of the entity, the whole word is considered labeled with that entity.",
          explain: "That's one strategy to handle entities. What other answers here apply?",
          correct: true,
        },
        {
          text: "When a token has the label of a given entity, any other following token with the same label is considered part of the same entity, unless it's labeled as the start of a new entity.",
          explain: "That's the most common way to group entities together -- it's not the only right answer, though.",
          correct: true,
        },
      ],
    ),
    ...quizSection("5. How does the question-answering pipeline handle long contexts?", "5-how-does-the-question-answering-pipeline-handle-long-contexts", [
      {
        text: "It doesn't really, as it truncates the long context at the maximum length accepted by the model.",
        explain: "There is a trick you can use to handle long contexts. Do you remember what it is?",
      },
      {
        text: "It splits the context into several parts and averages the results obtained.",
        explain: "No, it wouldn't make sense to average the results, as some parts of the context won't include the answer.",
      },
      {
        text: "It splits the context into several parts (with overlap) and finds the maximum score for an answer in each part.",
        explain: "That's the correct answer!",
        correct: true,
      },
      {
        text: "It splits the context into several parts (without overlap, for efficiency) and finds the maximum score for an answer in each part.",
        explain: "No, it includes some overlap between the parts to avoid a situation where the answer would be split across two parts.",
      },
    ]),
    ...quizSection("6. What is normalization?", "6-what-is-normalization", [
      {
        text: "It's any cleanup the tokenizer performs on the texts in the initial stages.",
        explain: "That's correct -- for instance, it might involve removing accents or whitespace, or lowercasing the inputs.",
        correct: true,
      },
      {
        text: "It's a data augmentation technique that involves making the text more normal by removing rare words.",
        explain: "That's incorrect! Try again.",
      },
      {
        text: "It's the final post-processing step where the tokenizer adds the special tokens.",
        explain: "That stage is simply called post-processing.",
      },
      {
        text: "It's when the embeddings are made with mean 0 and standard deviation 1, by subtracting the mean and dividing by the std.",
        explain: "That process is commonly called normalization when applied to pixel values in computer vision, but it's not what normalization means in NLP.",
      },
    ]),
    ...quizSection("7. What is pre-tokenization for a subword tokenizer?", "7-what-is-pre-tokenization-for-a-subword-tokenizer", [
      {
        text: "It's the step before the tokenization, where data augmentation (like random masking) is applied.",
        explain: "No, that step is part of the preprocessing.",
      },
      {
        text: "It's the step before the tokenization, where the desired cleanup operations are applied to the text.",
        explain: "No, that's the normalization step.",
      },
      {
        text: "It's the step before the tokenizer model is applied, to split the input into words.",
        explain: "That's the correct answer!",
        correct: true,
      },
      {
        text: "It's the step before the tokenizer model is applied, to split the input into tokens.",
        explain: "No, splitting into tokens is the job of the tokenizer model.",
      },
    ]),
    ...quizSection("8. Select the sentences that apply to the BPE model of tokenization.", "8-select-the-sentences-that-apply-to-the-bpe-model-of-tokenization", [
      {
        text: "BPE is a subword tokenization algorithm that starts with a small vocabulary and learns merge rules.",
        explain: "That's the case indeed!",
        correct: true,
      },
      {
        text: "BPE is a subword tokenization algorithm that starts with a big vocabulary and progressively removes tokens from it.",
        explain: "No, that's the approach taken by a different tokenization algorithm.",
      },
      {
        text: "BPE tokenizers learn merge rules by merging the pair of tokens that is the most frequent.",
        explain: "That's correct!",
        correct: true,
      },
      {
        text: "A BPE tokenizer learns a merge rule by merging the pair of tokens that maximizes a score that privileges frequent pairs with less frequent individual parts.",
        explain: "No, that's the strategy applied by another tokenization algorithm.",
      },
      {
        text: "BPE tokenizes words into subwords by splitting them into characters and then applying the merge rules.",
        explain: "That's correct!",
        correct: true,
      },
      {
        text: "BPE tokenizes words into subwords by finding the longest subword starting from the beginning that is in the vocabulary, then repeating the process for the rest of the text.",
        explain: "No, that's another tokenization algorithm's way of doing things.",
      },
    ]),
    ...quizSection(
      "9. Select the sentences that apply to the WordPiece model of tokenization.",
      "9-select-the-sentences-that-apply-to-the-wordpiece-model-of-tokenization",
      [
        {
          text: "WordPiece is a subword tokenization algorithm that starts with a small vocabulary and learns merge rules.",
          explain: "That's the case indeed!",
          correct: true,
        },
        {
          text: "WordPiece is a subword tokenization algorithm that starts with a big vocabulary and progressively removes tokens from it.",
          explain: "No, that's the approach taken by a different tokenization algorithm.",
        },
        {
          text: "WordPiece tokenizers learn merge rules by merging the pair of tokens that is the most frequent.",
          explain: "No, that's the strategy applied by another tokenization algorithm.",
        },
        {
          text: "A WordPiece tokenizer learns a merge rule by merging the pair of tokens that maximizes a score that privileges frequent pairs with less frequent individual parts.",
          explain: "That's correct!",
          correct: true,
        },
        {
          text: "WordPiece tokenizes words into subwords by finding the most likely segmentation into tokens, according to the model.",
          explain: "No, that's how another tokenization algorithm works.",
        },
        {
          text: "WordPiece tokenizes words into subwords by finding the longest subword starting from the beginning that is in the vocabulary, then repeating the process for the rest of the text.",
          explain: "Yes, this is how WordPiece proceeds for the encoding.",
          correct: true,
        },
      ],
    ),
    ...quizSection(
      "10. Select the sentences that apply to the Unigram model of tokenization.",
      "10-select-the-sentences-that-apply-to-the-unigram-model-of-tokenization",
      [
        {
          text: "Unigram is a subword tokenization algorithm that starts with a small vocabulary and learns merge rules.",
          explain: "No, that's the approach taken by a different tokenization algorithm.",
        },
        {
          text: "Unigram is a subword tokenization algorithm that starts with a big vocabulary and progressively removes tokens from it.",
          explain: "That's correct!",
          correct: true,
        },
        {
          text: "Unigram adapts its vocabulary by minimizing a loss computed over the whole corpus.",
          explain: "That's correct!",
          correct: true,
        },
        {
          text: "Unigram adapts its vocabulary by keeping the most frequent subwords.",
          // NOTE(review): "this incorrect" looks like a typo; this file appears to be build
          // output of the .mdx referenced below, so fix the wording there, not here.
          explain: "No, this incorrect.",
        },
        {
          text: "Unigram tokenizes words into subwords by finding the most likely segmentation into tokens, according to the model.",
          explain: "That's correct!",
          correct: true,
        },
        {
          text: "Unigram tokenizes words into subwords by splitting them into characters, then applying the merge rules.",
          explain: "No, that's how another tokenization algorithm works.",
        },
      ],
    ),
    new Pe({ props: { source: "https://github.com/huggingface/course/blob/main/chapters/en/chapter6/10.mdx" } }),
  ];

  // Every child component, in document order.
  const components = [pageHeading, banner, ...body];

  return {
    // Create: build fresh DOM nodes and create every child fragment.
    c() {
      meta = ue("meta");
      leadSpace = i();
      leadParagraph = ue("p");
      afterLeadSpace = i();
      s(pageHeading.$$.fragment);
      afterHeadingSpace = i();
      s(banner.$$.fragment);
      afterBannerSpace = i();
      intro = ue("p");
      intro.textContent = introText;
      afterIntroSpace = i();
      gaps = body.map((component) => {
        s(component.$$.fragment);
        return i();
      });
      tailParagraph = ue("p");
      this.h();
    },

    // Claim: same sequence as c(), but taking nodes from `nodes` (and from
    // document.head for the <meta> tag) instead of creating them.
    l(nodes) {
      const headNodes = Te("svelte-u9bgzb", document.head);
      meta = de(headNodes, "META", { name: true, content: true });
      headNodes.forEach(a);
      leadSpace = o(nodes);
      leadParagraph = de(nodes, "P", {});
      we(leadParagraph).forEach(a);
      afterLeadSpace = o(nodes);
      r(pageHeading.$$.fragment, nodes);
      afterHeadingSpace = o(nodes);
      r(banner.$$.fragment, nodes);
      afterBannerSpace = o(nodes);
      intro = de(nodes, "P", { "data-svelte-h": true });
      // Only rewrite the intro text when the claimed paragraph's marker differs.
      if (qe(intro) !== "svelte-19og2hy") {
        intro.textContent = introText;
      }
      afterIntroSpace = o(nodes);
      gaps = body.map((component) => {
        r(component.$$.fragment, nodes);
        return o(nodes);
      });
      tailParagraph = de(nodes, "P", {});
      we(tailParagraph).forEach(a);
      this.h();
    },

    // Set the attributes of the <meta> tag; `content` is the JSON string `Ee`.
    h() {
      xe(meta, "name", "hf:doc:metadata");
      xe(meta, "content", Ee);
    },

    // Mount: <meta> goes into document.head, everything else into `target`
    // before `anchor`, in document order.
    m(target, anchor) {
      We(document.head, meta);
      n(target, leadSpace, anchor);
      n(target, leadParagraph, anchor);
      n(target, afterLeadSpace, anchor);
      l(pageHeading, target, anchor);
      n(target, afterHeadingSpace, anchor);
      l(banner, target, anchor);
      n(target, afterBannerSpace, anchor);
      n(target, intro, anchor);
      n(target, afterIntroSpace, anchor);
      body.forEach((component, index) => {
        l(component, target, anchor);
        n(target, gaps[index], anchor);
      });
      n(target, tailParagraph, anchor);
      isActive = true;
    },

    // Update hook: the imported `ke` is used as-is (presumably a no-op).
    p: ke,

    // Calls `h` on every child fragment once, until o() clears the flag.
    // `local` is forwarded unchanged.
    i(local) {
      if (isActive) {
        return;
      }
      for (const component of components) {
        h(component.$$.fragment, local);
      }
      isActive = true;
    },

    // Calls `p` on every child fragment and clears the flag so i() can run again.
    o(local) {
      for (const component of components) {
        p(component.$$.fragment, local);
      }
      isActive = false;
    },

    // Destroy: body nodes are passed to `a` only when `detaching` is truthy;
    // the <meta> tag and the child components are always cleaned up.
    d(detaching) {
      if (detaching) {
        const ownedNodes = [
          leadSpace,
          leadParagraph,
          afterLeadSpace,
          afterHeadingSpace,
          afterBannerSpace,
          intro,
          afterIntroSpace,
          ...gaps,
          tailParagraph,
        ];
        for (const node of ownedNodes) {
          a(node);
        }
      }
      a(meta);
      for (const component of components) {
        m(component, detaching);
      }
    },
  };
}

// JSON metadata written to the `hf:doc:metadata` <meta> tag by Ie().h().
// Split across lines for readability only; the concatenated value is one JSON string.
const Ee =
  '{"title":"End-of-chapter quiz","local":"end-of-chapter-quiz","sections":[' +
  '{"title":"1. When should you train a new tokenizer?","local":"1-when-should-you-train-a-new-tokenizer","sections":[],"depth":3},' +
  '{"title":"2. What is the advantage of using a generator of lists of texts compared to a list of lists of texts when using train_new_from_iterator() ?","local":"2-what-is-the-advantage-of-using-a-generator-of-lists-of-texts-compared-to-a-list-of-lists-of-texts-when-using-trainnewfromiterator-","sections":[],"depth":3},' +
  '{"title":"3. What are the advantages of using a “fast” tokenizer?","local":"3-what-are-the-advantages-of-using-a-fast-tokenizer","sections":[],"depth":3},' +
  '{"title":"4. How does the token-classification pipeline handle entities that span over several tokens?","local":"4-how-does-the-token-classification-pipeline-handle-entities-that-span-over-several-tokens","sections":[],"depth":3},' +
  '{"title":"5. How does the question-answering pipeline handle long contexts?","local":"5-how-does-the-question-answering-pipeline-handle-long-contexts","sections":[],"depth":3},' +
  '{"title":"6. What is normalization?","local":"6-what-is-normalization","sections":[],"depth":3},' +
  '{"title":"7. What is pre-tokenization for a subword tokenizer?","local":"7-what-is-pre-tokenization-for-a-subword-tokenizer","sections":[],"depth":3},' +
  '{"title":"8. Select the sentences that apply to the BPE model of tokenization.","local":"8-select-the-sentences-that-apply-to-the-bpe-model-of-tokenization","sections":[],"depth":3},' +
  '{"title":"9. Select the sentences that apply to the WordPiece model of tokenization.","local":"9-select-the-sentences-that-apply-to-the-wordpiece-model-of-tokenization","sections":[],"depth":3},' +
  '{"title":"10. Select the sentences that apply to the Unigram model of tokenization.","local":"10-select-the-sentences-that-apply-to-the-unigram-model-of-tokenization","sections":[],"depth":3}' +
  '],"depth":1}';

/**
 * Instance initializer passed to `ve`.
 *
 * Registers a callback through `be` that reads the `fw` query parameter from
 * the current URL; the value is discarded, so the callback has no visible
 * effect in this file.
 *
 * @param {*} ce - Unused.
 * @returns {Array} Always an empty array.
 */
function Ne(ce) {
  be(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/**
 * Page component: wires the initializer `Ne` and the fragment factory `Ie`
 * into the base class through `ve`.
 */
class He extends ze {
  /**
   * @param {*} options - Forwarded unchanged to `ve`.
   */
  constructor(options) {
    super();
    ve(this, options, Ne, Ie, ye, {});
  }
}

export { He as component };

Xet Storage Details

Size:
17.4 kB
·
Xet hash:
91b5fe66e3ac8e018d88744290b95b6e56a02e3feff0bb20752b83697e6fee4b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.