Buckets:
| import{s as Cs,n as Us,o as ks}from"../chunks/scheduler.d75c11ed.js";import{S as Gs,i as Ws,e as p,s as l,c as m,h as vs,a as r,d as n,b as t,f as Ts,g as u,j as c,k as Js,l as zs,m as e,n as h,t as i,o as b,p as j}from"../chunks/index.4ec9dfe9.js";import{C as qs,H as is,E as Zs}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js";import{C as Y}from"../chunks/CodeBlock.5919a092.js";function Is(bs){let o,V,H,F,d,P,f,N,g,js="This guide shows specific methods for processing text datasets. Learn how to:",Q,y,os='<li>Tokenize a dataset with <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a>.</li> <li>Align dataset labels with label ids for NLI datasets.</li>',A,$,ds='For a guide on how to process any type of dataset, take a look at the <a class="underline decoration-sky-400 decoration-2 font-semibold" href="./process">general process guide</a>.',X,M,S,_,fs='The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> function supports processing batches of examples at once which speeds up tokenization.',E,w,gs='Load a tokenizer from 🤗 <a href="https://huggingface.co/transformers/" rel="nofollow">Transformers</a>:',B,x,D,T,ys='Set the <code>batched</code> parameter to <code>True</code> in the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> function to apply the tokenizer to batches of examples:',K,J,O,C,$s='The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> function converts the returned values to a PyArrow-supported format. But explicitly returning the tensors as NumPy arrays is faster because it is a natively supported PyArrow format. Set <code>return_tensors="np"</code> when you tokenize your text:',ss,U,as,k,ns,G,Ms='The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.align_labels_with_mapping">align_labels_with_mapping()</a> function aligns a dataset label id with the label name. Not all 🤗 Transformers models follow the prescribed label mapping of the original dataset, especially for NLI datasets. For example, the <a href="https://huggingface.co/datasets/glue" rel="nofollow">MNLI</a> dataset uses the following label mapping:',es,W,ls,v,_s="To align the dataset label mapping with the mapping used by a model, create a dictionary of the label name and id to align on:",ts,z,ps,q,ws='Pass the dictionary of the label mappings to the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.align_labels_with_mapping">align_labels_with_mapping()</a> function, and the column to align on:',rs,Z,ms,I,xs="You can also use this function to assign a custom mapping of labels to ids.",us,L,cs,R,hs;return d=new qs({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),f=new is({props:{title:"Process text data",local:"process-text-data",headingTag:"h1"}}),M=new is({props:{title:"Map",local:"map",headingTag:"h2"}}),x=new Y({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZCglMjJiZXJ0LWJhc2UtY2FzZWQlMjIp",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| <span class="hljs-meta">>>> </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"bert-base-cased"</span>)`,wrap:!1}}),J=new Y({props:{code:"ZGF0YXNldCUyMCUzRCUyMGRhdGFzZXQubWFwKGxhbWJkYSUyMGV4YW1wbGVzJTNBJTIwdG9rZW5pemVyKGV4YW1wbGVzJTVCJTIydGV4dCUyMiU1RCklMkMlMjBiYXRjaGVkJTNEVHJ1ZSklMEFkYXRhc2V0JTVCMCU1RA==",highlighted:`<span class="hljs-meta">>>> </span>dataset = dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> examples: tokenizer(examples[<span class="hljs-string">"text"</span>]), batched=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">>>> </span>dataset[<span class="hljs-number">0</span>] | |
| {<span class="hljs-string">'text'</span>: <span class="hljs-string">'the rock is destined to be the 21st century\\'s new " conan " and that he\\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'</span>, | |
| <span class="hljs-string">'label'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">'input_ids'</span>: [<span class="hljs-number">101</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">2600</span>, <span class="hljs-number">2003</span>, <span class="hljs-number">16036</span>, <span class="hljs-number">2000</span>, <span class="hljs-number">2022</span>, <span class="hljs-number">1996</span>, <span class="hljs-number">7398</span>, <span class="hljs-number">2301</span>, <span class="hljs-number">1005</span>, <span class="hljs-number">1055</span>, <span class="hljs-number">2047</span>, <span class="hljs-number">1000</span>, <span class="hljs-number">16608</span>, <span class="hljs-number">1000</span>, <span class="hljs-number">1998</span>, <span class="hljs-number">2008</span>, <span class="hljs-number">2002</span>, <span class="hljs-number">1005</span>, <span class="hljs-number">1055</span>, <span class="hljs-number">2183</span>, <span class="hljs-number">2000</span>, <span class="hljs-number">2191</span>, <span class="hljs-number">1037</span>, <span class="hljs-number">17624</span>, <span class="hljs-number">2130</span>, <span class="hljs-number">3618</span>, <span class="hljs-number">2084</span>, <span class="hljs-number">7779</span>, <span class="hljs-number">29058</span>, <span class="hljs-number">8625</span>, <span class="hljs-number">13327</span>, <span class="hljs-number">1010</span>, <span class="hljs-number">3744</span>, <span class="hljs-number">1011</span>, <span class="hljs-number">18856</span>, <span class="hljs-number">19513</span>, <span class="hljs-number">3158</span>, <span class="hljs-number">5477</span>, <span class="hljs-number">4168</span>, <span class="hljs-number">2030</span>, <span class="hljs-number">7112</span>, <span class="hljs-number">16562</span>, <span class="hljs-number">2140</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>], | |
| <span class="hljs-string">'token_type_ids'</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>], | |
| <span class="hljs-string">'attention_mask'</span>: [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]}`,wrap:!1}}),U=new Y({props:{code:"ZGF0YXNldCUyMCUzRCUyMGRhdGFzZXQubWFwKGxhbWJkYSUyMGV4YW1wbGVzJTNBJTIwdG9rZW5pemVyKGV4YW1wbGVzJTVCJTIydGV4dCUyMiU1RCUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIybnAlMjIpJTJDJTIwYmF0Y2hlZCUzRFRydWUp",highlighted:'<span class="hljs-meta">>>> </span>dataset = dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> examples: tokenizer(examples[<span class="hljs-string">"text"</span>], return_tensors=<span class="hljs-string">"np"</span>), batched=<span class="hljs-literal">True</span>)',wrap:!1}}),k=new is({props:{title:"Align",local:"align",headingTag:"h2"}}),W=new Y({props:{code:"bGFiZWwyaWQlMjAlM0QlMjAlN0IlMjJlbnRhaWxtZW50JTIyJTNBJTIwMCUyQyUyMCUyMm5ldXRyYWwlMjIlM0ElMjAxJTJDJTIwJTIyY29udHJhZGljdGlvbiUyMiUzQSUyMDIlN0Q=",highlighted:'<span class="hljs-meta">>>> </span>label2id = {<span class="hljs-string">"entailment"</span>: <span class="hljs-number">0</span>, <span class="hljs-string">"neutral"</span>: <span class="hljs-number">1</span>, <span class="hljs-string">"contradiction"</span>: <span class="hljs-number">2</span>}',wrap:!1}}),z=new Y({props:{code:"bGFiZWwyaWQlMjAlM0QlMjAlN0IlMjJjb250cmFkaWN0aW9uJTIyJTNBJTIwMCUyQyUyMCUyMm5ldXRyYWwlMjIlM0ElMjAxJTJDJTIwJTIyZW50YWlsbWVudCUyMiUzQSUyMDIlN0Q=",highlighted:'<span class="hljs-meta">>>> </span>label2id = {<span class="hljs-string">"contradiction"</span>: <span class="hljs-number">0</span>, <span class="hljs-string">"neutral"</span>: <span class="hljs-number">1</span>, <span class="hljs-string">"entailment"</span>: <span class="hljs-number">2</span>}',wrap:!1}}),Z=new Y({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBbW5saSUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJueXUtbWxsJTJGZ2x1ZSUyMiUyQyUyMCUyMm1ubGklMjIlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyKSUwQW1ubGlfYWxpZ25lZCUyMCUzRCUyMG1ubGkuYWxpZ25fbGFiZWxzX3dpdGhfbWFwcGluZyhsYWJlbDJpZCUyQyUyMCUyMmxhYmVsJTIyKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>mnli = load_dataset(<span class="hljs-string">"nyu-mll/glue"</span>, <span class="hljs-string">"mnli"</span>, split=<span class="hljs-string">"train"</span>) | |
| <span class="hljs-meta">>>> </span>mnli_aligned = mnli.align_labels_with_mapping(label2id, <span class="hljs-string">"label"</span>)`,wrap:!1}}),L=new Zs({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/nlp_process.mdx"}}),{c(){o=p("meta"),V=l(),H=p("p"),F=l(),m(d.$$.fragment),P=l(),m(f.$$.fragment),N=l(),g=p("p"),g.textContent=js,Q=l(),y=p("ul"),y.innerHTML=os,A=l(),$=p("p"),$.innerHTML=ds,X=l(),m(M.$$.fragment),S=l(),_=p("p"),_.innerHTML=fs,E=l(),w=p("p"),w.innerHTML=gs,B=l(),m(x.$$.fragment),D=l(),T=p("p"),T.innerHTML=ys,K=l(),m(J.$$.fragment),O=l(),C=p("p"),C.innerHTML=$s,ss=l(),m(U.$$.fragment),as=l(),m(k.$$.fragment),ns=l(),G=p("p"),G.innerHTML=Ms,es=l(),m(W.$$.fragment),ls=l(),v=p("p"),v.textContent=_s,ts=l(),m(z.$$.fragment),ps=l(),q=p("p"),q.innerHTML=ws,rs=l(),m(Z.$$.fragment),ms=l(),I=p("p"),I.textContent=xs,us=l(),m(L.$$.fragment),cs=l(),R=p("p"),this.h()},l(s){const a=vs("svelte-u9bgzb",document.head);o=r(a,"META",{name:!0,content:!0}),a.forEach(n),V=t(s),H=r(s,"P",{}),Ts(H).forEach(n),F=t(s),u(d.$$.fragment,s),P=t(s),u(f.$$.fragment,s),N=t(s),g=r(s,"P",{"data-svelte-h":!0}),c(g)!=="svelte-zxifil"&&(g.textContent=js),Q=t(s),y=r(s,"UL",{"data-svelte-h":!0}),c(y)!=="svelte-d5y30q"&&(y.innerHTML=os),A=t(s),$=r(s,"P",{"data-svelte-h":!0}),c($)!=="svelte-3s2bzp"&&($.innerHTML=ds),X=t(s),u(M.$$.fragment,s),S=t(s),_=r(s,"P",{"data-svelte-h":!0}),c(_)!=="svelte-c58mc4"&&(_.innerHTML=fs),E=t(s),w=r(s,"P",{"data-svelte-h":!0}),c(w)!=="svelte-b5bjp1"&&(w.innerHTML=gs),B=t(s),u(x.$$.fragment,s),D=t(s),T=r(s,"P",{"data-svelte-h":!0}),c(T)!=="svelte-sduubz"&&(T.innerHTML=ys),K=t(s),u(J.$$.fragment,s),O=t(s),C=r(s,"P",{"data-svelte-h":!0}),c(C)!=="svelte-u93075"&&(C.innerHTML=$s),ss=t(s),u(U.$$.fragment,s),as=t(s),u(k.$$.fragment,s),ns=t(s),G=r(s,"P",{"data-svelte-h":!0}),c(G)!=="svelte-bnqvv3"&&(G.innerHTML=Ms),es=t(s),u(W.$$.fragment,s),ls=t(s),v=r(s,"P",{"data-svelte-h":!0}),c(v)!=="svelte-tn6t6n"&&(v.textContent=_s),ts=t(s),u(z.$$.fragment,s),ps=t(s),q=r(s,"P",{"data-svelte-h":!0}),c(q)!=="svelte-1ufodl"&&(q.innerHTML=ws),rs=t(s),u(Z.$$.fragment,s),ms=t(s),I=r(s,"P",{"data-svelte-h":!0}),c(I)!=="svelte-18y4a1w"&&(I.textContent=xs),us=t(s),u(L.$$.fragment,s),cs=t(s),R=r(s,"P",{}),Ts(R).forEach(n),this.h()},h(){Js(o,"name","hf:doc:metadata"),Js(o,"content",Ls)},m(s,a){zs(document.head,o),e(s,V,a),e(s,H,a),e(s,F,a),h(d,s,a),e(s,P,a),h(f,s,a),e(s,N,a),e(s,g,a),e(s,Q,a),e(s,y,a),e(s,A,a),e(s,$,a),e(s,X,a),h(M,s,a),e(s,S,a),e(s,_,a),e(s,E,a),e(s,w,a),e(s,B,a),h(x,s,a),e(s,D,a),e(s,T,a),e(s,K,a),h(J,s,a),e(s,O,a),e(s,C,a),e(s,ss,a),h(U,s,a),e(s,as,a),h(k,s,a),e(s,ns,a),e(s,G,a),e(s,es,a),h(W,s,a),e(s,ls,a),e(s,v,a),e(s,ts,a),h(z,s,a),e(s,ps,a),e(s,q,a),e(s,rs,a),h(Z,s,a),e(s,ms,a),e(s,I,a),e(s,us,a),h(L,s,a),e(s,cs,a),e(s,R,a),hs=!0},p:Us,i(s){hs||(i(d.$$.fragment,s),i(f.$$.fragment,s),i(M.$$.fragment,s),i(x.$$.fragment,s),i(J.$$.fragment,s),i(U.$$.fragment,s),i(k.$$.fragment,s),i(W.$$.fragment,s),i(z.$$.fragment,s),i(Z.$$.fragment,s),i(L.$$.fragment,s),hs=!0)},o(s){b(d.$$.fragment,s),b(f.$$.fragment,s),b(M.$$.fragment,s),b(x.$$.fragment,s),b(J.$$.fragment,s),b(U.$$.fragment,s),b(k.$$.fragment,s),b(W.$$.fragment,s),b(z.$$.fragment,s),b(Z.$$.fragment,s),b(L.$$.fragment,s),hs=!1},d(s){s&&(n(V),n(H),n(F),n(P),n(N),n(g),n(Q),n(y),n(A),n($),n(X),n(S),n(_),n(E),n(w),n(B),n(D),n(T),n(K),n(O),n(C),n(ss),n(as),n(ns),n(G),n(es),n(ls),n(v),n(ts),n(ps),n(q),n(rs),n(ms),n(I),n(us),n(cs),n(R)),n(o),j(d,s),j(f,s),j(M,s),j(x,s),j(J,s),j(U,s),j(k,s),j(W,s),j(z,s),j(Z,s),j(L,s)}}}const Ls='{"title":"Process text data","local":"process-text-data","sections":[{"title":"Map","local":"map","sections":[],"depth":2},{"title":"Align","local":"align","sections":[],"depth":2}],"depth":1}';function Ys(bs){return ks(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Ps extends Gs{constructor(o){super(),Ws(this,o,Ys,Is,Cs,{})}}export{Ps as component}; | |
Xet Storage Details
- Size:
- 16.3 kB
- Xet hash:
- 8ade91b44e630ca76a75453972bf977fc9474caa9ffe0327f818f6d6e57173d3
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.