Buckets:
| import{s as K,n as O,o as tt}from"../chunks/scheduler.bdbef820.js";import{S as st,i as at,g as p,s as l,r as T,A as et,h as i,f as a,c as n,j as A,u as j,x as b,k as D,y as lt,a as e,v as x,d as _,t as U,w as q}from"../chunks/index.c0aea24a.js";import{C as E}from"../chunks/CodeBlock.6ccca92e.js";import{H as nt,E as ot}from"../chunks/EditOnGithub.725ee0c1.js";function pt(W){let o,w,R,C,m,Z,d,L='This guide shows you how to load text datasets. To learn how to load any type of dataset, take a look at the <a class="underline decoration-sky-400 decoration-2 font-semibold" href="./loading">general loading guide</a>.',I,r,Q="Text files are one of the most common file types for storing a dataset. By default, 🤗 Datasets samples a text file line by line to build the dataset.",X,h,k,u,B="To sample a text file by paragraph or even an entire document, use the <code>sample_by</code> parameter:",G,c,Y,y,P="You can also use grep patterns to load specific files:",F,f,V,g,S="To load remote text files via HTTP, pass the URLs instead:",z,M,v,J,N,$,H;return m=new nt({props:{title:"Load text data",local:"load-text-data",headingTag:"h1"}}),h=new E({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0ZXh0JTIyJTJDJTIwZGF0YV9maWxlcyUzRCU3QiUyMnRyYWluJTIyJTNBJTIwJTVCJTIybXlfdGV4dF8xLnR4dCUyMiUyQyUyMCUyMm15X3RleHRfMi50eHQlMjIlNUQlMkMlMjAlMjJ0ZXN0JTIyJTNBJTIwJTIybXlfdGVzdF9maWxlLnR4dCUyMiU3RCklMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRleHQlMjIlMkMlMjBkYXRhX2RpciUzRCUyMnBhdGglMkZ0byUyRnRleHQlMkZkYXRhc2V0JTIyKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_files={<span class="hljs-string">"train"</span>: [<span class="hljs-string">"my_text_1.txt"</span>, <span class="hljs-string">"my_text_2.txt"</span>], <span class="hljs-string">"test"</span>: <span class="hljs-string">"my_test_file.txt"</span>}) | |
| <span class="hljs-comment"># Load from a directory</span> | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_dir=<span class="hljs-string">"path/to/text/dataset"</span>)`,wrap:!1}}),c=new E({props:{code:"ZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0ZXh0JTIyJTJDJTIwZGF0YV9maWxlcyUzRCU3QiUyMnRyYWluJTIyJTNBJTIwJTIybXlfdHJhaW5fZmlsZS50eHQlMjIlMkMlMjAlMjJ0ZXN0JTIyJTNBJTIwJTIybXlfdGVzdF9maWxlLnR4dCUyMiU3RCUyQyUyMHNhbXBsZV9ieSUzRCUyMnBhcmFncmFwaCUyMiklMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRleHQlMjIlMkMlMjBkYXRhX2ZpbGVzJTNEJTdCJTIydHJhaW4lMjIlM0ElMjAlMjJteV90cmFpbl9maWxlLnR4dCUyMiUyQyUyMCUyMnRlc3QlMjIlM0ElMjAlMjJteV90ZXN0X2ZpbGUudHh0JTIyJTdEJTJDJTIwc2FtcGxlX2J5JTNEJTIyZG9jdW1lbnQlMjIp",highlighted:`<span class="hljs-comment"># Sample by paragraph</span> | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_files={<span class="hljs-string">"train"</span>: <span class="hljs-string">"my_train_file.txt"</span>, <span class="hljs-string">"test"</span>: <span class="hljs-string">"my_test_file.txt"</span>}, sample_by=<span class="hljs-string">"paragraph"</span>) | |
| <span class="hljs-comment"># Sample by document</span> | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_files={<span class="hljs-string">"train"</span>: <span class="hljs-string">"my_train_file.txt"</span>, <span class="hljs-string">"test"</span>: <span class="hljs-string">"my_test_file.txt"</span>}, sample_by=<span class="hljs-string">"document"</span>)`,wrap:!1}}),f=new E({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBYzRfc3Vic2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmFsbGVuYWklMkZjNCUyMiUyQyUyMGRhdGFfZmlsZXMlM0QlMjJlbiUyRmM0LXRyYWluLjAwMDAqLW9mLTAxMDI0Lmpzb24uZ3olMjIp",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>c4_subset = load_dataset(<span class="hljs-string">"allenai/c4"</span>, data_files=<span class="hljs-string">"en/c4-train.0000*-of-01024.json.gz"</span>)`,wrap:!1}}),M=new E({props:{code:"ZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0ZXh0JTIyJTJDJTIwZGF0YV9maWxlcyUzRCUyMmh0dHBzJTNBJTJGJTJGaHVnZ2luZ2ZhY2UuY28lMkZkYXRhc2V0cyUyRmxob2VzdHElMkZ0ZXN0JTJGcmVzb2x2ZSUyRm1haW4lMkZzb21lX3RleHQudHh0JTIyKQ==",highlighted:'<span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_files=<span class="hljs-string">"https://huggingface.co/datasets/lhoestq/test/resolve/main/some_text.txt"</span>)',wrap:!1}}),J=new ot({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/nlp_load.mdx"}}),{c(){o=p("meta"),w=l(),R=p("p"),C=l(),T(m.$$.fragment),Z=l(),d=p("p"),d.innerHTML=L,I=l(),r=p("p"),r.textContent=Q,X=l(),T(h.$$.fragment),k=l(),u=p("p"),u.innerHTML=B,G=l(),T(c.$$.fragment),Y=l(),y=p("p"),y.textContent=P,F=l(),T(f.$$.fragment),V=l(),g=p("p"),g.textContent=S,z=l(),T(M.$$.fragment),v=l(),T(J.$$.fragment),N=l(),$=p("p"),this.h()},l(t){const s=et("svelte-u9bgzb",document.head);o=i(s,"META",{name:!0,content:!0}),s.forEach(a),w=n(t),R=i(t,"P",{}),A(R).forEach(a),C=n(t),j(m.$$.fragment,t),Z=n(t),d=i(t,"P",{"data-svelte-h":!0}),b(d)!=="svelte-w06s16"&&(d.innerHTML=L),I=n(t),r=i(t,"P",{"data-svelte-h":!0}),b(r)!=="svelte-1nologr"&&(r.textContent=Q),X=n(t),j(h.$$.fragment,t),k=n(t),u=i(t,"P",{"data-svelte-h":!0}),b(u)!=="svelte-7z7mls"&&(u.innerHTML=B),G=n(t),j(c.$$.fragment,t),Y=n(t),y=i(t,"P",{"data-svelte-h":!0}),b(y)!=="svelte-19wmves"&&(y.textContent=P),F=n(t),j(f.$$.fragment,t),V=n(t),g=i(t,"P",{"data-svelte-h":!0}),b(g)!=="svelte-129tbg5"&&(g.textContent=S),z=n(t),j(M.$$.fragment,t),v=n(t),j(J.$$.fragment,t),N=n(t),$=i(t,"P",{}),A($).forEach(a),this.h()},h(){D(o,"name","hf:doc:metadata"),D(o,"content",it)},m(t,s){lt(document.head,o),e(t,w,s),e(t,R,s),e(t,C,s),x(m,t,s),e(t,Z,s),e(t,d,s),e(t,I,s),e(t,r,s),e(t,X,s),x(h,t,s),e(t,k,s),e(t,u,s),e(t,G,s),x(c,t,s),e(t,Y,s),e(t,y,s),e(t,F,s),x(f,t,s),e(t,V,s),e(t,g,s),e(t,z,s),x(M,t,s),e(t,v,s),x(J,t,s),e(t,N,s),e(t,$,s),H=!0},p:O,i(t){H||(_(m.$$.fragment,t),_(h.$$.fragment,t),_(c.$$.fragment,t),_(f.$$.fragment,t),_(M.$$.fragment,t),_(J.$$.fragment,t),H=!0)},o(t){U(m.$$.fragment,t),U(h.$$.fragment,t),U(c.$$.fragment,t),U(f.$$.fragment,t),U(M.$$.fragment,t),U(J.$$.fragment,t),H=!1},d(t){t&&(a(w),a(R),a(C),a(Z),a(d),a(I),a(r),a(X),a(k),a(u),a(G),a(Y),a(y),a(F),a(V),a(g),a(z),a(v),a(N),a($)),a(o),q(m,t),q(h,t),q(c,t),q(f,t),q(M,t),q(J,t)}}}const it='{"title":"Load text data","local":"load-text-data","sections":[],"depth":1}';function mt(W){return tt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class ct extends st{constructor(o){super(),at(this,o,mt,pt,K,{})}}export{ct as component}; | |
Xet Storage Details
- Size:
- 7.25 kB
- Xet hash:
- b97b13bf528703639616dccf2b9be9606de6c93e8ad9d1b5b841e222cefd5f92
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.