Buckets:
/*
 * Minified, machine-generated page module for the "Load text data" documentation page.
 * It looks like compiled Svelte output (note the "svelte-..." hashes and the
 * `data-svelte-h` attributes) -- confirm against the build before relying on that.
 * The `source` prop handed to `ct` below points at docs/source/nlp_load.mdx in
 * huggingface/datasets, so content changes presumably belong there, not in this file.
 *
 * NOTE(review): the leading `| ` before `import`, the ` | |` / `| ` sequences at the
 * line breaks inside the `highlighted` template literals, and the trailing ` | |` after
 * the export look like table markup left over from a page extraction rather than part
 * of the real module -- verify against the built file. As written here they are part of
 * the template-literal strings, and the first/last statements do not parse.
 *
 * Contents, in order:
 *
 *  - imports: helper functions from ../chunks/scheduler and ../chunks/index, three
 *    components (ht, ut, ct) from the MermaidChart chunk, and Z from the CodeBlock chunk.
 *
 *  - yt(A): fragment factory. The argument A is never read. It instantiates
 *      ht  with an inline `containerStyle` string,
 *      ut  with title "Load text data", local "load-text-data", headingTag "h1",
 *      Z   five times (J, T, _, R, w), each with a `code` string, a `highlighted` HTML
 *          string using hljs-* classes, and wrap:!1 (false),
 *      ct  with the GitHub `source` URL,
 *    and returns an object with these methods:
 *      c  builds the <meta> and <p> elements through o(...)/l(), calls m(...) on each
 *         child component fragment, and assigns the paragraph strings via
 *         innerHTML / textContent;
 *      l  parallel path that works from an existing node `t` using dt/i/n/d and only
 *         re-assigns paragraph content when b(...) differs from the stored hash
 *         (presumably the hydration/claim path -- confirm);
 *      h  sets name="hf:doc:metadata" and content=Mt on the <meta> element p via lt(...);
 *      m  passes document.head and p to rt(...), then places every other node and child
 *         component at (t, s) via e(...)/r(...), and sets the flag P to true;
 *      p  is the imported helper pt;
 *      i  calls h(...) on each child fragment once, guarded by P;
 *      o  calls u(...) on each child fragment and resets P to false;
 *      d  calls a(...) on the inserted nodes when t is truthy, always calls a(p), and
 *         calls c(...) for every child component.
 *    The `code` strings look like base64 of the percent-encoded snippet text (the first
 *    one begins with the encoding of "from%20datasets") -- presumably the copy-to-clipboard
 *    payload; verify against the CodeBlock component.
 *
 *  - Mt: JSON text carrying the page metadata (title, local anchor, sections, depth).
 *
 *  - ft(A): hands ot(...) a callback that reads the "fw" query parameter from
 *    window.location.search and discards the value, then returns an empty array.
 *
 *  - xt: class extending `it`; its constructor calls super() and then
 *    mt(this, p, ft, yt, nt, {}). It is exported under the name `component`.
 */
| import{s as nt,n as pt,o as ot}from"../chunks/scheduler.d75c11ed.js";import{S as it,i as mt,e as o,s as l,c as m,h as dt,a as i,d as a,b as n,f as et,g as d,j as b,k as lt,l as rt,m as e,n as r,t as h,o as u,p as c}from"../chunks/index.4ec9dfe9.js";import{C as ht,H as ut,E as ct}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js";import{C as Z}from"../chunks/CodeBlock.5919a092.js";function yt(A){let p,G,I,X,y,k,M,Y,f,D='This guide shows you how to load text datasets. To learn how to load any type of dataset, take a look at the <a class="underline decoration-sky-400 decoration-2 font-semibold" href="./loading">general loading guide</a>.',F,g,K="Text files are one of the most common file types for storing a dataset. By default, 🤗 Datasets samples a text file line by line to build the dataset.",V,J,W,j,O="To sample a text file by paragraph or even an entire document, use the <code>sample_by</code> parameter:",N,T,v,x,tt="You can also use grep patterns to load specific files:",z,_,L,q,st="To load remote text files via HTTP, pass the URLs instead:",Q,R,E,$,at="To load XML data you can use the “xml” loader, which is equivalent to “text” with sample_by=“document”:",B,w,H,U,S,C,P;return y=new ht({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),M=new ut({props:{title:"Load text data",local:"load-text-data",headingTag:"h1"}}),J=new Z({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0ZXh0JTIyJTJDJTIwZGF0YV9maWxlcyUzRCU3QiUyMnRyYWluJTIyJTNBJTIwJTVCJTIybXlfdGV4dF8xLnR4dCUyMiUyQyUyMCUyMm15X3RleHRfMi50eHQlMjIlNUQlMkMlMjAlMjJ0ZXN0JTIyJTNBJTIwJTIybXlfdGVzdF9maWxlLnR4dCUyMiU3RCklMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRleHQlMjIlMkMlMjBkYXRhX2RpciUzRCUyMnBhdGglMkZ0byUyRnRleHQlMkZkYXRhc2V0JTIyKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span 
class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_files={<span class="hljs-string">"train"</span>: [<span class="hljs-string">"my_text_1.txt"</span>, <span class="hljs-string">"my_text_2.txt"</span>], <span class="hljs-string">"test"</span>: <span class="hljs-string">"my_test_file.txt"</span>}) | |
| <span class="hljs-comment"># Load from a directory</span> | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_dir=<span class="hljs-string">"path/to/text/dataset"</span>)`,wrap:!1}}),T=new Z({props:{code:"ZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0ZXh0JTIyJTJDJTIwZGF0YV9maWxlcyUzRCU3QiUyMnRyYWluJTIyJTNBJTIwJTIybXlfdHJhaW5fZmlsZS50eHQlMjIlMkMlMjAlMjJ0ZXN0JTIyJTNBJTIwJTIybXlfdGVzdF9maWxlLnR4dCUyMiU3RCUyQyUyMHNhbXBsZV9ieSUzRCUyMnBhcmFncmFwaCUyMiklMEElMEFkYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMnRleHQlMjIlMkMlMjBkYXRhX2ZpbGVzJTNEJTdCJTIydHJhaW4lMjIlM0ElMjAlMjJteV90cmFpbl9maWxlLnR4dCUyMiUyQyUyMCUyMnRlc3QlMjIlM0ElMjAlMjJteV90ZXN0X2ZpbGUudHh0JTIyJTdEJTJDJTIwc2FtcGxlX2J5JTNEJTIyZG9jdW1lbnQlMjIp",highlighted:`<span class="hljs-comment"># Sample by paragraph</span> | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_files={<span class="hljs-string">"train"</span>: <span class="hljs-string">"my_train_file.txt"</span>, <span class="hljs-string">"test"</span>: <span class="hljs-string">"my_test_file.txt"</span>}, sample_by=<span class="hljs-string">"paragraph"</span>) | |
| <span class="hljs-comment"># Sample by document</span> | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_files={<span class="hljs-string">"train"</span>: <span class="hljs-string">"my_train_file.txt"</span>, <span class="hljs-string">"test"</span>: <span class="hljs-string">"my_test_file.txt"</span>}, sample_by=<span class="hljs-string">"document"</span>)`,wrap:!1}}),_=new Z({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBYzRfc3Vic2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmFsbGVuYWklMkZjNCUyMiUyQyUyMGRhdGFfZmlsZXMlM0QlMjJlbiUyRmM0LXRyYWluLjAwMDAqLW9mLTAxMDI0Lmpzb24uZ3olMjIp",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>c4_subset = load_dataset(<span class="hljs-string">"allenai/c4"</span>, data_files=<span class="hljs-string">"en/c4-train.0000*-of-01024.json.gz"</span>)`,wrap:!1}}),R=new Z({props:{code:"ZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ0ZXh0JTIyJTJDJTIwZGF0YV9maWxlcyUzRCUyMmh0dHBzJTNBJTJGJTJGaHVnZ2luZ2ZhY2UuY28lMkZkYXRhc2V0cyUyRmhmLWludGVybmFsLXRlc3RpbmclMkZkYXRhc2V0X3dpdGhfZGF0YV9maWxlcyUyRnJlc29sdmUlMkZtYWluJTJGZGF0YSUyRnRyYWluLnR4dCUyMik=",highlighted:'<span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"text"</span>, data_files=<span class="hljs-string">"https://huggingface.co/datasets/hf-internal-testing/dataset_with_data_files/resolve/main/data/train.txt"</span>)',wrap:!1}}),w=new Z({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJ4bWwlMjIlMkMlMjBkYXRhX2ZpbGVzJTNEJTdCJTIydHJhaW4lMjIlM0ElMjAlNUIlMjJteV94bWxfMS54bWwlMjIlMkMlMjAlMjJteV94bWxfMi54bWwlMjIlNUQlMkMlMjAlMjJ0ZXN0JTIyJTNBJTIwJTIybXlfeG1sX2ZpbGUueG1sJTIyJTdEKSUwQSUwQWRhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIyeG1sJTIyJTJDJTIwZGF0YV9kaXIlM0QlMjJwYXRoJTJGdG8lMkZ4bWwlMkZkYXRhc2V0JTIyKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"xml"</span>, data_files={<span class="hljs-string">"train"</span>: [<span class="hljs-string">"my_xml_1.xml"</span>, <span class="hljs-string">"my_xml_2.xml"</span>], <span class="hljs-string">"test"</span>: <span class="hljs-string">"my_xml_file.xml"</span>}) | |
| <span class="hljs-comment"># Load from a directory</span> | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"xml"</span>, data_dir=<span class="hljs-string">"path/to/xml/dataset"</span>)`,wrap:!1}}),U=new ct({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/nlp_load.mdx"}}),{c(){p=o("meta"),G=l(),I=o("p"),X=l(),m(y.$$.fragment),k=l(),m(M.$$.fragment),Y=l(),f=o("p"),f.innerHTML=D,F=l(),g=o("p"),g.textContent=K,V=l(),m(J.$$.fragment),W=l(),j=o("p"),j.innerHTML=O,N=l(),m(T.$$.fragment),v=l(),x=o("p"),x.textContent=tt,z=l(),m(_.$$.fragment),L=l(),q=o("p"),q.textContent=st,Q=l(),m(R.$$.fragment),E=l(),$=o("p"),$.textContent=at,B=l(),m(w.$$.fragment),H=l(),m(U.$$.fragment),S=l(),C=o("p"),this.h()},l(t){const s=dt("svelte-u9bgzb",document.head);p=i(s,"META",{name:!0,content:!0}),s.forEach(a),G=n(t),I=i(t,"P",{}),et(I).forEach(a),X=n(t),d(y.$$.fragment,t),k=n(t),d(M.$$.fragment,t),Y=n(t),f=i(t,"P",{"data-svelte-h":!0}),b(f)!=="svelte-w06s16"&&(f.innerHTML=D),F=n(t),g=i(t,"P",{"data-svelte-h":!0}),b(g)!=="svelte-1nologr"&&(g.textContent=K),V=n(t),d(J.$$.fragment,t),W=n(t),j=i(t,"P",{"data-svelte-h":!0}),b(j)!=="svelte-7z7mls"&&(j.innerHTML=O),N=n(t),d(T.$$.fragment,t),v=n(t),x=i(t,"P",{"data-svelte-h":!0}),b(x)!=="svelte-19wmves"&&(x.textContent=tt),z=n(t),d(_.$$.fragment,t),L=n(t),q=i(t,"P",{"data-svelte-h":!0}),b(q)!=="svelte-129tbg5"&&(q.textContent=st),Q=n(t),d(R.$$.fragment,t),E=n(t),$=i(t,"P",{"data-svelte-h":!0}),b($)!=="svelte-q2d66r"&&($.textContent=at),B=n(t),d(w.$$.fragment,t),H=n(t),d(U.$$.fragment,t),S=n(t),C=i(t,"P",{}),et(C).forEach(a),this.h()},h(){lt(p,"name","hf:doc:metadata"),lt(p,"content",Mt)},m(t,s){rt(document.head,p),e(t,G,s),e(t,I,s),e(t,X,s),r(y,t,s),e(t,k,s),r(M,t,s),e(t,Y,s),e(t,f,s),e(t,F,s),e(t,g,s),e(t,V,s),r(J,t,s),e(t,W,s),e(t,j,s),e(t,N,s),r(T,t,s),e(t,v,s),e(t,x,s),e(t,z,s),r(_,t,s),e(t,L,s),e(t,q,s),e(t,Q,s),r(R,t,s),e(t,E,s),e(t,$,s),e(t,B,s),r(w,t,s),e(t,H,s),r(U,t,s),e(t,S,s),e(t,C,s),P=!0},p:pt,i(t){P||(h(y.$$.fragment,t),h(M.$$.
fragment,t),h(J.$$.fragment,t),h(T.$$.fragment,t),h(_.$$.fragment,t),h(R.$$.fragment,t),h(w.$$.fragment,t),h(U.$$.fragment,t),P=!0)},o(t){u(y.$$.fragment,t),u(M.$$.fragment,t),u(J.$$.fragment,t),u(T.$$.fragment,t),u(_.$$.fragment,t),u(R.$$.fragment,t),u(w.$$.fragment,t),u(U.$$.fragment,t),P=!1},d(t){t&&(a(G),a(I),a(X),a(k),a(Y),a(f),a(F),a(g),a(V),a(W),a(j),a(N),a(v),a(x),a(z),a(L),a(q),a(Q),a(E),a($),a(B),a(H),a(S),a(C)),a(p),c(y,t),c(M,t),c(J,t),c(T,t),c(_,t),c(R,t),c(w,t),c(U,t)}}}const Mt='{"title":"Load text data","local":"load-text-data","sections":[],"depth":1}';function ft(A){return ot(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class xt extends it{constructor(p){super(),mt(this,p,ft,yt,nt,{})}}export{xt as component}; | |
Xet Storage Details
- Size:
- 9.28 kB
- Xet hash:
- 19a9c70fd6fdcf03d9cc946a62ec92567cb8a2d632b2d49d898695b32f2f1cc4
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.