Buckets:
/**
 * Compiled Svelte page module for the "Cloud storage" documentation page
 * (generated from docs/source/filesystems.mdx, see the `source` prop below).
 *
 * This is build output: identifiers are minified and the `svelte-*` hashes
 * must stay in sync with the server-rendered HTML, so page text is kept
 * verbatim. Only formatting and capture damage were repaired:
 *   - table-cell delimiters that had been wrapped around every line,
 *   - a hard wrap that split the "data-svelte-h" string literal,
 *   - collapsed whitespace in the highlighted Python snippets (restored from
 *     the URL-encoded, base64 `code` props, which still carry the spacing).
 */
import { s as It, n as bt, o as Ut } from "../chunks/scheduler.d75c11ed.js";
import {
  S as $t,
  i as _t,
  e as o,
  s as l,
  c as d,
  h as kt,
  a as p,
  d as a,
  b as n,
  f as jt,
  g as r,
  j as i,
  k as Ct,
  l as vt,
  m as s,
  n as f,
  t as h,
  o as u,
  p as g,
} from "../chunks/index.4ec9dfe9.js";
import {
  C as xt,
  H as pt,
  E as Gt,
} from "../chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js";
import { C as it } from "../chunks/CodeBlock.5919a092.js";

/**
 * Fragment factory for the page.
 *
 * Builds the static text, instantiates the child components (copy button
 * container, headings, code blocks, edit link) and returns the fragment
 * object. The method names follow the compiled-Svelte fragment convention
 * (c = create, l = claim existing DOM, h = set attributes, m = mount,
 * p = update, i = intro, o = outro, d = destroy). The imported helpers are
 * minified, so their exact roles are inferred from usage here; confirm
 * against ../chunks/index if it matters.
 *
 * @param {unknown} mt Unused by this fragment.
 * @returns {object} Fragment with c/l/h/m/p/i/o/d members.
 */
function Zt(mt) {
  // Static page copy. The hashes checked in `l` correspond to these strings.
  const dt =
    "The Hugging Face Dataset Hub is home to a growing collection of datasets that span a variety of domains and tasks.";
  const rt =
    "It’s more than a cloud storage: the Dataset Hub is a platform that provides data versioning thanks to git, as well as a Dataset Viewer to explore the data, making it a great place to store AI-ready datasets.";
  const ft =
    "This guide shows how to import data from other cloud storage using the filesystems implementations from <code>fsspec</code>.";
  const ht =
    "Most cloud storage providers have a <code>fsspec</code> FileSystem implementation, which is useful to import data from any cloud provider with the same code.\nThis is especially useful to publish datasets on Hugging Face.";
  const ut =
    "Take a look at the following table for some example of supported cloud storage providers:";
  const gt =
    '<thead><tr><th>Storage provider</th> <th>Filesystem implementation</th></tr></thead> <tbody><tr><td>Amazon S3</td> <td><a href="https://s3fs.readthedocs.io/en/latest/" rel="nofollow">s3fs</a></td></tr> <tr><td>Google Cloud Storage</td> <td><a href="https://gcsfs.readthedocs.io/en/latest/" rel="nofollow">gcsfs</a></td></tr> <tr><td>Azure Blob/DataLake</td> <td><a href="https://github.com/fsspec/adlfs" rel="nofollow">adlfs</a></td></tr> <tr><td>Oracle Cloud Storage</td> <td><a href="https://ocifs.readthedocs.io/en/latest/" rel="nofollow">ocifs</a></td></tr></tbody>';
  const ct =
    "This guide will show you how to import data files from any cloud storage and save a dataset on Hugging Face.";
  const yt =
    "Let’s say we want to publish a dataset on Hugging Face from Parquet files from a cloud storage.";
  const wt =
    "First, instantiate your cloud storage filesystem and list the files you’d like to import:";
  const Mt =
    "Then you can create a dataset on Hugging Face and import the data files, using for example:";
  const Jt =
    'Check out the <a href="https://huggingface.co/docs/huggingface_hub" rel="nofollow">huggingface_hub</a> documentation on files uploads <a href="https://huggingface.co/docs/huggingface_hub/en/guides/upload" rel="nofollow">here</a> if you’re looking for more upload options.';
  const Tt = "Finally you can now load the dataset using 🤗 Datasets:";

  // Pre-highlighted HTML for the three code blocks. One array entry per
  // rendered line so that the Python indentation is explicit.
  const listFilesHighlighted = [
    '<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> fsspec',
    '<span class="hljs-meta">>>> </span>fs = fsspec.filesystem(<span class="hljs-string">"..."</span>)  <span class="hljs-comment"># s3 / gcs / abfs / adl / oci / ...</span>',
    '<span class="hljs-meta">>>> </span>data_dir = <span class="hljs-string">"path/to/my/data/"</span>',
    '<span class="hljs-meta">>>> </span>pattern = <span class="hljs-string">"*.parquet"</span>',
    '<span class="hljs-meta">>>> </span>data_files = fs.glob(data_dir + pattern)',
    '[<span class="hljs-string">"path/to/my/data/0001.parquet"</span>, <span class="hljs-string">"path/to/my/data/0001.parquet"</span>, ...]',
  ].join("\n");
  const uploadHighlighted = [
    '<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> create_repo, upload_file',
    '<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> tqdm.auto <span class="hljs-keyword">import</span> tqdm',
    '<span class="hljs-meta">>>> </span>destination_dataset = <span class="hljs-string">"username/my-dataset"</span>',
    '<span class="hljs-meta">>>> </span>create_repo(destination_dataset, repo_type=<span class="hljs-string">"dataset"</span>)',
    '<span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> data_file <span class="hljs-keyword">in</span> tqdm(fs.glob(data_dir + pattern)):',
    '<span class="hljs-meta">... </span>    <span class="hljs-keyword">with</span> fs.<span class="hljs-built_in">open</span>(data_file) <span class="hljs-keyword">as</span> fileobj:',
    '<span class="hljs-meta">... </span>        path_in_repo = data_file[<span class="hljs-built_in">len</span>(data_dir):]',
    '<span class="hljs-meta">... </span>        upload_file(',
    '<span class="hljs-meta">... </span>            path_or_fileobj=fileobj,',
    '<span class="hljs-meta">... </span>            path_in_repo=path_in_repo,',
    '<span class="hljs-meta">... </span>            repo_id=destination_dataset,',
    '<span class="hljs-meta">... </span>            repo_type=<span class="hljs-string">"dataset"</span>,',
    '<span class="hljs-meta">... </span>        )',
  ].join("\n");
  const loadHighlighted = [
    '<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset',
    '<span class="hljs-meta">>>> </span>ds = load_dataset(<span class="hljs-string">"username/my-dataset"</span>)',
  ].join("\n");

  // DOM nodes, assigned in `c` (fresh render) or `l` (claiming existing DOM).
  let m, q, F, H, Y, S, V, M, A, J, Q, T, W, E, C, z, I, L, b, P, U, N, $, D;
  let _, K, O, v, tt, et, G, at, Z, st, lt, nt, R;
  // Guards `i` so the child intro calls run once until `o` resets it.
  let ot = false;

  // Child components, instantiated in document order.
  const c = new xt({
    props: {
      containerStyle:
        "float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;",
    },
  });
  const y = new pt({
    props: { title: "Cloud storage", local: "cloud-storage", headingTag: "h1" },
  });
  const w = new pt({
    props: {
      title: "Hugging Face Datasets",
      local: "hugging-face-datasets",
      headingTag: "h2",
    },
  });
  const j = new pt({
    props: {
      title: "Import data from a cloud storage",
      local: "import-data-from-a-cloud-storage",
      headingTag: "h2",
    },
  });
  const k = new it({
    props: {
      code: "aW1wb3J0JTIwZnNzcGVjJTBBZnMlMjAlM0QlMjBmc3NwZWMuZmlsZXN5c3RlbSglMjIuLi4lMjIpJTIwJTIwJTIzJTIwczMlMjAlMkYlMjBnY3MlMjAlMkYlMjBhYmZzJTIwJTJGJTIwYWRsJTIwJTJGJTIwb2NpJTIwJTJGJTIwLi4uJTBBZGF0YV9kaXIlMjAlM0QlMjAlMjJwYXRoJTJGdG8lMkZteSUyRmRhdGElMkYlMjIlMEFwYXR0ZXJuJTIwJTNEJTIwJTIyKi5wYXJxdWV0JTIyJTBBZGF0YV9maWxlcyUyMCUzRCUyMGZzLmdsb2IoZGF0YV9kaXIlMjAlMkIlMjBwYXR0ZXJuKQ==",
      highlighted: listFilesHighlighted,
      wrap: false,
    },
  });
  const x = new it({
    props: {
      code: "ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMGNyZWF0ZV9yZXBvJTJDJTIwdXBsb2FkX2ZpbGUlMEFmcm9tJTIwdHFkbS5hdXRvJTIwaW1wb3J0JTIwdHFkbSUwQWRlc3RpbmF0aW9uX2RhdGFzZXQlMjAlM0QlMjAlMjJ1c2VybmFtZSUyRm15LWRhdGFzZXQlMjIlMEFjcmVhdGVfcmVwbyhkZXN0aW5hdGlvbl9kYXRhc2V0JTJDJTIwcmVwb190eXBlJTNEJTIyZGF0YXNldCUyMiklMEFmb3IlMjBkYXRhX2ZpbGUlMjBpbiUyMHRxZG0oZnMuZ2xvYihkYXRhX2RpciUyMCUyQiUyMHBhdHRlcm4pKSUzQSUwQSUyMCUyMCUyMCUyMHdpdGglMjBmcy5vcGVuKGRhdGFfZmlsZSklMjBhcyUyMGZpbGVvYmolM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBwYXRoX2luX3JlcG8lMjAlM0QlMjBkYXRhX2ZpbGUlNUJsZW4oZGF0YV9kaXIpJTNBJTVEJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdXBsb2FkX2ZpbGUoJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcGF0aF9vcl9maWxlb2JqJTNEZmlsZW9iaiUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHBhdGhfaW5fcmVwbyUzRHBhdGhfaW5fcmVwbyUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJlcG9faWQlM0RkZXN0aW5hdGlvbl9kYXRhc2V0JTJDJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmVwb190eXBlJTNEJTIyZGF0YXNldCUyMiUyQyUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCk=",
      highlighted: uploadHighlighted,
      wrap: false,
    },
  });
  const X = new it({
    props: {
      code: "ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZHMlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIydXNlcm5hbWUlMkZteS1kYXRhc2V0JTIyKQ==",
      highlighted: loadHighlighted,
      wrap: false,
    },
  });
  const B = new Gt({
    props: {
      source:
        "https://github.com/huggingface/datasets/blob/main/docs/source/filesystems.mdx",
    },
  });

  return {
    // Create: build every node and child fragment from scratch.
    c() {
      m = o("meta");
      q = l();
      F = o("p");
      H = l();
      d(c.$$.fragment);
      Y = l();
      d(y.$$.fragment);
      S = l();
      d(w.$$.fragment);
      V = l();
      M = o("p");
      M.textContent = dt;
      A = l();
      J = o("p");
      J.textContent = rt;
      Q = l();
      T = o("p");
      T.innerHTML = ft;
      W = l();
      d(j.$$.fragment);
      E = l();
      C = o("p");
      C.innerHTML = ht;
      z = l();
      I = o("p");
      I.textContent = ut;
      L = l();
      b = o("table");
      b.innerHTML = gt;
      P = l();
      U = o("p");
      U.textContent = ct;
      N = l();
      $ = o("p");
      $.textContent = yt;
      D = l();
      _ = o("p");
      _.textContent = wt;
      K = l();
      d(k.$$.fragment);
      O = l();
      v = o("p");
      v.textContent = Mt;
      tt = l();
      d(x.$$.fragment);
      et = l();
      G = o("p");
      G.innerHTML = Jt;
      at = l();
      Z = o("p");
      Z.textContent = Tt;
      st = l();
      d(X.$$.fragment);
      lt = l();
      d(B.$$.fragment);
      nt = l();
      R = o("p");
      this.h();
    },

    // Claim: adopt nodes from `t`. Content is only rewritten when the node's
    // hash (as read by `i`) differs from the hash compiled into this bundle.
    l(t) {
      const e = kt("svelte-u9bgzb", document.head);
      m = p(e, "META", { name: true, content: true });
      e.forEach(a);
      q = n(t);
      F = p(t, "P", {});
      jt(F).forEach(a);
      H = n(t);
      r(c.$$.fragment, t);
      Y = n(t);
      r(y.$$.fragment, t);
      S = n(t);
      r(w.$$.fragment, t);
      V = n(t);
      M = p(t, "P", { "data-svelte-h": true });
      if (i(M) !== "svelte-pwg4if") M.textContent = dt;
      A = n(t);
      J = p(t, "P", { "data-svelte-h": true });
      if (i(J) !== "svelte-le6063") J.textContent = rt;
      Q = n(t);
      T = p(t, "P", { "data-svelte-h": true });
      if (i(T) !== "svelte-ei4lhv") T.innerHTML = ft;
      W = n(t);
      r(j.$$.fragment, t);
      E = n(t);
      C = p(t, "P", { "data-svelte-h": true });
      if (i(C) !== "svelte-1uadmic") C.innerHTML = ht;
      z = n(t);
      I = p(t, "P", { "data-svelte-h": true });
      if (i(I) !== "svelte-1enr8yq") I.textContent = ut;
      L = n(t);
      b = p(t, "TABLE", { "data-svelte-h": true });
      if (i(b) !== "svelte-40f5jy") b.innerHTML = gt;
      P = n(t);
      U = p(t, "P", { "data-svelte-h": true });
      if (i(U) !== "svelte-tdsgr3") U.textContent = ct;
      N = n(t);
      $ = p(t, "P", { "data-svelte-h": true });
      if (i($) !== "svelte-19vswnu") $.textContent = yt;
      D = n(t);
      _ = p(t, "P", { "data-svelte-h": true });
      if (i(_) !== "svelte-1fbzvas") _.textContent = wt;
      K = n(t);
      r(k.$$.fragment, t);
      O = n(t);
      v = p(t, "P", { "data-svelte-h": true });
      if (i(v) !== "svelte-br8k8v") v.textContent = Mt;
      tt = n(t);
      r(x.$$.fragment, t);
      et = n(t);
      G = p(t, "P", { "data-svelte-h": true });
      if (i(G) !== "svelte-drsgq7") G.innerHTML = Jt;
      at = n(t);
      Z = p(t, "P", { "data-svelte-h": true });
      if (i(Z) !== "svelte-5gahax") Z.textContent = Tt;
      st = n(t);
      r(X.$$.fragment, t);
      lt = n(t);
      r(B.$$.fragment, t);
      nt = n(t);
      R = p(t, "P", {});
      jt(R).forEach(a);
      this.h();
    },

    // Attributes of the <meta> node; `Xt` is the module-level TOC JSON below.
    h() {
      Ct(m, "name", "hf:doc:metadata");
      Ct(m, "content", Xt);
    },

    // Mount: <meta> goes into document.head, everything else into target `t`
    // (with `e` forwarded as the third argument of each insert call).
    m(t, e) {
      vt(document.head, m);
      s(t, q, e);
      s(t, F, e);
      s(t, H, e);
      f(c, t, e);
      s(t, Y, e);
      f(y, t, e);
      s(t, S, e);
      f(w, t, e);
      s(t, V, e);
      s(t, M, e);
      s(t, A, e);
      s(t, J, e);
      s(t, Q, e);
      s(t, T, e);
      s(t, W, e);
      f(j, t, e);
      s(t, E, e);
      s(t, C, e);
      s(t, z, e);
      s(t, I, e);
      s(t, L, e);
      s(t, b, e);
      s(t, P, e);
      s(t, U, e);
      s(t, N, e);
      s(t, $, e);
      s(t, D, e);
      s(t, _, e);
      s(t, K, e);
      f(k, t, e);
      s(t, O, e);
      s(t, v, e);
      s(t, tt, e);
      f(x, t, e);
      s(t, et, e);
      s(t, G, e);
      s(t, at, e);
      s(t, Z, e);
      s(t, st, e);
      f(X, t, e);
      s(t, lt, e);
      f(B, t, e);
      s(t, nt, e);
      s(t, R, e);
      ot = true;
    },

    // Update is delegated to the imported `bt`; the page has no reactive state.
    p: bt,

    // Intro: forwarded to each child once, until `o` clears the guard.
    i(t) {
      if (ot) return;
      h(c.$$.fragment, t);
      h(y.$$.fragment, t);
      h(w.$$.fragment, t);
      h(j.$$.fragment, t);
      h(k.$$.fragment, t);
      h(x.$$.fragment, t);
      h(X.$$.fragment, t);
      h(B.$$.fragment, t);
      ot = true;
    },

    // Outro: forwarded to each child, then the intro guard is reset.
    o(t) {
      u(c.$$.fragment, t);
      u(y.$$.fragment, t);
      u(w.$$.fragment, t);
      u(j.$$.fragment, t);
      u(k.$$.fragment, t);
      u(x.$$.fragment, t);
      u(X.$$.fragment, t);
      u(B.$$.fragment, t);
      ot = false;
    },

    // Destroy: body nodes are removed only when `t` is truthy; the <meta>
    // node and the child components are always torn down.
    d(t) {
      if (t) {
        a(q);
        a(F);
        a(H);
        a(Y);
        a(S);
        a(V);
        a(M);
        a(A);
        a(J);
        a(Q);
        a(T);
        a(W);
        a(E);
        a(C);
        a(z);
        a(I);
        a(L);
        a(b);
        a(P);
        a(U);
        a(N);
        a($);
        a(D);
        a(_);
        a(K);
        a(O);
        a(v);
        a(tt);
        a(et);
        a(G);
        a(at);
        a(Z);
        a(st);
        a(lt);
        a(nt);
        a(R);
      }
      a(m);
      g(c, t);
      g(y, t);
      g(w, t);
      g(j, t);
      g(k, t);
      g(x, t);
      g(X, t);
      g(B, t);
    },
  };
}

// Table-of-contents metadata, serialised as JSON and written to the
// `hf:doc:metadata` <meta> tag by the fragment's `h` method.
const Xt =
  '{"title":"Cloud storage","local":"cloud-storage","sections":[{"title":"Hugging Face Datasets","local":"hugging-face-datasets","sections":[],"depth":2},{"title":"Import data from a cloud storage","local":"import-data-from-a-cloud-storage","sections":[],"depth":2}],"depth":1}';

/**
 * Instance function for the component.
 *
 * Registers a callback through the imported `Ut` that reads the `fw` query
 * parameter from the current URL and discards the value, then returns an
 * empty context array.
 *
 * @param {unknown} mt Unused.
 * @returns {Array} Always an empty array.
 */
function Bt(mt) {
  Ut(() => {
    new URLSearchParams(window.location.search).get("fw");
  });
  return [];
}

/** Page component: wires the instance function and fragment factory into the base class. */
class Yt extends $t {
  constructor(m) {
    super();
    _t(this, m, Bt, Zt, It, {});
  }
}

export { Yt as component };
Xet Storage Details
- Size:
- 10.9 kB
- Xet hash:
- e6b6eacb3299a59c22b622fed56b8315687f5beca298bef157e9c05f6cf993ba
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.