Buckets:
| import{s as vt,n as Ct,o as Mt}from"../chunks/scheduler.d75c11ed.js";import{S as Ht,i as xt,e as i,s as l,c as p,h as kt,a as o,d as a,b as n,f as Qe,g as d,j as c,k as Oe,l as Jt,m as s,n as r,t as h,o as f,p as m}from"../chunks/index.4ec9dfe9.js";import{C as Zt,H as le,E as Rt}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js";import{C as g}from"../chunks/CodeBlock.5919a092.js";function jt(Ke){let u,oe,ne,ce,_,pe,w,de,b,et=`When you download a dataset from Hugging Face, the data are stored locally on your computer. | |
| Files from Hugging Face are stored as usual in the <code>huggingface_hub</code> cache, which is at <code>~/.cache/huggingface/hub</code> by default. | |
| See the <a href="https://huggingface.co/docs/huggingface_hub/guides/manage-cache" rel="nofollow">Hub cache documentation</a> for more details and how to change its location.`,re,y,tt="The Hub cache allows 🤗 Datasets to avoid re-downloading dataset files from Hugging Face every time you use them.",he,T,at='🤗 Datasets also has its own cache to store datasets converted in Arrow format (the format used by <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> objects).',fe,v,st="This guide focuses on the 🤗 Datasets cache and will show you how to:",me,C,lt="<li>Change the cache directory.</li> <li>Control how a dataset is loaded from the cache.</li> <li>Clean up cache files in the directory.</li> <li>Enable or disable caching.</li>",ue,M,ge,H,nt="The default 🤗 Datasets cache directory is <code>~/.cache/huggingface/datasets</code>. Change the cache location by setting the shell environment variable, <code>HF_HOME</code> to another directory:",$e,x,_e,k,it="Alternatively, you can set the <code>HF_DATASETS_CACHE</code> environment variable to control only the datasets-specific cache directory:",we,J,be,Z,ot=`⚠️ This only applies to files written by the <code>datasets</code> library (e.g., Arrow files and indices).<br/> | |
| It does <strong>not</strong> affect files downloaded from the Hugging Face Hub (such as models, tokenizers, or raw dataset sources), which are located in <code>~/.cache/huggingface/hub</code> by default and controlled separately via the <code>HF_HUB_CACHE</code> variable:`,ye,R,Te,j,ct="💡 If you’d like to relocate all Hugging Face caches — including datasets and hub downloads — use the <code>HF_HOME</code> variable instead:",ve,F,Ce,L,pt="This results in:",Me,U,dt="<li>datasets cache → <code>/path/to/cache_root/datasets</code></li> <li>hub cache → <code>/path/to/cache_root/hub</code></li>",He,E,rt=`These distinctions are especially useful when working in shared environments or networked file systems (e.g., NFS).<br/> | |
| See <a href="https://github.com/huggingface/datasets/issues/7480" rel="nofollow">issue #7480</a> for discussion on how users encountered unexpected cache locations when <code>HF_HUB_CACHE</code> was not set alongside <code>HF_DATASETS_CACHE</code>.`,xe,I,ht="When you load a dataset, you also have the option to change where the data is cached. Change the <code>cache_dir</code> parameter to the path you want:",ke,X,Je,Y,Ze,G,ft='After you download a dataset, control how it is loaded by <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> with the <code>download_mode</code> parameter. By default, 🤗 Datasets will reuse a dataset if it exists. But if you need the original dataset without any processing functions applied, re-download the files as shown below:',Re,S,je,z,mt='Refer to <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DownloadMode">DownloadMode</a> for a full list of download modes.',Fe,N,Le,A,ut='Clean up the Arrow cache files in the directory with <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.cleanup_cache_files">Dataset.cleanup_cache_files()</a>:',Ue,D,Ee,P,Ie,W,gt='If you’re using a cached file locally, it will automatically reload the dataset with any previous transforms you applied to the dataset. Disable this behavior by setting the argument <code>load_from_cache_file=False</code> in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">Dataset.map()</a>:',Xe,V,Ye,B,$t="In the example above, 🤗 Datasets will execute the function <code>add_prefix</code> over the entire dataset again instead of loading the dataset from its previous state.",Ge,q,_t='Disable caching on a global scale with <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.disable_caching">disable_caching()</a>:',Se,O,ze,Q,wt="When you disable caching, 🤗 Datasets will no longer reload cached files when applying transforms to datasets. Any transform you apply on your dataset will be need to be reapplied.",Ne,$,bt='<p>If you want to reuse a dataset from scratch, try setting the <code>download_mode</code> parameter in <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> instead.</p>',Ae,se,De,K,Pe,ee,yt="Disabling the cache and copying the dataset in-memory will speed up dataset operations. There are two options for copying the dataset in-memory:",We,te,Tt="<li><p>Set <code>datasets.config.IN_MEMORY_MAX_SIZE</code> to a nonzero value (in bytes) that fits in your RAM memory.</p></li> <li><p>Set the environment variable <code>HF_DATASETS_IN_MEMORY_MAX_SIZE</code> to a nonzero value. Note that the first method takes higher precedence.</p></li>",Ve,ae,Be,ie,qe;return _=new Zt({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),w=new le({props:{title:"Cache management",local:"cache-management",headingTag:"h1"}}),M=new le({props:{title:"Cache directory",local:"cache-directory",headingTag:"h2"}}),x=new g({props:{code:"JTI0JTIwZXhwb3J0JTIwSEZfSE9NRSUzRCUyMiUyRnBhdGglMkZ0byUyRmFub3RoZXIlMkZkaXJlY3RvcnklMkZkYXRhc2V0cyUyMg==",highlighted:'$ <span class="hljs-built_in">export</span> <span class="hljs-attribute">HF_HOME</span>=<span class="hljs-string">"/path/to/another/directory/datasets"</span>',wrap:!1}}),J=new g({props:{code:"JTI0JTIwZXhwb3J0JTIwSEZfREFUQVNFVFNfQ0FDSEUlM0QlMjIlMkZwYXRoJTJGdG8lMkZkYXRhc2V0c19jYWNoZSUyMg==",highlighted:'$ <span class="hljs-built_in">export</span> <span class="hljs-attribute">HF_DATASETS_CACHE</span>=<span class="hljs-string">"/path/to/datasets_cache"</span>',wrap:!1}}),R=new g({props:{code:"JTI0JTIwZXhwb3J0JTIwSEZfSFVCX0NBQ0hFJTNEJTIyJTJGcGF0aCUyRnRvJTJGaHViX2NhY2hlJTIy",highlighted:'$ <span class="hljs-built_in">export</span> <span class="hljs-attribute">HF_HUB_CACHE</span>=<span class="hljs-string">"/path/to/hub_cache"</span>',wrap:!1}}),F=new g({props:{code:"JTI0JTIwZXhwb3J0JTIwSEZfSE9NRSUzRCUyMiUyRnBhdGglMkZ0byUyRmNhY2hlX3Jvb3QlMjI=",highlighted:'$ <span class="hljs-built_in">export</span> <span class="hljs-attribute">HF_HOME</span>=<span class="hljs-string">"/path/to/cache_root"</span>',wrap:!1}}),X=new g({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCgndXNlcm5hbWUlMkZkYXRhc2V0JyUyQyUyMGNhY2hlX2RpciUzRCUyMiUyRnBhdGglMkZ0byUyRmFub3RoZXIlMkZkaXJlY3RvcnklMkZkYXRhc2V0cyUyMik=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">'username/dataset'</span>, cache_dir=<span class="hljs-string">"/path/to/another/directory/datasets"</span>)`,wrap:!1}}),Y=new le({props:{title:"Download mode",local:"download-mode",headingTag:"h2"}}),S=new g({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCgncmFqcHVya2FyJTJGc3F1YWQnJTJDJTIwZG93bmxvYWRfbW9kZSUzRCdmb3JjZV9yZWRvd25sb2FkJyk=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">'rajpurkar/squad'</span>, download_mode=<span class="hljs-string">'force_redownload'</span>)`,wrap:!1}}),N=new le({props:{title:"Cache files",local:"cache-files",headingTag:"h2"}}),D=new g({props:{code:"ZGF0YXNldC5jbGVhbnVwX2NhY2hlX2ZpbGVzKCk=",highlighted:`<span class="hljs-comment"># Returns the number of removed cache files</span> | |
| <span class="hljs-meta">>>> </span>dataset.cleanup_cache_files() | |
| <span class="hljs-number">2</span>`,wrap:!1}}),P=new le({props:{title:"Enable or disable caching",local:"enable-or-disable-caching",headingTag:"h2"}}),V=new g({props:{code:"dXBkYXRlZF9kYXRhc2V0JTIwJTNEJTIwc21hbGxfZGF0YXNldC5tYXAoYWRkX3ByZWZpeCUyQyUyMGxvYWRfZnJvbV9jYWNoZV9maWxlJTNERmFsc2Up",highlighted:'<span class="hljs-meta">>>> </span>updated_dataset = small_dataset.<span class="hljs-built_in">map</span>(add_prefix, load_from_cache_file=<span class="hljs-literal">False</span>)',wrap:!1}}),O=new g({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwZGlzYWJsZV9jYWNoaW5nJTBBZGlzYWJsZV9jYWNoaW5nKCk=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> disable_caching | |
| <span class="hljs-meta">>>> </span>disable_caching()`,wrap:!1}}),K=new le({props:{title:"Improve performance",local:"improve-performance",headingTag:"h2"}}),ae=new Rt({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/cache.mdx"}}),{c(){u=i("meta"),oe=l(),ne=i("p"),ce=l(),p(_.$$.fragment),pe=l(),p(w.$$.fragment),de=l(),b=i("p"),b.innerHTML=et,re=l(),y=i("p"),y.textContent=tt,he=l(),T=i("p"),T.innerHTML=at,fe=l(),v=i("p"),v.textContent=st,me=l(),C=i("ul"),C.innerHTML=lt,ue=l(),p(M.$$.fragment),ge=l(),H=i("p"),H.innerHTML=nt,$e=l(),p(x.$$.fragment),_e=l(),k=i("p"),k.innerHTML=it,we=l(),p(J.$$.fragment),be=l(),Z=i("p"),Z.innerHTML=ot,ye=l(),p(R.$$.fragment),Te=l(),j=i("p"),j.innerHTML=ct,ve=l(),p(F.$$.fragment),Ce=l(),L=i("p"),L.textContent=pt,Me=l(),U=i("ul"),U.innerHTML=dt,He=l(),E=i("p"),E.innerHTML=rt,xe=l(),I=i("p"),I.innerHTML=ht,ke=l(),p(X.$$.fragment),Je=l(),p(Y.$$.fragment),Ze=l(),G=i("p"),G.innerHTML=ft,Re=l(),p(S.$$.fragment),je=l(),z=i("p"),z.innerHTML=mt,Fe=l(),p(N.$$.fragment),Le=l(),A=i("p"),A.innerHTML=ut,Ue=l(),p(D.$$.fragment),Ee=l(),p(P.$$.fragment),Ie=l(),W=i("p"),W.innerHTML=gt,Xe=l(),p(V.$$.fragment),Ye=l(),B=i("p"),B.innerHTML=$t,Ge=l(),q=i("p"),q.innerHTML=_t,Se=l(),p(O.$$.fragment),ze=l(),Q=i("p"),Q.textContent=wt,Ne=l(),$=i("blockquote"),$.innerHTML=bt,Ae=l(),se=i("a"),De=l(),p(K.$$.fragment),Pe=l(),ee=i("p"),ee.textContent=yt,We=l(),te=i("ol"),te.innerHTML=Tt,Ve=l(),p(ae.$$.fragment),Be=l(),ie=i("p"),this.h()},l(e){const t=kt("svelte-u9bgzb",document.head);u=o(t,"META",{name:!0,content:!0}),t.forEach(a),oe=n(e),ne=o(e,"P",{}),Qe(ne).forEach(a),ce=n(e),d(_.$$.fragment,e),pe=n(e),d(w.$$.fragment,e),de=n(e),b=o(e,"P",{"data-svelte-h":!0}),c(b)!=="svelte-iay7o3"&&(b.innerHTML=et),re=n(e),y=o(e,"P",{"data-svelte-h":!0}),c(y)!=="svelte-19z69yp"&&(y.textContent=tt),he=n(e),T=o(e,"P",{"data-svelte-h":!0}),c(T)!=="svelte-2bzb6i"&&(T.innerHTML=at),fe=n(e),v=o(e,"P",{"data-svelte-h":!0}),c(v)!=="svelte-9v8z4"&&(v.textContent=st),me=n(e),C=o(e,"UL",{"data-svelte-h":!0}),c(C)!=="svelte-1pju6bx"&&(C.innerHTML=lt),ue=n(e),d(M.$$.fragment,e),ge=n(e),H=o(e,"P",{"data-svelte-h":!0}),c(H)!=="svelte-106khif"&&(H.innerHTML=nt),$e=n(e),d(x.$$.fragment,e),_e=n(e),k=o(e,"P",{"data-svelte-h":!0}),c(k)!=="svelte-h24suv"&&(k.innerHTML=it),we=n(e),d(J.$$.fragment,e),be=n(e),Z=o(e,"P",{"data-svelte-h":!0}),c(Z)!=="svelte-16exhn5"&&(Z.innerHTML=ot),ye=n(e),d(R.$$.fragment,e),Te=n(e),j=o(e,"P",{"data-svelte-h":!0}),c(j)!=="svelte-968r6o"&&(j.innerHTML=ct),ve=n(e),d(F.$$.fragment,e),Ce=n(e),L=o(e,"P",{"data-svelte-h":!0}),c(L)!=="svelte-3mxhsb"&&(L.textContent=pt),Me=n(e),U=o(e,"UL",{"data-svelte-h":!0}),c(U)!=="svelte-4qd4zy"&&(U.innerHTML=dt),He=n(e),E=o(e,"P",{"data-svelte-h":!0}),c(E)!=="svelte-ydfbpl"&&(E.innerHTML=rt),xe=n(e),I=o(e,"P",{"data-svelte-h":!0}),c(I)!=="svelte-rd1xw8"&&(I.innerHTML=ht),ke=n(e),d(X.$$.fragment,e),Je=n(e),d(Y.$$.fragment,e),Ze=n(e),G=o(e,"P",{"data-svelte-h":!0}),c(G)!=="svelte-1hskw5a"&&(G.innerHTML=ft),Re=n(e),d(S.$$.fragment,e),je=n(e),z=o(e,"P",{"data-svelte-h":!0}),c(z)!=="svelte-16300th"&&(z.innerHTML=mt),Fe=n(e),d(N.$$.fragment,e),Le=n(e),A=o(e,"P",{"data-svelte-h":!0}),c(A)!=="svelte-tlw8sv"&&(A.innerHTML=ut),Ue=n(e),d(D.$$.fragment,e),Ee=n(e),d(P.$$.fragment,e),Ie=n(e),W=o(e,"P",{"data-svelte-h":!0}),c(W)!=="svelte-1lez75y"&&(W.innerHTML=gt),Xe=n(e),d(V.$$.fragment,e),Ye=n(e),B=o(e,"P",{"data-svelte-h":!0}),c(B)!=="svelte-135z99u"&&(B.innerHTML=$t),Ge=n(e),q=o(e,"P",{"data-svelte-h":!0}),c(q)!=="svelte-1gmx89x"&&(q.innerHTML=_t),Se=n(e),d(O.$$.fragment,e),ze=n(e),Q=o(e,"P",{"data-svelte-h":!0}),c(Q)!=="svelte-1cf02ij"&&(Q.textContent=wt),Ne=n(e),$=o(e,"BLOCKQUOTE",{class:!0,"data-svelte-h":!0}),c($)!=="svelte-151690e"&&($.innerHTML=bt),Ae=n(e),se=o(e,"A",{id:!0}),Qe(se).forEach(a),De=n(e),d(K.$$.fragment,e),Pe=n(e),ee=o(e,"P",{"data-svelte-h":!0}),c(ee)!=="svelte-1dk1qbe"&&(ee.textContent=yt),We=n(e),te=o(e,"OL",{"data-svelte-h":!0}),c(te)!=="svelte-1bgoi8z"&&(te.innerHTML=Tt),Ve=n(e),d(ae.$$.fragment,e),Be=n(e),ie=o(e,"P",{}),Qe(ie).forEach(a),this.h()},h(){Oe(u,"name","hf:doc:metadata"),Oe(u,"content",Ft),Oe($,"class","tip"),Oe(se,"id","load_dataset_enhancing_performance")},m(e,t){Jt(document.head,u),s(e,oe,t),s(e,ne,t),s(e,ce,t),r(_,e,t),s(e,pe,t),r(w,e,t),s(e,de,t),s(e,b,t),s(e,re,t),s(e,y,t),s(e,he,t),s(e,T,t),s(e,fe,t),s(e,v,t),s(e,me,t),s(e,C,t),s(e,ue,t),r(M,e,t),s(e,ge,t),s(e,H,t),s(e,$e,t),r(x,e,t),s(e,_e,t),s(e,k,t),s(e,we,t),r(J,e,t),s(e,be,t),s(e,Z,t),s(e,ye,t),r(R,e,t),s(e,Te,t),s(e,j,t),s(e,ve,t),r(F,e,t),s(e,Ce,t),s(e,L,t),s(e,Me,t),s(e,U,t),s(e,He,t),s(e,E,t),s(e,xe,t),s(e,I,t),s(e,ke,t),r(X,e,t),s(e,Je,t),r(Y,e,t),s(e,Ze,t),s(e,G,t),s(e,Re,t),r(S,e,t),s(e,je,t),s(e,z,t),s(e,Fe,t),r(N,e,t),s(e,Le,t),s(e,A,t),s(e,Ue,t),r(D,e,t),s(e,Ee,t),r(P,e,t),s(e,Ie,t),s(e,W,t),s(e,Xe,t),r(V,e,t),s(e,Ye,t),s(e,B,t),s(e,Ge,t),s(e,q,t),s(e,Se,t),r(O,e,t),s(e,ze,t),s(e,Q,t),s(e,Ne,t),s(e,$,t),s(e,Ae,t),s(e,se,t),s(e,De,t),r(K,e,t),s(e,Pe,t),s(e,ee,t),s(e,We,t),s(e,te,t),s(e,Ve,t),r(ae,e,t),s(e,Be,t),s(e,ie,t),qe=!0},p:Ct,i(e){qe||(h(_.$$.fragment,e),h(w.$$.fragment,e),h(M.$$.fragment,e),h(x.$$.fragment,e),h(J.$$.fragment,e),h(R.$$.fragment,e),h(F.$$.fragment,e),h(X.$$.fragment,e),h(Y.$$.fragment,e),h(S.$$.fragment,e),h(N.$$.fragment,e),h(D.$$.fragment,e),h(P.$$.fragment,e),h(V.$$.fragment,e),h(O.$$.fragment,e),h(K.$$.fragment,e),h(ae.$$.fragment,e),qe=!0)},o(e){f(_.$$.fragment,e),f(w.$$.fragment,e),f(M.$$.fragment,e),f(x.$$.fragment,e),f(J.$$.fragment,e),f(R.$$.fragment,e),f(F.$$.fragment,e),f(X.$$.fragment,e),f(Y.$$.fragment,e),f(S.$$.fragment,e),f(N.$$.fragment,e),f(D.$$.fragment,e),f(P.$$.fragment,e),f(V.$$.fragment,e),f(O.$$.fragment,e),f(K.$$.fragment,e),f(ae.$$.fragment,e),qe=!1},d(e){e&&(a(oe),a(ne),a(ce),a(pe),a(de),a(b),a(re),a(y),a(he),a(T),a(fe),a(v),a(me),a(C),a(ue),a(ge),a(H),a($e),a(_e),a(k),a(we),a(be),a(Z),a(ye),a(Te),a(j),a(ve),a(Ce),a(L),a(Me),a(U),a(He),a(E),a(xe),a(I),a(ke),a(Je),a(Ze),a(G),a(Re),a(je),a(z),a(Fe),a(Le),a(A),a(Ue),a(Ee),a(Ie),a(W),a(Xe),a(Ye),a(B),a(Ge),a(q),a(Se),a(ze),a(Q),a(Ne),a($),a(Ae),a(se),a(De),a(Pe),a(ee),a(We),a(te),a(Ve),a(Be),a(ie)),a(u),m(_,e),m(w,e),m(M,e),m(x,e),m(J,e),m(R,e),m(F,e),m(X,e),m(Y,e),m(S,e),m(N,e),m(D,e),m(P,e),m(V,e),m(O,e),m(K,e),m(ae,e)}}}const Ft='{"title":"Cache management","local":"cache-management","sections":[{"title":"Cache directory","local":"cache-directory","sections":[],"depth":2},{"title":"Download mode","local":"download-mode","sections":[],"depth":2},{"title":"Cache files","local":"cache-files","sections":[],"depth":2},{"title":"Enable or disable caching","local":"enable-or-disable-caching","sections":[],"depth":2},{"title":"Improve performance","local":"improve-performance","sections":[],"depth":2}],"depth":1}';function Lt(Ke){return Mt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Yt extends Ht{constructor(u){super(),xt(this,u,Lt,jt,vt,{})}}export{Yt as component}; | |
Xet Storage Details
- Size:
- 16.4 kB
- Xet hash:
- c964de92d62332854f91eea00bc72ec1eeb386c1363602e2e8c0ef2cb6c0d323
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.