Buckets:

rtrm's picture
download
raw
9.23 kB
// Compiled (minified) page module for the documentation page "Datasets 🤝 Arrow".
// The single-letter helpers imported below are minified aliases defined in
// ../chunks/*.js; their implementations are not visible from this file.
// NOTE(review): they look like Svelte's internal runtime helpers (element
// creation, hydration "claim" functions, mount/detach, transitions) — confirm
// against the chunk sources before relying on the per-helper notes below.
import{s as is,n as ps,o as os}from"../chunks/scheduler.d75c11ed.js";
import{S as ms,i as rs,e as p,s as l,c as m,h as cs,a as o,d as a,b as n,f as es,g as r,j as _,k as ls,l as us,m as e,n as c,t as u,o as f,p as h}from"../chunks/index.4ec9dfe9.js";
import{C as fs,H as Y,E as hs}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js";
import{C as ns}from"../chunks/CodeBlock.5919a092.js";

/**
 * Fragment factory for the page. Instantiates the child components and returns
 * an object exposing the hooks `c`, `l`, `h`, `m`, `p`, `i`, `o` and `d`; it is
 * passed to `rs(...)` by the `Ts` constructor at the bottom of this module.
 *
 * @param {*} L - not referenced anywhere in this function body.
 * @returns {object} the fragment object described above.
 */
function ds(L){
// Node handles (i, Z, v, x, ...), child component instances (d, w, M, j, J, U,
// I, C) and the static page copy (K, D, O, ss, ts, as). `V` is the flag that
// `i`/`o` below use to avoid repeating the intro calls.
// The template literals contain literal newlines, so their continuation lines
// must stay at column 0.
let i,Z,v,x,d,R,w,W,M,A,g,
K='<a href="https://arrow.apache.org/" rel="nofollow">Arrow</a> enables large amounts of data to be processed and moved quickly. It is a specific data format that stores data in a columnar memory layout. This provides several significant advantages:',
z,y,
D='<li>Arrow’s standard format allows <a href="https://en.wikipedia.org/wiki/Zero-copy" rel="nofollow">zero-copy reads</a> which removes virtually all serialization overhead.</li> <li>Arrow is language-agnostic so it supports different programming languages.</li> <li>Arrow is column-oriented so it is faster at querying and processing slices or columns of data.</li> <li>Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow.</li> <li>Arrow supports many, possibly nested, column types.</li>',
F,j,X,b,
O=`🤗 Datasets uses Arrow for its local caching system. It allows datasets to be backed by an on-disk cache, which is memory-mapped for fast lookup.
This architecture allows for large datasets to be used on machines with relatively small device memory.`,
Q,T,
ss="For example, loading the full English Wikipedia dataset only takes a few MB of RAM:",
S,J,G,$,
ts=`This is possible because the Arrow data is actually memory-mapped from disk, and not loaded in memory.
Memory-mapping allows access to data on disk, and leverages virtual memory capabilities for fast lookups.`,
q,U,E,k,
as="Iterating over a memory-mapped dataset using Arrow is fast. Iterating over Wikipedia on a laptop gives you speeds of 1-3 Gbit/s:",
N,I,P,C,H,B,V;

// Child components, created in document order. The original expressed these as
// one comma sequence after `return`; they are plain statements here so that no
// line break can ever end up directly after `return`.
d=new fs({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}});
w=new Y({props:{title:"Datasets 🤝 Arrow",local:"datasets--arrow",headingTag:"h1"}});
M=new Y({props:{title:"What is Arrow?",local:"what-is-arrow",headingTag:"h2"}});
j=new Y({props:{title:"Memory-mapping",local:"memory-mapping",headingTag:"h2"}});
// `code` is an opaque encoded copy of the snippet; `highlighted` is its
// pre-rendered HTML. Both are data and are reproduced verbatim.
J=new ns({props:{code:"aW1wb3J0JTIwb3MlM0IlMjBpbXBvcnQlMjBwc3V0aWwlM0IlMjBpbXBvcnQlMjB0aW1laXQlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEElMEFtZW1fYmVmb3JlJTIwJTNEJTIwcHN1dGlsLlByb2Nlc3Mob3MuZ2V0cGlkKCkpLm1lbW9yeV9pbmZvKCkucnNzJTIwJTJGJTIwKDEwMjQlMjAqJTIwMTAyNCklMEF3aWtpJTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMndpa2ltZWRpYSUyRndpa2lwZWRpYSUyMiUyQyUyMCUyMjIwMjIwMzAxLmVuJTIyJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEFtZW1fYWZ0ZXIlMjAlM0QlMjBwc3V0aWwuUHJvY2Vzcyhvcy5nZXRwaWQoKSkubWVtb3J5X2luZm8oKS5yc3MlMjAlMkYlMjAoMTAyNCUyMColMjAxMDI0KSUwQSUwQXByaW50KGYlMjJSQU0lMjBtZW1vcnklMjB1c2VkJTNBJTIwJTdCKG1lbV9hZnRlciUyMC0lMjBtZW1fYmVmb3JlKSU3RCUyME1CJTIyKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> os; <span class="hljs-keyword">import</span> psutil; <span class="hljs-keyword">import</span> timeit
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-comment"># Process.memory_info is expressed in bytes, so convert to megabytes </span>
<span class="hljs-meta">&gt;&gt;&gt; </span>mem_before = psutil.Process(os.getpid()).memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>wiki = load_dataset(<span class="hljs-string">&quot;wikimedia/wikipedia&quot;</span>, <span class="hljs-string">&quot;20220301.en&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>mem_after = psutil.Process(os.getpid()).memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;RAM memory used: <span class="hljs-subst">{(mem_after - mem_before)}</span> MB&quot;</span>)
RAM memory used: <span class="hljs-number">50</span> MB`,wrap:!1}});
U=new Y({props:{title:"Performance",local:"performance",headingTag:"h2"}});
I=new ns({props:{code:"cyUyMCUzRCUyMCUyMiUyMiUyMmJhdGNoX3NpemUlMjAlM0QlMjAxMDAwJTBBZm9yJTIwYmF0Y2glMjBpbiUyMHdpa2kuaXRlcihiYXRjaF9zaXplKSUzQSUwQSUyMCUyMCUyMCUyMC4uLiUwQSUyMiUyMiUyMiUwQSUwQWVsYXBzZWRfdGltZSUyMCUzRCUyMHRpbWVpdC50aW1laXQoc3RtdCUzRHMlMkMlMjBudW1iZXIlM0QxJTJDJTIwZ2xvYmFscyUzRGdsb2JhbHMoKSklMEFwcmludChmJTIyVGltZSUyMHRvJTIwaXRlcmF0ZSUyMG92ZXIlMjB0aGUlMjAlN0J3aWtpLmRhdGFzZXRfc2l6ZSUyMCUzRSUzRSUyMDMwJTdEJTIwR0IlMjBkYXRhc2V0JTNBJTIwJTdCZWxhcHNlZF90aW1lJTNBLjFmJTdEJTIwc2VjJTJDJTIwJTIyJTBBJTIwJTIwJTIwJTIwJTIwJTIwZiUyMmllLiUyMCU3QmZsb2F0KHdpa2kuZGF0YXNldF9zaXplJTIwJTNFJTNFJTIwMjcpJTJGZWxhcHNlZF90aW1lJTNBLjFmJTdEJTIwR2IlMkZzJTIyKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>s = <span class="hljs-string">&quot;&quot;&quot;batch_size = 1000
<span class="hljs-meta">... </span>for batch in wiki.iter(batch_size):
<span class="hljs-meta">... </span> ...
<span class="hljs-meta">... </span>&quot;&quot;&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>elapsed_time = timeit.timeit(stmt=s, number=<span class="hljs-number">1</span>, <span class="hljs-built_in">globals</span>=<span class="hljs-built_in">globals</span>())
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time to iterate over the <span class="hljs-subst">{wiki.dataset_size &gt;&gt; <span class="hljs-number">30</span>}</span> GB dataset: <span class="hljs-subst">{elapsed_time:<span class="hljs-number">.1</span>f}</span> sec, &quot;</span>
<span class="hljs-meta">... </span> <span class="hljs-string">f&quot;ie. <span class="hljs-subst">{<span class="hljs-built_in">float</span>(wiki.dataset_size &gt;&gt; <span class="hljs-number">27</span>)/elapsed_time:<span class="hljs-number">.1</span>f}</span> Gb/s&quot;</span>)
Time to iterate over the <span class="hljs-number">18</span> GB dataset: <span class="hljs-number">31.8</span> sec, ie. <span class="hljs-number">4.8</span> Gb/s`,wrap:!1}});
C=new hs({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/about_arrow.md"}});

// Child components in creation order; `i` and `o` below walk this same list.
const children=[d,w,M,j,J,U,I,C];

return{
// c: builds every node from scratch and fills in the static copy, then calls h().
c(){
i=p("meta");Z=l();
v=p("p");x=l();
m(d.$$.fragment);R=l();
m(w.$$.fragment);W=l();
m(M.$$.fragment);A=l();
g=p("p");g.innerHTML=K;z=l();
y=p("ul");y.innerHTML=D;F=l();
m(j.$$.fragment);X=l();
b=p("p");b.textContent=O;Q=l();
T=p("p");T.textContent=ss;S=l();
m(J.$$.fragment);G=l();
$=p("p");$.textContent=ts;q=l();
m(U.$$.fragment);E=l();
k=p("p");k.textContent=as;N=l();
m(I.$$.fragment);P=l();
m(C.$$.fragment);H=l();
B=p("p");
this.h();
},
// l: same node sequence as c(), but obtained from the node list `s` via the
// o/n/r helpers (NOTE(review): presumably the hydration path — confirm). Static
// content is only rewritten when the value returned by `_` differs from the
// expected "svelte-…" marker.
l(s){
const t=cs("svelte-u9bgzb",document.head);
i=o(t,"META",{name:!0,content:!0});
t.forEach(a);
Z=n(s);
v=o(s,"P",{});es(v).forEach(a);x=n(s);
r(d.$$.fragment,s);R=n(s);
r(w.$$.fragment,s);W=n(s);
r(M.$$.fragment,s);A=n(s);
g=o(s,"P",{"data-svelte-h":!0});
if(_(g)!=="svelte-52goft"){g.innerHTML=K;}
z=n(s);
y=o(s,"UL",{"data-svelte-h":!0});
if(_(y)!=="svelte-6kbcii"){y.innerHTML=D;}
F=n(s);
r(j.$$.fragment,s);X=n(s);
b=o(s,"P",{"data-svelte-h":!0});
if(_(b)!=="svelte-xad2nu"){b.textContent=O;}
Q=n(s);
T=o(s,"P",{"data-svelte-h":!0});
if(_(T)!=="svelte-k9gzax"){T.textContent=ss;}
S=n(s);
r(J.$$.fragment,s);G=n(s);
$=o(s,"P",{"data-svelte-h":!0});
if(_($)!=="svelte-kx9jy5"){$.textContent=ts;}
q=n(s);
r(U.$$.fragment,s);E=n(s);
k=o(s,"P",{"data-svelte-h":!0});
if(_(k)!=="svelte-1telpcz"){k.textContent=as;}
N=n(s);
r(I.$$.fragment,s);P=n(s);
r(C.$$.fragment,s);H=n(s);
B=o(s,"P",{});es(B).forEach(a);
this.h();
},
// h: sets the <meta> node's name/content; the content is the JSON string `ws`
// declared after this function.
h(){
ls(i,"name","hf:doc:metadata");
ls(i,"content",ws);
},
// m: puts the <meta> node into document.head and every other node/component
// into target `s` at anchor `t`, in document order, then sets V.
m(s,t){
us(document.head,i);
e(s,Z,t);e(s,v,t);e(s,x,t);
c(d,s,t);e(s,R,t);
c(w,s,t);e(s,W,t);
c(M,s,t);e(s,A,t);
e(s,g,t);e(s,z,t);
e(s,y,t);e(s,F,t);
c(j,s,t);e(s,X,t);
e(s,b,t);e(s,Q,t);
e(s,T,t);e(s,S,t);
c(J,s,t);e(s,G,t);
e(s,$,t);e(s,q,t);
c(U,s,t);e(s,E,t);
e(s,k,t);e(s,N,t);
c(I,s,t);e(s,P,t);
c(C,s,t);e(s,H,t);
e(s,B,t);
V=!0;
},
// p: update hook is the imported `ps`; this page passes no props (see `Ts`).
p:ps,
// i: runs `u` on every child fragment once, guarded by V.
i(s){
if(V){return;}
for(const child of children){u(child.$$.fragment,s);}
V=!0;
},
// o: runs `f` on every child fragment and clears V.
o(s){
for(const child of children){f(child.$$.fragment,s);}
V=!1;
},
// d: when `s` is truthy removes the body-level nodes; always removes the
// <meta> node and tears down the child components.
d(s){
if(s){
for(const node of [Z,v,x,R,W,A,g,z,y,F,X,b,Q,T,S,G,$,q,E,k,N,P,H,B]){a(node);}
}
a(i);
for(const child of children){h(child,s);}
}
};
}

// JSON string written into the "hf:doc:metadata" <meta> tag by h() above:
// the page title plus its three depth-2 sections.
const ws='{"title":"Datasets 🤝 Arrow","local":"datasets--arrow","sections":[{"title":"What is Arrow?","local":"what-is-arrow","sections":[],"depth":2},{"title":"Memory-mapping","local":"memory-mapping","sections":[],"depth":2},{"title":"Performance","local":"performance","sections":[],"depth":2}],"depth":1}';

/**
 * Instance-setup function passed to `rs` by `Ts`. Registers a callback with
 * `os` that reads the "fw" query parameter from the current URL and discards
 * the value, then returns an empty array.
 *
 * @param {*} L - not referenced in the body.
 * @returns {Array} always an empty array.
 */
function Ms(L){
os(()=>{new URLSearchParams(window.location.search).get("fw");});
return[];
}

/** Page component: wires `Ms` and `ds` into the imported base class via `rs`. */
class Ts extends ms{
constructor(i){
super();
rs(this,i,Ms,ds,is,{});
}
}

export{Ts as component};

Xet Storage Details

Size:
9.23 kB
·
Xet hash:
ccf70bd879988adaa42bd46788565a3659c7ac3f6f3138728a13c2ac4bfc7cf2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.