Buckets:

rtrm's picture
download
raw
8.95 kB
/**
 * Compiled, minified page module for the about_arrow documentation page
 * (the cs component below receives the path docs/source/about_arrow.md as its source prop).
 * NOTE(review): this looks like Svelte compiler output (data-svelte-h attributes,
 * svelte-xxxx hashes) - confirm, and if so change the Markdown source instead of this file.
 *
 * Module layout:
 *  - imports: single-letter helpers from ../chunks/scheduler and ../chunks/index,
 *    es from ../chunks/CodeBlock, H and cs from ../chunks/getInferenceSnippets.
 *    Their implementations are not visible in this file.
 *  - us(V): builds the page fragment and returns an object with these methods:
 *      c()     creates a meta element, p / ul elements and separators (via p(...) and l()),
 *              assigns the static HTML / text strings, and calls T on the $$.fragment of
 *              every child component.
 *      l(s)    counterpart of c() that works on the node list s through o(...), n(...) and
 *              J(...); static HTML / text is only reassigned when _(node) differs from the
 *              expected svelte-xxxx hash. Presumably the hydration path - TODO confirm.
 *      h()     sets name=hf:doc:metadata and content=hs on the meta element.
 *      m(s,t)  appends the meta element to document.head, inserts the remaining nodes and
 *              components into s in page order, then sets the flag P to true.
 *      p       the imported helper ns (presumably a no-op - TODO confirm).
 *      i(s)    unless P is already true, calls U on every child component fragment and sets P.
 *      o(s)    calls k on every child component fragment and clears P.
 *      d(s)    calls a(...) on the body nodes only when s is truthy, always on the meta
 *              element, then calls I on every child component.
 *    Child components: r, m, h, g are H instances (one h1 title and three h2 headings);
 *    w and j are es (CodeBlock) instances with code, highlighted and wrap:false props;
 *    b is a cs instance holding the GitHub source URL.
 *    NOTE(review): the code props look like base64 of the URL-encoded plain snippets - confirm.
 *    NOTE(review): the paragraph stored in ss says 1-3 Gbit/s while the sample output of
 *    the second code block shows 4.8 Gb/s; reconcile in the Markdown source.
 *  - hs: JSON string describing the page title and its three sections.
 *  - ds(V): instance function passed to os(...) by the component class.
 *  - ys: the component class, exported under the name component.
 */
import{s as ls,n as ns,o as is}from"../chunks/scheduler.bdbef820.js";import{S as ps,i as os,g as p,s as l,r as T,A as rs,h as o,f as a,c as n,j as ts,u as J,x as _,k as as,y as ms,a as e,v as $,d as U,t as k,w as I}from"../chunks/index.c0aea24a.js";import{C as es}from"../chunks/CodeBlock.e814ab8d.js";import{H,E as cs}from"../chunks/getInferenceSnippets.fbead63c.js";/* Fragment factory: declares every node and child component, then returns the c/l/h/m/p/i/o/d method object. V is never read. */function us(V){let i,B,C,Z,r,x,m,A,c,Y='<a href="https://arrow.apache.org/" rel="nofollow">Arrow</a> enables large amounts of data to be processed and moved quickly. It is a specific data format that stores data in a columnar memory layout. This provides several significant advantages:',R,u,K='<li>Arrow’s standard format allows <a href="https://en.wikipedia.org/wiki/Zero-copy" rel="nofollow">zero-copy reads</a> which removes virtually all serialization overhead.</li> <li>Arrow is language-agnostic so it supports different programming languages.</li> <li>Arrow is column-oriented so it is faster at querying and processing slices or columns of data.</li> <li>Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow.</li> <li>Arrow supports many, possibly nested, column types.</li>',W,h,F,d,L=`🤗 Datasets uses Arrow for its local caching system. It allows datasets to be backed by an on-disk cache, which is memory-mapped for fast lookup.
This architecture allows for large datasets to be used on machines with relatively small device memory.`,X,f,D="For example, loading the full English Wikipedia dataset only takes a few MB of RAM:",z,w,Q,M,O=`This is possible because the Arrow data is actually memory-mapped from disk, and not loaded in memory.
Memory-mapping allows access to data on disk, and leverages virtual memory capabilities for fast lookups.`,G,g,S,y,ss="Iterating over a memory-mapped dataset using Arrow is fast. Iterating over Wikipedia on a laptop gives you speeds of 1-3 Gbit/s:",q,j,E,b,N,v,P;return r=new H({props:{title:"Datasets 🤝 Arrow",local:"datasets--arrow",headingTag:"h1"}}),m=new H({props:{title:"What is Arrow?",local:"what-is-arrow",headingTag:"h2"}}),h=new H({props:{title:"Memory-mapping",local:"memory-mapping",headingTag:"h2"}}),w=new es({props:{code:"aW1wb3J0JTIwb3MlM0IlMjBpbXBvcnQlMjBwc3V0aWwlM0IlMjBpbXBvcnQlMjB0aW1laXQlMEFmcm9tJTIwZGF0YXNldHMlMjBpbXBvcnQlMjBsb2FkX2RhdGFzZXQlMEElMEFtZW1fYmVmb3JlJTIwJTNEJTIwcHN1dGlsLlByb2Nlc3Mob3MuZ2V0cGlkKCkpLm1lbW9yeV9pbmZvKCkucnNzJTIwJTJGJTIwKDEwMjQlMjAqJTIwMTAyNCklMEF3aWtpJTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMndpa2ltZWRpYSUyRndpa2lwZWRpYSUyMiUyQyUyMCUyMjIwMjIwMzAxLmVuJTIyJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEFtZW1fYWZ0ZXIlMjAlM0QlMjBwc3V0aWwuUHJvY2Vzcyhvcy5nZXRwaWQoKSkubWVtb3J5X2luZm8oKS5yc3MlMjAlMkYlMjAoMTAyNCUyMColMjAxMDI0KSUwQSUwQXByaW50KGYlMjJSQU0lMjBtZW1vcnklMjB1c2VkJTNBJTIwJTdCKG1lbV9hZnRlciUyMC0lMjBtZW1fYmVmb3JlKSU3RCUyME1CJTIyKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> os; <span class="hljs-keyword">import</span> psutil; <span class="hljs-keyword">import</span> timeit
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-comment"># Process.memory_info is expressed in bytes, so convert to megabytes </span>
<span class="hljs-meta">&gt;&gt;&gt; </span>mem_before = psutil.Process(os.getpid()).memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>wiki = load_dataset(<span class="hljs-string">&quot;wikimedia/wikipedia&quot;</span>, <span class="hljs-string">&quot;20220301.en&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>mem_after = psutil.Process(os.getpid()).memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;RAM memory used: <span class="hljs-subst">{(mem_after - mem_before)}</span> MB&quot;</span>)
RAM memory used: <span class="hljs-number">50</span> MB`,wrap:!1}}),g=new H({props:{title:"Performance",local:"performance",headingTag:"h2"}}),j=new es({props:{code:"cyUyMCUzRCUyMCUyMiUyMiUyMmJhdGNoX3NpemUlMjAlM0QlMjAxMDAwJTBBZm9yJTIwYmF0Y2glMjBpbiUyMHdpa2kuaXRlcihiYXRjaF9zaXplKSUzQSUwQSUyMCUyMCUyMCUyMC4uLiUwQSUyMiUyMiUyMiUwQSUwQWVsYXBzZWRfdGltZSUyMCUzRCUyMHRpbWVpdC50aW1laXQoc3RtdCUzRHMlMkMlMjBudW1iZXIlM0QxJTJDJTIwZ2xvYmFscyUzRGdsb2JhbHMoKSklMEFwcmludChmJTIyVGltZSUyMHRvJTIwaXRlcmF0ZSUyMG92ZXIlMjB0aGUlMjAlN0J3aWtpLmRhdGFzZXRfc2l6ZSUyMCUzRSUzRSUyMDMwJTdEJTIwR0IlMjBkYXRhc2V0JTNBJTIwJTdCZWxhcHNlZF90aW1lJTNBLjFmJTdEJTIwc2VjJTJDJTIwJTIyJTBBJTIwJTIwJTIwJTIwJTIwJTIwZiUyMmllLiUyMCU3QmZsb2F0KHdpa2kuZGF0YXNldF9zaXplJTIwJTNFJTNFJTIwMjcpJTJGZWxhcHNlZF90aW1lJTNBLjFmJTdEJTIwR2IlMkZzJTIyKQ==",highlighted:`<span class="hljs-meta">&gt;&gt;&gt; </span>s = <span class="hljs-string">&quot;&quot;&quot;batch_size = 1000
<span class="hljs-meta">... </span>for batch in wiki.iter(batch_size):
<span class="hljs-meta">... </span> ...
<span class="hljs-meta">... </span>&quot;&quot;&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>elapsed_time = timeit.timeit(stmt=s, number=<span class="hljs-number">1</span>, <span class="hljs-built_in">globals</span>=<span class="hljs-built_in">globals</span>())
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Time to iterate over the <span class="hljs-subst">{wiki.dataset_size &gt;&gt; <span class="hljs-number">30</span>}</span> GB dataset: <span class="hljs-subst">{elapsed_time:<span class="hljs-number">.1</span>f}</span> sec, &quot;</span>
<span class="hljs-meta">... </span> <span class="hljs-string">f&quot;ie. <span class="hljs-subst">{<span class="hljs-built_in">float</span>(wiki.dataset_size &gt;&gt; <span class="hljs-number">27</span>)/elapsed_time:<span class="hljs-number">.1</span>f}</span> Gb/s&quot;</span>)
Time to iterate over the <span class="hljs-number">18</span> GB dataset: <span class="hljs-number">31.8</span> sec, ie. <span class="hljs-number">4.8</span> Gb/s`,wrap:!1}}),b=new cs({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/about_arrow.md"}}),{c(){i=p("meta"),B=l(),C=p("p"),Z=l(),T(r.$$.fragment),x=l(),T(m.$$.fragment),A=l(),c=p("p"),c.innerHTML=Y,R=l(),u=p("ul"),u.innerHTML=K,W=l(),T(h.$$.fragment),F=l(),d=p("p"),d.textContent=L,X=l(),f=p("p"),f.textContent=D,z=l(),T(w.$$.fragment),Q=l(),M=p("p"),M.textContent=O,G=l(),T(g.$$.fragment),S=l(),y=p("p"),y.textContent=ss,q=l(),T(j.$$.fragment),E=l(),T(b.$$.fragment),N=l(),v=p("p"),this.h()},l(s){const t=rs("svelte-u9bgzb",document.head);i=o(t,"META",{name:!0,content:!0}),t.forEach(a),B=n(s),C=o(s,"P",{}),ts(C).forEach(a),Z=n(s),J(r.$$.fragment,s),x=n(s),J(m.$$.fragment,s),A=n(s),c=o(s,"P",{"data-svelte-h":!0}),_(c)!=="svelte-52goft"&&(c.innerHTML=Y),R=n(s),u=o(s,"UL",{"data-svelte-h":!0}),_(u)!=="svelte-6kbcii"&&(u.innerHTML=K),W=n(s),J(h.$$.fragment,s),F=n(s),d=o(s,"P",{"data-svelte-h":!0}),_(d)!=="svelte-xad2nu"&&(d.textContent=L),X=n(s),f=o(s,"P",{"data-svelte-h":!0}),_(f)!=="svelte-k9gzax"&&(f.textContent=D),z=n(s),J(w.$$.fragment,s),Q=n(s),M=o(s,"P",{"data-svelte-h":!0}),_(M)!=="svelte-kx9jy5"&&(M.textContent=O),G=n(s),J(g.$$.fragment,s),S=n(s),y=o(s,"P",{"data-svelte-h":!0}),_(y)!=="svelte-1telpcz"&&(y.textContent=ss),q=n(s),J(j.$$.fragment,s),E=n(s),J(b.$$.fragment,s),N=n(s),v=o(s,"P",{}),ts(v).forEach(a),this.h()},h(){as(i,"name","hf:doc:metadata"),as(i,"content",hs)},m(s,t){ms(document.head,i),e(s,B,t),e(s,C,t),e(s,Z,t),$(r,s,t),e(s,x,t),$(m,s,t),e(s,A,t),e(s,c,t),e(s,R,t),e(s,u,t),e(s,W,t),$(h,s,t),e(s,F,t),e(s,d,t),e(s,X,t),e(s,f,t),e(s,z,t),$(w,s,t),e(s,Q,t),e(s,M,t),e(s,G,t),$(g,s,t),e(s,S,t),e(s,y,t),e(s,q,t),$(j,s,t),e(s,E,t),$(b,s,t),e(s,N,t),e(s,v,t),P=!0},p:ns,i(s){P||(U(r.$$.fragment,s),U(m.$$.fragment,s),U(h.$$.fragment,s),U(w.$$.fragment,s),U(g.$$.fragment,s),U(j
.$$.fragment,s),U(b.$$.fragment,s),P=!0)},o(s){k(r.$$.fragment,s),k(m.$$.fragment,s),k(h.$$.fragment,s),k(w.$$.fragment,s),k(g.$$.fragment,s),k(j.$$.fragment,s),k(b.$$.fragment,s),P=!1},d(s){s&&(a(B),a(C),a(Z),a(x),a(A),a(c),a(R),a(u),a(W),a(F),a(d),a(X),a(f),a(z),a(Q),a(M),a(G),a(S),a(y),a(q),a(E),a(N),a(v)),a(i),I(r,s),I(m,s),I(h,s),I(w,s),I(g,s),I(j,s),I(b,s)}}}/* Page metadata as a JSON string (title, local anchor, nested sections with depth); h() above writes it into the content attribute of the hf:doc:metadata meta element. */const hs='{"title":"Datasets 🤝 Arrow","local":"datasets--arrow","sections":[{"title":"What is Arrow?","local":"what-is-arrow","sections":[],"depth":2},{"title":"Memory-mapping","local":"memory-mapping","sections":[],"depth":2},{"title":"Performance","local":"performance","sections":[],"depth":2}],"depth":1}';/* Instance function: registers a callback through the imported helper is (presumably onMount - TODO confirm) that reads the fw query parameter from window.location.search; the value is discarded. Returns an empty array. V is never read. */function ds(V){return is(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}/* Component class: extends ps and, after super(), hands itself, the options object i, ds, us, ls and an empty object to os. Exported below as component. */class ys extends ps{constructor(i){super(),os(this,i,ds,us,ls,{})}}export{ys as component};

Xet Storage Details

Size:
8.95 kB
·
Xet hash:
ff8419b2b0ae272188f45e8fa65e4d61f02d53a1d215c6d6c342b01ea9151f4d

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.