Buckets:

hf-doc-build
/

doc

Files

xet

hf-doc-build/doc / datasets /v2.2.2 /en /about_arrow.html

rtrm

about 2 months ago

download

raw

12.9 kB

	<meta charset="utf-8" /><meta http-equiv="content-security-policy" content=""><meta name="hf:doc:metadata" content='{"local":"datasets-arrow","sections":[{"local":"what-is-arrow","title":"What is Arrow?"},{"local":"memorymapping","title":"Memory-mapping"},{"local":"performance","title":"Performance"}],"title":"Datasets 🤝 Arrow"}' data-svelte="svelte-1phssyn">
	<link rel="modulepreload" href="/docs/datasets/v2.2.2/en/_app/assets/pages/__layout.svelte-efc77dbd.css">
	<link rel="modulepreload" href="/docs/datasets/v2.2.2/en/_app/start-0f8c1da7.js">
	<link rel="modulepreload" href="/docs/datasets/v2.2.2/en/_app/chunks/vendor-8138ceec.js">
	<link rel="modulepreload" href="/docs/datasets/v2.2.2/en/_app/chunks/paths-4b3c6e7e.js">
	<link rel="modulepreload" href="/docs/datasets/v2.2.2/en/_app/pages/__layout.svelte-efb8e839.js">
	<link rel="modulepreload" href="/docs/datasets/v2.2.2/en/_app/pages/about_arrow.mdx-ca5bcb02.js">
	<link rel="modulepreload" href="/docs/datasets/v2.2.2/en/_app/chunks/IconCopyLink-2dd3a6ac.js">
	<link rel="modulepreload" href="/docs/datasets/v2.2.2/en/_app/chunks/CodeBlock-fc89709f.js">





	<h1 class="relative group"><a id="datasets-arrow" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#datasets-arrow"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Datasets 🤝 Arrow
	</span></h1>

	<h2 class="relative group"><a id="what-is-arrow" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-is-arrow"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>What is Arrow?
	</span></h2>

	<p><a href="https://arrow.apache.org/" rel="nofollow">Arrow</a> enables large amounts of data to be processed and moved quickly. It is a specific data format that stores data in a columnar memory layout. This provides several significant advantages:</p>
	<ul><li>Arrow’s standard format allows <a href="https://en.wikipedia.org/wiki/Zero-copy" rel="nofollow">zero-copy reads</a>, which remove virtually all serialization overhead.</li>
	<li>Arrow is language-agnostic so it supports different programming languages.</li>
	<li>Arrow is column-oriented so it is faster at querying and processing slices or columns of data.</li>
	<li>Arrow allows for copy-free hand-offs to standard machine learning tools such as NumPy, Pandas, PyTorch, and TensorFlow.</li>
	<li>Arrow supports many, possibly nested, column types.</li></ul>
	<h2 class="relative group"><a id="memorymapping" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memorymapping"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Memory-mapping
	</span></h2>

	<p>🤗 Datasets uses Arrow for its local caching system. It allows datasets to be backed by an on-disk cache, which is memory-mapped for fast lookup.
	This architecture allows for large datasets to be used on machines with relatively small device memory.</p>
	<p>For example, loading the full English Wikipedia dataset only takes a few MB of RAM:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> os; <span class="hljs-keyword">import</span> psutil; <span class="hljs-keyword">import</span> timeit
	<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset

	<span class="hljs-comment"># Process.memory_info is expressed in bytes, so convert to megabytes </span>
	<span class="hljs-meta">>>> </span>mem_before = psutil.Process(os.getpid()).memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>)
	<span class="hljs-meta">>>> </span>wiki = load_dataset(<span class="hljs-string">"wikipedia"</span>, <span class="hljs-string">"20220301.en"</span>, split=<span class="hljs-string">"train"</span>)
	<span class="hljs-meta">>>> </span>mem_after = psutil.Process(os.getpid()).memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>)

	<span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(<span class="hljs-string">f"RAM memory used: <span class="hljs-subst">{(mem_after - mem_before)}</span> MB"</span>)
	RAM memory used: <span class="hljs-number">50</span> MB<!-- HTML_TAG_END --></pre></div>
	<p>This is possible because the Arrow data is actually memory-mapped from disk, and not loaded in memory.
	Memory-mapping allows access to data on disk, and leverages virtual memory capabilities for fast lookups.</p>
	<h2 class="relative group"><a id="performance" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#performance"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a>
	<span>Performance
	</span></h2>

	<p>Iterating over a memory-mapped dataset using Arrow is fast. Iterating over Wikipedia on a laptop gives you speeds of 1-3 Gbit/s:</p>

	<div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg>
	<div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div>
	Copied</div></button></div>
	<pre><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>s = <span class="hljs-string">"""batch_size = 1000
	<span class="hljs-meta">... </span>for i in range(0, len(wiki), batch_size):
	<span class="hljs-meta">... </span>    batch = wiki[i:i + batch_size]
	<span class="hljs-meta">... </span>"""</span>

	<span class="hljs-meta">>>> </span>time = timeit.timeit(stmt=s, number=<span class="hljs-number">1</span>, <span class="hljs-built_in">globals</span>=<span class="hljs-built_in">globals</span>())
	<span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(<span class="hljs-string">f"Time to iterate over the <span class="hljs-subst">{wiki.dataset_size >> <span class="hljs-number">30</span>}</span> GB dataset: <span class="hljs-subst">{time:<span class="hljs-number">.1</span>f}</span> sec, "</span>
	<span class="hljs-meta">... </span> <span class="hljs-string">f"ie. <span class="hljs-subst">{<span class="hljs-built_in">float</span>(wiki.dataset_size >> <span class="hljs-number">27</span>)/time:<span class="hljs-number">.1</span>f}</span> Gb/s"</span>)
	Time to iterate over the <span class="hljs-number">18</span> GB dataset: <span class="hljs-number">70.5</span> sec, ie. <span class="hljs-number">2.1</span> Gb/s<!-- HTML_TAG_END --></pre></div>
	<p>You can obtain the best performance by accessing slices of data (or “batches”), in order to reduce the number of lookups on disk.</p>


	<script type="module" data-hydrate="au0j95">
	// Client bootstrap for this page: pulls `start` from the app bundle and
	// calls it once with the options assembled below.
	// NOTE(review): presumably the SvelteKit hydration entry point — confirm
	// against the bundled start module.
	import { start } from "/docs/datasets/v2.2.2/en/_app/start-0f8c1da7.js";

	// Mount point: the parent of the element tagged data-hydrate="au0j95".
	const hydrationTarget = document.querySelector('[data-hydrate="au0j95"]').parentNode;

	// Base and asset prefixes both point at the same versioned docs root.
	const sitePaths = {
		base: "/docs/datasets/v2.2.2/en",
		assets: "/docs/datasets/v2.2.2/en",
	};

	// Dynamic imports begin loading right away; the array holds the pending
	// promises for the layout module followed by this page's module.
	const pageNodes = [
		import("/docs/datasets/v2.2.2/en/_app/pages/__layout.svelte-efb8e839.js"),
		import("/docs/datasets/v2.2.2/en/_app/pages/about_arrow.mdx-ca5bcb02.js"),
	];

	start({
		target: hydrationTarget,
		paths: sitePaths,
		session: {},
		route: false,
		spa: false,
		trailing_slash: "never",
		hydrate: {
			status: 200,
			error: null,
			nodes: pageNodes,
			params: {},
		},
	});
	</script>

Xet Storage Details

Size: 12.9 kB
Xet hash: 2d0cf26facf1fc6a9d6471ad6fc653bc36f532610c673e39e90e7d2495be7b2a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.