Buckets:

hf-doc-build/doc-dev / datasets /pr_8021 /en /about_mapstyle_vs_iterable.html
rtrm's picture
download
raw
61.1 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Differences between Dataset and IterableDataset&quot;,&quot;local&quot;:&quot;differences-between-dataset-and-iterabledataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Downloading and streaming&quot;,&quot;local&quot;:&quot;downloading-and-streaming&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Creating map-style datasets and iterable datasets&quot;,&quot;local&quot;:&quot;creating-map-style-datasets-and-iterable-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Loading local files entirely and progressively&quot;,&quot;local&quot;:&quot;loading-local-files-entirely-and-progressively&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Eager data processing and lazy data processing&quot;,&quot;local&quot;:&quot;eager-data-processing-and-lazy-data-processing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Exact and fast approximate shuffling&quot;,&quot;local&quot;:&quot;exact-and-fast-approximate-shuffling&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Speed differences&quot;,&quot;local&quot;:&quot;speed-differences&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Switch from map-style to iterable&quot;,&quot;local&quot;:&quot;switch-from-map-style-to-iterable&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/datasets/pr_8021/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/entry/start.467c4c66.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/scheduler.d75c11ed.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/singletons.24e4ec1f.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/index.d12496d4.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/paths.409c1290.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/entry/app.3b2ba720.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/preload-helper.a99c0584.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/index.4ec9dfe9.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/nodes/0.5fda7065.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/nodes/7.aa4ce26a.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/CodeBlock.5919a092.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Differences between Dataset and IterableDataset&quot;,&quot;local&quot;:&quot;differences-between-dataset-and-iterabledataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Downloading and streaming&quot;,&quot;local&quot;:&quot;downloading-and-streaming&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Creating map-style datasets and iterable datasets&quot;,&quot;local&quot;:&quot;creating-map-style-datasets-and-iterable-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Loading local files entirely and progressively&quot;,&quot;local&quot;:&quot;loading-local-files-entirely-and-progressively&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Eager data processing and lazy data processing&quot;,&quot;local&quot;:&quot;eager-data-processing-and-lazy-data-processing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Exact and fast approximate shuffling&quot;,&quot;local&quot;:&quot;exact-and-fast-approximate-shuffling&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Speed differences&quot;,&quot;local&quot;:&quot;speed-differences&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Switch from map-style to iterable&quot;,&quot;local&quot;:&quot;switch-from-map-style-to-iterable&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 
text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="differences-between-dataset-and-iterabledataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#differences-between-dataset-and-iterabledataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" 
aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Differences between Dataset and IterableDataset</span></h1> <p data-svelte-h="svelte-obbfk4">There are two types of dataset objects, a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>.
Which type of dataset you choose to use or create depends on the size of the dataset.
In general, an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> is ideal for big datasets (think hundreds of GBs!) due to its lazy behavior and speed advantages, while a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> is great for everything else.
This page will compare the differences between a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> to help you pick the right dataset object for you.</p> <h2 class="relative group"><a id="downloading-and-streaming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#downloading-and-streaming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Downloading and streaming</span></h2> <p data-svelte-h="svelte-noa0nl">When you have a regular <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a>, you can access it using <code>my_dataset[0]</code>. This provides random access to the rows.
Such datasets are also called “map-style” datasets.
For example you can download ImageNet-1k like this and access any row:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
imagenet = load_dataset(<span class="hljs-string">&quot;timm/imagenet-1k-wds&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>) <span class="hljs-comment"># downloads the full dataset</span>
<span class="hljs-built_in">print</span>(imagenet[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-152i6l3">But one caveat is that you must have the entire dataset stored on your disk or in memory, which blocks you from accessing datasets bigger than the disk.
Because it can become inconvenient for big datasets, there exists another type of dataset, the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>.
When you have an <code>IterableDataset</code>, you can access it using a <code>for</code> loop to load the data progressively as you iterate over the dataset.
This way, only a small fraction of examples is loaded in memory, and you don’t write anything on disk.</p> <p data-svelte-h="svelte-p27tn9">For example, you can stream the ImageNet-1k dataset without downloading it on disk:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
imagenet = load_dataset(<span class="hljs-string">&quot;timm/imagenet-1k-wds&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>, streaming=<span class="hljs-literal">True</span>) <span class="hljs-comment"># will start loading the data when iterated over</span>
<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> imagenet:
    <span class="hljs-built_in">print</span>(example)
    <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1m5el5o">Streaming can read online data without writing any file to disk.
For example, you can stream datasets made out of multiple shards, each of which is hundreds of gigabytes, like <a href="https://huggingface.co/datasets/c4" rel="nofollow">C4</a> or <a href="https://huggingface.co/datasets/laion/laion2B-en" rel="nofollow">LAION-2B</a>.
Learn more about how to stream a dataset in the <a href="./stream">Dataset Streaming Guide</a>.</p> <p data-svelte-h="svelte-1ik5e8b">This is not the only difference though, because the “lazy” behavior of an <code>IterableDataset</code> is also present when it comes to dataset creation and processing.</p> <h2 class="relative group"><a id="creating-map-style-datasets-and-iterable-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creating-map-style-datasets-and-iterable-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Creating map-style datasets and iterable datasets</span></h2> <p data-svelte-h="svelte-1i29srk">You can create a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using lists or dictionaries, and the data is entirely converted to Arrow so you can easily access any row:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" 
type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset = Dataset.from_dict({<span class="hljs-string">&quot;col_1&quot;</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>, <span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">9</span>]})
<span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-z2r7k0">To create an <code>IterableDataset</code> on the other hand, you must provide a “lazy” way to load the data.
In Python, we generally use generator functions. These functions <code>yield</code> one example at a time, which means you can’t access a row by slicing it like a regular <code>Dataset</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">my_generator</span>(<span class="hljs-params">n</span>):
    <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n):
        <span class="hljs-keyword">yield</span> {<span class="hljs-string">&quot;col_1&quot;</span>: i}
my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs={<span class="hljs-string">&quot;n&quot;</span>: <span class="hljs-number">10</span>})
<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset:
    <span class="hljs-built_in">print</span>(example)
    <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="loading-local-files-entirely-and-progressively" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-local-files-entirely-and-progressively"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Loading local files entirely and progressively</span></h2> <p data-svelte-h="svelte-zma22i">It is possible to convert local or remote data files to an Arrow <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_files = {<span class="hljs-string">&quot;train&quot;</span>: [<span class="hljs-string">&quot;path/to/data.csv&quot;</span>]}
my_dataset = load_dataset(<span class="hljs-string">&quot;csv&quot;</span>, data_files=data_files, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gryvti">However, this requires a conversion step from CSV to Arrow format, which takes time and disk space if your dataset is big.</p> <p data-svelte-h="svelte-gn3sd">To save disk space and skip the conversion step, you can define an <code>IterableDataset</code> by streaming from the local files directly.
This way, the data is read progressively from the local files as you iterate over the dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_files = {<span class="hljs-string">&quot;train&quot;</span>: [<span class="hljs-string">&quot;path/to/data.csv&quot;</span>]}
my_iterable_dataset = load_dataset(<span class="hljs-string">&quot;csv&quot;</span>, data_files=data_files, split=<span class="hljs-string">&quot;train&quot;</span>, streaming=<span class="hljs-literal">True</span>)
<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># this reads the CSV file progressively as you iterate over the dataset</span>
    <span class="hljs-built_in">print</span>(example)
    <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-u916f6">Many file formats are supported, like CSV, JSONL, and Parquet, as well as image and audio files.
You can find more information in the corresponding guides for loading <a href="./tabular_load">tabular</a>, <a href="./nlp_load">text</a>, <a href="./image_load">vision</a>, and <a href="./audio_load">audio</a> datasets.</p> <h2 class="relative group"><a id="eager-data-processing-and-lazy-data-processing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#eager-data-processing-and-lazy-data-processing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Eager data processing and lazy data processing</span></h2> <p data-svelte-h="svelte-19wo518">When you process a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> object using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">Dataset.map()</a>, the entire dataset is processed immediately and returned.
This is similar to how <code>pandas</code> works for example.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset = my_dataset.<span class="hljs-built_in">map</span>(process_fn) <span class="hljs-comment"># process_fn is applied on all the examples of the dataset</span>
<span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n9zp2j">On the other hand, due to the “lazy” nature of an <code>IterableDataset</code>, calling <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.map">IterableDataset.map()</a> does not apply your <code>map</code> function over the full dataset.
Instead, your <code>map</code> function is applied on-the-fly.</p> <p data-svelte-h="svelte-vsuw18">Because of that, you can chain multiple processing steps and they will all run at once when you start iterating over the dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">map</span>(process_fn_1)
my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">filter</span>(filter_fn)
my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">map</span>(process_fn_2)
<span class="hljs-comment"># process_fn_1, filter_fn and process_fn_2 are applied on-the-fly when iterating over the dataset</span>
<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset:
    <span class="hljs-built_in">print</span>(example)
    <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="exact-and-fast-approximate-shuffling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exact-and-fast-approximate-shuffling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exact and fast approximate shuffling</span></h2> <p data-svelte-h="svelte-3vxzt6">When you shuffle a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.shuffle">Dataset.shuffle()</a>, you apply an exact shuffling of the dataset.
It works by taking a list of indices <code>[0, 1, 2, ..., len(my_dataset) - 1]</code> and shuffling this list.
Then, accessing <code>my_dataset[0]</code> returns the row and index defined by the first element of the indices mapping that has been shuffled:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset = my_dataset.shuffle(seed=<span class="hljs-number">42</span>)
<span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-v7lvxb">Since we don’t have random access to the rows in the case of an <code>IterableDataset</code>, we can’t use a shuffled list of indices and access a row at an arbitrary position.
This prevents the use of exact shuffling.
Instead, a fast approximate shuffling is used in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>.
It uses a shuffle buffer to sample random examples iteratively from the dataset.
Since the dataset is still read iteratively, it provides excellent speed performance:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">100</span>)
<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset:
<span class="hljs-built_in">print</span>(example)
<span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-nx8g88">But using a shuffle buffer is not enough to provide a satisfactory shuffling for machine learning model training. So <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a> also shuffles the dataset shards if your dataset is made of multiple files or sources:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Stream from the internet</span>
my_iterable_dataset = load_dataset(<span class="hljs-string">&quot;deepmind/code_contests&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>, streaming=<span class="hljs-literal">True</span>)
my_iterable_dataset.num_shards <span class="hljs-comment"># 39</span>
<span class="hljs-comment"># Stream from local files</span>
data_files = {<span class="hljs-string">&quot;train&quot;</span>: [<span class="hljs-string">f&quot;path/to/data_<span class="hljs-subst">{i}</span>.csv&quot;</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1024</span>)]}
my_iterable_dataset = load_dataset(<span class="hljs-string">&quot;csv&quot;</span>, data_files=data_files, split=<span class="hljs-string">&quot;train&quot;</span>, streaming=<span class="hljs-literal">True</span>)
my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span>
<span class="hljs-comment"># From a generator function</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">my_generator</span>(<span class="hljs-params">n, sources</span>):
<span class="hljs-keyword">for</span> source <span class="hljs-keyword">in</span> sources:
<span class="hljs-keyword">for</span> example_id_for_current_source <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n):
<span class="hljs-keyword">yield</span> {<span class="hljs-string">&quot;example_id&quot;</span>: <span class="hljs-string">f&quot;<span class="hljs-subst">{source}</span>_<span class="hljs-subst">{example_id_for_current_source}</span>&quot;</span>}
gen_kwargs = {<span class="hljs-string">&quot;n&quot;</span>: <span class="hljs-number">10</span>, <span class="hljs-string">&quot;sources&quot;</span>: [<span class="hljs-string">f&quot;path/to/data_<span class="hljs-subst">{i}</span>&quot;</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1024</span>)]}
my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs=gen_kwargs)
my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="speed-differences" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#speed-differences"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Speed differences</span></h2> <p data-svelte-h="svelte-cfps1k">Regular <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> objects are based on Arrow which provides fast random access to the rows.
Thanks to memory mapping and the fact that Arrow is an in-memory format, reading data from disk doesn’t require expensive system calls or deserialization.
It provides even faster data loading when iterating using a <code>for</code> loop by iterating on contiguous Arrow record batches.</p> <p data-svelte-h="svelte-fna18m">However as soon as your <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> has an indices mapping (via <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.shuffle">Dataset.shuffle()</a> for example), the speed can become 10x slower.
This is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren’t reading contiguous chunks of data anymore.
To restore the speed, you’d need to rewrite the entire dataset on your disk again using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.flatten_indices">Dataset.flatten_indices()</a>, which removes the indices mapping.
This may take a lot of time depending on the size of your dataset though:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># fast</span>
my_dataset = my_dataset.shuffle(seed=<span class="hljs-number">42</span>)
my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># up to 10x slower</span>
my_dataset = my_dataset.flatten_indices() <span class="hljs-comment"># rewrite the shuffled dataset on disk as contiguous chunks of data</span>
my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># fast again</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mfr8up">In this case, we recommend switching to an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> and leveraging its fast approximate shuffling method <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>.
It only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal.
You can also reshuffle the dataset easily:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># fast</span>
    <span class="hljs-keyword">pass</span>
shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">100</span>)
<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> shuffled_iterable_dataset: <span class="hljs-comment"># as fast as before</span>
    <span class="hljs-keyword">pass</span>
shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">1337</span>, buffer_size=<span class="hljs-number">100</span>) <span class="hljs-comment"># reshuffling using another seed is instantaneous</span>
<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> shuffled_iterable_dataset: <span class="hljs-comment"># still as fast as before</span>
    <span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1yhobsx">If you’re using your dataset on multiple epochs, the effective seed used to shuffle the shards order and the shuffle buffer is <code>seed + epoch</code>.
It makes it easy to reshuffle a dataset between epochs:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> epoch <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_epochs):
my_iterable_dataset.set_epoch(epoch)
<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># fast + reshuffled at each epoch using `effective_seed = seed + epoch`</span>
        <span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1vtsyk">To restart the iteration of a map-style dataset, you can simply skip the first examples:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset = my_dataset.select(<span class="hljs-built_in">range</span>(start_index, <span class="hljs-built_in">len</span>(my_dataset)))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qarbjh">But if you use a <code>DataLoader</code> with a <code>Sampler</code>, you should instead save the state of your sampler (you might have written a custom sampler that allows resuming).</p> <p data-svelte-h="svelte-177bwe6">On the other hand, iterable datasets don’t provide random access to a specific example index to resume from. 
But you can use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.state_dict">IterableDataset.state_dict()</a> and <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.load_state_dict">IterableDataset.load_state_dict()</a> to resume from a checkpoint instead, similarly to what you can do for models and optimizers:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>iterable_dataset = Dataset.from_dict({<span class="hljs-string">&quot;a&quot;</span>: <span class="hljs-built_in">range</span>(<span class="hljs-number">6</span>)}).to_iterable_dataset(num_shards=<span class="hljs-number">3</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># save in the middle of training</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>state_dict = iterable_dataset.state_dict()
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># and resume later</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>iterable_dataset.load_state_dict(state_dict)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ul64r3">Under the hood, the iterable dataset keeps track of the current shard being read and the example index in the current shard and it stores this info in the <code>state_dict</code>.</p> <p data-svelte-h="svelte-fghrva">To resume from a checkpoint, the dataset skips all the shards that were previously read to restart from the current shard.
Then it reads the shard and skips examples until it reaches the exact example from the checkpoint.</p> <p data-svelte-h="svelte-1i92o70">Therefore restarting a dataset is quite fast, since it will not re-read the shards that have already been iterated on. Still, resuming a dataset is generally not instantaneous since it has to restart reading from the beginning of the current shard and skip examples until it reaches the checkpoint location.</p> <p data-svelte-h="svelte-1r4w1a2">This can be used with the <code>StatefulDataLoader</code> from <code>torchdata</code>, see <a href="./use_with_pytorch#stream-data">streaming with a PyTorch DataLoader</a>.</p> <h2 class="relative group"><a id="switch-from-map-style-to-iterable" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#switch-from-map-style-to-iterable"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Switch from map-style to iterable</span></h2> <p data-svelte-h="svelte-1dgl4fa">If you want to benefit from the “lazy” behavior of an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> or their speed advantages, you can 
switch your map-style <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> to an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_iterable_dataset = my_dataset.to_iterable_dataset()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8w5p2f">If you want to shuffle your dataset or <a href="./use_with_pytorch#stream-data">use it with a PyTorch DataLoader</a>, we recommend generating a sharded <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 
cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=<span class="hljs-number">1024</span>)
my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span><!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/datasets/blob/main/docs/source/about_mapstyle_vs_iterable.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
// Client-side bootstrap for this page: publish the path configuration, then load
// the two entry modules and start the app on the element that contains this script.

// Assigned without a declaration, so the object is not scoped to this block.
// NOTE(review): presumably read by the imported runtime under this exact
// generated name — confirm before renaming or wrapping it.
__sveltekit_1tcoqe3 = {
assets: "/docs/datasets/pr_8021/en",
base: "/docs/datasets/pr_8021/en",
env: {}
};
// Capture the containing element now: `document.currentScript` is only set while
// this script is executing, so it cannot be read later inside the promise callback.
const element = document.currentScript.parentElement;
// Two null entries, the same count as the ids in `node_ids` below.
// NOTE(review): presumably per-node preloaded data, none supplied here — confirm.
const data = [null,null];
// Request both entry modules in parallel; the callback runs once both have resolved.
// NOTE(review): there is no rejection handler, so a failed import is not handled here.
Promise.all([
import("/docs/datasets/pr_8021/en/_app/immutable/entry/start.467c4c66.js"),
import("/docs/datasets/pr_8021/en/_app/immutable/entry/app.3b2ba720.js")
]).then(([kit, app]) => {
// Hand the app module, the target element and the initial options to `kit.start`.
kit.start(app, element, {
node_ids: [0, 7],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
61.1 kB
·
Xet hash:
1dd33d46a12ab11c348a8d36cb68350422b2525275c2a96a70f2ac30f57dc0bf

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.