Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Differences between Dataset and IterableDataset","local":"differences-between-dataset-and-iterabledataset","sections":[{"title":"Downloading and streaming","local":"downloading-and-streaming","sections":[],"depth":2},{"title":"Creating map-style datasets and iterable datasets","local":"creating-map-style-datasets-and-iterable-datasets","sections":[],"depth":2},{"title":"Loading local files entirely and progressively","local":"loading-local-files-entirely-and-progressively","sections":[],"depth":2},{"title":"Eager data processing and lazy data processing","local":"eager-data-processing-and-lazy-data-processing","sections":[],"depth":2},{"title":"Exact and fast approximate shuffling","local":"exact-and-fast-approximate-shuffling","sections":[],"depth":2},{"title":"Speed differences","local":"speed-differences","sections":[],"depth":2},{"title":"Switch from map-style to iterable","local":"switch-from-map-style-to-iterable","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/datasets/pr_8021/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/entry/start.467c4c66.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/scheduler.d75c11ed.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/singletons.24e4ec1f.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/index.d12496d4.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/paths.409c1290.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/entry/app.3b2ba720.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/preload-helper.a99c0584.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/index.4ec9dfe9.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/nodes/0.5fda7065.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/nodes/7.aa4ce26a.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/CodeBlock.5919a092.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Differences between Dataset and IterableDataset","local":"differences-between-dataset-and-iterabledataset","sections":[{"title":"Downloading and streaming","local":"downloading-and-streaming","sections":[],"depth":2},{"title":"Creating map-style datasets and iterable datasets","local":"creating-map-style-datasets-and-iterable-datasets","sections":[],"depth":2},{"title":"Loading local files entirely and progressively","local":"loading-local-files-entirely-and-progressively","sections":[],"depth":2},{"title":"Eager data processing and lazy data processing","local":"eager-data-processing-and-lazy-data-processing","sections":[],"depth":2},{"title":"Exact and fast approximate shuffling","local":"exact-and-fast-approximate-shuffling","sections":[],"depth":2},{"title":"Speed differences","local":"speed-differences","sections":[],"depth":2},{"title":"Switch from map-style to iterable","local":"switch-from-map-style-to-iterable","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="differences-between-dataset-and-iterabledataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#differences-between-dataset-and-iterabledataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Differences between Dataset and IterableDataset</span></h1> <p data-svelte-h="svelte-obbfk4">There are two types of dataset objects, a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>. | |
| Whichever type of dataset you choose to use or create depends on the size of the dataset. | |
| In general, an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> is ideal for big datasets (think hundreds of GBs!) due to its lazy behavior and speed advantages, while a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> is great for everything else. | |
| This page will compare the differences between a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> to help you pick the right dataset object for you.</p> <h2 class="relative group"><a id="downloading-and-streaming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#downloading-and-streaming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Downloading and streaming</span></h2> <p data-svelte-h="svelte-noa0nl">When you have a regular <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a>, you can access it using <code>my_dataset[0]</code>. This provides random access to the rows. | |
| Such datasets are also called “map-style” datasets. | |
| For example you can download ImageNet-1k like this and access any row:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| imagenet = load_dataset(<span class="hljs-string">"timm/imagenet-1k-wds"</span>, split=<span class="hljs-string">"train"</span>) <span class="hljs-comment"># downloads the full dataset</span> | |
| <span class="hljs-built_in">print</span>(imagenet[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-152i6l3">But one caveat is that you must have the entire dataset stored on your disk or in memory, which blocks you from accessing datasets bigger than the disk. | |
| Because it can become inconvenient for big datasets, there exists another type of dataset, the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>. | |
| When you have an <code>IterableDataset</code>, you can access it using a <code>for</code> loop to load the data progressively as you iterate over the dataset. | |
| This way, only a small fraction of examples is loaded in memory, and you don’t write anything on disk.</p> <p data-svelte-h="svelte-p27tn9">For example, you can stream the ImageNet-1k dataset without downloading it on disk:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| imagenet = load_dataset(<span class="hljs-string">"timm/imagenet-1k-wds"</span>, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) <span class="hljs-comment"># will start loading the data when iterated over</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> imagenet: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1m5el5o">Streaming can read online data without writing any file to disk. | |
| For example, you can stream datasets made out of multiple shards, each of which is hundreds of gigabytes like <a href="https://huggingface.co/datasets/c4" rel="nofollow">C4</a> or <a href="https://huggingface.co/datasets/laion/laion2B-en" rel="nofollow">LAION-2B</a>. | |
| Learn more about how to stream a dataset in the <a href="./stream">Dataset Streaming Guide</a>.</p> <p data-svelte-h="svelte-1ik5e8b">This is not the only difference though, because the “lazy” behavior of an <code>IterableDataset</code> is also present when it comes to dataset creation and processing.</p> <h2 class="relative group"><a id="creating-map-style-datasets-and-iterable-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creating-map-style-datasets-and-iterable-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Creating map-style datasets and iterable datasets</span></h2> <p data-svelte-h="svelte-1i29srk">You can create a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using lists or dictionaries, and the data is entirely converted to Arrow so you can easily access any row:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset = Dataset.from_dict({<span class="hljs-string">"col_1"</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>, <span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">9</span>]}) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-z2r7k0">To create an <code>IterableDataset</code> on the other hand, you must provide a “lazy” way to load the data. | |
| In Python, we generally use generator functions. These functions <code>yield</code> one example at a time, which means you can’t access a row by slicing it like a regular <code>Dataset</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">my_generator</span>(<span class="hljs-params">n</span>): | |
| <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n): | |
| <span class="hljs-keyword">yield</span> {<span class="hljs-string">"col_1"</span>: i} | |
| my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs={<span class="hljs-string">"n"</span>: <span class="hljs-number">10</span>}) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="loading-local-files-entirely-and-progressively" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-local-files-entirely-and-progressively"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Loading local files entirely and progressively</span></h2> <p data-svelte-h="svelte-zma22i">It is possible to convert local or remote data files to an Arrow <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">"path/to/data.csv"</span>]} | |
| my_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gryvti">However, this requires a conversion step from CSV to Arrow format, which takes time and disk space if your dataset is big.</p> <p data-svelte-h="svelte-gn3sd">To save disk space and skip the conversion step, you can define an <code>IterableDataset</code> by streaming from the local files directly. | |
| This way, the data is read progressively from the local files as you iterate over the dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">"path/to/data.csv"</span>]} | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># this reads the CSV file progressively as you iterate over the dataset</span> | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-u916f6">Many file formats are supported, like CSV, JSONL, and Parquet, as well as image and audio files. | |
| You can find more information in the corresponding guides for loading <a href="./tabular_load">tabular</a>, <a href="./nlp_load">text</a>, <a href="./image_load">vision</a>, and <a href="./audio_load%5D">audio</a> datasets.</p> <h2 class="relative group"><a id="eager-data-processing-and-lazy-data-processing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#eager-data-processing-and-lazy-data-processing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Eager data processing and lazy data processing</span></h2> <p data-svelte-h="svelte-19wo518">When you process a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> object using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">Dataset.map()</a>, the entire dataset is processed immediately and returned. | |
| This is similar to how <code>pandas</code> works for example.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset = my_dataset.<span class="hljs-built_in">map</span>(process_fn) <span class="hljs-comment"># process_fn is applied on all the examples of the dataset</span> | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n9zp2j">On the other hand, due to the “lazy” nature of an <code>IterableDataset</code>, calling <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.map">IterableDataset.map()</a> does not apply your <code>map</code> function over the full dataset. | |
| Instead, your <code>map</code> function is applied on-the-fly.</p> <p data-svelte-h="svelte-vsuw18">Because of that, you can chain multiple processing steps and they will all run at once when you start iterating over the dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">map</span>(process_fn_1) | |
| my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">filter</span>(filter_fn) | |
| my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">map</span>(process_fn_2) | |
| <span class="hljs-comment"># process_fn_1, filter_fn and process_fn_2 are applied on-the-fly when iterating over the dataset</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="exact-and-fast-approximate-shuffling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#exact-and-fast-approximate-shuffling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Exact and fast approximate shuffling</span></h2> <p data-svelte-h="svelte-3vxzt6">When you shuffle a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.shuffle">Dataset.shuffle()</a>, you apply an exact shuffling of the dataset. | |
| It works by taking a list of indices <code>[0, 1, 2, ... len(my_dataset) - 1]</code> and shuffling this list. | |
| Then, accessing <code>my_dataset[0]</code> returns the row and index defined by the first element of the indices mapping that has been shuffled:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset = my_dataset.shuffle(seed=<span class="hljs-number">42</span>) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-v7lvxb">Since we don’t have random access to the rows in the case of an <code>IterableDataset</code>, we can’t use a shuffled list of indices and access a row at an arbitrary position. | |
| This prevents the use of exact shuffling. | |
| Instead, a fast approximate shuffling is used in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>. | |
| It uses a shuffle buffer to sample random examples iteratively from the dataset. | |
| Since the dataset is still read iteratively, it provides excellent speed performance:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">100</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-nx8g88">But using a shuffle buffer is not enough to provide a satisfactory shuffling for machine learning model training. So <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a> also shuffles the dataset shards if your dataset is made of multiple files or sources:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Stream from the internet</span> | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"deepmind/code_contests"</span>, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| my_iterable_dataset.num_shards <span class="hljs-comment"># 39</span> | |
| <span class="hljs-comment"># Stream from local files</span> | |
| data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">f"path/to/data_<span class="hljs-subst">{i}</span>.csv"</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1024</span>)]} | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span> | |
| <span class="hljs-comment"># From a generator function</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">my_generator</span>(<span class="hljs-params">n, sources</span>): | |
| <span class="hljs-keyword">for</span> source <span class="hljs-keyword">in</span> sources: | |
| <span class="hljs-keyword">for</span> example_id_for_current_source <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n): | |
| <span class="hljs-keyword">yield</span> {<span class="hljs-string">"example_id"</span>: <span class="hljs-string">f"<span class="hljs-subst">{source}</span>_<span class="hljs-subst">{example_id_for_current_source}</span>"</span>} | |
| gen_kwargs = {<span class="hljs-string">"n"</span>: <span class="hljs-number">10</span>, <span class="hljs-string">"sources"</span>: [<span class="hljs-string">f"path/to/data_<span class="hljs-subst">{i}</span>"</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1024</span>)]} | |
| my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs=gen_kwargs) | |
| my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="speed-differences" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#speed-differences"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Speed differences</span></h2> <p data-svelte-h="svelte-cfps1k">Regular <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> objects are based on Arrow which provides fast random access to the rows. | |
| Thanks to memory mapping and the fact that Arrow is an in-memory format, reading data from disk doesn’t do expensive system calls and deserialization. | |
| It provides even faster data loading when iterating using a <code>for</code> loop by iterating on contiguous Arrow record batches.</p> <p data-svelte-h="svelte-fna18m">However as soon as your <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> has an indices mapping (via <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.shuffle">Dataset.shuffle()</a> for example), the speed can become 10x slower. | |
| This is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren’t reading contiguous chunks of data anymore. | |
| To restore the speed, you’d need to rewrite the entire dataset on your disk again using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.flatten_indices">Dataset.flatten_indices()</a>, which removes the indices mapping. | |
| This may take a lot of time depending on the size of your dataset though:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># fast</span> | |
| my_dataset = my_dataset.shuffle(seed=<span class="hljs-number">42</span>) | |
| my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># up to 10x slower</span> | |
| my_dataset = my_dataset.flatten_indices() <span class="hljs-comment"># rewrite the shuffled dataset on disk as contiguous chunks of data</span> | |
| my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># fast again</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mfr8up">In this case, we recommend switching to an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> and leveraging its fast approximate shuffling method <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>. | |
| It only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal. | |
| You can also reshuffle the dataset easily:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(my_iterable_dataset): <span class="hljs-comment"># fast</span> | |
| <span class="hljs-keyword">pass</span> | |
| shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">100</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(shuffled_iterable_dataset): <span class="hljs-comment"># as fast as before</span> | |
| <span class="hljs-keyword">pass</span> | |
| shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">1337</span>, buffer_size=<span class="hljs-number">100</span>) <span class="hljs-comment"># reshuffling using another seed is instantaneous</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(shuffled_iterable_dataset): <span class="hljs-comment"># still as fast as before</span> | |
| <span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1yhobsx">If you’re using your dataset on multiple epochs, the effective seed to shuffle the shards order in the shuffle buffer is <code>seed + epoch</code>. | |
| It makes it easy to reshuffle a dataset between epochs:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> epoch <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_epochs): | |
| my_iterable_dataset.set_epoch(epoch) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># fast + reshuffled at each epoch using `effective_seed = seed + epoch`</span> | |
| <span class="hljs-keyword">pass</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1vtsyk">To restart the iteration of a map-style dataset, you can simply skip the first examples:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_dataset = my_dataset.select(<span class="hljs-built_in">range</span>(start_index, <span class="hljs-built_in">len</span>(dataset)))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qarbjh">But if you use a <code>DataLoader</code> with a <code>Sampler</code>, you should instead save the state of your sampler (you might have written a custom sampler that allows resuming).</p> <p data-svelte-h="svelte-177bwe6">On the other hand, iterable datasets don’t provide random access to a specific example index to resume from. But you can use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.state_dict">IterableDataset.state_dict()</a> and <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.load_state_dict">IterableDataset.load_state_dict()</a> to resume from a checkpoint instead, similarly to what you can do for models and optimizers:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>iterable_dataset = Dataset.from_dict({<span class="hljs-string">"a"</span>: <span class="hljs-built_in">range</span>(<span class="hljs-number">6</span>)}).to_iterable_dataset(num_shards=<span class="hljs-number">3</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># save in the middle of training</span> | |
| <span class="hljs-meta">>>> </span>state_dict = iterable_dataset.state_dict() | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># and resume later</span> | |
| <span class="hljs-meta">>>> </span>iterable_dataset.load_state_dict(state_dict)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ul64r3">Under the hood, the iterable dataset keeps track of the current shard being read and the example index in the current shard and it stores this info in the <code>state_dict</code>.</p> <p data-svelte-h="svelte-fghrva">To resume from a checkpoint, the dataset skips all the shards that were previously read to restart from the current shard. | |
| Then it reads the shard and skips examples until it reaches the exact example from the checkpoint.</p> <p data-svelte-h="svelte-1i92o70">Therefore restarting a dataset is quite fast, since it will not re-read the shards that have already been iterated on. Still, resuming a dataset is generally not instantaneous since it has to restart reading from the beginning of the current shard and skip examples until it reaches the checkpoint location.</p> <p data-svelte-h="svelte-1r4w1a2">This can be used with the <code>StatefulDataLoader</code> from <code>torchdata</code>, see <a href="./use_with_pytorch#stream-data">streaming with a PyTorch DataLoader</a>.</p> <h2 class="relative group"><a id="switch-from-map-style-to-iterable" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#switch-from-map-style-to-iterable"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Switch from map-style to iterable</span></h2> <p data-svelte-h="svelte-1dgl4fa">If you want to benefit from the “lazy” behavior of an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> or their speed advantages, you can switch your map-style <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> to an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_iterable_dataset = my_dataset.to_iterable_dataset()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8w5p2f">If you want to shuffle your dataset or <a href="./use_with_pytorch#stream-data">use it with a PyTorch DataLoader</a>, we recommend generating a sharded <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->my_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=<span class="hljs-number">1024</span>) | |
| my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span><!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/datasets/blob/main/docs/source/about_mapstyle_vs_iterable.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_1tcoqe3 = { | |
| assets: "/docs/datasets/pr_8021/en", | |
| base: "/docs/datasets/pr_8021/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/datasets/pr_8021/en/_app/immutable/entry/start.467c4c66.js"), | |
| import("/docs/datasets/pr_8021/en/_app/immutable/entry/app.3b2ba720.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 7], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 61.1 kB
- Xet hash:
- 1dd33d46a12ab11c348a8d36cb68350422b2525275c2a96a70f2ac30f57dc0bf
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.