Buckets:

rtrm's picture
download
raw
202 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Process&quot;,&quot;local&quot;:&quot;process&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Sort, shuffle, select, split, and shard&quot;,&quot;local&quot;:&quot;sort-shuffle-select-split-and-shard&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Sort&quot;,&quot;local&quot;:&quot;sort&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Shuffle&quot;,&quot;local&quot;:&quot;shuffle&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Select and Filter&quot;,&quot;local&quot;:&quot;select-and-filter&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Split&quot;,&quot;local&quot;:&quot;split&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Shard&quot;,&quot;local&quot;:&quot;shard&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Rename, remove, cast, and flatten&quot;,&quot;local&quot;:&quot;rename-remove-cast-and-flatten&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Rename&quot;,&quot;local&quot;:&quot;rename&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Remove&quot;,&quot;local&quot;:&quot;remove&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Cast&quot;,&quot;local&quot;:&quot;cast&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Flatten&quot;,&quot;local&quot;:&quot;flatten&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Map&quot;,&quot;local&quot;:&quot;map&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Multiprocessing&quot;,&quot;local&quot;:&quot;multiprocessing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Batch processing&quot;,&quot;local&quot;:&quot;batch-processing&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Split long examples&quot;,&quot;local&quot;:&quot;split-long-examples&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Data augmentation&quot;,&quot;local&quot;:&quot;data-augmentation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Asynchronous processing&quot;,&quot;local&quot;:&quot;asynchronous-processing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Process multiple splits&quot;,&quot;local&quot;:&quot;process-multiple-splits&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Distributed usage&quot;,&quot;local&quot;:&quot;distributed-usage&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Batch&quot;,&quot;local&quot;:&quot;batch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Concatenate&quot;,&quot;local&quot;:&quot;concatenate&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Interleave&quot;,&quot;local&quot;:&quot;interleave&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Format&quot;,&quot;local&quot;:&quot;format&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Tensors formats&quot;,&quot;local&quot;:&quot;tensors-formats&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Tabular formats&quot;,&quot;local&quot;:&quot;tabular-formats&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Custom format transform&quot;,&quot;local&quot;:&quot;custom-format-transform&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Save&quot;,&quot;local&quot;:&quot;save&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Export&quot;,&quot;local&quot;:&quot;export&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/datasets/pr_8021/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/entry/start.467c4c66.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/scheduler.d75c11ed.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/singletons.24e4ec1f.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/index.d12496d4.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/paths.409c1290.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/entry/app.3b2ba720.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/preload-helper.a99c0584.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/index.4ec9dfe9.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/nodes/0.5fda7065.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/nodes/39.0a77fef3.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js">
<link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/CodeBlock.5919a092.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Process&quot;,&quot;local&quot;:&quot;process&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Sort, shuffle, select, split, and shard&quot;,&quot;local&quot;:&quot;sort-shuffle-select-split-and-shard&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Sort&quot;,&quot;local&quot;:&quot;sort&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Shuffle&quot;,&quot;local&quot;:&quot;shuffle&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Select and Filter&quot;,&quot;local&quot;:&quot;select-and-filter&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Split&quot;,&quot;local&quot;:&quot;split&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Shard&quot;,&quot;local&quot;:&quot;shard&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Rename, remove, cast, and flatten&quot;,&quot;local&quot;:&quot;rename-remove-cast-and-flatten&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Rename&quot;,&quot;local&quot;:&quot;rename&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Remove&quot;,&quot;local&quot;:&quot;remove&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Cast&quot;,&quot;local&quot;:&quot;cast&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Flatten&quot;,&quot;local&quot;:&quot;flatten&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Map&quot;,&quot;local&quot;:&quot;map&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Multiprocessing&quot;,&quot;local&quot;:&quot;multiprocessing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Batch processing&quot;,&quot;local&quot;:&quot;batch-processing&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Split long examples&quot;,&quot;local&quot;:&quot;split-long-examples&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Data augmentation&quot;,&quot;local&quot;:&quot;data-augmentation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Asynchronous processing&quot;,&quot;local&quot;:&quot;asynchronous-processing&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Process multiple splits&quot;,&quot;local&quot;:&quot;process-multiple-splits&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Distributed usage&quot;,&quot;local&quot;:&quot;distributed-usage&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Batch&quot;,&quot;local&quot;:&quot;batch&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Concatenate&quot;,&quot;local&quot;:&quot;concatenate&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Interleave&quot;,&quot;local&quot;:&quot;interleave&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Format&quot;,&quot;local&quot;:&quot;format&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Tensors formats&quot;,&quot;local&quot;:&quot;tensors-formats&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Tabular formats&quot;,&quot;local&quot;:&quot;tabular-formats&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Custom format transform&quot;,&quot;local&quot;:&quot;custom-format-transform&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Save&quot;,&quot;local&quot;:&quot;save&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Export&quot;,&quot;local&quot;:&quot;export&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="process" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#process"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Process</span></h1> <p data-svelte-h="svelte-o0c0lx">🤗 Datasets provides many tools for modifying the structure and content of a dataset. These tools are important for tidying up a dataset, creating additional columns, converting between features and formats, and much more.</p> <p data-svelte-h="svelte-1aff4p7">This guide will show you how to:</p> <ul data-svelte-h="svelte-141dx8c"><li>Reorder rows and split the dataset.</li> <li>Rename and remove columns, and other common column operations.</li> <li>Apply processing functions to each example in a dataset.</li> <li>Concatenate datasets.</li> <li>Apply a custom formatting transform.</li> <li>Save and export processed datasets.</li></ul> <p data-svelte-h="svelte-1xlvt1h">For more details specific to processing other dataset modalities, take a look at the <a class="underline decoration-pink-400 decoration-2 font-semibold" href="./audio_process">process audio dataset guide</a>, the <a class="underline decoration-yellow-400 decoration-2 font-semibold" href="./image_process">process image dataset guide</a>, or the <a class="underline decoration-green-400 decoration-2 font-semibold" href="./nlp_process">process text dataset guide</a>.</p> <p data-svelte-h="svelte-1bplt66">The examples in this guide use the MRPC dataset, but feel free to load any dataset of your choice and follow along!</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;nyu-mll/glue&quot;</span>, <span class="hljs-string">&quot;mrpc&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)<!-- HTML_TAG_END --></pre></div> <blockquote class="warning" data-svelte-h="svelte-ildmst"><p>All processing methods in this guide return a new <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> object. Modification is not done in-place. Be careful about overriding your previous dataset!</p></blockquote> <h2 class="relative group"><a id="sort-shuffle-select-split-and-shard" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sort-shuffle-select-split-and-shard"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Sort, shuffle, select, split, and shard</span></h2> <p data-svelte-h="svelte-1bmil1o">There are several functions for rearranging the structure of a dataset.
These functions are useful for selecting only the rows you want, creating train and test splits, and sharding very large datasets into smaller chunks.</p> <h3 class="relative group"><a id="sort" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sort"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Sort</span></h3> <p data-svelte-h="svelte-11cerg1">Use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.sort">sort()</a> to sort column values according to their numerical values. The provided column must be NumPy compatible.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset[<span class="hljs-string">&quot;label&quot;</span>][:<span class="hljs-number">10</span>]
[<span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>sorted_dataset = dataset.sort(<span class="hljs-string">&quot;label&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>sorted_dataset[<span class="hljs-string">&quot;label&quot;</span>][:<span class="hljs-number">10</span>]
[<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>sorted_dataset[<span class="hljs-string">&quot;label&quot;</span>][-<span class="hljs-number">10</span>:]
[<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-50nyg6">Under the hood, this creates a list of indices that is sorted according to values of the column.
This indices mapping is then used to access the right rows in the underlying Arrow table.</p> <h3 class="relative group"><a id="shuffle" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#shuffle"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Shuffle</span></h3> <p data-svelte-h="svelte-vy7kxe">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.shuffle">shuffle()</a> function randomly rearranges the column values. You can specify the <code>generator</code> parameter in this function to use a different <code>numpy.random.Generator</code> if you want more control over the algorithm used to shuffle the dataset.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>shuffled_dataset = sorted_dataset.shuffle(seed=<span class="hljs-number">42</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>shuffled_dataset[<span class="hljs-string">&quot;label&quot;</span>][:<span class="hljs-number">10</span>]
[<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y7qjl3">Shuffling takes the list of indices <code>[0:len(my_dataset)]</code> and shuffles it to create an indices mapping.
However as soon as your <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> has an indices mapping, the speed can become 10x slower.
This is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren’t reading contiguous chunks of data anymore.
To restore the speed, you’d need to rewrite the entire dataset on your disk again using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.flatten_indices">Dataset.flatten_indices()</a>, which removes the indices mapping.
Alternatively, you can switch to an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> and leverage its fast approximate shuffling <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>iterable_dataset = dataset.to_iterable_dataset(num_shards=<span class="hljs-number">128</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>shuffled_iterable_dataset = iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">1000</span>)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="select-and-filter" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#select-and-filter"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Select and Filter</span></h3> <p data-svelte-h="svelte-umc5mx">There are two options for filtering rows in a dataset: <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.select">select()</a> and <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.filter">filter()</a>.</p> <ul data-svelte-h="svelte-10xpb44"><li><a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.select">select()</a> returns rows according to a list of indices:</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>small_dataset = dataset.select([<span class="hljs-number">0</span>, <span class="hljs-number">10</span>, <span class="hljs-number">20</span>, <span class="hljs-number">30</span>, <span class="hljs-number">40</span>, <span class="hljs-number">50</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">len</span>(small_dataset)
<span class="hljs-number">6</span><!-- HTML_TAG_END --></pre></div> <ul data-svelte-h="svelte-5dirwe"><li><a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.filter">filter()</a> returns rows that match a specified condition:</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>start_with_ar = dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> example: example[<span class="hljs-string">&quot;sentence1&quot;</span>].startswith(<span class="hljs-string">&quot;Ar&quot;</span>))
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">len</span>(start_with_ar)
<span class="hljs-number">6</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>start_with_ar[<span class="hljs-string">&quot;sentence1&quot;</span>]
[<span class="hljs-string">&#x27;Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A {@html "<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>4.56</mn><mo separator="true">,</mo><mi>h</mi><mi>a</mi><mi>v</mi><mi>i</mi><mi>n</mi><mi>g</mi><mi>e</mi><mi>a</mi><mi>r</mi><mi>l</mi><mi>i</mi><mi>e</mi><mi>r</mi><mi>s</mi><mi>e</mi><mi>t</mi><mi>a</mi><mi>r</mi><mi>e</mi><mi>c</mi><mi>o</mi><mi>r</mi><mi>d</mi><mi>h</mi><mi>i</mi><mi>g</mi><mi>h</mi><mi>o</mi><mi>f</mi><mi>A</mi></mrow><annotation encoding="application/x-tex"> 4.56 , having earlier set a record high of A </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8889em;vertical-align:-0.1944em;"></span><span class="mord">4.56</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal">ha</span><span class="mord mathnormal" style="margin-right:0.03588em;">v</span><span class="mord mathnormal">in</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord mathnormal">e</span><span class="mord mathnormal">a</span><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="mord mathnormal" style="margin-right:0.01968em;">l</span><span class="mord mathnormal">i</span><span class="mord mathnormal">erse</span><span class="mord mathnormal">t</span><span class="mord mathnormal">a</span><span class="mord mathnormal" style="margin-right:0.02778em;">recor</span><span class="mord mathnormal">d</span><span class="mord mathnormal">hi</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord mathnormal">h</span><span class="mord mathnormal">o</span><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="mord mathnormal">A</span></span></span></span>"} 4.57 .&#x27;</span>,
<span class="hljs-string">&#x27;Arison said Mann may have been one of the pioneers of the world music movement and he had a deep love of Brazilian music .&#x27;</span>,
<span class="hljs-string">&#x27;Arts helped coach the youth on an eighth-grade football team at Lombardi Middle School in Green Bay .&#x27;</span>,
<span class="hljs-string">&#x27;Around 9 : 00 a.m. EDT ( 1300 GMT ) , the euro was at $ 1.1566 against the dollar , up 0.07 percent on the day .&#x27;</span>,
<span class="hljs-string">&quot;Arguing that the case was an isolated example , Canada has threatened a trade backlash if Tokyo &#x27;s ban is not justified on scientific grounds .&quot;</span>,
<span class="hljs-string">&#x27;Artists are worried the plan would harm those who need help most - performers who have a difficult time lining up shows .&#x27;</span>
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-luww8"><a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.filter">filter()</a> can also filter by indices if you set <code>with_indices=True</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>even_dataset = dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> example, idx: idx % <span class="hljs-number">2</span> == <span class="hljs-number">0</span>, with_indices=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">len</span>(even_dataset)
<span class="hljs-number">1834</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">len</span>(dataset) / <span class="hljs-number">2</span>
<span class="hljs-number">1834.0</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y27i4k">Unless the list of indices to keep is contiguous, those methods also create an indices mapping under the hood.</p> <h3 class="relative group"><a id="split" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#split"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Split</span></h3> <p data-svelte-h="svelte-dlszhj">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.train_test_split">train_test_split()</a> function creates train and test splits if your dataset doesn’t already have them. This allows you to adjust the relative proportions or an absolute number of samples in each split. In the example below, use the <code>test_size</code> parameter to create a test split that is 10% of the original dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset.train_test_split(test_size=<span class="hljs-number">0.1</span>)
{<span class="hljs-string">&#x27;train&#x27;</span>: Dataset(schema: {<span class="hljs-string">&#x27;sentence1&#x27;</span>: <span class="hljs-string">&#x27;string&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>: <span class="hljs-string">&#x27;string&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>: <span class="hljs-string">&#x27;int64&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>: <span class="hljs-string">&#x27;int32&#x27;</span>}, num_rows: <span class="hljs-number">3301</span>),
<span class="hljs-string">&#x27;test&#x27;</span>: Dataset(schema: {<span class="hljs-string">&#x27;sentence1&#x27;</span>: <span class="hljs-string">&#x27;string&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>: <span class="hljs-string">&#x27;string&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>: <span class="hljs-string">&#x27;int64&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>: <span class="hljs-string">&#x27;int32&#x27;</span>}, num_rows: <span class="hljs-number">367</span>)}
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-number">0.1</span> * <span class="hljs-built_in">len</span>(dataset)
<span class="hljs-number">366.8</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kxs7ja">The splits are shuffled by default, but you can set <code>shuffle=False</code> to prevent shuffling.</p> <h3 class="relative group"><a id="shard" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#shard"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Shard</span></h3> <p data-svelte-h="svelte-17hl2xw">🤗 Datasets supports sharding to divide a very large dataset into a predefined number of chunks. Specify the <code>num_shards</code> parameter in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.shard">shard()</a> to determine the number of shards to split the dataset into. You’ll also need to provide the shard you want to return with the <code>index</code> parameter.</p> <p data-svelte-h="svelte-irelh5">For example, the <a href="https://huggingface.co/datasets/stanfordnlp/imdb" rel="nofollow">stanfordnlp/imdb</a> dataset has 25000 examples:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;stanfordnlp/imdb&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(dataset)
Dataset({
features: [<span class="hljs-string">&#x27;text&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>],
num_rows: <span class="hljs-number">25000</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-189aje2">After sharding the dataset into four chunks, the first shard will only have 6250 examples:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset.shard(num_shards=<span class="hljs-number">4</span>, index=<span class="hljs-number">0</span>)
Dataset({
features: [<span class="hljs-string">&#x27;text&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>],
num_rows: <span class="hljs-number">6250</span>
})
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-number">25000</span>/<span class="hljs-number">4</span>)
<span class="hljs-number">6250.0</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="rename-remove-cast-and-flatten" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#rename-remove-cast-and-flatten"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Rename, remove, cast, and flatten</span></h2> <p data-svelte-h="svelte-1vk4iy9">The following functions allow you to modify the columns of a dataset. These functions are useful for renaming or removing columns, changing columns to a new set of features, and flattening nested column structures.</p> <h3 class="relative group"><a id="rename" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#rename"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Rename</span></h3> <p data-svelte-h="svelte-10kk1q3">Use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.rename_column">rename_column()</a> when you need to rename a column in your dataset. Features associated with the original column are actually moved under the new column name, instead of just replacing the original column in-place.</p> <p data-svelte-h="svelte-5nx2d2">Provide <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.rename_column">rename_column()</a> with the name of the original column, and the new column name:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;sentence1&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>],
num_rows: <span class="hljs-number">3668</span>
})
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.rename_column(<span class="hljs-string">&quot;sentence1&quot;</span>, <span class="hljs-string">&quot;sentenceA&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.rename_column(<span class="hljs-string">&quot;sentence2&quot;</span>, <span class="hljs-string">&quot;sentenceB&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;sentenceA&#x27;</span>, <span class="hljs-string">&#x27;sentenceB&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>],
num_rows: <span class="hljs-number">3668</span>
})<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="remove" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#remove"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Remove</span></h3> <p data-svelte-h="svelte-1jbkd5l">When you need to remove one or more columns, provide the column name to remove to the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.remove_columns">remove_columns()</a> function. Remove more than one column by providing a list of column names:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.remove_columns(<span class="hljs-string">&quot;label&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;sentence1&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>],
num_rows: <span class="hljs-number">3668</span>
})
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.remove_columns([<span class="hljs-string">&quot;sentence1&quot;</span>, <span class="hljs-string">&quot;sentence2&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;idx&#x27;</span>],
num_rows: <span class="hljs-number">3668</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8knhxc">Conversely, <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.select_columns">select_columns()</a> selects one or more columns to keep and removes the rest. This function takes either one or a list of column names:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;sentence1&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>],
num_rows: <span class="hljs-number">3668</span>
})
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.select_columns([<span class="hljs-string">&#x27;sentence1&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;sentence1&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>],
num_rows: <span class="hljs-number">3668</span>
})
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.select_columns(<span class="hljs-string">&#x27;idx&#x27;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;idx&#x27;</span>],
num_rows: <span class="hljs-number">3668</span>
})<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="cast" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cast"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Cast</span></h3> <p data-svelte-h="svelte-r2h8eq">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.cast">cast()</a> function transforms the feature type of one or more columns. This function accepts your new <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Features">Features</a> as its argument. The example below demonstrates how to change the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.ClassLabel">ClassLabel</a> and <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Value">Value</a> features:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset.features
{<span class="hljs-string">&#x27;sentence1&#x27;</span>: Value(<span class="hljs-string">&#x27;string&#x27;</span>),
<span class="hljs-string">&#x27;sentence2&#x27;</span>: Value(<span class="hljs-string">&#x27;string&#x27;</span>),
<span class="hljs-string">&#x27;label&#x27;</span>: ClassLabel(names=[<span class="hljs-string">&#x27;not_equivalent&#x27;</span>, <span class="hljs-string">&#x27;equivalent&#x27;</span>]),
<span class="hljs-string">&#x27;idx&#x27;</span>: Value(<span class="hljs-string">&#x27;int32&#x27;</span>)}
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> ClassLabel, Value
<span class="hljs-meta">&gt;&gt;&gt; </span>new_features = dataset.features.copy()
<span class="hljs-meta">&gt;&gt;&gt; </span>new_features[<span class="hljs-string">&quot;label&quot;</span>] = ClassLabel(names=[<span class="hljs-string">&quot;negative&quot;</span>, <span class="hljs-string">&quot;positive&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>new_features[<span class="hljs-string">&quot;idx&quot;</span>] = Value(<span class="hljs-string">&quot;int64&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.cast(new_features)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset.features
{<span class="hljs-string">&#x27;sentence1&#x27;</span>: Value(<span class="hljs-string">&#x27;string&#x27;</span>),
<span class="hljs-string">&#x27;sentence2&#x27;</span>: Value(<span class="hljs-string">&#x27;string&#x27;</span>),
<span class="hljs-string">&#x27;label&#x27;</span>: ClassLabel(names=[<span class="hljs-string">&#x27;negative&#x27;</span>, <span class="hljs-string">&#x27;positive&#x27;</span>]),
<span class="hljs-string">&#x27;idx&#x27;</span>: Value(<span class="hljs-string">&#x27;int64&#x27;</span>)}<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-2gfsp1"><p>Casting only works if the original feature type and new feature type are compatible. For example, you can cast a column with the feature type <code>Value(&quot;int32&quot;)</code> to <code>Value(&quot;bool&quot;)</code> if the original column only contains ones and zeros.</p></blockquote> <p data-svelte-h="svelte-110okbj">Use the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.cast_column">cast_column()</a> function to change the feature type of a single column. Pass the column name and its new feature type as arguments:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset.features
{<span class="hljs-string">&#x27;audio&#x27;</span>: Audio(sampling_rate=<span class="hljs-number">44100</span>, mono=<span class="hljs-literal">True</span>)}
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.cast_column(<span class="hljs-string">&quot;audio&quot;</span>, Audio(sampling_rate=<span class="hljs-number">16000</span>))
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset.features
{<span class="hljs-string">&#x27;audio&#x27;</span>: Audio(sampling_rate=<span class="hljs-number">16000</span>, mono=<span class="hljs-literal">True</span>)}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="flatten" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#flatten"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Flatten</span></h3> <p data-svelte-h="svelte-68ctim">Sometimes a column can be a nested structure of several types. Take a look at the nested structure below from the SQuAD dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;rajpurkar/squad&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset.features
{<span class="hljs-string">&#x27;id&#x27;</span>: Value(<span class="hljs-string">&#x27;string&#x27;</span>),
<span class="hljs-string">&#x27;title&#x27;</span>: Value(<span class="hljs-string">&#x27;string&#x27;</span>),
<span class="hljs-string">&#x27;context&#x27;</span>: Value(<span class="hljs-string">&#x27;string&#x27;</span>),
<span class="hljs-string">&#x27;question&#x27;</span>: Value(<span class="hljs-string">&#x27;string&#x27;</span>),
<span class="hljs-string">&#x27;answers&#x27;</span>: {<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-type">List</span>(Value(<span class="hljs-string">&#x27;string&#x27;</span>)),
<span class="hljs-string">&#x27;answer_start&#x27;</span>: <span class="hljs-type">List</span>(Value(<span class="hljs-string">&#x27;int32&#x27;</span>))}}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ozog37">The <code>answers</code> field contains two subfields: <code>text</code> and <code>answer_start</code>. Use the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.flatten">flatten()</a> function to extract the subfields into their own separate columns:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>flat_dataset = dataset.flatten()
<span class="hljs-meta">&gt;&gt;&gt; </span>flat_dataset
Dataset({
features: [<span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;context&#x27;</span>, <span class="hljs-string">&#x27;question&#x27;</span>, <span class="hljs-string">&#x27;answers.text&#x27;</span>, <span class="hljs-string">&#x27;answers.answer_start&#x27;</span>],
num_rows: <span class="hljs-number">87599</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-80t9a4">Notice how the subfields are now their own independent columns: <code>answers.text</code> and <code>answers.answer_start</code>.</p> <h2 class="relative group"><a id="map" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#map"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Map</span></h2> <p data-svelte-h="svelte-1uamwwu">Some of the more powerful applications of 🤗 Datasets come from using the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> function. The primary purpose of <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> is to speed up processing functions. It allows you to apply a processing function to each example in a dataset, independently or in batches. This function can even create new rows and columns.</p> <p data-svelte-h="svelte-ytojbc">In the following example, prefix each <code>sentence1</code> value in the dataset with <code>&#39;My sentence: &#39;</code>.</p> <p data-svelte-h="svelte-1t98b4r">Start by creating a function that adds <code>&#39;My sentence: &#39;</code> to the beginning of each sentence. The function needs to accept and output a <code>dict</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">add_prefix</span>(<span class="hljs-params">example</span>):
<span class="hljs-meta">... </span> example[<span class="hljs-string">&quot;sentence1&quot;</span>] = <span class="hljs-string">&#x27;My sentence: &#x27;</span> + example[<span class="hljs-string">&quot;sentence1&quot;</span>]
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> example<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qeumtu">Now use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> to apply the <code>add_prefix</code> function to the entire dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = small_dataset.<span class="hljs-built_in">map</span>(add_prefix)
<span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset[<span class="hljs-string">&quot;sentence1&quot;</span>][:<span class="hljs-number">5</span>]
[<span class="hljs-string">&#x27;My sentence: Amrozi accused his brother , whom he called &quot; the witness &quot; , of deliberately distorting his evidence .&#x27;</span>,
<span class="hljs-string">&quot;My sentence: Yucaipa owned Dominick &#x27;s before selling the chain to Safeway in 1998 for $ 2.5 billion .&quot;</span>,
<span class="hljs-string">&#x27;My sentence: They had published an advertisement on the Internet on June 10 , offering the cargo for sale , he added .&#x27;</span>,
<span class="hljs-string">&#x27;My sentence: Around 0335 GMT , Tab shares were up 19 cents , or 4.4 % , at A {@html "<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>4.56</mn><mo separator="true">,</mo><mi>h</mi><mi>a</mi><mi>v</mi><mi>i</mi><mi>n</mi><mi>g</mi><mi>e</mi><mi>a</mi><mi>r</mi><mi>l</mi><mi>i</mi><mi>e</mi><mi>r</mi><mi>s</mi><mi>e</mi><mi>t</mi><mi>a</mi><mi>r</mi><mi>e</mi><mi>c</mi><mi>o</mi><mi>r</mi><mi>d</mi><mi>h</mi><mi>i</mi><mi>g</mi><mi>h</mi><mi>o</mi><mi>f</mi><mi>A</mi></mrow><annotation encoding="application/x-tex"> 4.56 , having earlier set a record high of A </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8889em;vertical-align:-0.1944em;"></span><span class="mord">4.56</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal">ha</span><span class="mord mathnormal" style="margin-right:0.03588em;">v</span><span class="mord mathnormal">in</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord mathnormal">e</span><span class="mord mathnormal">a</span><span class="mord mathnormal" style="margin-right:0.02778em;">r</span><span class="mord mathnormal" style="margin-right:0.01968em;">l</span><span class="mord mathnormal">i</span><span class="mord mathnormal">erse</span><span class="mord mathnormal">t</span><span class="mord mathnormal">a</span><span class="mord mathnormal" style="margin-right:0.02778em;">recor</span><span class="mord mathnormal">d</span><span class="mord mathnormal">hi</span><span class="mord mathnormal" style="margin-right:0.03588em;">g</span><span class="mord mathnormal">h</span><span class="mord mathnormal">o</span><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="mord mathnormal">A</span></span></span></span>"} 4.57 .&#x27;</span>,
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-wod4o6">Let’s take a look at another example, except this time, you’ll remove a column with <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a>. When you remove a column, it is only removed after the example has been provided to the mapped function. This allows the mapped function to use the content of the columns before they are removed.</p> <p data-svelte-h="svelte-1l4cglu">Specify the column to remove with the <code>remove_columns</code> parameter in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> example: {<span class="hljs-string">&quot;new_sentence&quot;</span>: example[<span class="hljs-string">&quot;sentence1&quot;</span>]}, remove_columns=[<span class="hljs-string">&quot;sentence1&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset.column_names
[<span class="hljs-string">&#x27;sentence2&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>, <span class="hljs-string">&#x27;new_sentence&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-13f6svm"><p>🤗 Datasets also has a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.remove_columns">remove_columns()</a> function which is faster because it doesn’t copy the data of the remaining columns.</p></blockquote> <p data-svelte-h="svelte-t32y24">You can also use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> with indices if you set <code>with_indices=True</code>. The example below adds the index to the beginning of each sentence:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> example, idx: {<span class="hljs-string">&quot;sentence2&quot;</span>: <span class="hljs-string">f&quot;<span class="hljs-subst">{idx}</span>: &quot;</span> + example[<span class="hljs-string">&quot;sentence2&quot;</span>]}, with_indices=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset[<span class="hljs-string">&quot;sentence2&quot;</span>][:<span class="hljs-number">5</span>]
[<span class="hljs-string">&#x27;0: Referring to him as only &quot; the witness &quot; , Amrozi accused his brother of deliberately distorting his evidence .&#x27;</span>,
<span class="hljs-string">&quot;1: Yucaipa bought Dominick &#x27;s in 1995 for {@html "<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>693</mn><mi>m</mi><mi>i</mi><mi>l</mi><mi>l</mi><mi>i</mi><mi>o</mi><mi>n</mi><mi>a</mi><mi>n</mi><mi>d</mi><mi>s</mi><mi>o</mi><mi>l</mi><mi>d</mi><mi>i</mi><mi>t</mi><mi>t</mi><mi>o</mi><mi>S</mi><mi>a</mi><mi>f</mi><mi>e</mi><mi>w</mi><mi>a</mi><mi>y</mi><mi>f</mi><mi>o</mi><mi>r</mi></mrow><annotation encoding="application/x-tex"> 693 million and sold it to Safeway for </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8889em;vertical-align:-0.1944em;"></span><span class="mord">693</span><span class="mord mathnormal">mi</span><span class="mord mathnormal" style="margin-right:0.01968em;">ll</span><span class="mord mathnormal">i</span><span class="mord mathnormal">o</span><span class="mord mathnormal">nan</span><span class="mord mathnormal">d</span><span class="mord mathnormal">so</span><span class="mord mathnormal" style="margin-right:0.01968em;">l</span><span class="mord mathnormal">d</span><span class="mord mathnormal">i</span><span class="mord mathnormal">tt</span><span class="mord mathnormal">o</span><span class="mord mathnormal" style="margin-right:0.05764em;">S</span><span class="mord mathnormal">a</span><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="mord mathnormal">e</span><span class="mord mathnormal" style="margin-right:0.02691em;">w</span><span class="mord mathnormal">a</span><span class="mord mathnormal" style="margin-right:0.03588em;">y</span><span class="mord mathnormal" style="margin-right:0.10764em;">f</span><span class="mord mathnormal" style="margin-right:0.02778em;">or</span></span></span></span>"} 1.8 billion in 1998 .&quot;</span>,
<span class="hljs-string">&quot;2: On June 10 , the ship &#x27;s owners had published an advertisement on the Internet , offering the explosives for sale .&quot;</span>,
<span class="hljs-string">&#x27;3: Tab shares jumped 20 cents , or 4.6 % , to set a record closing high at A $ 4.57 .&#x27;</span>,
<span class="hljs-string">&#x27;4: PG &amp; E Corp. shares jumped {@html "<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mn>1.63</mn><mi>o</mi><mi>r</mi><mn>8</mn><mi>p</mi><mi>e</mi><mi>r</mi><mi>c</mi><mi>e</mi><mi>n</mi><mi>t</mi><mi>t</mi><mi>o</mi></mrow><annotation encoding="application/x-tex"> 1.63 or 8 percent to </annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.8389em;vertical-align:-0.1944em;"></span><span class="mord">1.63</span><span class="mord mathnormal" style="margin-right:0.02778em;">or</span><span class="mord">8</span><span class="mord mathnormal">p</span><span class="mord mathnormal">erce</span><span class="mord mathnormal">n</span><span class="mord mathnormal">tt</span><span class="mord mathnormal">o</span></span></span></span>"} 21.03 on the New York Stock Exchange on Friday .&#x27;</span>
]<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="multiprocessing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#multiprocessing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Multiprocessing</span></h3> <p data-svelte-h="svelte-13osd1v">Multiprocessing significantly speeds up processing by parallelizing processes on the CPU. Set the <code>num_proc</code> parameter in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> to set the number of processes to use:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> example, idx: {<span class="hljs-string">&quot;sentence2&quot;</span>: <span class="hljs-string">f&quot;<span class="hljs-subst">{idx}</span>: &quot;</span> + example[<span class="hljs-string">&quot;sentence2&quot;</span>]}, with_indices=<span class="hljs-literal">True</span>, num_proc=<span class="hljs-number">4</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-s5mhuj">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> also works with the rank of the process if you set <code>with_rank=True</code>. This is analogous to the <code>with_indices</code> parameter. The <code>with_rank</code> parameter in the mapped function goes after the <code>index</code> one if it is already present.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> multiprocess <span class="hljs-keyword">import</span> set_start_method
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
&gt;&gt;&gt;
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Get an example dataset</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;fka/awesome-chatgpt-prompts&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
&gt;&gt;&gt;
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># Get an example model and its tokenizer</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">&quot;Qwen/Qwen1.5-0.5B-Chat&quot;</span>).<span class="hljs-built_in">eval</span>()
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;Qwen/Qwen1.5-0.5B-Chat&quot;</span>)
&gt;&gt;&gt;
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">gpu_computation</span>(<span class="hljs-params">batch, rank</span>):
<span class="hljs-meta">... </span> <span class="hljs-comment"># Move the model on the right GPU if it&#x27;s not there already</span>
<span class="hljs-meta">... </span> device = <span class="hljs-string">f&quot;cuda:<span class="hljs-subst">{(rank <span class="hljs-keyword">or</span> <span class="hljs-number">0</span>) % torch.cuda.device_count()}</span>&quot;</span>
<span class="hljs-meta">... </span> model.to(device)
...
<span class="hljs-meta">... </span> <span class="hljs-comment"># Your big GPU call goes here, for example:</span>
<span class="hljs-meta">... </span> chats = [[
<span class="hljs-meta">... </span> {<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;system&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: <span class="hljs-string">&quot;You are a helpful assistant.&quot;</span>},
<span class="hljs-meta">... </span> {<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: prompt}
<span class="hljs-meta">... </span> ] <span class="hljs-keyword">for</span> prompt <span class="hljs-keyword">in</span> batch[<span class="hljs-string">&quot;prompt&quot;</span>]]
<span class="hljs-meta">... </span> texts = [tokenizer.apply_chat_template(
<span class="hljs-meta">... </span> chat,
<span class="hljs-meta">... </span> tokenize=<span class="hljs-literal">False</span>,
<span class="hljs-meta">... </span> add_generation_prompt=<span class="hljs-literal">True</span>
<span class="hljs-meta">... </span> ) <span class="hljs-keyword">for</span> chat <span class="hljs-keyword">in</span> chats]
<span class="hljs-meta">... </span> model_inputs = tokenizer(texts, padding=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>).to(device)
<span class="hljs-meta">... </span> <span class="hljs-keyword">with</span> torch.no_grad():
<span class="hljs-meta">... </span> outputs = model.generate(**model_inputs, max_new_tokens=<span class="hljs-number">512</span>)
<span class="hljs-meta">... </span> batch[<span class="hljs-string">&quot;output&quot;</span>] = tokenizer.batch_decode(outputs, skip_special_tokens=<span class="hljs-literal">True</span>)
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> batch
&gt;&gt;&gt;
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">if</span> __name__ == <span class="hljs-string">&quot;__main__&quot;</span>:
<span class="hljs-meta">... </span> set_start_method(<span class="hljs-string">&quot;spawn&quot;</span>)
<span class="hljs-meta">... </span> updated_dataset = dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-meta">... </span> gpu_computation,
<span class="hljs-meta">... </span> batched=<span class="hljs-literal">True</span>,
<span class="hljs-meta">... </span> batch_size=<span class="hljs-number">16</span>,
<span class="hljs-meta">... </span> with_rank=<span class="hljs-literal">True</span>,
<span class="hljs-meta">... </span> num_proc=torch.cuda.device_count(), <span class="hljs-comment"># one process per GPU</span>
<span class="hljs-meta">... </span> )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wopha6">The main use-case for rank is to parallelize computation across several GPUs. This requires setting <code>multiprocess.set_start_method(&quot;spawn&quot;)</code>. If you don’t you’ll receive the following CUDA error:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->RuntimeError: Cannot re-initialize CUDA <span class="hljs-keyword">in</span> forked subprocess. To use CUDA with multiprocessing, you must use the <span class="hljs-string">&#x27;spawn&#x27;</span> start method.<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="batch-processing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#batch-processing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Batch processing</span></h3> <p data-svelte-h="svelte-kjxph2">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> function supports working with batches of examples. Operate on batches by setting <code>batched=True</code>. The default batch size is 1000, but you can adjust it with the <code>batch_size</code> parameter. Batch processing enables interesting applications such as splitting long sentences into shorter chunks and data augmentation.</p> <h4 class="relative group"><a id="split-long-examples" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#split-long-examples"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Split long examples</span></h4> <p data-svelte-h="svelte-1k1qy5p">When examples are too long, you may want to split them into several smaller chunks. Begin by creating a function that:</p> <ol data-svelte-h="svelte-75f7q"><li><p>Splits the <code>sentence1</code> field into chunks of 50 characters.</p></li> <li><p>Stacks all the chunks together to create the new dataset.</p></li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">chunk_examples</span>(<span class="hljs-params">examples</span>):
<span class="hljs-meta">... </span> chunks = []
<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> sentence <span class="hljs-keyword">in</span> examples[<span class="hljs-string">&quot;sentence1&quot;</span>]:
<span class="hljs-meta">... </span> chunks += [sentence[i:i + <span class="hljs-number">50</span>] <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">0</span>, <span class="hljs-built_in">len</span>(sentence), <span class="hljs-number">50</span>)]
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> {<span class="hljs-string">&quot;chunks&quot;</span>: chunks}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dk5cag">Apply the function with <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>chunked_dataset = dataset.<span class="hljs-built_in">map</span>(chunk_examples, batched=<span class="hljs-literal">True</span>, remove_columns=dataset.column_names)
<span class="hljs-meta">&gt;&gt;&gt; </span>chunked_dataset[:<span class="hljs-number">10</span>]
{<span class="hljs-string">&#x27;chunks&#x27;</span>: [<span class="hljs-string">&#x27;Amrozi accused his brother , whom he called &quot; the &#x27;</span>,
<span class="hljs-string">&#x27;witness &quot; , of deliberately distorting his evidenc&#x27;</span>,
<span class="hljs-string">&#x27;e .&#x27;</span>,
<span class="hljs-string">&quot;Yucaipa owned Dominick &#x27;s before selling the chain&quot;</span>,
<span class="hljs-string">&#x27; to Safeway in 1998 for $ 2.5 billion .&#x27;</span>,
<span class="hljs-string">&#x27;They had published an advertisement on the Interne&#x27;</span>,
<span class="hljs-string">&#x27;t on June 10 , offering the cargo for sale , he ad&#x27;</span>,
<span class="hljs-string">&#x27;ded .&#x27;</span>,
<span class="hljs-string">&#x27;Around 0335 GMT , Tab shares were up 19 cents , or&#x27;</span>,
<span class="hljs-string">&#x27; 4.4 % , at A $ 4.56 , having earlier set a record&#x27;</span>]}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-cmzler">Notice how the sentences are split into shorter chunks now, and there are more rows in the dataset.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;sentence1&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>, <span class="hljs-string">&#x27;idx&#x27;</span>],
num_rows: <span class="hljs-number">3668</span>
})
<span class="hljs-meta">&gt;&gt;&gt; </span>chunked_dataset
Dataset({
features: [<span class="hljs-string">&#x27;chunks&#x27;</span>],
num_rows: <span class="hljs-number">10470</span>
})<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="data-augmentation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#data-augmentation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Data augmentation</span></h4> <p data-svelte-h="svelte-1qjc7zi">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> function could also be used for data augmentation. The following example generates additional words for a masked token in a sentence.</p> <p data-svelte-h="svelte-2g3b8l">Load and use the <a href="https://huggingface.co/roberta-base" rel="nofollow">RoBERTA</a> model in 🤗 Transformers’ <a href="https://huggingface.co/transformers/main_classes/pipelines#transformers.FillMaskPipeline" rel="nofollow">FillMaskPipeline</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> random <span class="hljs-keyword">import</span> randint
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
<span class="hljs-meta">&gt;&gt;&gt; </span>fillmask = pipeline(<span class="hljs-string">&quot;fill-mask&quot;</span>, model=<span class="hljs-string">&quot;roberta-base&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>mask_token = fillmask.tokenizer.mask_token
<span class="hljs-meta">&gt;&gt;&gt; </span>smaller_dataset = dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> e, i: i&lt;<span class="hljs-number">100</span>, with_indices=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17o5i9a">Create a function to randomly select a word to mask in the sentence. The function should also return the original sentence and the top two replacements generated by RoBERTA.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">augment_data</span>(<span class="hljs-params">examples</span>):
<span class="hljs-meta">... </span> outputs = []
<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> sentence <span class="hljs-keyword">in</span> examples[<span class="hljs-string">&quot;sentence1&quot;</span>]:
<span class="hljs-meta">... </span> words = sentence.split(<span class="hljs-string">&#x27; &#x27;</span>)
<span class="hljs-meta">... </span> K = randint(<span class="hljs-number">1</span>, <span class="hljs-built_in">len</span>(words)-<span class="hljs-number">1</span>)
<span class="hljs-meta">... </span> masked_sentence = <span class="hljs-string">&quot; &quot;</span>.join(words[:K] + [mask_token] + words[K+<span class="hljs-number">1</span>:])
<span class="hljs-meta">... </span> predictions = fillmask(masked_sentence)
<span class="hljs-meta">... </span> augmented_sequences = [predictions[i][<span class="hljs-string">&quot;sequence&quot;</span>] <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>)]
<span class="hljs-meta">... </span> outputs += [sentence] + augmented_sequences
...
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> {<span class="hljs-string">&quot;data&quot;</span>: outputs}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-571s9g">Use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> to apply the function over the whole dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>augmented_dataset = smaller_dataset.<span class="hljs-built_in">map</span>(augment_data, batched=<span class="hljs-literal">True</span>, remove_columns=dataset.column_names, batch_size=<span class="hljs-number">8</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>augmented_dataset[:<span class="hljs-number">9</span>][<span class="hljs-string">&quot;data&quot;</span>]
[<span class="hljs-string">&#x27;Amrozi accused his brother , whom he called &quot; the witness &quot; , of deliberately distorting his evidence .&#x27;</span>,
<span class="hljs-string">&#x27;Amrozi accused his brother, whom he called &quot; the witness &quot;, of deliberately withholding his evidence.&#x27;</span>,
<span class="hljs-string">&#x27;Amrozi accused his brother, whom he called &quot; the witness &quot;, of deliberately suppressing his evidence.&#x27;</span>,
<span class="hljs-string">&#x27;Amrozi accused his brother, whom he called &quot; the witness &quot;, of deliberately destroying his evidence.&#x27;</span>,
<span class="hljs-string">&quot;Yucaipa owned Dominick &#x27;s before selling the chain to Safeway in 1998 for $ 2.5 billion .&quot;</span>,
<span class="hljs-string">&#x27;Yucaipa owned Dominick Stores before selling the chain to Safeway in 1998 for $ 2.5 billion.&#x27;</span>,
<span class="hljs-string">&quot;Yucaipa owned Dominick&#x27;s before selling the chain to Safeway in 1998 for $ 2.5 billion.&quot;</span>,
<span class="hljs-string">&#x27;Yucaipa owned Dominick Pizza before selling the chain to Safeway in 1998 for $ 2.5 billion.&#x27;</span>
]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ww14wc">For each original sentence, RoBERTA augmented a random word with three alternatives. The original word <code>distorting</code> is supplemented by <code>withholding</code>, <code>suppressing</code>, and <code>destroying</code>.</p> <h3 class="relative group"><a id="asynchronous-processing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#asynchronous-processing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Asynchronous processing</span></h3> <p data-svelte-h="svelte-z6wstw">Asynchronous functions are useful to call API endpoints in parallel, for example to download content like images or call a model endpoint.</p> <p data-svelte-h="svelte-wyxl4p">You can define an asynchronous function using the <code>async</code> and <code>await</code> keywords, here is an example function to call a chat model from Hugging Face:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> aiohttp
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> asyncio
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> get_token
<span class="hljs-meta">&gt;&gt;&gt; </span>sem = asyncio.Semaphore(<span class="hljs-number">20</span>) <span class="hljs-comment"># max number of simultaneous queries</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">query_model</span>(<span class="hljs-params">model, prompt</span>):
<span class="hljs-meta">... </span> api_url = <span class="hljs-string">f&quot;https://api-inference.huggingface.co/models/<span class="hljs-subst">{model}</span>/v1/chat/completions&quot;</span>
<span class="hljs-meta">... </span> headers = {<span class="hljs-string">&quot;Authorization&quot;</span>: <span class="hljs-string">f&quot;Bearer <span class="hljs-subst">{get_token()}</span>&quot;</span>, <span class="hljs-string">&quot;Content-Type&quot;</span>: <span class="hljs-string">&quot;application/json&quot;</span>}
<span class="hljs-meta">... </span> json = {<span class="hljs-string">&quot;messages&quot;</span>: [{<span class="hljs-string">&quot;role&quot;</span>: <span class="hljs-string">&quot;user&quot;</span>, <span class="hljs-string">&quot;content&quot;</span>: prompt}], <span class="hljs-string">&quot;max_tokens&quot;</span>: <span class="hljs-number">20</span>, <span class="hljs-string">&quot;seed&quot;</span>: <span class="hljs-number">42</span>}
<span class="hljs-meta">... </span> <span class="hljs-keyword">async</span> <span class="hljs-keyword">with</span> sem, aiohttp.ClientSession() <span class="hljs-keyword">as</span> session, session.post(api_url, headers=headers, json=json) <span class="hljs-keyword">as</span> response:
<span class="hljs-meta">... </span> output = <span class="hljs-keyword">await</span> response.json()
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> {<span class="hljs-string">&quot;Output&quot;</span>: output[<span class="hljs-string">&quot;choices&quot;</span>][<span class="hljs-number">0</span>][<span class="hljs-string">&quot;message&quot;</span>][<span class="hljs-string">&quot;content&quot;</span>]}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cp101t">Asynchronous functions run in parallel, which accelerates the process a lot. The same code takes a lot more time if it’s run sequentially, because it does nothing while waiting for the model response. It is generally recommended to use <code>async</code> / <code>await</code> when you function has to wait for a response from an API for example, or if it downloads data and it can take some time.</p> <p data-svelte-h="svelte-5qbh2n">Note the presence of a <code>Semaphore</code>: it sets the maximum number of queries that can run in parallel. It is recommended to use a <code>Semaphore</code> when calling APIs to avoid rate limit errors.</p> <p data-svelte-h="svelte-2maj4">Let’s use it to call the <a href="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct" rel="nofollow">microsoft/Phi-3-mini-4k-instruct</a> model and ask it to return the main topic of each math problem in the <a href="https://huggingface.co/Maxwell-Jia/AIME_2024" rel="nofollow">Maxwell-Jia/AIME_2024</a> dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>ds = load_dataset(<span class="hljs-string">&quot;Maxwell-Jia/AIME_2024&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = <span class="hljs-string">&quot;microsoft/Phi-3-mini-4k-instruct&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>prompt = <span class="hljs-string">&#x27;What is this text mainly about ? Here is the text:\n\n```\n{Problem}\n```\n\nReply using one or two words max, e.g. &quot;The main topic is Linear Algebra&quot;.&#x27;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">get_topic</span>(<span class="hljs-params">example</span>):
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> <span class="hljs-keyword">await</span> query_model(model, prompt.<span class="hljs-built_in">format</span>(Problem=example[<span class="hljs-string">&#x27;Problem&#x27;</span>]))
<span class="hljs-meta">&gt;&gt;&gt; </span>ds = ds.<span class="hljs-built_in">map</span>(get_topic)
<span class="hljs-meta">&gt;&gt;&gt; </span>ds[<span class="hljs-number">0</span>]
{<span class="hljs-string">&#x27;ID&#x27;</span>: <span class="hljs-string">&#x27;2024-II-4&#x27;</span>,
<span class="hljs-string">&#x27;Problem&#x27;</span>: <span class="hljs-string">&#x27;Let {@html "<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>x</mi><mo separator="true">,</mo><mi>y</mi></mrow><annotation encoding="application/x-tex">x,y</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.625em;vertical-align:-0.1944em;"></span><span class="mord mathnormal">x</span><span class="mpunct">,</span><span class="mspace" style="margin-right:0.1667em;"></span><span class="mord mathnormal" style="margin-right:0.03588em;">y</span></span></span></span>"} and {@html "<span class="katex"><span class="katex-mathml"><math xmlns="http://www.w3.org/1998/Math/MathML"><semantics><mrow><mi>z</mi></mrow><annotation encoding="application/x-tex">z</annotation></semantics></math></span><span class="katex-html" aria-hidden="true"><span class="base"><span class="strut" style="height:0.4306em;"></span><span class="mord mathnormal" style="margin-right:0.04398em;">z</span></span></span></span>"} be positive real numbers that...&#x27;</span>,
<span class="hljs-string">&#x27;Solution&#x27;</span>: <span class="hljs-string">&#x27;Denote $\\log_2(x) = a$, $\\log_2(y) = b$, and...,
&#x27;</span>Answe<span class="hljs-string">r&#x27;: 33,
&#x27;</span>Output<span class="hljs-string">&#x27;: &#x27;</span>The main topic <span class="hljs-keyword">is</span> Logarithms.<span class="hljs-string">&#x27;}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e6bels">Here, <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">Dataset.map()</a> runs many <code>get_topic</code> function asynchronously so it doesn’t have to wait for every single model response which would take a lot of time to do sequentially.</p> <p data-svelte-h="svelte-3ovtq0">By default, <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">Dataset.map()</a> runs up to one thousand map functions in parallel, so don’t forget to set the maximum number of API calls that can run in parallel with a <code>Semaphore</code>, otherwise the model could return rate limit errors or overload. For advanced use cases, you can change the maximum number of queries in parallel in <code>datasets.config</code>.</p> <h3 class="relative group"><a id="process-multiple-splits" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#process-multiple-splits"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Process multiple splits</span></h3> <p data-svelte-h="svelte-1ehaad6">Many datasets have splits that can be processed simultaneously with <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.DatasetDict.map">DatasetDict.map()</a>. For example, tokenize the <code>sentence1</code> field in the train and test split by:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-comment"># load all the splits</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&#x27;nyu-mll/glue&#x27;</span>, <span class="hljs-string">&#x27;mrpc&#x27;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>encoded_dataset = dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> examples: tokenizer(examples[<span class="hljs-string">&quot;sentence1&quot;</span>]), batched=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>encoded_dataset[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">0</span>]
{<span class="hljs-string">&#x27;sentence1&#x27;</span>: <span class="hljs-string">&#x27;Amrozi accused his brother , whom he called &quot; the witness &quot; , of deliberately distorting his evidence .&#x27;</span>,
<span class="hljs-string">&#x27;sentence2&#x27;</span>: <span class="hljs-string">&#x27;Referring to him as only &quot; the witness &quot; , Amrozi accused his brother of deliberately distorting his evidence .&#x27;</span>,
<span class="hljs-string">&#x27;label&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;idx&#x27;</span>: <span class="hljs-number">0</span>,
<span class="hljs-string">&#x27;input_ids&#x27;</span>: [ <span class="hljs-number">101</span>, <span class="hljs-number">7277</span>, <span class="hljs-number">2180</span>, <span class="hljs-number">5303</span>, <span class="hljs-number">4806</span>, <span class="hljs-number">1117</span>, <span class="hljs-number">1711</span>, <span class="hljs-number">117</span>, <span class="hljs-number">2292</span>, <span class="hljs-number">1119</span>, <span class="hljs-number">1270</span>, <span class="hljs-number">107</span>, <span class="hljs-number">1103</span>, <span class="hljs-number">7737</span>, <span class="hljs-number">107</span>, <span class="hljs-number">117</span>, <span class="hljs-number">1104</span>, <span class="hljs-number">9938</span>, <span class="hljs-number">4267</span>, <span class="hljs-number">12223</span>, <span class="hljs-number">21811</span>, <span class="hljs-number">1117</span>, <span class="hljs-number">2554</span>, <span class="hljs-number">119</span>, <span class="hljs-number">102</span>],
<span class="hljs-string">&#x27;token_type_ids&#x27;</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>],
<span class="hljs-string">&#x27;attention_mask&#x27;</span>: [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]
}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="distributed-usage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#distributed-usage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Distributed usage</span></h3> <p data-svelte-h="svelte-fswvxq">When you use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> in a distributed setting, you should also use <a href="https://pytorch.org/docs/stable/distributed?highlight=barrier#torch.distributed.barrier" rel="nofollow">torch.distributed.barrier</a>. This ensures the main process performs the mapping, while the other processes load the results, thereby avoiding duplicate work.</p> <p data-svelte-h="svelte-1jo5sei">The following example shows how you can use <code>torch.distributed.barrier</code> to synchronize the processes:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch.distributed
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset1 = Dataset.from_dict({<span class="hljs-string">&quot;a&quot;</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">if</span> training_args.local_rank &gt; <span class="hljs-number">0</span>:
<span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Waiting for main process to perform the mapping&quot;</span>)
<span class="hljs-meta">... </span> torch.distributed.barrier()
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset2 = dataset1.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;a&quot;</span>: x[<span class="hljs-string">&quot;a&quot;</span>] + <span class="hljs-number">1</span>})
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">if</span> training_args.local_rank == <span class="hljs-number">0</span>:
<span class="hljs-meta">... </span> <span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Loading results from main process&quot;</span>)
<span class="hljs-meta">... </span> torch.distributed.barrier()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="batch" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#batch"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Batch</span></h2> <p data-svelte-h="svelte-1grzzpy">The <code>batch()</code> method allows you to group samples from the dataset into batches. This is particularly useful when you want to create batches of data for training or evaluation, especially when working with deep learning models.</p> <p data-svelte-h="svelte-11s6yeq">Here’s an example of how to use the <code>batch()</code> method:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;cornell-movie-review-data/rotten_tomatoes&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>batched_dataset = dataset.batch(batch_size=<span class="hljs-number">4</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>batched_dataset[<span class="hljs-number">0</span>]
{<span class="hljs-string">&#x27;text&#x27;</span>: [<span class="hljs-string">&#x27;the rock is destined to be the 21st century\&#x27;s new &quot; conan &quot; and that he\&#x27;s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .&#x27;</span>,
<span class="hljs-string">&#x27;the gorgeously elaborate continuation of &quot; the lord of the rings &quot; trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\&#x27;s expanded vision of j . r . r . tolkien\&#x27;s middle-earth .&#x27;</span>,
<span class="hljs-string">&#x27;effective but too-tepid biopic&#x27;</span>,
<span class="hljs-string">&#x27;if you sometimes like to go to the movies to have fun , wasabi is a good place to start .&#x27;</span>],
<span class="hljs-string">&#x27;label&#x27;</span>: [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dww357">The <code>batch()</code> method accepts the following parameters:</p> <ul data-svelte-h="svelte-1ip99b0"><li><code>batch_size</code> (<code>int</code>): The number of samples in each batch.</li> <li><code>drop_last_batch</code> (<code>bool</code>, defaults to <code>False</code>): Whether to drop the last incomplete batch if the dataset size is not divisible by the batch size.</li> <li><code>num_proc</code> (<code>int</code>, optional, defaults to <code>None</code>): The number of processes to use for multiprocessing. If None, no multiprocessing is used. This can significantly speed up batching for large datasets.</li></ul> <p data-svelte-h="svelte-7ty3ma">Note that <code>Dataset.batch()</code> returns a new <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> where each item is a batch of multiple samples from the original dataset. If you want to process data in batches, you should use a batched <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> directly, which applies a function to batches but the output dataset is unbatched.</p> <h2 class="relative group"><a id="concatenate" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#concatenate"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Concatenate</span></h2> <p data-svelte-h="svelte-1up79vz">Separate datasets can be concatenated if they share the same column types. Concatenate datasets with <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.concatenate_datasets">concatenate_datasets()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> concatenate_datasets, load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>stories = load_dataset(<span class="hljs-string">&quot;ajibawa-2023/General-Stories-Collection&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>stories = stories.select_columns([<span class="hljs-string">&quot;text&quot;</span>]) <span class="hljs-comment"># only keep the &#x27;text&#x27; column</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>wiki = load_dataset(<span class="hljs-string">&quot;wikimedia/wikipedia&quot;</span>, <span class="hljs-string">&quot;20231101.en&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>wiki = wiki.select_columns([<span class="hljs-string">&quot;text&quot;</span>]) <span class="hljs-comment"># only keep the &#x27;text&#x27; column</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">assert</span> stories.features.<span class="hljs-built_in">type</span> == wiki.features.<span class="hljs-built_in">type</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>bert_dataset = concatenate_datasets([stories, wiki])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-qilcw">You can also concatenate two datasets horizontally by setting <code>axis=1</code> as long as the datasets have the same number of rows:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>stories_ids = Dataset.from_dict({<span class="hljs-string">&quot;ids&quot;</span>: <span class="hljs-built_in">list</span>(<span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(stories)))})
<span class="hljs-meta">&gt;&gt;&gt; </span>stories_with_ids = concatenate_datasets([stories, stories_ids], axis=<span class="hljs-number">1</span>)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="interleave" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#interleave"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Interleave</span></h3> <p data-svelte-h="svelte-16o3y8p">You can also mix several datasets together by taking alternating examples from each one to create a new dataset. This is known as <em>interleaving</em>, which is enabled by the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.interleave_datasets">interleave_datasets()</a> function. Both <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.interleave_datasets">interleave_datasets()</a> and <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.concatenate_datasets">concatenate_datasets()</a> work with regular <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> objects.
Refer to the <a href="./stream#interleave">Stream</a> guide for an example of how to interleave <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> objects.</p> <p data-svelte-h="svelte-1k6sf3c">You can define sampling probabilities for each of the original datasets to specify how to interleave the datasets.
In this case, the new dataset is constructed by getting examples one by one from a random dataset until one of the datasets runs out of samples.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset, interleave_datasets
<span class="hljs-meta">&gt;&gt;&gt; </span>seed = <span class="hljs-number">42</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>probabilities = [<span class="hljs-number">0.3</span>, <span class="hljs-number">0.5</span>, <span class="hljs-number">0.2</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>d1 = Dataset.from_dict({<span class="hljs-string">&quot;a&quot;</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span>d2 = Dataset.from_dict({<span class="hljs-string">&quot;a&quot;</span>: [<span class="hljs-number">10</span>, <span class="hljs-number">11</span>, <span class="hljs-number">12</span>, <span class="hljs-number">13</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span>d3 = Dataset.from_dict({<span class="hljs-string">&quot;a&quot;</span>: [<span class="hljs-number">20</span>, <span class="hljs-number">21</span>, <span class="hljs-number">22</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = interleave_datasets([d1, d2, d3], probabilities=probabilities, seed=seed)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset[<span class="hljs-string">&quot;a&quot;</span>]
[<span class="hljs-number">10</span>, <span class="hljs-number">11</span>, <span class="hljs-number">20</span>, <span class="hljs-number">12</span>, <span class="hljs-number">0</span>, <span class="hljs-number">21</span>, <span class="hljs-number">13</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1pw4uir">You can also specify the <code>stopping_strategy</code>. The default strategy, <code>first_exhausted</code>, is a subsampling strategy, i.e the dataset construction is stopped as soon one of the dataset runs out of samples.
You can specify <code>stopping_strategy=all_exhausted</code> to execute an oversampling strategy. In this case, the dataset construction is stopped as soon as every samples in every dataset has been added at least once. In practice, it means that if a dataset is exhausted, it will return to the beginning of this dataset until the stop criterion has been reached.
Note that if no sampling probabilities are specified, the new dataset will have <code>max_length_datasets*nb_dataset samples</code>.
There is also <code>stopping_strategy=all_exhausted_without_replacement</code> to ensure that every sample is seen exactly once.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>d1 = Dataset.from_dict({<span class="hljs-string">&quot;a&quot;</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span>d2 = Dataset.from_dict({<span class="hljs-string">&quot;a&quot;</span>: [<span class="hljs-number">10</span>, <span class="hljs-number">11</span>, <span class="hljs-number">12</span>, <span class="hljs-number">13</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span>d3 = Dataset.from_dict({<span class="hljs-string">&quot;a&quot;</span>: [<span class="hljs-number">20</span>, <span class="hljs-number">21</span>, <span class="hljs-number">22</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = interleave_datasets([d1, d2, d3], stopping_strategy=<span class="hljs-string">&quot;all_exhausted&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset[<span class="hljs-string">&quot;a&quot;</span>]
[<span class="hljs-number">0</span>, <span class="hljs-number">10</span>, <span class="hljs-number">20</span>, <span class="hljs-number">1</span>, <span class="hljs-number">11</span>, <span class="hljs-number">21</span>, <span class="hljs-number">2</span>, <span class="hljs-number">12</span>, <span class="hljs-number">22</span>, <span class="hljs-number">0</span>, <span class="hljs-number">13</span>, <span class="hljs-number">20</span>]<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Format</span></h2> <p data-svelte-h="svelte-1bp89id">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.with_format">with_format()</a> function changes the format of a column to be compatible with some common data formats. Specify the output you’d like in the <code>type</code> parameter. You can also choose which the columns you want to format using <code>columns=</code>. Formatting is applied on-the-fly.</p> <p data-svelte-h="svelte-n6dynl">For example, create PyTorch tensors by setting <code>type=&quot;torch&quot;</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.with_format(<span class="hljs-built_in">type</span>=<span class="hljs-string">&quot;torch&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e46zjv">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.set_format">set_format()</a> function also changes the format of a column, except it runs in-place:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset.set_format(<span class="hljs-built_in">type</span>=<span class="hljs-string">&quot;torch&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n725fu">If you need to reset the dataset to its original format, set the format to <code>None</code> (or use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.reset_format">reset_format()</a>):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset.<span class="hljs-built_in">format</span>
{<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-string">&#x27;torch&#x27;</span>, <span class="hljs-string">&#x27;format_kwargs&#x27;</span>: {}, <span class="hljs-string">&#x27;columns&#x27;</span>: [...], <span class="hljs-string">&#x27;output_all_columns&#x27;</span>: <span class="hljs-literal">False</span>}
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.with_format(<span class="hljs-literal">None</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset.<span class="hljs-built_in">format</span>
{<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-literal">None</span>, <span class="hljs-string">&#x27;format_kwargs&#x27;</span>: {}, <span class="hljs-string">&#x27;columns&#x27;</span>: [...], <span class="hljs-string">&#x27;output_all_columns&#x27;</span>: <span class="hljs-literal">False</span>}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="tensors-formats" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tensors-formats"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tensors formats</span></h3> <p data-svelte-h="svelte-olkdez">Several tensors or arrays formats are supported. It is generally recommended to use these formats instead of converting outputs of a dataset to tensors or arrays manually to avoid unnecessary data copies and accelerate data loading.</p> <p data-svelte-h="svelte-18l19i7">Here is the list of supported tensors or arrays formats:</p> <ul data-svelte-h="svelte-1egnru7"><li>NumPy: format name is “numpy”, for more information see <a href="use_with_numpy">Using Datasets with NumPy</a></li> <li>PyTorch: format name is “torch”, for more information see <a href="use_with_pytorch">Using Datasets with PyTorch</a></li> <li>TensorFlow: format name is “tensorflow”, for more information see <a href="use_with_tensorflow">Using Datasets with TensorFlow</a></li> <li>JAX: format name is “jax”, for more information see <a href="use_with_jax">Using Datasets with JAX</a></li></ul> <blockquote class="tip" data-svelte-h="svelte-5o8ij8"><p>Check out the <a href="use_with_tensorflow#using-totfdataset">Using Datasets with TensorFlow</a> guide for more details on how to efficiently create a TensorFlow dataset.</p></blockquote> <p data-svelte-h="svelte-1gnjh2u">When a dataset is formatted in a tensor or array format, all the data are formatted as tensors or arrays (except unsupported types like strings for example for PyTorch):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>ds = Dataset.from_dict({<span class="hljs-string">&quot;text&quot;</span>: [<span class="hljs-string">&quot;foo&quot;</span>, <span class="hljs-string">&quot;bar&quot;</span>], <span class="hljs-string">&quot;tokens&quot;</span>: [[<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>], [<span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>]]})
<span class="hljs-meta">&gt;&gt;&gt; </span>ds = ds.with_format(<span class="hljs-string">&quot;torch&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>ds[<span class="hljs-number">0</span>]
{<span class="hljs-string">&#x27;text&#x27;</span>: <span class="hljs-string">&#x27;foo&#x27;</span>, <span class="hljs-string">&#x27;tokens&#x27;</span>: tensor([<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>])}
<span class="hljs-meta">&gt;&gt;&gt; </span>ds[:<span class="hljs-number">2</span>]
{<span class="hljs-string">&#x27;text&#x27;</span>: [<span class="hljs-string">&#x27;foo&#x27;</span>, <span class="hljs-string">&#x27;bar&#x27;</span>],
<span class="hljs-string">&#x27;tokens&#x27;</span>: tensor([[<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>],
[<span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>]])}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="tabular-formats" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tabular-formats"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tabular formats</span></h3> <p data-svelte-h="svelte-dti2j5">You can use a dataframes or tables format to optimize data loading and data processing, since they generally offer zero-copy operations and transforms written in low-level languages.</p> <p data-svelte-h="svelte-1bcs4mw">Here is the list of supported dataframes or tables formats:</p> <ul data-svelte-h="svelte-17rybhy"><li>Pandas: format name is “pandas”, for more information see <a href="use_with_pandas">Using Datasets with Pandas</a></li> <li>Polars: format name is “polars”, for more information see <a href="use_with_polars">Using Datasets with Polars</a></li> <li>PyArrow: format name is “arrow”, for more information see <a href="use_with_tensorflow">Using Datasets with PyArrow</a></li></ul> <p data-svelte-h="svelte-qmjsud">When a dataset is formatted in a dataframe or table format, every dataset row or batches of rows is formatted as a dataframe or table, and dataset colums are formatted as a series or array:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>ds = Dataset.from_dict({<span class="hljs-string">&quot;text&quot;</span>: [<span class="hljs-string">&quot;foo&quot;</span>, <span class="hljs-string">&quot;bar&quot;</span>], <span class="hljs-string">&quot;label&quot;</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span>ds = ds.with_format(<span class="hljs-string">&quot;pandas&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>ds[:<span class="hljs-number">2</span>]
text label
<span class="hljs-number">0</span> foo <span class="hljs-number">0</span>
<span class="hljs-number">1</span> bar <span class="hljs-number">1</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12p7evr">Those formats make it possible to iterate on the data faster by avoiding data copies, and also enable faster data processing in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">map()</a> or <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.filter">filter()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>ds = ds.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> df: df.assign(upper_text=df.text.<span class="hljs-built_in">str</span>.upper()), batched=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>ds[:<span class="hljs-number">2</span>]
text label upper_text
<span class="hljs-number">0</span> foo <span class="hljs-number">0</span> FOO
<span class="hljs-number">1</span> bar <span class="hljs-number">1</span> BAR<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="custom-format-transform" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#custom-format-transform"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Custom format transform</span></h3> <p data-svelte-h="svelte-ju6jze">The <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.with_transform">with_transform()</a> function applies a custom formatting transform on-the-fly. This function replaces any previously specified format. For example, you can use this function to tokenize and pad tokens on-the-fly. Tokenization is only applied when examples are accessed:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;bert-base-uncased&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">encode</span>(<span class="hljs-params">batch</span>):
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> tokenizer(batch[<span class="hljs-string">&quot;sentence1&quot;</span>], batch[<span class="hljs-string">&quot;sentence2&quot;</span>], padding=<span class="hljs-string">&quot;longest&quot;</span>, truncation=<span class="hljs-literal">True</span>, max_length=<span class="hljs-number">512</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = dataset.with_transform(encode)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset.<span class="hljs-built_in">format</span>
{<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-string">&#x27;custom&#x27;</span>, <span class="hljs-string">&#x27;format_kwargs&#x27;</span>: {<span class="hljs-string">&#x27;transform&#x27;</span>: &lt;function __main__.encode(batch)&gt;}, <span class="hljs-string">&#x27;columns&#x27;</span>: [<span class="hljs-string">&#x27;idx&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>, <span class="hljs-string">&#x27;sentence1&#x27;</span>, <span class="hljs-string">&#x27;sentence2&#x27;</span>], <span class="hljs-string">&#x27;output_all_columns&#x27;</span>: <span class="hljs-literal">False</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3gota5">There is also <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.set_transform">set_transform()</a> which does the same but runs in-place.</p> <p data-svelte-h="svelte-19jrjj">You can also use the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.with_transform">with_transform()</a> function for custom decoding on <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Features">Features</a>.</p> <p data-svelte-h="svelte-1fap6ri">The example below uses the <a href="http://pydub.com/" rel="nofollow"><code>pydub</code></a> package as an alternative to <code>torchcodec</code> decoding:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> numpy <span class="hljs-keyword">as</span> np
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> pydub <span class="hljs-keyword">import</span> AudioSegment
<span class="hljs-meta">&gt;&gt;&gt; </span>audio_dataset_amr = Dataset.from_dict({<span class="hljs-string">&quot;audio&quot;</span>: [<span class="hljs-string">&quot;audio_samples/audio.amr&quot;</span>]})
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">decode_audio_with_pydub</span>(<span class="hljs-params">batch, sampling_rate=<span class="hljs-number">16_000</span></span>):
<span class="hljs-meta">... </span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">pydub_decode_file</span>(<span class="hljs-params">audio_path</span>):
<span class="hljs-meta">... </span> sound = AudioSegment.from_file(audio_path)
<span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> sound.frame_rate != sampling_rate:
<span class="hljs-meta">... </span> sound = sound.set_frame_rate(sampling_rate)
<span class="hljs-meta">... </span> channel_sounds = sound.split_to_mono()
<span class="hljs-meta">... </span> samples = [s.get_array_of_samples() <span class="hljs-keyword">for</span> s <span class="hljs-keyword">in</span> channel_sounds]
<span class="hljs-meta">... </span> fp_arr = np.array(samples).T.astype(np.float32)
<span class="hljs-meta">... </span> fp_arr /= np.iinfo(samples[<span class="hljs-number">0</span>].typecode).<span class="hljs-built_in">max</span>
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> fp_arr
...
<span class="hljs-meta">... </span> batch[<span class="hljs-string">&quot;audio&quot;</span>] = [pydub_decode_file(audio_path) <span class="hljs-keyword">for</span> audio_path <span class="hljs-keyword">in</span> batch[<span class="hljs-string">&quot;audio&quot;</span>]]
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> batch
<span class="hljs-meta">&gt;&gt;&gt; </span>audio_dataset_amr.set_transform(decode_audio_with_pydub)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="save" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#save"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Save</span></h2> <p data-svelte-h="svelte-bcptzx">Once your dataset is ready, you can save it as a Hugging Face Dataset in Parquet format and reuse it later with <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>.</p> <p data-svelte-h="svelte-mf0efe">Save your dataset by providing the name of the dataset repository on Hugging Face you wish to save it to to <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.push_to_hub">push_to_hub()</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->encoded_dataset.push_to_hub(<span class="hljs-string">&quot;username/my_dataset&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1eaf4be">You can use multiple processes to upload it in parallel. This is especially useful if you want to speed up the process:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->dataset.push_to_hub(<span class="hljs-string">&quot;username/my_dataset&quot;</span>, num_proc=<span class="hljs-number">8</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-p98llm">Use the <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> function to reload the dataset (in streaming mode or not):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
reloaded_dataset = load_dataset(<span class="hljs-string">&quot;username/my_dataset&quot;</span>, streaming=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1b7jp0o">Alternatively, you can save it locally in Arrow format on disk. Compared to Parquet, Arrow is uncompressed which makes it much faster to reload which is great for local use on disk and ephemeral caching. But since it’s larger and with less metadata, it is slower to upload/download/query than Parquet and less suited for long term storage.</p> <p data-svelte-h="svelte-187fr7h">Use the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.save_to_disk">save_to_disk()</a> and <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_from_disk">load_from_disk()</a> function to reload the dataset from your disk:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>encoded_dataset.save_to_disk(<span class="hljs-string">&quot;path/of/my/dataset/directory&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># later</span>
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_from_disk
<span class="hljs-meta">&gt;&gt;&gt; </span>reloaded_dataset = load_from_disk(<span class="hljs-string">&quot;path/of/my/dataset/directory&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="export" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#export"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Export</span></h2> <p data-svelte-h="svelte-t9rz5l">🤗 Datasets supports exporting as well so you can work with your dataset in other applications. The following table shows currently supported file formats you can export to:</p> <table data-svelte-h="svelte-zvt9sv"><thead><tr><th>File type</th> <th>Export method</th></tr></thead> <tbody><tr><td>CSV</td> <td><a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.to_csv">Dataset.to_csv()</a></td></tr> <tr><td>JSON</td> <td><a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.to_json">Dataset.to_json()</a></td></tr> <tr><td>Parquet</td> <td><a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.to_parquet">Dataset.to_parquet()</a></td></tr> <tr><td>SQL</td> <td><a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.to_sql">Dataset.to_sql()</a></td></tr> <tr><td>In-memory Python object</td> <td><a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.to_pandas">Dataset.to_pandas()</a>, <code>Dataset.to_polars()</code> or <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.to_dict">Dataset.to_dict()</a></td></tr></tbody></table> <p data-svelte-h="svelte-1w51t97">For example, export your dataset to a CSV file like this:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>encoded_dataset.to_csv(<span class="hljs-string">&quot;path/of/my/dataset.csv&quot;</span>)<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/datasets/blob/main/docs/source/process.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1tcoqe3 = {
assets: "/docs/datasets/pr_8021/en",
base: "/docs/datasets/pr_8021/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/datasets/pr_8021/en/_app/immutable/entry/start.467c4c66.js"),
import("/docs/datasets/pr_8021/en/_app/immutable/entry/app.3b2ba720.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 39],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
202 kB
·
Xet hash:
6cf7ea3668bb095ff9c21225520602c4366e19fec7a38c0fea6508a5050450ff

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.