Buckets:
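| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Build and load&quot;,&quot;local&quot;:&quot;build-and-load&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;ELI5: load_dataset&quot;,&quot;local&quot;:&quot;eli5-loaddataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Building a dataset&quot;,&quot;local&quot;:&quot;building-a-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;BuilderConfig&quot;,&quot;local&quot;:&quot;datasets-builderconfig&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;DatasetBuilder&quot;,&quot;local&quot;:&quot;datasets-datasetbuilder&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Maintaining integrity&quot;,&quot;local&quot;:&quot;maintaining-integrity&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Security&quot;,&quot;local&quot;:&quot;security&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |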
| <link href="/docs/datasets/pr_8021/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/entry/start.467c4c66.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/scheduler.d75c11ed.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/singletons.24e4ec1f.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/index.d12496d4.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/paths.409c1290.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/entry/app.3b2ba720.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/preload-helper.a99c0584.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/index.4ec9dfe9.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/nodes/0.5fda7065.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/nodes/5.64cd9f43.js"> | |
| <link rel="modulepreload" href="/docs/datasets/pr_8021/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Build and load","local":"build-and-load","sections":[{"title":"ELI5: load_dataset","local":"eli5-loaddataset","sections":[],"depth":2},{"title":"Building a dataset","local":"building-a-dataset","sections":[{"title":"BuilderConfig","local":"datasets-builderconfig","sections":[],"depth":3},{"title":"DatasetBuilder","local":"datasets-datasetbuilder","sections":[],"depth":3}],"depth":2},{"title":"Maintaining integrity","local":"maintaining-integrity","sections":[],"depth":2},{"title":"Security","local":"security","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy 
page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="build-and-load" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#build-and-load"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Build and load</span></h1> <p data-svelte-h="svelte-8n54nx">Nearly every deep learning workflow begins with loading a dataset, which makes it one of the most important steps. 
With 🤗 Datasets, there are more than 900 datasets available to help you get started with your NLP task. All you have to do is call: <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> to take your first step. This function is a true workhorse in every sense because it builds and loads every dataset you use.</p> <h2 class="relative group"><a id="eli5-loaddataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#eli5-loaddataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ELI5: load_dataset</span></h2> <p data-svelte-h="svelte-1et77y">Let’s begin with a basic Explain Like I’m Five.</p> <p data-svelte-h="svelte-gersvb">A dataset is a directory that contains:</p> <ul data-svelte-h="svelte-vbiom9"><li>Some data files in generic formats (JSON, CSV, Parquet, text, etc.)</li> <li>A dataset card named <code>README.md</code> that contains documentation about the dataset as well as a YAML header to define the datasets tags and configurations</li></ul> <p data-svelte-h="svelte-70lh2v">The <a 
href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> function fetches the requested dataset locally or from the Hugging Face Hub. | |
| The Hub is a central repository where all the Hugging Face datasets and models are stored.</p> <p data-svelte-h="svelte-f9tbpx">If the dataset only contains data files, then <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> automatically infers how to load the data files from their extensions (json, csv, parquet, txt, etc.). | |
| Under the hood, 🤗 Datasets will use an appropriate <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a> based on the data file format. There is one builder per data file format in 🤗 Datasets:</p> <ul data-svelte-h="svelte-1tlhev0"><li><a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.packaged_modules.text.Text">datasets.packaged_modules.text.Text</a> for text</li> <li><a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.packaged_modules.csv.Csv">datasets.packaged_modules.csv.Csv</a> for CSV and TSV</li> <li><a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.packaged_modules.json.Json">datasets.packaged_modules.json.Json</a> for JSON and JSONL</li> <li><a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.packaged_modules.parquet.Parquet">datasets.packaged_modules.parquet.Parquet</a> for Parquet</li> <li><a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.packaged_modules.arrow.Arrow">datasets.packaged_modules.arrow.Arrow</a> for Arrow (streaming file format)</li> <li><a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.packaged_modules.sql.Sql">datasets.packaged_modules.sql.Sql</a> for SQL databases</li> <li><a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.packaged_modules.imagefolder.ImageFolder">datasets.packaged_modules.imagefolder.ImageFolder</a> for image folders</li> <li><a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.packaged_modules.audiofolder.AudioFolder">datasets.packaged_modules.audiofolder.AudioFolder</a> for audio folders</li></ul> <blockquote class="tip" data-svelte-h="svelte-1ragf3s"><p>Read the <a href="./upload_dataset">Share</a> section to learn more about how to share a dataset.</p></blockquote> <p data-svelte-h="svelte-iduykv">🤗 Datasets downloads the dataset files from 
the original URL, generates the dataset and caches it in an Arrow table on your drive. | |
| If you’ve downloaded the dataset before, then 🤗 Datasets will reload it from the cache to save you the trouble of downloading it again.</p> <p data-svelte-h="svelte-1jl5e53">Now that you have a high-level understanding about how datasets are built, let’s take a closer look at the nuts and bolts of how all this works.</p> <h2 class="relative group"><a id="building-a-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Building a dataset</span></h2> <p data-svelte-h="svelte-1y1475o">When you load a dataset for the first time, 🤗 Datasets takes the raw data file and builds it into a table of rows and typed columns. 
There are two main classes responsible for building a dataset: <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> and <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a>.</p> <div class="flex justify-center" data-svelte-h="svelte-1adyav4"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/builderconfig.png"></div> <h3 class="relative group"><a id="datasets-builderconfig" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#datasets-builderconfig"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>BuilderConfig</span></h3> <p data-svelte-h="svelte-1utq1tj"><a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> is the configuration class of <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a>. 
The <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> contains the following basic attributes about a dataset:</p> <table data-svelte-h="svelte-fm8v6i"><thead><tr><th>Attribute</th> <th>Description</th></tr></thead> <tbody><tr><td><code>name</code></td> <td>Short name of the dataset.</td></tr> <tr><td><code>version</code></td> <td>Dataset version identifier.</td></tr> <tr><td><code>data_dir</code></td> <td>Stores the path to a local folder containing the data files.</td></tr> <tr><td><code>data_files</code></td> <td>Stores paths to local data files.</td></tr> <tr><td><code>description</code></td> <td>Description of the dataset.</td></tr></tbody></table> <p data-svelte-h="svelte-11eflli">If you want to add additional attributes to your dataset such as the class labels, you can subclass the base <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> class. There are two ways to populate the attributes of a <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> class or subclass:</p> <ul data-svelte-h="svelte-15fnxlq"><li><p>Provide a list of predefined <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> class (or subclass) instances in the dataset’s <code>DatasetBuilder.BUILDER_CONFIGS()</code> attribute.</p></li> <li><p>When you call <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>, any keyword arguments that are not specific to the method will be used to set the associated attributes of the <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> class. 
This will override the predefined attributes if a specific configuration was selected.</p></li></ul> <p data-svelte-h="svelte-1wjr0sd">You can also set the <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">DatasetBuilder.BUILDER_CONFIG_CLASS</a> to any custom subclass of <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a>.</p> <h3 class="relative group"><a id="datasets-datasetbuilder" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#datasets-datasetbuilder"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DatasetBuilder</span></h3> <p data-svelte-h="svelte-vin3sf"><a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a> accesses all the attributes inside <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> to build the actual dataset.</p> <div class="flex justify-center" data-svelte-h="svelte-11mt2nw"><img 
src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/datasetbuilder.png"></div> <p data-svelte-h="svelte-h1taeq">There are three main methods in <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a>:</p> <ol data-svelte-h="svelte-an1z4k"><li><p><code>DatasetBuilder._info()</code> is in charge of defining the dataset attributes. When you call <code>dataset.info</code>, 🤗 Datasets returns the information stored here. Likewise, the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Features">Features</a> are also specified here. Remember, the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Features">Features</a> are like the skeleton of the dataset. It provides the names and types of each column.</p></li> <li><p><code>DatasetBuilder._split_generators</code> downloads or retrieves the requested data files, organizes them into splits, and defines specific arguments for the generation process. This method has a <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DownloadManager">DownloadManager</a> that downloads files or fetches them from your local filesystem. Within the <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DownloadManager">DownloadManager</a>, there is a <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DownloadManager.download_and_extract">DownloadManager.download_and_extract()</a> method that accepts a dictionary of URLs to the original data files, and downloads the requested files. Accepted inputs include: a single URL or path, or a list/dictionary of URLs or paths. 
Any compressed file types like TAR, GZIP and ZIP archives will be automatically extracted.</p> <p>Once the files are downloaded, <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.SplitGenerator">SplitGenerator</a> organizes them into splits. The <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.SplitGenerator">SplitGenerator</a> contains the name of the split, and any keyword arguments that are provided to the <code>DatasetBuilder._generate_examples</code> method. The keyword arguments can be specific to each split, and typically comprise at least the local path to the data files for each split.</p></li> <li><p><code>DatasetBuilder._generate_examples</code> reads and parses the data files for a split. Then it yields dataset examples according to the format specified in the <code>features</code> from <code>DatasetBuilder._info()</code>. The input of <code>DatasetBuilder._generate_examples</code> is actually the <code>filepath</code> provided in the keyword arguments of the last method.</p> <p>The dataset is generated with a Python generator, which doesn’t load all the data in memory. As a result, the generator can handle large datasets. However, before the generated samples are flushed to the dataset file on disk, they are stored in an <code>ArrowWriter</code> buffer. This means the generated samples are written by batch. If your dataset samples consume a lot of memory (images or videos), then make sure to specify a low value for the <code>DEFAULT_WRITER_BATCH_SIZE</code> attribute in <a href="/docs/datasets/pr_8021/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a>. 
We recommend not exceeding a size of 200 MB.</p></li></ol> <h2 class="relative group"><a id="maintaining-integrity" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#maintaining-integrity"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Maintaining integrity</span></h2> <p data-svelte-h="svelte-l1qwol">To ensure a dataset is complete, <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> will perform a series of tests on the downloaded files to make sure everything is there. This way, you don’t encounter any surprises when your requested dataset doesn’t get generated as expected. 
<a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> verifies:</p> <ul data-svelte-h="svelte-1gxjcyf"><li>The number of splits in the generated <code>DatasetDict</code>.</li> <li>The number of samples in each split of the generated <code>DatasetDict</code>.</li> <li>The list of downloaded files.</li> <li>The SHA256 checksums of the downloaded files (disabled by default).</li></ul> <p data-svelte-h="svelte-ukpxjx">If the dataset doesn’t pass the verifications, it is likely that the dataset author made some changes in the data files.</p> <p data-svelte-h="svelte-qismhe">In this case, an error is raised to alert that the dataset has changed. | |
| To ignore the error, one needs to specify <code>verification_mode="no_checks"</code> in <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>. | |
| Anytime you see a verification error, feel free to open a discussion or pull request in the corresponding dataset “Community” tab, so that the integrity checks for that dataset are updated.</p> <h2 class="relative group"><a id="security" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#security"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Security</span></h2> <p data-svelte-h="svelte-p38c5d">The dataset repositories on the Hub are scanned for malware, see more information <a href="https://huggingface.co/docs/hub/security#malware-scanning" rel="nofollow">here</a>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/datasets/blob/main/docs/source/about_dataset_load.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path 
d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
<!-- SvelteKit bootstrap: assigns the __sveltekit_1tcoqe3 global (assets/base paths and an empty env), takes this script's parent element as the mount target, dynamically imports the start and app entry modules, and then calls kit.start(app, element, ...) with node_ids [0, 5] and no form/error state. -->
| <script> | |
| { | |
| __sveltekit_1tcoqe3 = { | |
| assets: "/docs/datasets/pr_8021/en", | |
| base: "/docs/datasets/pr_8021/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/datasets/pr_8021/en/_app/immutable/entry/start.467c4c66.js"), | |
| import("/docs/datasets/pr_8021/en/_app/immutable/entry/app.3b2ba720.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 5], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 26.9 kB
- Xet hash:
- abe0b1acc236462e98d8a1c6f3f65b2e83d409a46c136f1ba3044e9fbf8fdfb0
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.