Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / datasets /main /en /about_dataset_load.html

rtrm

3 months ago

download

raw

26.4 kB

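	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Build and load&quot;,&quot;local&quot;:&quot;build-and-load&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;ELI5: load_dataset&quot;,&quot;local&quot;:&quot;eli5-loaddataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Building a dataset&quot;,&quot;local&quot;:&quot;building-a-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;BuilderConfig&quot;,&quot;local&quot;:&quot;datasets-builderconfig&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;DatasetBuilder&quot;,&quot;local&quot;:&quot;datasets-datasetbuilder&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Maintaining integrity&quot;,&quot;local&quot;:&quot;maintaining-integrity&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Security&quot;,&quot;local&quot;:&quot;security&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">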
	<link href="/docs/datasets/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/entry/start.4d44eea4.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/scheduler.bdbef820.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/singletons.36b689ad.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/index.8a885b74.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/paths.27092e28.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/entry/app.d83067e8.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/index.c0aea24a.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/nodes/0.bfb01985.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/nodes/5.e8e74ee3.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/Tip.31005f7d.js">
	<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/EditOnGithub.725ee0c1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Build and load&quot;,&quot;local&quot;:&quot;build-and-load&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;ELI5: load_dataset&quot;,&quot;local&quot;:&quot;eli5-loaddataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Building a dataset&quot;,&quot;local&quot;:&quot;building-a-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;BuilderConfig&quot;,&quot;local&quot;:&quot;datasets-builderconfig&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;DatasetBuilder&quot;,&quot;local&quot;:&quot;datasets-datasetbuilder&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Maintaining integrity&quot;,&quot;local&quot;:&quot;maintaining-integrity&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Security&quot;,&quot;local&quot;:&quot;security&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="build-and-load" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#build-and-load"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Build and load</span></h1> <p data-svelte-h="svelte-367w5m">Nearly every deep learning workflow begins with loading a dataset, which makes it one of the most important steps. 
With 🤗 Datasets, there are more than 900 datasets available to help you get started with your NLP task. All you have to do is call: <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> to take your first step. This function is a true workhorse in every sense because it builds and loads every dataset you use.</p> <h2 class="relative group"><a id="eli5-loaddataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#eli5-loaddataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ELI5: load_dataset</span></h2> <p data-svelte-h="svelte-1et77y">Let’s begin with a basic Explain Like I’m Five.</p> <p data-svelte-h="svelte-gersvb">A dataset is a directory that contains:</p> <ul data-svelte-h="svelte-mtskvn"><li>Some data files in generic formats (JSON, CSV, Parquet, text, etc.)</li> <li>A dataset card named <code>README.md</code> that contains documentation about the dataset as well as a YAML header to define the datasets tags and configurations</li> <li>An optional dataset script if it requires some code to read the data files. 
This is sometimes used to load files of specific formats and structures.</li></ul> <p data-svelte-h="svelte-23u8ym">The <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> function fetches the requested dataset locally or from the Hugging Face Hub.
	The Hub is a central repository where all the Hugging Face datasets and models are stored.</p> <p data-svelte-h="svelte-ifhv41">If the dataset only contains data files, then <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> automatically infers how to load the data files from their extensions (json, csv, parquet, txt, etc.).
	Under the hood, 🤗 Datasets will use an appropriate <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a> based on the data files format. There exists one builder per data file format in 🤗 Datasets:</p> <ul data-svelte-h="svelte-5tl7h6"><li><a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.packaged_modules.text.Text">datasets.packaged_modules.text.Text</a> for text</li> <li><a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.packaged_modules.csv.Csv">datasets.packaged_modules.csv.Csv</a> for CSV and TSV</li> <li><a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.packaged_modules.json.Json">datasets.packaged_modules.json.Json</a> for JSON and JSONL</li> <li><a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.packaged_modules.parquet.Parquet">datasets.packaged_modules.parquet.Parquet</a> for Parquet</li> <li><a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.packaged_modules.arrow.Arrow">datasets.packaged_modules.arrow.Arrow</a> for Arrow (streaming file format)</li> <li><a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.packaged_modules.sql.Sql">datasets.packaged_modules.sql.Sql</a> for SQL databases</li> <li><a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.packaged_modules.imagefolder.ImageFolder">datasets.packaged_modules.imagefolder.ImageFolder</a> for image folders</li> <li><a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.packaged_modules.audiofolder.AudioFolder">datasets.packaged_modules.audiofolder.AudioFolder</a> for audio folders</li></ul> <p data-svelte-h="svelte-1x5yq3m">If the dataset has a dataset script, then it downloads and imports it from the Hugging Face Hub.
	Code in the dataset script defines a custom <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a> with the dataset information (description, features, URL to the original files, etc.), and tells 🤗 Datasets how to generate and display examples from it.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-ktnbf3">Read the <a href="./upload_dataset">Share</a> section to learn more about how to share a dataset. This section also provides a step-by-step guide on how to write your own dataset loading script!</p></div> <p data-svelte-h="svelte-iduykv">🤗 Datasets downloads the dataset files from the original URL, generates the dataset and caches it in an Arrow table on your drive.
	If you’ve downloaded the dataset before, then 🤗 Datasets will reload it from the cache to save you the trouble of downloading it again.</p> <p data-svelte-h="svelte-1jl5e53">Now that you have a high-level understanding about how datasets are built, let’s take a closer look at the nuts and bolts of how all this works.</p> <h2 class="relative group"><a id="building-a-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#building-a-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Building a dataset</span></h2> <p data-svelte-h="svelte-1xtt8qc">When you load a dataset for the first time, 🤗 Datasets takes the raw data file and builds it into a table of rows and typed columns. 
There are two main classes responsible for building a dataset: <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> and <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a>.</p> <div class="flex justify-center" data-svelte-h="svelte-1adyav4"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/builderconfig.png"></div> <h3 class="relative group"><a id="datasets-builderconfig" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#datasets-builderconfig"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>BuilderConfig</span></h3> <p data-svelte-h="svelte-2i8852"><a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> is the configuration class of <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a>. 
The <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> contains the following basic attributes about a dataset:</p> <table data-svelte-h="svelte-fm8v6i"><thead><tr><th>Attribute</th> <th>Description</th></tr></thead> <tbody><tr><td><code>name</code></td> <td>Short name of the dataset.</td></tr> <tr><td><code>version</code></td> <td>Dataset version identifier.</td></tr> <tr><td><code>data_dir</code></td> <td>Stores the path to a local folder containing the data files.</td></tr> <tr><td><code>data_files</code></td> <td>Stores paths to local data files.</td></tr> <tr><td><code>description</code></td> <td>Description of the dataset.</td></tr></tbody></table> <p data-svelte-h="svelte-1ei6sfe">If you want to add additional attributes to your dataset such as the class labels, you can subclass the base <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> class. There are two ways to populate the attributes of a <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> class or subclass:</p> <ul data-svelte-h="svelte-n152ef"><li><p>Provide a list of predefined <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> class (or subclass) instances in the datasets <code>DatasetBuilder.BUILDER_CONFIGS()</code> attribute.</p></li> <li><p>When you call <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>, any keyword arguments that are not specific to the method will be used to set the associated attributes of the <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> class. 
This will override the predefined attributes if a specific configuration was selected.</p></li></ul> <p data-svelte-h="svelte-3kiu1v">You can also set the <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">DatasetBuilder.BUILDER_CONFIG_CLASS</a> to any custom subclass of <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a>.</p> <h3 class="relative group"><a id="datasets-datasetbuilder" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#datasets-datasetbuilder"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>DatasetBuilder</span></h3> <p data-svelte-h="svelte-n82b3l"><a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a> accesses all the attributes inside <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.BuilderConfig">BuilderConfig</a> to build the actual dataset.</p> <div class="flex justify-center" data-svelte-h="svelte-11mt2nw"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/datasetbuilder.png"></div> <p 
data-svelte-h="svelte-1taz20f">There are three main methods in <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a>:</p> <ol data-svelte-h="svelte-mvvbey"><li><p><code>DatasetBuilder._info()</code> is in charge of defining the dataset attributes. When you call <code>dataset.info</code>, 🤗 Datasets returns the information stored here. Likewise, the <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Features">Features</a> are also specified here. Remember, the <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Features">Features</a> are like the skeleton of the dataset. It provides the names and types of each column.</p></li> <li><p><code>DatasetBuilder._split_generators</code> downloads or retrieves the requested data files, organizes them into splits, and defines specific arguments for the generation process. This method has a <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DownloadManager">DownloadManager</a> that downloads files or fetches them from your local filesystem. Within the <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DownloadManager">DownloadManager</a>, there is a <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DownloadManager.download_and_extract">DownloadManager.download_and_extract()</a> method that accepts a dictionary of URLs to the original data files, and downloads the requested files. Accepted inputs include: a single URL or path, or a list/dictionary of URLs or paths. Any compressed file types like TAR, GZIP and ZIP archives will be automatically extracted.</p> <p>Once the files are downloaded, <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.SplitGenerator">SplitGenerator</a> organizes them into splits. 
The <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.SplitGenerator">SplitGenerator</a> contains the name of the split, and any keyword arguments that are provided to the <code>DatasetBuilder._generate_examples</code> method. The keyword arguments can be specific to each split, and typically comprise at least the local path to the data files for each split.</p></li> <li><p><code>DatasetBuilder._generate_examples</code> reads and parses the data files for a split. Then it yields dataset examples according to the format specified in the <code>features</code> from <code>DatasetBuilder._info()</code>. The input of <code>DatasetBuilder._generate_examples</code> is actually the <code>filepath</code> provided in the keyword arguments of the last method.</p> <p>The dataset is generated with a Python generator, which doesn’t load all the data in memory. As a result, the generator can handle large datasets. However, before the generated samples are flushed to the dataset file on disk, they are stored in an <code>ArrowWriter</code> buffer. This means the generated samples are written by batch. If your dataset samples consume a lot of memory (images or videos), then make sure to specify a low value for the <code>DEFAULT_WRITER_BATCH_SIZE</code> attribute in <a href="/docs/datasets/main/en/package_reference/builder_classes#datasets.DatasetBuilder">DatasetBuilder</a>. 
We recommend not exceeding a size of 200 MB.</p></li></ol> <h2 class="relative group"><a id="maintaining-integrity" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#maintaining-integrity"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Maintaining integrity</span></h2> <p data-svelte-h="svelte-rrj7xx">To ensure a dataset is complete, <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> will perform a series of tests on the downloaded files to make sure everything is there. This way, you don’t encounter any surprises when your requested dataset doesn’t get generated as expected. 
<a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a> verifies:</p> <ul data-svelte-h="svelte-g3c4q3"><li>The number of splits in the generated <code>DatasetDict</code>.</li> <li>The number of samples in each split of the generated <code>DatasetDict</code>.</li> <li>The list of downloaded files.</li> <li>The SHA256 checksums of the downloaded files (disabled by default).</li></ul> <p data-svelte-h="svelte-1m6pq65">If the dataset doesn’t pass the verifications, it is likely that the original host of the dataset made some changes in the data files.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1njk00f">If it is your own dataset, you’ll need to recompute the information above and update the <code>README.md</code> file in your dataset repository. Take a look at this <a href="dataset_script#optional-generate-dataset-metadata">section</a> to learn how to generate and update this metadata.</p></div> <p data-svelte-h="svelte-69oo7t">In this case, an error is raised to alert that the dataset has changed.
	To ignore the error, one needs to specify <code>verification_mode="no_checks"</code> in <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>.
	Anytime you see a verification error, feel free to open a discussion or pull request in the corresponding dataset “Community” tab, so that the integrity checks for that dataset are updated.</p> <h2 class="relative group"><a id="security" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#security"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Security</span></h2> <p data-svelte-h="svelte-p38c5d">The dataset repositories on the Hub are scanned for malware, see more information <a href="https://huggingface.co/docs/hub/security#malware-scanning" rel="nofollow">here</a>.</p> <p data-svelte-h="svelte-1qoxqaa">Moreover the datasets without a namespace (originally contributed on our GitHub repository) have all been reviewed by our maintainers.
	The code of these datasets is considered <strong>safe</strong>.
	It concerns datasets that are not under a namespace, e.g. “squad” or “glue”, unlike the other datasets that are named “username/dataset_name” or “org/dataset_name”.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/datasets/blob/main/docs/source/about_dataset_load.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	// Client bootstrap for this prerendered page: publishes the path configuration,
	// loads the two entry modules, and starts the app on this script's parent element.
	// The surrounding braces keep the `const` bindings below block-scoped.
	{
	// Assigned without a declaration; in a classic (non-module) script this creates
	// a property on the global object. NOTE(review): presumably read by the imported
	// bundles under this exact name — confirm before renaming.
	__sveltekit_w3org2 = {
	assets: "/docs/datasets/main/en",
	base: "/docs/datasets/main/en",
	env: {}
	};

	// Captured here, before the asynchronous imports below resolve; this is the
	// element that directly contains the <script> tag and is handed to kit.start().
	const element = document.currentScript.parentElement;

	// Two entries, matching the two ids in `node_ids` below.
	// NOTE(review): looks like per-node server data with nothing to pass — confirm.
	const data = [null,null];

	// Load both entry modules in parallel; start only once both have resolved.
	Promise.all([
	import("/docs/datasets/main/en/_app/immutable/entry/start.4d44eea4.js"),
	import("/docs/datasets/main/en/_app/immutable/entry/app.d83067e8.js")
	]).then(([kit, app]) => {
	// Arguments: the app module, the target element, and the initial page state.
	kit.start(app, element, {
	node_ids: [0, 5],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 26.4 kB
Xet hash:: 373d27a0c474e1df3c4df25df10741e698f4ef9efc5d462f35e9f517ec74bb1d

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.