Buckets:

hf-doc-build/doc-dev / datasets /main /en /load_hub.html
rtrm's picture
download
raw
30.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Load a dataset from the Hub&quot;,&quot;local&quot;:&quot;load-a-dataset-from-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Load a dataset&quot;,&quot;local&quot;:&quot;load-a-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Splits&quot;,&quot;local&quot;:&quot;splits&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Configurations&quot;,&quot;local&quot;:&quot;configurations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Remote code&quot;,&quot;local&quot;:&quot;remote-code&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/datasets/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/entry/start.4d44eea4.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/scheduler.bdbef820.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/singletons.36b689ad.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/index.8a885b74.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/paths.27092e28.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/entry/app.d83067e8.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/index.c0aea24a.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/nodes/0.bfb01985.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/nodes/27.3d44ca30.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/Tip.31005f7d.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/CodeBlock.6ccca92e.js">
<link rel="modulepreload" href="/docs/datasets/main/en/_app/immutable/chunks/EditOnGithub.725ee0c1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Load a dataset from the Hub&quot;,&quot;local&quot;:&quot;load-a-dataset-from-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Load a dataset&quot;,&quot;local&quot;:&quot;load-a-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Splits&quot;,&quot;local&quot;:&quot;splits&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Configurations&quot;,&quot;local&quot;:&quot;configurations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Remote code&quot;,&quot;local&quot;:&quot;remote-code&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="load-a-dataset-from-the-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#load-a-dataset-from-the-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Load a dataset from the Hub</span></h1> <p data-svelte-h="svelte-9qr6ee">Finding high-quality datasets that are reproducible and accessible can be difficult. One of 🤗 Datasets main goals is to provide a simple way to load a dataset of any format or type. The easiest way to get started is to discover an existing dataset on the <a href="https://huggingface.co/datasets" rel="nofollow">Hugging Face Hub</a> - a community-driven collection of datasets for tasks in NLP, computer vision, and audio - and use 🤗 Datasets to download and generate the dataset.</p> <p data-svelte-h="svelte-1nd3fhb">This tutorial uses the <a href="https://huggingface.co/datasets/rotten_tomatoes" rel="nofollow">rotten_tomatoes</a> and <a href="https://huggingface.co/datasets/PolyAI/minds14" rel="nofollow">MInDS-14</a> datasets, but feel free to load any dataset you want and follow along. Head over to the Hub now and find a dataset for your task!</p> <h2 class="relative group"><a id="load-a-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#load-a-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Load a dataset</span></h2> <p data-svelte-h="svelte-iyvhco">Before you take the time to download a dataset, it’s often helpful to quickly get some general information about a dataset. A dataset’s information is stored inside <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.DatasetInfo">DatasetInfo</a> and can include information such as the dataset description, features, and dataset size.</p> <p data-svelte-h="svelte-1p7opk8">Use the <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset_builder">load_dataset_builder()</a> function to load a dataset builder and inspect a dataset’s attributes without committing to downloading it:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset_builder
<span class="hljs-meta">&gt;&gt;&gt; </span>ds_builder = load_dataset_builder(<span class="hljs-string">&quot;rotten_tomatoes&quot;</span>)
<span class="hljs-comment"># Inspect dataset description</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>ds_builder.info.description
Movie Review Dataset. This <span class="hljs-keyword">is</span> a dataset of containing <span class="hljs-number">5</span>,<span class="hljs-number">331</span> positive <span class="hljs-keyword">and</span> <span class="hljs-number">5</span>,<span class="hljs-number">331</span> negative processed sentences <span class="hljs-keyword">from</span> Rotten Tomatoes movie reviews. This data was first used <span class="hljs-keyword">in</span> Bo Pang <span class="hljs-keyword">and</span> Lillian Lee, ``Seeing stars: Exploiting <span class="hljs-keyword">class</span> <span class="hljs-title class_">relationships</span> <span class="hljs-keyword">for</span> sentiment categorization <span class="hljs-keyword">with</span> respect to rating scales.<span class="hljs-string">&#x27;&#x27;</span>, Proceedings of the ACL, <span class="hljs-number">2005.</span>
<span class="hljs-comment"># Inspect dataset features</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>ds_builder.info.features
{<span class="hljs-string">&#x27;label&#x27;</span>: ClassLabel(num_classes=<span class="hljs-number">2</span>, names=[<span class="hljs-string">&#x27;neg&#x27;</span>, <span class="hljs-string">&#x27;pos&#x27;</span>], <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>),
<span class="hljs-string">&#x27;text&#x27;</span>: Value(dtype=<span class="hljs-string">&#x27;string&#x27;</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>)}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lqsd4u">If you’re happy with the dataset, then load it with <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;rotten_tomatoes&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="splits" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#splits"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Splits</span></h2> <p data-svelte-h="svelte-z32per">A split is a specific subset of a dataset like <code>train</code> and <code>test</code>. List a dataset’s split names with the <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.get_dataset_split_names">get_dataset_split_names()</a> function:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> get_dataset_split_names
<span class="hljs-meta">&gt;&gt;&gt; </span>get_dataset_split_names(<span class="hljs-string">&quot;rotten_tomatoes&quot;</span>)
[<span class="hljs-string">&#x27;train&#x27;</span>, <span class="hljs-string">&#x27;validation&#x27;</span>, <span class="hljs-string">&#x27;test&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1i9yshe">Then you can load a specific split with the <code>split</code> parameter. Loading a dataset <code>split</code> returns a <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> object:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;rotten_tomatoes&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset
Dataset({
features: [<span class="hljs-string">&#x27;text&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>],
num_rows: <span class="hljs-number">8530</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gkubwl">If you don’t specify a <code>split</code>, 🤗 Datasets returns a <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.DatasetDict">DatasetDict</a> object instead:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;rotten_tomatoes&quot;</span>)
DatasetDict({
train: Dataset({
features: [<span class="hljs-string">&#x27;text&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>],
num_rows: <span class="hljs-number">8530</span>
})
validation: Dataset({
features: [<span class="hljs-string">&#x27;text&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>],
num_rows: <span class="hljs-number">1066</span>
})
test: Dataset({
features: [<span class="hljs-string">&#x27;text&#x27;</span>, <span class="hljs-string">&#x27;label&#x27;</span>],
num_rows: <span class="hljs-number">1066</span>
})
})<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="configurations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#configurations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Configurations</span></h2> <p data-svelte-h="svelte-2wgw8k">Some datasets contain several sub-datasets. For example, the <a href="https://huggingface.co/datasets/PolyAI/minds14" rel="nofollow">MInDS-14</a> dataset has several sub-datasets, each one containing audio data in a different language. These sub-datasets are known as <em>configurations</em> or <em>subsets</em>, and you must explicitly select one when loading the dataset. If you don’t provide a configuration name, 🤗 Datasets will raise a <code>ValueError</code> and remind you to choose a configuration.</p> <p data-svelte-h="svelte-1r3luk1">Use the <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.get_dataset_config_names">get_dataset_config_names()</a> function to retrieve a list of all the possible configurations available to your dataset:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> get_dataset_config_names
<span class="hljs-meta">&gt;&gt;&gt; </span>configs = get_dataset_config_names(<span class="hljs-string">&quot;PolyAI/minds14&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(configs)
[<span class="hljs-string">&#x27;cs-CZ&#x27;</span>, <span class="hljs-string">&#x27;de-DE&#x27;</span>, <span class="hljs-string">&#x27;en-AU&#x27;</span>, <span class="hljs-string">&#x27;en-GB&#x27;</span>, <span class="hljs-string">&#x27;en-US&#x27;</span>, <span class="hljs-string">&#x27;es-ES&#x27;</span>, <span class="hljs-string">&#x27;fr-FR&#x27;</span>, <span class="hljs-string">&#x27;it-IT&#x27;</span>, <span class="hljs-string">&#x27;ko-KR&#x27;</span>, <span class="hljs-string">&#x27;nl-NL&#x27;</span>, <span class="hljs-string">&#x27;pl-PL&#x27;</span>, <span class="hljs-string">&#x27;pt-PT&#x27;</span>, <span class="hljs-string">&#x27;ru-RU&#x27;</span>, <span class="hljs-string">&#x27;zh-CN&#x27;</span>, <span class="hljs-string">&#x27;all&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-avfjb">Then load the configuration you want:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>mindsFR = load_dataset(<span class="hljs-string">&quot;PolyAI/minds14&quot;</span>, <span class="hljs-string">&quot;fr-FR&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="remote-code" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#remote-code"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Remote code</span></h2> <p data-svelte-h="svelte-ffeuwm">Certain datasets repositories contain a loading script with the Python code used to generate the dataset. All files and code uploaded to the Hub are scanned for malware (refer to the Hub security documentation for more information), but you should still review the dataset loading scripts and authors to avoid executing malicious code on your machine. You should set <code>trust_remote_code=True</code> to use a dataset with a loading script, or you will get an error:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> get_dataset_config_names, get_dataset_split_names, load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>c4 = load_dataset(<span class="hljs-string">&quot;c4&quot;</span>, <span class="hljs-string">&quot;en&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>, trust_remote_code=<span class="hljs-literal">True</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>get_dataset_config_names(<span class="hljs-string">&quot;c4&quot;</span>, trust_remote_code=<span class="hljs-literal">True</span>)
[<span class="hljs-string">&#x27;en&#x27;</span>, <span class="hljs-string">&#x27;realnewslike&#x27;</span>, <span class="hljs-string">&#x27;en.noblocklist&#x27;</span>, <span class="hljs-string">&#x27;en.noclean&#x27;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>get_dataset_split_names(<span class="hljs-string">&quot;c4&quot;</span>, <span class="hljs-string">&quot;en&quot;</span>, trust_remote_code=<span class="hljs-literal">True</span>)
[<span class="hljs-string">&#x27;train&#x27;</span>, <span class="hljs-string">&#x27;validation&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-1adalis">For security reasons, 🤗 Datasets do not allow running dataset loading scripts by default, and you have to pass <code>trust_remote_code=True</code> to load datasets that require running a dataset script.</p></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/datasets/blob/main/docs/source/load_hub.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_w3org2 = {
assets: "/docs/datasets/main/en",
base: "/docs/datasets/main/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/datasets/main/en/_app/immutable/entry/start.4d44eea4.js"),
import("/docs/datasets/main/en/_app/immutable/entry/app.d83067e8.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 27],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
30.5 kB
·
Xet hash:
daf3481368e6b0bdcf0674c4f15d5409fc2f1193d8b08dde955e012abd696ff3

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.