Buckets:

hf-doc-build/doc-dev / hub /pr_2437 /en /datasets-adding.html
download
raw
32.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Uploading datasets&quot;,&quot;local&quot;:&quot;uploading-datasets&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Upload using the Hub UI&quot;,&quot;local&quot;:&quot;upload-using-the-hub-ui&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Create a repository&quot;,&quot;local&quot;:&quot;create-a-repository&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Upload dataset&quot;,&quot;local&quot;:&quot;upload-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Create a Dataset card&quot;,&quot;local&quot;:&quot;create-a-dataset-card&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using the huggingface_hub client library&quot;,&quot;local&quot;:&quot;using-the-huggingfacehub-client-library&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using other libraries&quot;,&quot;local&quot;:&quot;using-other-libraries&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using Git&quot;,&quot;local&quot;:&quot;using-git&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Ingest datasets&quot;,&quot;local&quot;:&quot;ingest-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;File formats&quot;,&quot;local&quot;:&quot;file-formats&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Which file format should I use?&quot;,&quot;local&quot;:&quot;which-file-format-should-i-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Data Studio&quot;,&quot;local&quot;:&quot;data-studio&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Large scale datasets&quot;,&quot;local&quot;:&quot;large-scale-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/hub/pr_2437/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/scheduler.258d2a4d.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/singletons.2d0c91e1.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.c8b82093.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/paths.d360121b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/preload-helper.a4507a26.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.421344fd.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/0.bedbef6e.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/22.e9383091.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CopyLLMTxtMenu.9c2a67a1.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.9bb92958.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Uploading datasets&quot;,&quot;local&quot;:&quot;uploading-datasets&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Upload using the Hub UI&quot;,&quot;local&quot;:&quot;upload-using-the-hub-ui&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Create a repository&quot;,&quot;local&quot;:&quot;create-a-repository&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Upload dataset&quot;,&quot;local&quot;:&quot;upload-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Create a Dataset card&quot;,&quot;local&quot;:&quot;create-a-dataset-card&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using the huggingface_hub client library&quot;,&quot;local&quot;:&quot;using-the-huggingfacehub-client-library&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using other libraries&quot;,&quot;local&quot;:&quot;using-other-libraries&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using Git&quot;,&quot;local&quot;:&quot;using-git&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Ingest datasets&quot;,&quot;local&quot;:&quot;ingest-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;File formats&quot;,&quot;local&quot;:&quot;file-formats&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Which file format should I use?&quot;,&quot;local&quot;:&quot;which-file-format-should-i-use&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Data Studio&quot;,&quot;local&quot;:&quot;data-studio&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Large scale 
datasets&quot;,&quot;local&quot;:&quot;large-scale-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path 
d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="uploading-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#uploading-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Uploading datasets</span></h1> <p data-svelte-h="svelte-1754kge">The <a href="https://huggingface.co/datasets" rel="nofollow">Hub</a> is home to an extensive collection of community-curated and research datasets. We encourage you to share your dataset to the Hub to help grow the ML community and accelerate progress for everyone. 
All contributions are welcome; adding a dataset is just a drag and drop away!</p> <p data-svelte-h="svelte-1fojn2">Start by <a href="https://huggingface.co/join" rel="nofollow">creating a Hugging Face Hub account</a> if you don’t have one yet.</p> <h2 class="relative group"><a id="upload-using-the-hub-ui" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#upload-using-the-hub-ui"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Upload using the Hub UI</span></h2> <p data-svelte-h="svelte-wcztra">The Hub’s web-based interface allows users without any developer experience to upload a dataset.</p> <h3 class="relative group"><a id="create-a-repository" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#create-a-repository"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Create a repository</span></h3> <p data-svelte-h="svelte-193c6le">A repository hosts all your dataset files, including the revision history, making storing more than one dataset version possible.</p> <ol data-svelte-h="svelte-t7tqd0"><li>Click on your profile and select <strong>New Dataset</strong> to create a <a href="https://huggingface.co/new-dataset" rel="nofollow">new dataset repository</a>.</li> <li>Pick a name for your dataset, and choose whether it is a public or private dataset. A public dataset is visible to anyone, whereas a private dataset can only be viewed by you or members of your organization.</li></ol> <div class="flex justify-center" data-svelte-h="svelte-ggmafm"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/create_repo.png"></div> <h3 class="relative group"><a id="upload-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#upload-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 
56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Upload dataset</span></h3> <ol data-svelte-h="svelte-1371ekm"><li>Once you’ve created a repository, navigate to the <strong>Files and versions</strong> tab to add a file. Select <strong>Add file</strong> to upload your dataset files. We support many text, audio, image and other data extensions such as <code>.csv</code>, <code>.mp3</code>, and <code>.jpg</code> (see the full list of <a href="#file-formats">File formats</a>).</li></ol> <div class="flex justify-center" data-svelte-h="svelte-ejbx7w"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/upload_files.png"></div> <ol start="2" data-svelte-h="svelte-1q9alp6"><li>Drag and drop your dataset files.</li></ol> <div class="flex justify-center" data-svelte-h="svelte-14wj5uc"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/commit_files.png"></div> <ol start="3" data-svelte-h="svelte-9bhhwd"><li>After uploading your dataset files, they are stored in your dataset repository.</li></ol> <div class="flex justify-center" data-svelte-h="svelte-15ihy8q"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/files_stored.png"></div> <h3 class="relative group"><a id="create-a-dataset-card" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#create-a-dataset-card"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Create a Dataset card</span></h3> <p data-svelte-h="svelte-5ezmk1">Adding a Dataset card is super valuable for helping users find your dataset and understand how to use it responsibly.</p> <ol data-svelte-h="svelte-vc122w"><li>Click on <strong>Create Dataset Card</strong> to create a <a href="./datasets-cards">Dataset card</a>. This button creates a <code>README.md</code> file in your repository.</li></ol> <div class="flex justify-center" data-svelte-h="svelte-1o3g2qu"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/dataset_card.png"></div> <ol start="2" data-svelte-h="svelte-tbqb69"><li><p>At the top, you’ll see the <strong>Metadata UI</strong> with several fields to select from such as license, language, and task categories. These are the most important tags for helping users discover your dataset on the Hub (when applicable). 
When you select an option for a field, it will be automatically added to the top of the dataset card.</p> <p>You can also look at the <a href="https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1" rel="nofollow">Dataset Card specifications</a>, which has a complete set of allowed tags, including optional ones like <code>annotations_creators</code>, to help you choose the ones that are useful for your dataset.</p></li></ol> <div class="flex justify-center" data-svelte-h="svelte-stlsmn"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/metadata_ui.png"></div> <ol start="3" data-svelte-h="svelte-1nopxc3"><li><p>Write your dataset documentation in the Dataset Card to introduce your dataset to the community and help users understand what is inside: what are the use cases and limitations, where the data comes from, what are important ethical considerations, and any other relevant details.</p> <p>You can click on the <strong>Import dataset card template</strong> link at the top of the editor to automatically create a dataset card template. 
For a detailed example of what a good Dataset card should look like, take a look at the <a href="https://huggingface.co/datasets/cnn_dailymail" rel="nofollow">CNN DailyMail Dataset card</a>.</p></li></ol> <h2 class="relative group"><a id="using-the-huggingfacehub-client-library" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-the-huggingfacehub-client-library"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using the huggingface_hub client library</span></h2> <p data-svelte-h="svelte-1jsh2bn">The rich feature set in the <code>huggingface_hub</code> library allows you to manage repositories, including creating repos and uploading datasets to the Hub. 
Visit <a href="/docs/huggingface_hub/index">the client library’s documentation</a> to learn more.</p> <h2 class="relative group"><a id="using-other-libraries" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-other-libraries"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using other libraries</span></h2> <p data-svelte-h="svelte-gt3tav">Some libraries like <a href="/docs/datasets/index">🤗 Datasets</a>, <a href="https://pandas.pydata.org/" rel="nofollow">Pandas</a>, <a href="https://pola.rs" rel="nofollow">Polars</a>, <a href="https://www.dask.org/" rel="nofollow">Dask</a>, <a href="https://duckdb.org/" rel="nofollow">DuckDB</a>, or <a href="https://daft.ai/" rel="nofollow">Daft</a> can upload files to the Hub.
See the list of <a href="./datasets-libraries">Libraries supported by the Datasets Hub</a> for more information.</p> <h2 class="relative group"><a id="using-git" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-git"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using Git</span></h2> <p data-svelte-h="svelte-b2l0oe">Since dataset repos are Git repositories, you can use Git to push your data files to the Hub. 
Follow the guide on <a href="repositories-getting-started">Getting Started with Repositories</a> to learn about using the <code>git</code> CLI to commit and push your datasets.</p> <h2 class="relative group"><a id="ingest-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ingest-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Ingest datasets</span></h2> <p data-svelte-h="svelte-1xq9iv">If you have data in databases, cloud storage or behind APIs, you can ingest them to Hugging Face as ready-to-use datasets.</p> <p data-svelte-h="svelte-1krc4ui">Find more information in the <a href="./datasets-ingesting">documentation on ingesting datasets</a>.</p> <h2 class="relative group"><a id="file-formats" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#file-formats"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 
88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>File formats</span></h2> <p data-svelte-h="svelte-dyuuws">The Hub natively supports multiple file formats:</p> <ul data-svelte-h="svelte-1x46iee"><li>Parquet (.parquet)</li> <li>CSV (.csv, .tsv)</li> <li>JSON Lines, JSON (.jsonl, .json)</li> <li>Arrow streaming and IPC formats (.arrow)</li> <li>Text (.txt)</li> <li>Images (.png, .jpg, etc.)</li> <li>Audio (.wav, .mp3, etc.)</li> <li>Video (.mp4, .mov, .avi, etc.)</li> <li>PDF (.pdf)</li> <li><a href="./datasets-webdataset">WebDataset</a> (.tar)</li> <li><a href="./datasets-lance">Lance</a> (.lance)</li></ul> <p data-svelte-h="svelte-mpmhfr">It supports files compressed using ZIP (.zip), GZIP (.gz), ZSTD (.zst), BZ2 (.bz2), LZ4 (.lz4) and LZMA (.xz).</p> <p data-svelte-h="svelte-185tivl">Image and audio files can also have additional metadata files. See the <a href="./datasets-data-files-configuration#image-and-audio-datasets">Data files Configuration</a> on image and audio datasets, as well as the collections of <a href="https://huggingface.co/datasets-examples" rel="nofollow">example datasets</a> for CSV, TSV and images.</p> <p data-svelte-h="svelte-1tta5o7">You may want to convert your files to these formats to benefit from all the Hub features.
Other formats and structures may not be recognized by the Hub.</p> <h3 class="relative group"><a id="which-file-format-should-i-use" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#which-file-format-should-i-use"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Which file format should I use?</span></h3> <p data-svelte-h="svelte-182e8gq">For most types of datasets, <strong>Parquet</strong> is the recommended format due to its efficient compression and rich typing, and because a variety of tools support this format with optimized read and batched operations. Alternatively, CSV or JSON Lines/JSON can be used for tabular data (prefer JSON Lines for nested data). Although easy to parse compared to Parquet, these formats are not recommended for data larger than several GBs. For image and audio datasets, uploading raw files is the most practical option for most use cases since it’s easy to access individual files. 
For streaming large scale image and audio datasets, <a href="https://github.com/webdataset/webdataset" rel="nofollow">WebDataset</a> should be preferred over raw image and audio files to avoid the overhead of accessing individual files. However, for more general use cases involving analytics, data filtering or metadata parsing, Parquet is the recommended option for large scale image and audio datasets.</p> <h3 class="relative group"><a id="data-studio" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#data-studio"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Data Studio</span></h3> <p data-svelte-h="svelte-19w4s7u">The <a href="./data-studio">Data Studio</a> is useful for seeing what the data actually looks like before you download it.
It is enabled by default for all public datasets. It is also available for private datasets owned by a <a href="https://huggingface.co/pricing" rel="nofollow">PRO user</a> or a <a href="https://huggingface.co/enterprise" rel="nofollow">Team or Enterprise organization</a>.</p> <p data-svelte-h="svelte-189qumh">After uploading your dataset, make sure the Dataset Viewer correctly shows your data, or <a href="./datasets-viewer-configure">Configure the Dataset Viewer</a>.</p> <h2 class="relative group"><a id="large-scale-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#large-scale-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Large scale datasets</span></h2> <p data-svelte-h="svelte-ivxrwk">The Hugging Face Hub supports large scale datasets, usually uploaded in Parquet (e.g. 
via <code>push_to_hub()</code> using <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.push_to_hub">🤗 Datasets</a>) or <a href="https://github.com/webdataset/webdataset" rel="nofollow">WebDataset</a> format.</p> <p data-svelte-h="svelte-14mep43">You can upload large scale datasets at high speed using the <code>huggingface_hub</code> library.</p> <p data-svelte-h="svelte-165e0se">See <a href="/docs/huggingface_hub/guides/upload#upload-a-folder-by-chunks">how to upload a folder by chunks</a>, the <a href="/docs/huggingface_hub/guides/upload#tips-and-tricks-for-large-uploads">tips and tricks for large uploads</a> and the <a href="./storage-limits">repository storage limits and recommendations</a>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/hub/datasets-adding.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
  // Path configuration for this build. The assignment has no declaration, so
  // in this classic (non-module) script it creates a property on the global
  // object. Presumably the SvelteKit runtime loaded below reads it -- confirm.
  __sveltekit_ummgov = {
    assets: "/docs/hub/pr_2437/en",
    base: "/docs/hub/pr_2437/en",
    env: {}
  };

  // Must be captured synchronously: document.currentScript is only set while
  // this <script> element is executing, not inside the promise callback.
  const mountPoint = document.currentScript.parentElement;

  // One entry per id in `node_ids`; both are null here.
  const data = [null, null];

  // Options handed to the runtime's start() call once both modules are loaded.
  const startOptions = {
    node_ids: [0, 22],
    data,
    form: null,
    error: null
  };

  // Fetch both entry modules in parallel.
  const runtimeModule = import("/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js");
  const appModule = import("/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js");

  Promise.all([runtimeModule, appModule]).then((loaded) => {
    const [runtime, application] = loaded;
    // Start the app against the element that contains this script tag.
    runtime.start(application, mountPoint, startOptions);
  });
}
</script>

Xet Storage Details

Size:
32.5 kB
·
Xet hash:
bb0fc186397d9f1880b753b0408d2a4ec4a379bc8eb252446b8a4c48b3a8f1a0

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.