Buckets:

hf-doc-build/doc-dev / hub /pr_2437 /en /datasets-libraries.html
download
raw
49.5 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Libraries&quot;,&quot;local&quot;:&quot;libraries&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Libraries table&quot;,&quot;local&quot;:&quot;libraries-table&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Data Processing Libraries&quot;,&quot;local&quot;:&quot;data-processing-libraries&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Streaming&quot;,&quot;local&quot;:&quot;streaming&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Optimized Parquet files&quot;,&quot;local&quot;:&quot;optimized-parquet-files&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Training Libraries&quot;,&quot;local&quot;:&quot;training-libraries&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Streaming from Hub&quot;,&quot;local&quot;:&quot;streaming-from-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Logging to Hub&quot;,&quot;local&quot;:&quot;logging-to-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Integrating data libraries and tools with the Hub&quot;,&quot;local&quot;:&quot;integrating-data-libraries-and-tools-with-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Loading data from the Hub&quot;,&quot;local&quot;:&quot;loading-data-from-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Integrating via the Dataset Viewer and Parquet Files&quot;,&quot;local&quot;:&quot;integrating-via-the-dataset-viewer-and-parquet-files&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Uploading data to the Hub&quot;,&quot;local&quot;:&quot;uploading-data-to-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Use the datasets 
library&quot;,&quot;local&quot;:&quot;use-the-datasets-library&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Rely on an existing libraries integration with the Hub&quot;,&quot;local&quot;:&quot;rely-on-an-existing-libraries-integration-with-the-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Using the huggingface_hub Python library&quot;,&quot;local&quot;:&quot;using-the-huggingfacehub-python-library&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;More support&quot;,&quot;local&quot;:&quot;more-support&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/hub/pr_2437/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/scheduler.258d2a4d.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/singletons.2d0c91e1.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.c8b82093.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/paths.d360121b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/preload-helper.a4507a26.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.421344fd.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/0.bedbef6e.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/49.60b7d42b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CopyLLMTxtMenu.9c2a67a1.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.9bb92958.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CodeBlock.619ec4e3.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Libraries&quot;,&quot;local&quot;:&quot;libraries&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Libraries table&quot;,&quot;local&quot;:&quot;libraries-table&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Data Processing Libraries&quot;,&quot;local&quot;:&quot;data-processing-libraries&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Streaming&quot;,&quot;local&quot;:&quot;streaming&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Optimized Parquet files&quot;,&quot;local&quot;:&quot;optimized-parquet-files&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Training Libraries&quot;,&quot;local&quot;:&quot;training-libraries&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Streaming from Hub&quot;,&quot;local&quot;:&quot;streaming-from-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Logging to Hub&quot;,&quot;local&quot;:&quot;logging-to-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Integrating data libraries and tools with the Hub&quot;,&quot;local&quot;:&quot;integrating-data-libraries-and-tools-with-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Loading data from the Hub&quot;,&quot;local&quot;:&quot;loading-data-from-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Integrating via the Dataset Viewer and Parquet Files&quot;,&quot;local&quot;:&quot;integrating-via-the-dataset-viewer-and-parquet-files&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Uploading data to the Hub&quot;,&quot;local&quot;:&quot;uploading-data-to-the-hub&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Use the datasets 
library&quot;,&quot;local&quot;:&quot;use-the-datasets-library&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Rely on an existing libraries integration with the Hub&quot;,&quot;local&quot;:&quot;rely-on-an-existing-libraries-integration-with-the-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Using the huggingface_hub Python library&quot;,&quot;local&quot;:&quot;using-the-huggingfacehub-python-library&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;More support&quot;,&quot;local&quot;:&quot;more-support&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center 
justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="libraries" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#libraries"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Libraries</span></h1> <p data-svelte-h="svelte-19pvdk">The Datasets Hub has support for several libraries in the Open Source ecosystem.
Thanks to the <a href="/docs/huggingface_hub">huggingface_hub Python library</a>, it’s easy to enable sharing your datasets on the Hub.
We’re happy to welcome to the Hub a set of Open Source libraries that are pushing Machine Learning forward.</p> <h2 class="relative group"><a id="libraries-table" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#libraries-table"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Libraries table</span></h2> <p data-svelte-h="svelte-ff3ufa">The table below summarizes the supported libraries and their level of integration.</p> <table data-svelte-h="svelte-d9aik2"><thead><tr><th>Library</th> <th>Description</th> <th>Download from Hub</th> <th>Stream from Hub</th> <th>Push to Hub</th> <th>Stream to Hub</th> <th>Optimized Parquet files</th></tr></thead> <tbody><tr><td><a href="./datasets-argilla">Argilla</a></td> <td>Collaboration tool for AI engineers and domain experts that value high quality data.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-daft">Daft</a></td> <td>Data engine for large scale, multimodal data processing with a Python-native interface.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-dask">Dask</a></td> <td>Parallel and distributed 
computing library that scales the existing Python and PyData ecosystem.</td> <td></td> <td></td> <td></td> <td></td> <td>✅*</td></tr> <tr><td><a href="./datasets-data-designer">Data Designer</a></td> <td>NVIDIA NeMo framework for generating synthetic datasets using LLMs.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-usage">Datasets</a></td> <td>🤗 Datasets is a library for accessing and sharing datasets for Audio, Computer Vision, and Natural Language Processing (NLP).</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-distilabel">Distilabel</a></td> <td>The framework for synthetic data generation and AI feedback.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-duckdb">DuckDB</a></td> <td>In-process SQL OLAP database management system.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-embedding-atlas">Embedding Atlas</a></td> <td>Interactive visualization and exploration tool for large embeddings.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-fenic">Fenic</a></td> <td>PySpark-inspired DataFrame framework for building production AI and agentic applications.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-fiftyone">FiftyOne</a></td> <td>FiftyOne is a library for curation and visualization of image, video, and 3D data.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-lance">Lance</a></td> <td>An open lakehouse format for multimodal AI.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-pandas">Pandas</a></td> <td>Python data analysis toolkit.</td> <td></td> <td></td> <td></td> <td></td> <td>✅*</td></tr> <tr><td><a href="./datasets-polars">Polars</a></td> <td>A DataFrame library on top of an OLAP query engine.</td> <td></td> <td></td> <td></td> <td></td> 
<td></td></tr> <tr><td><a href="./datasets-pyarrow">PyArrow</a></td> <td>Apache Arrow is a columnar format and a toolbox for fast data interchange and in-memory analytics.</td> <td></td> <td></td> <td></td> <td></td> <td>✅*</td></tr> <tr><td><a href="./datasets-spark">Spark</a></td> <td>Real-time, large-scale data processing tool in a distributed environment.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr> <tr><td><a href="./datasets-webdataset">WebDataset</a></td> <td>Library to write I/O pipelines for large datasets.</td> <td></td> <td></td> <td></td> <td></td> <td></td></tr></tbody></table> <p data-svelte-h="svelte-zhco94"><em>* Requires passing extra arguments to write optimized Parquet files</em></p> <h2 class="relative group"><a id="data-processing-libraries" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#data-processing-libraries"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Data Processing Libraries</span></h2> <h3 class="relative group"><a id="streaming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 
with-hover:right-full" href="#streaming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Streaming</span></h3> <p data-svelte-h="svelte-12ie1yi">Dataset streaming allows iterating on a dataset from Hugging Face progressively without having to download it completely.
It saves local disk space because the data is never on disk. It saves memory since only a small portion of the dataset is used at a time. And it saves time, since there is no need to download data before the CPU or GPU workload.</p> <p data-svelte-h="svelte-18sh4bf">In addition to streaming <em>from</em> Hugging Face, many libraries also support streaming <em>back to</em> Hugging Face.
Therefore, they can run end-to-end streaming pipelines: streaming from a source and writing to Hugging Face progressively, often overlapping the download, upload, and processing steps.</p> <p data-svelte-h="svelte-op0jrb">For more details on how to do streaming, check out the documentation of a library that supports streaming (see table above) or the <a href="./datasets-streaming">streaming datasets</a> documentation if you want to stream datasets from Hugging Face by yourself.</p> <h3 class="relative group"><a id="optimized-parquet-files" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimized-parquet-files"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimized Parquet files</span></h3> <p data-svelte-h="svelte-17eq3n1">Parquet files on Hugging Face are optimized to improve storage efficiency, accelerate downloads and uploads, and enable efficient dataset streaming and editing.</p> <div class="flex justify-center" data-svelte-h="svelte-gs805e"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/datasets-optimized-parquet-banner.png"></div> <p 
data-svelte-h="svelte-qwjq4x">Optimized Parquet files are Parquet files with additional features:</p> <ul data-svelte-h="svelte-sdwshs"><li><a href="https://huggingface.co/blog/parquet-cdc" rel="nofollow">Parquet Content Defined Chunking</a> optimizes Parquet for <a href="https://huggingface.co/docs/hub/en/xet/index" rel="nofollow">Xet</a>, Hugging Face’s storage backend. It accelerates uploads and downloads thanks to chunk-based deduplication and allows efficient file editing</li> <li>Page index accelerates filters when streaming and enables efficient random access, e.g. in the <a href="https://huggingface.co/docs/dataset-viewer" rel="nofollow">Dataset Viewer</a></li></ul> <p data-svelte-h="svelte-4blyvm">Some libraries, like <code>Pandas</code> and <code>PyArrow</code>, require extra arguments to write Optimized Parquet files:</p> <ul data-svelte-h="svelte-qauouv"><li><code>use_content_defined_chunking=True</code> to enable Parquet Content Defined Chunking, for <a href="https://huggingface.co/blog/parquet-cdc" rel="nofollow">deduplication</a> and <a href="./datasets-editing">editing</a></li> <li><code>write_page_index=True</code> to include a page index in the Parquet metadata, for <a href="./datasets-streaming">streaming and random access</a></li></ul> <h2 class="relative group"><a id="training-libraries" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training-libraries"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 
79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Training Libraries</span></h2> <p data-svelte-h="svelte-145nbko">Training libraries that integrate with Hub datasets for model training. The table below shows their streaming capabilities - the ability to train on datasets without downloading them first.</p> <table data-svelte-h="svelte-bppjgv"><thead><tr><th>Library</th> <th>Description</th> <th>Stream from Hub</th></tr></thead> <tbody><tr><td><a href="https://docs.axolotl.ai/docs/streaming.html" rel="nofollow">Axolotl</a></td> <td>Low-code LLM fine-tuning framework</td> <td></td></tr> <tr><td><a href="https://github.com/hiyouga/LLaMA-Factory" rel="nofollow">LlamaFactory</a></td> <td>Unified fine-tuning for 100+ LLMs</td> <td></td></tr> <tr><td><a href="https://sbert.net/docs/sentence_transformer/training_overview.html" rel="nofollow">Sentence Transformers</a></td> <td>Text embeddings and semantic similarity</td> <td></td></tr> <tr><td><a href="https://huggingface.co/docs/transformers/trainer" rel="nofollow">Transformers</a></td> <td>🤗 Transformers Trainer for fine-tuning models</td> <td></td></tr> <tr><td><a href="https://huggingface.co/docs/trl" rel="nofollow">TRL</a></td> <td>Training LLMs with reinforcement learning (SFT, DPO, GRPO)</td> <td>⚠️*</td></tr> <tr><td><a href="https://docs.unsloth.ai" rel="nofollow">Unsloth</a></td> <td>Fast LLM fine-tuning (2x speedup, 70% less memory)</td> <td></td></tr></tbody></table> <p data-svelte-h="svelte-2g4khw"><em>* SFTTrainer and DPOTrainer support streaming; GRPOTrainer does not yet support streaming input</em></p> <h3 class="relative group"><a id="streaming-from-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 
with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#streaming-from-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Streaming from Hub</span></h3> <p data-svelte-h="svelte-gncrl2">Streaming allows training on massive datasets without downloading them first. This is valuable when:</p> <ul data-svelte-h="svelte-zs402t"><li>Your dataset is too large to fit on disk</li> <li>You want to start training immediately</li> <li>You’re using <a href="https://huggingface.co/docs/hub/jobs" rel="nofollow">HF Jobs</a> where co-located compute provides faster streaming</li></ul> <p data-svelte-h="svelte-htdbf8">Recent improvements have made streaming <a href="https://huggingface.co/blog/streaming-datasets" rel="nofollow">up to 100x more efficient</a> with faster startup, prefetching, and better scaling to many workers.</p> <p data-svelte-h="svelte-1uhsve2"><strong>Note:</strong> Streaming requires <code>max_steps</code> in training arguments since dataset length is unknown, and uses buffer-based shuffling. 
See <a href="./datasets-streaming">streaming datasets</a> for more details.</p> <h3 class="relative group"><a id="logging-to-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#logging-to-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Logging to Hub</span></h3> <p data-svelte-h="svelte-5qghci">Some tools can stream training data back to the Hub during training:</p> <ul data-svelte-h="svelte-135fno6"><li><strong><a href="https://github.com/huggingface/trackio" rel="nofollow">Trackio</a></strong>: Streams training metrics to a Hub dataset in real-time</li></ul> <h2 class="relative group"><a id="integrating-data-libraries-and-tools-with-the-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#integrating-data-libraries-and-tools-with-the-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 
0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Integrating data libraries and tools with the Hub</span></h2> <p data-svelte-h="svelte-1bmppgx">This guide is designed for developers and maintainers of data libraries and tools who want to integrate with the Hugging Face Hub. Whether you’re building a data processing library, analysis tool, or any software that needs to interact with datasets, this documentation will help you implement a Hub integration.</p> <p data-svelte-h="svelte-e0z0qx">The guide covers:</p> <ul data-svelte-h="svelte-162x9zs"><li>Possible approaches to loading data from the Hub into your library/tool</li> <li>Possible approaches to uploading data from your library/tool to the Hub</li></ul> <h3 class="relative group"><a id="loading-data-from-the-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-data-from-the-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 
56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Loading data from the Hub</span></h3> <p data-svelte-h="svelte-chb2m7">If you have a library for working with data, it can be helpful for your users to load data from the Hub.</p> <p data-svelte-h="svelte-m9xre3">In general, we suggest relying on an existing library like <code>datasets</code>, <code>pandas</code> or <code>polars</code> to do this unless you have a specific reason to implement your own. If you require more control over the loading process, you can use the <code>huggingface_hub</code> library, which will allow you, for example, to download a specific subset of files from a repository.</p> <p data-svelte-h="svelte-1kfkj1n">You can find more information about loading data from the Hub <a href="https://huggingface.co/docs/hub/datasets-downloading" rel="nofollow">here</a>.</p> <h4 class="relative group"><a id="integrating-via-the-dataset-viewer-and-parquet-files" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#integrating-via-the-dataset-viewer-and-parquet-files"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Integrating via the Dataset Viewer and Parquet Files</span></h4> <p data-svelte-h="svelte-nyvi1q">The Hub’s dataset viewer and Parquet conversion system provide a standardized way to integrate with datasets, regardless of their original format. This infrastructure is a reliable integration layer between the Hub and external libraries.</p> <p data-svelte-h="svelte-sdabnb">If the dataset is not already in Parquet, the Hub automatically converts the first 5GB of every dataset to Parquet format to power the dataset viewer and provide consistent access patterns. This standardization offers several benefits for library integrations:</p> <ul data-svelte-h="svelte-14xyszt"><li>Consistent data access patterns regardless of original format</li> <li>Built-in dataset preview and exploration through the Hub’s dataset viewer. The dataset viewer can also be embedded as an iframe in your applications, making it easy to provide rich dataset previews. For more information about embedding the viewer, see the <a href="https://huggingface.co/docs/hub/en/datasets-viewer-embed" rel="nofollow">dataset viewer embedding documentation</a>.</li> <li>Efficient columnar storage optimized for querying. 
For example, you could use a tool like <a href="https://duckdb.org/" rel="nofollow">DuckDB</a> to query or filter for a specific subset of data.</li> <li>Parquet is well supported across the machine learning and data science ecosystem.</li></ul> <p data-svelte-h="svelte-145eyta">For more details on working with the Dataset Viewer API, see the <a href="https://huggingface.co/docs/dataset-viewer/index" rel="nofollow">Dataset Viewer API documentation</a></p> <h3 class="relative group"><a id="uploading-data-to-the-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#uploading-data-to-the-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Uploading data to the Hub</span></h3> <p data-svelte-h="svelte-ia8icy">This section covers possible approaches for adding the ability to upload data to the Hub in your library, i.e. 
how to implement a <code>push_to_hub</code> method.</p> <p data-svelte-h="svelte-166sdwe">There are four primary ways to upload data to the Hub:</p> <ul data-svelte-h="svelte-2sc2rd"><li>using the <code>datasets</code> library and the <code>push_to_hub</code> method</li> <li>using <code>pandas</code> to write to the Hub</li> <li>using the <code>huggingface_hub</code> library and the <code>upload_folder</code> method</li> <li>directly using the API or Git with git-xet</li></ul> <h4 class="relative group"><a id="use-the-datasets-library" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#use-the-datasets-library"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Use the datasets library</span></h4> <p data-svelte-h="svelte-r25mif">The most straightforward approach to pushing data to the Hub is to rely on the existing <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.push_to_hub" rel="nofollow"><code>push_to_hub</code></a> method from the <code>datasets</code> library. 
The <code>push_to_hub</code> method will automatically handle:</p> <ul data-svelte-h="svelte-iaou6v"><li>the creation of the repository</li> <li>the conversion of the dataset to Parquet</li> <li>chunking the dataset into suitable parts</li> <li>uploading the data</li></ul> <p data-svelte-h="svelte-d45nz">For example, if you have a synthetic data generation library that returns a list of dictionaries, you could simply do the following:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
data = [{<span class="hljs-string">&quot;prompt&quot;</span>: <span class="hljs-string">&quot;Write a cake recipe&quot;</span>, <span class="hljs-string">&quot;response&quot;</span>: <span class="hljs-string">&quot;Measure 1 cup ...&quot;</span>}]
ds = Dataset.from_list(data)
ds.push_to_hub(<span class="hljs-string">&quot;USERNAME_OR_ORG/repo_ID&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-53l0c9">Examples of this kind of integration:</p> <ul data-svelte-h="svelte-5zd4l0"><li><a href="https://github.com/argilla-io/distilabel/blob/8ad48387dfa4d7bd5639065661f1975dcb44c16a/src/distilabel/distiset.py#L77" rel="nofollow">Distilabel</a></li></ul> <h4 class="relative group"><a id="rely-on-an-existing-libraries-integration-with-the-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#rely-on-an-existing-libraries-integration-with-the-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Rely on an existing libraries integration with the Hub</span></h4> <p data-svelte-h="svelte-1lhir1l">Polars, Pandas, Dask, Spark, DuckDB, and Daft can all write to a Hugging Face Hub repository. See <a href="https://huggingface.co/docs/hub/datasets-libraries" rel="nofollow">datasets libraries</a> for more details.</p> <p data-svelte-h="svelte-1s8u1ie">If you are already using one of these libraries in your code, adding the ability to push to the Hub is straightforward. 
For example, if you have a synthetic data generation library that can return a Pandas DataFrame, here is the code you would need to write to the Hub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> HfApi
<span class="hljs-keyword">import</span> os
<span class="hljs-comment"># Initialize the Hub API</span>
hf_api = HfApi(token=os.getenv(<span class="hljs-string">&quot;HF_TOKEN&quot;</span>))
<span class="hljs-comment"># Create a repository (if it doesn&#x27;t exist)</span>
hf_api.create_repo(repo_id=<span class="hljs-string">&quot;username/my-dataset&quot;</span>, repo_type=<span class="hljs-string">&quot;dataset&quot;</span>, exist_ok=<span class="hljs-literal">True</span>)
<span class="hljs-comment"># Convert your data to a DataFrame and save directly to the Hub</span>
df.to_parquet(<span class="hljs-string">&quot;hf://datasets/username/my-dataset/data.parquet&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h4 class="relative group"><a id="using-the-huggingfacehub-python-library" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-the-huggingfacehub-python-library"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using the huggingface_hub Python library</span></h4> <p data-svelte-h="svelte-1ygt6jd">The <code>huggingface_hub</code> Python library offers a more flexible approach to uploading data to the Hub. The library allows you to upload specific files or subsets of files to a repository. This is useful if you have a large dataset that you don’t want to convert to Parquet, want to upload a specific subset of files, or want more control over the repo structure.</p> <p data-svelte-h="svelte-7hs1zn">Depending on your use case, you can upload a file or folder at a specific point in your code, i.e., export annotations from a tool to the Hub when a user clicks “push to Hub”. 
For example,</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> HfApi
api = HfApi(token=HF_TOKEN)
api.upload_folder(
folder_path=<span class="hljs-string">&quot;/my-cool-library/data-folder&quot;</span>,
repo_id=<span class="hljs-string">&quot;username/my-cool-space&quot;</span>,
repo_type=<span class="hljs-string">&quot;dataset&quot;</span>,
commit_message=<span class="hljs-string">&quot;Push annotations to Hub&quot;</span>,
allow_patterns=<span class="hljs-string">&quot;*.jsonl&quot;</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7kkfko">You can find more information about ways to upload data to the Hub <a href="https://huggingface.co/docs/huggingface_hub/main/en/guides/upload" rel="nofollow">here</a>.</p> <p data-svelte-h="svelte-iljeu7">Alternatively, there are situations where you may want to upload data in the background, for example, synthetic data being generated every 10 minutes. In this case you can use the <code>CommitScheduler</code> class of the <code>huggingface_hub</code> library to schedule uploads. For more details, see the <a href="https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#scheduled-uploads" rel="nofollow">scheduled uploads documentation</a>.</p> <p data-svelte-h="svelte-yedhy2">You can see an example of using this approach to upload data to the Hub in</p> <ul data-svelte-h="svelte-5ri1p0"><li>The <a href="https://github.com/AnswerDotAI/fastdata/blob/main/nbs/00_core.ipynb" rel="nofollow">fastdata</a> library</li> <li>This <a href="https://huggingface.co/spaces/davanstrien/magpie/blob/fc79672c740b8d3d098378dca37c0f191c208de0/app.py#L67" rel="nofollow">magpie</a> Demo Space</li></ul> <h2 class="relative group"><a id="more-support" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#more-support"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 
11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>More support</span></h2> <p data-svelte-h="svelte-zmwud2">For technical questions about integration, feel free to contact the datasets team at <a href="mailto:datasets@huggingface.co">datasets@huggingface.co</a>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/hub/datasets-libraries.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Client bootstrap for this page: records the path configuration, loads the
// two entry modules, and starts the app on the element containing this script.
{
// Assigned without let/const/var, so the name is not scoped to this block.
// NOTE(review): presumably read by the imported entry modules — confirm.
__sveltekit_ummgov = {
// `assets` and `base` carry the same path prefix for this build.
assets: "/docs/hub/pr_2437/en",
base: "/docs/hub/pr_2437/en",
env: {}
};
// Parent element of the <script> tag that is currently executing.
const element = document.currentScript.parentElement;
// Two null entries; `node_ids` below also has two entries.
// NOTE(review): presumably one data slot per node id — verify.
const data = [null,null];
// Both dynamic imports are started together; the callback runs only after
// both have resolved. There is no rejection handler on this chain.
Promise.all([
import("/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js"),
import("/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js")
]).then(([kit, app]) => {
// Passes the app module, the target element, and the initial state to start().
kit.start(app, element, {
node_ids: [0, 49],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
49.5 kB
·
Xet hash:
b1c0e1e14649abcb4dc0bdf857284ea546886b0ec8b2d33f353fe771a4f68928

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.