Buckets:
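| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Ingesting Datasets&quot;,&quot;local&quot;:&quot;ingesting-datasets&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Using huggingface_hub&quot;,&quot;local&quot;:&quot;using-huggingfacehub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using dlt&quot;,&quot;local&quot;:&quot;using-dlt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using other libraries&quot;,&quot;local&quot;:&quot;using-other-libraries&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Ingest raw data&quot;,&quot;local&quot;:&quot;ingest-raw-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Scheduled ingestion&quot;,&quot;local&quot;:&quot;scheduled-ingestion&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;High frequency using Storage Buckets&quot;,&quot;local&quot;:&quot;high-frequency-using-storage-buckets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Near real-time using a CommitScheduler&quot;,&quot;local&quot;:&quot;near-real-time-using-a-commitscheduler&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Cron-based using Hugging Face Jobs&quot;,&quot;local&quot;:&quot;cron-based-using-hugging-face-jobs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |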
| <link href="/docs/hub/pr_2437/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/scheduler.258d2a4d.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/singletons.2d0c91e1.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.c8b82093.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/paths.d360121b.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/preload-helper.a4507a26.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.421344fd.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/0.bedbef6e.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/47.8a42f514.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CopyLLMTxtMenu.9c2a67a1.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.9bb92958.js"> | |
| <link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CodeBlock.619ec4e3.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Ingesting Datasets","local":"ingesting-datasets","sections":[{"title":"Using huggingface_hub","local":"using-huggingfacehub","sections":[],"depth":2},{"title":"Using dlt","local":"using-dlt","sections":[],"depth":2},{"title":"Using other libraries","local":"using-other-libraries","sections":[],"depth":2},{"title":"Ingest raw data","local":"ingest-raw-data","sections":[],"depth":2},{"title":"Scheduled ingestion","local":"scheduled-ingestion","sections":[{"title":"High frequency using Storage Buckets","local":"high-frequency-using-storage-buckets","sections":[],"depth":3},{"title":"Near real-time using a CommitScheduler","local":"near-real-time-using-a-commitscheduler","sections":[],"depth":3},{"title":"Cron-based using Hugging Face Jobs","local":"cron-based-using-hugging-face-jobs","sections":[],"depth":3}],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="ingesting-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ingesting-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>Ingesting Datasets</span></h1> <p data-svelte-h="svelte-19j8ii">Data generally lives in databases or cloud storage in forms that are not suited for AI workflows. | |
| Ingesting data to the <a href="https://huggingface.co/datasets" rel="nofollow">Hub</a> is a good way to publish them as AI-ready datasets, enabling easy and efficient data loading, processing and model training and evaluation.</p> <h2 class="relative group"><a id="using-huggingfacehub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-huggingfacehub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using huggingface_hub</span></h2> <p data-svelte-h="svelte-1f11oi5">The simplest way to ingest data is to simply upload the data files with <code>huggingface_hub</code>.</p> <p data-svelte-h="svelte-mspuv4">The <code>huggingface_hub</code> Python library provides a rich feature set that allows you to manage repositories, including creating repos and uploading datasets to the Hub. 
Visit <a href="/docs/huggingface_hub/index">the client library’s documentation</a> to learn more.</p> <p data-svelte-h="svelte-1c9ae9s">This is relevant if your data is static/frozen and if you can easily obtain a local dump of the data in a format supported by the Hub (e.g., Parquet or JSON Lines) with a usable structure (e.g., well-defined fields for training and evaluation).</p> <h2 class="relative group"><a id="using-dlt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-dlt"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using dlt</span></h2> <p data-svelte-h="svelte-1r7cbkw"><a href="http://github.com/dlt-hub/dlt" rel="nofollow">dlt</a> is an open-source Python library for data movement (ETL), and is useful for developers (and their agents) building data pipelines. | |
| It can ingest data from diverse source types:</p> <ul data-svelte-h="svelte-hhnxrv"><li>Cloud storage or files</li> <li>REST APIs</li> <li>SQL databases</li> <li>Python generators</li></ul> <p data-svelte-h="svelte-1e7pom2">Examples of source types:</p> <ul data-svelte-h="svelte-1h9ys38"><li><code>filesystem</code> (includes s3, gs, az, abff, etc.)</li> <li><code>sql_database</code>, <code>mongodb</code>, <code>google_sheets</code></li> <li><code>notion</code>, <code>hubspot</code>, <code>rest_api</code></li></ul> <p data-svelte-h="svelte-atr54z">Find your source type from the <a href="https://dlthub.com/docs/dlt-ecosystem/verified-sources" rel="nofollow">list of sources</a> and create your <code>dlt</code> project:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-title">dlt</span> init <source-<span 
class="hljs-keyword">type</span>> filesystem<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1i380f4">You can then create a configuration file <code>.dlt/secrets.toml</code> in the root of your dlt project to define the Hub as a filesystem destination for your datasets, based on the <code>hf://</code> protocol:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-section">[destination.filesystem]</span> | |
| <span class="hljs-attr">bucket_url</span> = <span class="hljs-string">"hf://datasets/&lt;namespace&gt;"</span> | |
| <span class="hljs-section">[destination.filesystem.credentials]</span> | |
| <span class="hljs-attr">hf_token</span> = <span class="hljs-string">"hf_..."</span> <span class="hljs-comment"># Your Hugging Face Access Token</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-r0xdtx">The namespace should be your user name or the name of your organization/team where you want to ingest your dataset.</p> <p data-svelte-h="svelte-1tgis19">Then each dlt dataset creates or updates a Hugging Face dataset repository. The repository name is <namespace>/<dataset_name>, where <namespace> is the same one you used in the bucket_url (your organization or team), and <dataset_name> is the pipeline’s dataset_name.</p> <p data-svelte-h="svelte-13yd27t">Here is an example pipeline:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> dlt | |
| <span class="hljs-keyword">import</span> requests | |
| <span class="hljs-meta">@dlt.resource</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">my_data</span>(): | |
| <span class="hljs-comment"># One of the functions auto-generated by `dlt init` that you can customize,</span> | |
| <span class="hljs-comment"># or you can define your own python generator function.</span> | |
| <span class="hljs-comment"># Here is an example from the `chess` source type:</span> | |
| <span class="hljs-keyword">for</span> player <span class="hljs-keyword">in</span> [<span class="hljs-string">'magnuscarlsen'</span>, <span class="hljs-string">'rpragchess'</span>]: | |
| response = requests.get(<span class="hljs-string">f'https://api.chess.com/pub/player/<span class="hljs-subst">{player}</span>'</span>) | |
| response.raise_for_status() | |
| <span class="hljs-keyword">yield</span> response.json() | |
| <span class="hljs-comment"># Requires bucket_url = "hf://datasets/&lt;namespace&gt;" in .dlt/secrets.toml</span> | |
| pipeline = dlt.pipeline( | |
| pipeline_name=<span class="hljs-string">"my_pipeline"</span>, | |
| destination=<span class="hljs-string">"filesystem"</span>, | |
| dataset_name=<span class="hljs-string">"dataset_name"</span>, | |
| ) | |
| pipeline.run(my_data())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wt7r0a">Customize the <code>dlt</code> resource to load the data you want and parse the fields you want to publish in your dataset, e.g. the text you need for training and evaluation.</p> <h2 class="relative group"><a id="using-other-libraries" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-other-libraries"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using other libraries</span></h2> <p data-svelte-h="svelte-saqsq8">Some libraries like <a href="/docs/datasets/index">🤗 Datasets</a>, <a href="./datasets-pandas">Pandas</a>, <a href="./datasets-polars">Polars</a>, <a href="./datasets-dask">Dask</a>, <a href="./datasets-duckdb">DuckDB</a>, <a href="./datasets-spark">Spark</a>, or <a href="./datasets-daft">Daft</a> can ingest data from various places to the Hub. | |
| See the list of <a href="./datasets-libraries">Libraries supported by the Datasets Hub</a> for more information.</p> <h2 class="relative group"><a id="ingest-raw-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ingest-raw-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Ingest raw data</span></h2> <p data-svelte-h="svelte-uomm0s">If you are ingesting raw data that need further curation before being published as AI-ready datasets or if you need an S3-like experience, consider ingesting them to <a href="./storage-buckets">Hugging Face Storage Buckets</a>.</p> <h2 class="relative group"><a id="scheduled-ingestion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#scheduled-ingestion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Scheduled ingestion</span></h2> <p data-svelte-h="svelte-mqa2wp">There are some limitations when updating the same file on the Hub thousands of times. | |
| For instance, you might want to ingest generations from a running LLM inference server, live agent traces, or logs from a running model training job. | |
| In such cases, uploading the data as a dataset on the Hub makes sense, but it can be hard to do properly. | |
| The main reason is that you don’t want to version every update of your data because it’ll make the git repository unusable.</p> <p data-svelte-h="svelte-1cugixl">Three options are available:</p> <ul data-svelte-h="svelte-1dreynf"><li><strong>Use a Storage Bucket instead of a Dataset repository:</strong> <a href="/docs/hub/storage-buckets">Storage Buckets</a> offer an S3-like experience that allows updating files very frequently, since they are not based on git. Storage Buckets are especially useful for data that are not ready to be published as a dataset, e.g. data that are still evolving or that need more curation.</li> <li><strong>Use a CommitScheduler</strong>: The <code>CommitScheduler</code> in <code>huggingface_hub</code> offers near real-time ingestion to keep the git history of a Dataset repository manageable. It can be configured to do git commits at intervals defined in minutes.</li> <li><strong>Use Hugging Face Jobs to schedule ingestion scripts</strong>: Hugging Face Jobs provides a way to run and schedule python scripts on Hugging Face infrastructure. 
Schedule ingestion scripts to run at intervals defined using the Cron syntax.</li></ul> <h3 class="relative group"><a id="high-frequency-using-storage-buckets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#high-frequency-using-storage-buckets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>High frequency using Storage Buckets</span></h3> <p data-svelte-h="svelte-1x7qiqi">Contrary to Dataset repositories that are based on git, you can update files on Storage Buckets at very high rate, offering quasi real-time ingestion.</p> <p data-svelte-h="svelte-o68yp3">Use <code>batch_bucket_files()</code> in <code>huggingface_hub</code> to update files in a bucket:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> batch_bucket_files | |
| <span class="hljs-keyword">import</span> os | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">update_bucket</span>(<span class="hljs-params">local_files</span>): | |
| destinations = [os.path.basename(local_file) <span class="hljs-keyword">for</span> local_file <span class="hljs-keyword">in</span> local_files] | |
| batch_bucket_files(bucket_id=<span class="hljs-string">"username/bucket_name"</span>, add=[(local_file, dst) <span class="hljs-keyword">for</span> local_file, dst <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(local_files, destinations)])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-flvvlh">Alternatively, you can append to files in a Bucket and <code>flush()</code> on every new item:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> hffs | |
| <span class="hljs-keyword">import</span> json | |
| <span class="hljs-keyword">with</span> hffs.<span class="hljs-built_in">open</span>(<span class="hljs-string">"buckets/username/bucket_name/texts.jsonl"</span>, <span class="hljs-string">"a"</span>) <span class="hljs-keyword">as</span> f: | |
| <span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> live_texts_stream: | |
| f.write(json.dumps({<span class="hljs-string">"text"</span>: text}) + <span class="hljs-string">"\n"</span>) | |
| f.flush()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ujxmpl">The <code>HfFileSystem</code> is based on <code>fsspec</code>, which has a default blocksize of 5 MiB, so flushing only uploads the data once a full 5 MiB chunk of new data has been appended. | |
| If you want to upload more often, lower <code>blocksize</code> in <code>hffs.open()</code> (e.g. <code>hffs.open(..., blocksize=100 * 2 ** 10)</code> for 100 KiB) or use <code>f.flush(force=True)</code>.</p> <p data-svelte-h="svelte-ak7yhy">Hugging Face storage is based on Xet, which enables efficient I/O when appending to files: uploads are deduplicated and only new data is uploaded. | |
| Find more information on doing dynamic data ingestion in buckets in the <a href="/docs/hub/storage-buckets#uploading-files">buckets documentation on uploads</a> and in the <a href="./datasets-editing#only-upload-the-new-data">dataset editing documentation</a>.</p> <h3 class="relative group"><a id="near-real-time-using-a-commitscheduler" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#near-real-time-using-a-commitscheduler"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Near real-time using a CommitScheduler</span></h3> <p data-svelte-h="svelte-yo7vi">The idea is to run a background job that regularly pushes a local folder to the Hub. You want to save data to the Hub (potentially millions of entries), but you don’t need to save in real-time each user’s input. Instead, you can save the data locally in a JSON file and upload it every 10 minutes. 
For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> json | |
| <span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> CommitScheduler | |
| folder_path = <span class="hljs-string">"path/to/files/to/ingest"</span> | |
| every = <span class="hljs-number">10</span> <span class="hljs-comment"># ingest every 10min</span> | |
| <span class="hljs-keyword">with</span> CommitScheduler(repo_id=<span class="hljs-string">"username/dataset_name"</span>, repo_type=<span class="hljs-string">"dataset"</span>, folder_path=folder_path, every=every) <span class="hljs-keyword">as</span> scheduler: | |
| <span class="hljs-comment"># Write to the folder to ingest every 10min</span> | |
| <span class="hljs-comment"># For example:</span> | |
| <span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(folder_path + <span class="hljs-string">"/texts.jsonl"</span>, <span class="hljs-string">"a"</span>) <span class="hljs-keyword">as</span> f: | |
| f.write(json.dumps({<span class="hljs-string">"text"</span>: text}) + <span class="hljs-string">"\n"</span>) | |
| ...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-199zrt9">Check out how to ingest dynamic data without having to reupload everything every time in the documentation on <a href="./datasets-editing#only-upload-the-new-data">dataset editing</a>.</p> <p data-svelte-h="svelte-15dr6kx">Find more information on scheduled uploads in the <a href="/docs/huggingface_hub/guides/upload#scheduled-uploads">huggingface_hub documentation</a>.</p> <h3 class="relative group"><a id="cron-based-using-hugging-face-jobs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cron-based-using-hugging-face-jobs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Cron-based using Hugging Face Jobs</span></h3> <p data-svelte-h="svelte-1iqer2n">Run Python scripts that ingest data on a cron schedule.</p> <p data-svelte-h="svelte-1ghgfim">For example, to run a script <code>ingest.py</code> every 5 minutes:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 
text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->hf <span class="hljs-built_in">jobs</span> scheduled uv run <span class="hljs-string">"*/5 * * * *"</span> ingest.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e2mios">Declare the script dependencies <a href="https://docs.astral.sh/uv/guides/scripts/#declaring-script-dependencies" rel="nofollow">in the header of the script</a> or use <code>--with</code>. | |
| For example, to run a <code>dlt</code> pipeline every day at midnight:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->hf <span class="hljs-built_in">jobs</span> scheduled uv run --with <span class="hljs-string">"dlt[hf]"</span> <span class="hljs-string">"0 0 * * *"</span> pipeline.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1t6fhxk">You can check the logs of every run using <code>hf jobs logs</code> or directly in the Jobs page on your account on Hugging Face.</p> <p data-svelte-h="svelte-138tgzt">Find more information about Hugging Face Jobs in the <a href="/docs/hub/jobs-overview">Jobs documentation</a>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/hub/datasets-ingesting.md" target="_blank"><svg 
class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_ummgov = { | |
| assets: "/docs/hub/pr_2437/en", | |
| base: "/docs/hub/pr_2437/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js"), | |
| import("/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 47], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 40.2 kB
- Xet hash:
- 92155fce1636a95cec3babf5c8a82fd1c9e21afe4beef4e43dc3e87846c6119b
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.