Buckets:

hf-doc-build/doc-dev / hub /pr_2437 /en /datasets-ingesting.html
download
raw
40.2 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Ingesting Datasets&quot;,&quot;local&quot;:&quot;ingesting-datasets&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Using huggingface_hub&quot;,&quot;local&quot;:&quot;using-huggingfacehub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using dlt&quot;,&quot;local&quot;:&quot;using-dlt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using other libraries&quot;,&quot;local&quot;:&quot;using-other-libraries&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Ingest raw data&quot;,&quot;local&quot;:&quot;ingest-raw-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Scheduled ingestion&quot;,&quot;local&quot;:&quot;scheduled-ingestion&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;High frequency using Storage Buckets&quot;,&quot;local&quot;:&quot;high-frequency-using-storage-buckets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Near real-time using a CommitScheduler&quot;,&quot;local&quot;:&quot;near-real-time-using-a-commitscheduler&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Cron-based using Hugging Face Jobs&quot;,&quot;local&quot;:&quot;cron-based-using-hugging-face-jobs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/hub/pr_2437/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/scheduler.258d2a4d.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/singletons.2d0c91e1.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.c8b82093.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/paths.d360121b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/preload-helper.a4507a26.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.421344fd.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/0.bedbef6e.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/47.8a42f514.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CopyLLMTxtMenu.9c2a67a1.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.9bb92958.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CodeBlock.619ec4e3.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Ingesting Datasets&quot;,&quot;local&quot;:&quot;ingesting-datasets&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Using huggingface_hub&quot;,&quot;local&quot;:&quot;using-huggingfacehub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using dlt&quot;,&quot;local&quot;:&quot;using-dlt&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Using other libraries&quot;,&quot;local&quot;:&quot;using-other-libraries&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Ingest raw data&quot;,&quot;local&quot;:&quot;ingest-raw-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Scheduled ingestion&quot;,&quot;local&quot;:&quot;scheduled-ingestion&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;High frequency using Storage Buckets&quot;,&quot;local&quot;:&quot;high-frequency-using-storage-buckets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Near real-time using a CommitScheduler&quot;,&quot;local&quot;:&quot;near-real-time-using-a-commitscheduler&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Cron-based using Hugging Face Jobs&quot;,&quot;local&quot;:&quot;cron-based-using-hugging-face-jobs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md 
max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="ingesting-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ingesting-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 
88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Ingesting Datasets</span></h1> <p data-svelte-h="svelte-19j8ii">Data generally lives in databases or cloud storage in forms that are not suited for AI workflows.
Ingesting data to the <a href="https://huggingface.co/datasets" rel="nofollow">Hub</a> is a good way to publish them as AI-ready datasets, enabling easy and efficient data loading, processing and model training and evaluation.</p> <h2 class="relative group"><a id="using-huggingfacehub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-huggingfacehub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using huggingface_hub</span></h2> <p data-svelte-h="svelte-1f11oi5">The simplest way to ingest data is to simply upload the data files with <code>huggingface_hub</code>.</p> <p data-svelte-h="svelte-mspuv4">The <code>huggingface_hub</code> Python library provides a rich feature set that allows you to manage repositories, including creating repos and uploading datasets to the Hub. 
Visit <a href="/docs/huggingface_hub/index">the client library’s documentation</a> to learn more.</p> <p data-svelte-h="svelte-1c9ae9s">This is relevant if your data is static/frozen and if you can easily obtain a local dump of the data in a format supported by the Hub (e.g., Parquet or JSON Lines) with a usable structure (e.g., well-defined fields for training and evaluation).</p> <h2 class="relative group"><a id="using-dlt" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-dlt"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using dlt</span></h2> <p data-svelte-h="svelte-1r7cbkw"><a href="http://github.com/dlt-hub/dlt" rel="nofollow">dlt</a> is an open-source Python library for data movement (ETL), and is useful for developers (and their agents) building data pipelines.
It can ingest data from diverse source types:</p> <ul data-svelte-h="svelte-hhnxrv"><li>Cloud storage or files</li> <li>REST APIs</li> <li>SQL databases</li> <li>Python generators</li></ul> <p data-svelte-h="svelte-1e7pom2">Examples of source types:</p> <ul data-svelte-h="svelte-1h9ys38"><li><code>filesystem</code> (includes s3, gs, az, abff, etc.)</li> <li><code>sql_database</code>, <code>mongodb</code>, <code>google_sheets</code></li> <li><code>notion</code>, <code>hubspot</code>, <code>rest_api</code></li></ul> <p data-svelte-h="svelte-atr54z">Find your source type from the <a href="https://dlthub.com/docs/dlt-ecosystem/verified-sources" rel="nofollow">list of sources</a> and create your <code>dlt</code> project:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-title">dlt</span> init &lt;source-<span 
class="hljs-keyword">type</span>&gt; filesystem<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1i380f4">You can then create a configuration file <code>.dlt/secrets.toml</code> in the root of your dlt project to define the Hub as a filesystem destination for your datasets, based on the <code>hf://</code> protocol:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-section">[destination.filesystem]</span>
<span class="hljs-attr">bucket_url</span> = <span class="hljs-string">&quot;hf://datasets/&lt;namespace&gt;&quot;</span>
<span class="hljs-section">[destination.filesystem.credentials]</span>
<span class="hljs-attr">hf_token</span> = <span class="hljs-string">&quot;hf_...&quot;</span> <span class="hljs-comment"># Your Hugging Face Access Token</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-r0xdtx">The namespace should be your user name or the name of your organization/team where you want to ingest your dataset.</p> <p data-svelte-h="svelte-1tgis19">Then each dlt dataset creates or updates a Hugging Face dataset repository. The repository name is &lt;namespace&gt;/&lt;dataset_name&gt;, where &lt;namespace&gt; is the same one you used in the bucket_url (your organization or team), and &lt;dataset_name&gt; is the pipeline’s dataset_name.</p> <p data-svelte-h="svelte-13yd27t">Here is an example pipeline:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> dlt
<span class="hljs-keyword">import</span> requests

<span class="hljs-meta">@dlt.resource</span>
<span class="hljs-keyword">def</span> <span class="hljs-title function_">my_data</span>():
    <span class="hljs-comment"># One of the functions auto-generated by `dlt init` that you can customize,</span>
    <span class="hljs-comment"># or you can define your own python generator function.</span>
    <span class="hljs-comment"># Here is an example from the `chess` source type:</span>
    <span class="hljs-keyword">for</span> player <span class="hljs-keyword">in</span> [<span class="hljs-string">&#x27;magnuscarlsen&#x27;</span>, <span class="hljs-string">&#x27;rpragchess&#x27;</span>]:
        response = requests.get(<span class="hljs-string">f&#x27;https://api.chess.com/pub/player/<span class="hljs-subst">{player}</span>&#x27;</span>)
        response.raise_for_status()
        <span class="hljs-keyword">yield</span> response.json()

<span class="hljs-comment"># Requires bucket_url = &quot;hf://datasets/&lt;namespace&gt;&quot; in .dlt/secrets.toml</span>
pipeline = dlt.pipeline(
    pipeline_name=<span class="hljs-string">&quot;my_pipeline&quot;</span>,
    destination=<span class="hljs-string">&quot;filesystem&quot;</span>,
    dataset_name=<span class="hljs-string">&quot;dataset_name&quot;</span>,
)
pipeline.run(my_data())<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1wt7r0a">Customize the <code>dlt</code> resource to load the data you want and parse the fields you want to publish in your dataset, e.g. the text you need for training and evaluation.</p> <h2 class="relative group"><a id="using-other-libraries" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-other-libraries"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using other libraries</span></h2> <p data-svelte-h="svelte-saqsq8">Some libraries like <a href="/docs/datasets/index">🤗 Datasets</a>, <a href="./datasets-pandas">Pandas</a>, <a href="./datasets-polars">Polars</a>, <a href="./datasets-dask">Dask</a>, <a href="./datasets-duckdb">DuckDB</a>, <a href="./datasets-spark">Spark</a>, or <a href="./datasets-daft">Daft</a> can ingest data from various places to the Hub.
See the list of <a href="./datasets-libraries">Libraries supported by the Datasets Hub</a> for more information.</p> <h2 class="relative group"><a id="ingest-raw-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ingest-raw-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Ingest raw data</span></h2> <p data-svelte-h="svelte-uomm0s">If you are ingesting raw data that need further curation before being published as AI-ready datasets or if you need an S3-like experience, consider ingesting them to <a href="./storage-buckets">Hugging Face Storage Buckets</a>.</p> <h2 class="relative group"><a id="scheduled-ingestion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#scheduled-ingestion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Scheduled ingestion</span></h2> <p data-svelte-h="svelte-mqa2wp">There are some limitations when updating the same file on the Hub thousands of times.
For instance, you might want to ingest the generations of a running LLM inference server, live agent traces, or the logs of a running model training.
In such cases, uploading the data as a dataset on the Hub makes sense, but it can be hard to do properly.
The main reason is that you don’t want to version every update of your data because it’ll make the git repository unusable.</p> <p data-svelte-h="svelte-1cugixl">Three options are available:</p> <ul data-svelte-h="svelte-1dreynf"><li><strong>Use a Storage Bucket instead of a Dataset repository:</strong> <a href="/docs/hub/storage-buckets">Storage Buckets</a> offer an S3-like experience that allows updating files very frequently, since they are not based on git. Storage Buckets are especially useful for data that are not ready to be published as a dataset, e.g. data that are still evolving or that need more curation.</li> <li><strong>Use a CommitScheduler</strong>: The <code>CommitScheduler</code> in <code>huggingface_hub</code> offers near real-time ingestion to keep the git history of a Dataset repository manageable. It can be configured to do git commits at intervals defined in minutes.</li> <li><strong>Use Hugging Face Jobs to schedule ingestion scripts</strong>: Hugging Face Jobs provides a way to run and schedule python scripts on Hugging Face infrastructure. 
Schedule ingestion scripts to run at intervals defined using the Cron syntax.</li></ul> <h3 class="relative group"><a id="high-frequency-using-storage-buckets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#high-frequency-using-storage-buckets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>High frequency using Storage Buckets</span></h3> <p data-svelte-h="svelte-1x7qiqi">Contrary to Dataset repositories that are based on git, you can update files on Storage Buckets at very high rate, offering quasi real-time ingestion.</p> <p data-svelte-h="svelte-o68yp3">Use <code>batch_bucket_files()</code> in <code>huggingface_hub</code> to update files in a bucket:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> batch_bucket_files
<span class="hljs-keyword">import</span> os

<span class="hljs-keyword">def</span> <span class="hljs-title function_">update_bucket</span>(<span class="hljs-params">local_files</span>):
    destinations = [os.path.basename(local_file) <span class="hljs-keyword">for</span> local_file <span class="hljs-keyword">in</span> local_files]
    batch_bucket_files(bucket_id=<span class="hljs-string">&quot;username/bucket_name&quot;</span>, add=[(local_file, dst) <span class="hljs-keyword">for</span> local_file, dst <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(local_files, destinations)])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-flvvlh">Alternatively, you can append to files in a Bucket and <code>flush()</code> on every new item:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> hffs
<span class="hljs-keyword">import</span> json

<span class="hljs-keyword">with</span> hffs.<span class="hljs-built_in">open</span>(<span class="hljs-string">&quot;buckets/username/bucket_name/texts.jsonl&quot;</span>, <span class="hljs-string">&quot;a&quot;</span>) <span class="hljs-keyword">as</span> f:
    <span class="hljs-keyword">for</span> text <span class="hljs-keyword">in</span> live_texts_stream:
        f.write(json.dumps({<span class="hljs-string">&quot;text&quot;</span>: text}) + <span class="hljs-string">&quot;\n&quot;</span>)
        f.flush()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ujxmpl">The <code>HfFileSystem</code> is based on <code>fsspec</code>, which has a default blocksize of 5 MiB, so flushing only uploads the data once a full 5 MiB chunk of new data has been appended.
If you want to upload more often, lower <code>blocksize</code> in <code>hffs.open()</code> (e.g. <code>hffs.open(..., blocksize=100 * 2 ** 10)</code> for 100 KiB) or use <code>f.flush(force=True)</code>.</p> <p data-svelte-h="svelte-ak7yhy">Hugging Face storage is based on Xet, which enables efficient I/O when appending to files: uploads are deduplicated and only new data are uploaded.
Find more information on doing dynamic data ingestion in buckets in the <a href="/docs/hub/storage-buckets#uploading-files">buckets documentation on uploads</a> and in the <a href="./datasets-editing#only-upload-the-new-data">dataset editing documentation</a>.</p> <h3 class="relative group"><a id="near-real-time-using-a-commitscheduler" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#near-real-time-using-a-commitscheduler"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Near real-time using a CommitScheduler</span></h3> <p data-svelte-h="svelte-yo7vi">The idea is to run a background job that regularly pushes a local folder to the Hub. You want to save data to the Hub (potentially millions of entries), but you don’t need to save in real-time each user’s input. Instead, you can save the data locally in a JSON file and upload it every 10 minutes. 
For example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> json
<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> CommitScheduler
folder_path = <span class="hljs-string">&quot;path/to/files/to/ingest&quot;</span>
every = <span class="hljs-number">10</span> <span class="hljs-comment"># ingest every 10min</span>
<span class="hljs-keyword">with</span> CommitScheduler(repo_id=<span class="hljs-string">&quot;username/dataset_name&quot;</span>, repo_type=<span class="hljs-string">&quot;dataset&quot;</span>, folder_path=folder_path, every=every) <span class="hljs-keyword">as</span> scheduler:
    <span class="hljs-comment"># Write to the folder to ingest every 10min</span>
    <span class="hljs-comment"># For example:</span>
    <span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(folder_path + <span class="hljs-string">&quot;/texts.jsonl&quot;</span>, <span class="hljs-string">&quot;a&quot;</span>) <span class="hljs-keyword">as</span> f:
        f.write(json.dumps({<span class="hljs-string">&quot;text&quot;</span>: text}) + <span class="hljs-string">&quot;\n&quot;</span>)
...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-199zrt9">Check out how to ingest dynamic data without having to reupload everything every time in the documentation on <a href="./datasets-editing#only-upload-the-new-data">dataset editing</a>.</p> <p data-svelte-h="svelte-15dr6kx">Find more information on scheduled uploads in the <a href="/docs/huggingface_hub/guides/upload#scheduled-uploads">huggingface_hub documentation</a>.</p> <h3 class="relative group"><a id="cron-based-using-hugging-face-jobs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cron-based-using-hugging-face-jobs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Cron-based using Hugging Face Jobs</span></h3> <p data-svelte-h="svelte-1iqer2n">Use Hugging Face Jobs to run Python scripts that ingest data on a recurring (cron) schedule.</p> <p data-svelte-h="svelte-1ghgfim">For example, to run a script <code>ingest.py</code> every 5 minutes:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 
text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->hf <span class="hljs-built_in">jobs</span> scheduled uv run <span class="hljs-string">&quot;*/5 * * * *&quot;</span> ingest.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-e2mios">Declare the script dependencies <a href="https://docs.astral.sh/uv/guides/scripts/#declaring-script-dependencies" rel="nofollow">in the header of the script</a> or use <code>--with</code>.
For example, to run a <code>dlt</code> pipeline every day at midnight:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->hf <span class="hljs-built_in">jobs</span> scheduled uv run --with <span class="hljs-string">&quot;dlt[hf]&quot;</span> <span class="hljs-string">&quot;0 0 * * *&quot;</span> pipeline.py<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1t6fhxk">You can check the logs of every run using <code>hf jobs logs</code> or directly in the Jobs page on your account on Hugging Face.</p> <p data-svelte-h="svelte-138tgzt">Find more information about Hugging Face Jobs in the <a href="/docs/hub/jobs-overview">Jobs documentation</a>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/hub/datasets-ingesting.md" 
target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Client-side bootstrap for this page: publishes path settings on a global,
// then loads the two entry modules and calls kit.start() with this script's
// parent element.
{
// Assigned without a declaration, so in this classic (non-module) script the
// object becomes a global.
// NOTE(review): presumably read by the imported bundles — not visible here.
__sveltekit_ummgov = {
assets: "/docs/hub/pr_2437/en",
base: "/docs/hub/pr_2437/en",
env: {}
};
// Must be read synchronously, before the promise callback below runs:
// document.currentScript is only set while this script is executing.
const element = document.currentScript.parentElement;
// Two null placeholders, passed through unchanged as `data` to kit.start().
// NOTE(review): presumably one slot per entry in node_ids — confirm.
const data = [null,null];
// Load both entry modules in parallel; start only once both have resolved.
Promise.all([
import("/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js"),
import("/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js")
]).then(([kit, app]) => {
// Hand the app module, the captured element and the initial state to start().
// NOTE(review): the meaning of node_ids/form/error is defined by the imported
// module and is not visible in this file.
kit.start(app, element, {
node_ids: [0, 47],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
40.2 kB
·
Xet hash:
92155fce1636a95cec3babf5c8a82fd1c9e21afe4beef4e43dc3e87846c6119b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.