Buckets:

hf-doc-build/doc-dev / hub /pr_2437 /en /datasets-lance.html
HuggingFaceDocBuilder's picture
download
raw
56.9 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Lance&quot;,&quot;local&quot;:&quot;lance&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Getting Started&quot;,&quot;local&quot;:&quot;getting-started&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Why Lance?&quot;,&quot;local&quot;:&quot;why-lance&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Store all your data in one place&quot;,&quot;local&quot;:&quot;store-all-your-data-in-one-place&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Stream from the Hub with datasets&quot;,&quot;local&quot;:&quot;stream-from-the-hub-with-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Stream from the Hub with lance.dataset&quot;,&quot;local&quot;:&quot;stream-from-the-hub-with-lancedataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Work with binary assets&quot;,&quot;local&quot;:&quot;work-with-binary-assets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Write a subset to a new Lance dataset&quot;,&quot;local&quot;:&quot;write-a-subset-to-a-new-lance-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Create index&quot;,&quot;local&quot;:&quot;create-index&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Vector search&quot;,&quot;local&quot;:&quot;vector-search&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Dataset evolution&quot;,&quot;local&quot;:&quot;dataset-evolution&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Work with video blobs&quot;,&quot;local&quot;:&quot;work-with-video-blobs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Prepare data for training&quot;,&quot;local&quot;:&quot;prepare-data-for-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Explore more Lance datasets&quot;,&quot;local&quot;:&quot;explore-more-lance-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/hub/pr_2437/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/scheduler.258d2a4d.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/singletons.2d0c91e1.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.c8b82093.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/paths.d360121b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/preload-helper.a4507a26.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/index.421344fd.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/0.bedbef6e.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/nodes/48.ea3a94ba.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CopyLLMTxtMenu.9c2a67a1.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.9bb92958.js">
<link rel="modulepreload" href="/docs/hub/pr_2437/en/_app/immutable/chunks/CodeBlock.619ec4e3.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Lance&quot;,&quot;local&quot;:&quot;lance&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Getting Started&quot;,&quot;local&quot;:&quot;getting-started&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Why Lance?&quot;,&quot;local&quot;:&quot;why-lance&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Store all your data in one place&quot;,&quot;local&quot;:&quot;store-all-your-data-in-one-place&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Stream from the Hub with datasets&quot;,&quot;local&quot;:&quot;stream-from-the-hub-with-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Stream from the Hub with lance.dataset&quot;,&quot;local&quot;:&quot;stream-from-the-hub-with-lancedataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Work with binary assets&quot;,&quot;local&quot;:&quot;work-with-binary-assets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Write a subset to a new Lance dataset&quot;,&quot;local&quot;:&quot;write-a-subset-to-a-new-lance-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Create index&quot;,&quot;local&quot;:&quot;create-index&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Vector search&quot;,&quot;local&quot;:&quot;vector-search&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Dataset evolution&quot;,&quot;local&quot;:&quot;dataset-evolution&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Work with video blobs&quot;,&quot;local&quot;:&quot;work-with-video-blobs&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Prepare data for training&quot;,&quot;local&quot;:&quot;prepare-data-for-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Explore more Lance datasets&quot;,&quot;local&quot;:&quot;explore-more-lance-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg 
class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="lance" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#lance"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Lance</span></h1> <p data-svelte-h="svelte-14bf52v"><a href="https://lance.org" rel="nofollow">Lance</a> is an open multimodal lakehouse table format for AI. You can use Hugging Face paths (<code>hf://</code>) to access Lance datasets on the Hub. 
This lets you scan and search large datasets on the Hugging Face Hub without having to copy the entire dataset locally.</p> <div class="flex justify-center" data-svelte-h="svelte-7o5jb4"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/hub/hf_x_lance.png" alt="Hugging Face x Lance"></div> <h2 class="relative group"><a id="getting-started" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#getting-started"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Getting Started</span></h2> <p data-svelte-h="svelte-45a8gr">To get started, pip install <code>pylance</code> and <code>pyarrow</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install pylance pyarrow<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="why-lance" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#why-lance"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Why Lance?</span></h2> <ul data-svelte-h="svelte-lwvj0q"><li>Optimized for ML/AI workloads: Lance is a modern columnar format designed for fast random access without compromising scan performance, making it useful 
for search, analytics, training, feature engineering and many more use cases.</li> <li>Multimodal assets are stored as bytes, or binary objects (“<a href="https://lance.org/guide/blob/" rel="nofollow">blobs as files</a>”) in Lance alongside embeddings and traditional scalar data — this makes it easier to govern, share, and distribute your large datasets via the Hub.</li> <li>Indexing is a first-class citizen (native to the format itself): Lance comes with fast, on-disk, scalable <a href="https://lance.org/quickstart/vector-search" rel="nofollow">vector</a> and full-text search (FTS) indexes that sit right alongside the dataset on the Hub, so you can share not only your data but also your embeddings and indexes without your users needing to recompute them.</li> <li>Flexible schema and <a href="https://lance.org/guide/data_evolution" rel="nofollow">data evolution</a> let you incrementally add new features/columns (moderation tags, embeddings, etc.) <strong>without</strong> needing to rewrite the entire table.</li></ul> <h2 class="relative group"><a id="store-all-your-data-in-one-place" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#store-all-your-data-in-one-place"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 
56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Store all your data in one place</span></h2> <p data-svelte-h="svelte-1xqw2yy">In Lance, your multimodal data assets (images, audio, video) are stored as raw bytes alongside your scalar metadata and embeddings. This makes it easy to scan and filter your dataset in one place without needing to stitch together multiple storage systems.</p> <h2 class="relative group"><a id="stream-from-the-hub-with-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#stream-from-the-hub-with-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Stream from the Hub with datasets</span></h2> <p data-svelte-h="svelte-wp861n">Use <code>load_dataset(..., streaming=True)</code> to scan and iterate through the data without downloading it locally.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-comment"># Return as a Hugging Face dataset</span>
ds = load_dataset(
    <span class="hljs-string">&quot;lance-format/laion-1m&quot;</span>,
    split=<span class="hljs-string">&quot;train&quot;</span>,
    streaming=<span class="hljs-literal">True</span>
)
<span class="hljs-comment"># Take first three rows</span>
<span class="hljs-keyword">for</span> row <span class="hljs-keyword">in</span> ds.take(<span class="hljs-number">3</span>):
    <span class="hljs-built_in">print</span>(row[<span class="hljs-string">&quot;caption&quot;</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ikfo2g">Streaming is great for sampling metadata to understand what you have. For vector search or working with large binary blobs, you can use the Lance <code>dataset</code> API, explained below.</p> <blockquote class="warning" data-svelte-h="svelte-1c5z32m"><p>Streaming is fast for sampling simple scalar metadata but not as quick for embeddings or large multimodal assets. To work with large datasets, it’s recommended to scan the metadata, identify subsets of what you need, and download that portion of the dataset locally to avoid facing Hub rate limits:
<code>hf download lance-format/laion-1m --repo-type dataset --local-dir ./laion</code></p></blockquote> <h2 class="relative group"><a id="stream-from-the-hub-with-lancedataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#stream-from-the-hub-with-lancedataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Stream from the Hub with lance.dataset</span></h2> <p data-svelte-h="svelte-1akbb7">You can also scan a Lance dataset that’s stored on the Hugging Face Hub using the <code>hf://</code> path specifier. This scans the remote dataset without requiring that you download it locally. 
Using the Lance <code>dataset</code> API, it’s very simple to set limits, filters and projections to only fetch the data you need.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> lance
<span class="hljs-comment"># Return as a Lance dataset</span>
ds = lance.dataset(<span class="hljs-string">&quot;hf://datasets/lance-format/laion-1m/data/train.lance&quot;</span>)
scanner = ds.scanner(
    columns=[<span class="hljs-string">&quot;caption&quot;</span>, <span class="hljs-string">&quot;url&quot;</span>, <span class="hljs-string">&quot;similarity&quot;</span>],
    limit=<span class="hljs-number">5</span>
)
rows = scanner.to_table().to_pylist()
<span class="hljs-keyword">for</span> row <span class="hljs-keyword">in</span> rows:
    <span class="hljs-built_in">print</span>(row)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="work-with-binary-assets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#work-with-binary-assets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Work with binary assets</span></h2> <p data-svelte-h="svelte-4gytqq">The example below shows how images are retrieved from a Lance dataset as raw JPEG bytes in the <code>image</code> column, and used downstream.
Use <code>ds.take</code> to fetch the bytes and write them to disk so you can use them elsewhere.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> lance
<span class="hljs-keyword">from</span> pathlib <span class="hljs-keyword">import</span> Path
ds = lance.dataset(<span class="hljs-string">&quot;hf://datasets/lance-format/laion-1m/data/train.lance&quot;</span>)
dir_name = <span class="hljs-string">&quot;laion_samples&quot;</span>
Path(dir_name).mkdir(exist_ok=<span class="hljs-literal">True</span>)
rows = ds.take([<span class="hljs-number">0</span>, <span class="hljs-number">1</span>], columns=[<span class="hljs-string">&quot;image&quot;</span>, <span class="hljs-string">&quot;caption&quot;</span>]).to_pylist()
<span class="hljs-keyword">for</span> idx, row <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(rows):
    <span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(<span class="hljs-string">f&quot;<span class="hljs-subst">{dir_name}</span>/<span class="hljs-subst">{idx}</span>.jpg&quot;</span>, <span class="hljs-string">&quot;wb&quot;</span>) <span class="hljs-keyword">as</span> f:
        f.write(row[<span class="hljs-string">&quot;image&quot;</span>])
    <span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Wrote image with caption: <span class="hljs-subst">{row[<span class="hljs-string">&#x27;caption&#x27;</span>]}</span>&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="write-a-subset-to-a-new-lance-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#write-a-subset-to-a-new-lance-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Write a subset to a new Lance dataset</span></h2> <p data-svelte-h="svelte-2wa2ax">Working with large datasets? 
It’s simple to run a filtered scan to select a subset of rows from the Hub and materialize them into a local Lance dataset.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> lance
ds = lance.dataset(<span class="hljs-string">&quot;hf://datasets/lance-format/laion-1m/data/train.lance&quot;</span>)
scanner = ds.scanner(
    columns=[<span class="hljs-string">&quot;image&quot;</span>, <span class="hljs-string">&quot;caption&quot;</span>, <span class="hljs-string">&quot;width&quot;</span>, <span class="hljs-string">&quot;height&quot;</span>],
    <span class="hljs-built_in">filter</span>=<span class="hljs-string">&quot;width &gt;= 200 AND height &gt;= 100&quot;</span>,
    limit=<span class="hljs-number">10</span>,
)
subset = scanner.to_table()
lance.write_dataset(subset, <span class="hljs-string">&quot;./laion_subset&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="create-index" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#create-index"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Create index</span></h2> <p data-svelte-h="svelte-a85jui">If your dataset doesn’t already have an index associated with it, you can create one after downloading it locally.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" 
transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># ds is a local Lance dataset</span>
ds.create_index(
<span class="hljs-string">&quot;img_emb&quot;</span>,
index_type=<span class="hljs-string">&quot;IVF_PQ&quot;</span>,
num_partitions=<span class="hljs-number">256</span>,
num_sub_vectors=<span class="hljs-number">96</span>,
replace=<span class="hljs-literal">True</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ww6bwc">See the <a href="https://lance.org/quickstart/vector-search/" rel="nofollow">Lance docs</a> on vector index creation for a more detailed example. Once you have a vector index created, you can run similarity search on the data via embeddings.</p> <h2 class="relative group"><a id="vector-search" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#vector-search"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Vector search</span></h2> <p data-svelte-h="svelte-1r8hbv9">Because indexes are first-class citizens in Lance, you can store not only your data but also your embeddings and indexes together and query them <strong>directly on the Hub</strong>. Simply use the <code>describe_indices()</code> method to list the index information for the dataset. 
If an index doesn’t exist in the dataset, you can use <code>lance.write_dataset()</code> to write a local version of the dataset and use <a href="https://lance-format.github.io/lance-python-doc/all-modules.html#lance.dataset.LanceDataset.create_index" rel="nofollow">LanceDataset.create_index</a> to create an index for your needs.</p> <p data-svelte-h="svelte-6j6kd2">The example below shows a dataset for which we already have a vector index on the <code>img_emb</code> field:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> lance
ds = lance.dataset(<span class="hljs-string">&quot;hf://datasets/lance-format/laion-1m/data/train.lance&quot;</span>)
<span class="hljs-built_in">print</span>(ds.describe_indices())
<span class="hljs-comment"># Returns</span>
<span class="hljs-comment"># [</span>
<span class="hljs-comment"># IndexDescription(</span>
<span class="hljs-comment"># name=img_emb_idx,</span>
<span class="hljs-comment"># type_url=/lance.table.VectorIndexDetails,</span>
<span class="hljs-comment"># num_rows_indexed=1209588,</span>
<span class="hljs-comment"># fields=[15],</span>
<span class="hljs-comment"># field_names=[&quot;img_emb&quot;],</span>
<span class="hljs-comment"># num_segments=1</span>
<span class="hljs-comment"># )</span>
<span class="hljs-comment"># ]</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1753vns">You can run vector search queries directly on the remote dataset without downloading it (or, if you prefer, download the dataset locally and create a new index). The example below shows how to run a nearest neighbor search on a vector index using an image embedding as the query vector.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> lance
<span class="hljs-keyword">import</span> pyarrow <span class="hljs-keyword">as</span> pa
ds = lance.dataset(<span class="hljs-string">&quot;hf://datasets/lance-format/laion-1m/data/train.lance&quot;</span>)
emb_field = ds.schema.field(<span class="hljs-string">&quot;img_emb&quot;</span>)
ref = ds.take([<span class="hljs-number">0</span>], columns=[<span class="hljs-string">&quot;img_emb&quot;</span>]).to_pylist()[<span class="hljs-number">0</span>][<span class="hljs-string">&quot;img_emb&quot;</span>]
query = pa.array([ref], <span class="hljs-built_in">type</span>=emb_field.<span class="hljs-built_in">type</span>)
neighbors = ds.scanner(
nearest={
<span class="hljs-string">&quot;column&quot;</span>: emb_field.name,
<span class="hljs-string">&quot;q&quot;</span>: query[<span class="hljs-number">0</span>],
<span class="hljs-string">&quot;k&quot;</span>: <span class="hljs-number">6</span>,
<span class="hljs-string">&quot;nprobes&quot;</span>: <span class="hljs-number">16</span>,
<span class="hljs-string">&quot;refine_factor&quot;</span>: <span class="hljs-number">30</span>,
},
columns=[<span class="hljs-string">&quot;caption&quot;</span>, <span class="hljs-string">&quot;url&quot;</span>, <span class="hljs-string">&quot;similarity&quot;</span>],
).to_table().to_pylist()<!-- HTML_TAG_END --></pre></div> <blockquote class="note" data-svelte-h="svelte-1pvmolz"><p>Setting a large <code>k</code> or <code>nprobes</code> value, or sending a large batch of queries all at once can hit Hub rate limits. For heavy usage, download the dataset (or a subset of it) locally and point Lance at the local path to avoid throttling.</p></blockquote> <h2 class="relative group"><a id="dataset-evolution" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#dataset-evolution"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Dataset evolution</span></h2> <p data-svelte-h="svelte-17bvvwe">One of Lance’s most powerful features is flexible, zero-cost data evolution, meaning that you can effortlessly add derived columns <strong>without</strong> rewriting the original table. For very large tables with a lot of large blobs, the savings in I/O can be quite significant. 
This feature is very relevant if you’re experimenting with your data for ML/AI engineering tasks and you frequently find yourself adding new features, embeddings, or derived metadata.</p> <p data-svelte-h="svelte-19o5g06">The example below shows how to add a derived <code>moderation_label</code> column that marks an image as <code>NSFW</code> based on an existing score column. When you make this change, backfilling the new column <strong>only</strong> writes the new column data, without touching the original image blobs or data in other columns. You can also choose to just add the new column schema without backfilling any data.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> lance
<span class="hljs-keyword">import</span> pyarrow <span class="hljs-keyword">as</span> pa
<span class="hljs-comment"># Assumes you ran the export to Lance example above to store a local subset of the data</span>
local_ds = lance.dataset(<span class="hljs-string">&quot;./laion_subset&quot;</span>)
<span class="hljs-comment"># Option 1: schema only (data to be added later)</span>
local_ds.add_columns(pa.field(<span class="hljs-string">&quot;moderation_label&quot;</span>, pa.string()))
<span class="hljs-comment"># Option 2: with data backfill (use instead of option 1, since the column name can only be added once)</span>
local_ds.add_columns(
{
<span class="hljs-string">&quot;moderation_label&quot;</span>: <span class="hljs-string">&quot;case WHEN \&quot;NSFW\&quot; &gt; 0.5 THEN &#x27;review&#x27; ELSE &#x27;ok&#x27; END&quot;</span>
}
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lke28k">See the Lance docs on <a href="https://lance.org/guide/data_evolution/" rel="nofollow">data evolution</a> to learn how to alter and drop columns in Lance datasets.</p> <h2 class="relative group"><a id="work-with-video-blobs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#work-with-video-blobs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Work with video blobs</span></h2> <p data-svelte-h="svelte-ttzfn7">Lance tables also support large inline video blobs. The <code>OpenVid-1M</code> dataset (from <a href="https://arxiv.org/abs/2407.02371" rel="nofollow">this paper</a>) contains high-quality, expressive videos and their captions. 
The video data is stored in the <code>video_blob</code> column of the following Lance dataset on the Hub.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> lance
lance_ds = lance.dataset(<span class="hljs-string">&quot;hf://datasets/lance-format/Openvid-1M/data/train.lance&quot;</span>)
blob_file = lance_ds.take_blobs(<span class="hljs-string">&quot;video_blob&quot;</span>, ids=[<span class="hljs-number">0</span>])[<span class="hljs-number">0</span>]
video_bytes = blob_file.read()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-135ny8x">Unlike other data formats, large multimodal binary objects (blobs) are first-class citizens in Lance. The <a href="https://lance.org/guide/blob/" rel="nofollow">blob API</a> provides a high-level API to store and retrieve large blobs in Lance datasets. The following example shows how to efficiently browse metadata without loading the heavier video blobs, then fetch the relevant video blobs on demand.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> lance
ds = lance.dataset(<span class="hljs-string">&quot;hf://datasets/lance-format/Openvid-1M/data/train.lance&quot;</span>)
<span class="hljs-comment"># 1. Browse metadata without loading video blobs.</span>
metadata = ds.scanner(
columns=[<span class="hljs-string">&quot;caption&quot;</span>, <span class="hljs-string">&quot;aesthetic_score&quot;</span>],
<span class="hljs-built_in">filter</span>=<span class="hljs-string">&quot;aesthetic_score &gt;= 4.5&quot;</span>,
limit=<span class="hljs-number">2</span>,
).to_table().to_pylist()
<span class="hljs-comment"># 2. Fetch a single video blob by row index.</span>
selected_index = <span class="hljs-number">0</span>
blob_file = ds.take_blobs(<span class="hljs-string">&quot;video_blob&quot;</span>, ids=[selected_index])[<span class="hljs-number">0</span>]
<span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(<span class="hljs-string">&quot;video_0.mp4&quot;</span>, <span class="hljs-string">&quot;wb&quot;</span>) <span class="hljs-keyword">as</span> f:
f.write(blob_file.read())<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="prepare-data-for-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#prepare-data-for-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Prepare data for training</span></h2> <p data-svelte-h="svelte-19fon41">Training is another area where Lance’s fast random access and scan performance can be useful. 
You can use Lance datasets as the storage mechanism for your training data, shuffling it and loading into batches as part of your training pipelines.</p> <p data-svelte-h="svelte-jxa5q7">The blob API in Lance is compatible with <code>torchcodec</code>, so you can easily decode video blobs as <code>torch</code> tensors:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torchcodec.decoders <span class="hljs-keyword">import</span> VideoDecoder
decoder = VideoDecoder(blob_file)
tensor = decoder[<span class="hljs-number">0</span>] <span class="hljs-comment"># uint8 tensor of shape [C, H, W]</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3vnbqh">See the <a href="https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.VideoDecoder.html" rel="nofollow">torchcodec docs</a> for more functions for efficiently decoding videos.</p> <p data-svelte-h="svelte-18wz79h">In addition, you can also check out the <a href="https://lance.org/examples/python/clip_training/" rel="nofollow">Lance documentation</a> for more examples on loading image data into <code>torchvision</code> for training your own image models.</p> <h2 class="relative group"><a id="explore-more-lance-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#explore-more-lance-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Explore more Lance datasets</span></h2> <p data-svelte-h="svelte-5n1m4v">Lance is an open format with native support for multimodal blobs alongside your traditional tabular data.
With the Hugging Face Hub integration, you can easily work with images, audio, video, text, embeddings, and
scalar metadata all in one place.</p> <p data-svelte-h="svelte-iletk7">Explore more Lance datasets on the <a href="https://huggingface.co/datasets?format=format:lance" rel="nofollow">Hugging Face Hub</a>,
and share your own Lance datasets with others in the community!
You can visit <a href="https://lance.org/integrations/huggingface/" rel="nofollow">lance.org</a> for more code snippets and examples.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/hub/datasets-lance.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Client-side bootstrap for this page: publishes path configuration on a
// global object, then loads the two entry modules and calls start() on the
// first one. Presumably the SvelteKit hydration entry point — TODO confirm.
{
// Assigned without a declaration keyword, so this becomes an implicit global.
// NOTE(review): the "ummgov" suffix looks build-generated; presumably the
// imported bundles look the object up under this exact name — confirm before
// renaming.
__sveltekit_ummgov = {
assets: "/docs/hub/pr_2437/en",
base: "/docs/hub/pr_2437/en",
env: {}
};
// Must be read synchronously: document.currentScript is only set while this
// script is executing, not inside the later promise callback.
const element = document.currentScript.parentElement;
// Two null entries, matching the two ids in node_ids below — presumably one
// data slot per node; verify against the start() contract.
const data = [null,null];
// Load both entry modules in parallel; the callback runs once both resolve.
Promise.all([
import("/docs/hub/pr_2437/en/_app/immutable/entry/start.5e3ed1fd.js"),
import("/docs/hub/pr_2437/en/_app/immutable/entry/app.93e7704b.js")
]).then(([kit, app]) => {
// Hand the app module, the script's parent element, and the initial state to
// the start module.
kit.start(app, element, {
node_ids: [0, 48],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
56.9 kB
·
Xet hash:
a5c3fd459a0cb21a855ad213b63779de6cb5b69af11da010c7caeede9c311aea

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.