Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content='{"title":"Xorb Formation &amp; Serialization Format","local":"xorb-formation--serialization-format","sections":[{"title":"Collecting Chunks","local":"collecting-chunks","sections":[],"depth":2},{"title":"Xorb Format","local":"xorb-format","sections":[{"title":"Chunk Addressing","local":"chunk-addressing","sections":[],"depth":3}],"depth":2},{"title":"Chunk Format","local":"chunk-format","sections":[{"title":"Chunk Header Structure","local":"chunk-header-structure","sections":[{"title":"Chunk Header Layout","local":"chunk-header-layout","sections":[],"depth":4}],"depth":3},{"title":"Chunk Compression Schemes","local":"chunk-compression-schemes","sections":[{"title":"Byte Grouping LZ4 Compression","local":"byte-grouping-lz4-compression","sections":[],"depth":4},{"title":"Chunk Data","local":"chunk-data","sections":[],"depth":4}],"depth":3},{"title":"Picking a Compression Scheme","local":"picking-a-compression-scheme","sections":[{"title":"Example Chunk Serialization","local":"example-chunk-serialization","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Xorb Format Sample","local":"xorb-format-sample","sections":[],"depth":2}],"depth":1}'> | |
| <link href="/docs/xet/pr_2272/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/entry/start.7209fe0c.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/scheduler.de5597d1.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/singletons.800e7c38.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/paths.d1eab31c.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/entry/app.54268a84.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/preload-helper.1b7d12f9.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/index.f8bac2c1.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/nodes/0.097c0756.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/nodes/13.124b5500.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.4e1a23d9.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/CodeBlock.d8c0c481.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Xorb Formation & Serialization Format","local":"xorb-formation--serialization-format","sections":[{"title":"Collecting Chunks","local":"collecting-chunks","sections":[],"depth":2},{"title":"Xorb Format","local":"xorb-format","sections":[{"title":"Chunk Addressing","local":"chunk-addressing","sections":[],"depth":3}],"depth":2},{"title":"Chunk Format","local":"chunk-format","sections":[{"title":"Chunk Header Structure","local":"chunk-header-structure","sections":[{"title":"Chunk Header Layout","local":"chunk-header-layout","sections":[],"depth":4}],"depth":3},{"title":"Chunk Compression Schemes","local":"chunk-compression-schemes","sections":[{"title":"Byte Grouping LZ4 Compression","local":"byte-grouping-lz4-compression","sections":[],"depth":4},{"title":"Chunk Data","local":"chunk-data","sections":[],"depth":4}],"depth":3},{"title":"Picking a Compression Scheme","local":"picking-a-compression-scheme","sections":[{"title":"Example Chunk Serialization","local":"example-chunk-serialization","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Xorb Format Sample","local":"xorb-format-sample","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 
max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="xorb-formation--serialization-format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#xorb-formation--serialization-format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 
0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Xorb Formation & Serialization Format</span></h1> <p data-svelte-h="svelte-1xhm4jg">A “Xorb” (Xet Orb, pronounced like “zorb”) is a sequence of chunks and a serialization format for a series of chunks.</p> <h2 class="relative group"><a id="collecting-chunks" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#collecting-chunks"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Collecting Chunks</span></h2> <p data-svelte-h="svelte-1ightrg">Using the chunking algorithm a file is mapped to a series of chunks, once those chunks are found, they need to be collected into collections of Xorbs.</p> <p data-svelte-h="svelte-1nudf0q">It is advantageous to collect series of chunks in Xorbs such that they can be referred to as a whole range of chunks.</p> <p 
data-svelte-h="svelte-115qz2m">Suppose a file is chunked into chunks A, B, C, D in the order ABCD. Then create a Xorb X1 with chunks A, B, C, D in this order (starting at chunk index 0), let’s say this Xorb’s hash is X1. Then to reconstruct the file we ask for Xorb X1 chunk range <code>[0, 4)</code>.</p> <p data-svelte-h="svelte-g03n5d">While there’s no explicit limit on the number of chunks in a Xorb, there is a limit of 64MiB on the total size of the Xorb as serialized. | |
| Since some chunks will get compressed, it is generally advised to collect chunks until their total uncompressed length is near 64 MiB, then serialize the Xorb. | |
| Namely, Xorbs point to roughly 64 MiB worth of data. | |
| (Recall that the target chunk size is 64 KiB so expect roughly ~1024 chunks per Xorb).</p> <p data-svelte-h="svelte-f99jh1">The CAS server will reject Xorb uploads that exceed the 64 MiB serialized size limit.</p> <p data-svelte-h="svelte-6oqo7d">It is RECOMMENDED to pack chunks from multiple files into a Xorb if the size requirements allow, i.e. file X and Y both produced 10 new chunks each totalling a total of ~128000 bytes, then all those chunks can fit in a new Xorb.</p> <h2 class="relative group"><a id="xorb-format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#xorb-format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Xorb Format</span></h2> <p data-svelte-h="svelte-1ru2cqe">A Xorb is a series of “Chunks” that is serialized according to a specific format that enables accessing chunks of ranges and builds in chunk level compression.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" 
type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->┌─────────┬─────────────────────────────────┬─────────┬─────────────────────────────────┬─────────┬─────────────────────────────────┬────────── | |
| │  Chunk  │                                 │  Chunk  │                                 │  Chunk  │                                 │ | |
| │ Header  │      Compressed Chunk Data      │ Header  │      Compressed Chunk Data      │ Header  │      Compressed Chunk Data      │   ... | |
| │         │                                 │         │                                 │         │                                 │ | |
| └─────────┴─────────────────────────────────┴─────────┴─────────────────────────────────┴─────────┴─────────────────────────────────┴─────────── | |
| │ Chunk 0 │ Chunk 1 │ Chunk 2 │ ...<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="chunk-addressing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chunk-addressing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chunk Addressing</span></h3> <p data-svelte-h="svelte-155ojg1">Each chunk has an index within the Xorb it is in, starting at 0. | |
| Chunks can be addressed individually by their index but are usually addressed or fetched as ranges. | |
| Chunk ranges are always specified start inclusive and end exclusive i.e. <code>[start, end)</code>.</p> <h2 class="relative group"><a id="chunk-format" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chunk-format"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chunk Format</span></h2> <p data-svelte-h="svelte-l7kwob">A chunk consists of a header followed by compressed data. 
The header contains metadata about the chunk, particularly the compression scheme required to know how to deserialize the chunk.</p> <h3 class="relative group"><a id="chunk-header-structure" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chunk-header-structure"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chunk Header Structure</span></h3> <p data-svelte-h="svelte-1y94qrv">The chunk header is serialized as follows:</p> <ul data-svelte-h="svelte-bnf6vn"><li><strong>Version</strong> (1 byte): Protocol version, currently <code>0</code></li> <li><strong>Compressed Size</strong> (3 bytes): Size of data after compression as a 3 byte little-endian unsigned integer.</li> <li><strong>Compression Type</strong> (1 byte): Algorithm used for compression (See mapping below)</li> <li><strong>Uncompressed Size</strong> (3 bytes): Size of raw chunk data (before compression) as a 3 byte little-endian unsigned integer.</li></ul> <p data-svelte-h="svelte-q762np">Both Compressed and Uncompressed Size can fit in a 3 byte integer, given that that a raw uncompressed chunk can be 128KiB at most, | |
| requiring 18 binary digits to represent. | |
| If utilizing the intended compression scheme results in a larger compressed chunk then the chunk SHOULD be stored uncompressed with | |
| the uncompressed size also being at a maximum of 128KiB.</p> <h4 class="relative group"><a id="chunk-header-layout" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chunk-header-layout"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chunk Header Layout</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal 
shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->┌─────────┬─────────────────────────────────┬──────────────┬─────────────────────────────────┐ | |
| │ Version │         Compressed Size         │ Compression  │        Uncompressed Size        │ | |
| │ 1 byte  │             3 bytes             │     Type     │             3 bytes             │ | |
| │         │         (little-endian)         │    1 byte    │         (little-endian)         │ | |
| └─────────┴─────────────────────────────────┴──────────────┴─────────────────────────────────┘ | |
| 0 1 4 5 8<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="chunk-compression-schemes" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chunk-compression-schemes"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chunk Compression Schemes</span></h3> <table data-svelte-h="svelte-1hmbloz"><thead><tr><th>Value</th> <th>Name</th> <th>Description</th></tr></thead> <tbody><tr><td><code>0</code></td> <td><code>None</code></td> <td>No compression - data is stored as-is</td></tr> <tr><td><code>1</code></td> <td><code>LZ4</code></td> <td>Standard LZ4 compression</td></tr> <tr><td><code>2</code></td> <td><code>ByteGrouping4LZ4</code></td> <td>Byte grouping with 4-byte groups followed by LZ4 compression. 
Optimized for floating-point and other structured data where grouping bytes by position improves compression ratios</td></tr></tbody></table> <h4 class="relative group"><a id="byte-grouping-lz4-compression" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#byte-grouping-lz4-compression"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Byte Grouping LZ4 Compression</span></h4> <p data-svelte-h="svelte-wuw71k">Byte grouping LZ4 compression is an optimization technique that improves compression ratios for structured data like floating-point numbers, integers, and other data types where values have similar byte patterns at specific positions.</p> <ol data-svelte-h="svelte-g16cdg"><li><p><strong>Byte Grouping Phase</strong>: The input data is reorganized by grouping bytes by their position within each 4-byte groups: | |
| Create 4 buffers, for each 4 bytes of the chunk data (B1, B2, B3, B4) append each byte to their respective group i.e. in order from 1 to 4. Then concatenate the groups in order (1, 2, 3, 4).</p> <p>Example:</p> <ul><li>Original data: <code>[A1, A2, A3, A4, B1, B2, B3, B4, C1, C2, C3, C4, ...]</code></li> <li>Grouped data: <code>[A1, B1, C1, ..., A2, B2, C2, ..., A3, B3, C3, ..., A4, B4, C4, ...]</code></li></ul> <p>If the total number of bytes in the chunk is not a multiple of 4, append the remaining bytes following the pattern (1 byte to each group) to the first 1-3 groups until there are no more bytes left in the chunk.</p></li> <li><p><strong>LZ4 Compression</strong>: The grouped data is then compressed using standard LZ4 compression.</p></li></ol> <h4 class="relative group"><a id="chunk-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chunk-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chunk Data</span></h4> <p data-svelte-h="svelte-8oqpyr">Following the header is the compressed data block, exactly <code>compressed_size</code> bytes long.</p> <h3 class="relative group"><a id="picking-a-compression-scheme" 
class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#picking-a-compression-scheme"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Picking a Compression Scheme</span></h3> <p data-svelte-h="svelte-1wg0kcz">Picking the chunk compression scheme for the Xorb is a task left to the client when uploading the Xorb. | |
| The goal is to minimize the overall size of the Xorb for faster transmission at the cost of resources to decompress a chunk on the receiving end.</p> <p data-svelte-h="svelte-monlun">When picking a compression scheme for the chunk there are a number of strategies and implementors MAY make their own decisions as to how to pick a compression scheme. | |
| Note that a Xorb MAY contain chunks that utilize different compression schemes.</p> <ol data-svelte-h="svelte-1k3vtlh"><li><p><strong>Brute Force</strong></p> <p>Try all possible compression schemes, pick the best one. | |
| The best one MAY be the one producing the smallest compressed chunk or the fastest to decompress.</p></li> <li><p><strong>Best Effort Prediction</strong></p> <p>In <code>xet-core</code>, to predict if BG4 will be useful we compute the maximum KL divergence between the distribution of per-byte pop-counts on a sample of each of the 4 groups that would be formed. | |
| You can read more about it in <a href="https://github.com/huggingface/xet-core/blob/main/xorb_object/src/byte_grouping/bg4_prediction.rs" rel="nofollow">bg4_prediction.rs</a> and accompanying scripts.</p> <p>If the predictor does not show that BG4 will be better, we use Lz4 and in either case we will store the chunk as the uncompressed version if the compression scheme used does not show any benefit.</p></li></ol> <h4 class="relative group"><a id="example-chunk-serialization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#example-chunk-serialization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Example Chunk Serialization</span></h4> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->VERSION = <span class="hljs-number">0</span> | |
| buffer = <span class="hljs-built_in">bytes</span>() | |
| <span class="hljs-keyword">for</span> chunk <span class="hljs-keyword">in</span> xorb.chunks: | |
| uncompressed_length = <span class="hljs-built_in">len</span>(chunk) | |
| compressed, compression_scheme = pick_compression_scheme_and_compress(chunk) | |
| header = Header(VERSION, <span class="hljs-built_in">len</span>(compressed), compression_scheme, uncompressed_length) | |
| buffer.write(header) | |
| buffer.write(compressed)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="xorb-format-sample" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#xorb-format-sample"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Xorb Format Sample</span></h2> <p data-svelte-h="svelte-clkmen">For a sample of a serialized xorb object see <a href="https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb" rel="nofollow">eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632.xorb</a>. | |
| The hash of this xorb is <code>eea25d6ee393ccae385820daed127b96ef0ea034dfb7cf6da3a950ce334b7632</code> and it is composed of chunks from file <a href="https://huggingface.co/datasets/xet-team/xet-spec-reference-files/blob/main/Electric_Vehicle_Population_Data_20250917.csv" rel="nofollow">Electric_Vehicle_Population_Data_20250917.csv</a>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/xet/xorb.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
// Client bootstrap for this page: publishes path settings on a global,
// loads the two entry modules, then calls kit.start() on this script's
// parent element.
| { | |
// Assigned without const/let/var, so in a classic (non-module) script this
// becomes a global. assets and base hold the same path prefix; env is empty.
// NOTE(review): presumably read by the imported bundles — not visible here, confirm.
| __sveltekit_1t7h0ns = { | |
| assets: "/docs/xet/pr_2272/en", | |
| base: "/docs/xet/pr_2272/en", | |
| env: {} | |
| }; | |
// document.currentScript is only set while the script executes synchronously,
// so the parent element is captured here rather than inside the .then() callback.
| const element = document.currentScript.parentElement; | |
// Two entries, the same length as node_ids below.
// NOTE(review): presumably one data slot per node — confirm.
| const data = [null,null]; | |
// Both entry modules are requested together; the callback runs only after
// both have resolved. There is no rejection handler on this chain.
| Promise.all([ | |
| import("/docs/xet/pr_2272/en/_app/immutable/entry/start.7209fe0c.js"), | |
| import("/docs/xet/pr_2272/en/_app/immutable/entry/app.54268a84.js") | |
| ]).then(([kit, app]) => { | |
// Hands the app module, the captured element and the page options to start().
| kit.start(app, element, { | |
| node_ids: [0, 13], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 36.6 kB
- Xet hash:
- 2244977e7ca03e910dad94aaefaa1f89543ec67954088abe61999b711b790e5c
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.