Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content='{"title":"Xet Chunk-Level Deduplication Specification","local":"xet-chunk-level-deduplication-specification","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"Core Concepts","local":"core-concepts","sections":[{"title":"Chunks","local":"chunks","sections":[],"depth":3},{"title":"Xorbs","local":"xorbs","sections":[],"depth":3},{"title":"Shards (Xorb Lists)","local":"shards-xorb-lists","sections":[],"depth":3},{"title":"CAS (Content Addressable Storage)","local":"cas-content-addressable-storage","sections":[],"depth":3}],"depth":2},{"title":"Deduplication Procedure","local":"deduplication-procedure","sections":[{"title":"1. File Processing and Chunking","local":"1-file-processing-and-chunking","sections":[],"depth":3},{"title":"2. Multi-Level Deduplication Strategy","local":"2-multi-level-deduplication-strategy","sections":[{"title":"Level 1: Local Session Deduplication","local":"level-1-local-session-deduplication","sections":[],"depth":4},{"title":"Level 2: Cached Metadata Deduplication","local":"level-2-cached-metadata-deduplication","sections":[],"depth":4},{"title":"Level 3: Global Deduplication API","local":"level-3-global-deduplication-api","sections":[],"depth":4}],"depth":3},{"title":"3. Global Deduplication Process","local":"3-global-deduplication-process","sections":[{"title":"Eligibility Criteria","local":"eligibility-criteria","sections":[],"depth":4},{"title":"Query Process","local":"query-process","sections":[],"depth":4},{"title":"HMAC Security Mechanism","local":"hmac-security-mechanism","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Technical Implementation Details","local":"technical-implementation-details","sections":[{"title":"Chunk Hash Computation","local":"chunk-hash-computation","sections":[],"depth":3},{"title":"Xorb Formation","local":"xorb-formation","sections":[],"depth":3},{"title":"File Reconstruction Information","local":"file-reconstruction-information","sections":[],"depth":3}],"depth":2},{"title":"Fragmentation Prevention","local":"fragmentation-prevention","sections":[],"depth":2},{"title":"Conclusion","local":"conclusion","sections":[],"depth":2}],"depth":1}'> | |
| <link href="/docs/xet/pr_2272/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/entry/start.7209fe0c.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/scheduler.de5597d1.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/singletons.800e7c38.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/paths.d1eab31c.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/entry/app.54268a84.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/preload-helper.1b7d12f9.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/index.f8bac2c1.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/nodes/0.097c0756.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/nodes/5.bc2bade2.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.4e1a23d9.js"> | |
| <link rel="modulepreload" href="/docs/xet/pr_2272/en/_app/immutable/chunks/MermaidChart.5e4e5200.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Xet Chunk-Level Deduplication Specification","local":"xet-chunk-level-deduplication-specification","sections":[{"title":"Overview","local":"overview","sections":[],"depth":2},{"title":"Core Concepts","local":"core-concepts","sections":[{"title":"Chunks","local":"chunks","sections":[],"depth":3},{"title":"Xorbs","local":"xorbs","sections":[],"depth":3},{"title":"Shards (Xorb Lists)","local":"shards-xorb-lists","sections":[],"depth":3},{"title":"CAS (Content Addressable Storage)","local":"cas-content-addressable-storage","sections":[],"depth":3}],"depth":2},{"title":"Deduplication Procedure","local":"deduplication-procedure","sections":[{"title":"1. File Processing and Chunking","local":"1-file-processing-and-chunking","sections":[],"depth":3},{"title":"2. Multi-Level Deduplication Strategy","local":"2-multi-level-deduplication-strategy","sections":[{"title":"Level 1: Local Session Deduplication","local":"level-1-local-session-deduplication","sections":[],"depth":4},{"title":"Level 2: Cached Metadata Deduplication","local":"level-2-cached-metadata-deduplication","sections":[],"depth":4},{"title":"Level 3: Global Deduplication API","local":"level-3-global-deduplication-api","sections":[],"depth":4}],"depth":3},{"title":"3. 
Global Deduplication Process","local":"3-global-deduplication-process","sections":[{"title":"Eligibility Criteria","local":"eligibility-criteria","sections":[],"depth":4},{"title":"Query Process","local":"query-process","sections":[],"depth":4},{"title":"HMAC Security Mechanism","local":"hmac-security-mechanism","sections":[],"depth":4}],"depth":3}],"depth":2},{"title":"Technical Implementation Details","local":"technical-implementation-details","sections":[{"title":"Chunk Hash Computation","local":"chunk-hash-computation","sections":[],"depth":3},{"title":"Xorb Formation","local":"xorb-formation","sections":[],"depth":3},{"title":"File Reconstruction Information","local":"file-reconstruction-information","sections":[],"depth":3}],"depth":2},{"title":"Fragmentation Prevention","local":"fragmentation-prevention","sections":[],"depth":2},{"title":"Conclusion","local":"conclusion","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="xet-chunk-level-deduplication-specification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#xet-chunk-level-deduplication-specification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Xet Chunk-Level 
Deduplication Specification</span></h1> <h2 class="relative group"><a id="overview" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#overview"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Overview</span></h2> <p data-svelte-h="svelte-wh4eu5">Chunk-level deduplication is a fundamental optimization technique in the Xet system that eliminates redundant data by identifying and sharing identical content blocks (chunks) across files and repositories. | |
| This specification details the procedures, algorithms, and security mechanisms that enable efficient storage and transfer while maintaining data integrity and access control.</p> <p data-svelte-h="svelte-1ul8fih">Deduplication in Xet operates at the chunk level rather than the file level, providing fine-grained deduplication capabilities that can identify shared content even when files differ significantly. | |
| This approach is particularly effective for scenarios common in machine learning and data science workflows, such as:</p> <ul data-svelte-h="svelte-qg99xo"><li>Multiple versions of datasets with incremental changes</li> <li>Model checkpoints that share common layers or parameters</li> <li>Documentation and configuration files with similar content</li> <li>Large files where only portions have changed between versions</li></ul> <h2 class="relative group"><a id="core-concepts" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#core-concepts"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Core Concepts</span></h2> <h3 class="relative group"><a id="chunks" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chunks"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chunks</span></h3> <p data-svelte-h="svelte-1y0ff6x">A <strong>chunk</strong> is a variable-sized content block derived from files using Content-Defined Chunking (CDC) with a rolling hash function. Chunks are the fundamental unit of deduplication in Xet.</p> <ul data-svelte-h="svelte-1wnhwol"><li><strong>Target size</strong>: 64KB (configurable)</li> <li><strong>Size range</strong>: 8KB to 128KB (minimum and maximum constraints)</li> <li><strong>Identification</strong>: Each chunk is uniquely identified by its cryptographic hash (MerkleHash)</li></ul> <p data-svelte-h="svelte-dm8u6u"><a href="./chunking">Detailed chunking description</a></p> <h3 class="relative group"><a id="xorbs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#xorbs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Xorbs</span></h3> <p data-svelte-h="svelte-160besi"><strong>Xorbs</strong> are objects that aggregate multiple chunks for efficient storage and transfer:</p> <ul data-svelte-h="svelte-1pp0obe"><li><strong>Maximum size</strong>: 64MB</li> <li><strong>Maximum chunks</strong>: 8,192 chunks per xorb</li> <li><strong>Purpose</strong>: Batch multiple chunks together to reduce metadata and network overhead when uploading and downloading groups of chunks</li></ul> <h3 class="relative group"><a id="shards-xorb-lists" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#shards-xorb-lists"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Shards (Xorb Lists)</span></h3> <p data-svelte-h="svelte-1bdth14"><strong>Shards</strong> are objects that contain a list of xorbs that can be deduped against (for the context of deduplication, ignore the file info section of the shard format).</p> <ul data-svelte-h="svelte-4ettzu"><li><strong>Maximum size</strong>: 64MB</li> <li><strong>Purpose</strong>: Provide a 
format on a positive reply to a global deduplication request with information about xorbs that already exist in the CAS system.</li></ul> <h3 class="relative group"><a id="cas-content-addressable-storage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cas-content-addressable-storage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>CAS (Content Addressable Storage)</span></h3> <p data-svelte-h="svelte-gehrbw">The <strong>CAS</strong> system provides the underlying storage infrastructure:</p> <ul data-svelte-h="svelte-t127ch"><li><strong>Content addressing</strong>: All objects are stored and retrieved by their cryptographic hash</li> <li><strong>Immutability</strong>: Once stored, file content cannot be modified</li> <li><strong>Deduplication</strong>: Identical content is automatically deduplicated at the storage level</li></ul> <h2 class="relative group"><a id="deduplication-procedure" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deduplication-procedure"><span><svg class="" 
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deduplication Procedure</span></h2> <h3 class="relative group"><a id="1-file-processing-and-chunking" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-file-processing-and-chunking"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. 
File Processing and Chunking</span></h3> <p data-svelte-h="svelte-t87a7v">When a file is processed for upload, it undergoes the following steps:</p> <div class="mermaid-chart " style="text-align: center;"></div> <ol data-svelte-h="svelte-1sq5l43"><li><strong>Chunking</strong>: Content-defined chunking using GearHash algorithm creates variable-sized chunks of file data</li> <li><strong>Hash Computation</strong>: Each chunk’s content is hashed using a cryptographic hash function (Blake3-based MerkleHash)</li> <li><strong>Chunk Object Creation</strong>: Chunks are wrapped with metadata including hash, size, and data</li></ol> <h3 class="relative group"><a id="2-multi-level-deduplication-strategy" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-multi-level-deduplication-strategy"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. 
Multi-Level Deduplication Strategy</span></h3> <p data-svelte-h="svelte-i7yikb">Xet employs a three-tiered deduplication strategy to maximize efficiency while minimizing latency:</p> <h4 class="relative group"><a id="level-1-local-session-deduplication" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#level-1-local-session-deduplication"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Level 1: Local Session Deduplication</span></h4> <p data-svelte-h="svelte-xcyw5o"><strong>Scope</strong>: Current upload session | |
| <strong>Mechanism</strong>: In-memory hash lookup table | |
| <strong>Purpose</strong>: Eliminate redundancy within the current file or session</p> <p data-svelte-h="svelte-15jvva3"><strong>Benefits</strong>:</p> <ul data-svelte-h="svelte-nppre4"><li>Fastest lookup (in-memory)</li> <li>Zero network overhead</li> <li>Immediate deduplication feedback</li></ul> <h4 class="relative group"><a id="level-2-cached-metadata-deduplication" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#level-2-cached-metadata-deduplication"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Level 2: Cached Metadata Deduplication</span></h4> <p data-svelte-h="svelte-19ttbs3"><strong>Scope</strong>: Previously uploaded files and sessions | |
| <strong>Mechanism</strong>: Local shard file metadata cache | |
| <strong>Purpose</strong>: Leverage deduplication against recently uploaded content</p> <p data-svelte-h="svelte-15jvva3"><strong>Benefits</strong>:</p> <ul data-svelte-h="svelte-hhphx7"><li>Fast local disk access</li> <li>No network latency</li> <li>Persistent across sessions</li></ul> <h4 class="relative group"><a id="level-3-global-deduplication-api" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#level-3-global-deduplication-api"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Level 3: Global Deduplication API</span></h4> <p data-svelte-h="svelte-n1wupe"><strong>Scope</strong>: Entire Xet system | |
| <strong>Mechanism</strong>: Global deduplication service with HMAC protection | |
| <strong>Purpose</strong>: Discover deduplication opportunities across all users and repositories</p> <h3 class="relative group"><a id="3-global-deduplication-process" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-global-deduplication-process"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. 
Global Deduplication Process</span></h3> <p data-svelte-h="svelte-1s14zeh">The global deduplication system provides deduplication capabilities across all data that is managed by the Xet system:</p> <h4 class="relative group"><a id="eligibility-criteria" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#eligibility-criteria"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Eligibility Criteria</span></h4> <p data-svelte-h="svelte-1ikjrvt">To manage system load, not all chunks are eligible for global deduplication queries:</p> <ol data-svelte-h="svelte-1jgm2gu"><li><strong>First chunk</strong>: The first chunk of every file is always eligible.</li> <li><strong>Hash pattern matching</strong>: Any other chunk is eligible if the last 8 bytes of its hash, interpreted as a little-endian 64-bit integer, satisfy: value % 1024 == 0.</li></ol> <p data-svelte-h="svelte-iwomao"><strong>Recommendations:</strong> <strong>Spacing constraints</strong>: The global dedupe API is optimized to return information about nearby chunks when there is a match. 
Consider only issuing a request to an eligible chunk every ~4MB of data.</p> <h4 class="relative group"><a id="query-process" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#query-process"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Query Process</span></h4> <ol data-svelte-h="svelte-1h2ot0"><li><strong>Background Query</strong>: Global deduplication queries SHOULD run asynchronously to avoid blocking upload</li> <li><strong>HMAC Protection</strong>: Chunk hashes are protected using HMAC keys</li> <li><strong>Shard Response</strong>: When a match is found, the API returns a shard containing: | |
| <ul><li><strong>Xorb Info Section</strong>: Contains metadata about many xorbs that store chunks</li> <li><strong>HMAC Key</strong>: The key used to encrypt the chunk hashes, included in the shard metadata header</li></ul></li> <li><strong>Encrypted Chunk Matching</strong>: All chunk hashes in the returned shard have been encrypted with the HMAC key</li> <li><strong>Match Discovery Process</strong>: To find matches, clients MUST: | |
| <ul><li>Encrypt their chunk hash using the provided HMAC key</li> <li>Search for the encrypted hash within the shard’s chunk listings</li> <li>For subsequent chunks, repeat the encryption and search process</li> <li>Track the original (non-encrypted) chunk hash while noting which xorb contains that chunk</li></ul></li> <li><strong>Metadata Caching</strong>: Client downloads and caches shard metadata for future deduplication</li></ol> <h4 class="relative group"><a id="hmac-security-mechanism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#hmac-security-mechanism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>HMAC Security Mechanism</span></h4> <p data-svelte-h="svelte-1h82855">Global deduplication uses HMAC (Hash-based Message Authentication Code) to protect chunk hashes while enabling deduplication.</p> <p data-svelte-h="svelte-wmt0sg"><strong>Security Properties</strong>:</p> <p data-svelte-h="svelte-efgrvu">Raw chunk hashes are never transmitted from servers to clients; a client has to encrypt their raw chunk hash and find a match to know a raw chunk hash exists in the system. | |
| They MAY know this chunk hash because they own this data; a match only reveals which xorb contains that chunk hash and its position within the xorb, and does not reveal any other raw chunk hashes in that xorb or in other xorbs.</p> <h2 class="relative group"><a id="technical-implementation-details" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#technical-implementation-details"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Technical Implementation Details</span></h2> <h3 class="relative group"><a id="chunk-hash-computation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#chunk-hash-computation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 
1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Chunk Hash Computation</span></h3> <p data-svelte-h="svelte-1ty8it5">Each chunk has its content hashed using a cryptographic hash function (Blake3-based MerkleHash) to create a unique identifier for content addressing. | |
| <a href="./hashing#chunk-hashes">See section about hashing</a>.</p> <h3 class="relative group"><a id="xorb-formation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#xorb-formation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Xorb Formation</span></h3> <p data-svelte-h="svelte-4o1gvs">When new chunks need to be stored, they are aggregated into xorbs based on size and count limits. If adding a new chunk would exceed the maximum xorb size or chunk count, the current xorb is finalized and uploaded. 
<a href="./xorb">See section about xorb formation</a>.</p> <h3 class="relative group"><a id="file-reconstruction-information" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#file-reconstruction-information"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>File Reconstruction Information</span></h3> <p data-svelte-h="svelte-1f2nl4r">When chunks are deduplicated, the system creates file reconstruction information that includes:</p> <ul data-svelte-h="svelte-z1a6tk"><li>Hash of the xorb containing the chunks</li> <li>Flags for the xorb</li> <li>Total bytes in the segment</li> <li>Start and end indices within the xorb (start inclusive, end exclusive)</li></ul> <p data-svelte-h="svelte-exnnrh">This information allows the system to reconstruct files by:</p> <ol data-svelte-h="svelte-1hw0ygr"><li>Identifying which xorbs contain the needed chunks</li> <li>Extracting the specific chunk ranges from each xorb</li> <li>Concatenating chunks in the correct order</li></ol> <p data-svelte-h="svelte-1ks76o"><a href="./file-reconstruction">See section about file reconstruction</a>.</p> <h2 class="relative group"><a id="fragmentation-prevention" 
class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#fragmentation-prevention"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fragmentation Prevention</span></h2> <p data-svelte-h="svelte-14kkp29">While deduplication is valuable for saving space, doing it too aggressively can cause file fragmentation—meaning a file’s chunks end up scattered across many different xorbs. This can make reading files slower and less efficient. | |
| To avoid this, xet-core aims to keep long, continuous runs of chunks together in the same xorb whenever possible, and implementations SHOULD likewise keep long, continuous runs together when feasible. | |
| Instead of always deduplicating every possible chunk, the system sometimes chooses to reference a straight run of chunks in a single xorb, even if it means skipping deduplication for a few chunks. | |
| This approach balances the benefits of deduplication with the need to keep files easy and fast to read. | |
| Consider, for example, only referencing deduplicated chunks when they form a run of some minimum length (e.g. at least 8 chunks), or targeting an average contiguous run of chunks totalling at least 1 MB in length.</p> <h2 class="relative group"><a id="conclusion" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#conclusion"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Conclusion</span></h2> <p data-svelte-h="svelte-2enxh1">Xet’s chunk-level deduplication system provides a comprehensive solution for efficient data storage and transfer in large-scale data workflows. | |
| By combining local, cached, and global deduplication strategies with robust security mechanisms and fragmentation prevention, | |
| the system achieves significant storage savings while maintaining performance and data integrity.</p> <p data-svelte-h="svelte-82h05h">The multi-tiered approach ensures that deduplication is both effective and efficient:</p> <ul data-svelte-h="svelte-ir2er9"><li>Local deduplication provides immediate benefits within sessions</li> <li>Cached deduplication leverages recent upload history</li> <li>Global deduplication enables cross-repository optimization while preserving security</li></ul> <p data-svelte-h="svelte-xbd3py">The system’s design prioritizes both efficiency and safety, with comprehensive error handling, performance monitoring, and security measures that make it suitable for production use in data-intensive applications.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/xet/deduplication.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
// Client bootstrap for this page (presumably SvelteKit's generated hydration snippet — confirm).
// The braces give the `const` bindings below a block scope of their own.
//
// The settings object is assigned without a declaration keyword, so unlike `element` and `data`
// it is not scoped to this block. `assets` and `base` hold the same docs path prefix; `env` is empty.
| __sveltekit_1t7h0ns = { | |
| assets: "/docs/xet/pr_2272/en", | |
| base: "/docs/xet/pr_2272/en", | |
| env: {} | |
| }; | |
// Parent element of the script tag that is currently executing; handed to kit.start below.
| const element = document.currentScript.parentElement; | |
// Two null placeholders passed through as `data`.
// NOTE(review): presumably one slot per entry in `node_ids` — confirm.
| const data = [null,null]; | |
// Load both entry modules with dynamic import(); the callback runs only once both have resolved.
// No rejection handler is attached here, so a failed import is not handled in this block.
| Promise.all([ | |
| import("/docs/xet/pr_2272/en/_app/immutable/entry/start.7209fe0c.js"), | |
| import("/docs/xet/pr_2272/en/_app/immutable/entry/app.54268a84.js") | |
| ]).then(([kit, app]) => { | |
// Call the first module's `start` export with the second module, the target element and the options.
| kit.start(app, element, { | |
// NOTE(review): the meaning of ids 0 and 5 is not visible from this block.
| node_ids: [0, 5], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 47.6 kB
- Xet hash:
- 09ee643558ce28713ea42e63b77bce9e329e5de718ab8852dfd066b70ad9be91
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.