Buckets:

rtrm's picture
download
raw
39 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;SINQ&quot;,&quot;local&quot;:&quot;sinq&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;🔍 What You’ll Find Here&quot;,&quot;local&quot;:&quot;-what-youll-find-here&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;📊 Feature Comparison: SINQ vs HQQ (calibration-free) and A-SINQ vs AWQ (calibrated)&quot;,&quot;local&quot;:&quot;-feature-comparison-sinq-vs-hqq-calibration-free-and-a-sinq-vs-awq-calibrated&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;1. Quantize any LLM with SINQ&quot;,&quot;local&quot;:&quot;1-quantize-any-llm-with-sinq&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Setup &amp; Quick Start&quot;,&quot;local&quot;:&quot;setup--quick-start&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Quantize in a few lines&quot;,&quot;local&quot;:&quot;quantize-in-a-few-lines&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Save &amp; reload&quot;,&quot;local&quot;:&quot;save--reload&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Compatible with lm-eval evaluation framework&quot;,&quot;local&quot;:&quot;compatible-with-lm-eval-evaluation-framework&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. How to Cite This Work&quot;,&quot;local&quot;:&quot;2-how-to-cite-this-work&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. Current Limitations&quot;,&quot;local&quot;:&quot;3-current-limitations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/pr_26617/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/entry/start.b5ae2c21.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/scheduler.31fdf58d.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/singletons.512cdb48.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/index.252883d5.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/paths.81255c3b.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/entry/app.9acf2c3e.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/preload-helper.bb442aeb.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/index.2f76fdf0.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/nodes/0.da6b3909.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/nodes/581.53d6cf8a.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/CopyLLMTxtMenu.a69e059a.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.e4c7f916.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/IconCopy.ac192424.js">
<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/CodeBlock.ab12f8e1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;SINQ&quot;,&quot;local&quot;:&quot;sinq&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;🔍 What You’ll Find Here&quot;,&quot;local&quot;:&quot;-what-youll-find-here&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;📊 Feature Comparison: SINQ vs HQQ (calibration-free) and A-SINQ vs AWQ (calibrated)&quot;,&quot;local&quot;:&quot;-feature-comparison-sinq-vs-hqq-calibration-free-and-a-sinq-vs-awq-calibrated&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;1. Quantize any LLM with SINQ&quot;,&quot;local&quot;:&quot;1-quantize-any-llm-with-sinq&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Setup &amp; Quick Start&quot;,&quot;local&quot;:&quot;setup--quick-start&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Quantize in a few lines&quot;,&quot;local&quot;:&quot;quantize-in-a-few-lines&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Save &amp; reload&quot;,&quot;local&quot;:&quot;save--reload&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Compatible with lm-eval evaluation framework&quot;,&quot;local&quot;:&quot;compatible-with-lm-eval-evaluation-framework&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;2. How to Cite This Work&quot;,&quot;local&quot;:&quot;2-how-to-cite-this-work&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. 
Current Limitations&quot;,&quot;local&quot;:&quot;3-current-limitations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <p data-svelte-h="svelte-12zc5wy"><a href="https://arxiv.org/abs/2509.22944" rel="nofollow"><img src="https://img.shields.io/badge/arXiv-2509.22944-b31b1b.svg" alt="arXiv"></a> <a href="https://opensource.org/licenses/Apache-2.0" rel="nofollow"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License"></a> <a href="https://github.com/huawei-csl/SINQ/stargazers" rel="nofollow"><img src="https://img.shields.io/github/stars/huawei-csl/SINQ?label=Stars&logo=github&logoColor=white&style=flat-square" alt="GitHub stars"></a> <a href="https://huggingface.co/huawei-csl" rel="nofollow"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Huawei%20CSL-ffc107?color=ffc107&logoColor=white" alt="hf-space"></a></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="sinq" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sinq"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>SINQ</span></h1> <p data-svelte-h="svelte-eku2pv"><a href="https://github.com/huawei-csl/SINQ/tree/main" 
rel="nofollow">Sinkhorn-Normalized Quantization (SINQ)</a> is a fast, plug-and-play, model-agnostic quantization technique delivering state-of-the-art performance for Large Language Models without sacrificing accuracy.</p> <h3 class="relative group"><a id="-what-youll-find-here" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-what-youll-find-here"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🔍 What You’ll Find Here</span></h3> <ul data-svelte-h="svelte-1jxf7aw"><li><a href="#1-quantize-any-llm-with-sinq">1. Quantize (and save) any LLM with SINQ</a></li> <li><a href="#2-how-to-cite-this-work">2. How to Cite This Work</a></li> <li><a href="#3-current-limitations">3. 
Current Limitations</a></li></ul> <h4 class="relative group"><a id="-feature-comparison-sinq-vs-hqq-calibration-free-and-a-sinq-vs-awq-calibrated" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-feature-comparison-sinq-vs-hqq-calibration-free-and-a-sinq-vs-awq-calibrated"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>📊 Feature Comparison: SINQ vs HQQ (calibration-free) and A-SINQ vs AWQ (calibrated)</span></h4> <table data-svelte-h="svelte-o5ptvb"><thead><tr><th>Feature</th> <th align="center"><strong>SINQ</strong></th> <th align="center"><strong>HQQ</strong></th> <th align="center"><strong>A-SINQ</strong></th> <th align="center"><strong>AWQ</strong></th></tr></thead> <tbody><tr><td>🎯 Calibration</td> <td align="center">Calibration-free</td> <td align="center">Calibration-free</td> <td align="center">Calibrated</td> <td align="center">Calibrated</td></tr> <tr><td>🧮 Quantization Type</td> <td align="center">Symmetric &amp; Asymmetric</td> <td align="center">Asymmetric only</td> <td align="center">Symmetric &amp; Asymmetric</td> <td align="center">Symmetric &amp; Asymmetric</td></tr> <tr><td>📦 NF4 Support</td> <td 
align="center"><strong>Yes</strong></td> <td align="center">No</td> <td align="center"><strong>Yes</strong></td> <td align="center">No</td></tr> <tr><td>⚡ Quantization Speed</td> <td align="center">~2× <strong>Faster</strong> than HQQ</td> <td align="center">Slower</td> <td align="center">~4× <strong>Faster</strong> than AWQ</td> <td align="center">Slower</td></tr> <tr><td>📈 Model Quality</td> <td align="center"><strong>Higher</strong></td> <td align="center">Lower</td> <td align="center"><strong>Higher</strong></td> <td align="center">Lower</td></tr></tbody></table> <p data-svelte-h="svelte-wam0ck">📄 <strong>Want to know more?</strong></p> <ul data-svelte-h="svelte-1gkwrqn"><li>Read our paper on <a href="http://arxiv.org/abs/2509.22944" rel="nofollow"><strong>arXiv</strong></a></li> <li>Check the official <a href="https://github.com/huawei-csl/SINQ/tree/main" rel="nofollow"><strong>SINQ</strong></a> github repository</li></ul> <hr> <h2 class="relative group"><a id="1-quantize-any-llm-with-sinq" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-quantize-any-llm-with-sinq"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. 
Quantize any LLM with SINQ</span></h2> <h3 class="relative group"><a id="setup--quick-start" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#setup--quick-start"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Setup &amp; Quick Start</span></h3> <p data-svelte-h="svelte-18yn57j">First, install the package. 
It can be done in two ways:</p> <ul data-svelte-h="svelte-1ts5xgr"><li>From source using the official Github repository <a href="https://github.com/huawei-csl/SINQ/tree/main" rel="nofollow"><strong>SINQ</strong></a> <strong>[Recommended]</strong></li> <li>Using pip package:</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install sinq<!-- HTML_TAG_END --></pre></div> <hr> <h3 class="relative group"><a id="quantize-in-a-few-lines" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quantize-in-a-few-lines"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" 
viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quantize in a few lines</span></h3> <p data-svelte-h="svelte-chywk1">Quantizing any 🤗 Hugging Face model with SINQ is simple and takes only a few lines of code.
First, create a <a href="/docs/transformers/pr_26617/en/main_classes/quantization#transformers.SinqConfig">SinqConfig</a> and specify the following parameters:</p> <table data-svelte-h="svelte-97yn1t"><thead><tr><th>Flag</th> <th>Description</th> <th>Type</th> <th>Options</th> <th>Default</th></tr></thead> <tbody><tr><td><code>--nbits</code></td> <td>Bit-width for weight quantization</td> <td>int</td> <td>2, 3, 4, 5, 6, 8</td> <td>4</td></tr> <tr><td><code>--tiling_mode</code></td> <td>Weight matrix tiling strategy</td> <td>str</td> <td>1D, 2D</td> <td>1D</td></tr> <tr><td><code>--group_size</code></td> <td>Weights per quantization group</td> <td>int</td> <td>64, 128</td> <td>64</td></tr> <tr><td><code>--method</code></td> <td>Quantization method</td> <td>str</td> <td>sinq, asinq</td> <td>sinq</td></tr> <tr><td><code>--modules_to_not_convert</code></td> <td>List of the layers that are NOT quantized</td> <td>List of str</td> <td>[lm_head, …]</td> <td>[lm_head]</td></tr></tbody></table> <p data-svelte-h="svelte-zp11tx">Then specify the model you want to quantize and pass the SinqConfig as the quantization configuration:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight 
rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM, SinqConfig
model_name = <span class="hljs-string">&quot;Qwen/Qwen3-1.7B&quot;</span>
cfg = SinqConfig(
nbits=<span class="hljs-number">4</span>,
group_size=<span class="hljs-number">64</span>,
tiling_mode=<span class="hljs-string">&quot;1D&quot;</span>,
method=<span class="hljs-string">&quot;sinq&quot;</span>,
modules_to_not_convert=[<span class="hljs-string">&quot;lm_head&quot;</span>]
)
tok = AutoTokenizer.from_pretrained(model_name)
qmodel = AutoModelForCausalLM.from_pretrained(
model_name,
quantization_config=cfg,
dtype=torch.bfloat16
)
<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-qkvyjl">✅ That’s it. Your model is now quantized with <strong>SINQ</strong> and ready for inference or saving.</p> <blockquote data-svelte-h="svelte-1yopjkv"><p>Check our official <a href="https://github.com/huawei-csl/SINQ/tree/main" rel="nofollow"><strong>SINQ</strong></a> GitHub repository to stay updated!</p></blockquote> <hr> <h3 class="relative group"><a id="save--reload" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#save--reload"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Save &amp; reload</span></h3> <p data-svelte-h="svelte-1dr6mwh">If you want to reuse a quantized model later, save it to disk or push it to the Hugging Face Hub and reload it without needing the base FP weights.
If you installed SINQ from source you should call <em>patch_hf_pretrained_io</em> function when re-loading a quantized model:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Save sinq quantized model</span>
qmodel.save_pretrained(<span class="hljs-string">&quot;/path/to/save/qwen3-1.7B-sinq-4bit&quot;</span>)
qmodel.push_to_hub(<span class="hljs-string">&quot;HF_Hub_username/qwen3-1.7B-sinq-4bit&quot;</span>)
tok.push_to_hub(<span class="hljs-string">&quot;HF_Hub_username/qwen3-1.7B-sinq-4bit&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> sinq.hf_io <span class="hljs-keyword">import</span> patch_hf_pretrained_io
patch_hf_pretrained_io()
<span class="hljs-comment"># Reload a sinq quantized model</span>
hf_hub_model = <span class="hljs-string">&quot;HF_Hub_username/qwen3-1.7B-sinq-4bit&quot;</span>
tokenizer = AutoTokenizer.from_pretrained(hf_hub_model)
model = AutoModelForCausalLM.from_pretrained(hf_hub_model)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ob2opa">Otherwise, if you installed SINQ through pip, you can simply use HF built-in functions:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># --- Save to a folder (sharded safetensors) ---</span>
<span class="hljs-comment"># &#x27;qmodel&#x27; must already be SINQ-quantized</span>
<span class="hljs-comment"># Locally save</span>
qmodel.save_pretrained(<span class="hljs-string">&quot;/path/to/save/qwen3-1.7B-sinq-4bit&quot;</span>)
<span class="hljs-comment"># Push to the Hub</span>
qmodel.push_to_hub(<span class="hljs-string">&quot;HF_Hub_username/qwen3-1.7B-sinq-4bit&quot;</span>)
tok.push_to_hub(<span class="hljs-string">&quot;HF_Hub_username/qwen3-1.7B-sinq-4bit&quot;</span>)
<span class="hljs-comment"># --- Reload later ---</span>
save_dir = <span class="hljs-string">&quot;/path/to/save/qwen3-1.7B-sinq-4bit&quot;</span>
hf_hub_model = <span class="hljs-string">&quot;HF_Hub_username/qwen3-1.7B-sinq-4bit&quot;</span>
<span class="hljs-comment"># From local directory</span>
tok = AutoTokenizer.from_pretrained(save_dir)
qmodel = AutoModelForCausalLM.from_pretrained(save_dir)
<span class="hljs-comment"># From HF Hub</span>
tok = AutoTokenizer.from_pretrained(hf_hub_model)
qmodel = AutoModelForCausalLM.from_pretrained(hf_hub_model)
<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1cpbzxr">✅ Your model is now loaded and ready for inference!</p> <blockquote data-svelte-h="svelte-42wwmb"><p>Note: If the model has been quantized in 4 bit and <code>gemlite</code> library is installed, gemlite faster kernel is used to run the inference.</p></blockquote> <hr> <h3 class="relative group"><a id="compatible-with-lm-eval-evaluation-framework" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#compatible-with-lm-eval-evaluation-framework"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Compatible with lm-eval evaluation framework</span></h3> <p data-svelte-h="svelte-hg2dqk">Below is a minimal example showing how to evaluate a SINQ-quantized model on a benchmark dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> lm_eval <span class="hljs-keyword">import</span> evaluator
<span class="hljs-keyword">from</span> lm_eval.models.huggingface <span class="hljs-keyword">import</span> HFLM
device = <span class="hljs-string">&quot;cuda:0&quot;</span>
<span class="hljs-comment"># Wrap the already quantized model and tokenizer with HFLM</span>
lm = HFLM(pretrained=qmodel, tokenizer=tok, device=device)
<span class="hljs-comment"># Evaluate (many tasks available on lm-eval such as MMLU and HellaSwag)</span>
results = evaluator.simple_evaluate(
model=lm,
tasks=[<span class="hljs-string">&quot;wikitext&quot;</span>], <span class="hljs-comment"># small and fast benchmark</span>
device=device
)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="2-how-to-cite-this-work" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-how-to-cite-this-work"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. 
How to Cite This Work</span></h2> <p data-svelte-h="svelte-6ae8ts">If you find <strong>SINQ</strong> useful in your research or applications</p> <ul data-svelte-h="svelte-cx5mos"><li>Support our project by putting a star ⭐️ in the <a href="https://github.com/huawei-csl/SINQ/tree/main" rel="nofollow"><strong>SINQ</strong></a> github repository</li> <li>Please cite our <a href="http://arxiv.org/abs/2509.22944" target="_blank"><strong>paper</strong></a>:</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment">@misc{muller2025sinq,</span>
title={SINQ: Sinkhorn-Normalized Quantization for Calibration-Free Low-Precision LLM Weights},
author={Lorenz K. <span class="hljs-keyword">Muller</span> <span class="hljs-keyword">and</span> Philippe Bich <span class="hljs-keyword">and</span> Jiawei Zhuang <span class="hljs-keyword">and</span> Ahmet Celik <span class="hljs-keyword">and</span> Luca Benfenati <span class="hljs-keyword">and</span> Lukas Cavigelli},
year={<span class="hljs-number">2025</span>},
eprint={<span class="hljs-number">2509</span>.<span class="hljs-number">22944</span>},
archivePrefix={arXiv},
primaryClass={cs.LG},
url={http:<span class="hljs-comment">//arxiv.org/abs/2509.22944}</span>
}<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="3-current-limitations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-current-limitations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. Current Limitations</span></h2> <p data-svelte-h="svelte-14eifhe">Currently, the A-SINQ method is not supported in Hugging Face. Please refer to the official <a href="https://github.com/huawei-csl/SINQ/tree/main" rel="nofollow">SINQ repository</a> to quantize a model with this strategy.
At the moment, the SINQ quantization strategy and SINQ-quantized models do not support multi-GPU execution, so if your system has multiple GPUs, please specify which one should be used.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization/sinq.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
// Client-side bootstrap for this pre-rendered page: publishes the path
// configuration, loads the two entry modules, and hands the page over to
// the start module's `start` function.
{
// Assigned without a declaration, so this becomes a global; it carries the
// asset/base path prefix for this docs build and an empty `env` object.
// NOTE(review): presumably read by the imported runtime under this exact name — confirm.
__sveltekit_1x0t0ja = {
assets: "/docs/transformers/pr_26617/en",
base: "/docs/transformers/pr_26617/en",
env: {}
};
// Parent element of the <script> tag that is currently executing; passed to
// `kit.start` below. Must be read synchronously, because
// `document.currentScript` is only set while the script is running.
const element = document.currentScript.parentElement;
// Two null placeholders forwarded to `kit.start` as `data`.
// NOTE(review): likely one slot per entry in `node_ids` — confirm.
const data = [null,null];
// Fetch both entry modules in parallel; continue only once both have loaded.
Promise.all([
import("/docs/transformers/pr_26617/en/_app/immutable/entry/start.b5ae2c21.js"),
import("/docs/transformers/pr_26617/en/_app/immutable/entry/app.9acf2c3e.js")
]).then(([kit, app]) => {
// `kit` is the start module and `app` is the app module, in import order.
kit.start(app, element, {
// NOTE(review): presumably the layout/page node indices for this route — confirm.
node_ids: [0, 581],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
39 kB
·
Xet hash:
0a8dad9323decc1748bb6ba327645d307de160cedc2357da815acb8b01dfe89c

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.