Buckets:

rtrm's picture
download
raw
34.4 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Selecting a quantization method&quot;,&quot;local&quot;:&quot;selecting-a-quantization-method&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Inference&quot;,&quot;local&quot;:&quot;inference&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;No Calibration Required (On-the-fly Quantization)&quot;,&quot;local&quot;:&quot;no-calibration-required-on-the-fly-quantization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;bitsandbytes&quot;,&quot;local&quot;:&quot;bitsandbytes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;HQQ (Half-Quadratic Quantization)&quot;,&quot;local&quot;:&quot;hqq-half-quadratic-quantization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;torchao&quot;,&quot;local&quot;:&quot;torchao&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Calibration-based Quantization&quot;,&quot;local&quot;:&quot;calibration-based-quantization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;GPTQ/GPTQModel&quot;,&quot;local&quot;:&quot;gptqgptqmodel&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;AWQ (Activation-aware Weight Quantization)&quot;,&quot;local&quot;:&quot;awq-activation-aware-weight-quantization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Loading Specific 
Formats&quot;,&quot;local&quot;:&quot;loading-specific-formats&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;compressed-tensors&quot;,&quot;local&quot;:&quot;compressed-tensors&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Fine-tuning&quot;,&quot;local&quot;:&quot;fine-tuning&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;bitsandbytes&quot;,&quot;local&quot;:&quot;training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Research&quot;,&quot;local&quot;:&quot;research&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Benchmark Comparison&quot;,&quot;local&quot;:&quot;benchmark-comparison&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/pr_33892/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/entry/start.b2c4257a.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/scheduler.31fdf58d.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/singletons.9860629f.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/index.252883d5.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/paths.e85c0ec8.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/entry/app.05ef1f97.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/preload-helper.40847a0e.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/index.2f76fdf0.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/nodes/0.ca4aafa4.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/nodes/541.87303ac4.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/CopyLLMTxtMenu.ff482081.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.71f274cc.js">
<link rel="modulepreload" href="/docs/transformers/pr_33892/en/_app/immutable/chunks/IconCopy.ac192424.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Selecting a quantization method&quot;,&quot;local&quot;:&quot;selecting-a-quantization-method&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Inference&quot;,&quot;local&quot;:&quot;inference&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;No Calibration Required (On-the-fly Quantization)&quot;,&quot;local&quot;:&quot;no-calibration-required-on-the-fly-quantization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;bitsandbytes&quot;,&quot;local&quot;:&quot;bitsandbytes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;HQQ (Half-Quadratic Quantization)&quot;,&quot;local&quot;:&quot;hqq-half-quadratic-quantization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;torchao&quot;,&quot;local&quot;:&quot;torchao&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Calibration-based Quantization&quot;,&quot;local&quot;:&quot;calibration-based-quantization&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;GPTQ/GPTQModel&quot;,&quot;local&quot;:&quot;gptqgptqmodel&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;AWQ (Activation-aware Weight Quantization)&quot;,&quot;local&quot;:&quot;awq-activation-aware-weight-quantization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Loading Specific 
Formats&quot;,&quot;local&quot;:&quot;loading-specific-formats&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;compressed-tensors&quot;,&quot;local&quot;:&quot;compressed-tensors&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4}],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Fine-tuning&quot;,&quot;local&quot;:&quot;fine-tuning&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;bitsandbytes&quot;,&quot;local&quot;:&quot;training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Research&quot;,&quot;local&quot;:&quot;research&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Benchmark Comparison&quot;,&quot;local&quot;:&quot;benchmark-comparison&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" 
width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="selecting-a-quantization-method" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#selecting-a-quantization-method"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Selecting a quantization method</span></h1> <p data-svelte-h="svelte-1gysdbw">There are many quantization methods available in Transformers for inference 
and fine-tuning. This guide helps you choose the most common and production-ready quantization techniques depending on your use case, and presents the advantages and disadvantages of each technique.</p> <p data-svelte-h="svelte-nxe644">For a comprehensive overview of all supported methods and their features, refer back to the table in the <a href="./overview">Overview</a>.</p> <h2 class="relative group"><a id="inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inference</span></h2> <p data-svelte-h="svelte-y3qek">Consider the quantization methods below for inference.</p> <table data-svelte-h="svelte-119z950"><thead><tr><th>quantization method</th> <th>use case</th></tr></thead> <tbody><tr><td>bitsandbytes</td> <td>ease of use and QLoRA fine-tuning on NVIDIA and Intel GPUs</td></tr> <tr><td>compressed-tensors</td> <td>loading specific quantized formats (FP8, Sparse)</td></tr> <tr><td>GPTQModel or AWQ</td> <td>good 4-bit accuracy with upfront calibration</td></tr> <tr><td>HQQ</td> <td>fast on the fly quantization without calibration</td></tr> 
<tr><td>torchao</td> <td>flexibility and fast inference with torch.compile</td></tr></tbody></table> <h3 class="relative group"><a id="no-calibration-required-on-the-fly-quantization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#no-calibration-required-on-the-fly-quantization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>No Calibration Required (On-the-fly Quantization)</span></h3> <p data-svelte-h="svelte-1phmujg">These methods are generally easier to use as they don’t need a separate calibration dataset or step.</p> <h4 class="relative group"><a id="bitsandbytes" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#bitsandbytes"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 
28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>bitsandbytes</span></h4> <table data-svelte-h="svelte-1yjrd1e"><thead><tr><th>Pros</th> <th>Cons</th></tr></thead> <tbody><tr><td>Very simple, no calibration dataset required for inference.</td> <td>Primarily optimized for NVIDIA GPUs (CUDA).</td></tr> <tr><td>Good community support and widely adopted.</td> <td>Inference speedup isn’t guaranteed.</td></tr></tbody></table> <p data-svelte-h="svelte-zsk6yg">See the <a href="./bitsandbytes">bitsandbytes documentation</a> for more details.</p> <h4 class="relative group"><a id="hqq-half-quadratic-quantization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#hqq-half-quadratic-quantization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>HQQ (Half-Quadratic 
Quantization)</span></h4> <table data-svelte-h="svelte-1i66pe9"><thead><tr><th>Pros</th> <th>Cons</th></tr></thead> <tbody><tr><td>Fast quantization process, no calibration data needed.</td> <td>Accuracy can degrade significantly at bit depths &lt;4-bit.</td></tr> <tr><td>Multiple backends for fast inference.</td> <td>Inference speed may not match others unless using <code>torch.compile</code> or backends.</td></tr> <tr><td>Compatible with <code>torch.compile</code>.</td> <td></td></tr> <tr><td>Supports wide range of bit depths (8, 4, 3, 2, 1-bit).</td> <td></td></tr></tbody></table> <p data-svelte-h="svelte-1aajllk">See the <a href="./hqq">HQQ documentation</a> for more details.</p> <h4 class="relative group"><a id="torchao" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#torchao"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>torchao</span></h4> <table data-svelte-h="svelte-147aj1"><thead><tr><th>Pros</th> <th>Cons</th></tr></thead> <tbody><tr><td>Strong integration with <code>torch.compile</code> for potential speedups.</td> <td>Newer library, ecosystem still evolving.</td></tr> <tr><td>Offers decent CPU quantization 
support.</td> <td>Performance depends on <code>torch.compile</code> working well.</td></tr> <tr><td>Flexibility in quantization schemes (int8, int4, fp8).</td> <td>4-bit quantization (int4wo) may not match GPTQ/AWQ in accuracy.</td></tr></tbody></table> <p data-svelte-h="svelte-17wngmg">See the <a href="./torchao">torchao documentation</a> for more details.</p> <h3 class="relative group"><a id="calibration-based-quantization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#calibration-based-quantization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Calibration-based Quantization</span></h3> <p data-svelte-h="svelte-qn60gx">These methods require an upfront calibration step using a dataset to potentially achieve higher accuracy.</p> <h4 class="relative group"><a id="gptqgptqmodel" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gptqgptqmodel"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>GPTQ/GPTQModel</span></h4> <p data-svelte-h="svelte-bvcvpu">Calibration for an 8B model takes ~20 minutes on one A100 GPU.</p> <table data-svelte-h="svelte-1n3q37i"><thead><tr><th>Pros</th> <th>Cons</th></tr></thead> <tbody><tr><td>Often achieves high accuracy.</td> <td>Requires a calibration dataset and a separate calibration step.</td></tr> <tr><td>Can lead to inference speedups.</td> <td>Possible to overfit on calibration data.</td></tr> <tr><td>Many pre-quantized GPTQ models on <a href="https://huggingface.co/models?other=gptq" rel="nofollow">Hugging Face Hub</a>.</td> <td></td></tr></tbody></table> <p data-svelte-h="svelte-1wuqim0">See the <a href="./gptq">GPTQ documentation</a> for more details.</p> <h4 class="relative group"><a id="awq-activation-aware-weight-quantization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#awq-activation-aware-weight-quantization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 
28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>AWQ (Activation-aware Weight Quantization)</span></h4> <p data-svelte-h="svelte-5o275f">Calibration for an 8B model takes ~10 minutes on one A100 GPU.</p> <table data-svelte-h="svelte-d8b0ec"><thead><tr><th>Pros</th> <th>Cons</th></tr></thead> <tbody><tr><td>Often achieves high accuracy at 4-bit. (Sometimes surpasses GPTQ on specific tasks.)</td> <td>Requires calibration if quantizing yourself.</td></tr> <tr><td>Can lead to inference speedups.</td> <td></td></tr> <tr><td>Shorter calibration time than GPTQ.</td> <td></td></tr> <tr><td>Many pre-quantized AWQ models on <a href="https://huggingface.co/models?other=awq" rel="nofollow">Hugging Face Hub</a>.</td> <td></td></tr></tbody></table> <p data-svelte-h="svelte-1cpp7pm">See the <a href="./awq">AWQ documentation</a> for more details.</p> <h3 class="relative group"><a id="loading-specific-formats" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-specific-formats"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 
43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Loading Specific Formats</span></h3> <h4 class="relative group"><a id="compressed-tensors" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#compressed-tensors"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>compressed-tensors</span></h4> <table data-svelte-h="svelte-1pirpe6"><thead><tr><th>Pros</th> <th>Cons</th></tr></thead> <tbody><tr><td>Supports flexible formats including FP8 and sparsity.</td> <td>Primarily for loading pre-quantized models.</td></tr> <tr><td></td> <td>Doesn’t perform quantization within Transformers directly.</td></tr></tbody></table> <p data-svelte-h="svelte-1a5p7n6">See the <a href="./compressed_tensors">compressed-tensors documentation</a> for more details.</p> <h2 class="relative group"><a id="fine-tuning" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 
with-hover:right-full" href="#fine-tuning"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Fine-tuning</span></h2> <p data-svelte-h="svelte-d7wgm3">Consider the quantization method below during fine-tuning to save memory.</p> <h3 class="relative group"><a id="training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>bitsandbytes</span></h3> <ul 
data-svelte-h="svelte-1lrihuc"><li><strong>Description:</strong> The standard method for QLoRA fine-tuning via PEFT.</li> <li><strong>Pros:</strong> Enables fine-tuning large models on consumer GPUs; widely supported and documented for PEFT.</li> <li><strong>Cons:</strong> Primarily for NVIDIA GPUs.</li></ul> <p data-svelte-h="svelte-hk0h25">Other methods offer PEFT compatibility, though bitsandbytes is the most established and straightforward path for QLoRA.</p> <p data-svelte-h="svelte-12fl2tu">See the <a href="./bitsandbytes#qlora">bitsandbytes documentation</a> and <a href="https://huggingface.co/docs/peft/developer_guides/quantization#aqlm-quantization" rel="nofollow">PEFT Docs</a> for more details.</p> <h2 class="relative group"><a id="research" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#research"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Research</span></h2> <p data-svelte-h="svelte-ph3ctx">Methods like <a href="./aqlm">AQLM</a>, <a href="./spqr">SpQR</a>, <a href="./vptq">VPTQ</a>, <a href="./higgs">HIGGS</a>, etc., push the boundaries of compression (&lt; 2-bit) or explore novel techniques.</p> <ul 
data-svelte-h="svelte-a7ck22"><li>Consider these if:
<ul><li>You need extreme compression (sub-4-bit).</li> <li>You are conducting research or require state-of-the-art results from their respective papers.</li> <li>You have significant compute resources available for potentially complex quantization procedures.
We recommend consulting each method's documentation and associated papers carefully before choosing one for use in production.</li></ul></li></ul> <h2 class="relative group"><a id="benchmark-comparison" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#benchmark-comparison"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Benchmark Comparison</span></h2> <p data-svelte-h="svelte-p8xr0n">To provide a quantitative comparison of different quantization methods, we benchmarked several popular techniques on the Llama 3.1 8B and 70B models. The following tables show results for accuracy (higher is better), inference throughput measured in tokens/second (higher is better), peak VRAM usage measured in GB (lower is better), and quantization time.</p> <p data-svelte-h="svelte-1l263bb">Performance metrics were measured on 2 NVIDIA A100 80GB GPUs for Llama 3.1 70B (bfloat16), 1 NVIDIA H100 80GB GPU for FP8 methods, and 1 NVIDIA A100 80GB GPU for all other methods. Throughput was measured with a batch size of 1 and generating 64 tokens.
Results for <code>torch.compile</code> and Marlin kernels are included where applicable and supported.</p> <iframe src="https://huggingface.co/datasets/derekl35/quantization-benchmarks/embed/viewer/default/train" frameborder="0" width="100%" height="560px" title="benchmarking results dataset"></iframe> <p data-svelte-h="svelte-1qk6xcy">The key takeaways are:</p> <table data-svelte-h="svelte-1nrwno0"><thead><tr><th>Quantization &amp; Methods</th> <th>Memory Savings (vs bf16)</th> <th>Accuracy</th> <th>Other Notes</th></tr></thead> <tbody><tr><td><strong>8-bit</strong> (bnb-int8, HQQ, Quanto, torchao, fp8)</td> <td>~2x</td> <td>Very close to baseline bf16 model</td> <td></td></tr> <tr><td><strong>4-bit</strong> (AWQ, GPTQ, HQQ, bnb-nf4)</td> <td>~4x</td> <td>Relatively high accuracy</td> <td>AWQ/GPTQ often lead in accuracy but need calibration. HQQ/bnb-nf4 are easy on-the-fly.</td></tr> <tr><td><strong>Sub-4-bit</strong> (VPTQ, AQLM, 2-bit GPTQ)</td> <td>Extreme (&gt;4x)</td> <td>Noticeable drop, especially at 2-bit</td> <td>Quantization times can be very long (AQLM, VPTQ). Performance varies.</td></tr></tbody></table> <blockquote class="tip" data-svelte-h="svelte-2a3hv9"><p>Always benchmark the performance (accuracy and speed) of the quantized model on your specific task and hardware to ensure it meets your requirements. 
Refer to the individual documentation pages linked above for detailed usage instructions.</p></blockquote> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization/selecting.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
  // Client bootstrap for this pre-rendered page.
  //
  // The assignment below is intentionally undeclared (no const/let/var): the
  // enclosing <script> is a classic, non-module script, so this creates a
  // global that code outside this block scope can see.
  // NOTE(review): presumably the loaded client modules read this global to
  // resolve base/asset paths — confirm against the bundle.
  __sveltekit_16tnnm8 = {
    assets: "/docs/transformers/pr_33892/en",
    base: "/docs/transformers/pr_33892/en",
    env: {}
  };

  // Read synchronously, before any promise callback runs:
  // document.currentScript is only meaningful while this script executes.
  const mountPoint = document.currentScript.parentElement;

  // Two null slots passed through unchanged as the `data` option below.
  const pageData = [null, null];

  // Kick off both dynamic imports right away, in the same order as before.
  const clientEntries = [
    import("/docs/transformers/pr_33892/en/_app/immutable/entry/start.b2c4257a.js"),
    import("/docs/transformers/pr_33892/en/_app/immutable/entry/app.05ef1f97.js")
  ];

  Promise.all(clientEntries).then(function (loaded) {
    const kit = loaded[0];
    const app = loaded[1];

    // Hand the app module and the mount element to the start entry point.
    kit.start(app, mountPoint, {
      node_ids: [0, 541],
      data: pageData,
      form: null,
      error: null
    });
  });
}
</script>

Xet Storage Details

Size:
34.4 kB
·
Xet hash:
a7fb77a226ab029edb9810cea675e44ac7e2fc6708c0d1f8be237977ccb6370f

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.