Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /pr_36049 /en /quantization /vptq.html

rtrm

3 months ago

download

raw

28.4 kB

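	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;VPTQ&quot;,&quot;local&quot;:&quot;vptq&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Inference example&quot;,&quot;local&quot;:&quot;inference-example&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Quantize your own model&quot;,&quot;local&quot;:&quot;quantize-your-own-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Early Results from Tech Report&quot;,&quot;local&quot;:&quot;early-results-from-tech-report&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;More Models in VPTQ-community&quot;,&quot;local&quot;:&quot;more-models-in-vptq-community&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">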
	<link href="/docs/transformers/pr_36049/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/entry/start.86af8b85.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/scheduler.25b97de1.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/singletons.20f80512.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/index.e188933d.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/paths.162096ab.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/entry/app.d602e208.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/index.d9030fc9.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/nodes/0.8e0a4db0.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/nodes/431.9265035e.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/Tip.baa67368.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
	<link rel="modulepreload" href="/docs/transformers/pr_36049/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"VPTQ","local":"vptq","sections":[{"title":"Inference example","local":"inference-example","sections":[],"depth":2},{"title":"Quantize your own model","local":"quantize-your-own-model","sections":[],"depth":2},{"title":"Early Results from Tech Report","local":"early-results-from-tech-report","sections":[],"depth":2},{"title":"More Models in VPTQ-community","local":"more-models-in-vptq-community","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="vptq" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#vptq"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>VPTQ</span></h1> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-oxgicx">Try VPTQ on <a 
href="https://huggingface.co/spaces/microsoft/VPTQ" rel="nofollow">Hugging Face</a>!
	Try VPTQ on <a href="https://colab.research.google.com/github/microsoft/VPTQ/blob/main/notebooks/vptq_example.ipynb" rel="nofollow">Google Colab</a>!
	Know more about VPTQ on <a href="https://arxiv.org/pdf/2409.17066" rel="nofollow">ArXiv</a>!</p></div> <p data-svelte-h="svelte-gnnp5m">Vector Post-Training Quantization (<a href="https://github.com/microsoft/VPTQ" rel="nofollow">VPTQ</a>) is a novel Post-Training Quantization method that leverages Vector Quantization to achieve high accuracy on LLMs at an extremely low bit-width (&lt;2-bit). VPTQ can compress 70B and even 405B models to 1-2 bits without retraining while maintaining high accuracy.</p> <ul data-svelte-h="svelte-1f1exnc"><li>Better Accuracy on 1-2 bits (405B @ &lt;2bit, 70B @ 2bit)</li> <li>Lightweight Quantization Algorithm: costs only ~17 hours to quantize 405B Llama-3.1</li> <li>Agile Quantization Inference: low decode overhead, best throughput, and TTFT</li></ul> <p data-svelte-h="svelte-mdsvhd">Inference support for VPTQ is released in the <code>vptq</code> library. Make sure to install it to run the models:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" 
style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install vptq<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xhueb9">The library provides efficient kernels for NVIDIA/AMD GPU inference.</p> <p data-svelte-h="svelte-1i4wrze">To run VPTQ models simply load a model that has been quantized with VPTQ:</p> <h2 class="relative group"><a id="inference-example" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inference-example"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inference example</span></h2> <p data-svelte-h="svelte-z3d1wz"><strong>Run Llama 3.1 70b on RTX4090 (24G @ ~2bits) in real time</strong> <img src="https://github.com/user-attachments/assets/d8729aca-4e1d-4fe1-ac71-c14da4bdd97f" alt="Llama3 1-70b-prompt"></p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM

	model = AutoModelForCausalLM.from_pretrained(
	<span class="hljs-string">"VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft"</span>,
	torch_dtype=<span class="hljs-string">"auto"</span>,
	device_map=<span class="hljs-string">"auto"</span>
	)
	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft"</span>)
	input_ids = tokenizer(<span class="hljs-string">"hello, it's me"</span>, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>)
	out = model.generate(**input_ids, max_new_tokens=<span class="hljs-number">32</span>, do_sample=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="quantize-your-own-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quantize-your-own-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quantize your own model</span></h2> <p data-svelte-h="svelte-t94tpj">VPTQ algorithm early-released at <a href="https://github.com/microsoft/VPTQ/tree/algorithm" rel="nofollow">VPTQ </a>,
	and checkout the <a href="https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md" rel="nofollow">tutorial</a>.</p> <h2 class="relative group"><a id="early-results-from-tech-report" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#early-results-from-tech-report"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Early Results from Tech Report</span></h2> <p data-svelte-h="svelte-l0x2ga">VPTQ achieves better accuracy and higher throughput with lower quantization overhead across models of different sizes. 
The following experimental results are for reference only; VPTQ can achieve better outcomes under reasonable parameters, especially in terms of model accuracy and inference speed.</p> <table data-svelte-h="svelte-uqhmp2"><thead><tr><th>Model</th> <th>bitwidth</th> <th>W2↓</th> <th>C4↓</th> <th>AvgQA↑</th> <th>tok/s↑</th> <th>mem(GB)</th> <th>cost/h↓</th></tr></thead> <tbody><tr><td>LLaMA-2 7B</td> <td>2.02</td> <td>6.13</td> <td>8.07</td> <td>58.2</td> <td>39.9</td> <td>2.28</td> <td>2</td></tr> <tr><td></td> <td>2.26</td> <td>5.95</td> <td>7.87</td> <td>59.4</td> <td>35.7</td> <td>2.48</td> <td>3.1</td></tr> <tr><td>LLaMA-2 13B</td> <td>2.02</td> <td>5.32</td> <td>7.15</td> <td>62.4</td> <td>26.9</td> <td>4.03</td> <td>3.2</td></tr> <tr><td></td> <td>2.18</td> <td>5.28</td> <td>7.04</td> <td>63.1</td> <td>18.5</td> <td>4.31</td> <td>3.6</td></tr> <tr><td>LLaMA-2 70B</td> <td>2.07</td> <td>3.93</td> <td>5.72</td> <td>68.6</td> <td>9.7</td> <td>19.54</td> <td>19</td></tr> <tr><td></td> <td>2.11</td> <td>3.92</td> <td>5.71</td> <td>68.7</td> <td>9.7</td> <td>20.01</td> <td>19</td></tr></tbody></table> <h2 class="relative group"><a id="more-models-in-vptq-community" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#more-models-in-vptq-community"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 
11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>More Models in VPTQ-community</span></h2> <p data-svelte-h="svelte-18yebhs">⚠️ The repository only provides a method of model quantization algorithm.</p> <p data-svelte-h="svelte-1b35uef">⚠️ The open-source community VPTQ-community provides models based on the technical report and quantization algorithm.</p> <p data-svelte-h="svelte-174osvm"><strong>Quick Estimation of Model Bitwidth (Excluding Codebook Overhead)</strong>:</p> <ul data-svelte-h="svelte-b82gml"><li><p><strong>Model Naming Convention</strong>: The model’s name includes the <strong>vector length</strong> $v$, <strong>codebook (lookup table) size</strong>, and <strong>residual codebook size</strong>. For example, “Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft” is “Meta-Llama-3.1-70B-Instruct”, where:</p> <ul><li><strong>Vector Length</strong>: 8</li> <li><strong>Number of Centroids</strong>: 65536 (2^16)</li> <li><strong>Number of Residual Centroids</strong>: 256 (2^8)</li></ul></li> <li><p><strong>Equivalent Bitwidth Calculation</strong>:</p> <ul><li><strong>Index</strong>: log2(65536) = 16 / 8 = 2 bits</li> <li><strong>Residual Index</strong>: log2(256) = 8 / 8 = 1 bit</li> <li><strong>Total Bitwidth</strong>: 2 + 1 = 3 bits</li></ul></li> <li><p><strong>Model Size Estimation</strong>: 70B * 3 bits / 8 bits per Byte = 26.25 GB</p></li> <li><p><strong>Note</strong>: This estimate does not include the size of the codebook (lookup table), other parameter overheads, and the padding overhead for storing indices. 
For the detailed calculation method, please refer to <strong>Tech Report Appendix C.2</strong>.</p></li></ul> <table data-svelte-h="svelte-1mb4fzs"><thead><tr><th align="center">Model Series</th> <th align="center">Collections</th> <th>(Estimated) Bit per weight</th></tr></thead> <tbody><tr><td align="center">Llama 3.1 Nemotron 70B Instruct HF</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-llama-31-nemotron-70b-instruct-hf-without-finetune-671730b96f16208d0b3fe942" rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a href="https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-65536-woft" rel="nofollow">2 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-0-woft" rel="nofollow">2 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-16384-woft" rel="nofollow">1.875 bits</a> <a href="https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-1024-woft" rel="nofollow">1.625 bits</a> <a href="https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-256-woft" rel="nofollow">1.5 bits</a></td></tr> <tr><td align="center">Llama 3.1 8B Instruct</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-llama-31-8b-instruct-without-finetune-66f2b70b1d002ceedef02d2e" rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-4096-woft" rel="nofollow">3.5 bits</a> <a 
href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft" rel="nofollow">2.3 bits</a></td></tr> <tr><td align="center">Llama 3.1 70B Instruct</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-llama-31-70b-instruct-without-finetune-66f2bf454d3dd78dfee2ff11" rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-4-woft" rel="nofollow">2.25 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft" rel="nofollow">2 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-0-woft" rel="nofollow">2 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-32768-woft" rel="nofollow">1.93 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k32768-0-woft" rel="nofollow">1.875 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k16384-0-woft" rel="nofollow">1.75 bits</a></td></tr> <tr><td align="center">Llama 3.1 405B Instruct</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-llama-31-405b-instruct-without-finetune-66f4413f9ba55e1a9e52cfb0" rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a 
href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-65536-woft" rel="nofollow">2 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k32768-32768-woft" rel="nofollow">1.875 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-1024-woft" rel="nofollow">1.625 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k4096-0-woft" rel="nofollow">1.5 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-256-woft" rel="nofollow">1.5 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-128-woft" rel="nofollow">1.43 bits</a> <a href="https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-64-woft" rel="nofollow">1.375 bits</a></td></tr> <tr><td align="center">Mistral Large Instruct 2407 (123B)</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-mistral-large-instruct-2407-without-finetune-6711ebfb7faf85eed9cceb16" rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a href="https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-65536-woft" rel="nofollow">2 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-0-woft" rel="nofollow">2 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-16384-woft" rel="nofollow">1.875 bits</a> <a href="https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-4096-woft" rel="nofollow">1.75 bits</a> <a href="https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-1024-woft" 
rel="nofollow">1.625 bits</a> <a href="https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-256-woft" rel="nofollow">1.5 bits</a></td></tr> <tr><td align="center">Qwen 2.5 7B Instruct</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-7b-instruct-without-finetune-66f3e9866d3167cc05ce954a" rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k256-256-woft" rel="nofollow">2 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-0-woft" rel="nofollow">2 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v16-k65536-65536-woft" rel="nofollow">2 bits (3)</a></td></tr> <tr><td align="center">Qwen 2.5 14B Instruct</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-14b-instruct-without-finetune-66f827f83c7ffa7931b8376c" rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k256-256-woft" rel="nofollow">2 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-0-woft" rel="nofollow">2 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v16-k65536-65536-woft" rel="nofollow">2 bits (3)</a></td></tr> <tr><td align="center">Qwen 2.5 32B Instruct</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-32b-instruct-without-finetune-66fe77173bf7d64139f0f613" 
rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v16-k65536-65536-woft" rel="nofollow">2 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-0-woft" rel="nofollow">2 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k256-256-woft" rel="nofollow">2 bits (3)</a></td></tr> <tr><td align="center">Qwen 2.5 72B Instruct</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-72b-instruct-without-finetune-66f3bf1b3757dfa1ecb481c0" rel="nofollow">HF 🤗</a></td> <td><a href="https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-65536-woft" rel="nofollow">4 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-256-woft" rel="nofollow">3 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k1024-512-woft" rel="nofollow">2.38 bits</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k512-512-woft" rel="nofollow">2.25 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-4-woft" rel="nofollow">2.25 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-0-woft" rel="nofollow">2 bits (1)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-65536-woft" rel="nofollow">2 bits (2)</a> <a href="https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-32768-woft" rel="nofollow">1.94 bits</a></td></tr> <tr><td align="center">Reproduced from the tech report</td> <td align="center"><a 
href="https://huggingface.co/collections/VPTQ-community/reproduced-vptq-tech-report-baseline-66fbf1dffe741cc9e93ecf04" rel="nofollow">HF 🤗</a></td> <td>Results from the open-source community are for reference only; please use them responsibly.</td></tr> <tr><td align="center">Hessian and Inverse Hessian Matrix</td> <td align="center"><a href="https://huggingface.co/collections/VPTQ-community/hessian-and-invhessian-checkpoints-66fd249a104850d17b23fd8b" rel="nofollow">HF 🤗</a></td> <td>Collected from RedPajama-Data-1T-Sample, following <a href="https://github.com/Cornell-RelaxML/quip-sharp/blob/main/quantize_llama/hessian_offline_llama.py" rel="nofollow">Quip#</a></td></tr></tbody></table> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization/vptq.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	// Client bootstrap: publish path configuration, then load the two entry
	// modules and start the app on the element that contains this script tag.
	{
	// Assigned without a declaration, so in this classic (non-module, non-strict)
	// script it becomes a property of the global object.
	// NOTE(review): presumably the imported bundles read this object to resolve
	// asset/base paths — confirm against the generated chunks.
	__sveltekit_tduuc7 = {
	assets: "/docs/transformers/pr_36049/en",
	base: "/docs/transformers/pr_36049/en",
	env: {}
	};

	// Parent element of this script tag; passed to kit.start() below.
	// It has to be captured here, synchronously: document.currentScript is only
	// set while the script is executing and is null inside the promise callback.
	const element = document.currentScript.parentElement;

	// Forwarded to kit.start() as `data`.
	// NOTE(review): two null slots, matching the two node_ids below — presumably
	// per-node server data with nothing to pass; confirm.
	const data = [null,null];

	// Load both entry modules in parallel and start only once both have resolved.
	// There is no rejection handler, so a failed import surfaces as an unhandled
	// promise rejection and the app is never started.
	Promise.all([
	import("/docs/transformers/pr_36049/en/_app/immutable/entry/start.86af8b85.js"),
	import("/docs/transformers/pr_36049/en/_app/immutable/entry/app.d602e208.js")
	]).then(([kit, app]) => {
	// kit.start receives the app module, the target element, and the initial state.
	kit.start(app, element, {
	node_ids: [0, 431],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size: 28.4 kB
Xet hash: b777d42df37214fb6049839340ebf8f872009ba793dc46b712e3ed0a25d9e508

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.