Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /pr_33913 /en /quantization /compressed_tensors.html

rtrm

3 months ago

download

raw

33.7 kB

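	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Compressed Tensors&quot;,&quot;local&quot;:&quot;compressed-tensors&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Features:&quot;,&quot;local&quot;:&quot;features&quot;,&quot;sections&quot;:[],&quot;depth&quot;:4},{&quot;title&quot;:&quot;Installation&quot;,&quot;local&quot;:&quot;installation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Quickstart Model Load&quot;,&quot;local&quot;:&quot;quickstart-model-load&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Sample Use Cases - Load and run an FP8 model&quot;,&quot;local&quot;:&quot;sample-use-cases---load-and-run-an-fp8-model&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Deep dive into a compressed-tensors model checkpoint&quot;,&quot;local&quot;:&quot;deep-dive-into-a-compressed-tensors-model-checkpoint&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">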
	<!-- Page stylesheet, then module preloads for the entry, chunk, and node scripts under _app/immutable. -->
	<link href="/docs/transformers/pr_33913/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/scheduler.25b97de1.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/singletons.62a184e0.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.e188933d.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/paths.51881b9e.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/index.d9030fc9.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/0.05e395f5.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/nodes/400.115a5e43.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
	<link rel="modulepreload" href="/docs/transformers/pr_33913/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Compressed Tensors","local":"compressed-tensors","sections":[{"title":"Features:","local":"features","sections":[],"depth":4},{"title":"Installation","local":"installation","sections":[],"depth":2},{"title":"Quickstart Model Load","local":"quickstart-model-load","sections":[],"depth":2},{"title":"Sample Use Cases - Load and run an FP8 model","local":"sample-use-cases---load-and-run-an-fp8-model","sections":[],"depth":2},{"title":"Deep dive into a compressed-tensors model checkpoint","local":"deep-dive-into-a-compressed-tensors-model-checkpoint","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="compressed-tensors" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#compressed-tensors"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Compressed Tensors</span></h1> <p data-svelte-h="svelte-qu2azz">The <a href="https://github.com/neuralmagic/compressed-tensors" 
rel="nofollow"><code>compressed-tensors</code></a> library provides a versatile and efficient way to store and manage compressed model checkpoints. This library supports various quantization and sparsity schemes, making it a unified format for handling different model optimizations like GPTQ, AWQ, SmoothQuant, INT8, FP8, SparseGPT, and more.</p> <p data-svelte-h="svelte-10asxkm">Some of the supported formats include:</p> <ol data-svelte-h="svelte-uq5ewu"><li><code>dense</code></li> <li><code>int-quantized</code> (<a href="https://huggingface.co/nm-testing/tinyllama-w8a8-compressed-hf-quantizer" rel="nofollow">sample</a>): INT8 quantized models</li> <li><code>float-quantized</code> (<a href="https://huggingface.co/nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat" rel="nofollow">sample</a>): FP8 quantized models; currently supports E4M3</li> <li><code>pack-quantized</code> (<a href="https://huggingface.co/nm-testing/tinyllama-w4a16-compressed-hf-quantizer" rel="nofollow">sample</a>): INT4 or INT8 weight-quantized models, packed into INT32. For INT4, the weights have an INT4 range but are stored as INT8 and then packed into INT32.</li></ol> <p data-svelte-h="svelte-17lf5xn">Compressed models can be easily created using <a href="https://github.com/vllm-project/llm-compressor" rel="nofollow">llm-compressor</a>.
	Alternatively models can be created independently and serialized with a compressed tensors config.</p> <p data-svelte-h="svelte-1nrd39r">To find existing models on the Hugging Face Model Hub, search for the <a href="https://huggingface.co/models?other=compressed-tensors" rel="nofollow"><code>compressed-tensors</code> tag</a>.</p> <h4 class="relative group"><a id="features" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#features"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Features:</span></h4> <ul data-svelte-h="svelte-1bo9w2k"><li>Weight and activation precisions: FP8, INT4, INT8 (for Q/DQ arbitrary precision is allowed for INT)</li> <li>Quantization scales and zero-points strategies: <a href="https://github.com/neuralmagic/compressed-tensors/blob/83b2e7a969d70606421a76b9a3d112646077c8de/src/compressed_tensors/quantization/quant_args.py#L43-L52" rel="nofollow">tensor, channel, group, block, token</a></li> <li>Dynamic per-token activation quantization (or any static strategy)</li> <li>Sparsity in weights (unstructured or semi-structured like 2:4) can be composed with quantization for extreme compression</li> 
<li>Supports quantization of arbitrary modules, not just Linear modules</li> <li>Targeted support or ignoring of modules by name or class</li></ul> <h2 class="relative group"><a id="installation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Installation</span></h2> <p data-svelte-h="svelte-bgk99h">It is recommended to install stable releases of compressed-tensors from <a href="https://pypi.org/project/compressed-tensors" rel="nofollow">PyPI</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install compressed-tensors<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-oaj25y">Developers who want to experiment with the latest features can also install the package from source:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre 
class=""><!-- HTML_TAG_START -->git <span class="hljs-built_in">clone</span> https://github.com/neuralmagic/compressed-tensors
	<span class="hljs-built_in">cd</span> compressed-tensors
	pip install -e .<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="quickstart-model-load" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quickstart-model-load"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quickstart Model Load</span></h2> <p data-svelte-h="svelte-mmaog6">Quantized models can be easily loaded for inference as shown below. Only models that have already been quantized can be loaded at the moment. 
To quantize a model into the compressed-tensors format see <a href="https://github.com/vllm-project/llm-compressor" rel="nofollow">llm-compressor</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM

	<span class="hljs-comment"># Load the model in compressed-tensors format</span>
	ct_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf"</span>)

	<span class="hljs-comment"># Measure memory usage</span>
	mem_params = <span class="hljs-built_in">sum</span>([param.nelement()*param.element_size() <span class="hljs-keyword">for</span> param <span class="hljs-keyword">in</span> ct_model.parameters()])
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"<span class="hljs-subst">{mem_params/<span class="hljs-number">2</span>**<span class="hljs-number">30</span>:<span class="hljs-number">.4</span>f}</span> GB"</span>)
	<span class="hljs-comment"># 8.4575 GB</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-vselvi">We can see just above that the compressed-tensors FP8 checkpoint of Llama 3.1 8B is able to be loaded for inference using half of the memory of the unquantized reference checkpoint.</p> <h2 class="relative group"><a id="sample-use-cases---load-and-run-an-fp8-model" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sample-use-cases---load-and-run-an-fp8-model"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Sample Use Cases - Load and run an FP8 model</span></h2> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer

	prompt = [
	<span class="hljs-string">"Hello, my name is"</span>,
	<span class="hljs-string">"The capital of France is"</span>,
	<span class="hljs-string">"The future of AI is"</span>
	]

	model_name = <span class="hljs-string">"nm-testing/Meta-Llama-3-8B-Instruct-fp8-hf_compat"</span>

	quantized_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=<span class="hljs-string">"auto"</span>)
	tokenizer = AutoTokenizer.from_pretrained(model_name)

	inputs = tokenizer(prompt, return_tensors=<span class="hljs-string">"pt"</span>)
	generated_ids = quantized_model.generate(**inputs, max_length=<span class="hljs-number">50</span>, do_sample=<span class="hljs-literal">False</span>)
	outputs = tokenizer.batch_decode(generated_ids)

	<span class="hljs-built_in">print</span>(outputs)

	<span class="hljs-string">"""
	['&lt;|begin_of_text|&gt;Hello, my name is [Name]. I am a [Your Profession/Student] and I am here to learn about the [Course/Program] at [University/Institution]. I am excited to be here and I am looking forward to', '&lt;|begin_of_text|&gt;The capital of France is Paris, which is located in the north-central part of the country. Paris is the most populous city in France and is known for its stunning architecture, art museums, fashion, and romantic atmosphere. The city is home to', "&lt;|begin_of_text|&gt;The future of AI is here, and it's already changing the way we live and work. From virtual assistants to self-driving cars, AI is transforming industries and revolutionizing the way we interact with technology. But what does the future of AI hold"]
	"""</span>
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14cjyka">The above shows a quick example for running generation using a <code>compressed-tensors</code>
	model. Currently, once loaded the model cannot be saved.</p> <h2 class="relative group"><a id="deep-dive-into-a-compressed-tensors-model-checkpoint" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deep-dive-into-a-compressed-tensors-model-checkpoint"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deep dive into a compressed-tensors model checkpoint</span></h2> <p data-svelte-h="svelte-10qolpt">In this example we will examine how the compressed-tensors model nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf is defined through its configuration entry and see how this translates to the loaded model representation.</p> <p data-svelte-h="svelte-1yyx6ne">First, let us look at the <a href="https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf/blob/main/config.json" rel="nofollow"><code>quantization_config</code> of the model</a>. 
At a glance it looks overwhelming with the number of entries but this is because compressed-tensors is a format that allows for flexible expression both during and after model compression.</p> <p data-svelte-h="svelte-1goyyy2">In practice for checkpoint loading and inference the configuration can be simplified to not include all the default or empty entries, so we will do that here to focus on what compression is actually represented.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-attr">"quantization_config":</span> {
	<span class="hljs-attr">"config_groups":</span> {
	<span class="hljs-attr">"group_0":</span> {
	<span class="hljs-attr">"input_activations":</span> {
	<span class="hljs-attr">"num_bits":</span> <span class="hljs-number">8</span>,
	<span class="hljs-attr">"strategy":</span> <span class="hljs-string">"tensor"</span>,
	<span class="hljs-attr">"type":</span> <span class="hljs-string">"float"</span>
	},
	<span class="hljs-attr">"targets":</span> [<span class="hljs-string">"Linear"</span>],
	<span class="hljs-attr">"weights":</span> {
	<span class="hljs-attr">"num_bits":</span> <span class="hljs-number">8</span>,
	<span class="hljs-attr">"strategy":</span> <span class="hljs-string">"tensor"</span>,
	<span class="hljs-attr">"type":</span> <span class="hljs-string">"float"</span>
	}
	}
	},
	<span class="hljs-attr">"format":</span> <span class="hljs-string">"naive-quantized"</span>,
	<span class="hljs-attr">"ignore":</span> [<span class="hljs-string">"lm_head"</span>],
	<span class="hljs-attr">"quant_method":</span> <span class="hljs-string">"compressed-tensors"</span>,
	<span class="hljs-attr">"quantization_status":</span> <span class="hljs-string">"frozen"</span>
	}<span class="hljs-string">,</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7czuzd">We can see from the above configuration that it is specifying one config group that includes weight and activation quantization to FP8 with a static per-tensor strategy. It is also worth noting that in the <code>ignore</code> list there is an entry to skip quantization of the <code>lm_head</code> module, so that module should be untouched in the checkpoint.</p> <p data-svelte-h="svelte-6j79nk">To see the result of the configuration in practice, we can simply use the <a href="https://huggingface.co/nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf?show_file_info=model.safetensors.index.json" rel="nofollow">safetensors viewer</a> on the model card to see the quantized weights, input_scale, and weight_scale for all of the Linear modules in the first model layer (and so on for the rest of the layers).</p> <table data-svelte-h="svelte-1ol90rr"><thead><tr><th>Tensors</th> <th>Shape</th> <th>Precision</th></tr></thead> <tbody><tr><td>model.layers.0.input_layernorm.weight</td> <td>[4 096]</td> <td>BF16</td></tr> <tr><td>model.layers.0.mlp.down_proj.input_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.mlp.down_proj.weight</td> <td>[4 096, 14 336]</td> <td>F8_E4M3</td></tr> <tr><td>model.layers.0.mlp.down_proj.weight_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.mlp.gate_proj.input_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.mlp.gate_proj.weight</td> <td>[14 336, 4 096]</td> <td>F8_E4M3</td></tr> <tr><td>model.layers.0.mlp.gate_proj.weight_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.mlp.up_proj.input_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.mlp.up_proj.weight</td> <td>[14 336, 4 096]</td> <td>F8_E4M3</td></tr> <tr><td>model.layers.0.mlp.up_proj.weight_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.post_attention_layernorm.weight</td> <td>[4 096]</td> 
<td>BF16</td></tr> <tr><td>model.layers.0.self_attn.k_proj.input_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.self_attn.k_proj.weight</td> <td>[1 024, 4 096]</td> <td>F8_E4M3</td></tr> <tr><td>model.layers.0.self_attn.k_proj.weight_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.self_attn.o_proj.input_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.self_attn.o_proj.weight</td> <td>[4 096, 4 096]</td> <td>F8_E4M3</td></tr> <tr><td>model.layers.0.self_attn.o_proj.weight_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.self_attn.q_proj.input_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.self_attn.q_proj.weight</td> <td>[4 096, 4 096]</td> <td>F8_E4M3</td></tr> <tr><td>model.layers.0.self_attn.q_proj.weight_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.self_attn.v_proj.input_scale</td> <td>[1]</td> <td>BF16</td></tr> <tr><td>model.layers.0.self_attn.v_proj.weight</td> <td>[1 024, 4 096]</td> <td>F8_E4M3</td></tr> <tr><td>model.layers.0.self_attn.v_proj.weight_scale</td> <td>[1]</td> <td>BF16</td></tr></tbody></table> <p data-svelte-h="svelte-1xwzei5">When we load the model with the compressed-tensors HFQuantizer integration, we can see that all of the Linear modules that are specified within the quantization configuration have been replaced by <code>CompressedLinear</code> modules that manage the compressed weights and forward pass for inference. 
Note that the <code>lm_head</code> mentioned before in the ignore list is still kept as an unquantized Linear module.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM

	ct_model = AutoModelForCausalLM.from_pretrained(<span class="hljs-string">"nm-testing/Meta-Llama-3.1-8B-Instruct-FP8-hf"</span>)
	<span class="hljs-built_in">print</span>(ct_model)
	<span class="hljs-string">"""
	LlamaForCausalLM(
	(model): LlamaModel(
	(embed_tokens): Embedding(128256, 4096)
	(layers): ModuleList(
	(0-31): 32 x LlamaDecoderLayer(
	(self_attn): LlamaSdpaAttention(
	(q_proj): CompressedLinear(
	in_features=4096, out_features=4096, bias=False
	(input_observer): MovingAverageMinMaxObserver()
	(weight_observer): MovingAverageMinMaxObserver()
	)
	(k_proj): CompressedLinear(
	in_features=4096, out_features=1024, bias=False
	(input_observer): MovingAverageMinMaxObserver()
	(weight_observer): MovingAverageMinMaxObserver()
	)
	(v_proj): CompressedLinear(
	in_features=4096, out_features=1024, bias=False
	(input_observer): MovingAverageMinMaxObserver()
	(weight_observer): MovingAverageMinMaxObserver()
	)
	(o_proj): CompressedLinear(
	in_features=4096, out_features=4096, bias=False
	(input_observer): MovingAverageMinMaxObserver()
	(weight_observer): MovingAverageMinMaxObserver()
	)
	(rotary_emb): LlamaRotaryEmbedding()
	)
	(mlp): LlamaMLP(
	(gate_proj): CompressedLinear(
	in_features=4096, out_features=14336, bias=False
	(input_observer): MovingAverageMinMaxObserver()
	(weight_observer): MovingAverageMinMaxObserver()
	)
	(up_proj): CompressedLinear(
	in_features=4096, out_features=14336, bias=False
	(input_observer): MovingAverageMinMaxObserver()
	(weight_observer): MovingAverageMinMaxObserver()
	)
	(down_proj): CompressedLinear(
	in_features=14336, out_features=4096, bias=False
	(input_observer): MovingAverageMinMaxObserver()
	(weight_observer): MovingAverageMinMaxObserver()
	)
	(act_fn): SiLU()
	)
	(input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
	(post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
	)
	)
	(norm): LlamaRMSNorm((4096,), eps=1e-05)
	(rotary_emb): LlamaRotaryEmbedding()
	)
	(lm_head): Linear(in_features=4096, out_features=128256, bias=False)
	)
	"""</span><!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization/compressed_tensors.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_z647wz = {
	assets: "/docs/transformers/pr_33913/en",
	base: "/docs/transformers/pr_33913/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/transformers/pr_33913/en/_app/immutable/entry/start.b67f883f.js"),
	import("/docs/transformers/pr_33913/en/_app/immutable/entry/app.e436b1f2.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 400],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 33.7 kB
Xet hash:: 8635156b3fe5799d33d8fc6987aca085ad7b438575425255363e1c1ab03d0ed2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.