Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /pr_26617 /en /quantization /contribute.html

rtrm

about 1 month ago

download

raw

35 kB

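	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Contribute&quot;,&quot;local&quot;:&quot;contribute&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Requirements&quot;,&quot;local&quot;:&quot;requirements&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Create new HFQuantizer class&quot;,&quot;local&quot;:&quot;create-new-hfquantizer-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Files overview&quot;,&quot;local&quot;:&quot;files-overview&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Understanding get_quantize_ops vs get_weight_conversions&quot;,&quot;local&quot;:&quot;understanding-getquantizeops-vs-getweightconversions&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;get_quantize_ops — Quantize on the fly&quot;,&quot;local&quot;:&quot;getquantizeops--quantize-on-the-fly&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;get_weight_conversions — Load pre-quantized checkpoints&quot;,&quot;local&quot;:&quot;getweightconversions--load-pre-quantized-checkpoints&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">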
	<link href="/docs/transformers/pr_26617/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/entry/start.b5ae2c21.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/scheduler.31fdf58d.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/singletons.512cdb48.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/index.252883d5.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/paths.81255c3b.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/entry/app.9acf2c3e.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/preload-helper.bb442aeb.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/index.2f76fdf0.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/nodes/0.da6b3909.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/nodes/565.8a4215f6.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/CopyLLMTxtMenu.a69e059a.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.e4c7f916.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/IconCopy.ac192424.js">
	<link rel="modulepreload" href="/docs/transformers/pr_26617/en/_app/immutable/chunks/CodeBlock.ab12f8e1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Contribute&quot;,&quot;local&quot;:&quot;contribute&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Requirements&quot;,&quot;local&quot;:&quot;requirements&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Create new HFQuantizer class&quot;,&quot;local&quot;:&quot;create-new-hfquantizer-class&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Files overview&quot;,&quot;local&quot;:&quot;files-overview&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Understanding get_quantize_ops vs get_weight_conversions&quot;,&quot;local&quot;:&quot;understanding-getquantizeops-vs-getweightconversions&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;get_quantize_ops — Quantize on the fly&quot;,&quot;local&quot;:&quot;getquantizeops--quantize-on-the-fly&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;get_weight_conversions — Load pre-quantized checkpoints&quot;,&quot;local&quot;:&quot;getweightconversions--load-pre-quantized-checkpoints&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" 
transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="contribute" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#contribute"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Contribute</span></h1> <p data-svelte-h="svelte-1srj5sl">Transformers supports many 
quantization methods such as QLoRA, GPTQ, LLM.int8, and AWQ. However, there are still many more quantization approaches that haven’t been integrated yet. To make adding and using these quantization methods with Transformers easier, use the <a href="/docs/transformers/pr_26617/en/main_classes/quantization#transformers.quantizers.HfQuantizer">HfQuantizer</a> class. <a href="/docs/transformers/pr_26617/en/main_classes/quantization#transformers.quantizers.HfQuantizer">HfQuantizer</a> is designed to be an internal helper class for adding a quantization method instead of something applied to every PyTorch module.</p> <p data-svelte-h="svelte-18epg7">This guide will show you how to integrate a new quantization method with <a href="/docs/transformers/pr_26617/en/main_classes/quantization#transformers.quantizers.HfQuantizer">HfQuantizer</a>.</p> <h2 class="relative group"><a id="requirements" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#requirements"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Requirements</span></h2> <p data-svelte-h="svelte-1xzixq6">Before integrating a new quantization method into Transformers, ensure the 
method meets the following requirements. Only quantization methods that can be run with PyTorch modules are supported.</p> <ul><li data-svelte-h="svelte-sv9ywh"><p>The quantization method is available through a Python package that is pip-installable (it is also fine if you can only install the package from source). Ideally, pre-compiled kernels are included in the pip package.</p></li> <li data-svelte-h="svelte-9za9zo"><p>The method can run on commonly-used hardware (CPU, GPU, etc.).</p></li> <li><p data-svelte-h="svelte-jibffa">The method is wrapped in a <a href="https://pytorch.org/docs/stable/generated/torch.nn.Module.html" rel="nofollow">nn.Module</a> (<code>~bitsandbytes.nn.Linear8bitLt</code>, <code>~bitsandbytes.nn.Linear4bit</code>), and the quantized linear layer should have the following definition.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre 
class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">Linear4bit</span>(nn.Module):
    <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, ...</span>):
        ...

    <span class="hljs-keyword">def</span> <span class="hljs-title function_">forward</span>(<span class="hljs-params">self, x</span>):
        <span class="hljs-keyword">return</span> my_4bit_kernel(x, self.weight, self.bias)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-x4ekep">This way, Transformers models are easily quantized by replacing instances of <a href="https://pytorch.org/docs/stable/generated/torch.nn.Linear.html" rel="nofollow">nn.Linear</a> with a target class.</p></li> <li data-svelte-h="svelte-z7e46y"><p>The quantization method should be serializable. You can save the quantized weights locally or push them to the Hub.</p></li> <li data-svelte-h="svelte-1fr6u90"><p>Make sure the package containing the quantization kernels/primitive is stable (no frequent breaking changes).</p></li></ul> <p data-svelte-h="svelte-70g41u">Some quantization methods may require “pre-quantizing” the model through data calibration (AWQ). In this case, we prefer to only support inference in Transformers and let the third-party library maintained by the ML community handle the model quantization itself.</p> <h2 class="relative group"><a id="create-new-hfquantizer-class" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#create-new-hfquantizer-class"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>Create new HFQuantizer class</span></h2> <ol start="0" data-svelte-h="svelte-1sqj1o3"><li><p>The best starting point would be to have a look at another quantization method such as Finegrained Fp8. You will have to update or create three files in total: the <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/utils/quantization_config.py" rel="nofollow">config file</a>, the <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/finegrained_fp8.py" rel="nofollow">integration file</a> and the <a href="https://github.com/huggingface/transformers/blob/main/src/transformers/quantizers/quantizer_finegrained_fp8.py" rel="nofollow">quantizer file</a>.</p></li> <li><p>Create a new quantization config class inside <a href="https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/utils/quantization_config.py" rel="nofollow">src/transformers/utils/quantization_config.py</a>. Add the new quantization config to the <a href="https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/__init__.py#L1088" rel="nofollow">_import_structure</a> inside Transformers’ <a href="https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/__init__.py" rel="nofollow">src/transformers/__init__.py</a> file.</p></li> <li><p>Create a new file inside <a href="https://github.com/huggingface/transformers/tree/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers" rel="nofollow">src/transformers/quantizers/</a> named <code>quantizer_your_method.py</code>, and make it inherit from <a href="/docs/transformers/pr_26617/en/main_classes/quantization#transformers.quantizers.HfQuantizer">HfQuantizer</a>. 
Make sure to add the new quantizer and quantization config in the quantization auto-mapping in <a href="https://github.com/huggingface/transformers/blob/abbffc4525566a48a9733639797c812301218b83/src/transformers/quantizers/auto.py" rel="nofollow">src/transformers/quantizers/auto.py</a>.</p></li> <li><p>Define the following class attributes and property methods for your quantization method:</p> <ul><li><code>requires_calibration</code>: Whether the quantization method requires a data calibration process. If set to <code>True</code>, you can only support inference (with quantized weights) and not inference and quantization.</li> <li><code>is_serializable</code>: A property method to determine whether the method is serializable or not.</li> <li><code>is_trainable</code>: A property method to determine whether you can fine-tune models on top of the quantization method (with or without PEFT approaches).</li></ul></li> <li><p>Write the <code>validate_environment</code> and <code>update_dtype</code> methods. These methods are called before creating the quantized model to ensure users use the right configuration. Refer to other quantizers for an example of how it is implemented.</p></li> <li><p>Write the <code>_process_model_before_weight_loading</code> method. In Transformers, the quantized models are initialized first on the <code>"meta"</code> device before loading the weights. 
This means the <code>_process_model_before_weight_loading</code> method takes care of manipulating the model skeleton to replace some modules (<a href="https://pytorch.org/docs/stable/generated/torch.nn.Linear.html" rel="nofollow">nn.Linear</a>) with the target modules (quantization modules).</p></li></ol> <p data-svelte-h="svelte-1ch1gag">You can define module replacement logic or any other utility method by creating a new file in <a href="https://github.com/huggingface/transformers/tree/abbffc4525566a48a9733639797c812301218b83/src/transformers/integrations" rel="nofollow">src/transformers/integrations/</a> and exposing the relevant methods in that folder’s <code>__init__.py</code> file.</p> <ol start="6" data-svelte-h="svelte-1r6yr7w"><li><p>Add the <code>get_quantize_ops</code> method to the quantizer class if the quantization supports quantizing on the fly. In Transformers, we materialize each tensor and apply a sequence of different operations on it. In our case, the quantization operation happens at the end. You need to create a <code>XXXQuantize</code>, a subclass of <code>ConversionOps</code>, and add a <code>convert</code> method. In the <code>convert</code> method, you need to quantize the weights and return a dictionary of quantized params.</p></li> <li><p>Add the <code>get_weight_conversions</code> method to the quantizer class if the quantization supports loading pre-quantized weights. In Transformers, we can collect multiple tensors and apply operations on them. This is particularly useful when we have tensors in the checkpoint that need to be regrouped to re-create the quantized tensors.</p></li> <li><p>Write the <code>_process_model_after_weight_loading</code> method if needed. This method enables implementing additional features that require manipulating the model after loading the weights.</p></li> <li><p>Document everything! 
Make sure your quantization method is documented by adding a new file under <code>docs/source/en/quantization</code>.</p></li> <li><p>You should add tests by adding the package in our nightly Dockerfile inside <code>docker/transformers-quantization-latest-gpu</code> and then adding a new test file in <code>tests/quantization/xxx</code>. Feel free to check out existing quantization methods to see how it is implemented.</p></li></ol> <h2 class="relative group"><a id="files-overview" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#files-overview"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Files overview</span></h2> <table data-svelte-h="svelte-zoim65"><thead><tr><th>File</th> <th>Purpose</th></tr></thead> <tbody><tr><td><code>utils/quantization_config.py</code></td> <td>Define <code>YourMethodConfig</code> inheriting from <code>QuantizationConfigMixin</code></td></tr> <tr><td><code>quantizers/quantizer_your_method.py</code></td> <td>Implement <code>YourMethodHfQuantizer</code> inheriting from <code>HfQuantizer</code></td></tr> <tr><td><code>integrations/your_method.py</code></td> <td>Implement <code>ConversionOps</code> 
subclasses and helper functions</td></tr> <tr><td><code>quantizers/auto.py</code></td> <td>Register quantizer and config in <code>AUTO_QUANTIZER_MAPPING</code> and <code>AUTO_QUANTIZATION_CONFIG_MAPPING</code></td></tr> <tr><td><code>docs/source/en/quantization/your_method.md</code></td> <td>Document usage for users</td></tr> <tr><td><code>tests/quantization/your_method/</code></td> <td>Add integration tests</td></tr></tbody></table> <h2 class="relative group"><a id="understanding-getquantizeops-vs-getweightconversions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#understanding-getquantizeops-vs-getweightconversions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Understanding get_quantize_ops vs get_weight_conversions</span></h2> <p data-svelte-h="svelte-q06yvu">These two methods handle different scenarios for loading weights. 
Understanding when to use each is essential.</p> <h3 class="relative group"><a id="getquantizeops--quantize-on-the-fly" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#getquantizeops--quantize-on-the-fly"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>get_quantize_ops — Quantize on the fly</span></h3> <p data-svelte-h="svelte-1ir5kap">Use this when loading a <strong>non-quantized checkpoint</strong> (e.g., float16/bfloat16 weights) and quantizing during load.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" 
transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Checkpoint: model.safetensors (float16 weights for example)
	↓
	get_quantize_ops → YourQuantize.convert()
	↓
	Result: Quantized weights in memory<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-11dq0dc">The <code>convert</code> method receives one tensor at a time, quantizes it, and can return a dictionary of quantized params, for example:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">class</span> <span class="hljs-title class_">YourQuantize</span>(<span class="hljs-title class_ inherited__">ConversionOps</span>):
    <span class="hljs-keyword">def</span> <span class="hljs-title function_">convert</span>(<span class="hljs-params">self, input_dict, model, full_layer_name, missing_keys, **kwargs</span>):
        <span class="hljs-comment"># input_dict = {"layer.weight": &lt;float16 tensor&gt;}</span>
        value = <span class="hljs-built_in">list</span>(input_dict.values())[<span class="hljs-number">0</span>]
        module, tensor_name = get_module_from_name(model, full_layer_name)

        <span class="hljs-comment"># Quantize and assign</span>
        quantized, scale, zero_point = your_quantize_fn(value)
        <span class="hljs-keyword">return</span> {full_layer_name: quantized, full_layer_name + <span class="hljs-string">".scale"</span>: scale, full_layer_name + <span class="hljs-string">".zero_point"</span>: zero_point}<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="getweightconversions--load-pre-quantized-checkpoints" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#getweightconversions--load-pre-quantized-checkpoints"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>get_weight_conversions — Load pre-quantized checkpoints</span></h3> <p data-svelte-h="svelte-10qrum6">Use this when loading a <strong>pre-quantized checkpoint</strong> where the quantized weights are saved as several separate components (such as data, scale, and zero point), and these need to be combined into one tensor during loading. Not all quantization methods require this reconstruction step: for example, some methods like FP8 simply load weights and scales as-is, without combining them. 
Others, such as torchao, do require reassembling the quantized tensor from its multiple saved components.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Checkpoint: model.safetensors (quantized components)
	- layer._weight_qdata
	- layer._weight_scale
	- layer._weight_zero_point
	↓
	get_weight_conversions → WeightConverter + YourDeserialize.convert()
	↓
	Result: Reconstructed quantized tensor → layer.weight<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-2anuij">The <code>WeightConverter</code> collects related tensors based on <code>source_patterns</code>, then passes them to your <code>convert</code> method:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_weight_conversions</span>(<span class="hljs-params">self</span>):
    <span class="hljs-keyword">if</span> self.pre_quantized:
        <span class="hljs-keyword">return</span> [
            WeightConverter(
                source_patterns=[<span class="hljs-string">"_weight_qdata"</span>, <span class="hljs-string">"_weight_scale"</span>, <span class="hljs-string">"_weight_zero_point"</span>],
                target_patterns=<span class="hljs-string">"weight"</span>,
                operations=[YourDeserialize(self)],
            ),
        ]
    <span class="hljs-keyword">return</span> []


<span class="hljs-keyword">class</span> <span class="hljs-title class_">YourDeserialize</span>(<span class="hljs-title class_ inherited__">ConversionOps</span>):
    <span class="hljs-keyword">def</span> <span class="hljs-title function_">convert</span>(<span class="hljs-params">self, input_dict, model, full_layer_name, **kwargs</span>):
        <span class="hljs-comment"># input_dict contains all collected tensors</span>
        <span class="hljs-comment"># Reconstruct the quantized tensor from components</span>
        reconstructed_tensor = reconstruct_from_components(input_dict)
        <span class="hljs-keyword">return</span> {full_layer_name: reconstructed_tensor}<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization/contribute.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	// Inline bootstrap: publishes the page's path configuration, then loads the
	// two entry modules and starts the client app on this script's parent element.
	//
	// The name below is assigned without a declaration; in this classic
	// (non-module) script that creates a global property rather than a block-scoped
	// binding. NOTE(review): presumably the imported entry modules read this global
	// by name — confirm before renaming it.
	__sveltekit_1x0t0ja = {
	assets: "/docs/transformers/pr_26617/en",
	base: "/docs/transformers/pr_26617/en",
	env: {}
	};

	// Captured synchronously, before the dynamic imports resolve: this is the
	// element that contains this <script> tag and is passed to kit.start() below.
	const element = document.currentScript.parentElement;

	// Two null entries; NOTE(review): looks like one slot per id in node_ids
	// below (both have length 2) — confirm against the start module's contract.
	const data = [null,null];

	// Request both entry modules in parallel; start only once both have loaded.
	Promise.all([
	import("/docs/transformers/pr_26617/en/_app/immutable/entry/start.b5ae2c21.js"),
	import("/docs/transformers/pr_26617/en/_app/immutable/entry/app.9acf2c3e.js")
	]).then(([kit, app]) => {
	// No rejection handler is attached: a failed import surfaces as an
	// unhandled promise rejection and start() is never called.
	kit.start(app, element, {
	node_ids: [0, 565],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 35 kB
Xet hash:: 7d40775bf252edc71b5a02971f27795206ecd01678590c66e14fd64e58094b7e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.