Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /pr_37396 /en /quantization /torchao.html

rtrm

3 months ago

download

raw

46.6 kB

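	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;torchao&quot;,&quot;local&quot;:&quot;torchao&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Quantization examples&quot;,&quot;local&quot;:&quot;quantization-examples&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Autoquant&quot;,&quot;local&quot;:&quot;autoquant&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Serialization&quot;,&quot;local&quot;:&quot;serialization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Loading quantized models&quot;,&quot;local&quot;:&quot;loading-quantized-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;⚠️ Deprecation Notice&quot;,&quot;local&quot;:&quot;-deprecation-notice&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Resources&quot;,&quot;local&quot;:&quot;resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Issues&quot;,&quot;local&quot;:&quot;issues&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">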
	<link href="/docs/transformers/pr_37396/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/entry/start.3e00b6da.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/scheduler.18a86fab.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/singletons.240f24f3.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/index.40ab8126.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/paths.2edea311.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/entry/app.e7a7eda8.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/index.98837b22.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/nodes/0.78534817.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/nodes/465.54b4e4c3.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/Tip.77304350.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/CodeBlock.8d0c2e8a.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/Heading.7e7e0c5b.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/HfOption.6641485e.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/stores.aef3a054.js">
	<link rel="modulepreload" href="/docs/transformers/pr_37396/en/_app/immutable/chunks/index.f01015d9.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;torchao&quot;,&quot;local&quot;:&quot;torchao&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Quantization examples&quot;,&quot;local&quot;:&quot;quantization-examples&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Autoquant&quot;,&quot;local&quot;:&quot;autoquant&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Serialization&quot;,&quot;local&quot;:&quot;serialization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Loading quantized models&quot;,&quot;local&quot;:&quot;loading-quantized-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;⚠️ Deprecation Notice&quot;,&quot;local&quot;:&quot;-deprecation-notice&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Resources&quot;,&quot;local&quot;:&quot;resources&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Issues&quot;,&quot;local&quot;:&quot;issues&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="torchao" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#torchao"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>torchao</span></h1> <p data-svelte-h="svelte-8yu6d7"><a 
href="https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/quantization/torchao.ipynb" rel="nofollow"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab: Torchao Demo"></a></p> <p data-svelte-h="svelte-hptymg"><a href="https://github.com/pytorch/ao" rel="nofollow">torchao</a> is a PyTorch architecture optimization library with support for custom high performance data types, quantization, and sparsity. It is composable with native PyTorch features such as <a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a> for even faster inference and training.</p> <p data-svelte-h="svelte-k1mb66">See the table below for additional torchao features.</p> <table data-svelte-h="svelte-1x8g49v"><thead><tr><th>Feature</th> <th>Description</th></tr></thead> <tbody><tr><td><strong>Quantization Aware Training (QAT)</strong></td> <td>Train quantized models with minimal accuracy loss (see <a href="https://github.com/pytorch/ao/blob/main/torchao/quantization/qat/README.md" rel="nofollow">QAT README</a>)</td></tr> <tr><td><strong>Float8 Training</strong></td> <td>High-throughput training with float8 formats (see <a href="https://github.com/pytorch/torchtitan/blob/main/docs/float8.md" rel="nofollow">torchtitan</a> and <a href="https://huggingface.co/docs/accelerate/usage_guides/low_precision_training#configuring-torchao" rel="nofollow">Accelerate</a> docs)</td></tr> <tr><td><strong>Sparsity Support</strong></td> <td>Semi-structured (2:4) sparsity for faster inference (see <a href="https://pytorch.org/blog/accelerating-neural-network-training/" rel="nofollow">Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity</a> blog post)</td></tr> <tr><td><strong>Optimizer Quantization</strong></td> <td>Reduce optimizer state memory with 4 and 8-bit variants of Adam</td></tr> <tr><td><strong>KV Cache Quantization</strong></td> <td>Enables long context 
inference with lower memory (see <a href="https://github.com/pytorch/ao/blob/main/torchao/_models/llama/README.md" rel="nofollow">KV Cache Quantization</a>)</td></tr> <tr><td><strong>Custom Kernels Support</strong></td> <td>use your own <code>torch.compile</code> compatible ops</td></tr> <tr><td><strong>FSDP2</strong></td> <td>Composable with FSDP2 for training</td></tr></tbody></table> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1e1uvh6">Refer to the torchao <a href="https://github.com/pytorch/ao#torchao-pytorch-architecture-optimization" rel="nofollow">README.md</a> for more details about the library.</p></div> <p data-svelte-h="svelte-bctle4">torchao supports the <a href="https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md" rel="nofollow">quantization techniques</a> below.</p> <ul data-svelte-h="svelte-ymph8f"><li>A16W8 Int8 WeightOnly Quantization</li> <li>A16W4 WeightOnly Quantization</li> <li>A8W8 Int8 Dynamic Quantization</li> <li>A16W8 Float8 WeightOnly Quantization</li> <li>Autoquantization</li></ul> <p data-svelte-h="svelte-1o9cwfq">Check the table below to see if your hardware is compatible.</p> <table data-svelte-h="svelte-1ypy8nm"><thead><tr><th>Component</th> <th>Compatibility</th></tr></thead> <tbody><tr><td>CUDA Versions</td> <td>✅ cu118, cu124, cu126, cu128</td></tr> <tr><td>CPU</td> <td>✅ change <code>device_map="cpu"</code> (see examples below)</td></tr></tbody></table> <p data-svelte-h="svelte-1wnb12y">Install torchao from PyPi or the PyTorch index with the following commands.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">PyPi </div><div 
class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">PyTorch Index </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Updating 🤗 Transformers to the latest version, as the example script below uses the new auto compilation</span>
	<span class="hljs-comment"># Stable release from PyPI, which defaults to CUDA 12.4</span>
	pip install --upgrade torchao transformers<!-- HTML_TAG_END --></pre></div> </div> <p data-svelte-h="svelte-vh1vhn">If your torchao version is below 0.10.0, you need to upgrade it. Refer to the <a href="#-deprecation-notice">deprecation notice</a> for more details.</p> <h2 class="relative group"><a id="quantization-examples" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#quantization-examples"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Quantization examples</span></h2> <p data-svelte-h="svelte-mfxtr3">TorchAO provides a variety of quantization configurations. 
Each configuration can be further customized with parameters such as <code>group_size</code>, <code>scheme</code>, and <code>layout</code> to optimize for specific hardware and model architectures.</p> <p data-svelte-h="svelte-1omxb57">For a complete list of available configurations, see the <a href="https://github.com/pytorch/ao/blob/main/torchao/quantization/quant_api.py" rel="nofollow">quantization API documentation</a>.</p> <p data-svelte-h="svelte-hxxsaz">You can manually choose the quantization types and settings or automatically select the quantization types.</p> <p data-svelte-h="svelte-2gxn41">Create a <a href="/docs/transformers/pr_37396/en/main_classes/quantization#transformers.TorchAoConfig">TorchAoConfig</a> and specify the quantization type and <code>group_size</code> of the weights to quantize (for int8 weight only and int4 weight only). Set the <code>cache_implementation</code> to <code>"static"</code> to automatically <a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a> the forward method.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd border-gray-800 bg-black dark:bg-gray-700 text-white">int8-weight-only cuda </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">int8-weight-only cpu </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">int4-weight-only cuda </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">int4-weight-only 
cpu </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">int8-dynamic-quantization cuda </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">int8-dynamic-quantization cpu </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">float8-weight-only cuda </div><div class="flex items-center border rounded-lg px-1.5 py-1 leading-none select-none text-smd text-gray-500 cursor-pointer opacity-90 hover:text-gray-700 dark:hover:text-gray-200 hover:shadow-sm">float8-weight-only cpu </div></div> <div class="language-select"><div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 
border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
	<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> Int8WeightOnlyConfig

	quant_config = Int8WeightOnlyConfig(group_size=<span class="hljs-number">128</span>)
	quantization_config = TorchAoConfig(quant_type=quant_config)

	<span class="hljs-comment"># Load and quantize the model</span>
	quantized_model = AutoModelForCausalLM.from_pretrained(
	<span class="hljs-string">"meta-llama/Llama-3.1-8B-Instruct"</span>,
	torch_dtype=<span class="hljs-string">"auto"</span>,
	device_map=<span class="hljs-string">"auto"</span>,
	quantization_config=quantization_config
	)

	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"meta-llama/Llama-3.1-8B-Instruct"</span>)
	input_text = <span class="hljs-string">"What are we having for dinner?"</span>
	input_ids = tokenizer(input_text, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>)

	<span class="hljs-comment"># auto-compile the quantized model with `cache_implementation="static"` to get speed up</span>
	output = quantized_model.generate(**input_ids, max_new_tokens=<span class="hljs-number">10</span>, cache_implementation=<span class="hljs-string">"static"</span>)
	<span class="hljs-built_in">print</span>(tokenizer.decode(output[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>))<!-- HTML_TAG_END --></pre></div> </div> <h3 class="relative group"><a id="autoquant" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#autoquant"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Autoquant</span></h3> <p data-svelte-h="svelte-1fr0k2r">If you want to automatically choose a quantization type for quantizable layers (<code>nn.Linear</code>), you can use the <a href="https://pytorch.org/ao/stable/generated/torchao.quantization.autoquant.html#torchao.quantization.autoquant" rel="nofollow">autoquant</a> API.</p> <p data-svelte-h="svelte-axxx0x">The <code>autoquant</code> API automatically chooses a quantization type by micro-benchmarking on input type and shape and compiling a single linear layer.</p> <p data-svelte-h="svelte-1ijm8rh">Create a <a href="/docs/transformers/pr_37396/en/main_classes/quantization#transformers.TorchAoConfig">TorchAoConfig</a> and set <code>quant_type</code> to <code>"autoquant"</code>. 
Set the <code>cache_implementation</code> to <code>"static"</code> to automatically <a href="https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html" rel="nofollow">torch.compile</a> the forward method. Finally, call <code>finalize_autoquant</code> on the quantized model to finalize the quantization and log the input shapes.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TorchAoConfig, AutoModelForCausalLM, AutoTokenizer

	quantization_config = TorchAoConfig(<span class="hljs-string">"autoquant"</span>, min_sqnr=<span class="hljs-literal">None</span>)
	quantized_model = AutoModelForCausalLM.from_pretrained(
	<span class="hljs-string">"meta-llama/Llama-3.1-8B-Instruct"</span>,
	torch_dtype=<span class="hljs-string">"auto"</span>,
	device_map=<span class="hljs-string">"auto"</span>,
	quantization_config=quantization_config
	)

	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"meta-llama/Llama-3.1-8B-Instruct"</span>)
	input_text = <span class="hljs-string">"What are we having for dinner?"</span>
	input_ids = tokenizer(input_text, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>)

	<span class="hljs-comment"># auto-compile the quantized model with `cache_implementation="static"` to get speed up</span>
	output = quantized_model.generate(**input_ids, max_new_tokens=<span class="hljs-number">10</span>, cache_implementation=<span class="hljs-string">"static"</span>)
	<span class="hljs-comment"># explicitly call `finalize_autoquant` (may be refactored and removed in the future)</span>
	quantized_model.finalize_autoquant()
	<span class="hljs-built_in">print</span>(tokenizer.decode(output[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>))<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="serialization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#serialization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Serialization</span></h2> <p data-svelte-h="svelte-1g3oift">torchao implements <a href="https://pytorch.org/docs/stable/notes/extending.html#subclassing-torch-tensor" rel="nofollow">torch.Tensor subclasses</a> for maximum flexibility in supporting new quantized torch.Tensor formats. <a href="https://huggingface.co/docs/safetensors/en/index" rel="nofollow">Safetensors</a> serialization and deserialization do not work with torchao.</p> <p data-svelte-h="svelte-5ma9bd">To avoid arbitrary user code execution, torchao sets <code>weights_only=True</code> in <a href="https://pytorch.org/docs/stable/generated/torch.load.html" rel="nofollow">torch.load</a> to ensure only tensors are loaded. 
Any known user functions can be whitelisted with <a href="https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals" rel="nofollow">add_safe_globals</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># don't serialize model with Safetensors</span>
	output_dir = <span class="hljs-string">"llama3-8b-int4wo-128"</span>
	quantized_model.save_pretrained(<span class="hljs-string">"llama3-8b-int4wo-128"</span>, safe_serialization=<span class="hljs-literal">False</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="loading-quantized-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#loading-quantized-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Loading quantized models</span></h2> <p data-svelte-h="svelte-14fyln8">Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. 
The example below demonstrates quantizing a model on the CPU and then loading it on CUDA.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
	<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> Int8WeightOnlyConfig

	quant_config = Int8WeightOnlyConfig(group_size=<span class="hljs-number">128</span>)
	quantization_config = TorchAoConfig(quant_type=quant_config)

	<span class="hljs-comment"># Load and quantize the model</span>
	quantized_model = AutoModelForCausalLM.from_pretrained(
	<span class="hljs-string">"meta-llama/Llama-3.1-8B-Instruct"</span>,
	torch_dtype=<span class="hljs-string">"auto"</span>,
	device_map=<span class="hljs-string">"cpu"</span>,
	quantization_config=quantization_config
	)
	<span class="hljs-comment"># save the quantized model</span>
	output_dir = <span class="hljs-string">"llama-3.1-8b-torchao-int8-cuda"</span>
	quantized_model.save_pretrained(output_dir, safe_serialization=<span class="hljs-literal">False</span>)

	<span class="hljs-comment"># reload the quantized model</span>
	reloaded_model = AutoModelForCausalLM.from_pretrained(
	output_dir,
	device_map=<span class="hljs-string">"auto"</span>,
	torch_dtype=torch.bfloat16
	)
	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"meta-llama/Llama-3.1-8B-Instruct"</span>)
	input_text = <span class="hljs-string">"What are we having for dinner?"</span>
	input_ids = tokenizer(input_text, return_tensors=<span class="hljs-string">"pt"</span>).to(<span class="hljs-string">"cuda"</span>)

	output = reloaded_model.generate(**input_ids, max_new_tokens=<span class="hljs-number">10</span>)
	<span class="hljs-built_in">print</span>(tokenizer.decode(output[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>))
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1oj3qy1">For int4, the model can only be loaded on the same device it was quantized on because the layout is specific to the device. The example below demonstrates quantizing and loading a model on the CPU.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> torch
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
	<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> Int4WeightOnlyConfig
	<span class="hljs-keyword">from</span> torchao.dtypes <span class="hljs-keyword">import</span> Int4CPULayout

	quant_config = Int4WeightOnlyConfig(group_size=<span class="hljs-number">128</span>, layout=Int4CPULayout())
	quantization_config = TorchAoConfig(quant_type=quant_config)

	<span class="hljs-comment"># Load and quantize the model</span>
	quantized_model = AutoModelForCausalLM.from_pretrained(
	<span class="hljs-string">"meta-llama/Llama-3.1-8B-Instruct"</span>,
	torch_dtype=<span class="hljs-string">"auto"</span>,
	device_map=<span class="hljs-string">"cpu"</span>,
	quantization_config=quantization_config
	)
	<span class="hljs-comment"># save the quantized model</span>
	output_dir = <span class="hljs-string">"llama-3.1-8b-torchao-int4-cpu"</span>
	quantized_model.save_pretrained(output_dir, safe_serialization=<span class="hljs-literal">False</span>)

	<span class="hljs-comment"># reload the quantized model</span>
	reloaded_model = AutoModelForCausalLM.from_pretrained(
	output_dir,
	device_map=<span class="hljs-string">"cpu"</span>,
	torch_dtype=torch.bfloat16
	)
	tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"meta-llama/Llama-3.1-8B-Instruct"</span>)
	input_text = <span class="hljs-string">"What are we having for dinner?"</span>
	input_ids = tokenizer(input_text, return_tensors=<span class="hljs-string">"pt"</span>)

	output = reloaded_model.generate(**input_ids, max_new_tokens=<span class="hljs-number">10</span>)
	<span class="hljs-built_in">print</span>(tokenizer.decode(output[<span class="hljs-number">0</span>], skip_special_tokens=<span class="hljs-literal">True</span>))
	<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-deprecation-notice" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-deprecation-notice"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>⚠️ Deprecation Notice</span></h2> <blockquote><p data-svelte-h="svelte-jmxllg">Starting with version 0.10.0, the string-based API for quantization configuration (e.g., <code>TorchAoConfig("int4_weight_only", group_size=128)</code>) is <strong>deprecated</strong> and will be removed in a future release.</p> <p data-svelte-h="svelte-1qak5au">Please use the new <code>AOBaseConfig</code>-based approach instead:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Old way (deprecated)</span>
	quantization_config = TorchAoConfig(<span class="hljs-string">"int4_weight_only"</span>, group_size=<span class="hljs-number">128</span>)

	<span class="hljs-comment"># New way (recommended)</span>
	<span class="hljs-keyword">from</span> torchao.quantization <span class="hljs-keyword">import</span> Int4WeightOnlyConfig
	quant_config = Int4WeightOnlyConfig(group_size=<span class="hljs-number">128</span>)
	quantization_config = TorchAoConfig(quant_type=quant_config)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8tiw44">The new API offers greater flexibility, better type safety, and access to the full range of features available in torchao.</p> <p data-svelte-h="svelte-1wt1icj"><a href="#migration-guide">Migration Guide</a></p> <p data-svelte-h="svelte-1kyext4">Here’s how to migrate from common string identifiers to their <code>AOBaseConfig</code> equivalents:</p> <table data-svelte-h="svelte-1w9xui3"><thead><tr><th>Old String API</th> <th>New <code>AOBaseConfig</code> API</th></tr></thead> <tbody><tr><td><code>"int4_weight_only"</code></td> <td><code>Int4WeightOnlyConfig()</code></td></tr> <tr><td><code>"int8_weight_only"</code></td> <td><code>Int8WeightOnlyConfig()</code></td></tr> <tr><td><code>"int8_dynamic_activation_int8_weight"</code></td> <td><code>Int8DynamicActivationInt8WeightConfig()</code></td></tr></tbody></table> <p data-svelte-h="svelte-9z9ctj">All configuration objects accept parameters for customization (e.g., <code>group_size</code>, <code>scheme</code>, <code>layout</code>).</p></blockquote> <h2 class="relative group"><a id="resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 
56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Resources</span></h2> <p data-svelte-h="svelte-1cgj2bs">For a better sense of expected performance, view the <a href="https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks" rel="nofollow">benchmarks</a> for various models with CUDA and XPU backends. You can also run the code below to benchmark a model yourself.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> torch._inductor.utils <span class="hljs-keyword">import</span> do_bench_using_profiling
	<span class="hljs-keyword">from</span> typing <span class="hljs-keyword">import</span> <span class="hljs-type">Callable</span>

	<span class="hljs-keyword">def</span> <span class="hljs-title function_">benchmark_fn</span>(<span class="hljs-params">func: <span class="hljs-type">Callable</span>, *args, **kwargs</span>) -> <span class="hljs-built_in">float</span>:
	    <span class="hljs-string">"""Thin wrapper around do_bench_using_profiling"""</span>
	    no_args = <span class="hljs-keyword">lambda</span>: func(*args, **kwargs)
	    time = do_bench_using_profiling(no_args)
	    <span class="hljs-keyword">return</span> time * <span class="hljs-number">1e3</span>

	MAX_NEW_TOKENS = <span class="hljs-number">1000</span>
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"int4wo-128 model:"</span>, benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation=<span class="hljs-string">"static"</span>))

	bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map=<span class="hljs-string">"auto"</span>, torch_dtype=torch.bfloat16)
	output = bf16_model.generate(**input_ids, max_new_tokens=<span class="hljs-number">10</span>, cache_implementation=<span class="hljs-string">"static"</span>) <span class="hljs-comment"># auto-compile</span>
	<span class="hljs-built_in">print</span>(<span class="hljs-string">"bf16 model:"</span>, benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation=<span class="hljs-string">"static"</span>))<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-dcid7j">For best performance, you can use recommended settings by calling <code>torchao.quantization.utils.recommended_inductor_config_setter()</code></p></div> <p data-svelte-h="svelte-fj0t1q">Refer to <a href="https://github.com/pytorch/ao/tree/main/torchao/quantization#other-available-quantization-techniques" rel="nofollow">Other Available Quantization Techniques</a> for more examples and documentation.</p> <h2 class="relative group"><a id="issues" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#issues"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Issues</span></h2> <p data-svelte-h="svelte-1auvp72">If 
you encounter any issues with the Transformers integration, please open an issue on the <a href="https://github.com/huggingface/transformers/issues" rel="nofollow">Transformers</a> repository. For issues directly related to torchao, please open an issue on the <a href="https://github.com/pytorch/ao/issues" rel="nofollow">torchao</a> repository.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/quantization/torchao.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	// Page bootstrap: publishes a path config object, then loads the two entry
	// modules and hands control to kit.start(). The outer braces form a plain
	// block, so the const bindings below do not leak into the global scope.
	{
	// Assigned without var/let/const, so this targets a global-scope name
	// rather than a block-local binding. assets and base hold the same path.
	__sveltekit_1s09dg7 = {
	assets: "/docs/transformers/pr_37396/en",
	base: "/docs/transformers/pr_37396/en",
	env: {}
	};

	// Parent element of this <script> tag; passed to kit.start() as its second
	// argument. It is captured here, synchronously, because
	// document.currentScript is only set while the script itself is executing
	// and would be null inside the .then() callback below.
	const element = document.currentScript.parentElement;

	// Forwarded unchanged as the `data` option of kit.start().
	// NOTE(review): presumably one slot per entry in node_ids — confirm against SvelteKit.
	const data = [null,null];

	// Request both entry modules in parallel; the callback runs only after
	// both dynamic imports have resolved.
	Promise.all([
	import("/docs/transformers/pr_37396/en/_app/immutable/entry/start.3e00b6da.js"),
	import("/docs/transformers/pr_37396/en/_app/immutable/entry/app.e7a7eda8.js")
	]).then(([kit, app]) => {
	// kit is the start.*.js module and app is the app.*.js module, in the
	// same order as the import() array above.
	kit.start(app, element, {
	// These options are passed through as-is; their meaning is defined by the
	// kit module and is not visible in this file.
	node_ids: [0, 465],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 46.6 kB
Xet hash:: 97fdfe0e3d7924ca00cb63de7c14ceb202cb886f72648ae30cbe48434a51767e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.