Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / transformers /main /en /gguf.html

rtrm

about 1 month ago

download

raw

16.5 kB

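	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;GGUF and interaction with Transformers&quot;,&quot;local&quot;:&quot;gguf-and-interaction-with-transformers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Support within Transformers&quot;,&quot;local&quot;:&quot;support-within-transformers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Supported quantization types&quot;,&quot;local&quot;:&quot;supported-quantization-types&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Supported model architectures&quot;,&quot;local&quot;:&quot;supported-model-architectures&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Example usage&quot;,&quot;local&quot;:&quot;example-usage&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">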
	<link href="/docs/transformers/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/scheduler.25b97de1.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/singletons.0f2b7d5f.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.e188933d.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/paths.3d04d2c6.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.d9030fc9.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/0.026d2fdd.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/23.dea0f42a.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
	<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;GGUF and interaction with Transformers&quot;,&quot;local&quot;:&quot;gguf-and-interaction-with-transformers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Support within Transformers&quot;,&quot;local&quot;:&quot;support-within-transformers&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Supported quantization types&quot;,&quot;local&quot;:&quot;supported-quantization-types&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Supported model architectures&quot;,&quot;local&quot;:&quot;supported-model-architectures&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Example usage&quot;,&quot;local&quot;:&quot;example-usage&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="gguf-and-interaction-with-transformers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#gguf-and-interaction-with-transformers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>GGUF and interaction with Transformers</span></h1> <p data-svelte-h="svelte-117klwz">The GGUF file format is used to store models for inference with <a href="https://github.com/ggerganov/ggml" 
rel="nofollow">GGML</a> and other
	libraries that depend on it, like the very popular <a href="https://github.com/ggerganov/llama.cpp" rel="nofollow">llama.cpp</a> or
	<a href="https://github.com/ggerganov/whisper.cpp" rel="nofollow">whisper.cpp</a>.</p> <p data-svelte-h="svelte-14alsnt">It is a file format <a href="https://huggingface.co/docs/hub/en/gguf" rel="nofollow">supported by the Hugging Face Hub</a> with features
	allowing for quick inspection of tensors and metadata within the file.</p> <p data-svelte-h="svelte-1lcntb8">This file format is designed as a “single-file-format” where a single file usually contains both the configuration
	attributes, the tokenizer vocabulary and other attributes, as well as all tensors to be loaded in the model. These
	files come in different formats according to the quantization type of the file. We briefly go over some of them
	<a href="https://huggingface.co/docs/hub/en/gguf#quantization-types" rel="nofollow">here</a>.</p> <h2 class="relative group"><a id="support-within-transformers" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#support-within-transformers"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Support within Transformers</span></h2> <p data-svelte-h="svelte-1fcaojo">We have added the ability to load <code>gguf</code> files within <code>transformers</code> in order to offer further training/fine-tuning
	capabilities to gguf models, before converting back those models to <code>gguf</code> to use within the <code>ggml</code> ecosystem. When
	loading a model, we first dequantize it to fp32, before loading the weights to be used in PyTorch.</p> <blockquote data-svelte-h="svelte-13g437o"><p>[!NOTE]
	The support is still very exploratory and we welcome contributions in order to solidify it across quantization types
	and model architectures.</p></blockquote> <p data-svelte-h="svelte-lue5cc">For now, here are the supported model architectures and quantization types:</p> <h3 class="relative group"><a id="supported-quantization-types" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#supported-quantization-types"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Supported quantization types</span></h3> <p data-svelte-h="svelte-1r2kpez">The initial supported quantization types are decided according to the popular quantized files that have been shared
	on the Hub.</p> <ul data-svelte-h="svelte-1f29qgp"><li>F32</li> <li>F16</li> <li>BF16</li> <li>Q4_0</li> <li>Q4_1</li> <li>Q5_0</li> <li>Q5_1</li> <li>Q8_0</li> <li>Q2_K</li> <li>Q3_K</li> <li>Q4_K</li> <li>Q5_K</li> <li>Q6_K</li> <li>IQ1_S</li> <li>IQ1_M</li> <li>IQ2_XXS</li> <li>IQ2_XS</li> <li>IQ2_S</li> <li>IQ3_XXS</li> <li>IQ3_S</li> <li>IQ4_XS</li> <li>IQ4_NL</li></ul> <blockquote data-svelte-h="svelte-tc3a5k"><p>[!NOTE]
	To support gguf dequantization, <code>gguf>=0.10.0</code> installation is required.</p></blockquote> <h3 class="relative group"><a id="supported-model-architectures" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#supported-model-architectures"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Supported model architectures</span></h3> <p data-svelte-h="svelte-tmyftl">For now the supported model architectures are the architectures that have been very popular on the Hub, namely:</p> <ul data-svelte-h="svelte-xxd66r"><li>LLaMa</li> <li>Mistral</li> <li>Qwen2</li> <li>Qwen2Moe</li> <li>Phi3</li></ul> <h2 class="relative group"><a id="example-usage" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#example-usage"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Example usage</span></h2> <p data-svelte-h="svelte-1lj1hm3">In order to load <code>gguf</code> files in <code>transformers</code>, you should specify the <code>gguf_file</code> argument to the <code>from_pretrained</code>
	methods of both tokenizers and models. Here is how one would load a tokenizer and a model, which can be loaded
	from the exact same file:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForCausalLM

	model_id = <span class="hljs-string">"TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"</span>
	filename = <span class="hljs-string">"tinyllama-1.1b-chat-v1.0.Q6_K.gguf"</span>

	tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
	model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16kc502">Now you have access to the full, unquantized version of the model in the PyTorch ecosystem, where you can combine it
	with a plethora of other tools.</p> <p data-svelte-h="svelte-uk9y6w">In order to convert back to a <code>gguf</code> file, we recommend using the
	<a href="https://github.com/ggerganov/llama.cpp/blob/master/convert-hf-to-gguf.py" rel="nofollow"><code>convert-hf-to-gguf.py</code> file</a> from llama.cpp.</p> <p data-svelte-h="svelte-eicl62">Here’s how you would complete the script above to save the model and export it back to <code>gguf</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenizer.save_pretrained(<span class="hljs-string">'directory'</span>)
	model.save_pretrained(<span class="hljs-string">'directory'</span>)

	!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory}<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/gguf.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Bootstrap for this page: publish the path settings on a global,
		// then load the two entry modules and hand control to `start`.
		// The bare block keeps the `const` bindings below out of global scope.

		// Deliberately an undeclared (global) assignment; the identifier is
		// left exactly as emitted because code outside this block may look it up.
		__sveltekit_1xexzbk = {
			assets: "/docs/transformers/main/en",
			base: "/docs/transformers/main/en",
			env: {}
		};

		// Must be read synchronously: `document.currentScript` is only set
		// while this classic script is executing, not inside the callback below.
		const mountTarget = document.currentScript.parentElement;

		const serverData = [null, null];

		const startOptions = {
			node_ids: [0, 23],
			data: serverData,
			form: null,
			error: null
		};

		const entryModules = Promise.all([
			import("/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js"),
			import("/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js")
		]);

		// Once both modules have loaded, call `start` from the first one with
		// the second module, the mount element and the options built above.
		entryModules.then(function (loaded) {
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size:: 16.5 kB
Xet hash:: a2b3e9b6be365bea9519814f5aac103e7d5e23cc2646480627d2b2ae628c55d2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.