Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / text-generation-inference /main /en /conceptual /tensor_parallelism.html

rtrm

about 2 months ago

download

raw

5.62 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Tensor Parallelism&quot;,&quot;local&quot;:&quot;tensor-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}">
	<link href="/docs/text-generation-inference/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/entry/start.1810066f.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/scheduler.362310b7.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/singletons.fa2b0eb7.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/index.7f53ec41.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/paths.284aef40.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/entry/app.8cfc1931.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/index.57dfc70d.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/nodes/0.543c9bd9.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/nodes/22.264d75ce.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/Tip.14b2ab21.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/EditOnGithub.9633c464.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Tensor Parallelism&quot;,&quot;local&quot;:&quot;tensor-parallelism&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="tensor-parallelism" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#tensor-parallelism"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Tensor Parallelism</span></h1> <p data-svelte-h="svelte-d2if4l">Tensor parallelism is a technique used to fit a large model in multiple GPUs. For example, when multiplying the input tensors with the first weight tensor, the matrix multiplication is equivalent to splitting the weight tensor column-wise, multiplying each column with the input separately, and then concatenating the separate outputs. 
These outputs are then transferred from the GPUs and concatenated together to get the final result, like below 👇</p> <p data-svelte-h="svelte-xk4xsj"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/TP.png" alt="Image courtesy of Anton Lozkhov"></p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-1psn3zf">Tensor Parallelism only works for <a href="../supported_models">models officially supported</a>, it will not work when falling back to <code>transformers</code>. You can get more information about unsupported models <a href="../basic_tutorials/non_core_models">here</a>.</p></div> <p data-svelte-h="svelte-1mk24de">You can learn a lot more details about tensor-parallelism from <a href="https://huggingface.co/docs/transformers/main/en/perf_train_gpu_many#tensor-parallelism" rel="nofollow">the <code>transformers</code> docs</a>.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/text-generation-inference/blob/main/docs/source/conceptual/tensor_parallelism.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	// Page bootstrap: publishes the asset/base path configuration on a global,
	// then loads the two entry modules and starts the client app.
	// NOTE(review): presumably the SvelteKit client runtime reads this global by
	// name — keep the identifier and the implicit-global assignment as-is.
	__sveltekit_1dfb6m4 = {
	assets: "/docs/text-generation-inference/main/en",
	base: "/docs/text-generation-inference/main/en",
	env: {}
	};

	// document.currentScript is only available while this script runs
	// synchronously, so the mount target is captured before any async work.
	const mountTarget = document.currentScript.parentElement;

	// Both entry modules are requested up front, in the same order as before.
	const entryModules = [
	import("/docs/text-generation-inference/main/en/_app/immutable/entry/start.1810066f.js"),
	import("/docs/text-generation-inference/main/en/_app/immutable/entry/app.8cfc1931.js")
	];

	Promise.all(entryModules).then((loaded) => {
	const kit = loaded[0];
	const app = loaded[1];
	// Start the app on the element that contains this script tag.
	kit.start(app, mountTarget, {
	node_ids: [0, 22],
	data: [null, null],
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size: 5.62 kB
Xet hash: dc9697493cea19fafb630f18acddbf1ed79aa4b5050f3a15a48b1ce64fac3b46

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.