Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / trl /pr_3582 /en /vllm_integration.html

rtrm

about 2 months ago

download

raw

46.1 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;vLLM Integration&quot;,&quot;local&quot;:&quot;vllm-integration&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;🚀 How can I use vLLM with TRL to speed up training?&quot;,&quot;local&quot;:&quot;-how-can-i-use-vllm-with-trl-to-speed-up-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🎬 Flashback: Why do we need to use vLLM in online methods?&quot;,&quot;local&quot;:&quot;-flashback-why-do-we-need-to-use-vllm-in-online-methods&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🤔 How does vLLM solve the slow generation issue?&quot;,&quot;local&quot;:&quot;-how-does-vllm-solve-the-slow-generation-issue&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🤔 What exactly happens when you run trl vllm-serve --model &lt;model_name&gt; ?&quot;,&quot;local&quot;:&quot;-what-exactly-happens-when-you-run-trl-vllm-serve---model-ltmodelnamegt-&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🥸 More detail on what happens under the hood when running the server&quot;,&quot;local&quot;:&quot;-more-detail-on-what-happens-under-the-hood-when-running-the-server&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🍷 More customization options with vLLM?&quot;,&quot;local&quot;:&quot;-more-customization-options-with-vllm&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🥳 Okay, now that we have the server running, how can we use it to generate completions?&quot;,&quot;local&quot;:&quot;-okay-now-that-we-have-the-server-running-how-can-we-use-it-to-generate-completions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;💆🏻‍♀️ What’s the best distributed setup?&quot;,&quot;local&quot;:&quot;-whats-the-best-distributed-setup&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
	<link href="/docs/trl/pr_3582/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/entry/start.0f0f318c.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/chunks/scheduler.d627b047.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/chunks/singletons.affb0d47.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/chunks/index.a57a1c33.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/chunks/paths.15dc14db.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/entry/app.b27a462f.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/chunks/index.73c51727.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/nodes/0.8cd8e450.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/nodes/51.26223a70.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/chunks/CodeBlock.5f78c87f.js">
	<link rel="modulepreload" href="/docs/trl/pr_3582/en/_app/immutable/chunks/getInferenceSnippets.256dfbf1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;vLLM Integration&quot;,&quot;local&quot;:&quot;vllm-integration&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;🚀 How can I use vLLM with TRL to speed up training?&quot;,&quot;local&quot;:&quot;-how-can-i-use-vllm-with-trl-to-speed-up-training&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🎬 Flashback: Why do we need to use vLLM in online methods?&quot;,&quot;local&quot;:&quot;-flashback-why-do-we-need-to-use-vllm-in-online-methods&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🤔 How does vLLM solve the slow generation issue?&quot;,&quot;local&quot;:&quot;-how-does-vllm-solve-the-slow-generation-issue&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🤔 What exactly happens when you run trl vllm-serve --model &lt;model_name&gt; ?&quot;,&quot;local&quot;:&quot;-what-exactly-happens-when-you-run-trl-vllm-serve---model-ltmodelnamegt-&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🥸 More detail on what happens under the hood when running the server&quot;,&quot;local&quot;:&quot;-more-detail-on-what-happens-under-the-hood-when-running-the-server&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🍷 More customization options with vLLM?&quot;,&quot;local&quot;:&quot;-more-customization-options-with-vllm&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;🥳 Okay, now that we have the server running, how can we use it to generate completions?&quot;,&quot;local&quot;:&quot;-okay-now-that-we-have-the-server-running-how-can-we-use-it-to-generate-completions&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;💆🏻‍♀️ What’s the best distributed setup?&quot;,&quot;local&quot;:&quot;-whats-the-best-distributed-setup&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="vllm-integration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#vllm-integration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" 
preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>vLLM Integration</span></h1> <p data-svelte-h="svelte-8htx9o">This document will guide you through the process of using vLLM with TRL for faster generation in online methods like GRPO and Online DPO. We first summarize a tl;dr on how to use vLLM with TRL, and then we will go into the details of how it works under the hood. Let’s go! 🔥</p> <h2 class="relative group"><a id="-how-can-i-use-vllm-with-trl-to-speed-up-training" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-how-can-i-use-vllm-with-trl-to-speed-up-training"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" 
fill="currentColor"></path></svg></span></a> <span>🚀 How can I use vLLM with TRL to speed up training?</span></h2> <p data-svelte-h="svelte-chqfxa">💡 <strong>Note</strong>: Resources required for this specific example: a single node with 8 GPUs.</p> <p data-svelte-h="svelte-dqsh1">First, install vLLM using the following command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install <span class="hljs-string">"trl[vllm]"</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-v8af46">Then run the server:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trl vllm-serve --model Qwen/Qwen2.5-7B --tensor-parallel-size 2 --data-parallel-size 2<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dcyr4s">Once the server is running, you can use it to generate completions for training. In the example below, we are using the <code>GRPOTrainer</code> to train a model using the vLLM server for generation. The <code>--tensor-parallel-size</code> and <code>--data-parallel-size</code> arguments control how the model and data are sharded across GPUs.</p> <p data-svelte-h="svelte-1bflh17">In this example, we are sharding two copies of the model across 4 GPUs. Increasing data parallelism increases throughput, while increasing tensor parallelism allows for serving larger models. 
Then, run the training script by passing <code>use_vllm=True</code> in the training arguments as follows:</p> <p data-svelte-h="svelte-1ui7abz">Sample of a simple <code>train.py</code> script:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
	<span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOTrainer, GRPOConfig

	dataset = load_dataset(<span class="hljs-string">"trl-lib/tldr"</span>, split=<span class="hljs-string">"train"</span>)

	<span class="hljs-comment"># Dummy reward function: count the number of unique characters in the completions</span>
	<span class="hljs-keyword">def</span> <span class="hljs-title function_">reward_num_unique_chars</span>(<span class="hljs-params">completions, **kwargs</span>):
	    <span class="hljs-keyword">return</span> [<span class="hljs-built_in">len</span>(<span class="hljs-built_in">set</span>(c)) <span class="hljs-keyword">for</span> c <span class="hljs-keyword">in</span> completions]

	training_args = GRPOConfig(
	    output_dir=<span class="hljs-string">"my_test"</span>,
	    use_vllm=<span class="hljs-literal">True</span>,
	    bf16=<span class="hljs-literal">True</span>,
	    gradient_checkpointing=<span class="hljs-literal">True</span>,
	)

	trainer = GRPOTrainer(
	    model=<span class="hljs-string">"Qwen/Qwen2.5-7B"</span>,
	    args=training_args,
	    reward_funcs=reward_num_unique_chars,
	    train_dataset=dataset,
	)

	trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rr08fb">And the train command:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-flashback-why-do-we-need-to-use-vllm-in-online-methods" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-flashback-why-do-we-need-to-use-vllm-in-online-methods"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 
11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🎬 Flashback: Why do we need to use vLLM in online methods?</span></h2> <p data-svelte-h="svelte-1falafi">Online methods like GRPO or Online DPO require the model to generate completions during training, which are then used to compute reward signals. However, generation can be extremely time-consuming, especially with large or reasoning models. In the default setup (without vLLM), completions are generated using the <a href="https://github.com/huggingface/trl/blob/f3e8c2304428ef16e9ae5de9e5741ed84d533b7b/trl/trainer/grpo_trainer.py#L965C39-L965C66" rel="nofollow">(unwrapped) model’s <code>generate</code> method</a>. This approach quickly becomes a major bottleneck — generation is slow and inefficient, particularly for large batches or models. As a result, training times increase significantly, and overall efficiency drops. 
To address this, we turn to vLLM, which enables much faster and more scalable generation, helping eliminate this bottleneck in online methods.</p> <h2 class="relative group"><a id="-how-does-vllm-solve-the-slow-generation-issue" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-how-does-vllm-solve-the-slow-generation-issue"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🤔 How does vLLM solve the slow generation issue?</span></h2> <p data-svelte-h="svelte-1mphyvc">If you’ve ever done autoregressive decoder training, you know all the input tokens to the LLM produce their attention key and value tensors, and these tensors are kept in GPU memory to later generate subsequent tokens based on them. These cached key and value tensors are often referred to as the KV cache. However, storing the KV cache occupies a lot of memory, so vLLM uses a technique called <strong>PagedAttention</strong> to solve this problem. PagedAttention, which is inspired by the OS’s virtual memory concept, stores continuous keys and values in <strong>non-contiguous memory space</strong>, which is much more efficient. 
The details of this are beyond the scope of this document, but in short, it allows the model to store the keys and values in a more efficient way, reducing the memory footprint and speeding up the generation process. If you are interested, make sure to check out the <a href="https://blog.vllm.ai/2023/06/20/vllm.html" rel="nofollow">vLLM PagedAttention blog post</a> for more details.</p> <h2 class="relative group"><a id="-what-exactly-happens-when-you-run-trl-vllm-serve---model-ltmodelnamegt-" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-what-exactly-happens-when-you-run-trl-vllm-serve---model-ltmodelnamegt-"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🤔 What exactly happens when you run trl vllm-serve --model &lt;model_name&gt; ?</span></h2> <p data-svelte-h="svelte-1m36fdl">When you run, for example,</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trl vllm-serve --model Qwen/Qwen2.5-7B --tensor-parallel-size 1 --data-parallel-size 4<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1n09rjr">the following happens:</p> <p data-svelte-h="svelte-1w0arxe"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/vllm-doc.png" alt="vllm"></p> <ol data-svelte-h="svelte-5de99j"><li><p>vLLM first spawns multiple workers to handle incoming requests in parallel. The number of workers is determined by multiplying the <code>--tensor-parallel-size</code> and <code>--data-parallel-size</code> values. In this example, it spawns 4 workers (1 × 4).
	Each worker operates independently and processes a chunk of the incoming requests — which are basically the prompts sent to the server for generation. A key point to understand is that these 4 workers are running in parallel, and each one is responsible for handling a subset of the total incoming load.</p></li> <li><p>Once the incoming requests (prompts) are distributed across the workers, the model starts generating completions. Internally, the model’s weights are split across multiple GPUs based on the <code>--tensor-parallel-size</code> argument — this is how tensor parallelism is handled. Meanwhile, data parallelism (controlled by <code>--data-parallel-size</code>) ensures that different sets of requests are processed independently across the workers. In short: tensor parallelism splits the model across GPUs, and data parallelism splits the batch of requests across different model replicas.</p></li> <li><p>Although the GPUs process requests independently and in parallel, they still need to communicate with each other. Remember that each GPU handles only a slice of the incoming prompts (for example, with 4 GPUs and 8 prompts using <code>--data-parallel-size=4</code>, each GPU processes 2 prompts).
	This GPU-to-GPU communication is managed efficiently by NVIDIA’s NCCL library. The communication mainly ensures that each GPU gets its correct portion of the incoming requests — it’s lightweight and doesn’t interfere with generation itself.
	Separately, the number of completions to generate per prompt is controlled by the <code>num_generations</code> setting in the GRPO config. For instance, if you set <code>num_generations=2</code> (like in the picture above), each prompt will have 2 completions. So, with 8 prompts and <code>num_generations=2</code>, you would end up with 16 completions total — regardless of the number of GPUs or parallelism settings.</p></li></ol> <h2 class="relative group"><a id="-more-detail-on-what-happens-under-the-hood-when-running-the-server" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-more-detail-on-what-happens-under-the-hood-when-running-the-server"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🥸 More detail on what happens under the hood when running the server</span></h2> <ul data-svelte-h="svelte-1crxfh2"><li>The vLLM server starts by running the command: <code>trl vllm-serve --model Qwen/Qwen2.5-7B</code>.</li> <li>Once the server is running, it generates completions based on requests from the client (trainer) using <code>vllm_client.generate</code> <a 
href="https://github.com/huggingface/trl/blob/cc044e35b285be7dc062764b3364e1e684db4c7c/trl/trainer/grpo_trainer.py#L1025-L1035" rel="nofollow">here</a>.</li> <li>The client (trainer) then requests these completions from the server.</li> <li>These completions are used to compute the reward signal.</li> <li>Based on the reward signal and the model’s output, the loss is computed, and the backward pass is performed to update the model’s weights.</li> <li><strong>Note</strong>: The server only handles completion generation — it doesn’t train the model. Therefore, the model’s weights aren’t updated on the server. Once the backward pass is complete, the client sends the updated weights to the server using <code>vllm_client.update_named_param(name, param.data)</code>.</li></ul> <p data-svelte-h="svelte-10q4jgt">When using vLLM, ensure the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation for training using <code>CUDA_VISIBLE_DEVICES</code>. 
See the example below:</p> <ul data-svelte-h="svelte-1uprqyn"><li><strong>Set GPUs <em>0–3</em> for vLLM generation:</strong> Assume <code>CUDA_VISIBLE_DEVICES=0,1,2,3</code> are allocated for vLLM generation.</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->trl vllm-serve --model &lt;model_name&gt; --tensor-parallel-size 1 --data-parallel-size 4<!-- HTML_TAG_END --></pre></div> <ul data-svelte-h="svelte-1peesjh"><li><strong>And GPUs <em>4–7</em> for training:</strong> If you do not set the <code>CUDA_VISIBLE_DEVICES</code> environment variable, the training script will use all available GPUs by default, which may lead to resource conflicts. To avoid this, you can specify which GPUs to use for training. 
For example, if you want to use GPUs 4–7 for training, set the environment variable as follows:</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-more-customization-options-with-vllm" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-more-customization-options-with-vllm"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 
1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🍷 More customization options with vLLM?</span></h2> <p data-svelte-h="svelte-1vcl05h">You can customize the server configuration by passing additional arguments.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->$ trl vllm-serve <span class="hljs-comment">--help</span>
	usage: trl vllm-serve [-h] <span class="hljs-comment">--model MODEL [--revision REVISION] [--tensor_parallel_size TENSOR_PARALLEL_SIZE]</span>
	[<span class="hljs-comment">--data_parallel_size DATA_PARALLEL_SIZE] [--host HOST] [--port PORT]</span>
	[<span class="hljs-comment">--gpu_memory_utilization GPU_MEMORY_UTILIZATION] [--dtype DTYPE] [--max_model_len MAX_MODEL_LEN]</span>
	[<span class="hljs-comment">--enable_prefix_caching ENABLE_PREFIX_CACHING] [--enforce_eager ENFORCE_EAGER] [--log_level LOG_LEVEL]</span>

	options:
	-h, <span class="hljs-comment">--help Show this help message and exit</span>
	<span class="hljs-comment">--model MODEL Model name or path to load the model from. (default: None)</span>
	<span class="hljs-comment">--revision REVISION Revision to use for the model. If not specified, the default branch will be used. (default: None)</span>
	<span class="hljs-comment">--tensor_parallel_size TENSOR_PARALLEL_SIZE, --tensor-parallel-size TENSOR_PARALLEL_SIZE</span>
	Number <span class="hljs-keyword">of</span> tensor parallel workers <span class="hljs-keyword">to</span> <span class="hljs-keyword">use</span>. (<span class="hljs-keyword">default</span>: <span class="hljs-number">1</span>)
	<span class="hljs-comment">--data_parallel_size DATA_PARALLEL_SIZE, --data-parallel-size DATA_PARALLEL_SIZE</span>
	Number <span class="hljs-keyword">of</span> data parallel workers <span class="hljs-keyword">to</span> <span class="hljs-keyword">use</span>. (<span class="hljs-keyword">default</span>: <span class="hljs-number">1</span>)
	<span class="hljs-comment">--host HOST Host address to run the server on. (default: 0.0.0.0)</span>
	<span class="hljs-comment">--port PORT Port to run the server on. (default: 8000)</span>
	<span class="hljs-comment">--gpu_memory_utilization GPU_MEMORY_UTILIZATION, --gpu-memory-utilization GPU_MEMORY_UTILIZATION</span>
	Ratio (between <span class="hljs-number">0</span> <span class="hljs-keyword">and</span> <span class="hljs-number">1</span>) <span class="hljs-keyword">of</span> GPU memory <span class="hljs-keyword">to</span> reserve <span class="hljs-keyword">for</span> the model weights, activations, <span class="hljs-keyword">and</span> KV cache <span class="hljs-keyword">on</span> the device
	dedicated <span class="hljs-keyword">to</span> generation powered by vLLM. Higher values will increase the KV cache size <span class="hljs-keyword">and</span> thus improve the
	model<span class="hljs-symbol">'s</span> throughput. However, <span class="hljs-keyword">if</span> the value <span class="hljs-keyword">is</span> too high, it may cause <span class="hljs-keyword">out</span>-<span class="hljs-keyword">of</span>-memory (OOM) errors during
	initialization. (<span class="hljs-keyword">default</span>: <span class="hljs-number">0.9</span>)
	<span class="hljs-comment">--dtype DTYPE Data type to use for vLLM generation. If set to 'auto', the data type will be automatically determined based on</span>
	the model <span class="hljs-keyword">configuration</span>. Find the supported values <span class="hljs-keyword">in</span> the vLLM documentation. (<span class="hljs-keyword">default</span>: auto)
	<span class="hljs-comment">--max_model_len MAX_MODEL_LEN, --max-model-len MAX_MODEL_LEN</span>
	<span class="hljs-keyword">If</span> set, the `max_model_len` <span class="hljs-keyword">to</span> <span class="hljs-keyword">use</span> <span class="hljs-keyword">for</span> vLLM. This can be useful <span class="hljs-keyword">when</span> running <span class="hljs-keyword">with</span> reduced
	`vllm_gpu_memory_utilization`, leading <span class="hljs-keyword">to</span> a reduced KV cache size. <span class="hljs-keyword">If</span> <span class="hljs-keyword">not</span> set, vLLM will <span class="hljs-keyword">use</span> the model <span class="hljs-keyword">context</span>
	size, which might be much larger than the KV cache, leading <span class="hljs-keyword">to</span> inefficiencies. (<span class="hljs-keyword">default</span>: None)
	<span class="hljs-comment">--enable_prefix_caching ENABLE_PREFIX_CACHING, --enable-prefix-caching ENABLE_PREFIX_CACHING</span>
	Whether <span class="hljs-keyword">to</span> enable prefix caching <span class="hljs-keyword">in</span> vLLM. <span class="hljs-keyword">If</span> set <span class="hljs-keyword">to</span> `<span class="hljs-literal">True</span>`, ensure that the model <span class="hljs-keyword">and</span> the hardware support this
	feature. (<span class="hljs-keyword">default</span>: None)
	<span class="hljs-comment">--enforce_eager ENFORCE_EAGER, --enforce-eager ENFORCE_EAGER</span>
	Whether <span class="hljs-keyword">to</span> enforce eager execution. <span class="hljs-keyword">If</span> set <span class="hljs-keyword">to</span> `<span class="hljs-literal">True</span>`, we will disable CUDA graph <span class="hljs-keyword">and</span> always execute the model
	<span class="hljs-keyword">in</span> eager mode. <span class="hljs-keyword">If</span> `<span class="hljs-literal">False</span>` (<span class="hljs-keyword">default</span> behavior), we will <span class="hljs-keyword">use</span> CUDA graph <span class="hljs-keyword">and</span> eager execution <span class="hljs-keyword">in</span> hybrid. (<span class="hljs-keyword">default</span>:
	None)
	<span class="hljs-comment">--log_level LOG_LEVEL, --log-level LOG_LEVEL</span>
	Log level <span class="hljs-keyword">for</span> uvicorn. Possible choices: <span class="hljs-symbol">'critical</span>', <span class="hljs-symbol">'error</span>', <span class="hljs-symbol">'warning</span>', <span class="hljs-symbol">'info</span>', <span class="hljs-symbol">'debug</span>', <span class="hljs-symbol">'trace</span>'. (<span class="hljs-keyword">default</span>:
	info)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-okay-now-that-we-have-the-server-running-how-can-we-use-it-to-generate-completions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-okay-now-that-we-have-the-server-running-how-can-we-use-it-to-generate-completions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>🥳 Okay, now that we have the server running, how can we use it to generate completions?</span></h2> <p data-svelte-h="svelte-lqjls6">Run the training script and pass <code>use_vllm=True</code> in the training arguments:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> trl <span class="hljs-keyword">import</span> GRPOConfig

	training_args = GRPOConfig(..., use_vllm=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="-whats-the-best-distributed-setup" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#-whats-the-best-distributed-setup"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>💆🏻‍♀️ What’s the best distributed setup?</span></h2> <p data-svelte-h="svelte-6ulixu"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_8_gpus.png"> <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_4_gpus.png"></p> <p data-svelte-h="svelte-hlfj7p">First and foremost, always remember that the optimal setup depends on:</p> <ul data-svelte-h="svelte-5t33fk"><li>The model size</li> <li>The number of GPUs you have</li> <li>The GPU memory size</li> <li>The batch size you are using</li> <li>The number of requests you are sending to the server (prompts)</li> <li>The <code>max_model_len</code> you are using (this is the max length of the input sequence that the model can process, a.k.a. 
the context window size)</li> <li>The number of completions you are generating for each request (<code>num_generations</code>)</li></ul> <p data-svelte-h="svelte-p7cqkp">Given these factors, our experiments on the Qwen model family (3B, 7B, 14B, 32B) using 8 H100 GPUs show that:</p> <ul data-svelte-h="svelte-1t8q0p1"><li>For reasonably sized models (3B–14B) and a moderate context window (<code>max_len &lt; 8k</code>), using full capacity for data parallelism gives better throughput. The setup <code>(tp=1, dp=8)</code> yields the best results.</li> <li>For larger models (32B) and longer context windows (<code>max_len &gt; 8k</code>), a smaller DP size combined with some model-side parallelism performs better. For example, <code>(tp=2, dp=4)</code> is a good setup for 32B models with a larger context window.</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/trl/blob/main/docs/source/vllm_integration.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Expose the deployment paths on an implicit global before any runtime
		// module is requested, so the assignment precedes the imports below.
		__sveltekit_4tczb2 = {
			assets: "/docs/trl/pr_3582/en",
			base: "/docs/trl/pr_3582/en",
			env: {}
		};

		// Read synchronously: document.currentScript is only available while
		// this script is executing, not later inside the promise callback.
		const mountTarget = document.currentScript.parentElement;

		// Initial options handed to the start entry point once it has loaded.
		const startOptions = {
			node_ids: [0, 51],
			data: [null,null],
			form: null,
			error: null
		};

		// Request both entry modules in parallel.
		const entryModules = [
			import("/docs/trl/pr_3582/en/_app/immutable/entry/start.0f0f318c.js"),
			import("/docs/trl/pr_3582/en/_app/immutable/entry/app.b27a462f.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size: 46.1 kB
Xet hash: 26ec24e699012bc0f5c097cf505c94ffcd3c227846b022b01a2653110495b3ae

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.