Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / text-generation-inference /main /en /architecture.html

rtrm

about 1 month ago

download

raw

37.6 kB

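	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Text Generation Inference Architecture&quot;,&quot;local&quot;:&quot;text-generation-inference-architecture&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;The Router&quot;,&quot;local&quot;:&quot;the-router&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Router’s command line&quot;,&quot;local&quot;:&quot;routers-command-line&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;The Model Server&quot;,&quot;local&quot;:&quot;the-model-server&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Model Server Variants&quot;,&quot;local&quot;:&quot;model-server-variants&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Command Line Interface&quot;,&quot;local&quot;:&quot;command-line-interface&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Call Flow&quot;,&quot;local&quot;:&quot;call-flow&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">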
	<link href="/docs/text-generation-inference/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/entry/start.1810066f.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/scheduler.362310b7.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/singletons.fa2b0eb7.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/index.7f53ec41.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/paths.284aef40.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/entry/app.8cfc1931.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/index.57dfc70d.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/nodes/0.543c9bd9.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/nodes/2.5a4e0603.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/CodeBlock.d3c47f83.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/EditOnGithub.9633c464.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Text Generation Inference Architecture&quot;,&quot;local&quot;:&quot;text-generation-inference-architecture&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;The Router&quot;,&quot;local&quot;:&quot;the-router&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Router’s command line&quot;,&quot;local&quot;:&quot;routers-command-line&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;The Model Server&quot;,&quot;local&quot;:&quot;the-model-server&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Model Server Variants&quot;,&quot;local&quot;:&quot;model-server-variants&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Command Line Interface&quot;,&quot;local&quot;:&quot;command-line-interface&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Call Flow&quot;,&quot;local&quot;:&quot;call-flow&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="text-generation-inference-architecture" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-generation-inference-architecture"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text Generation Inference Architecture</span></h1> <p data-svelte-h="svelte-vphelk">This 
document aims at describing the architecture of Text Generation Inference (TGI), by describing the call flow between the separate components.</p> <p data-svelte-h="svelte-772u2a">A high-level architecture diagram can be seen here:</p> <p data-svelte-h="svelte-1l2f8t4"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/TGI.png" alt="TGI architecture"></p> <p data-svelte-h="svelte-1lzr5l3">As the diagram shows, there are three separate components:</p> <ul data-svelte-h="svelte-lfhn4u"><li><strong>The router</strong>, also named <code>webserver</code>, that receives the client requests, buffers them, creates some batches, and prepares gRPC calls to a model server.</li> <li><strong>The model server</strong>, responsible for receiving the gRPC requests and running inference on the model. If the model is sharded across multiple accelerators (e.g., multiple GPUs), the model server shards might be synchronized via NCCL or equivalent.</li> <li><strong>The launcher</strong> is a helper that can launch one or several model servers (if the model is sharded), and it launches the router with compatible arguments.</li></ul> <p data-svelte-h="svelte-9in8o2">The router and the model server can run on two different machines; they do not need to be deployed together.</p> <h2 class="relative group"><a id="the-router" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-router"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 
0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The Router</span></h2> <p data-svelte-h="svelte-12wcxfx">This component is a rust web server binary that accepts HTTP requests using the custom <a href="https://huggingface.github.io/text-generation-inference/" rel="nofollow">HTTP API</a>, as well as OpenAI’s <a href="https://huggingface.co/docs/text-generation-inference/messages_api" rel="nofollow">Messages API</a>.
	The router receives the API calls and handles the “batches” logic (an introduction to batching can be found <a href="https://github.com/huggingface/text-generation-inference/blob/main/router/README.md" rel="nofollow">here</a>).
	It uses different strategies to reduce latency between requests and responses, with a particular focus on decoding latency. It uses queues, schedulers, and block allocators to achieve that and to produce batched requests that are then sent to the model server.</p> <h3 class="relative group"><a id="routers-command-line" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#routers-command-line"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Router’s command line</span></h3> <p data-svelte-h="svelte-vrz9ad">The router’s command line is the way to pass parameters to it (it does not rely on a configuration file):</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Text Generation Webserver

	Usage: text-generation-router <span class="hljs-selector-attr">[OPTIONS]</span>

	Options:
	<span class="hljs-attr">--max-concurrent-requests</span> &lt;MAX_CONCURRENT_REQUESTS&gt;
	<span class="hljs-selector-attr">[env: MAX_CONCURRENT_REQUESTS=]</span> <span class="hljs-selector-attr">[default: 128]</span>
	<span class="hljs-attr">--max-best-of</span> &lt;MAX_BEST_OF&gt;
	<span class="hljs-selector-attr">[env: MAX_BEST_OF=]</span> <span class="hljs-selector-attr">[default: 2]</span>
	<span class="hljs-attr">--max-stop-sequences</span> &lt;MAX_STOP_SEQUENCES&gt;
	<span class="hljs-selector-attr">[env: MAX_STOP_SEQUENCES=]</span> <span class="hljs-selector-attr">[default: 4]</span>
	<span class="hljs-attr">--max-top-n-tokens</span> &lt;MAX_TOP_N_TOKENS&gt;
	<span class="hljs-selector-attr">[env: MAX_TOP_N_TOKENS=]</span> <span class="hljs-selector-attr">[default: 5]</span>
	<span class="hljs-attr">--max-input-tokens</span> &lt;MAX_INPUT_TOKENS&gt;
	<span class="hljs-selector-attr">[env: MAX_INPUT_TOKENS=]</span> <span class="hljs-selector-attr">[default: 1024]</span>
	<span class="hljs-attr">--max-total-tokens</span> &lt;MAX_TOTAL_TOKENS&gt;
	<span class="hljs-selector-attr">[env: MAX_TOTAL_TOKENS=]</span> <span class="hljs-selector-attr">[default: 2048]</span>
	<span class="hljs-attr">--waiting-served-ratio</span> &lt;WAITING_SERVED_RATIO&gt;
	<span class="hljs-selector-attr">[env: WAITING_SERVED_RATIO=]</span> <span class="hljs-selector-attr">[default: 1.2]</span>
	<span class="hljs-attr">--max-batch-prefill-tokens</span> &lt;MAX_BATCH_PREFILL_TOKENS&gt;
	<span class="hljs-selector-attr">[env: MAX_BATCH_PREFILL_TOKENS=]</span> <span class="hljs-selector-attr">[default: 4096]</span>
	<span class="hljs-attr">--max-batch-total-tokens</span> &lt;MAX_BATCH_TOTAL_TOKENS&gt;
	<span class="hljs-selector-attr">[env: MAX_BATCH_TOTAL_TOKENS=]</span>
	<span class="hljs-attr">--max-waiting-tokens</span> &lt;MAX_WAITING_TOKENS&gt;
	<span class="hljs-selector-attr">[env: MAX_WAITING_TOKENS=]</span> <span class="hljs-selector-attr">[default: 20]</span>
	<span class="hljs-attr">--max-batch-size</span> &lt;MAX_BATCH_SIZE&gt;
	<span class="hljs-selector-attr">[env: MAX_BATCH_SIZE=]</span>
	<span class="hljs-attr">--hostname</span> &lt;HOSTNAME&gt;
	<span class="hljs-selector-attr">[env: HOSTNAME=]</span> <span class="hljs-selector-attr">[default: 0.0.0.0]</span>
	-<span class="hljs-selector-tag">p</span>, <span class="hljs-attr">--port</span> &lt;PORT&gt;
	<span class="hljs-selector-attr">[env: PORT=]</span> <span class="hljs-selector-attr">[default: 3000]</span>
	<span class="hljs-attr">--master-shard-uds-path</span> &lt;MASTER_SHARD_UDS_PATH&gt;
	<span class="hljs-selector-attr">[env: MASTER_SHARD_UDS_PATH=]</span> <span class="hljs-selector-attr">[default: /tmp/text-generation-server-0]</span>
	<span class="hljs-attr">--tokenizer-name</span> &lt;TOKENIZER_NAME&gt;
	<span class="hljs-selector-attr">[env: TOKENIZER_NAME=]</span> <span class="hljs-selector-attr">[default: bigscience/bloom]</span>
	<span class="hljs-attr">--tokenizer-config-path</span> &lt;TOKENIZER_CONFIG_PATH&gt;
	<span class="hljs-selector-attr">[env: TOKENIZER_CONFIG_PATH=]</span>
	<span class="hljs-attr">--revision</span> &lt;REVISION&gt;
	<span class="hljs-selector-attr">[env: REVISION=]</span>
	<span class="hljs-attr">--validation-workers</span> &lt;VALIDATION_WORKERS&gt;
	<span class="hljs-selector-attr">[env: VALIDATION_WORKERS=]</span> <span class="hljs-selector-attr">[default: 2]</span>
	<span class="hljs-attr">--json-output</span>
	<span class="hljs-selector-attr">[env: JSON_OUTPUT=]</span>
	<span class="hljs-attr">--otlp-endpoint</span> &lt;OTLP_ENDPOINT&gt;
	<span class="hljs-selector-attr">[env: OTLP_ENDPOINT=]</span>
	<span class="hljs-attr">--otlp-service-name</span> &lt;OTLP_SERVICE_NAME&gt;
	<span class="hljs-selector-attr">[env: OTLP_SERVICE_NAME=]</span>
	<span class="hljs-attr">--cors-allow-origin</span> &lt;CORS_ALLOW_ORIGIN&gt;
	<span class="hljs-selector-attr">[env: CORS_ALLOW_ORIGIN=]</span>
	<span class="hljs-attr">--ngrok</span>
	<span class="hljs-selector-attr">[env: NGROK=]</span>
	<span class="hljs-attr">--ngrok-authtoken</span> &lt;NGROK_AUTHTOKEN&gt;
	<span class="hljs-selector-attr">[env: NGROK_AUTHTOKEN=]</span>
	<span class="hljs-attr">--ngrok-edge</span> &lt;NGROK_EDGE&gt;
	<span class="hljs-selector-attr">[env: NGROK_EDGE=]</span>
	<span class="hljs-attr">--messages-api-enabled</span>
	<span class="hljs-selector-attr">[env: MESSAGES_API_ENABLED=]</span>
	<span class="hljs-attr">--disable-grammar-support</span>
	<span class="hljs-selector-attr">[env: DISABLE_GRAMMAR_SUPPORT=]</span>
	<span class="hljs-attr">--max-client-batch-size</span> &lt;MAX_CLIENT_BATCH_SIZE&gt;
	<span class="hljs-selector-attr">[env: MAX_CLIENT_BATCH_SIZE=]</span> <span class="hljs-selector-attr">[default: 4]</span>
	-h, <span class="hljs-attr">--help</span>
	Print help
	-V, <span class="hljs-attr">--version</span>
	Print version<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="the-model-server" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#the-model-server"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>The Model Server</span></h2> <p data-svelte-h="svelte-1fiirti">The model server is a Python server that starts a server waiting for gRPC requests, loads a given model, performs sharding to provide <a href="https://huggingface.co/docs/text-generation-inference/conceptual/tensor_parallelism" rel="nofollow">tensor parallelism</a>, and stays alive while waiting for new requests.
	The model server supports models instantiated using Pytorch and optimized for inference mainly on CUDA/ROCM.</p> <h3 class="relative group"><a id="model-server-variants" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#model-server-variants"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Model Server Variants</span></h3> <p data-svelte-h="svelte-hdemao">Several variants of the model server exist that are actively supported by Hugging Face:</p> <ul data-svelte-h="svelte-pxmqif"><li>By default, the model server will attempt building <a href="https://huggingface.co/docs/text-generation-inference/installation_nvidia" rel="nofollow">a server optimized for Nvidia GPUs with CUDA</a>. The code for this version is hosted in the <a href="https://github.com/huggingface/text-generation-inference" rel="nofollow">main TGI repository</a>.</li> <li>A <a href="https://huggingface.co/docs/text-generation-inference/installation_amd" rel="nofollow">version optimized for AMD with ROCm</a> is hosted in the main TGI repository. 
Some model features differ.</li> <li>A <a href="https://huggingface.co/docs/text-generation-inference/installation_intel" rel="nofollow">version optimized for Intel GPUs</a> is hosted in the main TGI repository. Some model features differ.</li> <li>The <a href="https://huggingface.co/docs/text-generation-inference/installation_gaudi" rel="nofollow">version for Intel Gaudi</a> is maintained on a forked repository, often resynchronized with the main <a href="https://github.com/huggingface/tgi-gaudi" rel="nofollow">TGI repository</a>.</li> <li>A <a href="https://huggingface.co/docs/text-generation-inference/installation_inferentia" rel="nofollow">version for Neuron (AWS Inferentia2)</a> is maintained as part of <a href="https://github.com/huggingface/optimum-neuron/tree/main/text-generation-inference" rel="nofollow">Optimum Neuron</a>.</li> <li>A version for Google TPUs is maintained as part of <a href="https://github.com/huggingface/optimum-tpu/tree/main/text-generation-inference" rel="nofollow">Optimum TPU</a>.</li></ul> <p data-svelte-h="svelte-12gjbvy">Not all variants provide the same features, as hardware and middleware capabilities do not provide the same optimizations.</p> <h3 class="relative group"><a id="command-line-interface" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#command-line-interface"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 
56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Command Line Interface</span></h3> <p data-svelte-h="svelte-8olu0s">The official command line interface (CLI) for the server supports three subcommands, <code>download-weights</code>, <code>quantize</code> and <code>serve</code>:</p> <ul data-svelte-h="svelte-1gvj873"><li><code>download-weights</code> will download weights from the hub and, in some variants, convert them to a format that is adapted to the given implementation;</li> <li><code>quantize</code> will quantize a model using the <code>gptq</code> package. This feature is not available or supported on all variants;</li> <li><code>serve</code> will start the server that loads a model (or a model shard), receives gRPC calls from the router, performs inference, and provides a formatted response to the given request.</li></ul> <p data-svelte-h="svelte-1v1wq4t">Serve’s command line parameters on the TGI repository are these:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 
leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --> Usage: cli.py serve [OPTIONS] MODEL_ID

	╭─ Arguments ──────────────────────────────────────────────────────────────────────────────────────────────╮
	│ * model_id TEXT [default: None] [required] │
	╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯
	╭─ Options ────────────────────────────────────────────────────────────────────────────────────────────────╮
	│ <span class="hljs-params">--revision</span> TEXT [default: None] │
	│ <span class="hljs-params">--sharded</span> <span class="hljs-params">--no-sharded</span> [default: no-sharded] │
	│ <span class="hljs-params">--quantize</span> [bitsandbytes\|bitsandbytes [default: None] │
	│ -nf4\|bitsandbytes-fp4\|gptq │
	│ \|awq\|eetq\|exl2\|fp8] │
	│ <span class="hljs-params">--speculate</span> INTEGER [default: None] │
	│ <span class="hljs-params">--dtype</span> [float16\|bfloat16] [default: None] │
	│ <span class="hljs-params">--trust-remote-code</span> <span class="hljs-params">--no-trust-remote-code</span> [default: │
	│ no-trust-remote-code] │
	│ <span class="hljs-params">--uds-path</span> PATH [default: │
	│ <span class="hljs-string">/tmp/text-generation-serve</span>… │
	│ <span class="hljs-params">--logger-level</span> TEXT [default: INFO] │
	│ <span class="hljs-params">--json-output</span> <span class="hljs-params">--no-json-output</span> [default: no-json-output] │
	│ <span class="hljs-params">--otlp-endpoint</span> TEXT [default: None] │
	│ <span class="hljs-params">--otlp-service-name</span> TEXT [default: │
	│ text-generation-inference.<span class="hljs-string">..</span>│
	│ <span class="hljs-params">--help</span> Show this message and exit. │
	╰──────────────────────────────────────────────────────────────────────────────────────────────────────────╯<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-182mwi5">Note that some variants might support different parameters, and they could possibly accept more options that can be passed on using environment variables.</p> <h2 class="relative group"><a id="call-flow" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#call-flow"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Call Flow</span></h2> <p data-svelte-h="svelte-10nkdn7">Once both components are initialized, weights downloaded and model server is up and running, router and model server exchange data and info through the gRPC call. There are currently two supported schemas, <a href="https://github.com/huggingface/text-generation-inference/blob/main/proto/generate.proto" rel="nofollow">v2</a> and <a href="https://github.com/huggingface/text-generation-inference/blob/main/proto/v3/generate.proto" rel="nofollow">v3</a>. 
These two versions are almost identical, except for:</p> <ul data-svelte-h="svelte-1e4xpg3"><li>input chunks support, for text and image data,</li> <li>paged attention support</li></ul> <p data-svelte-h="svelte-16m2ryh">Here’s a diagram that displays the exchanges that follow the router and model server startup.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sequenceDiagram

	R<span class="hljs-function"><span class="hljs-title">outer</span>-></span>>Model Server: service discovery
	M<span class="hljs-function"><span class="hljs-title">odel</span> Server--></span>>Router: urls <span class="hljs-keyword">for</span> other shards

	R<span class="hljs-function"><span class="hljs-title">outer</span>-></span>>Model Server: get model info
	M<span class="hljs-function"><span class="hljs-title">odel</span> Server--></span>>Router: shard info

	R<span class="hljs-function"><span class="hljs-title">outer</span>-></span>>Model Server: health check
	M<span class="hljs-function"><span class="hljs-title">odel</span> Server--></span>>Router: health OK

	R<span class="hljs-function"><span class="hljs-title">outer</span>-></span>>Model Server: warmup(max_input_tokens, max_batch_prefill_tokens, max_total_tokens, max_batch_size)
	M<span class="hljs-function"><span class="hljs-title">odel</span> Server--></span>>Router: warmup result<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qai9e7">After these are done, the router is ready to receive generate calls from multiple clients. Here’s an example.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sequenceDiagram
	participant Client <span class="hljs-number">1</span>
	participant Client <span class="hljs-number">2</span>
	participant Client <span class="hljs-number">3</span>
	participant Router
	participant Model Server

	Client <span class="hljs-number">1</span>->>Router: generate_stream
	Router->>Model Server: prefill(batch1)
	Model Server-->>Router: generations, cached_batch1, timings
	Router-->>Client <span class="hljs-number">1</span>: token <span class="hljs-number">1</span>

	Router->>Model Server: decode(cached_batch1)
	Model Server-->>Router: generations, cached_batch1, timings
	Router-->>Client <span class="hljs-number">1</span>: token <span class="hljs-number">2</span>

	Router->>Model Server: decode(cached_batch1)
	Model Server-->>Router: generations, cached_batch1, timings
	Router-->>Client <span class="hljs-number">1</span>: token <span class="hljs-number">3</span>

	Client <span class="hljs-number">2</span>->>Router: generate_stream
	Router->>Model Server: prefill(batch2)
	Note <span class="hljs-built_in">right</span> of Model Server: This stops previous batch, that is restarted
	Model Server-->>Router: generations, cached_batch2, timings
	Router-->>Client <span class="hljs-number">2</span>: token <span class="hljs-number">1</span>'

	Router->>Model Server: decode(cached_batch1, cached_batch2)
	Model Server-->>Router: generations, cached_batch1, timings
	Router-->>Client <span class="hljs-number">1</span>: token <span class="hljs-number">4</span>
	Router-->>Client <span class="hljs-number">2</span>: token <span class="hljs-number">2</span>'

	Note <span class="hljs-built_in">left</span> of Client <span class="hljs-number">1</span>: Client <span class="hljs-number">1</span> leaves
	Router->>Model Server: filter_batch(cached_batch1, request_ids_to_keep=batch2)
	Model Server-->>Router: filtered batch

	Router->>Model Server: decode(cached_batch2)
	Model Server-->>Router: generations, cached_batch2, timings
	Router-->>Client <span class="hljs-number">2</span>: token <span class="hljs-number">3</span>'

	Client <span class="hljs-number">3</span>->>Router: generate_stream
	Note <span class="hljs-built_in">right</span> of Model Server: This stops previous batch, that is restarted
	Router->>Model Server: prefill(batch3)
	Note <span class="hljs-built_in">left</span> of Client <span class="hljs-number">1</span>: Client <span class="hljs-number">3</span> leaves without receiving any batch
	Router->>Model Server: clear_cache(batch3)
	Note <span class="hljs-built_in">right</span> of Model Server: This stops previous batch, that is restarted

	Router->>Model Server: decode(cached_batch3)
	Note <span class="hljs-built_in">right</span> of Model Server: Last token (stopping criteria)
	Model Server-->>Router: generations, cached_batch3, timings
	Router-->>Client <span class="hljs-number">2</span>: token <span class="hljs-number">4</span>'

	<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/text-generation-inference/blob/main/docs/source/architecture.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Client bootstrap for this page: publish the path settings, load the two
		// entry bundles, then hand control to the runtime's start() function.

		// Assigned without a declaration keyword, so this creates a global property.
		// NOTE(review): presumably the imported bundles read this global — confirm
		// against the SvelteKit runtime before renaming or scoping it.
		__sveltekit_1dfb6m4 = {
			assets: "/docs/text-generation-inference/main/en",
			base: "/docs/text-generation-inference/main/en",
			env: {}
		};

		// Captured synchronously, before any promise callback runs;
		// document.currentScript is only set while this script is executing.
		const mountTarget = document.currentScript.parentElement;

		// Two entries, matching the two ids in node_ids; both are null here.
		const nodeData = [null, null];

		const startOptions = {
			node_ids: [0, 2],
			data: nodeData,
			form: null,
			error: null
		};

		// Both dynamic imports are issued up front so they are fetched in parallel.
		const entryModules = [
			import("/docs/text-generation-inference/main/en/_app/immutable/entry/start.1810066f.js"),
			import("/docs/text-generation-inference/main/en/_app/immutable/entry/app.8cfc1931.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];
			kit.start(app, mountTarget, startOptions);
		});
	}
	</script>

Xet Storage Details

Size:: 37.6 kB
Xet hash:: dfc04080ce4380905e8987e4532888449529e976d522fe1dfedc40763bbffb2b

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.