Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / text-generation-inference /main /en /conceptual /streaming.html

rtrm

about 2 months ago

download

raw

27.4 kB

	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Streaming","local":"streaming","sections":[{"title":"What is Streaming?","local":"what-is-streaming","sections":[],"depth":2},{"title":"How to use Streaming?","local":"how-to-use-streaming","sections":[{"title":"Streaming with Python","local":"streaming-with-python","sections":[],"depth":3},{"title":"Streaming with cURL","local":"streaming-with-curl","sections":[],"depth":3},{"title":"Streaming with JavaScript","local":"streaming-with-javascript","sections":[],"depth":3}],"depth":2},{"title":"How does Streaming work under the hood?","local":"how-does-streaming-work-under-the-hood","sections":[],"depth":2}],"depth":1}">
	<link href="/docs/text-generation-inference/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/entry/start.1810066f.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/scheduler.362310b7.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/singletons.fa2b0eb7.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/index.7f53ec41.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/paths.284aef40.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/entry/app.8cfc1931.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/index.57dfc70d.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/nodes/0.543c9bd9.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/nodes/21.f2abfa1c.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/CodeBlock.d3c47f83.js">
	<link rel="modulepreload" href="/docs/text-generation-inference/main/en/_app/immutable/chunks/EditOnGithub.9633c464.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Streaming","local":"streaming","sections":[{"title":"What is Streaming?","local":"what-is-streaming","sections":[],"depth":2},{"title":"How to use Streaming?","local":"how-to-use-streaming","sections":[{"title":"Streaming with Python","local":"streaming-with-python","sections":[],"depth":3},{"title":"Streaming with cURL","local":"streaming-with-curl","sections":[],"depth":3},{"title":"Streaming with JavaScript","local":"streaming-with-javascript","sections":[],"depth":3}],"depth":2},{"title":"How does Streaming work under the hood?","local":"how-does-streaming-work-under-the-hood","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="streaming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#streaming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Streaming</span></h1> <h2 class="relative group"><a id="what-is-streaming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#what-is-streaming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>What is Streaming?</span></h2> <p data-svelte-h="svelte-18otp1w">Token streaming is the mode in which the server returns the tokens one by one as the model generates them. This enables showing progressive generations to the user rather than waiting for the whole generation. Streaming is an essential aspect of the end-user experience as it reduces latency, one of the most critical aspects of a smooth experience.</p> <div class="flex justify-center" data-svelte-h="svelte-1mls61x"><img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual_360.gif"> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/streaming-generation-visual-dark_360.gif"></div> <p data-svelte-h="svelte-1i4w8o9">With token streaming, the server can start returning the tokens one by one before having to generate the whole response. Users can have a sense of the generation’s quality before the end of the generation. This has different positive effects:</p> <ul data-svelte-h="svelte-1sljm14"><li>Users can get results orders of magnitude earlier for extremely long queries.</li> <li>Seeing something in progress allows users to stop the generation if it’s not going in the direction they expect.</li> <li>Perceived latency is lower when results are shown in the early stages.</li> <li>When used in conversational UIs, the experience feels more natural.</li></ul> <p data-svelte-h="svelte-doyuuq">For example, a system can generate 100 tokens per second. If the system generates 1000 tokens, with the non-streaming setup, users need to wait 10 seconds to get results. On the other hand, with the streaming setup, users get initial results immediately, and although end-to-end latency will be the same, they can see half of the generation after five seconds. Below you can see an interactive demo that shows non-streaming vs streaming side-by-side. Click <strong>generate</strong> below.</p> <div class="block dark:hidden" data-svelte-h="svelte-7ia471"><iframe src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=light" width="850" height="350"></iframe></div> <div class="hidden dark:block" data-svelte-h="svelte-1wj2bw6"><iframe src="https://osanseviero-streaming-vs-non-streaming.hf.space?__theme=dark" width="850" height="350"></iframe></div> <h2 class="relative group"><a id="how-to-use-streaming" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-use-streaming"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to use Streaming?</span></h2> <h3 class="relative group"><a id="streaming-with-python" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#streaming-with-python"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Streaming with Python</span></h3> <p data-svelte-h="svelte-16qpzge">To stream tokens with <code>InferenceClient</code>, simply pass <code>stream=True</code> and iterate over the response.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient

	client = InferenceClient(base_url=<span class="hljs-string">"http://127.0.0.1:8080"</span>)
	output = client.chat.completions.create(
	messages=[
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>},
	{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Count to 10"</span>},
	],
	stream=<span class="hljs-literal">True</span>,
	max_tokens=<span class="hljs-number">1024</span>,
	)

	<span class="hljs-keyword">for</span> chunk <span class="hljs-keyword">in</span> output:
	<span class="hljs-built_in">print</span>(chunk.choices[<span class="hljs-number">0</span>].delta.content)

	<span class="hljs-comment"># 1</span>
	<span class="hljs-comment"># 2</span>
	<span class="hljs-comment"># 3</span>
	<span class="hljs-comment"># 4</span>
	<span class="hljs-comment"># 5</span>
	<span class="hljs-comment"># 6</span>
	<span class="hljs-comment"># 7</span>
	<span class="hljs-comment"># 8</span>
	<span class="hljs-comment"># 9</span>
	<span class="hljs-comment"># 10</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nblluv">The <code>huggingface_hub</code> library also comes with an <code>AsyncInferenceClient</code> in case you need to handle the requests concurrently.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> AsyncInferenceClient

	client = AsyncInferenceClient(base_url=<span class="hljs-string">"http://127.0.0.1:8080"</span>)
	<span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">main</span>():
	stream = <span class="hljs-keyword">await</span> client.chat.completions.create(
	messages=[{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Say this is a test"</span>}],
	stream=<span class="hljs-literal">True</span>,
	)
	<span class="hljs-keyword">async</span> <span class="hljs-keyword">for</span> chunk <span class="hljs-keyword">in</span> stream:
	<span class="hljs-built_in">print</span>(chunk.choices[<span class="hljs-number">0</span>].delta.content <span class="hljs-keyword">or</span> <span class="hljs-string">""</span>, end=<span class="hljs-string">""</span>)

	asyncio.run(main())

	<span class="hljs-comment"># This</span>
	<span class="hljs-comment"># is</span>
	<span class="hljs-comment"># a</span>
	<span class="hljs-comment"># test</span>
	<span class="hljs-comment">#.</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="streaming-with-curl" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#streaming-with-curl"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Streaming with cURL</span></h3> <p data-svelte-h="svelte-foca00">To use the OpenAI Chat Completions compatible Messages API <code>v1/chat/completions</code> endpoint with curl, you can add the <code>-N</code> flag, which disables curl default buffering and shows data as it arrives from the server</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->curl localhost:<span class="hljs-number">8080</span>/v1/chat/completions \
	-X POST \
	-d '{
	<span class="hljs-string">"model"</span>: <span class="hljs-string">"tgi"</span>,
	<span class="hljs-string">"messages"</span>: [
	{
	<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>,
	<span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>
	},
	{
	<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>,
	<span class="hljs-string">"content"</span>: <span class="hljs-string">"What is deep learning?"</span>
	}
	],
	<span class="hljs-string">"stream"</span>: <span class="hljs-literal">true</span>,
	<span class="hljs-string">"max_tokens"</span>: <span class="hljs-number">20</span>
	}' \
	-H <span class="hljs-symbol">'Content</span>-<span class="hljs-keyword">Type</span>: application/json'<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="streaming-with-javascript" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#streaming-with-javascript"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Streaming with JavaScript</span></h3> <p data-svelte-h="svelte-16sqqc4">First, we need to install the <code>@huggingface/inference</code> library.
	<code>npm install @huggingface/inference</code></p> <p data-svelte-h="svelte-1qwkxte">If you’re using the free Inference API, you can use <code>HfInference</code>. If you’re using inference endpoints, you can use <code>HfInferenceEndpoint</code>.</p> <p data-svelte-h="svelte-5nogh0">We can create a <code>HfInferenceEndpoint</code> providing our endpoint URL and credential.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> { <span class="hljs-title class_">HfInferenceEndpoint</span> } <span class="hljs-keyword">from</span> <span class="hljs-string">'@huggingface/inference'</span>

	<span class="hljs-keyword">const</span> hf = <span class="hljs-keyword">new</span> <span class="hljs-title class_">HfInferenceEndpoint</span>(<span class="hljs-string">'https://YOUR_ENDPOINT.endpoints.huggingface.cloud'</span>, <span class="hljs-string">'hf_YOUR_TOKEN'</span>)

	<span class="hljs-comment">// prompt</span>
	<span class="hljs-keyword">const</span> prompt = <span class="hljs-string">'What can you do in Nuremberg, Germany? Give me 3 Tips'</span>

	<span class="hljs-keyword">const</span> stream = hf.<span class="hljs-title function_">textGenerationStream</span>({ <span class="hljs-attr">inputs</span>: prompt })
	<span class="hljs-keyword">for</span> <span class="hljs-keyword">await</span> (<span class="hljs-keyword">const</span> r <span class="hljs-keyword">of</span> stream) {
	<span class="hljs-comment">// yield the generated token</span>
	process.<span class="hljs-property">stdout</span>.<span class="hljs-title function_">write</span>(r.<span class="hljs-property">token</span>.<span class="hljs-property">text</span>)
	}<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="how-does-streaming-work-under-the-hood" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-does-streaming-work-under-the-hood"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How does Streaming work under the hood?</span></h2> <p data-svelte-h="svelte-1wghvdx">Under the hood, TGI uses Server-Sent Events (SSE). In an SSE Setup, a client sends a request with the data, opening an HTTP connection and subscribing to updates. Afterward, the server sends data to the client. There is no need for further requests; the server will keep sending the data. SSEs are unidirectional, meaning the client does not send other requests to the server. SSE sends data over HTTP, making it easy to use.</p> <p data-svelte-h="svelte-m95rqi">SSEs are different than:</p> <ul data-svelte-h="svelte-1vpfo5b"><li>Polling: where the client keeps calling the server to get data. This means that the server might return empty responses and cause overhead.</li> <li>Webhooks: where there is a bi-directional connection. The server can send information to the client, but the client can also send data to the server after the first request. Webhooks are more complex to operate as they don’t only use HTTP.</li></ul> <p data-svelte-h="svelte-jbqwxg">If there are too many requests at the same time, TGI returns an HTTP Error with an <code>overloaded</code> error type (<code>huggingface_hub</code> returns <code>OverloadedError</code>). This allows the client to manage the overloaded server (e.g., it could display a busy error to the user or retry with a new request). To configure the maximum number of concurrent requests, you can specify <code>--max_concurrent_requests</code>, allowing clients to handle backpressure.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/text-generation-inference/blob/main/docs/source/conceptual/streaming.md" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	__sveltekit_1dfb6m4 = {
	assets: "/docs/text-generation-inference/main/en",
	base: "/docs/text-generation-inference/main/en",
	env: {}
	};

	const element = document.currentScript.parentElement;

	const data = [null,null];

	Promise.all([
	import("/docs/text-generation-inference/main/en/_app/immutable/entry/start.1810066f.js"),
	import("/docs/text-generation-inference/main/en/_app/immutable/entry/app.8cfc1931.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	node_ids: [0, 21],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 27.4 kB
Xet hash:: 7e71c8d64fa297cf11ab74b67703fc97ababd93aef78ebb69ae1bd96a765b114

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.