Buckets:

hf-doc-build/doc-dev / transformers /main /en /pipeline_webserver.html
rtrm's picture
download
raw
25.1 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Using pipelines for a webserver&quot;,&quot;local&quot;:&quot;using-pipelines-for-a-webserver&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Few things you might want to consider&quot;,&quot;local&quot;:&quot;few-things-you-might-want-to-consider&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Error checking&quot;,&quot;local&quot;:&quot;error-checking&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Circuit breaking&quot;,&quot;local&quot;:&quot;circuit-breaking&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Blocking the main thread&quot;,&quot;local&quot;:&quot;blocking-the-main-thread&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Dynamic batching&quot;,&quot;local&quot;:&quot;dynamic-batching&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/scheduler.25b97de1.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/singletons.0f2b7d5f.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.e188933d.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/paths.3d04d2c6.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.d9030fc9.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/0.026d2fdd.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/379.e22355ba.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/Tip.baa67368.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Using pipelines for a webserver&quot;,&quot;local&quot;:&quot;using-pipelines-for-a-webserver&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Few things you might want to consider&quot;,&quot;local&quot;:&quot;few-things-you-might-want-to-consider&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Error checking&quot;,&quot;local&quot;:&quot;error-checking&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Circuit breaking&quot;,&quot;local&quot;:&quot;circuit-breaking&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Blocking the main thread&quot;,&quot;local&quot;:&quot;blocking-the-main-thread&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Dynamic batching&quot;,&quot;local&quot;:&quot;dynamic-batching&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="using-pipelines-for-a-webserver" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-pipelines-for-a-webserver"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using pipelines for a webserver</span></h1> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Creating an inference engine is a complex topic, and the &quot;best&quot; solution
will most likely depend on your problem space. Are you on CPU or GPU? Do
you want the lowest latency, the highest throughput, support for
many models, or just highly optimize 1 specific model?
There are many ways to tackle this topic, so what we are going to present is a good default
to get started which may not necessarily be the most optimal solution for you.</div> <p data-svelte-h="svelte-n5c362">The key thing to understand is that we can use an iterator, just like you would <a href="pipeline_tutorial#using-pipelines-on-a-dataset">on a
dataset</a>, since a webserver is basically a system that waits for requests and
treats them as they come in.</p> <p data-svelte-h="svelte-1fnss2o">Usually webservers are multiplexed (multithreaded, async, etc..) to handle various
requests concurrently. Pipelines on the other hand (and mostly the underlying models)
are not really great for parallelism; they take up a lot of RAM, so it’s best to give them all the available resources when they are running or it’s a compute-intensive job.</p> <p data-svelte-h="svelte-8q5x1q">We are going to solve that by having the webserver handle the light load of receiving
and sending requests, and having a single thread handling the actual work.
This example is going to use <code>starlette</code>. The actual framework is not really
important, but you might have to tune or change the code if you are using another
one to achieve the same effect.</p> <p data-svelte-h="svelte-g4gw6x">Create <code>server.py</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> starlette.applications <span class="hljs-keyword">import</span> Starlette
<span class="hljs-keyword">from</span> starlette.responses <span class="hljs-keyword">import</span> JSONResponse
<span class="hljs-keyword">from</span> starlette.routing <span class="hljs-keyword">import</span> Route
<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
<span class="hljs-keyword">import</span> asyncio
<span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">homepage</span>(<span class="hljs-params">request</span>):
payload = <span class="hljs-keyword">await</span> request.body()
string = payload.decode(<span class="hljs-string">&quot;utf-8&quot;</span>)
response_q = asyncio.Queue()
<span class="hljs-keyword">await</span> request.app.model_queue.put((string, response_q))
output = <span class="hljs-keyword">await</span> response_q.get()
<span class="hljs-keyword">return</span> JSONResponse(output)
<span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">server_loop</span>(<span class="hljs-params">q</span>):
pipe = pipeline(model=<span class="hljs-string">&quot;google-bert/bert-base-uncased&quot;</span>)
<span class="hljs-keyword">while</span> <span class="hljs-literal">True</span>:
(string, response_q) = <span class="hljs-keyword">await</span> q.get()
out = pipe(string)
<span class="hljs-keyword">await</span> response_q.put(out)
app = Starlette(
routes=[
Route(<span class="hljs-string">&quot;/&quot;</span>, homepage, methods=[<span class="hljs-string">&quot;POST&quot;</span>]),
],
)
<span class="hljs-meta">@app.on_event(<span class="hljs-params"><span class="hljs-string">&quot;startup&quot;</span></span>)</span>
<span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">startup_event</span>():
q = asyncio.Queue()
app.model_queue = q
asyncio.create_task(server_loop(q))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1uqobnc">Now you can start it with:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->uvicorn server:app<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-65h2jl">And you can query it:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->curl -X POST -d <span class="hljs-string">&quot;test [MASK]&quot;</span> http://localhost:8000/
<span class="hljs-comment">#[{&quot;score&quot;:0.7742936015129089,&quot;token&quot;:1012,&quot;token_str&quot;:&quot;.&quot;,&quot;sequence&quot;:&quot;test.&quot;},...]</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1c2g0w1">And there you go, now you have a good idea of how to create a webserver!</p> <p data-svelte-h="svelte-1l65fmc">What is really important is that we load the model only <strong>once</strong>, so there are no copies
of the model on the webserver. This way, no unnecessary RAM is being used.
Then the queuing mechanism allows you to do fancy stuff like maybe accumulating a few
items before inferring to use dynamic batching:</p> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-rnh8z1">The code sample below is intentionally written like pseudo-code for readability.
Do not run this without checking if it makes sense for your system resources!</p></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(string, rq) = <span class="hljs-keyword">await</span> q.get()
strings = []
queues = []
<span class="hljs-keyword">while</span> <span class="hljs-literal">True</span>:
<span class="hljs-keyword">try</span>:
(string, rq) = <span class="hljs-keyword">await</span> asyncio.wait_for(q.get(), timeout=<span class="hljs-number">0.001</span>) <span class="hljs-comment"># 1ms</span>
<span class="hljs-keyword">except</span> asyncio.exceptions.TimeoutError:
<span class="hljs-keyword">break</span>
strings.append(string)
queues.append(rq)
strings
outs = pipe(strings, batch_size=<span class="hljs-built_in">len</span>(strings))
<span class="hljs-keyword">for</span> rq, out <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(queues, outs):
<span class="hljs-keyword">await</span> rq.put(out)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14jxcik">Again, the proposed code is optimized for readability, not for being the best code.
First of all, there’s no batch size limit which is usually not a
great idea. Next, the timeout is reset on every queue fetch, meaning you could
wait much more than 1ms before running the inference (delaying the first request
by that much).</p> <p data-svelte-h="svelte-1bprwig">It would be better to have a single 1ms deadline.</p> <p data-svelte-h="svelte-1fuz8qb">This will always wait for 1ms even if the queue is empty, which might not be the
best since you probably want to start doing inference if there’s nothing in the queue.
But maybe it does make sense if batching is really crucial for your use case.
Again, there’s really no one best solution.</p> <h2 class="relative group"><a id="few-things-you-might-want-to-consider" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#few-things-you-might-want-to-consider"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Few things you might want to consider</span></h2> <h3 class="relative group"><a id="error-checking" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#error-checking"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Error checking</span></h3> <p data-svelte-h="svelte-vb47n8">There’s a lot that can go wrong in production: out of memory, out of space,
loading the model might fail, the query might be wrong, the query might be
correct but still fail to run because of a model misconfiguration, and so on.</p> <p data-svelte-h="svelte-n6itpg">Generally, it’s good if the server outputs the errors to the user, so
adding a lot of <code>try..except</code> statements to show those errors is a good
idea. But keep in mind it may also be a security risk to reveal all those errors depending
on your security context.</p> <h3 class="relative group"><a id="circuit-breaking" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#circuit-breaking"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Circuit breaking</span></h3> <p data-svelte-h="svelte-1h0ewze">Webservers usually look better when they do circuit breaking. It means they
return proper errors when they’re overloaded instead of just waiting for the query indefinitely. Return a 503 error instead of waiting for a super long time or a 504 after a long time.</p> <p data-svelte-h="svelte-7kt80n">This is relatively easy to implement in the proposed code since there is a single queue.
Looking at the queue size is a basic way to start returning errors before your
webserver fails under load.</p> <h3 class="relative group"><a id="blocking-the-main-thread" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#blocking-the-main-thread"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Blocking the main thread</span></h3> <p data-svelte-h="svelte-1vr6kgq">Currently PyTorch is not async aware, and computation will block the main
thread while running. That means it would be better if PyTorch was forced to run
on its own thread/process. This wasn’t done here because the code is a lot more
complex (mostly because threads and async and queues don’t play nice together).
But ultimately it does the same thing.</p> <p data-svelte-h="svelte-ime2kf">This would be important if the inference of single items were long (&gt; 1s) because
in this case, it means every query during inference would have to wait for 1s before
even receiving an error.</p> <h3 class="relative group"><a id="dynamic-batching" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#dynamic-batching"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Dynamic batching</span></h3> <p data-svelte-h="svelte-hsmuqq">In general, batching is not necessarily an improvement over passing 1 item at
a time (see <a href="./main_classes/pipelines#pipeline-batching">batching details</a> for more information). But it can be very effective
when used in the correct setting. In the API, there is no dynamic
batching by default (too much opportunity for a slowdown). But for BLOOM inference -
which is a very large model - dynamic batching is <strong>essential</strong> to provide a decent experience for everyone.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/pipeline_webserver.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1xexzbk = {
assets: "/docs/transformers/main/en",
base: "/docs/transformers/main/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js"),
import("/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 379],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
25.1 kB
·
Xet hash:
0debbfaa24655254b106b23434eb10f09ee2179a4a794a7fe03c1b98091f936a

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.