Buckets:

rtrm's picture
download
raw
11.6 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;CLI arguments&quot;,&quot;local&quot;:&quot;cli-arguments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}">
<link href="/docs/text-embeddings-inference/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/entry/start.5929ce5e.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/chunks/scheduler.b108d059.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/chunks/singletons.8ba50d2e.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/chunks/paths.0433c982.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/entry/app.99d3b526.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/chunks/index.008de539.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/nodes/0.edd78360.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/nodes/2.61393340.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/chunks/CodeBlock.3968c746.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/main/en/_app/immutable/chunks/EditOnGithub.d1c48e3d.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;CLI arguments&quot;,&quot;local&quot;:&quot;cli-arguments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="cli-arguments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cli-arguments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>CLI arguments</span></h1> <p data-svelte-h="svelte-1d21239">To see all options to serve your models, run the following:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->text-embeddings-router --help<!-- HTML_TAG_END --></pre></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Usage: text-embeddings-router [OPTIONS]
Options:
--model-id &lt;MODEL_ID&gt;
The name of the model to load. Can be a MODEL_ID as listed on &lt;https:<span class="hljs-comment">//hf.co/models&gt; like `thenlper/gte-base`.
Or it can be a local directory containing the necessary files as saved by `save_pretrained(...)` methods of
transformers
[env: MODEL_ID=]
[default: thenlper/gte-base]
--revision &lt;REVISION&gt;
The actual revision of the model if you&#x27;re referring to a model on the hub. You can use a specific commit id
or a branch like `refs/pr/2`
[env: REVISION=]
--tokenization-workers &lt;TOKENIZATION_WORKERS&gt;
Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation.
Default to the number of CPU cores on the machine
[env: TOKENIZATION_WORKERS=]
--dtype &lt;DTYPE&gt;
The dtype to be forced upon the model
[env: DTYPE=]
[possible values: float16, float32]
--pooling &lt;POOLING&gt;
Optionally control the pooling method for embedding models.
If `pooling` is not set, the pooling configuration will be parsed from the model `1_Pooling/config.json` configuration.
If `pooling` is set, it will override the model pooling configuration
[env: POOLING=]
Possible values:
- cls: Select the CLS token as embedding
- mean: Apply Mean pooling to the model embeddings
- splade: Apply SPLADE (Sparse Lexical and Expansion) to the model embeddings. This option is only available if the loaded model is a `ForMaskedLM` Transformer model
--max-concurrent-requests &lt;MAX_CONCURRENT_REQUESTS&gt;
The maximum amount of concurrent requests for this particular deployment.
Having a low limit will refuse clients requests instead of having them wait for too long and is usually good
to handle backpressure correctly
[env: MAX_CONCURRENT_REQUESTS=]
[default: 512]
--max-batch-tokens &lt;MAX_BATCH_TOKENS&gt;
**IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
This represents the total amount of potential tokens within a batch.
For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
Overall this number should be the largest possible until the model is compute bound. Since the actual memory
overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
[env: MAX_BATCH_TOKENS=]
[default: 16384]
--max-batch-requests &lt;MAX_BATCH_REQUESTS&gt;
Optionally control the maximum number of individual requests in a batch
[env: MAX_BATCH_REQUESTS=]
--max-client-batch-size &lt;MAX_CLIENT_BATCH_SIZE&gt;
Control the maximum number of inputs that a client can send in a single request
[env: MAX_CLIENT_BATCH_SIZE=]
[default: 32]
--hf-api-token &lt;HF_API_TOKEN&gt;
Your HuggingFace hub token
[env: HF_API_TOKEN=]
--hostname &lt;HOSTNAME&gt;
The IP address to listen on
[env: HOSTNAME=]
[default: 0.0.0.0]
-p, --port &lt;PORT&gt;
The port to listen on
[env: PORT=]
[default: 3000]
--uds-path &lt;UDS_PATH&gt;
The name of the unix socket some text-embeddings-inference backends will use as they communicate internally
with gRPC
[env: UDS_PATH=]
[default: /tmp/text-embeddings-inference-server]
--huggingface-hub-cache &lt;HUGGINGFACE_HUB_CACHE&gt;
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk
for instance
[env: HUGGINGFACE_HUB_CACHE=/data]
--payload-limit &lt;PAYLOAD_LIMIT&gt;
Payload size limit in bytes
Default is 2MB
[env: PAYLOAD_LIMIT=]
[default: 2000000]
--api-key &lt;API_KEY&gt;
Set an api key for request authorization.
By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token.
[env: API_KEY=]
--json-output
Outputs the logs in JSON format (useful for telemetry)
[env: JSON_OUTPUT=]
--otlp-endpoint &lt;OTLP_ENDPOINT&gt;
The grpc endpoint for opentelemetry. Telemetry is sent to this endpoint as OTLP over gRPC. e.g. `http://localhost:4317`
[env: OTLP_ENDPOINT=]
--otlp-service-name &lt;OTLP_SERVICE_NAME&gt;
The service name for opentelemetry.
[env: OTLP_SERVICE_NAME=]
[default: text-embeddings-inference.server]
--cors-allow-origin &lt;CORS_ALLOW_ORIGIN&gt;
[env: CORS_ALLOW_ORIGIN=]</span><!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/text-embeddings-inference/blob/main/docs/source/en/cli_arguments.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1a9ukzg = {
assets: "/docs/text-embeddings-inference/main/en",
base: "/docs/text-embeddings-inference/main/en",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/text-embeddings-inference/main/en/_app/immutable/entry/start.5929ce5e.js"),
import("/docs/text-embeddings-inference/main/en/_app/immutable/entry/app.99d3b526.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 2],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
11.6 kB
·
Xet hash:
d3249d50ae347b68704a6ac8d68938927af009412ded802cf7627bc069d8c7c0

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.