Buckets:

rtrm's picture
download
raw
16.7 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;CLI arguments&quot;,&quot;local&quot;:&quot;cli-arguments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}">
<link href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/entry/start.40236b54.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/chunks/scheduler.6efaaf90.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/chunks/singletons.d693fe39.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/chunks/paths.8b9dcfff.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/entry/app.339a0f46.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/chunks/preload-helper.3da9a969.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/chunks/index.eb3e1f0f.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/nodes/0.5cfecfd2.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/nodes/3.ca22abba.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.bf9a6737.js">
<link rel="modulepreload" href="/docs/text-embeddings-inference/pr_742/en/_app/immutable/chunks/CodeBlock.906ada77.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;CLI arguments&quot;,&quot;local&quot;:&quot;cli-arguments&quot;,&quot;sections&quot;:[],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy 
menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="cli-arguments" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#cli-arguments"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>CLI arguments</span></h1> <p data-svelte-h="svelte-1d21239">To see all options to serve your models, run the following:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path 
d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta prompt_">$ </span><span class="language-bash">text-embeddings-router --<span class="hljs-built_in">help</span></span>
Text Embedding Webserver
Usage: text-embeddings-router [OPTIONS] --model-id &lt;MODEL_ID&gt;
Options:
--model-id &lt;MODEL_ID&gt;
The Hugging Face model ID, can be any model listed on &lt;https://huggingface.co/models&gt; with the `text-embeddings-inference` tag (meaning it&#x27;s compatible with Text Embeddings Inference).
Alternatively, the specified ID can also be a path to a local directory containing the necessary model files saved by the `save_pretrained(...)` methods of either Transformers or Sentence Transformers.
[env: MODEL_ID=]
--revision &lt;REVISION&gt;
The actual revision of the model if you&#x27;re referring to a model on the hub. You can use a specific commit id or a branch like `refs/pr/2`
[env: REVISION=]
--tokenization-workers &lt;TOKENIZATION_WORKERS&gt;
Optionally control the number of tokenizer workers used for payload tokenization, validation and truncation. Defaults to the number of CPU cores on the machine
[env: TOKENIZATION_WORKERS=]
--dtype &lt;DTYPE&gt;
The dtype to be forced upon the model
[env: DTYPE=]
[possible values: float16, float32]
--served-model-name &lt;SERVED_MODEL_NAME&gt;
The name of the model that is being served. If not specified, defaults to `--model-id`. It is only used for the OpenAI-compatible endpoints via HTTP
[env: SERVED_MODEL_NAME=]
--pooling &lt;POOLING&gt;
Optionally control the pooling method for embedding models.
If `pooling` is not set, the pooling configuration will be parsed from the model `1_Pooling/config.json` configuration.
If `pooling` is set, it will override the model pooling configuration
[env: POOLING=]
Possible values:
- cls: Select the CLS token as embedding
- mean: Apply Mean pooling to the model embeddings
- splade: Apply SPLADE (Sparse Lexical and Expansion) to the model embeddings. This option is only available if the loaded model is a `ForMaskedLM` Transformer model
- last-token: Select the last token as embedding
--max-concurrent-requests &lt;MAX_CONCURRENT_REQUESTS&gt;
The maximum number of concurrent requests for this particular deployment. Having a low limit will refuse client requests instead of having them wait for too long and is usually good to handle backpressure correctly
[env: MAX_CONCURRENT_REQUESTS=]
[default: 512]
--max-batch-tokens &lt;MAX_BATCH_TOKENS&gt;
**IMPORTANT** This is one critical control to allow maximum usage of the available hardware.
This represents the total amount of potential tokens within a batch.
For `max_batch_tokens=1000`, you could fit `10` queries of `total_tokens=100` or a single query of `1000` tokens.
Overall this number should be the largest possible until the model is compute bound. Since the actual memory overhead depends on the model implementation, text-embeddings-inference cannot infer this number automatically.
[env: MAX_BATCH_TOKENS=]
[default: 16384]
--max-batch-requests &lt;MAX_BATCH_REQUESTS&gt;
Optionally control the maximum number of individual requests in a batch
[env: MAX_BATCH_REQUESTS=]
--max-client-batch-size &lt;MAX_CLIENT_BATCH_SIZE&gt;
Control the maximum number of inputs that a client can send in a single request
[env: MAX_CLIENT_BATCH_SIZE=]
[default: 32]
--auto-truncate
Control automatic truncation of inputs that exceed the model&#x27;s maximum supported size. Defaults to `true` (truncation enabled). Set to `false` to disable truncation; when disabled and the model&#x27;s maximum input length exceeds `--max-batch-tokens`, the server will refuse to start with an error instead of silently truncating sequences.
Unused for gRPC servers
[env: AUTO_TRUNCATE=]
--default-prompt-name &lt;DEFAULT_PROMPT_NAME&gt;
The name of the prompt that should be used by default for encoding. If not set, no prompt will be applied.
Must be a key in the `sentence-transformers` configuration `prompts` dictionary.
For example if ``default_prompt_name`` is &quot;query&quot; and the ``prompts`` is {&quot;query&quot;: &quot;query: &quot;, ...}, then the sentence &quot;What is the capital of France?&quot; will be encoded as &quot;query: What is the capital of France?&quot; because the prompt text will be prepended before any text to encode.
The argument &#x27;--default-prompt-name &lt;DEFAULT_PROMPT_NAME&gt;&#x27; cannot be used with &#x27;--default-prompt &lt;DEFAULT_PROMPT&gt;&#x27;
[env: DEFAULT_PROMPT_NAME=]
--default-prompt &lt;DEFAULT_PROMPT&gt;
The prompt that should be used by default for encoding. If not set, no prompt will be applied.
For example if ``default_prompt`` is &quot;query: &quot; then the sentence &quot;What is the capital of France?&quot; will be encoded as &quot;query: What is the capital of France?&quot; because the prompt text will be prepended before any text to encode.
The argument &#x27;--default-prompt &lt;DEFAULT_PROMPT&gt;&#x27; cannot be used with &#x27;--default-prompt-name &lt;DEFAULT_PROMPT_NAME&gt;&#x27;
[env: DEFAULT_PROMPT=]
--dense-path &lt;DENSE_PATH&gt;
Optionally, define the path to the Dense module required for some embedding models.
Some embedding models require an extra `Dense` module which contains a single Linear layer and an activation function. By default, those `Dense` modules are stored under the `2_Dense` directory, but there might be cases where different `Dense` modules are provided, to convert the pooled embeddings into different dimensions, available as `2_Dense_&lt;dims&gt;` e.g. https://huggingface.co/NovaSearch/stella_en_400M_v5.
Note that this argument is optional, only required to be set if there is no `modules.json` file or when you want to override a single Dense module path, only when running with the `candle` backend.
[env: DENSE_PATH=]
--hf-token &lt;HF_TOKEN&gt;
Your Hugging Face Hub token. If neither `--hf-token` nor `HF_TOKEN` is set, the token will be read from the `$HF_HOME/token` path, if it exists. This ensures access to private or gated models, and allows for a more permissive rate limiting
[env: HF_TOKEN=]
--hostname &lt;HOSTNAME&gt;
The IP address to listen on
[env: HOSTNAME=]
[default: 0.0.0.0]
-p, --port &lt;PORT&gt;
The port to listen on
[env: PORT=]
[default: 3000]
--uds-path &lt;UDS_PATH&gt;
The name of the unix socket some text-embeddings-inference backends will use as they communicate internally with gRPC
[env: UDS_PATH=]
[default: /tmp/text-embeddings-inference-server]
--huggingface-hub-cache &lt;HUGGINGFACE_HUB_CACHE&gt;
The location of the huggingface hub cache. Used to override the location if you want to provide a mounted disk for instance
[env: HUGGINGFACE_HUB_CACHE=]
--payload-limit &lt;PAYLOAD_LIMIT&gt;
Payload size limit in bytes
Default is 2MB
[env: PAYLOAD_LIMIT=]
[default: 2000000]
--api-key &lt;API_KEY&gt;
Set an api key for request authorization.
By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token.
[env: API_KEY=]
--json-output
Outputs the logs in JSON format (useful for telemetry)
[env: JSON_OUTPUT=]
--disable-spans
Whether or not to include the log trace through spans
[env: DISABLE_SPANS=]
--otlp-endpoint &lt;OTLP_ENDPOINT&gt;
The grpc endpoint for opentelemetry. Telemetry is sent to this endpoint as OTLP over gRPC. e.g. `http://localhost:4317`
[env: OTLP_ENDPOINT=]
--otlp-service-name &lt;OTLP_SERVICE_NAME&gt;
The service name for opentelemetry. e.g. `text-embeddings-inference.server`
[env: OTLP_SERVICE_NAME=]
[default: text-embeddings-inference.server]
--prometheus-port &lt;PROMETHEUS_PORT&gt;
The Prometheus port to listen on
[env: PROMETHEUS_PORT=]
[default: 9000]
--cors-allow-origin &lt;CORS_ALLOW_ORIGIN&gt;
Unused for gRPC servers
[env: CORS_ALLOW_ORIGIN=]
-h, --help
Print help (see a summary with &#x27;-h&#x27;)
-V, --version
Print version<!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/text-embeddings-inference/blob/main/docs/source/en/cli_arguments.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
// Bootstrap for this page: publishes path settings on a global, then loads the
// two entry modules and hands control to `kit.start`.
// The surrounding bare block keeps the `const` bindings below block-scoped
// instead of adding them to the global scope.

// Assigned without a declaration, so in this classic (non-module) script it
// becomes a global. NOTE(review): presumably the imported entry modules look
// this exact name up — confirm before renaming.
__sveltekit_1f4ry9h = {
// `assets` and `base` are the same path prefix used by the import() URLs below.
assets: "/docs/text-embeddings-inference/pr_742/en",
base: "/docs/text-embeddings-inference/pr_742/en",
env: {}
};
// Container of this <script> tag. It has to be captured here, synchronously:
// `document.currentScript` only refers to this script while it is executing,
// so it would no longer be usable inside the `.then` callback.
const element = document.currentScript.parentElement;
// Two null entries, the same length as `node_ids` below.
// NOTE(review): presumably one data slot per node — confirm.
const data = [null,null];
// Request both entry modules without waiting for one another; the callback
// runs only once both imports have resolved.
Promise.all([
import("/docs/text-embeddings-inference/pr_742/en/_app/immutable/entry/start.40236b54.js"),
import("/docs/text-embeddings-inference/pr_742/en/_app/immutable/entry/app.339a0f46.js")
]).then(([kit, app]) => {
// Pass the app module, the captured container element, and the initial
// options to the start module's `start` export.
kit.start(app, element, {
node_ids: [0, 3],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
16.7 kB
·
Xet hash:
d82532a38cca0e696dbfbaa8e4bd84ef89e7a3147a593497685f737c391c1d16

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.