Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Deploy with your own container","local":"deploy-with-your-own-container","sections":[{"title":"1. Create the inference server","local":"1-create-the-inference-server","sections":[{"title":"1.1 Initialize the uv project","local":"11-initialize-the-uv-project","sections":[],"depth":3},{"title":"1.2 Install the Python dependencies","local":"12-install-the-python-dependencies","sections":[],"depth":3},{"title":"1.3 Add configurations","local":"13-add-configurations","sections":[],"depth":3},{"title":"1.4 Implement the ModelManager","local":"14-implement-the-modelmanager","sections":[],"depth":3},{"title":"1.5 Use FastAPI lifespan for startup and shutdown","local":"15-use-fastapi-lifespan-for-startup-and-shutdown","sections":[],"depth":3},{"title":"1.6 Define the request and response schemas","local":"16-define-the-request-and-response-schemas","sections":[],"depth":3},{"title":"1.7 Implement the server routes","local":"17-implement-the-server-routes","sections":[],"depth":3},{"title":"1.8 Run the server locally","local":"18-run-the-server-locally","sections":[],"depth":3},{"title":"1.9 Full server code listing","local":"19-full-server-code-listing","sections":[],"depth":3}],"depth":2},{"title":"2. Build the Docker image","local":"2-build-the-docker-image","sections":[],"depth":2},{"title":"3. Build and Push the Image","local":"3-build-and-push-the-image","sections":[],"depth":2},{"title":"4. Create the Endpoint","local":"4-create-the-endpoint","sections":[],"depth":2},{"title":"5. Next steps and extensions","local":"5-next-steps-and-extensions","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/inference-endpoints/pr_162/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/entry/start.bc1fc624.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/scheduler.eb244325.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/singletons.78a3f154.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/index.3c23fb4b.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/paths.b2096035.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/entry/app.6dfc88c3.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/preload-helper.801b6ce8.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/index.661680a1.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/nodes/0.420f2bd3.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/nodes/4.fbd88475.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.3287a0af.js"> | |
| <link rel="modulepreload" href="/docs/inference-endpoints/pr_162/en/_app/immutable/chunks/CodeBlock.a4ef8b3e.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Deploy with your own container","local":"deploy-with-your-own-container","sections":[{"title":"1. Create the inference server","local":"1-create-the-inference-server","sections":[{"title":"1.1 Initialize the uv project","local":"11-initialize-the-uv-project","sections":[],"depth":3},{"title":"1.2 Install the Python dependencies","local":"12-install-the-python-dependencies","sections":[],"depth":3},{"title":"1.3 Add configurations","local":"13-add-configurations","sections":[],"depth":3},{"title":"1.4 Implement the ModelManager","local":"14-implement-the-modelmanager","sections":[],"depth":3},{"title":"1.5 Use FastAPI lifespan for startup and shutdown","local":"15-use-fastapi-lifespan-for-startup-and-shutdown","sections":[],"depth":3},{"title":"1.6 Define the request and response schemas","local":"16-define-the-request-and-response-schemas","sections":[],"depth":3},{"title":"1.7 Implement the server routes","local":"17-implement-the-server-routes","sections":[],"depth":3},{"title":"1.8 Run the server locally","local":"18-run-the-server-locally","sections":[],"depth":3},{"title":"1.9 Full server code listing","local":"19-full-server-code-listing","sections":[],"depth":3}],"depth":2},{"title":"2. Build the Docker image","local":"2-build-the-docker-image","sections":[],"depth":2},{"title":"3. Build and Push the Image","local":"3-build-and-push-the-image","sections":[],"depth":2},{"title":"4. Create the Endpoint","local":"4-create-the-endpoint","sections":[],"depth":2},{"title":"5. 
Next steps and extensions","local":"5-next-steps-and-extensions","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" 
stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="deploy-with-your-own-container" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deploy-with-your-own-container"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deploy with your own container</span></h1> <p data-svelte-h="svelte-h6ujrw">If the model you’re looking to deploy isn’t supported by any of the high-performance inference engines (vLLM, SGLang, etc.), or you have <em>custom inference logic</em> or need <em>specific Python dependencies</em>, you can deploy a <strong>custom Docker container</strong> on <strong>Inference Endpoints</strong>.</p> <p data-svelte-h="svelte-1lht9kg">This requires more upfront work and a better understanding of running models in production, but it gives you full control over the hardware and the server.</p> <p data-svelte-h="svelte-u7wl0r">We’ll walk you through a simple guide on how to:</p> <ul data-svelte-h="svelte-15os693"><li>build a FastAPI server to run <a href="https://huggingface.co/HuggingFaceTB/SmolLM3-3B" rel="nofollow"><code>HuggingFaceTB/SmolLM3-3B</code></a></li> <li>containerize the server</li> <li>deploy
the container on Inference Endpoints</li></ul> <p data-svelte-h="svelte-jr240p">Let’s get to it! 😎</p> <h2 class="relative group"><a id="1-create-the-inference-server" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#1-create-the-inference-server"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1. 
Create the inference server</span></h2> <h3 class="relative group"><a id="11-initialize-the-uv-project" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#11-initialize-the-uv-project"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1.1 Initialize the uv project</span></h3> <p data-svelte-h="svelte-12ix9m7">Start by creating a new uv project by running:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute 
pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->uv init inference-server<!-- HTML_TAG_END --></pre></div> <blockquote class="note" data-svelte-h="svelte-icy6tp"><p>We’ll be using <code>uv</code> to build this project but using <code>pip</code> or <code>conda</code> works as well, just adjust the commands accordingly.</p></blockquote> <p data-svelte-h="svelte-zhi31g">The <code>main.py</code> file will:</p> <ul data-svelte-h="svelte-1kqxuj4"><li>load the model from <code>/repository</code></li> <li>start a FastAPI app</li> <li>expose a <code>/health</code> and a <code>/generate</code> route</li></ul> <blockquote class="important" data-svelte-h="svelte-wanl1l"><p>Inference Endpoints has a way to download model artifacts super fast, so ideally our code doesn’t download anything related to the model. The model you select when creating the endpoint will be mounted at <code>/repository</code>. 
<strong>So always load your model from <code>/repository</code></strong>, not directly from the Hugging Face Hub.</p></blockquote> <h3 class="relative group"><a id="12-install-the-python-dependencies" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#12-install-the-python-dependencies"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1.2 Install the Python dependencies</span></h3> <p data-svelte-h="svelte-qy05hr">Before getting to the code, let’s install the necessary dependencies:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path 
d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->uv add transformers torch <span class="hljs-string">"fastapi[standard]"</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jom1yq">Now let’s build the code step by step. We’ll start by adding all the imports and declaring a few global variables. The <code>DEVICE</code> and <code>DTYPE</code> global variables are set dynamically, depending on whether a GPU is available or the server falls back to the CPU.</p> <h3 class="relative group"><a id="13-add-configurations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#13-add-configurations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z"
fill="currentColor"></path></svg></span></a> <span>1.3 Add configurations</span></h3> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> logging | |
| <span class="hljs-keyword">import</span> time | |
| <span class="hljs-keyword">from</span> contextlib <span class="hljs-keyword">import</span> asynccontextmanager | |
| <span class="hljs-keyword">from</span> typing <span class="hljs-keyword">import</span> <span class="hljs-type">Optional</span> | |
| <span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> fastapi <span class="hljs-keyword">import</span> FastAPI, HTTPException | |
| <span class="hljs-keyword">from</span> pydantic <span class="hljs-keyword">import</span> BaseModel, Field | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Config + Logging</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| MODEL_ID = <span class="hljs-string">"/repository"</span> | |
| DEVICE = torch.device(<span class="hljs-string">"cuda"</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">"cpu"</span>) | |
| DTYPE = torch.bfloat16 <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> torch.float32 | |
| MAX_NEW_TOKENS = <span class="hljs-number">512</span> | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="14-implement-the-modelmanager" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#14-implement-the-modelmanager"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1.4 Implement the ModelManager</span></h3> <p data-svelte-h="svelte-1wscg1i">We will follow a few best practices:</p> <ol data-svelte-h="svelte-1g3514s"><li><p><strong>ModelManager</strong> | |
| Avoid keeping raw global model/tokenizer objects without lifecycle control. A small <code>ModelManager</code> class lets us:</p> <ul><li>eagerly <strong>load</strong> the model onto the accelerator, and</li> <li>safely <strong>unload</strong> it and free memory when the server shuts down.</li></ul> <p>The benefit we get here is that we can control the server’s behaviour based on the state of the model and tokenizer. | |
| We want the server to start → load the model & tokenizer → then signal that the server is ready for requests.</p> <p>For convenience, we also create a small <code>ModelNotLoadedError</code> class, to be able to communicate more clearly when the model & tokenizer aren’t loaded.</p></li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Model Manager</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">ModelNotLoadedError</span>(<span class="hljs-title class_ inherited__">RuntimeError</span>): | |
| <span class="hljs-string">"""Raised when attempting to use the model before it is loaded."""</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">ModelManager</span>: | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, model_id: <span class="hljs-built_in">str</span>, device: <span class="hljs-built_in">str</span>, dtype: torch.dtype</span>): | |
| self.model_id = model_id | |
| self.device = device | |
| self.dtype = dtype | |
| self.model: <span class="hljs-type">Optional</span>[AutoModelForCausalLM] = <span class="hljs-literal">None</span> | |
| self.tokenizer: <span class="hljs-type">Optional</span>[AutoTokenizer] = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">load</span>(<span class="hljs-params">self</span>): | |
| <span class="hljs-string">"""Load model + tokenizer if not already loaded."""</span> | |
| <span class="hljs-keyword">if</span> self.model <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">and</span> self.tokenizer <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>: | |
| <span class="hljs-keyword">return</span> | |
| start = time.perf_counter() | |
| logger.info(<span class="hljs-string">f"Loading tokenizer and model for <span class="hljs-subst">{self.model_id}</span>"</span>) | |
| self.tokenizer = AutoTokenizer.from_pretrained( | |
| self.model_id, | |
| ) | |
| self.model = ( | |
| AutoModelForCausalLM.from_pretrained( | |
| self.model_id, | |
| dtype=self.dtype, | |
| ) | |
| .to(self.device) | |
| .<span class="hljs-built_in">eval</span>() | |
| ) | |
| duration_ms = (time.perf_counter() - start) * <span class="hljs-number">1000</span> | |
| logger.info(<span class="hljs-string">f"Finished loading <span class="hljs-subst">{self.model_id}</span> in <span class="hljs-subst">{duration_ms:<span class="hljs-number">.2</span>f}</span> ms"</span>) | |
| <span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">unload</span>(<span class="hljs-params">self</span>): | |
| <span class="hljs-string">"""Free model + tokenizer and clear CUDA cache."""</span> | |
| <span class="hljs-keyword">if</span> self.model <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>: | |
| self.model.to(<span class="hljs-string">"cpu"</span>) | |
| <span class="hljs-keyword">del</span> self.model | |
| self.model = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">if</span> self.tokenizer <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>: | |
| <span class="hljs-keyword">del</span> self.tokenizer | |
| self.tokenizer = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">if</span> torch.cuda.is_available(): | |
| torch.cuda.empty_cache() | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">get</span>(<span class="hljs-params">self</span>): | |
| <span class="hljs-string">"""Return the loaded model + tokenizer or raise if not ready."""</span> | |
| <span class="hljs-keyword">if</span> self.model <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> self.tokenizer <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>: | |
| <span class="hljs-keyword">raise</span> ModelNotLoadedError(<span class="hljs-string">"Model not loaded"</span>) | |
| <span class="hljs-keyword">return</span> self.model, self.tokenizer | |
| model_manager = ModelManager(MODEL_ID, DEVICE, DTYPE)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="15-use-fastapi-lifespan-for-startup-and-shutdown" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#15-use-fastapi-lifespan-for-startup-and-shutdown"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1.5 Use FastAPI lifespan for startup and shutdown</span></h3> <ol start="2" data-svelte-h="svelte-6k8ggt"><li><strong>FastAPI lifespan</strong> | |
| Using FastAPI’s <code>lifespan</code> we can tie the model manager’s functionality to the server by: | |
<ul><li>loading the model on app startup using <code>model_manager.load()</code></li> <li>unloading the model on app shutdown using <code>model_manager.unload()</code></li></ul> | |
| This keeps your server’s memory usage clean and predictable.</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Lifespan (startup + shutdown)</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-meta">@asynccontextmanager</span> | |
| <span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">lifespan</span>(<span class="hljs-params">app: FastAPI</span>): | |
| <span class="hljs-keyword">await</span> model_manager.load() | |
| <span class="hljs-keyword">try</span>: | |
| <span class="hljs-keyword">yield</span> | |
| <span class="hljs-keyword">finally</span>: | |
| <span class="hljs-keyword">await</span> model_manager.unload() | |
| app = FastAPI(lifespan=lifespan)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="16-define-the-request-and-response-schemas" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#16-define-the-request-and-response-schemas"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1.6 Define the request and response schemas</span></h3> <p data-svelte-h="svelte-k9psp2">Now that we have the lifecycle in place, we can start building the core logic of the server itself. We’ll start by defining the request and response types, so that we know exactly what type of data we can pass in to the server and what to expect in response.</p> <p data-svelte-h="svelte-16wcsb8">The default value for <code>max_new_tokens</code> is <code>128</code> and can be increased to a maximum of <code>512</code>. 
This is a practical way of capping the maximum memory a request can take.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Schemas</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">GenerateRequest</span>(<span class="hljs-title class_ inherited__">BaseModel</span>): | |
| prompt: <span class="hljs-built_in">str</span> = Field(..., min_length=<span class="hljs-number">1</span>, description=<span class="hljs-string">"Plain-text prompt"</span>) | |
| max_new_tokens: <span class="hljs-built_in">int</span> = Field( | |
| <span class="hljs-number">128</span>, | |
| ge=<span class="hljs-number">1</span>, | |
| le=MAX_NEW_TOKENS, | |
| description=<span class="hljs-string">"Upper bound on generated tokens"</span>, | |
| ) | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">GenerateResponse</span>(<span class="hljs-title class_ inherited__">BaseModel</span>): | |
| response: <span class="hljs-built_in">str</span> | |
| input_token_count: <span class="hljs-built_in">int</span> | |
| output_token_count: <span class="hljs-built_in">int</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-lqc8e6">Feel free to extend the parameters to include <code>temperature</code>, <code>top_p</code> and other <a href="https://huggingface.co/docs/transformers/v5.0.0rc0/en/model_doc/smollm3#transformers.SmolLM3ForCausalLM" rel="nofollow">configurations supported</a> by the model.</p> <h3 class="relative group"><a id="17-implement-the-server-routes" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#17-implement-the-server-routes"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1.7 Implement the server routes</span></h3> <p data-svelte-h="svelte-1tiw1xa">Moving on to creating the routes for the server - let’s start with the <code>/health</code> route. Here, we’re finally using the model manager to know if the model and tokenizer are ready to go. 
If the model manager raises a <code>ModelNotLoadedError</code>, we return an error response with the status code <code>503</code>.</p> <p data-svelte-h="svelte-2isapa">On Inference Endpoints (and most other platforms), a <em>readiness probe</em> will ping the <code>/health</code> route of an endpoint every second to check that everything is okay. Using this pattern we can clearly signal that the server isn’t ready before the model and tokenizer are fully initialized.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Routes</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-meta">@app.get(<span class="hljs-params"><span class="hljs-string">"/health"</span></span>)</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">health</span>(): | |
| <span class="hljs-keyword">try</span>: | |
| model_manager.get() | |
| <span class="hljs-keyword">except</span> ModelNotLoadedError <span class="hljs-keyword">as</span> exc: | |
| <span class="hljs-keyword">raise</span> HTTPException(status_code=<span class="hljs-number">503</span>, detail=<span class="hljs-built_in">str</span>(exc)) <span class="hljs-keyword">from</span> exc | |
| <span class="hljs-keyword">return</span> {<span class="hljs-string">"message"</span>: <span class="hljs-string">"API is running."</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-qoor7x">And finally the most interesting section: the <code>/generate</code> route. This is the route that we want to call to actually use the model for text generation.</p> <ul data-svelte-h="svelte-1penqq8"><li>It starts with the same guard as the <code>/health</code> route: we check that the model and tokenizer are loaded, and if not, return a <code>503</code> error.</li> <li>If the tokenizer defines a chat template, we format the prompt with <code>apply_chat_template</code>; otherwise we fall back to passing the prompt directly without chat templating.</li> <li>We encode the text to tokens and call <code>model.generate()</code>.</li> <li>Lastly, we gather the outputs, decode the tokens to text, and return the response.</li></ul> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; 
border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-meta">@app.post(<span class="hljs-params"><span class="hljs-string">"/generate"</span>, response_model=GenerateResponse</span>)</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">generate</span>(<span class="hljs-params">request: GenerateRequest</span>) -> GenerateResponse: | |
| start_time = time.perf_counter() | |
| <span class="hljs-keyword">try</span>: | |
| model, tokenizer = model_manager.get() | |
| <span class="hljs-keyword">except</span> ModelNotLoadedError <span class="hljs-keyword">as</span> exc: | |
| <span class="hljs-keyword">raise</span> HTTPException(status_code=<span class="hljs-number">503</span>, detail=<span class="hljs-built_in">str</span>(exc)) <span class="hljs-keyword">from</span> exc | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">getattr</span>(tokenizer, <span class="hljs-string">"chat_template"</span>, <span class="hljs-literal">None</span>): | |
| messages = [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: request.prompt}] | |
| input_text = tokenizer.apply_chat_template( | |
| messages, | |
| tokenize=<span class="hljs-literal">False</span>, | |
| add_generation_prompt=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-keyword">else</span>: | |
| input_text = request.prompt | |
| inputs = tokenizer(input_text, return_tensors=<span class="hljs-string">"pt"</span>).to(DEVICE) | |
| <span class="hljs-keyword">try</span>: | |
| <span class="hljs-keyword">with</span> torch.inference_mode(): | |
| outputs = model.generate(**inputs, max_new_tokens=request.max_new_tokens) | |
| <span class="hljs-keyword">except</span> RuntimeError <span class="hljs-keyword">as</span> exc: | |
| logger.exception(<span class="hljs-string">"Generation failed"</span>) | |
| <span class="hljs-keyword">raise</span> HTTPException( | |
| status_code=<span class="hljs-number">500</span>, detail=<span class="hljs-string">f"Generation failed: <span class="hljs-subst">{exc}</span>"</span> | |
| ) <span class="hljs-keyword">from</span> exc | |
| input_token_count = inputs.input_ids.shape[<span class="hljs-number">1</span>] | |
| generated_ids = outputs[<span class="hljs-number">0</span>][input_token_count:] | |
| generated_text = tokenizer.decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>) | |
| output_token_count = generated_ids.shape[<span class="hljs-number">0</span>] | |
| duration_ms = (time.perf_counter() - start_time) * <span class="hljs-number">1000</span> | |
| logger.info( | |
| <span class="hljs-string">f"generate prompt_tokens=<span class="hljs-subst">{input_token_count}</span> "</span> | |
| <span class="hljs-string">f"new_tokens=<span class="hljs-subst">{output_token_count}</span> max_new_tokens=<span class="hljs-subst">{request.max_new_tokens}</span> "</span> | |
| <span class="hljs-string">f"duration_ms=<span class="hljs-subst">{duration_ms:<span class="hljs-number">.2</span>f}</span>"</span> | |
| ) | |
| <span class="hljs-keyword">return</span> GenerateResponse( | |
| response=generated_text, | |
| input_token_count=input_token_count, | |
| output_token_count=output_token_count, | |
| )<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="18-run-the-server-locally" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#18-run-the-server-locally"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1.8 Run the server locally</span></h3> <p data-svelte-h="svelte-b277l5">If you want to run the server locally you would need to replace:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute 
pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-diff "><!-- HTML_TAG_START --><span class="hljs-deletion">- MODEL_ID = "/repository"</span> | |
| <span class="hljs-addition">+ MODEL_ID = "HuggingFaceTB/SmolLM3-3B"</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-eeafha">This is because, when running locally, we actually do want to download the model from the Hugging Face Hub. But don’t forget to change this back!</p> <p data-svelte-h="svelte-18opxl">Then run the following:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->uv run uvicorn main:app<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tl1lae">Go to <code>http://127.0.0.1:8000/docs</code> to see the automatic documentation that FastAPI provides.</p> <p data-svelte-h="svelte-18rlkl3">Well done 🙌</p> <h3 class="relative group"><a id="19-full-server-code-listing" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 
with-hover:group-hover:opacity-100 with-hover:right-full" href="#19-full-server-code-listing"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>1.9 Full server code listing</span></h3> <details><summary data-svelte-h="svelte-1necd30">If you want to copy & paste the full code you'll find it here:</summary> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div 
class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> logging | |
| <span class="hljs-keyword">import</span> time | |
| <span class="hljs-keyword">from</span> contextlib <span class="hljs-keyword">import</span> asynccontextmanager | |
| <span class="hljs-keyword">from</span> typing <span class="hljs-keyword">import</span> <span class="hljs-type">Optional</span> | |
| <span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> fastapi <span class="hljs-keyword">import</span> FastAPI, HTTPException | |
| <span class="hljs-keyword">from</span> pydantic <span class="hljs-keyword">import</span> BaseModel, Field | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForCausalLM, AutoTokenizer | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Config + Logging</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| MODEL_ID = <span class="hljs-string">"/repository"</span> | |
| DEVICE = torch.device(<span class="hljs-string">"cuda"</span> <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> <span class="hljs-string">"cpu"</span>) | |
| DTYPE = torch.bfloat16 <span class="hljs-keyword">if</span> torch.cuda.is_available() <span class="hljs-keyword">else</span> torch.float32 | |
| MAX_NEW_TOKENS = <span class="hljs-number">512</span> | |
| logging.basicConfig(level=logging.INFO) | |
| logger = logging.getLogger(__name__) | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Model Manager</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">ModelNotLoadedError</span>(<span class="hljs-title class_ inherited__">RuntimeError</span>): | |
| <span class="hljs-string">"""Raised when attempting to use the model before it is loaded."""</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">ModelManager</span>: | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, model_id: <span class="hljs-built_in">str</span>, device: <span class="hljs-built_in">str</span>, dtype: torch.dtype</span>): | |
| self.model_id = model_id | |
| self.device = device | |
| self.dtype = dtype | |
| self.model: <span class="hljs-type">Optional</span>[AutoModelForCausalLM] = <span class="hljs-literal">None</span> | |
| self.tokenizer: <span class="hljs-type">Optional</span>[AutoTokenizer] = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">load</span>(<span class="hljs-params">self</span>): | |
| <span class="hljs-string">"""Load model + tokenizer if not already loaded."""</span> | |
| <span class="hljs-keyword">if</span> self.model <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">and</span> self.tokenizer <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>: | |
| <span class="hljs-keyword">return</span> | |
| start = time.perf_counter() | |
| logger.info(<span class="hljs-string">f"Loading tokenizer and model for <span class="hljs-subst">{self.model_id}</span>"</span>) | |
| self.tokenizer = AutoTokenizer.from_pretrained( | |
| self.model_id, | |
| ) | |
| self.model = ( | |
| AutoModelForCausalLM.from_pretrained( | |
| self.model_id, | |
| dtype=self.dtype, | |
| ) | |
| .to(self.device) | |
| .<span class="hljs-built_in">eval</span>() | |
| ) | |
| duration_ms = (time.perf_counter() - start) * <span class="hljs-number">1000</span> | |
| logger.info(<span class="hljs-string">f"Finished loading <span class="hljs-subst">{self.model_id}</span> in <span class="hljs-subst">{duration_ms:<span class="hljs-number">.2</span>f}</span> ms"</span>) | |
| <span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">unload</span>(<span class="hljs-params">self</span>): | |
| <span class="hljs-string">"""Free model + tokenizer and clear CUDA cache."""</span> | |
| <span class="hljs-keyword">if</span> self.model <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>: | |
| self.model.to(<span class="hljs-string">"cpu"</span>) | |
| <span class="hljs-keyword">del</span> self.model | |
| self.model = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">if</span> self.tokenizer <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>: | |
| <span class="hljs-keyword">del</span> self.tokenizer | |
| self.tokenizer = <span class="hljs-literal">None</span> | |
| <span class="hljs-keyword">if</span> torch.cuda.is_available(): | |
| torch.cuda.empty_cache() | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">get</span>(<span class="hljs-params">self</span>): | |
| <span class="hljs-string">"""Return the loaded model + tokenizer or raise if not ready."""</span> | |
| <span class="hljs-keyword">if</span> self.model <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">or</span> self.tokenizer <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span>: | |
| <span class="hljs-keyword">raise</span> ModelNotLoadedError(<span class="hljs-string">"Model not loaded"</span>) | |
| <span class="hljs-keyword">return</span> self.model, self.tokenizer | |
| model_manager = ModelManager(MODEL_ID, DEVICE, DTYPE) | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Lifespan (startup + shutdown)</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-meta">@asynccontextmanager</span> | |
| <span class="hljs-keyword">async</span> <span class="hljs-keyword">def</span> <span class="hljs-title function_">lifespan</span>(<span class="hljs-params">app: FastAPI</span>): | |
| <span class="hljs-keyword">await</span> model_manager.load() | |
| <span class="hljs-keyword">try</span>: | |
| <span class="hljs-keyword">yield</span> | |
| <span class="hljs-keyword">finally</span>: | |
| <span class="hljs-keyword">await</span> model_manager.unload() | |
| app = FastAPI(lifespan=lifespan) | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Schemas</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">GenerateRequest</span>(<span class="hljs-title class_ inherited__">BaseModel</span>): | |
| prompt: <span class="hljs-built_in">str</span> = Field(..., min_length=<span class="hljs-number">1</span>, description=<span class="hljs-string">"Plain-text prompt"</span>) | |
| max_new_tokens: <span class="hljs-built_in">int</span> = Field( | |
| <span class="hljs-number">128</span>, | |
| ge=<span class="hljs-number">1</span>, | |
| le=MAX_NEW_TOKENS, | |
| description=<span class="hljs-string">"Upper bound on generated tokens"</span>, | |
| ) | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">GenerateResponse</span>(<span class="hljs-title class_ inherited__">BaseModel</span>): | |
| response: <span class="hljs-built_in">str</span> | |
| input_token_count: <span class="hljs-built_in">int</span> | |
| output_token_count: <span class="hljs-built_in">int</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-comment"># Routes</span> | |
| <span class="hljs-comment"># ------------------------------------------------------</span> | |
| <span class="hljs-meta">@app.get(<span class="hljs-params"><span class="hljs-string">"/health"</span></span>)</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">health</span>(): | |
| <span class="hljs-keyword">try</span>: | |
| model_manager.get() | |
| <span class="hljs-keyword">except</span> ModelNotLoadedError <span class="hljs-keyword">as</span> exc: | |
| <span class="hljs-keyword">raise</span> HTTPException(status_code=<span class="hljs-number">503</span>, detail=<span class="hljs-built_in">str</span>(exc)) <span class="hljs-keyword">from</span> exc | |
| <span class="hljs-keyword">return</span> {<span class="hljs-string">"message"</span>: <span class="hljs-string">"API is running."</span>} | |
| <span class="hljs-meta">@app.post(<span class="hljs-params"><span class="hljs-string">"/generate"</span>, response_model=GenerateResponse</span>)</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">generate</span>(<span class="hljs-params">request: GenerateRequest</span>) -> GenerateResponse: | |
| start_time = time.perf_counter() | |
| <span class="hljs-keyword">try</span>: | |
| model, tokenizer = model_manager.get() | |
| <span class="hljs-keyword">except</span> ModelNotLoadedError <span class="hljs-keyword">as</span> exc: | |
| <span class="hljs-keyword">raise</span> HTTPException(status_code=<span class="hljs-number">503</span>, detail=<span class="hljs-built_in">str</span>(exc)) <span class="hljs-keyword">from</span> exc | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">getattr</span>(tokenizer, <span class="hljs-string">"chat_template"</span>, <span class="hljs-literal">None</span>): | |
| messages = [{<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: request.prompt}] | |
| input_text = tokenizer.apply_chat_template( | |
| messages, | |
| tokenize=<span class="hljs-literal">False</span>, | |
| add_generation_prompt=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-keyword">else</span>: | |
| input_text = request.prompt | |
| inputs = tokenizer(input_text, return_tensors=<span class="hljs-string">"pt"</span>).to(DEVICE) | |
| <span class="hljs-keyword">try</span>: | |
| <span class="hljs-keyword">with</span> torch.inference_mode(): | |
| outputs = model.generate(**inputs, max_new_tokens=request.max_new_tokens) | |
| <span class="hljs-keyword">except</span> RuntimeError <span class="hljs-keyword">as</span> exc: | |
| logger.exception(<span class="hljs-string">"Generation failed"</span>) | |
| <span class="hljs-keyword">raise</span> HTTPException( | |
| status_code=<span class="hljs-number">500</span>, detail=<span class="hljs-string">f"Generation failed: <span class="hljs-subst">{exc}</span>"</span> | |
| ) <span class="hljs-keyword">from</span> exc | |
| input_token_count = inputs.input_ids.shape[<span class="hljs-number">1</span>] | |
| generated_ids = outputs[<span class="hljs-number">0</span>][input_token_count:] | |
| generated_text = tokenizer.decode(generated_ids, skip_special_tokens=<span class="hljs-literal">True</span>) | |
| output_token_count = generated_ids.shape[<span class="hljs-number">0</span>] | |
| duration_ms = (time.perf_counter() - start_time) * <span class="hljs-number">1000</span> | |
| logger.info( | |
| <span class="hljs-string">f"generate prompt_tokens=<span class="hljs-subst">{input_token_count}</span> "</span> | |
| <span class="hljs-string">f"new_tokens=<span class="hljs-subst">{output_token_count}</span> max_new_tokens=<span class="hljs-subst">{request.max_new_tokens}</span> "</span> | |
| <span class="hljs-string">f"duration_ms=<span class="hljs-subst">{duration_ms:<span class="hljs-number">.2</span>f}</span>"</span> | |
| ) | |
| <span class="hljs-keyword">return</span> GenerateResponse( | |
| response=generated_text, | |
| input_token_count=input_token_count, | |
| output_token_count=output_token_count, | |
| )<!-- HTML_TAG_END --></pre></div></details> <h2 class="relative group"><a id="2-build-the-docker-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-build-the-docker-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. 
Build the Docker image</span></h2> <p data-svelte-h="svelte-zvx9kj">Now let’s create a <code>Dockerfile</code> to package our server into a container.</p> <blockquote class="tip" data-svelte-h="svelte-12w37im"><p>Model weights shouldn’t be baked into the image: Inference Endpoints will mount the selected model at <code>/repository</code>, so the image only needs your <strong>code</strong> and <strong>dependencies</strong>.</p></blockquote> <p data-svelte-h="svelte-n5vhv9">We’ll also avoid running as <code>root</code> inside the container by creating a non-root user and granting it access to <code>/app</code>.</p> <p data-svelte-h="svelte-2hy6e">First, if your uv project doesn’t have a lockfile yet (which is common if you just created it), tell uv to generate one by running:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash 
"><!-- HTML_TAG_START -->uv lock<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1mplbnu">Our Dockerfile will be very standard:</p> <ol data-svelte-h="svelte-1vepoie"><li>We use the base PyTorch image with CUDA and cuDNN</li> <li>We copy the uv binary</li> <li>Make sure that we’re not running things as a privileged user</li> <li>Install the dependencies with uv</li> <li>Make sure that we expose the correct port</li> <li>Run the server</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-Dockerfile "><!-- HTML_TAG_START --><span class="hljs-keyword">FROM</span> pytorch/pytorch:<span class="hljs-number">2.9</span>.<span class="hljs-number">1</span>-cuda12.<span class="hljs-number">8</span>-cudnn9-runtime | |
| <span class="hljs-comment"># Install uv by copying the static binary from the distroless image.</span> | |
| <span class="hljs-keyword">COPY</span><span class="language-bash"> --from=ghcr.io/astral-sh/uv:latest /uv /bin/uv</span> | |
| <span class="hljs-keyword">COPY</span><span class="language-bash"> --from=ghcr.io/astral-sh/uv:latest /uvx /bin/uvx</span> | |
| <span class="hljs-keyword">ENV</span> <span class="hljs-keyword">USER</span>=appuser HOME=/home/appuser | |
| <span class="hljs-keyword">RUN</span><span class="language-bash"> useradd -m -s /bin/bash <span class="hljs-variable">$USER</span></span> | |
| <span class="hljs-keyword">WORKDIR</span><span class="language-bash"> /app</span> | |
| <span class="hljs-comment"># Ensure uv uses the bundled venv</span> | |
| <span class="hljs-keyword">ENV</span> VIRTUAL_ENV=/app/.venv | |
| <span class="hljs-keyword">ENV</span> PATH=<span class="hljs-string">"/app/.venv/bin:${PATH}"</span> | |
| <span class="hljs-comment"># Copy project metadata first (better caching)</span> | |
| <span class="hljs-keyword">COPY</span><span class="language-bash"> pyproject.toml uv.lock ./</span> | |
| <span class="hljs-comment"># Create the venv up front and sync dependencies (no source yet for better caching)</span> | |
| <span class="hljs-keyword">RUN</span><span class="language-bash"> uv venv <span class="hljs-variable">${VIRTUAL_ENV}</span> \ | |
| && uv <span class="hljs-built_in">sync</span> --frozen --no-dev --no-install-project</span> | |
| <span class="hljs-comment"># Copy the main application code</span> | |
| <span class="hljs-keyword">COPY</span><span class="language-bash"> main.py .</span> | |
| <span class="hljs-comment"># Re-sync to capture the project itself inside the venv</span> | |
| <span class="hljs-keyword">RUN</span><span class="language-bash"> uv <span class="hljs-built_in">sync</span> --frozen --no-dev</span> | |
| <span class="hljs-keyword">RUN</span><span class="language-bash"> <span class="hljs-built_in">chown</span> -R <span class="hljs-variable">$USER</span>:<span class="hljs-variable">$USER</span> /app</span> | |
| <span class="hljs-keyword">USER</span> $<span class="hljs-keyword">USER</span> | |
| <span class="hljs-keyword">EXPOSE</span> <span class="hljs-number">8000</span> | |
| <span class="hljs-keyword">CMD</span><span class="language-bash"> [<span class="hljs-string">"uvicorn"</span>, <span class="hljs-string">"main:app"</span>, <span class="hljs-string">"--host"</span>, <span class="hljs-string">"0.0.0.0"</span>, <span class="hljs-string">"--port"</span>, <span class="hljs-string">"8000"</span>]</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="3-build-and-push-the-image" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-build-and-push-the-image"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. 
Build and Push the Image</span></h2> <p data-svelte-h="svelte-1le3hym">Once your <code>Dockerfile</code> and <code>main.py</code> are ready, build the container and push it to a registry that Inference Endpoints can access (Docker Hub, Amazon ECR, Azure ACR, or Google GCR).</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->docker build -t your-username/smollm-endpoint:v0.1.0 . --platform linux/amd64 | |
| docker push your-username/smollm-endpoint:v0.1.0<!-- HTML_TAG_END --></pre></div> <blockquote class="note" data-svelte-h="svelte-nzg1pu"><p>Why <code>--platform linux/amd64</code>? If you’re building this image on an Apple Silicon Mac, it will be built for an <code>arm64</code> machine by default, which is not a supported architecture for Inference Endpoints. That is why we need this flag to specify that we’re targeting the <code>x86</code> architecture. If you’re on an <code>x86</code> machine already, you can omit this flag.</p></blockquote> <h2 class="relative group"><a id="4-create-the-endpoint" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#4-create-the-endpoint"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>4. Create the Endpoint</span></h2> <p data-svelte-h="svelte-79a9ta">Now switch to the Inference Endpoints UI and deploy your custom container.</p> <ol data-svelte-h="svelte-1rj27o6"><li><p>Open the <a href="https://endpoints.huggingface.co/" rel="nofollow">Inference Endpoints dashboard</a> and click <strong>“+ New”</strong>. | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/endpoint-new.png" alt="endpoint-new.png"></p></li> <li><p>Select <code>HuggingFaceTB/SmolLM3-3B</code> as the model repository (this will be mounted at <code>/repository</code> inside the container). | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/choose-smollm.png" alt="choose-smollm.png"></p></li> <li><p>Click <strong>“Configure”</strong> to proceed with the deployment setup. | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/configure.png" alt="configure.png"></p></li> <li><p>This is the configuration page where you’ll define compute, networking, and container settings. | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/home.png" alt="home.png"></p></li> <li><p>Choose the hardware. Let’s go with the suggested L4. | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/authenticated.png" alt="authenticated.png"></p></li> <li><p>Under <strong>Custom Container</strong>, enter:</p> <ul><li>your image URL (e.g., <code>your-username/smollm-endpoint:v0.1.0</code>)</li> <li>the port exposed by your container (in our case <code>8000</code>) | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/custom.png" alt="custom.png"></li></ul></li> <li><p>Click <strong>“Create Endpoint”</strong>. The platform will:</p> <ul><li>pull your container image</li> <li>mount the model at <code>/repository</code></li> <li>start your FastAPI server | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/initializing.png" alt="initializing.png"></li></ul></li> <li><p>After a short initialization period, the status will change to <strong>Running</strong>. Your custom container is now serving requests. | |
| <img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/running.png" alt="running.png"></p></li></ol> <p data-svelte-h="svelte-1ur9euj">Once deployed, your endpoint will be available at a URL like:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-bash "><!-- HTML_TAG_START -->https://random-number.region.endpoints.huggingface.cloud/<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1w5htx2">Below is a minimal Python client you can use to test it:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class="language-python "><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> get_token | |
| <span class="hljs-keyword">import</span> requests | |
| url = <span class="hljs-string">"https://random-number.region.endpoints.huggingface.cloud/generate"</span> | |
| prompt = <span class="hljs-string">"What is an Inference Endpoint?"</span> | |
| data = {<span class="hljs-string">"prompt"</span>: prompt, <span class="hljs-string">"max_new_tokens"</span>: <span class="hljs-number">512</span>} | |
| response = requests.post( | |
| url=url, | |
| json=data, | |
| headers={ | |
| <span class="hljs-string">"Authorization"</span>: <span class="hljs-string">f"Bearer <span class="hljs-subst">{get_token()}</span>"</span>, | |
| <span class="hljs-string">"Content-Type"</span>: <span class="hljs-string">"application/json"</span>, | |
| }, | |
| ).json() | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Input:\n<span class="hljs-subst">{prompt}</span>\n\nOutput:\n<span class="hljs-subst">{response[<span class="hljs-string">'response'</span>]}</span>"</span>) | |
| <!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-w6xt9s">If you open the <strong>Logs</strong> tab of your endpoint, you should see the incoming POST request and the model’s response.</p> <p data-svelte-h="svelte-jvs2q"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/custom_container/post.png" alt="post.png"></p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=" "><!-- HTML_TAG_START --><span class="hljs-symbol">Input:</span> | |
| What <span class="hljs-built_in">is</span> an Inference Endpoint? | |
| <span class="hljs-symbol">Output:</span> | |
| &lt;think&gt; | |
| Okay, so I need <span class="hljs-keyword">to</span> ...<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="5-next-steps-and-extensions" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#5-next-steps-and-extensions"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>5. Next steps and extensions</span></h2> <p data-svelte-h="svelte-dqwtkl">Congratulations on making it to the end. 
🎉</p> <p data-svelte-h="svelte-1ynngyx">A good idea to extend this demo would be to test it out with a completely different model, say an audio model or image generation one.</p> <p data-svelte-h="svelte-1ueg8ci">Happy hacking 🙌</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/engines/custom_container.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_2j5mgh = { | |
| assets: "/docs/inference-endpoints/pr_162/en", | |
| base: "/docs/inference-endpoints/pr_162/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/inference-endpoints/pr_162/en/_app/immutable/entry/start.bc1fc624.js"), | |
| import("/docs/inference-endpoints/pr_162/en/_app/immutable/entry/app.6dfc88c3.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 4], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 89.4 kB
- Xet hash:
- 212df3301968880e6573f3a4a84d2f3585ebf5a14e40789aa06f8da588354847
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.