Buckets:

rtrm's picture
download
raw
33.3 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Analytics and Metrics&quot;,&quot;local&quot;:&quot;analytics-and-metrics&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Understanding the graphs&quot;,&quot;local&quot;:&quot;understanding-the-graphs&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Number of (HTTP) Requests&quot;,&quot;local&quot;:&quot;number-of-http-requests&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Pending Requests&quot;,&quot;local&quot;:&quot;pending-requests&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Latency Distribution&quot;,&quot;local&quot;:&quot;latency-distribution&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Running Replicas&quot;,&quot;local&quot;:&quot;running-replicas&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Compute&quot;,&quot;local&quot;:&quot;compute&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Create an integration with the Inference Endpoints OpenMetrics API&quot;,&quot;local&quot;:&quot;create-an-integration-with-the-inference-endpoints-openmetrics-api&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Query metrics manually&quot;,&quot;local&quot;:&quot;query-metrics-manually&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Connect with your observability tools&quot;,&quot;local&quot;:&quot;connect-with-your-observability-tools&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Subscribe to Team or Enterprise&quot;,&quot;local&quot;:&quot;subscribe-to-team-or-enterprise&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/inference-endpoints/pr_151/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/entry/start.56631b46.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/scheduler.eb244325.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/singletons.54c25bcd.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/index.3c23fb4b.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/paths.12ce0a18.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/entry/app.08bc0e6a.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/preload-helper.0ac538a1.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/index.661680a1.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/nodes/0.69485259.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/nodes/11.93a8d579.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.c047d438.js">
<link rel="modulepreload" href="/docs/inference-endpoints/pr_151/en/_app/immutable/chunks/CodeBlock.0d14d0aa.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Analytics and Metrics&quot;,&quot;local&quot;:&quot;analytics-and-metrics&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Understanding the graphs&quot;,&quot;local&quot;:&quot;understanding-the-graphs&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Number of (HTTP) Requests&quot;,&quot;local&quot;:&quot;number-of-http-requests&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Pending Requests&quot;,&quot;local&quot;:&quot;pending-requests&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Latency Distribution&quot;,&quot;local&quot;:&quot;latency-distribution&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Running Replicas&quot;,&quot;local&quot;:&quot;running-replicas&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Compute&quot;,&quot;local&quot;:&quot;compute&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Create an integration with the Inference Endpoints OpenMetrics API&quot;,&quot;local&quot;:&quot;create-an-integration-with-the-inference-endpoints-openmetrics-api&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Query metrics manually&quot;,&quot;local&quot;:&quot;query-metrics-manually&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Connect with your observability tools&quot;,&quot;local&quot;:&quot;connect-with-your-observability-tools&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Subscribe to Team or Enterprise&quot;,&quot;local&quot;:&quot;subscribe-to-team-or-enterprise&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div 
class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="analytics-and-metrics" class="header-link block 
pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#analytics-and-metrics"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Analytics and Metrics</span></h1> <p data-svelte-h="svelte-1qwdd7g">The Analytics page is like the control center for your deployed models. It tells you in real-time what’s going on, how many users are
calling your models, about hardware usage, latencies, and much more. In this documentation we’ll dive into what each metric means and
how to analyze the graphs.</p> <p data-svelte-h="svelte-1lzx4cg"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/analytics/1-intro.png" alt="intro"></p> <p data-svelte-h="svelte-2q1i31">In the top bar, you can configure the high level view:</p> <ul data-svelte-h="svelte-1tmrvm6"><li>Which replica to view metrics from: either an individual replica or all.</li> <li>If you want to view metrics related to requests, hardware, or timeline of replicas.</li> <li>Which time frame you’ll inspect the metrics, and this setting affects all graphs on the page. You can choose between any of the existing settings from the dropdown, or click-and-drag over any graph for a custom timeframe. You can also enable/disable
auto refresh or view the metrics per replica or all.</li></ul> <p data-svelte-h="svelte-s7g9t9"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/analytics/2-config.png" alt="config"></p> <h2 class="relative group"><a id="understanding-the-graphs" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#understanding-the-graphs"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Understanding the graphs</span></h2> <h3 class="relative group"><a id="number-of-http-requests" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#number-of-http-requests"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 
0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Number of (HTTP) Requests</span></h3> <p data-svelte-h="svelte-15jqzd1">The first graph at the top left shows you how many requests your Inference Endpoint has received. By default they are grouped by HTTP response
classes, but by switching the toggle you can view them by individual status. As a reminder the HTTP response classes are:</p> <ul data-svelte-h="svelte-18dtmml"><li><strong>Informational responses (100-199)</strong>: The server has received your request and is working on it. For example, <code>102 Processing</code> means the server is still handling your request.</li> <li><strong>Successful responses (200-299)</strong>: Your request was received and completed successfully. For example, <code>200 OK</code> means everything worked as expected.</li> <li><strong>Redirection messages (300-399)</strong>: The server is telling your client to look somewhere else for the information or to take another action. For example, <code>301 Moved Permanently</code> means the resource has a new address.</li> <li><strong>Client error responses (400-499)</strong>: There was a problem with the request sent by your client (like a typo in the URL or missing data). For example, <code>404 Not Found</code> means the server couldn’t find what you asked for.</li> <li><strong>Server error responses (500-599)</strong>: The server ran into an issue while trying to process your request. For example, <code>502 Bad Gateway</code> means the server got an invalid response from another server it tried to contact.</li></ul> <p data-svelte-h="svelte-1kwgrsz">We recommend checking the <a href="https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status" rel="nofollow">MDN web docs</a> for more information on individual
status codes.</p> <p data-svelte-h="svelte-1ibhate">The boxes above the graph also show the % of requests in the respective response class.</p> <p data-svelte-h="svelte-1xw4r5k"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/analytics/3-http-reqs.png" alt="http"></p> <h3 class="relative group"><a id="pending-requests" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pending-requests"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Pending Requests</span></h3> <p data-svelte-h="svelte-4jupc9">Pending requests are requests that have not yet received an HTTP status, meaning they include in-flight requests and requests currently
being processed. If this metric increases too much, it means that your requests are queuing up, and your users have to wait for requests
to finish. In this case, you should consider increasing your number of replicas or, alternatively, using autoscaling. You can read more about
it in the <a href="./autoscaling#scalingbasedonpendingrequests(betafeature)">autoscaling guide</a></p> <p data-svelte-h="svelte-eb33tn"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/analytics/4-pending-reqs.png" alt="pending"></p> <h3 class="relative group"><a id="latency-distribution" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#latency-distribution"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Latency Distribution</span></h3> <p data-svelte-h="svelte-20agt5">From this graph you’ll be able to see how long it takes for your Inference Endpoint to generate a response. 
Latency is reported as:</p> <ul data-svelte-h="svelte-1ypi2lx"><li><strong>p99</strong>: meaning that 99% of all requests were faster than this value</li> <li><strong>p95</strong>: meaning that 95% of all requests were faster than this value</li> <li><strong>p90</strong>: meaning that 90% of all requests were faster than this value</li> <li><strong>median</strong>: meaning that 50% of all requests were faster than this value</li></ul> <p data-svelte-h="svelte-19ch391">It is also useful to look at how large the difference is between the median and p99. The closer the values are to each other, the more
uniform the latency is, whereas a large difference means that users of your Inference Endpoint generally get a fast response, but
the worst case latencies can be long.</p> <p data-svelte-h="svelte-1eclpic"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/analytics/5-latency.png" alt="latency"></p> <h3 class="relative group"><a id="running-replicas" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-replicas"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running Replicas</span></h3> <p data-svelte-h="svelte-kcly0l">In the running replica graph, you’ll see how many running replicas you have during a point in time. The red line shows
your current maximum replicas setting.</p> <p data-svelte-h="svelte-4i74o4"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/analytics/6-running.png" alt="status"></p> <p data-svelte-h="svelte-f72rot">For a more advanced view of different statuses for individual replicas, going from <em>pending</em> all the way
to <em>running</em>, you can toggle to the Timeline section. This is very useful to get a sense of how long it takes an Endpoint to become ready to serve requests.</p> <p data-svelte-h="svelte-bb5nbl"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/analytics/7-timeline.png" alt="advanced"></p> <h3 class="relative group"><a id="compute" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#compute"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Compute</span></h3> <p data-svelte-h="svelte-2qmv22">These four graphs are dedicated to hardware usage. 
You’ll find:</p> <ul data-svelte-h="svelte-c4522"><li>CPU usage: How much processing power is being used.</li> <li>Memory usage: How much RAM is being used.</li> <li>GPU usage: How much of the GPU’s processing power is being used.</li> <li>GPU Memory (VRAM) usage: How much GPU memory is being used.</li></ul> <p data-svelte-h="svelte-g3bjeh"><img src="https://raw.githubusercontent.com/huggingface/hf-endpoints-documentation/main/assets/analytics/8-usage.png" alt="usage"></p> <p data-svelte-h="svelte-1lyund8">By toggling “details” you can view either the average or the per-replica value for the metric in question.</p> <p data-svelte-h="svelte-1l2kowr">If you have autoscaling based on hardware utilization enabled, these are the metrics that determine your autoscaling behaviour. You can
read more about autoscaling <a href="./autoscaling#scalingbasedonhardwareutilization">here</a></p> <h2 class="relative group"><a id="create-an-integration-with-the-inference-endpoints-openmetrics-api" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#create-an-integration-with-the-inference-endpoints-openmetrics-api"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Create an integration with the Inference Endpoints OpenMetrics API</span></h2> <p data-svelte-h="svelte-13d2rfo"><strong>This feature is currently in Beta. You will need to be subscribed to <a href="https://huggingface.co/pricing" rel="nofollow">Team or Enterprise</a> to take advantage of this feature.</strong></p> <p data-svelte-h="svelte-b2zy11">You can export real-time metrics from your Inference Endpoints into your own monitoring stack. 
The Metrics API exposes metrics in the OpenMetrics format, which is widely supported by observability tools such as Prometheus, Grafana, and Datadog.</p> <p data-svelte-h="svelte-3dlayl">This allows you to monitor in near real-time:</p> <ul data-svelte-h="svelte-1vwan3"><li>Requests grouped by replica</li> <li>Latency distributions (p50, p95, etc.)</li> <li>Hardware metrics (CPU, GPU, memory, accelerator utilization)</li></ul> <h3 class="relative group"><a id="query-metrics-manually" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#query-metrics-manually"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Query metrics manually</span></h3> <p data-svelte-h="svelte-18xwev5">You can use <code>curl</code> to query the metrics endpoint directly and inspect the raw data:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" 
fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->curl -X GET <span class="hljs-string">&quot;https://api.endpoints.huggingface.cloud/v2/endpoint/{namespace}/{endpoint-name}/open-metrics&quot;</span> \
-H <span class="hljs-string">&quot;Authorization: Bearer YOUR_AUTH_TOKEN&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ctq193">This will return metrics in OpenMetrics text format:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># HELP latency_distribution Latency distribution</span>
<span class="hljs-comment"># TYPE latency_distribution summary</span>
latency_distribution{quantile=<span class="hljs-string">&quot;0.5&quot;</span>} 0.006339203
latency_distribution{quantile=<span class="hljs-string">&quot;0.9&quot;</span>} 0.007574241
latency_distribution{quantile=<span class="hljs-string">&quot;0.95&quot;</span>} 0.007994495
latency_distribution{quantile=<span class="hljs-string">&quot;0.99&quot;</span>} 0.020140918
latency_distribution_count 4
latency_distribution_sum 0.042048857
<span class="hljs-comment"># HELP http_requests HTTP requests by code and replicas</span>
<span class="hljs-comment"># TYPE http_requests counter</span>
http_requests{replica_id=<span class="hljs-string">&quot;fqwg7eri-hskoj&quot;</span>,status_code=<span class="hljs-string">&quot;200&quot;</span>} 1152
http_requests{replica_id=<span class="hljs-string">&quot;q9cv26ut-3vo4s&quot;</span>,status_code=<span class="hljs-string">&quot;200&quot;</span>} 1
<span class="hljs-comment"># HELP cpu_usage_percent CPU percent</span>
<span class="hljs-comment"># TYPE cpu_usage_percent gauge</span>
<span class="hljs-comment"># UNIT cpu_usage_percent percent</span><!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="connect-with-your-observability-tools" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#connect-with-your-observability-tools"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Connect with your observability tools</span></h3> <p data-svelte-h="svelte-10grepd">OpenMetrics is widely supported across monitoring ecosystems. 
A few common options:</p> <ul data-svelte-h="svelte-1xaje68"><li><a href="https://docs.datadoghq.com/integrations/openmetrics/" rel="nofollow">Datadog OpenMetrics integration</a></li> <li><a href="https://tinyurl.com/e4fypk5m" rel="nofollow">Grafana Prometheus datasource</a></li></ul> <p data-svelte-h="svelte-1v8ph58">From there, you can set up dashboards, alerts, and reports to monitor endpoint performance.</p> <h3 class="relative group"><a id="subscribe-to-team-or-enterprise" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#subscribe-to-team-or-enterprise"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Subscribe to Team or Enterprise</span></h3> <p data-svelte-h="svelte-6fjb8c">Your organization can sign up for the Team or Enterprise plan <a href="https://huggingface.co/enterprise?subscribe=true" rel="nofollow">here</a> 🚀
For any questions or feature requests, please email us at <a href="mailto:api-enterprise@huggingface.co">api-enterprise@huggingface.co</a></p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hf-endpoints-documentation/blob/main/docs/source/guides/analytics.md" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>
<script>
{
  // Page bootstrap: records the docs path configuration on a global, then
  // loads the two entry modules and starts the app on this script's parent.
  const docsRoot = "/docs/inference-endpoints/pr_151/en";

  // Undeclared assignment (no const/let), so this creates a global rather
  // than a binding scoped to the surrounding block.
  __sveltekit_10wt9iy = { assets: docsRoot, base: docsRoot, env: {} };

  // Capture the mount point now: document.currentScript is only available
  // while this script runs synchronously, not inside the promise callback.
  const mountPoint = document.currentScript.parentElement;

  const startOptions = {
    node_ids: [0, 11],
    data: [null, null],
    form: null,
    error: null
  };

  // Both entry modules are requested in parallel; start runs once both load.
  const entryModules = Promise.all([
    import("/docs/inference-endpoints/pr_151/en/_app/immutable/entry/start.56631b46.js"),
    import("/docs/inference-endpoints/pr_151/en/_app/immutable/entry/app.08bc0e6a.js")
  ]);

  entryModules.then(function (loaded) {
    const kit = loaded[0];
    const app = loaded[1];
    kit.start(app, mountPoint, startOptions);
  });
}
</script>

Xet Storage Details

Size:
33.3 kB
·
Xet hash:
a5e45541dc69c14be17cb38573b75c5857346900251bad191a6268d62fc3ae9e

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.