ollive-api / api /observability /cost_model.py
Karthik Namboori
Deploy ollive FastAPI Docker Space
7b4b748
from __future__ import annotations
from typing import Any
# Flat USD multiplier (0.000001) used for the token-proxy part of a request's
# estimated cost and echoed back in the pricing summary / tooltip text.
# NOTE(review): despite the "per char" name, token_proxy_cost_usd applies it to
# chars x tokens_per_char -- confirm which unit is intended.
TOKEN_PROXY_USD_PER_CHAR: float = 0.000001
def runtime_cost_usd(latency_ms: float, cpu_hour_usd: float) -> float:
    """Return the compute cost in USD attributed to one request's latency.

    Args:
        latency_ms: Request latency in milliseconds.
        cpu_hour_usd: Price of one CPU-hour in USD.

    Returns:
        The latency converted to hours, multiplied by the hourly rate.
    """
    ms_per_hour = 3_600_000
    elapsed_hours = latency_ms / ms_per_hour
    return elapsed_hours * cpu_hour_usd
def token_proxy_cost_usd(total_chars: int, tokens_per_char: float) -> float:
    """Return the token-proxy cost in USD for a request's character volume.

    Args:
        total_chars: Number of characters to price; values below 1 are
            treated as 1, so the result is never based on zero characters.
        tokens_per_char: Multiplier converting characters to estimated tokens.

    Returns:
        Clamped character count x ``tokens_per_char`` x
        ``TOKEN_PROXY_USD_PER_CHAR``.
    """
    billable_chars = max(total_chars, 1)
    estimated_tokens = billable_chars * tokens_per_char
    return estimated_tokens * TOKEN_PROXY_USD_PER_CHAR
def estimate_request_cost_usd(
    *,
    latency_ms: float,
    total_chars: int,
    cpu_hour_usd: float,
    tokens_per_char: float,
) -> tuple[int, float]:
    """Estimate the token count and total USD cost of a single request.

    The cost is the sum of the runtime component (latency priced at the
    CPU-hour rate) and the token-proxy component (character volume priced via
    the token proxy rate).

    Args:
        latency_ms: Request latency in milliseconds.
        total_chars: Character count of the request; values below 1 count as 1.
        cpu_hour_usd: Price of one CPU-hour in USD.
        tokens_per_char: Multiplier converting characters to estimated tokens.

    Returns:
        ``(estimated_tokens, cost_usd)`` where ``estimated_tokens`` is
        truncated to an int and ``cost_usd`` is rounded to 8 decimal places.
    """
    compute_part = runtime_cost_usd(latency_ms, cpu_hour_usd)
    proxy_part = token_proxy_cost_usd(total_chars, tokens_per_char)

    # Same lower bound of one character that the token-proxy cost applies.
    billable_chars = max(total_chars, 1)
    token_count = int(billable_chars * tokens_per_char)

    combined_cost = compute_part + proxy_part
    return token_count, round(combined_cost, 8)
def pricing_summary(*, cpu_hour_usd: float, tokens_per_char: float) -> dict[str, float | str]:
    """Return the pricing inputs used by the cost estimate as a plain dict.

    Args:
        cpu_hour_usd: Price of one CPU-hour in USD.
        tokens_per_char: Multiplier converting characters to estimated tokens.

    Returns:
        A dict with the two supplied rates, the module-level token proxy rate,
        and a fixed ``"deployment"`` label.
    """
    summary: dict[str, float | str] = {}
    summary["cpu_hour_usd"] = cpu_hour_usd
    summary["tokens_per_char"] = tokens_per_char
    summary["token_proxy_usd_per_char"] = TOKEN_PROXY_USD_PER_CHAR
    summary["deployment"] = "oss_cpu"
    return summary
def consumed_cost_tooltip(
    *,
    cpu_hour_usd: float,
    tokens_per_char: float,
    total_cost_usd: float,
    total_requests: int,
) -> str:
    """Build the human-readable explanation of how the consumed cost is derived.

    Args:
        cpu_hour_usd: Price of one CPU-hour in USD (shown with 4 decimals).
        tokens_per_char: Characters-to-tokens multiplier (shown with 2 decimals).
        total_cost_usd: Sum of the estimated cost over all requests.
        total_requests: Number of requests included in ``total_cost_usd``.

    Returns:
        A sentence describing the formula. When ``total_requests`` is zero or
        negative, a "no requests yet" variant is returned and no average is
        computed; otherwise the text includes the total and per-request average.
    """
    # Rate fragments shared by both variants of the message.
    hourly_rate = f"${cpu_hour_usd:.4f}"
    ratio = f"{tokens_per_char:.2f}"
    proxy_rate = f"${TOKEN_PROXY_USD_PER_CHAR:.6f}"

    if total_requests <= 0:
        # Nothing to average over; describe the formula only.
        return "".join(
            (
                "No requests recorded yet. Each request is estimated as runtime + token proxy using ",
                f"{hourly_rate}/CPU-hour and {ratio} tok/char × ",
                f"{proxy_rate}/char.",
            )
        )

    mean_cost = total_cost_usd / total_requests
    pieces = (
        f"API $ consumed = sum of estimated cost across {total_requests} request(s) = ",
        f"${total_cost_usd:.8f}. Per request: runtime = (latency_ms ÷ 3,600,000) × ",
        f"{hourly_rate}/hr; token proxy = (input + output chars) × {ratio} × ",
        f"{proxy_rate}. Request cost = runtime + token proxy. ",
        f"Average per request: ${mean_cost:.8f}.",
    )
    return "".join(pieces)
def build_cost_latency_table(
    request_rows: list[Any],
    *,
    cpu_hour_usd: float,
    tokens_per_char: float,
    fallback_latency_ms: float = 1500.0,
) -> dict[str, Any]:
    """Aggregate per-request cost estimates into a summary payload.

    Args:
        request_rows: Rows exposing an ``estimated_cost_usd`` attribute.
        cpu_hour_usd: Price of one CPU-hour in USD.
        tokens_per_char: Multiplier converting characters to estimated tokens.
        fallback_latency_ms: Unused; accepted so existing callers keep working.

    Returns:
        A dict with the pricing inputs, the summed cost rounded to 8 decimal
        places, the number of rows, and an explanatory tooltip string.
    """
    del fallback_latency_ms  # kept for caller compatibility

    # Built-in sum() is kept deliberately so float accumulation is unchanged.
    cost_sum = sum(entry.estimated_cost_usd for entry in request_rows)
    request_count = len(request_rows)

    pricing = pricing_summary(cpu_hour_usd=cpu_hour_usd, tokens_per_char=tokens_per_char)
    rounded_cost = round(cost_sum, 8)
    # The tooltip receives the unrounded sum; it does its own formatting.
    tooltip = consumed_cost_tooltip(
        cpu_hour_usd=cpu_hour_usd,
        tokens_per_char=tokens_per_char,
        total_cost_usd=cost_sum,
        total_requests=request_count,
    )

    table: dict[str, Any] = {}
    table["pricing"] = pricing
    table["api_cost_consumed_usd"] = rounded_cost
    table["total_requests"] = request_count
    table["estimate_tooltip"] = tooltip
    return table