Spaces:
Running
Running
| from __future__ import annotations | |
| from typing import Any | |
# Flat USD rate used by the token-proxy estimate: token_proxy_cost_usd
# multiplies it by (character count × tokens_per_char).
TOKEN_PROXY_USD_PER_CHAR: float = 0.000001
def runtime_cost_usd(latency_ms: float, cpu_hour_usd: float) -> float:
    """Return the CPU-time cost in USD for a request of the given latency.

    The latency is converted from milliseconds to hours (3,600,000 ms per
    hour) and then multiplied by the hourly CPU rate.
    """
    elapsed_hours = latency_ms / 3_600_000
    return elapsed_hours * cpu_hour_usd
def token_proxy_cost_usd(total_chars: int, tokens_per_char: float) -> float:
    """Return the token-proxy cost in USD for a request.

    The character count is floored at 1, so a request with no characters is
    still charged for a single character. The floored count is scaled by
    ``tokens_per_char`` and then by ``TOKEN_PROXY_USD_PER_CHAR``.
    """
    # Same selection rule as max(total_chars, 1): use 1 only when it is larger.
    billable_chars = 1 if total_chars < 1 else total_chars
    token_estimate = billable_chars * tokens_per_char
    return token_estimate * TOKEN_PROXY_USD_PER_CHAR
def estimate_request_cost_usd(
    *,
    latency_ms: float,
    total_chars: int,
    cpu_hour_usd: float,
    tokens_per_char: float,
) -> tuple[int, float]:
    """Estimate the token count and total USD cost of a single request.

    Returns:
        A ``(estimated_tokens, cost_usd)`` pair. ``estimated_tokens`` is the
        character count (floored at 1) times ``tokens_per_char``, truncated
        to an int. ``cost_usd`` is the runtime cost plus the token-proxy
        cost, rounded to 8 decimal places.
    """
    cpu_part = runtime_cost_usd(latency_ms, cpu_hour_usd)
    proxy_part = token_proxy_cost_usd(total_chars, tokens_per_char)
    billable_chars = max(total_chars, 1)
    token_count = int(billable_chars * tokens_per_char)
    combined_usd = round(cpu_part + proxy_part, 8)
    return token_count, combined_usd
def pricing_summary(*, cpu_hour_usd: float, tokens_per_char: float) -> dict[str, float | str]:
    """Return the pricing inputs as a plain dictionary.

    The result echoes the two caller-supplied rates, adds the module-level
    ``TOKEN_PROXY_USD_PER_CHAR`` rate, and tags the deployment as
    ``"oss_cpu"``.
    """
    return dict(
        cpu_hour_usd=cpu_hour_usd,
        tokens_per_char=tokens_per_char,
        token_proxy_usd_per_char=TOKEN_PROXY_USD_PER_CHAR,
        deployment="oss_cpu",
    )
def consumed_cost_tooltip(
    *,
    cpu_hour_usd: float,
    tokens_per_char: float,
    total_cost_usd: float,
    total_requests: int,
) -> str:
    """Build the human-readable explanation of the consumed-cost figure.

    When ``total_requests`` is zero or negative, the text only describes how
    a request would be estimated. Otherwise it reports the total, the
    per-request formula and the average cost per request.
    """
    # Both message variants quote the same three rates, so format them once.
    cpu_rate = f"{cpu_hour_usd:.4f}"
    char_ratio = f"{tokens_per_char:.2f}"
    proxy_rate = f"{TOKEN_PROXY_USD_PER_CHAR:.6f}"

    if total_requests <= 0:
        return "".join(
            [
                "No requests recorded yet. Each request is estimated as runtime + token proxy using ",
                f"${cpu_rate}/CPU-hour and {char_ratio} tok/char × ",
                f"${proxy_rate}/char.",
            ]
        )

    mean_cost = total_cost_usd / total_requests
    return "".join(
        [
            f"API $ consumed = sum of estimated cost across {total_requests} request(s) = ",
            f"${total_cost_usd:.8f}. Per request: runtime = (latency_ms ÷ 3,600,000) × ",
            f"${cpu_rate}/hr; token proxy = (input + output chars) × {char_ratio} × ",
            f"${proxy_rate}. Request cost = runtime + token proxy. ",
            f"Average per request: ${mean_cost:.8f}.",
        ]
    )
def build_cost_latency_table(
    request_rows: list[Any],
    *,
    cpu_hour_usd: float,
    tokens_per_char: float,
    fallback_latency_ms: float = 1500.0,
) -> dict[str, Any]:
    """Summarise the recorded requests into a cost report dictionary.

    Each row must expose an ``estimated_cost_usd`` attribute; these are
    summed to give the consumed total. ``fallback_latency_ms`` is accepted
    but not used.

    Returns:
        A dict with the keys ``pricing``, ``api_cost_consumed_usd`` (total
        rounded to 8 decimal places), ``total_requests`` and
        ``estimate_tooltip``.
    """
    # kept for caller compatibility
    del fallback_latency_ms

    request_count = len(request_rows)
    consumed_usd = sum(row.estimated_cost_usd for row in request_rows)

    pricing = pricing_summary(cpu_hour_usd=cpu_hour_usd, tokens_per_char=tokens_per_char)
    tooltip = consumed_cost_tooltip(
        cpu_hour_usd=cpu_hour_usd,
        tokens_per_char=tokens_per_char,
        total_cost_usd=consumed_usd,
        total_requests=request_count,
    )

    report: dict[str, Any] = {
        "pricing": pricing,
        "api_cost_consumed_usd": round(consumed_usd, 8),
        "total_requests": request_count,
        "estimate_tooltip": tooltip,
    }
    return report