Add GPU selection and token calc method (#1)
Browse files- Add GPU selection and token calc method (bddf37c66b35b663b56fce480f564fe30959e5c8)
- README.md +9 -23
- app.py +101 -110
- requirements.txt +2 -2
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: LLM Cost &
|
| 3 |
emoji: 🧮
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
|
@@ -9,28 +9,14 @@ app_file: app.py
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
# LLM Cost
|
| 13 |
|
| 14 |
-
|
| 15 |
|
| 16 |
-
##
|
| 17 |
-
|
| 18 |
-
2. Choose **Gradio** as the SDK.
|
| 19 |
-
3. Upload `app.py` and `requirements.txt`.
|
| 20 |
-
4. The Space will build and launch automatically.
|
| 21 |
|
| 22 |
-
##
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
- **Busy-time only** (idealized, pay only when generating)
|
| 27 |
-
- **Fixed daily uptime** (more realistic: GPU is on N hours/day regardless of load)
|
| 28 |
-
- Shows **per-patient costs** for easy comparison.
|
| 29 |
-
|
| 30 |
-
## Suggested defaults
|
| 31 |
-
- Intake tokens per patient: 2,000
|
| 32 |
-
- Review tokens per patient: 5,000
|
| 33 |
-
- API price range: $0.002–$0.01 per 1K tokens
|
| 34 |
-
- GPU throughput: 200 tokens/sec (start conservative; tune after load tests)
|
| 35 |
-
- GPU price: $2.50/hr (A100-80GB ballpark; adjust to your provider)
|
| 36 |
-
- Utilization: 0.60 (60%)
|
|
|
|
| 1 |
---
|
| 2 |
+
title: LLM Cost, Capacity & Latency Estimator
|
| 3 |
emoji: 🧮
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
|
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# LLM Cost, Capacity & Latency Estimator
|
| 13 |
|
| 14 |
+
Compare API vs GPU costs and estimate latency.
|
| 15 |
|
| 16 |
+
## Utilization
|
| 17 |
+
Fraction of time GPU is busy while powered on (0–1).
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
+
## How to deploy
|
| 20 |
+
1. Create HF Space → Gradio.
|
| 21 |
+
2. Upload `app.py`, `requirements.txt`, `README.md`.
|
| 22 |
+
3. Launch.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,23 +1,22 @@
|
|
| 1 |
|
| 2 |
import gradio as gr
|
| 3 |
-
import math
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
**
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
-
|
| 20 |
-
-
|
| 21 |
"""
|
| 22 |
|
| 23 |
def calc_tokens_per_patient(intake_tokens, review_tokens):
|
|
@@ -27,71 +26,44 @@ def managed_api_cost(total_tokens, price_per_1k_tokens):
|
|
| 27 |
return (total_tokens / 1000.0) * price_per_1k_tokens
|
| 28 |
|
| 29 |
def gpu_busy_hours(total_tokens, toks_per_sec):
|
| 30 |
-
|
| 31 |
-
return 0.0
|
| 32 |
-
return total_tokens / toks_per_sec / 3600.0
|
| 33 |
|
| 34 |
def gpu_costs(total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization):
|
| 35 |
-
# Busy-time (idealized) cost
|
| 36 |
busy_hours = gpu_busy_hours(total_tokens, toks_per_sec)
|
| 37 |
-
# Adjust busy hours for utilization (how well we keep the GPU busy while it's up)
|
| 38 |
-
# If utilization < 1, we need more clock hours to complete the same work.
|
| 39 |
effective_busy_hours = busy_hours / max(utilization, 1e-6)
|
| 40 |
busy_time_cost = effective_busy_hours * gpu_price_per_hr
|
| 41 |
-
|
| 42 |
-
# Fixed daily uptime model
|
| 43 |
billed_hours = daily_uptime_hours * days_per_month
|
| 44 |
fixed_uptime_cost = billed_hours * gpu_price_per_hr
|
| 45 |
-
|
| 46 |
return busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost
|
| 47 |
|
| 48 |
def per_patient(total_cost, num_patients):
|
| 49 |
-
return (total_cost / num_patients) if num_patients
|
| 50 |
-
|
| 51 |
-
def
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
num_patients,
|
| 56 |
-
intake_tokens,
|
| 57 |
-
review_tokens,
|
| 58 |
-
price_per_1k_low,
|
| 59 |
-
price_per_1k_high,
|
| 60 |
-
toks_per_sec,
|
| 61 |
-
gpu_price_per_hr,
|
| 62 |
-
daily_uptime_hours,
|
| 63 |
-
days_per_month,
|
| 64 |
-
utilization
|
| 65 |
-
):
|
| 66 |
tokens_per_patient = calc_tokens_per_patient(intake_tokens, review_tokens)
|
| 67 |
total_tokens = tokens_per_patient * num_patients
|
| 68 |
-
|
| 69 |
-
# Managed API costs (range)
|
| 70 |
api_cost_low = managed_api_cost(total_tokens, price_per_1k_low)
|
| 71 |
api_cost_high = managed_api_cost(total_tokens, price_per_1k_high)
|
| 72 |
-
|
| 73 |
-
# GPU costs
|
| 74 |
busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost = gpu_costs(
|
| 75 |
-
total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization
|
| 76 |
-
)
|
| 77 |
-
|
| 78 |
-
# Per-patient costs
|
| 79 |
api_pp_low = per_patient(api_cost_low, num_patients)
|
| 80 |
api_pp_high = per_patient(api_cost_high, num_patients)
|
| 81 |
gpu_pp_busy = per_patient(busy_time_cost, num_patients)
|
| 82 |
gpu_pp_fixed = per_patient(fixed_uptime_cost, num_patients)
|
| 83 |
|
| 84 |
-
# Summary table
|
| 85 |
df = pd.DataFrame({
|
| 86 |
"Metric": [
|
| 87 |
"Patients",
|
| 88 |
-
"Tokens per Patient
|
| 89 |
"Total Tokens",
|
| 90 |
"API Cost (low)",
|
| 91 |
"API Cost (high)",
|
| 92 |
"GPU Busy Hours (idealized)",
|
| 93 |
-
"GPU Clock Hours
|
| 94 |
-
"GPU Cost (busy-time
|
| 95 |
"GPU Billed Hours (fixed uptime)",
|
| 96 |
"GPU Cost (fixed uptime)",
|
| 97 |
"Per-Patient API Cost (low)",
|
|
@@ -103,67 +75,86 @@ def calculate(
|
|
| 103 |
num_patients,
|
| 104 |
tokens_per_patient,
|
| 105 |
f"{total_tokens:,}",
|
| 106 |
-
|
| 107 |
-
|
| 108 |
f"{busy_hours:,.2f}",
|
| 109 |
f"{effective_busy_hours:,.2f}",
|
| 110 |
-
|
| 111 |
f"{billed_hours:,.2f}",
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
]
|
| 118 |
})
|
| 119 |
-
|
| 120 |
-
return df,
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
if __name__ == "__main__":
|
| 169 |
demo.launch()
|
|
|
|
| 1 |
|
| 2 |
import gradio as gr
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
|
| 5 |
+
# Title shown in the browser tab and used for the gr.Blocks app.
APP_TITLE = "LLM Cost, Capacity & Latency Estimator"

# Markdown rendered at the top of the UI: explains the two cost models
# (managed API per-token vs self-hosted GPU per-hour) and the latency tab.
INTRO = """
# LLM Cost, Capacity & Latency Estimator

Estimate and compare **Managed API costs (per-token)** vs **Self-hosted GPU costs (per GPU-hour)**,
plus a **Latency Estimator** for prompt+generation timing.

**Utilization (0–1)** = fraction of time GPU is busy while powered on.
- Example: 0.6 → GPU is busy 60% of the time and idle 40%.
- Used to convert busy-hours → clock-hours: busy_hours / utilization.

**Latency Estimator**:
- Prefill (read prompt) is usually faster than decode (generate output).
- Apply a queue/burst factor to estimate p95 latency under load.
"""
|
| 21 |
|
| 22 |
def calc_tokens_per_patient(intake_tokens, review_tokens):
|
|
|
|
| 26 |
return (total_tokens / 1000.0) * price_per_1k_tokens
|
| 27 |
|
| 28 |
def gpu_busy_hours(total_tokens, toks_per_sec):
    """Return the hours of pure busy time to generate ``total_tokens``.

    The throughput is floored at a tiny positive value so a zero or
    invalid ``toks_per_sec`` cannot cause a division by zero.
    """
    seconds = total_tokens / max(toks_per_sec, 1e-9)
    return seconds / 3600.0
|
|
|
|
|
|
|
| 30 |
|
| 31 |
def gpu_costs(total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization):
    """Return GPU cost figures under two billing models.

    Returns a 5-tuple:
        (busy_hours, effective_busy_hours, busy_time_cost,
         billed_hours, fixed_uptime_cost)

    ``busy_hours`` is the idealized generation time; ``effective_busy_hours``
    scales it up by utilization (busy fraction of powered-on time), since a
    partially idle GPU needs more clock hours for the same work. The fixed
    model simply bills ``daily_uptime_hours`` every day of the month.
    """
    # Idealized busy time: tokens / throughput (throughput floored to
    # avoid division by zero) — same formula as gpu_busy_hours, inlined.
    busy_hours = total_tokens / max(toks_per_sec, 1e-9) / 3600.0

    # Clock hours and pay-per-busy-time cost; utilization floored so a
    # zero slider value cannot divide by zero.
    effective_busy_hours = busy_hours / max(utilization, 1e-6)
    busy_time_cost = effective_busy_hours * gpu_price_per_hr

    # Fixed-uptime model: the GPU is billed whenever it is powered on.
    billed_hours = daily_uptime_hours * days_per_month
    fixed_uptime_cost = billed_hours * gpu_price_per_hr

    return busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost
|
| 38 |
|
| 39 |
def per_patient(total_cost, num_patients):
    """Average cost per patient; 0.0 when there are no patients (avoids /0)."""
    if not num_patients:
        return 0.0
    return total_cost / num_patients
|
| 41 |
+
|
| 42 |
+
def calculate_costs(num_patients, intake_tokens, review_tokens,
|
| 43 |
+
price_per_1k_low, price_per_1k_high,
|
| 44 |
+
toks_per_sec, gpu_price_per_hr,
|
| 45 |
+
daily_uptime_hours, days_per_month, utilization):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
tokens_per_patient = calc_tokens_per_patient(intake_tokens, review_tokens)
|
| 47 |
total_tokens = tokens_per_patient * num_patients
|
|
|
|
|
|
|
| 48 |
api_cost_low = managed_api_cost(total_tokens, price_per_1k_low)
|
| 49 |
api_cost_high = managed_api_cost(total_tokens, price_per_1k_high)
|
|
|
|
|
|
|
| 50 |
busy_hours, effective_busy_hours, busy_time_cost, billed_hours, fixed_uptime_cost = gpu_costs(
|
| 51 |
+
total_tokens, toks_per_sec, gpu_price_per_hr, daily_uptime_hours, days_per_month, utilization)
|
|
|
|
|
|
|
|
|
|
| 52 |
api_pp_low = per_patient(api_cost_low, num_patients)
|
| 53 |
api_pp_high = per_patient(api_cost_high, num_patients)
|
| 54 |
gpu_pp_busy = per_patient(busy_time_cost, num_patients)
|
| 55 |
gpu_pp_fixed = per_patient(fixed_uptime_cost, num_patients)
|
| 56 |
|
|
|
|
| 57 |
df = pd.DataFrame({
|
| 58 |
"Metric": [
|
| 59 |
"Patients",
|
| 60 |
+
"Tokens per Patient",
|
| 61 |
"Total Tokens",
|
| 62 |
"API Cost (low)",
|
| 63 |
"API Cost (high)",
|
| 64 |
"GPU Busy Hours (idealized)",
|
| 65 |
+
"GPU Clock Hours @ Utilization",
|
| 66 |
+
"GPU Cost (busy-time)",
|
| 67 |
"GPU Billed Hours (fixed uptime)",
|
| 68 |
"GPU Cost (fixed uptime)",
|
| 69 |
"Per-Patient API Cost (low)",
|
|
|
|
| 75 |
num_patients,
|
| 76 |
tokens_per_patient,
|
| 77 |
f"{total_tokens:,}",
|
| 78 |
+
f"${api_cost_low:,.2f}",
|
| 79 |
+
f"${api_cost_high:,.2f}",
|
| 80 |
f"{busy_hours:,.2f}",
|
| 81 |
f"{effective_busy_hours:,.2f}",
|
| 82 |
+
f"${busy_time_cost:,.2f}",
|
| 83 |
f"{billed_hours:,.2f}",
|
| 84 |
+
f"${fixed_uptime_cost:,.2f}",
|
| 85 |
+
f"${api_pp_low:,.2f}",
|
| 86 |
+
f"${api_pp_high:,.2f}",
|
| 87 |
+
f"${gpu_pp_busy:,.2f}",
|
| 88 |
+
f"${gpu_pp_fixed:,.2f}",
|
| 89 |
]
|
| 90 |
})
|
| 91 |
+
notes = f"Total tokens: {total_tokens:,}. Utilization = fraction of time GPU is busy while powered on."
|
| 92 |
+
return df, notes
|
| 93 |
+
|
| 94 |
+
def latency_estimator(prompt_tokens, output_tokens,
                      prefill_tps, decode_tps, overhead_ms, queue_factor):
    """Estimate single-request latency from prefill/decode throughputs.

    Returns (DataFrame with a per-component breakdown, summary string).
    A falsy (0/None) speed skips that phase entirely; otherwise the
    denominator is floored at 1e-9 so near-zero speeds cannot blow up.
    """
    prefill_sec = 0.0
    if prefill_tps:
        prefill_sec = prompt_tokens / max(prefill_tps, 1e-9)
    decode_sec = 0.0
    if decode_tps:
        decode_sec = output_tokens / max(decode_tps, 1e-9)

    base_sec = prefill_sec + decode_sec + (overhead_ms / 1000.0)
    # A queue factor below 1 is treated as "no queueing" (factor of 1).
    p95_sec = base_sec * max(queue_factor, 1.0)

    rows = [
        ["Prefill time (s)", f"{prefill_sec:,.3f}"],
        ["Decode time (s)", f"{decode_sec:,.3f}"],
        ["Overhead (s)", f"{overhead_ms/1000.0:,.3f}"],
        ["Base total (s)", f"{base_sec:,.3f}"],
        ["Estimated p95 (s)", f"{p95_sec:,.3f}"],
    ]
    df = pd.DataFrame(rows, columns=["Component", "Seconds"])
    msg = f"Base: {base_sec:,.3f}s, p95 (×{queue_factor}): {p95_sec:,.3f}s"
    return df, msg
|
| 109 |
+
|
| 110 |
+
# Build the two-tab Gradio UI: cost/capacity comparison and latency estimator.
# NOTE(review): the exact Row/Column nesting below is reconstructed from a
# diff view with no reliable indentation — confirm layout against the live app.
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(INTRO)
    with gr.Tabs():
        with gr.Tab("Cost & Capacity"):
            with gr.Row():
                with gr.Column():
                    num_patients = gr.Number(value=500, label="Number of patients")
                    intake_tokens = gr.Number(value=2000, label="Intake tokens")
                    review_tokens = gr.Number(value=5000, label="Review tokens")
                with gr.Column():
                    price_per_1k_low = gr.Number(value=0.002, label="API Price Low ($/1K tok)")
                    price_per_1k_high = gr.Number(value=0.01, label="API Price High ($/1K tok)")
            with gr.Row():
                with gr.Column():
                    toks_per_sec = gr.Number(value=200, label="GPU Throughput (tok/s)")
                    gpu_price_per_hr = gr.Number(value=2.50, label="GPU Price ($/hr)")
                with gr.Column():
                    daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hr/day)")
                    days_per_month = gr.Number(value=30, label="Days/month")
            utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05,
                                    label="Utilization (0–1) = busy time fraction")
            calc_btn = gr.Button("Calculate Costs")
            out_table = gr.Dataframe(label="Summary")
            out_notes = gr.Textbox(label="Notes")
            # Input order must match calculate_costs' parameter order.
            cost_inputs = [num_patients, intake_tokens, review_tokens,
                           price_per_1k_low, price_per_1k_high,
                           toks_per_sec, gpu_price_per_hr,
                           daily_uptime_hours, days_per_month, utilization]
            calc_btn.click(calculate_costs, cost_inputs, [out_table, out_notes])

        with gr.Tab("Latency Estimator"):
            with gr.Row():
                with gr.Column():
                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens")
                    output_tokens = gr.Number(value=300, label="Output tokens")
                with gr.Column():
                    prefill_tps = gr.Number(value=1000, label="Prefill speed (tok/s)")
                    decode_tps = gr.Number(value=400, label="Decode speed (tok/s)")
            with gr.Row():
                overhead_ms = gr.Number(value=200, label="Overhead (ms)")
                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=3.0, step=0.05, label="Queue/Burst Factor ×")
            lat_btn = gr.Button("Estimate Latency")
            lat_table = gr.Dataframe(label="Latency Breakdown")
            lat_notes = gr.Textbox(label="Notes")
            lat_inputs = [prompt_tokens, output_tokens, prefill_tps, decode_tps, overhead_ms, queue_factor]
            lat_btn.click(latency_estimator, lat_inputs, [lat_table, lat_notes])

if __name__ == "__main__":
    demo.launch()
|
requirements.txt
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
-
gradio>=4.
|
| 2 |
-
pandas>=2.2.0
|
|
|
|
| 1 |
+
gradio>=4.31.0
|
| 2 |
+
pandas>=2.2.0
|