Add batch size calculator
README.md
CHANGED

@@ -1,5 +1,5 @@
---
-title: LLM Cost, Capacity &
emoji: 🧮
colorFrom: blue
colorTo: green
@@ -9,14 +9,13 @@ app_file: app.py
pinned: false
---

-# LLM Cost, Capacity &

-Fraction of time GPU is busy while powered on (0–1).

-1. Create HF Space → Gradio.
-2. Upload `app.py`, `requirements.txt`, `README.md`.
-3. Launch.
---
+title: LLM Cost, Capacity, Latency & Batch Sizer
emoji: 🧮
colorFrom: blue
colorTo: green
pinned: false
---

+# LLM Cost, Capacity, Latency & Batch Sizer

+Tabs:
+1) **Cost & Capacity** – Managed API vs GPU costs (busy-time vs scheduled uptime; set 24 h/day for always-on).
+2) **Latency Estimator** – prefill + decode + overhead, scaled by Queue/Burst factor for p95.
+3) **Batch Size Calculator** – computes theoretical & recommended safe batch from VRAM and KV-cache math.

+**KV cache rule**: `KV ≈ 2 × hidden_size × bytes/elem × layers × seq_len × batch_size`

+Use KV precision 4/8/16 bits, and reserve headroom to avoid OOMs.
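As a quick sanity check of the rule above, here is a worked example with illustrative numbers (it mirrors the math used by `batch_size_calculator` in `app.py` below; the model shape and VRAM figures are assumptions, not the app's defaults):

```python
# Illustrative only: 8-bit KV cache on a 24 GB GPU for a ~7B-class model shape.
hidden_size, num_layers = 4096, 32        # model shape (assumed)
bytes_per_elem = 1.0                      # 8-bit KV precision
seq_len = 4096                            # max tokens kept per sequence
vram_gb, weights_gb, overhead_gb, headroom = 24, 6, 2, 0.20

kv_per_token = 2 * hidden_size * bytes_per_elem * num_layers         # K and V for every layer
kv_per_seq_gb = kv_per_token * seq_len / 1024**3                     # -> 1.0 GB per sequence
free_gb = vram_gb - weights_gb - overhead_gb - vram_gb * headroom    # -> 11.2 GB left for KV
max_batch = int(free_gb // kv_per_seq_gb)                            # -> 11 theoretical
print(kv_per_seq_gb, free_gb, max_batch, int(max_batch * 0.5), int(max_batch * 0.7))
```

Each sequence needs about 1 GB of KV cache here, so roughly 11 sequences fit in theory and 5–7 is the safer operating range.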
app.py
CHANGED

@@ -2,23 +2,24 @@
import gradio as gr
import pandas as pd

-APP_TITLE = "LLM Cost, Capacity &

INTRO = """
-# LLM Cost, Capacity &

-Estimate

**Utilization (0–1)** = fraction of time GPU is busy while powered on.
-- Used to convert busy-hours → clock-hours: busy_hours / utilization.
-**Latency Estimator**:
-- Prefill (read prompt) is usually faster than decode (generate output).
-- Apply a queue/burst factor to estimate p95 latency under load.
"""

def calc_tokens_per_patient(intake_tokens, review_tokens):
    return intake_tokens + review_tokens

@@ -88,13 +89,17 @@ def calculate_costs(num_patients, intake_tokens, review_tokens,
            f"${gpu_pp_fixed:,.2f}",
        ]
    })
-    notes =
    return df, notes

def latency_estimator(prompt_tokens, output_tokens,
                      prefill_tps, decode_tps, overhead_ms, queue_factor):
-    prefill_sec = prompt_tokens / max(prefill_tps, 1e-9) if prefill_tps else 0.0
-    decode_sec = output_tokens / max(decode_tps, 1e-9) if decode_tps else 0.0
    base_sec = prefill_sec + decode_sec + (overhead_ms / 1000.0)
    p95_sec = base_sec * max(queue_factor, 1.0)
    df = pd.DataFrame([

@@ -107,18 +112,74 @@ def latency_estimator(prompt_tokens, output_tokens,
    msg = f"Base: {base_sec:,.3f}s, p95 (×{queue_factor}): {p95_sec:,.3f}s"
    return df, msg

with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(INTRO)
    with gr.Tabs():
        with gr.Tab("Cost & Capacity"):
            with gr.Row():
                with gr.Column():
                    num_patients = gr.Number(value=500, label="Number of patients")
-                    intake_tokens = gr.Number(value=2000, label="Intake tokens")
-                    review_tokens = gr.Number(value=5000, label="
                with gr.Column():
                    price_per_1k_low = gr.Number(value=0.002, label="API Price Low ($/1K tok)")
-                    price_per_1k_high = gr.Number(value=0.
            with gr.Row():
                with gr.Column():
                    toks_per_sec = gr.Number(value=200, label="GPU Throughput (tok/s)")

@@ -126,11 +187,10 @@ with gr.Blocks(title=APP_TITLE) as demo:
                with gr.Column():
                    daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hr/day)")
                    days_per_month = gr.Number(value=30, label="Days/month")
-                    utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05,
-                                            label="Utilization (0–1) = busy time fraction")
            calc_btn = gr.Button("Calculate Costs")
-            out_table = gr.Dataframe(label="Summary")
-            out_notes = gr.Textbox(label="Notes")
            calc_btn.click(calculate_costs,
                           [num_patients, intake_tokens, review_tokens,
                            price_per_1k_low, price_per_1k_high,

@@ -141,20 +201,42 @@ with gr.Blocks(title=APP_TITLE) as demo:
        with gr.Tab("Latency Estimator"):
            with gr.Row():
                with gr.Column():
-                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens")
-                    output_tokens = gr.Number(value=300, label="Output tokens")
                with gr.Column():
                    prefill_tps = gr.Number(value=1000, label="Prefill speed (tok/s)")
                    decode_tps = gr.Number(value=400, label="Decode speed (tok/s)")
            with gr.Row():
                overhead_ms = gr.Number(value=200, label="Overhead (ms)")
-                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=
            lat_btn = gr.Button("Estimate Latency")
-            lat_table = gr.Dataframe(label="Latency Breakdown")
-            lat_notes = gr.Textbox(label="Notes")
            lat_btn.click(latency_estimator,
                          [prompt_tokens, output_tokens, prefill_tps, decode_tps, overhead_ms, queue_factor],
                          [lat_table, lat_notes])

if __name__ == "__main__":
    demo.launch()
import gradio as gr
import pandas as pd

+APP_TITLE = "LLM Cost, Capacity, Latency & Batch Sizer"

INTRO = """
+# LLM Cost, Capacity, Latency & Batch Sizer

+Estimate:
+- **Costs** for Managed API vs Self-hosted GPU (busy-time & scheduled uptime)
+- **Latency** from prompt/output and token speeds
+- **Batch size** limits from GPU VRAM and KV-cache math

**Utilization (0–1)** = fraction of time GPU is busy while powered on.
+- Used to convert busy-hours → clock-hours: `effective_busy_hours = busy_hours / utilization`.
"""

+# --------------------
+# Cost & Capacity
+# --------------------
+
def calc_tokens_per_patient(intake_tokens, review_tokens):
    return intake_tokens + review_tokens

            f"${gpu_pp_fixed:,.2f}",
        ]
    })
+    notes = "Set hours/day=24 to simulate always-on. Utilization = fraction of time GPU is busy while powered on."
    return df, notes

+# --------------------
+# Latency Estimator
+# --------------------
+
def latency_estimator(prompt_tokens, output_tokens,
                      prefill_tps, decode_tps, overhead_ms, queue_factor):
+    prefill_sec = (prompt_tokens / max(prefill_tps, 1e-9)) if prefill_tps else 0.0
+    decode_sec = (output_tokens / max(decode_tps, 1e-9)) if decode_tps else 0.0
    base_sec = prefill_sec + decode_sec + (overhead_ms / 1000.0)
    p95_sec = base_sec * max(queue_factor, 1.0)
    df = pd.DataFrame([

    msg = f"Base: {base_sec:,.3f}s, p95 (×{queue_factor}): {p95_sec:,.3f}s"
    return df, msg

+# --------------------
+# Batch Size Calculator
+# --------------------
+
+def kv_bytes_per_elem(kv_precision_bits):
+    if kv_precision_bits == 16:
+        return 2.0
+    if kv_precision_bits == 8:
+        return 1.0
+    if kv_precision_bits == 4:
+        return 0.5
+    return 2.0
+
+def batch_size_calculator(
+    gpu_vram_gb,
+    model_weights_gb,
+    runtime_overhead_gb,
+    hidden_size,
+    num_layers,
+    kv_precision_bits,
+    max_seq_len_tokens,
+    reserve_headroom_frac
+):
+    bytes_per_elem = kv_bytes_per_elem(kv_precision_bits)
+    kv_per_token_bytes = 2.0 * hidden_size * bytes_per_elem * num_layers
+    kv_per_seq_bytes = kv_per_token_bytes * max_seq_len_tokens
+
+    total_vram_bytes = gpu_vram_gb * (1024**3)
+    used_bytes = (model_weights_gb + runtime_overhead_gb) * (1024**3)
+    reserve_bytes = total_vram_bytes * reserve_headroom_frac
+    free_for_kv = max(total_vram_bytes - used_bytes - reserve_bytes, 0)
+
+    theoretical_batch = int(free_for_kv // max(kv_per_seq_bytes, 1))
+    safe_batch_low = int(max(theoretical_batch * 0.5, 1))
+    safe_batch_high = int(max(theoretical_batch * 0.7, 1))
+
+    rows = [
+        ["GPU VRAM (GB)", f"{gpu_vram_gb}"],
+        ["Model weights (GB)", f"{model_weights_gb}"],
+        ["Runtime overhead (GB)", f"{runtime_overhead_gb}"],
+        ["Hidden size", f"{hidden_size}"],
+        ["Layers", f"{num_layers}"],
+        ["KV precision (bits)", f"{kv_precision_bits}"],
+        ["Max seq length (tokens)", f"{max_seq_len_tokens}"],
+        ["Reserve headroom (%)", f"{int(reserve_headroom_frac*100)}%"],
+        ["KV bytes / token", f"{kv_per_token_bytes:,.0f}"],
+        ["KV per sequence (GB)", f"{kv_per_seq_bytes / (1024**3):,.3f}"],
+        ["Free VRAM for KV (GB)", f"{free_for_kv / (1024**3):,.2f}"],
+        ["Max theoretical batch", f"{theoretical_batch}"],
+        ["Recommended safe batch", f"{safe_batch_low}–{safe_batch_high}"],
+    ]
+    df = pd.DataFrame(rows, columns=["Parameter", "Value"])
+    note = "Recommended safe batch is ~50–70% of theoretical to avoid OOM and keep p95 latency stable."
+    return df, note
+
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(INTRO)
+
    with gr.Tabs():
        with gr.Tab("Cost & Capacity"):
            with gr.Row():
                with gr.Column():
                    num_patients = gr.Number(value=500, label="Number of patients")
+                    intake_tokens = gr.Number(value=2000, label="Intake tokens per patient")
+                    review_tokens = gr.Number(value=5000, label="Clinician review tokens per patient")
                with gr.Column():
                    price_per_1k_low = gr.Number(value=0.002, label="API Price Low ($/1K tok)")
+                    price_per_1k_high = gr.Number(value=0.010, label="API Price High ($/1K tok)")
            with gr.Row():
                with gr.Column():
                    toks_per_sec = gr.Number(value=200, label="GPU Throughput (tok/s)")

                with gr.Column():
                    daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hr/day)")
                    days_per_month = gr.Number(value=30, label="Days/month")
+                    utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05, label="Utilization (0–1)")
            calc_btn = gr.Button("Calculate Costs")
+            out_table = gr.Dataframe(label="Summary", interactive=False)
+            out_notes = gr.Textbox(label="Notes", interactive=False)
            calc_btn.click(calculate_costs,
                           [num_patients, intake_tokens, review_tokens,
                            price_per_1k_low, price_per_1k_high,

        with gr.Tab("Latency Estimator"):
            with gr.Row():
                with gr.Column():
+                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens (input)")
+                    output_tokens = gr.Number(value=300, label="Output tokens (generated)")
                with gr.Column():
                    prefill_tps = gr.Number(value=1000, label="Prefill speed (tok/s)")
                    decode_tps = gr.Number(value=400, label="Decode speed (tok/s)")
            with gr.Row():
                overhead_ms = gr.Number(value=200, label="Overhead (ms)")
+                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=10.0, step=0.1, label="Queue/Burst Factor (×)")
            lat_btn = gr.Button("Estimate Latency")
+            lat_table = gr.Dataframe(label="Latency Breakdown", interactive=False)
+            lat_notes = gr.Textbox(label="Notes", interactive=False)
            lat_btn.click(latency_estimator,
                          [prompt_tokens, output_tokens, prefill_tps, decode_tps, overhead_ms, queue_factor],
                          [lat_table, lat_notes])

+        with gr.Tab("Batch Size Calculator"):
+            with gr.Row():
+                with gr.Column():
+                    gpu_vram_gb = gr.Number(value=24, label="GPU VRAM (GB)")
+                    model_weights_gb = gr.Number(value=6, label="Model weights (GB)")
+                    runtime_overhead_gb = gr.Number(value=2, label="Runtime overhead (GB)")
+                with gr.Column():
+                    hidden_size = gr.Number(value=4096, label="Hidden size (d_model)")
+                    num_layers = gr.Number(value=32, label="Layers (transformer blocks)")
+                    kv_precision_bits = gr.Dropdown(choices=[4,8,16], value=4, label="KV precision (bits)")
+            with gr.Row():
+                max_seq_len_tokens = gr.Number(value=4096, label="Max sequence length (tokens)")
+                reserve_headroom_frac = gr.Slider(value=0.2, minimum=0.0, maximum=0.5, step=0.05, label="Reserve headroom (fraction)")
+            batch_btn = gr.Button("Calculate Batch Size")
+            batch_table = gr.Dataframe(label="Batch Sizing Result", interactive=False)
+            batch_notes = gr.Textbox(label="Notes", interactive=False)
+            batch_btn.click(batch_size_calculator,
+                            [gpu_vram_gb, model_weights_gb, runtime_overhead_gb,
+                             hidden_size, num_layers, kv_precision_bits,
+                             max_seq_len_tokens, reserve_headroom_frac],
+                            [batch_table, batch_notes])
+
if __name__ == "__main__":
    demo.launch()
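For a quick check outside the Gradio UI, the two estimators can be driven directly with the tab defaults shown above (a minimal sketch; it assumes `app.py` is importable from the working directory):

```python
# Smoke test with the default values from the Latency and Batch Size tabs.
from app import latency_estimator, batch_size_calculator

lat_df, lat_msg = latency_estimator(prompt_tokens=8000, output_tokens=300,
                                    prefill_tps=1000, decode_tps=400,
                                    overhead_ms=200, queue_factor=1.3)
print(lat_msg)  # prefill 8.00s + decode 0.75s + 0.20s overhead = 8.95s base, ~11.64s p95

batch_df, batch_note = batch_size_calculator(gpu_vram_gb=24, model_weights_gb=6,
                                             runtime_overhead_gb=2, hidden_size=4096,
                                             num_layers=32, kv_precision_bits=4,
                                             max_seq_len_tokens=4096,
                                             reserve_headroom_frac=0.2)
print(batch_df.to_string(index=False))  # 4-bit KV: ~0.5 GB/sequence -> 22 theoretical, 11–15 recommended
```

Importing `app` builds the Blocks UI but does not launch it, since `demo.launch()` is guarded by `if __name__ == "__main__"`.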