xiaosuhu1986 committed
Commit 9052fea · verified · 1 Parent(s): 4a73369

Add batch size calculator

Files changed (2):
  1. README.md +8 -9
  2. app.py +107 -25
README.md CHANGED

@@ -1,5 +1,5 @@
 ---
-title: LLM Cost, Capacity & Latency Estimator
+title: LLM Cost, Capacity, Latency & Batch Sizer
 emoji: 🧮
 colorFrom: blue
 colorTo: green
@@ -9,14 +9,13 @@ app_file: app.py
 pinned: false
 ---
 
-# LLM Cost, Capacity & Latency Estimator
+# LLM Cost, Capacity, Latency & Batch Sizer
 
-Compare API vs GPU costs and estimate latency.
-
-## Utilization
-Fraction of time GPU is busy while powered on (0–1).
-
-## How to deploy
-1. Create HF Space → Gradio.
-2. Upload `app.py`, `requirements.txt`, `README.md`.
-3. Launch.
+Tabs:
+1) **Cost & Capacity** – Managed API vs GPU costs (busy-time vs scheduled uptime; set 24 h/day for always-on).
+2) **Latency Estimator** – prefill + decode + overhead, scaled by a Queue/Burst factor for p95.
+3) **Batch Size Calculator** – computes theoretical & recommended safe batch sizes from VRAM and KV-cache math.
+
+**KV cache rule**: `KV ≈ 2 × hidden_size × bytes/elem × layers × seq_len × batch_size`
+
+Use KV precision of 4/8/16 bits and reserve headroom to avoid OOMs.
 
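For a sense of scale, the KV-cache rule above can be checked by hand. A minimal sketch using the model geometry from the app's defaults (hidden size 4096, 32 layers) at 16-bit KV precision; the batch size of 8 is an assumed example value:

```python
# KV ≈ 2 × hidden_size × bytes/elem × layers × seq_len × batch_size
hidden_size, layers = 4096, 32
bytes_per_elem = 2               # 16-bit KV precision -> 2 bytes per element
seq_len, batch_size = 4096, 8    # batch_size=8 is an assumed example value

kv_bytes = 2 * hidden_size * bytes_per_elem * layers * seq_len * batch_size
print(f"{kv_bytes / 1024**3:.1f} GiB")  # -> 16.0 GiB
```

At 4-bit KV precision the same workload needs a quarter of that (4 GiB), which is why the calculator exposes precision as an input.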
app.py CHANGED

@@ -2,23 +2,24 @@
 import gradio as gr
 import pandas as pd
 
-APP_TITLE = "LLM Cost, Capacity & Latency Estimator"
+APP_TITLE = "LLM Cost, Capacity, Latency & Batch Sizer"
 
 INTRO = """
-# LLM Cost, Capacity & Latency Estimator
+# LLM Cost, Capacity, Latency & Batch Sizer
 
-Estimate and compare **Managed API costs (per-token)** vs **Self-hosted GPU costs (per GPU-hour)**,
-plus a **Latency Estimator** for prompt+generation timing.
+Estimate:
+- **Costs** for Managed API vs Self-hosted GPU (busy-time & scheduled uptime)
+- **Latency** from prompt/output and token speeds
+- **Batch size** limits from GPU VRAM and KV-cache math
 
 **Utilization (0–1)** = fraction of time GPU is busy while powered on.
-- Example: 0.6 → GPU is busy 60% of the time and idle 40%.
-- Used to convert busy-hours → clock-hours: busy_hours / utilization.
-
-**Latency Estimator**:
-- Prefill (read prompt) is usually faster than decode (generate output).
-- Apply a queue/burst factor to estimate p95 latency under load.
+- Used to convert busy-hours → clock-hours: `effective_busy_hours = busy_hours / utilization`.
 """
 
+# --------------------
+# Cost & Capacity
+# --------------------
+
 def calc_tokens_per_patient(intake_tokens, review_tokens):
     return intake_tokens + review_tokens
 
@@ -88,13 +89,17 @@ def calculate_costs(num_patients, intake_tokens, review_tokens,
             f"${gpu_pp_fixed:,.2f}",
         ]
     })
-    notes = f"Total tokens: {total_tokens:,}. Utilization = fraction of time GPU is busy while powered on."
+    notes = "Set hours/day=24 to simulate always-on. Utilization = fraction of time GPU is busy while powered on."
     return df, notes
 
+# --------------------
+# Latency Estimator
+# --------------------
+
 def latency_estimator(prompt_tokens, output_tokens,
                       prefill_tps, decode_tps, overhead_ms, queue_factor):
-    prefill_sec = prompt_tokens / max(prefill_tps, 1e-9) if prefill_tps else 0.0
-    decode_sec = output_tokens / max(decode_tps, 1e-9) if decode_tps else 0.0
+    prefill_sec = (prompt_tokens / max(prefill_tps, 1e-9)) if prefill_tps else 0.0
+    decode_sec = (output_tokens / max(decode_tps, 1e-9)) if decode_tps else 0.0
     base_sec = prefill_sec + decode_sec + (overhead_ms / 1000.0)
     p95_sec = base_sec * max(queue_factor, 1.0)
     df = pd.DataFrame([
@@ -107,18 +112,74 @@
     msg = f"Base: {base_sec:,.3f}s, p95 (×{queue_factor}): {p95_sec:,.3f}s"
     return df, msg
 
+# --------------------
+# Batch Size Calculator
+# --------------------
+
+def kv_bytes_per_elem(kv_precision_bits):
+    if kv_precision_bits == 16:
+        return 2.0
+    if kv_precision_bits == 8:
+        return 1.0
+    if kv_precision_bits == 4:
+        return 0.5
+    return 2.0
+
+def batch_size_calculator(
+    gpu_vram_gb,
+    model_weights_gb,
+    runtime_overhead_gb,
+    hidden_size,
+    num_layers,
+    kv_precision_bits,
+    max_seq_len_tokens,
+    reserve_headroom_frac
+):
+    bytes_per_elem = kv_bytes_per_elem(kv_precision_bits)
+    kv_per_token_bytes = 2.0 * hidden_size * bytes_per_elem * num_layers
+    kv_per_seq_bytes = kv_per_token_bytes * max_seq_len_tokens
+
+    total_vram_bytes = gpu_vram_gb * (1024**3)
+    used_bytes = (model_weights_gb + runtime_overhead_gb) * (1024**3)
+    reserve_bytes = total_vram_bytes * reserve_headroom_frac
+    free_for_kv = max(total_vram_bytes - used_bytes - reserve_bytes, 0)
+
+    theoretical_batch = int(free_for_kv // max(kv_per_seq_bytes, 1))
+    safe_batch_low = int(max(theoretical_batch * 0.5, 1))
+    safe_batch_high = int(max(theoretical_batch * 0.7, 1))
+
+    rows = [
+        ["GPU VRAM (GB)", f"{gpu_vram_gb}"],
+        ["Model weights (GB)", f"{model_weights_gb}"],
+        ["Runtime overhead (GB)", f"{runtime_overhead_gb}"],
+        ["Hidden size", f"{hidden_size}"],
+        ["Layers", f"{num_layers}"],
+        ["KV precision (bits)", f"{kv_precision_bits}"],
+        ["Max seq length (tokens)", f"{max_seq_len_tokens}"],
+        ["Reserve headroom (%)", f"{int(reserve_headroom_frac*100)}%"],
+        ["KV bytes / token", f"{kv_per_token_bytes:,.0f}"],
+        ["KV per sequence (GB)", f"{kv_per_seq_bytes / (1024**3):,.3f}"],
+        ["Free VRAM for KV (GB)", f"{free_for_kv / (1024**3):,.2f}"],
+        ["Max theoretical batch", f"{theoretical_batch}"],
+        ["Recommended safe batch", f"{safe_batch_low}–{safe_batch_high}"],
+    ]
+    df = pd.DataFrame(rows, columns=["Parameter", "Value"])
+    note = "Recommended safe batch is ~50–70% of theoretical to avoid OOM and keep p95 latency stable."
+    return df, note
+
 with gr.Blocks(title=APP_TITLE) as demo:
     gr.Markdown(INTRO)
+
     with gr.Tabs():
         with gr.Tab("Cost & Capacity"):
             with gr.Row():
                 with gr.Column():
                     num_patients = gr.Number(value=500, label="Number of patients")
-                    intake_tokens = gr.Number(value=2000, label="Intake tokens")
-                    review_tokens = gr.Number(value=5000, label="Review tokens")
+                    intake_tokens = gr.Number(value=2000, label="Intake tokens per patient")
+                    review_tokens = gr.Number(value=5000, label="Clinician review tokens per patient")
                 with gr.Column():
                     price_per_1k_low = gr.Number(value=0.002, label="API Price Low ($/1K tok)")
-                    price_per_1k_high = gr.Number(value=0.01, label="API Price High ($/1K tok)")
+                    price_per_1k_high = gr.Number(value=0.010, label="API Price High ($/1K tok)")
             with gr.Row():
                 with gr.Column():
                     toks_per_sec = gr.Number(value=200, label="GPU Throughput (tok/s)")
@@ -126,11 +187,10 @@ with gr.Blocks(title=APP_TITLE) as demo:
                 with gr.Column():
                     daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hr/day)")
                     days_per_month = gr.Number(value=30, label="Days/month")
-            utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05,
-                                    label="Utilization (0–1) = busy time fraction")
+            utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05, label="Utilization (0–1)")
             calc_btn = gr.Button("Calculate Costs")
-            out_table = gr.Dataframe(label="Summary")
-            out_notes = gr.Textbox(label="Notes")
+            out_table = gr.Dataframe(label="Summary", interactive=False)
+            out_notes = gr.Textbox(label="Notes", interactive=False)
             calc_btn.click(calculate_costs,
                            [num_patients, intake_tokens, review_tokens,
                             price_per_1k_low, price_per_1k_high,
@@ -141,20 +201,42 @@ with gr.Blocks(title=APP_TITLE) as demo:
         with gr.Tab("Latency Estimator"):
             with gr.Row():
                 with gr.Column():
-                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens")
-                    output_tokens = gr.Number(value=300, label="Output tokens")
+                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens (input)")
+                    output_tokens = gr.Number(value=300, label="Output tokens (generated)")
                 with gr.Column():
                     prefill_tps = gr.Number(value=1000, label="Prefill speed (tok/s)")
                     decode_tps = gr.Number(value=400, label="Decode speed (tok/s)")
             with gr.Row():
                 overhead_ms = gr.Number(value=200, label="Overhead (ms)")
-                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=3.0, step=0.05, label="Queue/Burst Factor ×")
+                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=10.0, step=0.1, label="Queue/Burst Factor (×)")
             lat_btn = gr.Button("Estimate Latency")
-            lat_table = gr.Dataframe(label="Latency Breakdown")
-            lat_notes = gr.Textbox(label="Notes")
+            lat_table = gr.Dataframe(label="Latency Breakdown", interactive=False)
+            lat_notes = gr.Textbox(label="Notes", interactive=False)
             lat_btn.click(latency_estimator,
                           [prompt_tokens, output_tokens, prefill_tps, decode_tps, overhead_ms, queue_factor],
                           [lat_table, lat_notes])
 
+        with gr.Tab("Batch Size Calculator"):
+            with gr.Row():
+                with gr.Column():
+                    gpu_vram_gb = gr.Number(value=24, label="GPU VRAM (GB)")
+                    model_weights_gb = gr.Number(value=6, label="Model weights (GB)")
+                    runtime_overhead_gb = gr.Number(value=2, label="Runtime overhead (GB)")
+                with gr.Column():
+                    hidden_size = gr.Number(value=4096, label="Hidden size (d_model)")
+                    num_layers = gr.Number(value=32, label="Layers (transformer blocks)")
+                    kv_precision_bits = gr.Dropdown(choices=[4, 8, 16], value=4, label="KV precision (bits)")
+            with gr.Row():
+                max_seq_len_tokens = gr.Number(value=4096, label="Max sequence length (tokens)")
+                reserve_headroom_frac = gr.Slider(value=0.2, minimum=0.0, maximum=0.5, step=0.05, label="Reserve headroom (fraction)")
+            batch_btn = gr.Button("Calculate Batch Size")
+            batch_table = gr.Dataframe(label="Batch Sizing Result", interactive=False)
+            batch_notes = gr.Textbox(label="Notes", interactive=False)
+            batch_btn.click(batch_size_calculator,
+                            [gpu_vram_gb, model_weights_gb, runtime_overhead_gb,
+                             hidden_size, num_layers, kv_precision_bits,
+                             max_seq_len_tokens, reserve_headroom_frac],
+                            [batch_table, batch_notes])
+
 if __name__ == "__main__":
     demo.launch()
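To make the Cost & Capacity numbers concrete: the full `calculate_costs` body is outside these hunks, so the sketch below only applies the documented conversions to the tab's default inputs (500 patients × 7,000 tokens each):

```python
# Hand-traced Cost & Capacity defaults; calculate_costs itself is not
# shown in full in this diff, so this only applies the stated formulas.
total_tokens = 500 * (2000 + 5000)       # 3,500,000 tokens

api_low  = total_tokens / 1000 * 0.002   # $7.00  at $0.002/1K tok
api_high = total_tokens / 1000 * 0.010   # $35.00 at $0.010/1K tok

busy_hours  = total_tokens / 200 / 3600  # ≈ 4.86 h busy at 200 tok/s
clock_hours = busy_hours / 0.6           # ≈ 8.10 clock-hours at 60% utilization
```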
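Likewise, plugging the Latency Estimator defaults into `latency_estimator`'s formula (prefill + decode + overhead, then the queue factor for p95):

```python
# Latency Estimator defaults traced through the formula in the diff
prefill_sec = 8000 / 1000                            # 8.00 s prefill
decode_sec  = 300 / 400                              # 0.75 s decode
base_sec = prefill_sec + decode_sec + 200 / 1000.0   # 8.95 s base
p95_sec  = base_sec * 1.3                            # ≈ 11.64 s at queue factor 1.3
```

Note the defaults are prefill-dominated: the 8,000-token prompt costs far more time than the 300 generated tokens.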
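As a usage sketch, the new `batch_size_calculator` can also be exercised outside the UI. Calling it with the tab's default inputs (function body shown in full in the diff above); the expected arithmetic is annotated:

```python
# Direct call with the Batch Size Calculator tab's defaults.
df, note = batch_size_calculator(
    gpu_vram_gb=24, model_weights_gb=6, runtime_overhead_gb=2,
    hidden_size=4096, num_layers=32, kv_precision_bits=4,
    max_seq_len_tokens=4096, reserve_headroom_frac=0.2,
)
# KV/token = 2 × 4096 × 0.5 × 32 = 131,072 B; KV/sequence = 0.5 GiB;
# free for KV = 24 − (6 + 2) − 4.8 = 11.2 GiB
# → theoretical batch 22, recommended safe batch 11–15.
print(df.to_string(index=False))
```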