Add batch size calculator
README.md
CHANGED

@@ -1,5 +1,5 @@
---
-title: LLM Cost, Capacity &
emoji: 🧮
colorFrom: blue
colorTo: green
@@ -9,14 +9,13 @@ app_file: app.py
pinned: false
---

-# LLM Cost, Capacity &

-Fraction of time GPU is busy while powered on (0–1).

-1. Create HF Space → Gradio.
-2. Upload `app.py`, `requirements.txt`, `README.md`.
-3. Launch.
---
+title: LLM Cost, Capacity, Latency & Batch Sizer
emoji: 🧮
colorFrom: blue
colorTo: green
pinned: false
---

+# LLM Cost, Capacity, Latency & Batch Sizer

+Tabs:
+1) **Cost & Capacity** – Managed API vs GPU costs (busy-time vs scheduled uptime; set 24 h/day for always-on).
+2) **Latency Estimator** – prefill + decode + overhead, scaled by Queue/Burst factor for p95.
+3) **Batch Size Calculator** – computes theoretical & recommended safe batch from VRAM and KV-cache math.

+**KV cache rule**: `KV ≈ 2 × hidden_size × bytes/elem × layers × seq_len × batch_size`

+Use KV precision 4/8/16 bits, and reserve headroom to avoid OOMs.
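As a quick sanity check of the rule above, here is a worked example with illustrative numbers (it mirrors the math used by `batch_size_calculator` in `app.py` below; the model shape and VRAM figures are assumptions, not the app's defaults):

```python
# Illustrative only: 8-bit KV cache on a 24 GB GPU for a ~7B-class model shape.
hidden_size, num_layers = 4096, 32        # model shape (assumed)
bytes_per_elem = 1.0                      # 8-bit KV precision
seq_len = 4096                            # max tokens kept per sequence
vram_gb, weights_gb, overhead_gb, headroom = 24, 6, 2, 0.20

kv_per_token = 2 * hidden_size * bytes_per_elem * num_layers         # K and V for every layer
kv_per_seq_gb = kv_per_token * seq_len / 1024**3                     # -> 1.0 GB per sequence
free_gb = vram_gb - weights_gb - overhead_gb - vram_gb * headroom    # -> 11.2 GB left for KV
max_batch = int(free_gb // kv_per_seq_gb)                            # -> 11 theoretical
print(kv_per_seq_gb, free_gb, max_batch, int(max_batch * 0.5), int(max_batch * 0.7))
```

Each sequence needs about 1 GB of KV cache here, so roughly 11 sequences fit in theory and 5–7 is the safer operating range.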
app.py
CHANGED

@@ -2,23 +2,24 @@
import gradio as gr
import pandas as pd

-APP_TITLE = "LLM Cost, Capacity &

INTRO = """
-# LLM Cost, Capacity &

-Estimate

**Utilization (0–1)** = fraction of time GPU is busy while powered on.
-- Used to convert busy-hours → clock-hours: busy_hours / utilization.
-**Latency Estimator**:
-- Prefill (read prompt) is usually faster than decode (generate output).
-- Apply a queue/burst factor to estimate p95 latency under load.
"""

def calc_tokens_per_patient(intake_tokens, review_tokens):
    return intake_tokens + review_tokens

@@ -88,13 +89,17 @@ def calculate_costs(num_patients, intake_tokens, review_tokens,
            f"${gpu_pp_fixed:,.2f}",
        ]
    })
-    notes =
    return df, notes

def latency_estimator(prompt_tokens, output_tokens,
                      prefill_tps, decode_tps, overhead_ms, queue_factor):
-    prefill_sec = prompt_tokens / max(prefill_tps, 1e-9) if prefill_tps else 0.0
-    decode_sec = output_tokens / max(decode_tps, 1e-9) if decode_tps else 0.0
    base_sec = prefill_sec + decode_sec + (overhead_ms / 1000.0)
    p95_sec = base_sec * max(queue_factor, 1.0)
    df = pd.DataFrame([

@@ -107,18 +112,74 @@ def latency_estimator(prompt_tokens, output_tokens,
    msg = f"Base: {base_sec:,.3f}s, p95 (×{queue_factor}): {p95_sec:,.3f}s"
    return df, msg

with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(INTRO)
    with gr.Tabs():
        with gr.Tab("Cost & Capacity"):
            with gr.Row():
                with gr.Column():
                    num_patients = gr.Number(value=500, label="Number of patients")
-                    intake_tokens = gr.Number(value=2000, label="Intake tokens")
-                    review_tokens = gr.Number(value=5000, label="
                with gr.Column():
                    price_per_1k_low = gr.Number(value=0.002, label="API Price Low ($/1K tok)")
-                    price_per_1k_high = gr.Number(value=0.
            with gr.Row():
                with gr.Column():
                    toks_per_sec = gr.Number(value=200, label="GPU Throughput (tok/s)")

@@ -126,11 +187,10 @@ with gr.Blocks(title=APP_TITLE) as demo:
                with gr.Column():
                    daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hr/day)")
                    days_per_month = gr.Number(value=30, label="Days/month")
-                    utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05,
-                                            label="Utilization (0–1) = busy time fraction")
            calc_btn = gr.Button("Calculate Costs")
-            out_table = gr.Dataframe(label="Summary")
-            out_notes = gr.Textbox(label="Notes")
            calc_btn.click(calculate_costs,
                           [num_patients, intake_tokens, review_tokens,
                            price_per_1k_low, price_per_1k_high,

@@ -141,20 +201,42 @@ with gr.Blocks(title=APP_TITLE) as demo:
        with gr.Tab("Latency Estimator"):
            with gr.Row():
                with gr.Column():
-                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens")
-                    output_tokens = gr.Number(value=300, label="Output tokens")
                with gr.Column():
                    prefill_tps = gr.Number(value=1000, label="Prefill speed (tok/s)")
                    decode_tps = gr.Number(value=400, label="Decode speed (tok/s)")
            with gr.Row():
                overhead_ms = gr.Number(value=200, label="Overhead (ms)")
-                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=
            lat_btn = gr.Button("Estimate Latency")
-            lat_table = gr.Dataframe(label="Latency Breakdown")
-            lat_notes = gr.Textbox(label="Notes")
            lat_btn.click(latency_estimator,
                          [prompt_tokens, output_tokens, prefill_tps, decode_tps, overhead_ms, queue_factor],
                          [lat_table, lat_notes])

if __name__ == "__main__":
    demo.launch()
import gradio as gr
import pandas as pd

+APP_TITLE = "LLM Cost, Capacity, Latency & Batch Sizer"

INTRO = """
+# LLM Cost, Capacity, Latency & Batch Sizer

+Estimate:
+- **Costs** for Managed API vs Self-hosted GPU (busy-time & scheduled uptime)
+- **Latency** from prompt/output and token speeds
+- **Batch size** limits from GPU VRAM and KV-cache math

**Utilization (0–1)** = fraction of time GPU is busy while powered on.
+- Used to convert busy-hours → clock-hours: `effective_busy_hours = busy_hours / utilization`.
"""

+# --------------------
+# Cost & Capacity
+# --------------------
+
def calc_tokens_per_patient(intake_tokens, review_tokens):
    return intake_tokens + review_tokens

            f"${gpu_pp_fixed:,.2f}",
        ]
    })
+    notes = "Set hours/day=24 to simulate always-on. Utilization = fraction of time GPU is busy while powered on."
    return df, notes

+# --------------------
+# Latency Estimator
+# --------------------
+
def latency_estimator(prompt_tokens, output_tokens,
                      prefill_tps, decode_tps, overhead_ms, queue_factor):
+    prefill_sec = (prompt_tokens / max(prefill_tps, 1e-9)) if prefill_tps else 0.0
+    decode_sec = (output_tokens / max(decode_tps, 1e-9)) if decode_tps else 0.0
    base_sec = prefill_sec + decode_sec + (overhead_ms / 1000.0)
    p95_sec = base_sec * max(queue_factor, 1.0)
    df = pd.DataFrame([

    msg = f"Base: {base_sec:,.3f}s, p95 (×{queue_factor}): {p95_sec:,.3f}s"
    return df, msg

+# --------------------
+# Batch Size Calculator
+# --------------------
+
+def kv_bytes_per_elem(kv_precision_bits):
+    if kv_precision_bits == 16:
+        return 2.0
+    if kv_precision_bits == 8:
+        return 1.0
+    if kv_precision_bits == 4:
+        return 0.5
+    return 2.0
+
+def batch_size_calculator(
+    gpu_vram_gb,
+    model_weights_gb,
+    runtime_overhead_gb,
+    hidden_size,
+    num_layers,
+    kv_precision_bits,
+    max_seq_len_tokens,
+    reserve_headroom_frac
+):
+    bytes_per_elem = kv_bytes_per_elem(kv_precision_bits)
+    kv_per_token_bytes = 2.0 * hidden_size * bytes_per_elem * num_layers
+    kv_per_seq_bytes = kv_per_token_bytes * max_seq_len_tokens
+
+    total_vram_bytes = gpu_vram_gb * (1024**3)
+    used_bytes = (model_weights_gb + runtime_overhead_gb) * (1024**3)
+    reserve_bytes = total_vram_bytes * reserve_headroom_frac
+    free_for_kv = max(total_vram_bytes - used_bytes - reserve_bytes, 0)
+
+    theoretical_batch = int(free_for_kv // max(kv_per_seq_bytes, 1))
+    safe_batch_low = int(max(theoretical_batch * 0.5, 1))
+    safe_batch_high = int(max(theoretical_batch * 0.7, 1))
+
+    rows = [
+        ["GPU VRAM (GB)", f"{gpu_vram_gb}"],
+        ["Model weights (GB)", f"{model_weights_gb}"],
+        ["Runtime overhead (GB)", f"{runtime_overhead_gb}"],
+        ["Hidden size", f"{hidden_size}"],
+        ["Layers", f"{num_layers}"],
+        ["KV precision (bits)", f"{kv_precision_bits}"],
+        ["Max seq length (tokens)", f"{max_seq_len_tokens}"],
+        ["Reserve headroom (%)", f"{int(reserve_headroom_frac*100)}%"],
+        ["KV bytes / token", f"{kv_per_token_bytes:,.0f}"],
+        ["KV per sequence (GB)", f"{kv_per_seq_bytes / (1024**3):,.3f}"],
+        ["Free VRAM for KV (GB)", f"{free_for_kv / (1024**3):,.2f}"],
+        ["Max theoretical batch", f"{theoretical_batch}"],
+        ["Recommended safe batch", f"{safe_batch_low}–{safe_batch_high}"],
+    ]
+    df = pd.DataFrame(rows, columns=["Parameter", "Value"])
+    note = "Recommended safe batch is ~50–70% of theoretical to avoid OOM and keep p95 latency stable."
+    return df, note
+
with gr.Blocks(title=APP_TITLE) as demo:
    gr.Markdown(INTRO)
+
    with gr.Tabs():
        with gr.Tab("Cost & Capacity"):
            with gr.Row():
                with gr.Column():
                    num_patients = gr.Number(value=500, label="Number of patients")
+                    intake_tokens = gr.Number(value=2000, label="Intake tokens per patient")
+                    review_tokens = gr.Number(value=5000, label="Clinician review tokens per patient")
                with gr.Column():
                    price_per_1k_low = gr.Number(value=0.002, label="API Price Low ($/1K tok)")
+                    price_per_1k_high = gr.Number(value=0.010, label="API Price High ($/1K tok)")
            with gr.Row():
                with gr.Column():
                    toks_per_sec = gr.Number(value=200, label="GPU Throughput (tok/s)")

                with gr.Column():
                    daily_uptime_hours = gr.Number(value=8, label="Billed uptime (hr/day)")
                    days_per_month = gr.Number(value=30, label="Days/month")
+                    utilization = gr.Slider(value=0.6, minimum=0.1, maximum=1.0, step=0.05, label="Utilization (0–1)")
            calc_btn = gr.Button("Calculate Costs")
+            out_table = gr.Dataframe(label="Summary", interactive=False)
+            out_notes = gr.Textbox(label="Notes", interactive=False)
            calc_btn.click(calculate_costs,
                           [num_patients, intake_tokens, review_tokens,
                            price_per_1k_low, price_per_1k_high,

        with gr.Tab("Latency Estimator"):
            with gr.Row():
                with gr.Column():
+                    prompt_tokens = gr.Number(value=8000, label="Prompt tokens (input)")
+                    output_tokens = gr.Number(value=300, label="Output tokens (generated)")
                with gr.Column():
                    prefill_tps = gr.Number(value=1000, label="Prefill speed (tok/s)")
                    decode_tps = gr.Number(value=400, label="Decode speed (tok/s)")
            with gr.Row():
                overhead_ms = gr.Number(value=200, label="Overhead (ms)")
+                queue_factor = gr.Slider(value=1.3, minimum=1.0, maximum=10.0, step=0.1, label="Queue/Burst Factor (×)")
            lat_btn = gr.Button("Estimate Latency")
+            lat_table = gr.Dataframe(label="Latency Breakdown", interactive=False)
+            lat_notes = gr.Textbox(label="Notes", interactive=False)
            lat_btn.click(latency_estimator,
                          [prompt_tokens, output_tokens, prefill_tps, decode_tps, overhead_ms, queue_factor],
                          [lat_table, lat_notes])

+        with gr.Tab("Batch Size Calculator"):
+            with gr.Row():
+                with gr.Column():
+                    gpu_vram_gb = gr.Number(value=24, label="GPU VRAM (GB)")
+                    model_weights_gb = gr.Number(value=6, label="Model weights (GB)")
+                    runtime_overhead_gb = gr.Number(value=2, label="Runtime overhead (GB)")
+                with gr.Column():
+                    hidden_size = gr.Number(value=4096, label="Hidden size (d_model)")
+                    num_layers = gr.Number(value=32, label="Layers (transformer blocks)")
+                    kv_precision_bits = gr.Dropdown(choices=[4,8,16], value=4, label="KV precision (bits)")
+            with gr.Row():
+                max_seq_len_tokens = gr.Number(value=4096, label="Max sequence length (tokens)")
+                reserve_headroom_frac = gr.Slider(value=0.2, minimum=0.0, maximum=0.5, step=0.05, label="Reserve headroom (fraction)")
+            batch_btn = gr.Button("Calculate Batch Size")
+            batch_table = gr.Dataframe(label="Batch Sizing Result", interactive=False)
+            batch_notes = gr.Textbox(label="Notes", interactive=False)
+            batch_btn.click(batch_size_calculator,
+                            [gpu_vram_gb, model_weights_gb, runtime_overhead_gb,
+                             hidden_size, num_layers, kv_precision_bits,
+                             max_seq_len_tokens, reserve_headroom_frac],
+                            [batch_table, batch_notes])
+
if __name__ == "__main__":
    demo.launch()
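For a quick check outside the Gradio UI, the two estimators can be driven directly with the tab defaults shown above (a minimal sketch; it assumes `app.py` is importable from the working directory):

```python
# Smoke test with the default values from the Latency and Batch Size tabs.
from app import latency_estimator, batch_size_calculator

lat_df, lat_msg = latency_estimator(prompt_tokens=8000, output_tokens=300,
                                    prefill_tps=1000, decode_tps=400,
                                    overhead_ms=200, queue_factor=1.3)
print(lat_msg)  # prefill 8.00s + decode 0.75s + 0.20s overhead = 8.95s base, ~11.64s p95

batch_df, batch_note = batch_size_calculator(gpu_vram_gb=24, model_weights_gb=6,
                                             runtime_overhead_gb=2, hidden_size=4096,
                                             num_layers=32, kv_precision_bits=4,
                                             max_seq_len_tokens=4096,
                                             reserve_headroom_frac=0.2)
print(batch_df.to_string(index=False))  # 4-bit KV: ~0.5 GB/sequence -> 22 theoretical, 11–15 recommended
```

Importing `app` builds the Blocks UI but does not launch it, since `demo.launch()` is guarded by `if __name__ == "__main__"`.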