File size: 8,407 Bytes
da107bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5770769
da107bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import math
import gradio as gr

# =========================
# --- Tab 1: GPU Sizing ---
# =========================
# Fixed 30-day month used to turn a monthly call volume into an average
# requests/sec figure (see monthly_to_rps); deliberately ignores 28/31-day months.
SECONDS_PER_MONTH_30D = 30 * 24 * 3600  # 2,592,000 seconds

def derive_per_gpu_tps(cluster_tps, n_gpus, fallback_tps_per_gpu):
    """Return the estimated tokens/s for a single GPU.

    Prefers a measured whole-cluster throughput divided by its GPU count;
    falls back to ``fallback_tps_per_gpu`` when the measurement is missing,
    non-positive, or not numeric.

    Args:
        cluster_tps: Measured cluster-wide tokens/s, or None/0 when unknown.
        n_gpus: GPU count used in that measurement, or None/0 when unknown.
        fallback_tps_per_gpu: Per-GPU tokens/s used when no valid
            measurement pair is supplied.

    Returns:
        float: Tokens/s attributable to one GPU.
    """
    if cluster_tps and n_gpus:
        # Do the numeric conversion AND the positivity check inside the try:
        # the original compared n_gpus > 0 before converting, so a stray
        # non-numeric value raised an uncaught TypeError. Narrow the except
        # to conversion errors instead of swallowing everything.
        try:
            cluster = float(cluster_tps)
            gpus = float(n_gpus)
            if gpus > 0:
                return cluster / gpus
        except (TypeError, ValueError):
            pass
    return float(fallback_tps_per_gpu)

def monthly_to_rps(calls_per_month, avg_call_minutes):
    calls_per_month = max(0.0, float(calls_per_month))
    avg_call_minutes = max(0.0, float(avg_call_minutes))
    avg_rps = calls_per_month / SECONDS_PER_MONTH_30D
    avg_handle_time_s = avg_call_minutes * 60.0
    return avg_rps, avg_handle_time_s

def size_gpus(
    calls_per_month,
    avg_call_minutes,
    max_concurrency,
    target_stream_tps,
    tps_per_gpu_input,
    util_cap,
    cluster_tps,
    n_gpus_measured,
):
    """Estimate the GPU fleet needed to serve a given peak concurrency.

    Core formula: GPUs = ceil(max_concurrency * target_stream_tps
                              / (per_gpu_tps * util_cap)).

    Returns:
        tuple[str, str]: (markdown summary, markdown sensitivity table).
    """
    # --- Normalize and derive inputs ---
    avg_rps, avg_handle_time_s = monthly_to_rps(calls_per_month, avg_call_minutes)
    per_gpu_tps = derive_per_gpu_tps(
        cluster_tps, n_gpus_measured, max(1.0, float(tps_per_gpu_input))
    )
    util_cap = min(0.95, max(0.10, float(util_cap)))
    target_stream_tps = max(0.0, float(target_stream_tps))
    max_concurrency = max(0.0, float(max_concurrency))

    # --- Core sizing math ---
    required_fleet_tps = max_concurrency * target_stream_tps
    usable_gpu_tps = per_gpu_tps * util_cap
    if usable_gpu_tps <= 0:
        required_gpus = 0
    else:
        required_gpus = math.ceil(required_fleet_tps / usable_gpu_tps)
    n_plus_1 = required_gpus + 1 if required_gpus else 0

    # --- Sensitivity of the count to the utilization cap ---
    rows = ["Util Cap | GPUs Needed", "---|---"]
    for cap in (0.50, 0.60, 0.70, 0.80):
        effective_tps = max(1.0, per_gpu_tps * cap)
        rows.append(f"{int(cap*100)}% | {max(0, math.ceil(required_fleet_tps / effective_tps))}")
    table = "\n".join(rows) + "\n"

    # --- Human-readable summary ---
    summary = [
        "📊 **Workload Summary**",
        f"- Calls per month: **{calls_per_month:,.0f}**",
        f"- Average call duration: **{avg_call_minutes:.2f} min** (= {avg_handle_time_s:.0f}s)",
        f"- Average RPS (for reference): **{avg_rps:.4f} req/s**",
        f"- Max concurrent users: **{max_concurrency:,.0f}**\n",
        "⚙️ **Sizing Inputs**",
        f"- Target stream rate per call: **{target_stream_tps:.2f} tokens/s**",
    ]
    if cluster_tps and n_gpus_measured:
        summary.append(
            f"- Per-GPU TPS (derived): **{per_gpu_tps:,.0f} tok/s** from {float(cluster_tps):,.0f} TPS / {int(n_gpus_measured)} GPU(s)"
        )
    else:
        summary.append(f"- Per-GPU TPS (input): **{per_gpu_tps:,.0f} tok/s**")
    summary.append(f"- Utilization cap: **{util_cap:.0%}**")
    summary.append(
        f"- Fleet required TPS: **{required_fleet_tps:,.0f} tok/s** (= max_concurrency × target_stream_tps)\n"
    )
    summary.append(f"✅ **Required GPUs (ceil)**: **{required_gpus}**")
    if required_gpus > 0:
        summary.append(f"🧩 **N+1 suggestion**: **{n_plus_1} GPUs**")
    return "\n".join(summary), table


# =========================
# --- Tab 2: Latency Predictor (Blocks, no duplicate) ---
# =========================
# Linear-model coefficients consumed by _predict_generic:
#   latency = const + c*context + c*concurrency + c*TP + c*DP + c*gpu_type_encoded
# where gpu_type_encoded is 1 for "8× H200" and 0 otherwise.
# NOTE(review): values look like an offline regression fit in seconds —
# confirm provenance/units against the fitting source before re-tuning.
# Coefficients for total response latency.
COEFF_TOTAL = {
    "const": 1.4171,
    "context": 0.0001,
    "concurrency": 0.0092,
    "TP": -0.0201,
    "DP": 0.2126,
    "gpu_type_encoded": -0.1040,
}
# Coefficients for first-token (TTFT) latency.
COEFF_TTFT = {
    "const": 1.2604,
    "context": 9.302e-05,
    "concurrency": 0.0041,
    "TP": -0.0749,
    "DP": -0.1258,
    "gpu_type_encoded": 0.1627,
}

def _predict_generic(context, TP, DP, concurrency, gpu_type, coeffs):
    gpu_encoded = 1 if gpu_type == "8× H200" else 0
    latency = (
        coeffs["const"]
        + coeffs["context"] * context
        + coeffs["concurrency"] * concurrency
        + coeffs["TP"] * TP
        + coeffs["DP"] * DP
        + coeffs["gpu_type_encoded"] * gpu_encoded
    )
    return round(latency, 3)

def predict_latency(model, context, TP, DP, concurrency, gpu_type):
    """Predict total and first-token latency for one serving configuration.

    Args:
        model: Model name — echoed into the output text only, not used in
            the math.
        context: Context length in tokens.
        TP: Tensor-parallel degree.
        DP: Data-parallel degree.
        concurrency: Number of simultaneous requests.
        gpu_type: Either "8× H100" or "8× H200".

    Returns:
        tuple[str, str]: (total-latency text, TTFT text). On invalid input
        both elements carry the same warning message.
    """
    # Fix: removed a stray leftover debug print("") that wrote a blank line
    # to stdout on every invalid TP×DP configuration.
    # A single node has 8 GPUs, so the parallelism product is capped at 8.
    if TP * DP > 8:
        msg = "⚠️ Invalid configuration: TP × DP must not exceed 8 GPUs."
        return (msg, msg)
    if gpu_type not in ["8× H100", "8× H200"]:
        msg = "⚠️ Invalid GPU selection. Only 8× H100 or 8× H200 supported."
        return (msg, msg)

    total = _predict_generic(context, TP, DP, concurrency, gpu_type, COEFF_TOTAL)
    ttft = _predict_generic(context, TP, DP, concurrency, gpu_type, COEFF_TTFT)
    return (
        f"Model: {model}\nGPU: {gpu_type}\n\nPredicted Total Response Latency: {total:.3f} s",
        f"Model: {model}\nGPU: {gpu_type}\n\nPredicted First Token Latency (TTFT): {ttft:.3f} s",
    )

def clear_latency():
    """Reset both latency output textboxes to empty strings."""
    return "", ""

# =========================
# --- Unified App (Soft Theme) ---
# =========================
# Top-level Gradio UI: a single Blocks app with two independent tabs,
# one per tool. Component wiring order matters — each click() must pass
# inputs in the same order as the target function's signature.
with gr.Blocks(title="LLM GPU Planner", theme="soft") as app:
    gr.Markdown(
        "# 🧮 LLM GPU Planner\n"
        "Two tools to help you **predict latency** and **estimate GPU requirements**. Use the tabs below 👇"
    )

    with gr.Tabs():
        # ---- GPU Sizing Tab ----
        with gr.Tab("📈 GPU Sizing (Concurrency)"):
            gr.Markdown(
                "Estimate GPUs needed from **max concurrency**.\n\n"
                "Formula: `GPUs = ceil(max_concurrency × target_stream_tps / (per_gpu_tps × util_cap))`"
            )
            with gr.Row():
                with gr.Column():
                    # Workload-description inputs.
                    calls_month = gr.Number(label="Calls per month", value=66200, precision=0)
                    avg_minutes = gr.Number(label="Average call duration (minutes)", value=10.0)
                    max_conc = gr.Number(label="Max concurrent active users", value=90)
                with gr.Column():
                    # Throughput / headroom inputs.
                    target_tps = gr.Number(label="Target stream tokens/s per call", value=10.0)
                    util = gr.Slider(label="Utilization cap (headroom)", value=0.60, minimum=0.10, maximum=0.95, step=0.01)
                    tps_per_gpu = gr.Number(label="Per-GPU tokens/s (measured or expected)", value=8300)
                    gr.Markdown("**Optional:** derive Per-GPU TPS from a measured cluster")
                    # When both are supplied, size_gpus derives per-GPU TPS as
                    # cluster_tps / n_gpus_measured instead of using tps_per_gpu.
                    cluster_tps = gr.Number(label="Measured cluster tokens/s", value=None)
                    n_gpus_measured = gr.Number(label="#GPUs in that measurement", value=None, precision=0)
            btn = gr.Button("Calculate", variant="primary")
            # Two markdown panes: textual summary plus a sensitivity table.
            summary_md = gr.Markdown()
            table_md = gr.Markdown()
            btn.click(
                size_gpus,
                inputs=[calls_month, avg_minutes, max_conc, target_tps, tps_per_gpu, util, cluster_tps, n_gpus_measured],
                outputs=[summary_md, table_md],
            )

        # ---- Latency Predictor Tab (Blocks only; no Interface duplication) ----
        with gr.Tab("⚡ LLM Latency Predictor"):
            gr.Markdown(
                "Estimate **total response** and **first-token (TTFT)** latency using your fitted model.\n\n"
                "⚙️ Validation: TP × DP ≤ 8; only 8× GPU setups are valid."
            )
            with gr.Row():
                with gr.Column():
                    # Configuration inputs — validated inside predict_latency.
                    model_dd = gr.Dropdown(["Qwen3-32B"], value="Qwen3-32B", label="Model")
                    ctx = gr.Slider(1024, 32768, value=8192, step=1024, label="Context Length (tokens)")
                    tp = gr.Dropdown([1, 2, 4, 8], value=4, label="Tensor Parallel (TP)")
                    dp = gr.Dropdown([1, 2, 4, 8], value=2, label="Data Parallel (DP)")
                    conc = gr.Slider(1, 200, value=100, step=5, label="Concurrency (requests)")
                    gpu_dd = gr.Dropdown(["8× H100", "8× H200"], value="8× H100", label="GPU Type")
                    with gr.Row():
                        clear_btn = gr.Button("Clear")
                        submit_btn = gr.Button("Submit", variant="primary")
                with gr.Column():
                    # Prediction outputs — one textbox per fitted model.
                    total_out = gr.Textbox(label="Predicted Total Response Latency")
                    ttft_out = gr.Textbox(label="Predicted First Token Latency (TTFT)")

            submit_btn.click(
                predict_latency,
                inputs=[model_dd, ctx, tp, dp, conc, gpu_dd],
                outputs=[total_out, ttft_out],
            )
            # Clear simply blanks both output boxes.
            clear_btn.click(
                clear_latency,
                inputs=None,
                outputs=[total_out, ttft_out],
            )

if __name__ == "__main__":
    # Start the Gradio server only when executed as a script.
    app.launch()