"""LLM GPU Planner.

A two-tab Gradio app:
  1. GPU sizing from max concurrency and per-GPU token throughput.
  2. Latency prediction (total + TTFT) from a fitted linear model.
"""

import math

import gradio as gr

# =========================
# --- Tab 1: GPU Sizing ---
# =========================

SECONDS_PER_MONTH_30D = 30 * 24 * 3600  # 2,592,000 seconds


def derive_per_gpu_tps(cluster_tps, n_gpus, fallback_tps_per_gpu):
    """Return tokens/s per GPU.

    Prefers a measured cluster throughput divided by its GPU count; falls back
    to ``fallback_tps_per_gpu`` when the measurement is absent, zero, or not
    convertible to float.
    """
    if cluster_tps and n_gpus and n_gpus > 0:
        try:
            return float(cluster_tps) / float(n_gpus)
        except (TypeError, ValueError):
            # Unparseable measurement — fall through to the fallback value.
            pass
    return float(fallback_tps_per_gpu)


def monthly_to_rps(calls_per_month, avg_call_minutes):
    """Convert monthly call volume into (average RPS, avg handle time in seconds).

    Negative inputs are clamped to zero. Uses a fixed 30-day month.
    """
    calls_per_month = max(0.0, float(calls_per_month))
    avg_call_minutes = max(0.0, float(avg_call_minutes))
    avg_rps = calls_per_month / SECONDS_PER_MONTH_30D
    avg_handle_time_s = avg_call_minutes * 60.0
    return avg_rps, avg_handle_time_s


def size_gpus(
    calls_per_month,
    avg_call_minutes,
    max_concurrency,
    target_stream_tps,
    tps_per_gpu_input,
    util_cap,
    cluster_tps,
    n_gpus_measured,
):
    """Estimate the GPU count needed to serve a concurrent-streaming workload.

    Core formula: GPUs = ceil(max_concurrency * target_stream_tps
                              / (per_gpu_tps * util_cap)).

    Returns a tuple of (markdown summary, markdown sensitivity table).
    """
    # Derived stats
    avg_rps, avg_handle_time_s = monthly_to_rps(calls_per_month, avg_call_minutes)
    per_gpu_tps = derive_per_gpu_tps(cluster_tps, n_gpus_measured, max(1.0, float(tps_per_gpu_input)))
    # Clamp inputs to sane ranges so the math below cannot divide by zero
    # or produce negative fleet sizes.
    util_cap = min(0.95, max(0.10, float(util_cap)))
    target_stream_tps = max(0.0, float(target_stream_tps))
    max_concurrency = max(0.0, float(max_concurrency))

    # Core math
    required_fleet_tps = max_concurrency * target_stream_tps
    usable_gpu_tps = per_gpu_tps * util_cap
    required_gpus = 0 if usable_gpu_tps <= 0 else math.ceil(required_fleet_tps / usable_gpu_tps)
    n_plus_1 = 0 if required_gpus == 0 else required_gpus + 1

    # Sensitivity table: GPU count at several alternative utilization caps.
    caps = [0.50, 0.60, 0.70, 0.80]
    table = "Util Cap | GPUs Needed\n---|---\n"
    for c in caps:
        eff = max(1.0, per_gpu_tps * c)  # floor at 1 tok/s to avoid div-by-zero
        table += f"{int(c*100)}% | {max(0, math.ceil(required_fleet_tps / eff))}\n"

    # Summary text
    lines = []
    lines.append("📊 **Workload Summary**")
    lines.append(f"- Calls per month: **{calls_per_month:,.0f}**")
    lines.append(f"- Average call duration: **{avg_call_minutes:.2f} min** (= {avg_handle_time_s:.0f}s)")
    lines.append(f"- Average RPS (for reference): **{avg_rps:.4f} req/s**")
    lines.append(f"- Max concurrent users: **{max_concurrency:,.0f}**\n")
    lines.append("⚙️ **Sizing Inputs**")
    lines.append(f"- Target stream rate per call: **{target_stream_tps:.2f} tokens/s**")
    if cluster_tps and n_gpus_measured:
        lines.append(f"- Per-GPU TPS (derived): **{per_gpu_tps:,.0f} tok/s** from {float(cluster_tps):,.0f} TPS / {int(n_gpus_measured)} GPU(s)")
    else:
        lines.append(f"- Per-GPU TPS (input): **{per_gpu_tps:,.0f} tok/s**")
    lines.append(f"- Utilization cap: **{util_cap:.0%}**")
    lines.append(f"- Fleet required TPS: **{required_fleet_tps:,.0f} tok/s** (= max_concurrency × target_stream_tps)\n")
    lines.append(f"✅ **Required GPUs (ceil)**: **{required_gpus}**")
    if required_gpus > 0:
        lines.append(f"🧩 **N+1 suggestion**: **{n_plus_1} GPUs**")

    return "\n".join(lines), table


# =========================
# --- Tab 2: Latency Predictor (Blocks, no duplicate) ---
# =========================

# Fitted linear-model coefficients for total response latency (seconds).
COEFF_TOTAL = {
    "const": 1.4171,
    "context": 0.0001,
    "concurrency": 0.0092,
    "TP": -0.0201,
    "DP": 0.2126,
    "gpu_type_encoded": -0.1040,
}

# Fitted linear-model coefficients for time-to-first-token (seconds).
COEFF_TTFT = {
    "const": 1.2604,
    "context": 9.302e-05,
    "concurrency": 0.0041,
    "TP": -0.0749,
    "DP": -0.1258,
    "gpu_type_encoded": 0.1627,
}


def _predict_generic(context, TP, DP, concurrency, gpu_type, coeffs):
    """Evaluate the linear latency model for one coefficient set.

    ``gpu_type`` is one-hot encoded: 1 for "8× H200", 0 otherwise.
    Returns the predicted latency in seconds, rounded to 3 decimals.
    """
    gpu_encoded = 1 if gpu_type == "8× H200" else 0
    latency = (
        coeffs["const"]
        + coeffs["context"] * context
        + coeffs["concurrency"] * concurrency
        + coeffs["TP"] * TP
        + coeffs["DP"] * DP
        + coeffs["gpu_type_encoded"] * gpu_encoded
    )
    return round(latency, 3)


def predict_latency(model, context, TP, DP, concurrency, gpu_type):
    """Predict total and first-token latency for a serving configuration.

    Validates that TP × DP fits on 8 GPUs and that the GPU type is supported.
    Returns a (total-latency text, TTFT text) pair; on invalid input both
    entries carry the same warning message.
    """
    if TP * DP > 8:
        msg = "⚠️ Invalid configuration: TP × DP must not exceed 8 GPUs."
        return (msg, msg)
    if gpu_type not in ["8× H100", "8× H200"]:
        msg = "⚠️ Invalid GPU selection. Only 8× H100 or 8× H200 supported."
        return (msg, msg)

    total = _predict_generic(context, TP, DP, concurrency, gpu_type, COEFF_TOTAL)
    ttft = _predict_generic(context, TP, DP, concurrency, gpu_type, COEFF_TTFT)
    return (
        f"Model: {model}\nGPU: {gpu_type}\n\nPredicted Total Response Latency: {total:.3f} s",
        f"Model: {model}\nGPU: {gpu_type}\n\nPredicted First Token Latency (TTFT): {ttft:.3f} s",
    )


def clear_latency():
    """Reset both latency output textboxes to empty strings."""
    return ("", "")


# =========================
# --- Unified App (Soft Theme) ---
# =========================

with gr.Blocks(title="LLM GPU Planner", theme="soft") as app:
    gr.Markdown(
        "# 🧮 LLM GPU Planner\n"
        "Two tools to help you **predict latency** and **estimate GPU requirements**. Use the tabs below 👇"
    )

    with gr.Tabs():
        # ---- GPU Sizing Tab ----
        with gr.Tab("📈 GPU Sizing (Concurrency)"):
            gr.Markdown(
                "Estimate GPUs needed from **max concurrency**.\n\n"
                "Formula: `GPUs = ceil(max_concurrency × target_stream_tps / (per_gpu_tps × util_cap))`"
            )
            with gr.Row():
                with gr.Column():
                    calls_month = gr.Number(label="Calls per month", value=66200, precision=0)
                    avg_minutes = gr.Number(label="Average call duration (minutes)", value=10.0)
                    max_conc = gr.Number(label="Max concurrent active users", value=90)
                with gr.Column():
                    target_tps = gr.Number(label="Target stream tokens/s per call", value=10.0)
                    util = gr.Slider(label="Utilization cap (headroom)", value=0.60, minimum=0.10, maximum=0.95, step=0.01)
                    tps_per_gpu = gr.Number(label="Per-GPU tokens/s (measured or expected)", value=8300)
            gr.Markdown("**Optional:** derive Per-GPU TPS from a measured cluster")
            cluster_tps = gr.Number(label="Measured cluster tokens/s", value=None)
            n_gpus_measured = gr.Number(label="#GPUs in that measurement", value=None, precision=0)
            btn = gr.Button("Calculate", variant="primary")
            summary_md = gr.Markdown()
            table_md = gr.Markdown()
            btn.click(
                size_gpus,
                inputs=[calls_month, avg_minutes, max_conc, target_tps, tps_per_gpu, util, cluster_tps, n_gpus_measured],
                outputs=[summary_md, table_md],
            )

        # ---- Latency Predictor Tab (Blocks only; no Interface duplication) ----
        with gr.Tab("⚡ LLM Latency Predictor"):
            gr.Markdown(
                "Estimate **total response** and **first-token (TTFT)** latency using your fitted model.\n\n"
                "⚙️ Validation: TP × DP ≤ 8; only 8× GPU setups are valid."
            )
            with gr.Row():
                with gr.Column():
                    model_dd = gr.Dropdown(["Qwen3-32B"], value="Qwen3-32B", label="Model")
                    ctx = gr.Slider(1024, 32768, value=8192, step=1024, label="Context Length (tokens)")
                    tp = gr.Dropdown([1, 2, 4, 8], value=4, label="Tensor Parallel (TP)")
                    dp = gr.Dropdown([1, 2, 4, 8], value=2, label="Data Parallel (DP)")
                    conc = gr.Slider(1, 200, value=100, step=5, label="Concurrency (requests)")
                    gpu_dd = gr.Dropdown(["8× H100", "8× H200"], value="8× H100", label="GPU Type")
                    with gr.Row():
                        clear_btn = gr.Button("Clear")
                        submit_btn = gr.Button("Submit", variant="primary")
                with gr.Column():
                    total_out = gr.Textbox(label="Predicted Total Response Latency")
                    ttft_out = gr.Textbox(label="Predicted First Token Latency (TTFT)")
            submit_btn.click(
                predict_latency,
                inputs=[model_dd, ctx, tp, dp, conc, gpu_dd],
                outputs=[total_out, ttft_out],
            )
            clear_btn.click(
                clear_latency,
                inputs=None,
                outputs=[total_out, ttft_out],
            )

if __name__ == "__main__":
    app.launch()