import math
import gradio as gr
# =========================
# --- Tab 1: GPU Sizing ---
# =========================
SECONDS_PER_MONTH_30D = 30 * 24 * 3600 # 2,592,000 seconds
def derive_per_gpu_tps(cluster_tps, n_gpus, fallback_tps_per_gpu):
    """Return tokens/s per GPU, derived from a measured cluster when both fields are set."""
    if cluster_tps and n_gpus and n_gpus > 0:
        try:
            return float(cluster_tps) / float(n_gpus)
        except (TypeError, ValueError):
            pass
    return float(fallback_tps_per_gpu)
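# Illustrative check (values assumed, not from a real benchmark): a cluster
# measured at 66,400 tok/s across 8 GPUs derives to 8,300 tok/s per GPU, and
# missing measurements fall back to the manual input:
#   derive_per_gpu_tps(66_400, 8, 8_300.0)   # -> 8300.0
#   derive_per_gpu_tps(None, None, 8_300.0)  # -> 8300.0 (fallback)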
def monthly_to_rps(calls_per_month, avg_call_minutes):
    """Convert monthly call volume into average req/s and per-call handle time (s)."""
    calls_per_month = max(0.0, float(calls_per_month or 0.0))
    avg_call_minutes = max(0.0, float(avg_call_minutes or 0.0))
    avg_rps = calls_per_month / SECONDS_PER_MONTH_30D
    avg_handle_time_s = avg_call_minutes * 60.0
    return avg_rps, avg_handle_time_s
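# Worked example with the tab's default inputs (illustrative only): 66,200
# calls over a 30-day month is 66,200 / 2,592,000 ≈ 0.0255 req/s, and a
# 10-minute average call is a 600 s handle time:
#   monthly_to_rps(66_200, 10.0)  # -> (~0.02554, 600.0)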
def size_gpus(
calls_per_month,
avg_call_minutes,
max_concurrency,
target_stream_tps,
tps_per_gpu_input,
util_cap,
cluster_tps,
n_gpus_measured,
):
    """Size a GPU fleet from peak concurrency and a per-stream token-rate target."""
    # Normalize inputs first so cleared (None) Number fields cannot crash the math
    calls_per_month = float(calls_per_month or 0.0)
    avg_call_minutes = float(avg_call_minutes or 0.0)
    # Derived stats
    avg_rps, avg_handle_time_s = monthly_to_rps(calls_per_month, avg_call_minutes)
    per_gpu_tps = derive_per_gpu_tps(cluster_tps, n_gpus_measured, max(1.0, float(tps_per_gpu_input or 0.0)))
    util_cap = min(0.95, max(0.10, float(util_cap)))
    target_stream_tps = max(0.0, float(target_stream_tps or 0.0))
    max_concurrency = max(0.0, float(max_concurrency or 0.0))
# Core math
required_fleet_tps = max_concurrency * target_stream_tps
usable_gpu_tps = per_gpu_tps * util_cap
required_gpus = 0 if usable_gpu_tps <= 0 else math.ceil(required_fleet_tps / usable_gpu_tps)
n_plus_1 = 0 if required_gpus == 0 else required_gpus + 1
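    # Worked example with the UI defaults (illustrative numbers only): 90 concurrent
    # calls × 10 tok/s target = 900 fleet tok/s; 8,300 tok/s/GPU × 0.60 cap = 4,980
    # usable tok/s per GPU, so ceil(900 / 4980) = 1 GPU, and N+1 suggests 2.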
# Sensitivity table
caps = [0.50, 0.60, 0.70, 0.80]
table = "Util Cap | GPUs Needed\n---|---\n"
for c in caps:
eff = max(1.0, per_gpu_tps * c)
table += f"{int(c*100)}% | {max(0, math.ceil(required_fleet_tps / eff))}\n"
# Summary text
lines = []
lines.append("📊 **Workload Summary**")
lines.append(f"- Calls per month: **{calls_per_month:,.0f}**")
lines.append(f"- Average call duration: **{avg_call_minutes:.2f} min** (= {avg_handle_time_s:.0f}s)")
lines.append(f"- Average RPS (for reference): **{avg_rps:.4f} req/s**")
lines.append(f"- Max concurrent users: **{max_concurrency:,.0f}**\n")
lines.append("⚙️ **Sizing Inputs**")
lines.append(f"- Target stream rate per call: **{target_stream_tps:.2f} tokens/s**")
if cluster_tps and n_gpus_measured:
lines.append(f"- Per-GPU TPS (derived): **{per_gpu_tps:,.0f} tok/s** from {float(cluster_tps):,.0f} TPS / {int(n_gpus_measured)} GPU(s)")
else:
lines.append(f"- Per-GPU TPS (input): **{per_gpu_tps:,.0f} tok/s**")
lines.append(f"- Utilization cap: **{util_cap:.0%}**")
lines.append(f"- Fleet required TPS: **{required_fleet_tps:,.0f} tok/s** (= max_concurrency × target_stream_tps)\n")
lines.append(f"✅ **Required GPUs (ceil)**: **{required_gpus}**")
if required_gpus > 0:
lines.append(f"🧩 **N+1 suggestion**: **{n_plus_1} GPUs**")
return "\n".join(lines), table
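# Standalone usage sketch (UI default values, illustrative only):
#   summary, table = size_gpus(66_200, 10.0, 90, 10.0, 8_300, 0.60, None, None)
#   # summary reports 1 required GPU (N+1 suggestion: 2); table sweeps 50-80% caps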
# =========================
# --- Tab 2: Latency Predictor (Blocks only; no gr.Interface duplicate) ---
# =========================
COEFF_TOTAL = {
"const": 1.4171,
"context": 0.0001,
"concurrency": 0.0092,
"TP": -0.0201,
"DP": 0.2126,
"gpu_type_encoded": -0.1040,
}
COEFF_TTFT = {
"const": 1.2604,
"context": 9.302e-05,
"concurrency": 0.0041,
"TP": -0.0749,
"DP": -0.1258,
"gpu_type_encoded": 0.1627,
}
def _predict_generic(context, TP, DP, concurrency, gpu_type, coeffs):
    """Evaluate the fitted linear latency model; the GPU type is one-hot encoded (H200 = 1)."""
    gpu_encoded = 1 if gpu_type == "8× H200" else 0
latency = (
coeffs["const"]
+ coeffs["context"] * context
+ coeffs["concurrency"] * concurrency
+ coeffs["TP"] * TP
+ coeffs["DP"] * DP
+ coeffs["gpu_type_encoded"] * gpu_encoded
)
return round(latency, 3)
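# Worked example (inputs are the tab's defaults, purely illustrative): for
# context=8192, concurrency=100, TP=4, DP=2 on 8× H100 (encoded 0), the total
# model gives 1.4171 + 0.0001*8192 + 0.0092*100 - 0.0201*4 + 0.2126*2 ≈ 3.501 s:
#   _predict_generic(8192, 4, 2, 100, "8× H100", COEFF_TOTAL)  # -> 3.501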
def predict_latency(model, context, TP, DP, concurrency, gpu_type):
    """Predict total and first-token latency; both outputs share any validation message."""
    if TP * DP > 8:
        msg = "⚠️ Invalid configuration: TP × DP must not exceed 8 GPUs."
        return (msg, msg)
    if gpu_type not in ["8× H100", "8× H200"]:
        msg = "⚠️ Invalid GPU selection. Only 8× H100 or 8× H200 supported."
        return (msg, msg)
total = _predict_generic(context, TP, DP, concurrency, gpu_type, COEFF_TOTAL)
ttft = _predict_generic(context, TP, DP, concurrency, gpu_type, COEFF_TTFT)
return (
f"Model: {model}\nGPU: {gpu_type}\n\nPredicted Total Response Latency: {total:.3f} s",
f"Model: {model}\nGPU: {gpu_type}\n\nPredicted First Token Latency (TTFT): {ttft:.3f} s",
)
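# Quick sanity check outside the UI (default values assumed above; illustrative):
#   total_msg, ttft_msg = predict_latency("Qwen3-32B", 8192, 4, 2, 100, "8× H100")
#   # -> total ≈ 3.501 s, TTFT ≈ 1.881 s with the coefficients above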
def clear_latency():
return ("", "")
# =========================
# --- Unified App (Soft Theme) ---
# =========================
with gr.Blocks(title="LLM GPU Planner", theme="soft") as app:
gr.Markdown(
"# 🧮 LLM GPU Planner\n"
"Two tools to help you **predict latency** and **estimate GPU requirements**. Use the tabs below 👇"
)
with gr.Tabs():
# ---- GPU Sizing Tab ----
with gr.Tab("📈 GPU Sizing (Concurrency)"):
gr.Markdown(
"Estimate GPUs needed from **max concurrency**.\n\n"
"Formula: `GPUs = ceil(max_concurrency × target_stream_tps / (per_gpu_tps × util_cap))`"
)
with gr.Row():
with gr.Column():
calls_month = gr.Number(label="Calls per month", value=66200, precision=0)
avg_minutes = gr.Number(label="Average call duration (minutes)", value=10.0)
max_conc = gr.Number(label="Max concurrent active users", value=90)
with gr.Column():
target_tps = gr.Number(label="Target stream tokens/s per call", value=10.0)
util = gr.Slider(label="Utilization cap (headroom)", value=0.60, minimum=0.10, maximum=0.95, step=0.01)
tps_per_gpu = gr.Number(label="Per-GPU tokens/s (measured or expected)", value=8300)
gr.Markdown("**Optional:** derive Per-GPU TPS from a measured cluster")
cluster_tps = gr.Number(label="Measured cluster tokens/s", value=None)
n_gpus_measured = gr.Number(label="#GPUs in that measurement", value=None, precision=0)
btn = gr.Button("Calculate", variant="primary")
summary_md = gr.Markdown()
table_md = gr.Markdown()
btn.click(
size_gpus,
inputs=[calls_month, avg_minutes, max_conc, target_tps, tps_per_gpu, util, cluster_tps, n_gpus_measured],
outputs=[summary_md, table_md],
)
# ---- Latency Predictor Tab (Blocks only; no Interface duplication) ----
with gr.Tab("⚡ LLM Latency Predictor"):
gr.Markdown(
"Estimate **total response** and **first-token (TTFT)** latency using your fitted model.\n\n"
"⚙️ Validation: TP × DP ≤ 8; only 8× GPU setups are valid."
)
with gr.Row():
with gr.Column():
model_dd = gr.Dropdown(["Qwen3-32B"], value="Qwen3-32B", label="Model")
ctx = gr.Slider(1024, 32768, value=8192, step=1024, label="Context Length (tokens)")
tp = gr.Dropdown([1, 2, 4, 8], value=4, label="Tensor Parallel (TP)")
dp = gr.Dropdown([1, 2, 4, 8], value=2, label="Data Parallel (DP)")
                    conc = gr.Slider(1, 200, value=100, step=1, label="Concurrency (requests)")  # step=1 keeps the default of 100 on the slider grid
gpu_dd = gr.Dropdown(["8× H100", "8× H200"], value="8× H100", label="GPU Type")
with gr.Row():
clear_btn = gr.Button("Clear")
submit_btn = gr.Button("Submit", variant="primary")
with gr.Column():
total_out = gr.Textbox(label="Predicted Total Response Latency")
ttft_out = gr.Textbox(label="Predicted First Token Latency (TTFT)")
submit_btn.click(
predict_latency,
inputs=[model_dd, ctx, tp, dp, conc, gpu_dd],
outputs=[total_out, ttft_out],
)
clear_btn.click(
clear_latency,
inputs=None,
outputs=[total_out, ttft_out],
)
if __name__ == "__main__":
app.launch()