File size: 8,407 Bytes
da107bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5770769
da107bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import math
import gradio as gr

# =========================
# --- Tab 1: GPU Sizing ---
# =========================
# Fixed 30-day month used to turn a monthly call volume into an average
# requests/sec figure (see monthly_to_rps); deliberately ignores 28/31-day months.
SECONDS_PER_MONTH_30D = 30 * 24 * 3600  # 2,592,000 seconds

def derive_per_gpu_tps(cluster_tps, n_gpus, fallback_tps_per_gpu):
    """Return the estimated tokens/s for a single GPU.

    Prefers a measured whole-cluster throughput divided by its GPU count;
    falls back to ``fallback_tps_per_gpu`` when the measurement is missing,
    non-positive, or not numeric.

    Args:
        cluster_tps: Measured cluster-wide tokens/s, or None/0 when unknown.
        n_gpus: GPU count used in that measurement, or None/0 when unknown.
        fallback_tps_per_gpu: Per-GPU tokens/s used when no valid
            measurement pair is supplied.

    Returns:
        float: Tokens/s attributable to one GPU.
    """
    if cluster_tps and n_gpus:
        # Do the numeric conversion AND the positivity check inside the try:
        # the original compared n_gpus > 0 before converting, so a stray
        # non-numeric value raised an uncaught TypeError. Narrow the except
        # to conversion errors instead of swallowing everything.
        try:
            cluster = float(cluster_tps)
            gpus = float(n_gpus)
            if gpus > 0:
                return cluster / gpus
        except (TypeError, ValueError):
            pass
    return float(fallback_tps_per_gpu)

def monthly_to_rps(calls_per_month, avg_call_minutes):
    calls_per_month = max(0.0, float(calls_per_month))
    avg_call_minutes = max(0.0, float(avg_call_minutes))
    avg_rps = calls_per_month / SECONDS_PER_MONTH_30D
    avg_handle_time_s = avg_call_minutes * 60.0
    return avg_rps, avg_handle_time_s

def size_gpus(
    calls_per_month,
    avg_call_minutes,
    max_concurrency,
    target_stream_tps,
    tps_per_gpu_input,
    util_cap,
    cluster_tps,
    n_gpus_measured,
):
    """Estimate the GPU fleet needed to serve a given peak concurrency.

    Core formula: GPUs = ceil(max_concurrency * target_stream_tps
                              / (per_gpu_tps * util_cap)).

    Returns:
        tuple[str, str]: (markdown summary, markdown sensitivity table).
    """
    # --- Normalize and derive inputs ---
    avg_rps, avg_handle_time_s = monthly_to_rps(calls_per_month, avg_call_minutes)
    per_gpu_tps = derive_per_gpu_tps(
        cluster_tps, n_gpus_measured, max(1.0, float(tps_per_gpu_input))
    )
    util_cap = min(0.95, max(0.10, float(util_cap)))
    target_stream_tps = max(0.0, float(target_stream_tps))
    max_concurrency = max(0.0, float(max_concurrency))

    # --- Core sizing math ---
    required_fleet_tps = max_concurrency * target_stream_tps
    usable_gpu_tps = per_gpu_tps * util_cap
    if usable_gpu_tps <= 0:
        required_gpus = 0
    else:
        required_gpus = math.ceil(required_fleet_tps / usable_gpu_tps)
    n_plus_1 = required_gpus + 1 if required_gpus else 0

    # --- Sensitivity of the count to the utilization cap ---
    rows = ["Util Cap | GPUs Needed", "---|---"]
    for cap in (0.50, 0.60, 0.70, 0.80):
        effective_tps = max(1.0, per_gpu_tps * cap)
        rows.append(f"{int(cap*100)}% | {max(0, math.ceil(required_fleet_tps / effective_tps))}")
    table = "\n".join(rows) + "\n"

    # --- Human-readable summary ---
    summary = [
        "📊 **Workload Summary**",
        f"- Calls per month: **{calls_per_month:,.0f}**",
        f"- Average call duration: **{avg_call_minutes:.2f} min** (= {avg_handle_time_s:.0f}s)",
        f"- Average RPS (for reference): **{avg_rps:.4f} req/s**",
        f"- Max concurrent users: **{max_concurrency:,.0f}**\n",
        "⚙️ **Sizing Inputs**",
        f"- Target stream rate per call: **{target_stream_tps:.2f} tokens/s**",
    ]
    if cluster_tps and n_gpus_measured:
        summary.append(
            f"- Per-GPU TPS (derived): **{per_gpu_tps:,.0f} tok/s** from {float(cluster_tps):,.0f} TPS / {int(n_gpus_measured)} GPU(s)"
        )
    else:
        summary.append(f"- Per-GPU TPS (input): **{per_gpu_tps:,.0f} tok/s**")
    summary.append(f"- Utilization cap: **{util_cap:.0%}**")
    summary.append(
        f"- Fleet required TPS: **{required_fleet_tps:,.0f} tok/s** (= max_concurrency × target_stream_tps)\n"
    )
    summary.append(f"✅ **Required GPUs (ceil)**: **{required_gpus}**")
    if required_gpus > 0:
        summary.append(f"🧩 **N+1 suggestion**: **{n_plus_1} GPUs**")
    return "\n".join(summary), table


# =========================
# --- Tab 2: Latency Predictor (Blocks, no duplicate) ---
# =========================
# Linear-model coefficients consumed by _predict_generic:
#   latency = const + c*context + c*concurrency + c*TP + c*DP + c*gpu_type_encoded
# where gpu_type_encoded is 1 for "8× H200" and 0 otherwise.
# NOTE(review): values look like an offline regression fit in seconds —
# confirm provenance/units against the fitting source before re-tuning.
# Coefficients for total response latency.
COEFF_TOTAL = {
    "const": 1.4171,
    "context": 0.0001,
    "concurrency": 0.0092,
    "TP": -0.0201,
    "DP": 0.2126,
    "gpu_type_encoded": -0.1040,
}
# Coefficients for first-token (TTFT) latency.
COEFF_TTFT = {
    "const": 1.2604,
    "context": 9.302e-05,
    "concurrency": 0.0041,
    "TP": -0.0749,
    "DP": -0.1258,
    "gpu_type_encoded": 0.1627,
}

def _predict_generic(context, TP, DP, concurrency, gpu_type, coeffs):
    gpu_encoded = 1 if gpu_type == "8× H200" else 0
    latency = (
        coeffs["const"]
        + coeffs["context"] * context
        + coeffs["concurrency"] * concurrency
        + coeffs["TP"] * TP
        + coeffs["DP"] * DP
        + coeffs["gpu_type_encoded"] * gpu_encoded
    )
    return round(latency, 3)

def predict_latency(model, context, TP, DP, concurrency, gpu_type):
    """Predict total and first-token latency for one serving configuration.

    Args:
        model: Model name — echoed into the output text only, not used in
            the math.
        context: Context length in tokens.
        TP: Tensor-parallel degree.
        DP: Data-parallel degree.
        concurrency: Number of simultaneous requests.
        gpu_type: Either "8× H100" or "8× H200".

    Returns:
        tuple[str, str]: (total-latency text, TTFT text). On invalid input
        both elements carry the same warning message.
    """
    # Fix: removed a stray leftover debug print("") that wrote a blank line
    # to stdout on every invalid TP×DP configuration.
    # A single node has 8 GPUs, so the parallelism product is capped at 8.
    if TP * DP > 8:
        msg = "⚠️ Invalid configuration: TP × DP must not exceed 8 GPUs."
        return (msg, msg)
    if gpu_type not in ["8× H100", "8× H200"]:
        msg = "⚠️ Invalid GPU selection. Only 8× H100 or 8× H200 supported."
        return (msg, msg)

    total = _predict_generic(context, TP, DP, concurrency, gpu_type, COEFF_TOTAL)
    ttft = _predict_generic(context, TP, DP, concurrency, gpu_type, COEFF_TTFT)
    return (
        f"Model: {model}\nGPU: {gpu_type}\n\nPredicted Total Response Latency: {total:.3f} s",
        f"Model: {model}\nGPU: {gpu_type}\n\nPredicted First Token Latency (TTFT): {ttft:.3f} s",
    )

def clear_latency():
    """Reset both latency output textboxes to empty strings."""
    return "", ""

# =========================
# --- Unified App (Soft Theme) ---
# =========================
# Top-level Gradio UI: a single Blocks app with two independent tabs,
# one per tool. Component wiring order matters — each click() must pass
# inputs in the same order as the target function's signature.
with gr.Blocks(title="LLM GPU Planner", theme="soft") as app:
    gr.Markdown(
        "# 🧮 LLM GPU Planner\n"
        "Two tools to help you **predict latency** and **estimate GPU requirements**. Use the tabs below 👇"
    )

    with gr.Tabs():
        # ---- GPU Sizing Tab ----
        with gr.Tab("📈 GPU Sizing (Concurrency)"):
            gr.Markdown(
                "Estimate GPUs needed from **max concurrency**.\n\n"
                "Formula: `GPUs = ceil(max_concurrency × target_stream_tps / (per_gpu_tps × util_cap))`"
            )
            with gr.Row():
                with gr.Column():
                    # Workload-description inputs.
                    calls_month = gr.Number(label="Calls per month", value=66200, precision=0)
                    avg_minutes = gr.Number(label="Average call duration (minutes)", value=10.0)
                    max_conc = gr.Number(label="Max concurrent active users", value=90)
                with gr.Column():
                    # Throughput / headroom inputs.
                    target_tps = gr.Number(label="Target stream tokens/s per call", value=10.0)
                    util = gr.Slider(label="Utilization cap (headroom)", value=0.60, minimum=0.10, maximum=0.95, step=0.01)
                    tps_per_gpu = gr.Number(label="Per-GPU tokens/s (measured or expected)", value=8300)
                    gr.Markdown("**Optional:** derive Per-GPU TPS from a measured cluster")
                    # When both are supplied, size_gpus derives per-GPU TPS as
                    # cluster_tps / n_gpus_measured instead of using tps_per_gpu.
                    cluster_tps = gr.Number(label="Measured cluster tokens/s", value=None)
                    n_gpus_measured = gr.Number(label="#GPUs in that measurement", value=None, precision=0)
            btn = gr.Button("Calculate", variant="primary")
            # Two markdown panes: textual summary plus a sensitivity table.
            summary_md = gr.Markdown()
            table_md = gr.Markdown()
            btn.click(
                size_gpus,
                inputs=[calls_month, avg_minutes, max_conc, target_tps, tps_per_gpu, util, cluster_tps, n_gpus_measured],
                outputs=[summary_md, table_md],
            )

        # ---- Latency Predictor Tab (Blocks only; no Interface duplication) ----
        with gr.Tab("⚡ LLM Latency Predictor"):
            gr.Markdown(
                "Estimate **total response** and **first-token (TTFT)** latency using your fitted model.\n\n"
                "⚙️ Validation: TP × DP ≤ 8; only 8× GPU setups are valid."
            )
            with gr.Row():
                with gr.Column():
                    # Configuration inputs — validated inside predict_latency.
                    model_dd = gr.Dropdown(["Qwen3-32B"], value="Qwen3-32B", label="Model")
                    ctx = gr.Slider(1024, 32768, value=8192, step=1024, label="Context Length (tokens)")
                    tp = gr.Dropdown([1, 2, 4, 8], value=4, label="Tensor Parallel (TP)")
                    dp = gr.Dropdown([1, 2, 4, 8], value=2, label="Data Parallel (DP)")
                    conc = gr.Slider(1, 200, value=100, step=5, label="Concurrency (requests)")
                    gpu_dd = gr.Dropdown(["8× H100", "8× H200"], value="8× H100", label="GPU Type")
                    with gr.Row():
                        clear_btn = gr.Button("Clear")
                        submit_btn = gr.Button("Submit", variant="primary")
                with gr.Column():
                    # Prediction outputs — one textbox per fitted model.
                    total_out = gr.Textbox(label="Predicted Total Response Latency")
                    ttft_out = gr.Textbox(label="Predicted First Token Latency (TTFT)")

            submit_btn.click(
                predict_latency,
                inputs=[model_dd, ctx, tp, dp, conc, gpu_dd],
                outputs=[total_out, ttft_out],
            )
            # Clear simply blanks both output boxes.
            clear_btn.click(
                clear_latency,
                inputs=None,
                outputs=[total_out, ttft_out],
            )

if __name__ == "__main__":
    # Start the Gradio server only when executed as a script.
    app.launch()