"""Gradio app: at what request rate does renting GPUs beat paying per token?

The app computes the API cost of one request from token counts and per-million
token prices, then compares the resulting hourly API bill against flat hourly
GPU rental rates, both as tables and as a Plotly chart.
"""

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

# Display name -> (input $ / 1M tokens, output $ / 1M tokens).
# A value of None means "keep whatever prices are currently entered".
MODEL_PRESETS = {
    "DeepSeek V4 — OpenRouter (~90% cache)": (0.041, 0.87),
    "Claude Sonnet 4.6": (3.0, 15.0),
    "Claude Haiku 4.5": (1.0, 5.0),
    "Custom": None,
}

# Display name -> rows of [GPU config label, $ / hour].
# A value of None means "keep the table as currently edited".
CLOUD_PRESETS = {
    "GMI Cloud": [["H200 × 8", 20.8], ["B200 × 8", 32.0], ["GB200 × 4", 32.0]],
    "Custom": None,
}

DEFAULT_MODEL = "DeepSeek V4 — OpenRouter (~90% cache)"
DEFAULT_CLOUD = "GMI Cloud"
DEFAULT_IN_K = 64.0
DEFAULT_OUT_K = 4.0
DEFAULT_RPS = 1.0

# Colors are cycled when there are more GPU rows than entries.
GPU_COLORS = ["#2E86DE", "#10AC84", "#EE5253", "#8854D0", "#F79F1F", "#576574"]
WORKLOAD_COLOR = "#9b59b6"

BREAK_EVEN_COLUMNS = ["GPU config", "$/hour", "Break-even req/hr", "Break-even RPS"]
WORKLOAD_COLUMNS = ["Option", "$ / hour", "vs API"]


def cost_per_request(in_k: float, out_k: float, in_price: float, out_price: float) -> float:
    """Return the API cost in dollars of a single request.

    Args:
        in_k: Input tokens per request, in thousands.
        out_k: Output tokens per request, in thousands.
        in_price: Input price in dollars per one million tokens.
        out_price: Output price in dollars per one million tokens.
    """
    return (in_k * 1000 * in_price + out_k * 1000 * out_price) / 1_000_000


def _is_blank(value) -> bool:
    """Return True when a table cell holds no value (None, NaN, pd.NA, ...)."""
    if value is None:
        return True
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # pd.isna returns an array for list-like cells; those are not "missing".
        return False


def parse_gpus(df):
    """Extract valid ``(name, hourly_price)`` pairs from the GPU table.

    Args:
        df: Either a ``pandas.DataFrame`` or a list of rows (or None). The
            first two cells of each row are read as name and $/hour.

    Returns:
        A list of ``(name, hourly)`` tuples. Rows are skipped when they have
        fewer than two cells, a blank name, or a price that is not a finite
        number greater than zero.
    """
    if isinstance(df, pd.DataFrame):
        # Deliberately no fillna(0): that would turn a missing name into "0"
        # and let a nameless row through the blank-name check below.
        rows = df.values.tolist()
    else:
        rows = df or []

    out = []
    for row in rows:
        if not row or len(row) < 2:
            continue
        name = "" if _is_blank(row[0]) else str(row[0]).strip()
        try:
            hourly = float(row[1])
        except (TypeError, ValueError):
            continue
        # The chained comparison is False for NaN, so it rejects NaN, inf and
        # non-positive prices in one test.
        if not name or not 0 < hourly < float("inf"):
            continue
        out.append((name, hourly))
    return out


def _or_zero(value):
    """Map None (an emptied numeric input) to 0.0; pass other values through."""
    return 0.0 if value is None else value


def compute(in_price, out_price, in_k, out_k, gpu_df, planned_rps):
    """Recompute every output of the app from the current input values.

    Args:
        in_price: Input price, $ per 1M tokens (None is treated as 0).
        out_price: Output price, $ per 1M tokens (None is treated as 0).
        in_k: Input tokens per request, in thousands (None is treated as 0).
        out_k: Output tokens per request, in thousands (None is treated as 0).
        gpu_df: GPU rate table, as accepted by ``parse_gpus``.
        planned_rps: Planned requests per second (None is treated as 0).

    Returns:
        A 4-tuple ``(headline_markdown, break_even_df, workload_df, figure)``.
        When the per-request cost is not positive or no GPU row is valid, the
        two tables are empty and the figure is a placeholder with a hint.
    """
    # A cleared numeric field arrives as None; treat it as zero so the
    # arithmetic below cannot raise and the empty state is shown instead.
    in_price = _or_zero(in_price)
    out_price = _or_zero(out_price)
    in_k = _or_zero(in_k)
    out_k = _or_zero(out_k)
    planned_rps = _or_zero(planned_rps)

    cpr = cost_per_request(in_k, out_k, in_price, out_price)
    gpus = parse_gpus(gpu_df)
    headline = _headline(cpr, in_k, out_k, in_price, out_price)

    # "not cpr > 0" (rather than "cpr <= 0") also catches a NaN cost.
    if not cpr > 0 or not gpus:
        empty_break = pd.DataFrame(columns=BREAK_EVEN_COLUMNS)
        empty_workload = pd.DataFrame(columns=WORKLOAD_COLUMNS)
        return headline, empty_break, empty_workload, _empty_figure()

    # Break-even: the request rate at which the API bill equals the GPU rate.
    break_rows = []
    max_rps = 0.0
    for name, hourly in gpus:
        rph = hourly / cpr
        rps = rph / 3600
        max_rps = max(max_rps, rps)
        break_rows.append({
            "GPU config": name,
            "$/hour": f"${hourly:,.2f}",
            "Break-even req/hr": f"{rph:,.0f}",
            "Break-even RPS": f"{rps:,.3f}",
        })
    break_df = pd.DataFrame(break_rows)

    # Hourly bill at the planned rate: the API row first, then each GPU option.
    api_hourly = planned_rps * 3600 * cpr
    workload_rows = [{
        "Option": "API",
        "$ / hour": f"${api_hourly:,.2f}",
        "vs API": "—",
    }]
    for name, hourly in gpus:
        diff = hourly - api_hourly
        if abs(diff) < 0.005:
            # Below half a cent the two would display as the same amount.
            note = "break-even"
        elif diff < 0:
            note = f"−${abs(diff):,.2f}/hr cheaper than API"
        else:
            note = f"+${diff:,.2f}/hr pricier than API"
        workload_rows.append({
            "Option": name,
            "$ / hour": f"${hourly:,.2f}",
            "vs API": note,
        })
    workload_df = pd.DataFrame(workload_rows)

    # Leave headroom to the right of the furthest break-even point and of the
    # planned workload; never let the axis collapse below 0.1 RPS.
    x_max = max(max_rps * 1.6, planned_rps * 1.3, 0.1)
    fig = _build_figure(cpr, gpus, x_max, planned_rps)
    return headline, break_df, workload_df, fig


def _headline(cpr, in_k, out_k, in_price, out_price):
    """Return the Markdown headline showing the per-request cost and formula."""
    return (
        f"### API cost per request: **${cpr:,.6f}** \n"
        f"_({int(in_k * 1000):,} in × ${in_price}/1M + {int(out_k * 1000):,} out × ${out_price}/1M)_"
    )


def _empty_figure():
    """Return a placeholder figure carrying a hint about the required inputs."""
    fig = go.Figure()
    fig.update_layout(
        template="plotly_white",
        height=480,
        annotations=[dict(
            text="Set positive values for tokens, prices, and at least one GPU row.",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )],
    )
    return fig


def _build_figure(cpr, gpus, x_max, planned_rps):
    """Build the cost chart: API cost line, GPU rate lines and workload marker.

    Args:
        cpr: API cost per request in dollars (expected to be positive).
        gpus: List of ``(name, hourly)`` pairs, as returned by ``parse_gpus``.
        x_max: Right edge of the x axis, in requests per second.
        planned_rps: The user's planned requests per second.

    Returns:
        A ``plotly.graph_objects.Figure``.
    """
    # Plotly hover text breaks lines only on "<br>"; raw newlines do not.
    n = 200
    xs = [x_max * i / (n - 1) for i in range(n)]
    api_costs = [r * 3600 * cpr for r in xs]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=xs,
        y=api_costs,
        mode="lines",
        name="API cost",
        line=dict(color="#222f3e", width=3),
        hovertemplate="RPS: %{x:.3f}<br>API $/hr: $%{y:,.2f}",
    ))

    # Height used for the vertical workload line and its label.
    y_max = max(api_costs[-1], max(h for _, h in gpus)) * 1.18

    for i, (name, hourly) in enumerate(gpus):
        color = GPU_COLORS[i % len(GPU_COLORS)]
        # Flat horizontal line: GPU rental cost does not depend on request rate.
        fig.add_trace(go.Scatter(
            x=[0, x_max],
            y=[hourly, hourly],
            mode="lines",
            name=f"{name} (${hourly:.2f}/hr)",
            line=dict(color=color, width=2, dash="dash"),
            hovertemplate=f"{name}<br>$/hr: ${hourly:,.2f}",
        ))
        rph = hourly / cpr
        rps = rph / 3600
        if rps <= x_max:
            # Marker where this GPU line crosses the API cost line.
            fig.add_trace(go.Scatter(
                x=[rps],
                y=[hourly],
                mode="markers+text",
                marker=dict(color=color, size=11, line=dict(color="white", width=2)),
                text=[f" {rps:.3f} RPS"],
                textposition="middle right",
                textfont=dict(color=color, size=12),
                showlegend=False,
                # The empty <extra> tag hides the secondary hover box, which
                # would otherwise show "trace N" for this unnamed trace.
                hovertemplate=(
                    f"{name} break-even<br>"
                    f"RPS: {rps:.3f}<br>"
                    f"req/hr: {rph:,.0f}<br>"
                    f"$/hr: ${hourly:,.2f}<extra></extra>"
                ),
            ))

    # Vertical dotted line, label and diamond marking the planned workload.
    api_at = planned_rps * 3600 * cpr
    fig.add_shape(
        type="line",
        x0=planned_rps,
        x1=planned_rps,
        y0=0,
        y1=y_max,
        line=dict(color=WORKLOAD_COLOR, width=2, dash="dot"),
    )
    fig.add_annotation(
        x=planned_rps,
        y=y_max,
        text=f"your workload: {planned_rps:.2f} RPS",
        showarrow=False,
        font=dict(color=WORKLOAD_COLOR, size=12),
        yshift=8,
    )
    fig.add_trace(go.Scatter(
        x=[planned_rps],
        y=[api_at],
        mode="markers",
        marker=dict(
            color=WORKLOAD_COLOR,
            size=11,
            symbol="diamond",
            line=dict(color="white", width=2),
        ),
        name="Your workload (on API)",
        hovertemplate=f"At {planned_rps:.2f} RPS<br>API $/hr: ${api_at:,.2f}",
    ))

    fig.update_layout(
        template="plotly_white",
        height=480,
        margin=dict(l=60, r=30, t=70, b=50),
        xaxis=dict(title="Requests per second", range=[0, x_max]),
        yaxis=dict(title="$ / hour", rangemode="tozero"),
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="left", x=0),
        title=dict(
            text="Cloud GPU $/hr vs API $/hr — where lines cross is break-even",
            font=dict(size=14),
        ),
    )
    return fig


def apply_model_preset(preset_name, cur_in, cur_out):
    """Return the (input, output) prices for a preset.

    Presets mapped to None (e.g. "Custom") and unknown names return the
    current prices unchanged.
    """
    p = MODEL_PRESETS.get(preset_name)
    if p is None:
        return cur_in, cur_out
    return p[0], p[1]


def apply_cloud_preset(preset_name, cur_df):
    """Return the GPU table rows for a preset.

    Presets mapped to None (e.g. "Custom") and unknown names return the
    current table unchanged.
    """
    p = CLOUD_PRESETS.get(preset_name)
    if p is None:
        return cur_df
    return p


def reset_all():
    """Return default values for every input, in the order of ``reset_outputs``."""
    return (
        DEFAULT_MODEL,
        DEFAULT_CLOUD,
        MODEL_PRESETS[DEFAULT_MODEL][0],
        MODEL_PRESETS[DEFAULT_MODEL][1],
        DEFAULT_IN_K,
        DEFAULT_OUT_K,
        CLOUD_PRESETS[DEFAULT_CLOUD],
        DEFAULT_RPS,
    )


with gr.Blocks(title="Cloud bills vs API bills") as demo:
    gr.Markdown(
        "# Cloud bills vs API bills\n"
        "At what request rate does renting GPUs beat paying per token? "
        "Drag the **Your workload** slider to see live cost at your planned scale."
    )

    with gr.Row():
        # Left column: all inputs except the workload slider.
        with gr.Column(scale=1):
            gr.Markdown("### Model & API pricing")
            model_preset = gr.Dropdown(
                choices=list(MODEL_PRESETS.keys()),
                value=DEFAULT_MODEL,
                label="Model preset",
                info="Pick a preset or switch to Custom to enter your own prices.",
            )
            in_price = gr.Number(
                value=MODEL_PRESETS[DEFAULT_MODEL][0],
                label="Input $ / 1M tokens",
                precision=4,
                info="Effective input price (post-cache for OpenRouter-style providers).",
            )
            out_price = gr.Number(
                value=MODEL_PRESETS[DEFAULT_MODEL][1],
                label="Output $ / 1M tokens",
                precision=4,
            )

            gr.Markdown("### Request shape")
            in_tokens_k = gr.Slider(
                1, 256,
                value=DEFAULT_IN_K,
                step=1,
                label="Input tokens / request (k)",
                info="64 means 64,000 tokens. Slide for typical context size.",
            )
            out_tokens_k = gr.Slider(
                0.1, 32,
                value=DEFAULT_OUT_K,
                step=0.1,
                label="Output tokens / request (k)",
                info="4 means 4,000 tokens.",
            )

            gr.Markdown("### Cloud GPU rates")
            cloud_preset = gr.Dropdown(
                choices=list(CLOUD_PRESETS.keys()),
                value=DEFAULT_CLOUD,
                label="Cloud provider preset",
                info="Edit the table below to match your contract.",
            )
            gpu_df = gr.Dataframe(
                value=CLOUD_PRESETS[DEFAULT_CLOUD],
                headers=["Config", "$ / hour"],
                datatype=["str", "number"],
                column_count=(2, "fixed"),
                row_count=(3, "dynamic"),
                interactive=True,
            )
            reset_btn = gr.Button("↺ Reset to defaults", variant="secondary", size="sm")

        # Right column: workload slider and all computed outputs.
        with gr.Column(scale=2):
            gr.Markdown("### Your workload")
            planned_rps = gr.Slider(
                0, 5,
                value=DEFAULT_RPS,
                step=0.05,
                label="Planned requests / second",
                info="What scale do you expect to run at? The dotted line on the chart marks this point.",
            )
            workload_table = gr.Dataframe(
                headers=["Option", "$ / hour", "vs API"],
                interactive=False,
                wrap=True,
            )
            gr.Markdown("### Break-even points")
            headline = gr.Markdown()
            break_table = gr.Dataframe(
                headers=["GPU config", "$/hour", "Break-even req/hr", "Break-even RPS"],
                interactive=False,
                wrap=True,
            )
            chart = gr.Plot()

    inputs = [in_price, out_price, in_tokens_k, out_tokens_k, gpu_df, planned_rps]
    outputs = [headline, break_table, workload_table, chart]

    # Any input change recomputes all outputs.
    for component in inputs:
        component.change(compute, inputs=inputs, outputs=outputs)

    model_preset.change(
        apply_model_preset,
        inputs=[model_preset, in_price, out_price],
        outputs=[in_price, out_price],
    )
    cloud_preset.change(
        apply_cloud_preset,
        inputs=[cloud_preset, gpu_df],
        outputs=[gpu_df],
    )

    # Order must match the tuple returned by reset_all().
    reset_outputs = [
        model_preset,
        cloud_preset,
        in_price,
        out_price,
        in_tokens_k,
        out_tokens_k,
        gpu_df,
        planned_rps,
    ]
    reset_btn.click(reset_all, outputs=reset_outputs).then(
        compute, inputs=inputs, outputs=outputs
    )

    # Populate the outputs once when the page first loads.
    demo.load(compute, inputs=inputs, outputs=outputs)


if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft())