import gradio as gr
import math

# ------------------------
# GPU presets: TFLOPs (units: TFLOPs)
# ------------------------
GPUS = {
    # Turing / consumer
    "RTX 2060":        {"FP32":  6.50,  "FP16":  13.00,  "INT4":   0.0},
    "RTX 2060 12GB":   {"FP32":  7.20,  "FP16":  14.40,  "INT4":   0.0},
    "RTX 2060 SUPER":  {"FP32":  8.90,  "FP16":  17.80,  "INT4":   0.0},
    "RTX 2070":        {"FP32":  8.90,  "FP16":  16.00,  "INT4":   0.0},
    "RTX 2070 SUPER":  {"FP32":  9.10,  "FP16":  18.20,  "INT4":   0.0},
    "RTX 2080":        {"FP32": 10.10,  "FP16":  20.20,  "INT4":   0.0},
    "RTX 2080 SUPER":  {"FP32": 11.15,  "FP16":  22.30,  "INT4":   0.0},
    "RTX 2080 Ti":     {"FP32": 13.45,  "FP16":  26.90,  "INT4": 544.0},

    # Ampere / consumer
    "RTX 3050":        {"FP32":  9.10,  "FP16":  18.20,  "INT4":   0.0},
    "RTX 3060":        {"FP32": 12.70,  "FP16":  25.40,  "INT4":   0.0},
    "RTX 3060 Ti":     {"FP32": 16.20,  "FP16":  32.40,  "INT4":   0.0},
    "RTX 3070":        {"FP32": 20.30,  "FP16":  40.60,  "INT4":   0.0},
    "RTX 3070 Ti":     {"FP32": 22.30,  "FP16":  44.60,  "INT4":   0.0},
    "RTX 3080":        {"FP32": 29.80,  "FP16":  59.60,  "INT4": 1248.0},
    "RTX 3080 Ti":     {"FP32": 34.10,  "FP16":  68.20,  "INT4": 1248.0},
    "RTX 3090":        {"FP32": 35.58,  "FP16":  71.16,  "INT4": 1248.0},
    "RTX 3090 Ti":     {"FP32": 40.00,  "FP16":  80.00,  "INT4": 1248.0},

    # Ada / Lovelace consumer
    "RTX 4050":        {"FP32": 16.90,  "FP16":  33.80,  "INT4":   0.0},
    "RTX 4060":        {"FP32": 31.10,  "FP16":  62.20,  "INT4":   0.0},
    "RTX 4060 Ti":     {"FP32": 45.60,  "FP16":  91.20,  "INT4":   0.0},
    "RTX 4070":        {"FP32": 75.00,  "FP16": 150.00,  "INT4":   0.0},
    "RTX 4070 Ti":     {"FP32": 92.20,  "FP16": 184.40,  "INT4":   0.0},
    "RTX 4080":        {"FP32":144.00,  "FP16": 288.00,  "INT4":   0.0},
    "RTX 4080 SUPER":  {"FP32":167.60,  "FP16": 335.20,  "INT4":   0.0},
    "RTX 4090":        {"FP32":201.00,  "FP16": 402.00,  "INT4":1676.0},

    # Blackwell consumer (RTX 50xx series) — placeholder figures mirrored from
    # the corresponding 40xx entries; swap in measured specs when available
    "RTX 5050":        {"FP32": 16.90,  "FP16":  33.80,  "INT4":   0.0},
    "RTX 5060":        {"FP32": 31.10,  "FP16":  62.20,  "INT4":   0.0},
    "RTX 5060 Ti":     {"FP32": 45.60,  "FP16":  91.20,  "INT4":   0.0},
    "RTX 5070":        {"FP32": 75.00,  "FP16": 150.00,  "INT4":   0.0},
    "RTX 5070 Ti":     {"FP32": 92.20,  "FP16": 184.40,  "INT4":   0.0},
    "RTX 5080":        {"FP32":144.00,  "FP16": 288.00,  "INT4":   0.0},
    "RTX 5090":        {"FP32":201.00,  "FP16": 402.00,  "INT4":1676.0},

    # Data center / Tesla / A-series
    "Tesla T4":        {"FP32":  8.10,  "FP16":  65.13,  "INT4":   0.0},
    "Tesla V100":      {"FP32": 15.70,  "FP16":  31.40,  "INT4":   0.0},
    "NVIDIA A10":      {"FP32": 31.20,  "FP16":  62.40,  "INT4":   0.0},
    "A100":            {"FP32": 19.50,  "FP16":  39.00,  "INT4": 624.0},
    "A100 80GB":       {"FP32": 19.50,  "FP16":  39.00,  "INT4": 624.0},

    # Hopper / Blackwell datacenter estimates
    "H100":            {"FP32":300.0,   "FP16": 600.0,   "INT4":3000.0},
    "B100":            {"FP32":400.0,   "FP16": 800.0,   "INT4":4000.0},
    "B200":            {"FP32":500.0,   "FP16":1000.0,   "INT4":5000.0},

    # AMD (kept for completeness)
    "RX 5500 XT":      {"FP32":  5.20,  "FP16":  10.40,  "INT4":   0.0},
    "RX 5600 XT":      {"FP32": 10.80,  "FP16":  21.60,  "INT4":   0.0},
    "RX 5700":         {"FP32": 14.40,  "FP16":  28.80,  "INT4":   0.0},
    "RX 5700 XT":      {"FP32": 16.20,  "FP16":  32.40,  "INT4":   0.0},
    "RX 6600":         {"FP32": 17.90,  "FP16":  35.80,  "INT4":   0.0},
    "RX 6600 XT":      {"FP32": 20.00,  "FP16":  40.00,  "INT4":   0.0},
    "RX 6700 XT":      {"FP32": 23.00,  "FP16":  46.00,  "INT4":   0.0},
    "RX 6800":         {"FP32": 30.00,  "FP16":  60.00,  "INT4":   0.0},
    "RX 6800 XT":      {"FP32": 34.00,  "FP16":  68.00,  "INT4":   0.0},
    "RX 6900 XT":      {"FP32": 40.00,  "FP16":  80.00,  "INT4":   0.0},
    "RX 7600":         {"FP32": 25.00,  "FP16":  50.00,  "INT4":   0.0},
    "RX 7700 XT":      {"FP32": 35.00,  "FP16":  70.00,  "INT4":   0.0},
    "RX 7900 XT":      {"FP32": 40.00,  "FP16":  80.00,  "INT4":   0.0},
    "RX 7900 XTX":     {"FP32": 61.10,  "FP16": 122.20,  "INT4":   0.0},

    # AMD MI / CDNA datacenter
    "MI50":            {"FP32": 13.70,  "FP16":  27.40,  "INT4":   0.0},
    "MI100":           {"FP32": 23.10,  "FP16":  46.20,  "INT4":   0.0},
    "MI200":           {"FP32": 300.0,  "FP16": 600.0,   "INT4":3000.0},
    "MI300":           {"FP32": 400.0,  "FP16": 800.0,   "INT4":4000.0},
    "MI355X":          {"FP32": 157,    "FP16": 2500,    "INT4": 10000},

    # Hopper / Grace superchips
    "H200":            {"FP32": 350.0,  "FP16": 700.0,  "INT4": 3500.0},
    "GH200":           {"FP32": 300.0,  "FP16": 600.0,  "INT4": 3000.0},  # H100-class GPU + Grace CPU
    "GB10":            {"FP32": 400.0,  "FP16": 800.0,  "INT4": 4000.0},  # dev module, Blackwell-class

    # Datacenter inference cards (L20 is Ada Lovelace; A40 and A2 are Ampere)
    "L20":             {"FP32": 44.0,   "FP16":  88.0,  "INT4":  700.0},
    "A40":             {"FP32": 37.4,   "FP16":  74.8,  "INT4":  600.0},
    "A2":              {"FP32":  4.5,   "FP16":   9.0,  "INT4":  160.0},

    # RTX workstation GPUs (A2000–A5000 are Ampere; the A6000 Ada is Ada)
    "RTX A2000":       {"FP32":  8.0,   "FP16":  16.0,  "INT4":    0.0},
    "RTX A4000":       {"FP32": 19.2,   "FP16":  38.4,  "INT4":    0.0},
    "RTX A4500":       {"FP32": 23.7,   "FP16":  47.4,  "INT4":    0.0},
    "RTX A5000":       {"FP32": 27.8,   "FP16":  55.6,  "INT4":    0.0},
    "RTX A6000 Ada":   {"FP32": 91.1,   "FP16": 182.2,  "INT4": 1450.0},
}
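
# Import-time sanity check (a minimal sketch): every preset should expose the
# three dtype keys offered in the UI, so a typo in the table above fails fast
# instead of silently yielding 0.0 TFLOPs at estimate time.
for _name, _specs in GPUS.items():
    assert {"FP32", "FP16", "INT4"} <= set(_specs), f"missing dtype keys for {_name}"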

# ------------------------
# CSS / Theme variables
# ------------------------
CSS = r"""
:root { --bg:#071233; --card:#07112a; --accent:#2563eb; --text:#e8f0ff; --muted:#9fb6e8; }
body { background: var(--bg); color:var(--text); font-family: Inter, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
.gradio-container { max-width: 920px; margin: 14px auto; padding: 12px; }

/* card */
.card { background: var(--card); border-radius:12px; padding:14px; box-shadow: 0 8px 26px rgba(2,6,23,0.5); border:1px solid rgba(255,255,255,0.03); }

/* accent and buttons */
.btn-theme { background:transparent; color:var(--accent); border:1px solid var(--accent); padding:8px 12px; border-radius:10px; cursor:pointer; }
.btn-theme:hover { background: rgba(255,255,255,0.02); }

/* result */
.result-box { background: linear-gradient(180deg, rgba(255,255,255,0.01), rgba(255,255,255,0.02)); border-radius:8px; padding:10px; border:1px solid rgba(255,255,255,0.03); color:var(--text); font-weight:600; }

/* small text */
.small-muted { color: var(--muted); font-size:0.92em; }

/* themes */
.theme-blue { --bg:#071233; --card:#07112a; --accent:#2563eb; --text:#e8f0ff; --muted:#9fb6e8; }
.theme-green{ --bg:#07120a; --card:#07120a; --accent:#16a34a; --text:#e8fff0; --muted:#9fe8b0; }
.theme-purple{ --bg:#120521; --card:#15061a; --accent:#8b5cf6; --text:#f2e8ff; --muted:#c9b8f6; }

/* minor Gradio element tweaks */
input[type="number"], .gradio-number { background: transparent; color: var(--text); border-radius:6px; }

/* theme button row */
.theme-btn-row { display:flex; gap:8px; align-items:center; }
"""

# ------------------------
# Core logic
# ------------------------
def estimate_time(params_m: float,
                  tokens_b: float,
                  selected_gpu: str,
                  dtype: str,
                  tf_override: float,
                  utilization_pct: float,
                  gpu_count: float):
    """Estimate wall-clock training time from the 6 * params * tokens FLOPs rule."""
    if params_m <= 0 or tokens_b <= 0:
        return "Enter positive values for parameters and tokens."

    if gpu_count is None or gpu_count <= 0:
        return "Enter a positive number of GPUs."

    params = params_m * 1e6
    tokens = tokens_b * 1e9

    # Choose per-GPU TFLOPs. The UI pre-fills tf_override with the preset
    # value, so treat it as a manual override only when it differs from the
    # preset for the selected GPU / dtype.
    preset_tf = float(GPUS.get(selected_gpu, {}).get(dtype, 0.0))
    if tf_override and tf_override > 0 and not math.isclose(float(tf_override), preset_tf, rel_tol=1e-6):
        chosen_tf_per_gpu = float(tf_override)
        source = "manual override"
    elif preset_tf > 0:
        chosen_tf_per_gpu = preset_tf
        source = f"preset ({selected_gpu} / {dtype})"
    else:
        return "Couldn't determine GPU TFLOPs. Pick a GPU or enter TFLOPs manually."

    # multiply by count and utilization -> FLOPs/sec
    total_tf = chosen_tf_per_gpu * float(gpu_count)
    gpu_flops_per_sec = total_tf * 1e12 * (max(0.001, utilization_pct / 100.0))

    # Standard approximation: training FLOPs ≈ 6 * N * D for N parameters and
    # D tokens (roughly 2ND for the forward pass, 4ND for the backward pass).
    flops_total = 6 * params * tokens
    seconds = flops_total / gpu_flops_per_sec
    hours = seconds / 3600.0
    days = hours / 24.0

    # Rough step count: assumes a fixed 2048-token context and one sequence
    # per optimizer step (batch size is ignored).
    seq_len = 2048.0
    steps = max(1.0, tokens / seq_len)
    flops_per_step = flops_total / steps  # steps >= 1.0, so no zero guard needed

    # warnings for absurd counts
    warnings = []
    if gpu_count >= 10000:
        warnings.append("⚠️ Wow that's a lot of GPUs — are you sure? Check units (e.g., 8 not 800k).")
    if total_tf > 1e6:
        warnings.append("⚠️ Total TFLOPs exceed 1e6 TFLOPs (exaFLOPs scale) — results are rough estimates.")

    out = [
        f"🔥 Roman's Training Time Estimator",
        "",
        f"Model params: {params_m:,.1f} M",
        f"Training tokens: {tokens_b:,.3f} B",
        f"Total training FLOPs (approx): {flops_total:.3e}",
        "",
        f"Hardware source: {source}",
        f"Per-GPU TFLOPs: {chosen_tf_per_gpu:.3f} TFLOPs",
        f"GPU count: {int(gpu_count):,}",
        f"Total effective TFLOPs (before utilization): {total_tf:,.3f} TFLOPs",
        f"Utilization: {utilization_pct:.0f}%",
        "",
        f"⏱️ Wall-clock estimate: {hours:,.2f} hours (~{days:,.2f} days)",
        f"Steps (rough, seq_len=2048): {steps:,.0f} steps",
        f"FLOPs / step (avg): {flops_per_step:.3e}",
    ]

    if warnings:
        out.append("")
        out.extend(warnings)

    if source == "manual override":
        out.append("")
        out.append("⚠️ Note: you overrode the preset TFLOPs. Make sure the value is in TFLOPs (e.g., 39 for the A100 FP16 preset).")

    return "\n".join(out)

def preset_tf_for_ui(selected_gpu: str, dtype: str):
    if selected_gpu in GPUS:
        return GPUS[selected_gpu].get(dtype, 0.0)
    return 0.0

# ------------------------
# Build UI
# ------------------------
# Inline HTML for theme buttons with client-side onclick handlers
THEME_BUTTONS_HTML = """
<div class="theme-btn-row">
  <button class="btn-theme" onclick="document.documentElement.className='theme-blue'">Blue</button>
  <button class="btn-theme" onclick="document.documentElement.className='theme-green'">Green</button>
  <button class="btn-theme" onclick="document.documentElement.className='theme-purple'">Purple</button>
</div>
"""

with gr.Blocks(css=CSS) as demo:
    # initial theme set (runs immediately on load)
    gr.HTML("<script>document.documentElement.className='theme-blue';</script>")

    with gr.Column(elem_classes="card"):
        with gr.Row():
            gr.Markdown("## 🧠 Roman’s Training Time Estimator")
            # render the theme buttons as raw HTML so onclick works client-side instantly
            gr.HTML(THEME_BUTTONS_HTML)

    with gr.Column(elem_classes="card"):
        gr.Markdown("### Model & Hardware")
        with gr.Row():
            params = gr.Slider(minimum=1, maximum=20000, value=100, step=0.1, label="Model Parameters (Millions)")
            tokens = gr.Number(value=1.0, label="Training Tokens (Billions)")
        with gr.Row():
            gpu_dropdown = gr.Dropdown(choices=list(GPUS.keys()), value="A100 80GB", label="GPU Preset (changes TFLOPs below)")
            dtype_dropdown = gr.Dropdown(choices=["FP32", "FP16", "INT4"], value="FP16", label="Training Precision / DType")
        with gr.Row():
            tf_override = gr.Number(value=preset_tf_for_ui("A100 80GB", "FP16"), label="GPU TFLOPs (teraFLOPs) — editable", precision=3)
            utilization = gr.Slider(minimum=1, maximum=100, value=80, step=1, label="Hardware Utilization (%) — realistic throughput")
        with gr.Row():
            gpu_count = gr.Number(value=1, label="GPU Count (how many of the chosen preset you have)", precision=0)

    with gr.Column(elem_classes="card"):
        gr.Markdown("### Estimate")
        result = gr.Textbox(lines=14, interactive=False, elem_classes="result-box", label="Result")
        run_btn = gr.Button("Estimate Training Time", elem_classes="btn-theme")

    # update TF override when gpu/dtype change
    def _update_tf(selected_gpu, dtype):
        return gr.update(value=preset_tf_for_ui(selected_gpu, dtype))
    gpu_dropdown.change(_update_tf, inputs=[gpu_dropdown, dtype_dropdown], outputs=[tf_override])
    dtype_dropdown.change(_update_tf, inputs=[gpu_dropdown, dtype_dropdown], outputs=[tf_override])

    # Run button computes estimate
    run_btn.click(estimate_time,
                  inputs=[params, tokens, gpu_dropdown, dtype_dropdown, tf_override, utilization, gpu_count],
                  outputs=[result])

    gr.HTML("<div class='small-muted'>Tip: GPU presets are TFLOPs per dtype. You can edit the TFLOPs number to override. Utilization reduces theoretical peak to realistic throughput.</div>")
    gr.HTML("<div class='small-muted'>Thanks to the contributions from Reality123b</div>")

# CSS is attached to the Blocks constructor above; launch() takes no css kwarg.
if __name__ == "__main__":
    demo.launch()