Create app.py
app.py ADDED
import gradio as gr


def estimate_transformer_stats(batch_size, seq_len, num_layers, hidden_dim, vocab_size, show_breakdown):
    # gr.Number passes values as floats; cast to int so the formulas below print cleanly
    B = int(batch_size)
    S = int(seq_len)
    L = int(num_layers)
    D = int(hidden_dim)
    V = int(vocab_size)

    # --- Parameters ---
    # 12 * D^2 per layer (4 * D^2 attention projections + 8 * D^2 FFN), plus the D * V embedding matrix
    num_params = L * 12 * (D ** 2) + D * V
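    # Illustrative check with the default UI values (L=24, D=2048, V=50272, close to OPT-1.3B):
    #   12 * 24 * 2048^2 + 2048 * 50272 ≈ 1.21e9 + 1.03e8 ≈ 1.31e9 parameters.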

    # --- FLOPs --- (using 2 * m * n * p FLOPs per m x n by n x p matmul)
    attn_proj_flops = 2 * 3 * S * D * D    # Q, K, V projections
    attn_score_flops = 2 * S * D * S       # attention scores Q @ K^T
    attn_out_proj_flops = 2 * S * D * D    # attention output projection
    ffn_flops = 2 * 2 * S * D * 4 * D      # FFN: D -> 4D and 4D -> D matmuls
    logit_flops = 2 * S * D * V / L        # output logits, amortized per layer (see note below)
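    # Note: the logit matmul happens once per forward pass, not once per layer.
    # Dividing by L folds it into the per-layer total so the factor of L applied
    # below counts the full 2 * S * D * V cost exactly once.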

    total_layer_flops = attn_proj_flops + attn_score_flops + attn_out_proj_flops + ffn_flops + logit_flops
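    # With the default inputs this comes to roughly 2.4e11 forward FLOPs per layer per sequence (illustrative only).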
    # Forward + backward is roughly 3x the forward cost; the factor of 2 per
    # multiply-add is already counted above, so the multiplier here is 3, not 6.
    total_flops = 3 * B * L * total_layer_flops
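    # Cross-check: dropping the attention-score term, this reduces exactly to the
    # familiar rule of thumb  training FLOPs ≈ 6 * num_params * tokens,
    # with tokens = B * S and num_params = 12 * L * D^2 + D * V.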

    output_lines = [
        "Parameters: P = 12 * L * D^2 + D * V",
        f"              = 12 * {L} * {D}^2 + {D} * {V} = {num_params:.2e}",
        "",
        "FLOPs per layer (per sequence):",
        f"  Attention Projections (QKV): 2 * 3 * S * D^2 = 2 * 3 * {S} * {D}^2 = {attn_proj_flops:.2e}",
        f"  Attention Scores (QKᵀ):      2 * S * D * S = 2 * {S} * {D} * {S} = {attn_score_flops:.2e}",
        f"  Attention Output Proj:       2 * S * D^2 = 2 * {S} * {D}^2 = {attn_out_proj_flops:.2e}",
        f"  Feedforward Network:         2 * 2 * S * D * 4D = 2 * 2 * {S} * {D} * {4 * D} = {ffn_flops:.2e}",
        f"  Logits (amortized per layer): 2 * S * D * V / L = 2 * {S} * {D} * {V} / {L} = {logit_flops:.2e}",
        "",
        f"Layer Total FLOPs = {total_layer_flops:.2e}",
        "",
        "Total Training FLOPs = 3 * B * L * Layer_FLOPs   (forward + backward ≈ 3x forward)",
        f"                     = 3 * {B} * {L} * {total_layer_flops:.2e} = {total_flops:.2e}",
    ]

    if show_breakdown:
        output_lines.append("\nComponent-wise forward FLOPs across the batch:")
        output_lines.append(f"  - QKV Projections:  {attn_proj_flops * B * L:.2e}")
        output_lines.append(f"  - Attention Scores: {attn_score_flops * B * L:.2e}")
        output_lines.append(f"  - Attention Output: {attn_out_proj_flops * B * L:.2e}")
        output_lines.append(f"  - FFN:              {ffn_flops * B * L:.2e}")
        output_lines.append(f"  - Logits:           {logit_flops * B * L:.2e}")

    return "\n".join(output_lines)

iface = gr.Interface(
    fn=estimate_transformer_stats,
    inputs=[
        gr.Number(label="Batch Size", value=32),
        gr.Number(label="Sequence Length", value=2048),
        gr.Number(label="Number of Layers", value=24),
        gr.Number(label="Hidden Size (d_model)", value=2048),
        gr.Number(label="Vocabulary Size", value=50272),
        gr.Checkbox(label="Show FLOPs Breakdown", value=True),
    ],
    outputs=gr.Textbox(label="Estimates"),
    title="Transformer Parameter and FLOPs Estimator",
    description="Estimates parameter count and training FLOPs for decoder-only Transformers (like OPT/GPT). Shows formulas and per-component breakdown.",
)

if __name__ == "__main__":
    iface.launch()
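For a quick sanity check outside the Gradio UI, the estimator can also be called directly from Python. The snippet below is only illustrative; it assumes the file above is saved as app.py and uses the interface's default values.

# sanity_check.py (illustrative)
from app import estimate_transformer_stats

print(estimate_transformer_stats(
    batch_size=32,
    seq_len=2048,
    num_layers=24,
    hidden_dim=2048,
    vocab_size=50272,
    show_breakdown=True,
))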