rubenaghayan committed on
Commit
64abcca
·
1 Parent(s): 15f0f2b

better defaults and validation section

Browse files
Files changed (3) hide show
  1. app.py +18 -18
  2. calculator.py +3 -4
  3. details.py +6 -0
app.py CHANGED
@@ -5,7 +5,7 @@ import gradio as gr
5
  import pandas as pd
6
  from functools import partial
7
  from defaults import DEFAULTS
8
- from details import DETAILS, INSTRUCTIONS, LIMITATIONS
9
  from state import Model, Parallelism, Training
10
  from calculator import MemoryCalculation
11
  from dtypes import DType
@@ -22,14 +22,12 @@ def create_parallelism_block():
22
  cp = NaturalNumber(label="Context Parallelism", value=1)
23
  ep = NaturalNumber(label="Expert Parallelism", value=1)
24
 
25
- fsdp_enabled = gr.Checkbox(label="FSDP (Fully Sharded Data Parallel)", value=False)
26
- fsdp_parallelism = NaturalNumber(label="FSDP Parallelism", value=1, interactive=False, elem_classes="disabled-field")
27
  fsdp_strategy = gr.Radio(
28
  choices=["Zero-1", "Zero-2", "Zero-3"],
29
  label="FSDP Strategy",
30
- value="Zero-1",
31
- interactive=False,
32
- elem_classes="disabled-field"
33
  )
34
 
35
  # Toggle FSDP fields interactivity based on FSDP checkbox
@@ -48,13 +46,13 @@ def create_parallelism_block():
48
  def create_model_block():
49
  with gr.Column():
50
  gr.Markdown("# Model Architecture")
51
- layers = NaturalNumber(label="Number of Layers", value=48)
52
- vocab = NaturalNumber(label="Vocab Size", value=262144)
53
- hidden = NaturalNumber(label="Hidden Dim", value=3840)
54
- intermediate = NaturalNumber(label="Intermediate Dim", value=15360)
55
  is_moe = gr.Checkbox(label="Mixture of Experts (MoE)", value=False)
56
- active_experts = NaturalNumber(label="Active Experts", value=2, interactive=False, elem_classes="disabled-field")
57
- total_experts = NaturalNumber(label="Total Experts", value=8, interactive=False, elem_classes="disabled-field")
58
  weight_tied_embeddings = gr.Checkbox(label="Weight Tied Embeddings", value=True)
59
 
60
  # Toggle expert fields interactivity based on MoE checkbox
@@ -67,7 +65,7 @@ def create_model_block():
67
  outputs=[active_experts, total_experts]
68
  )
69
 
70
- presets = gr.Dropdown(["Custom"] + list(DEFAULTS.keys()), label="Presets", value="Gemma3 12B", interactive=True)
71
 
72
  # Populate model parameters when preset is selected
73
  def populate_from_preset(preset_name):
@@ -123,12 +121,12 @@ def create_model_block():
123
  def create_training_block():
124
  with gr.Column():
125
  gr.Markdown("# Training Config")
126
- seq_len = NaturalNumber(label="Sequence Length", value=8192)
127
- batch_size = NaturalNumber(label="Batch Size", info="If you are using gradient accumulation, enter microbatch size", value=8)
128
  with gr.Row():
129
- gradient_checkpointing = gr.Checkbox(label="Gradient Checkpointing", value=False)
130
  grad_accumulation = gr.Checkbox(label="Gradient Accumulation", value=False)
131
- precision = gr.Dropdown(DType.values(), label="Precision", value=DType.FP32.value, interactive=True)
132
  mixed_precision = gr.Checkbox(label="Mixed Precision", value=False)
133
  param_dtype = gr.Dropdown(DType.values(), label="Parameter Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
134
  reduce_dtype = gr.Dropdown(DType.values(), label="Reduce Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
@@ -313,5 +311,7 @@ with gr.Blocks(theme='Default', css=css) as demo:
313
  with gr.Row():
314
  gr.Markdown(LIMITATIONS)
315
  gr.Markdown(DETAILS)
 
 
316
 
317
- demo.launch()
 
5
  import pandas as pd
6
  from functools import partial
7
  from defaults import DEFAULTS
8
+ from details import ACCURACY, DETAILS, INSTRUCTIONS, LIMITATIONS
9
  from state import Model, Parallelism, Training
10
  from calculator import MemoryCalculation
11
  from dtypes import DType
 
22
  cp = NaturalNumber(label="Context Parallelism", value=1)
23
  ep = NaturalNumber(label="Expert Parallelism", value=1)
24
 
25
+ fsdp_enabled = gr.Checkbox(label="FSDP (Fully Sharded Data Parallel)", value=True)
26
+ fsdp_parallelism = NaturalNumber(label="FSDP Parallelism", value=8)
27
  fsdp_strategy = gr.Radio(
28
  choices=["Zero-1", "Zero-2", "Zero-3"],
29
  label="FSDP Strategy",
30
+ value="Zero-3"
 
 
31
  )
32
 
33
  # Toggle FSDP fields interactivity based on FSDP checkbox
 
46
  def create_model_block():
47
  with gr.Column():
48
  gr.Markdown("# Model Architecture")
49
+ layers = NaturalNumber(label="Number of Layers", value=32)
50
+ vocab = NaturalNumber(label="Vocab Size", value=128256)
51
+ hidden = NaturalNumber(label="Hidden Dim", value=4096)
52
+ intermediate = NaturalNumber(label="Intermediate Dim", value=14336)
53
  is_moe = gr.Checkbox(label="Mixture of Experts (MoE)", value=False)
54
+ active_experts = NaturalNumber(label="Active Experts", value=1, interactive=False, elem_classes="disabled-field")
55
+ total_experts = NaturalNumber(label="Total Experts", value=1, interactive=False, elem_classes="disabled-field")
56
  weight_tied_embeddings = gr.Checkbox(label="Weight Tied Embeddings", value=True)
57
 
58
  # Toggle expert fields interactivity based on MoE checkbox
 
65
  outputs=[active_experts, total_experts]
66
  )
67
 
68
+ presets = gr.Dropdown(["Custom"] + list(DEFAULTS.keys()), label="Presets", value="Llama3 8B", interactive=True)
69
 
70
  # Populate model parameters when preset is selected
71
  def populate_from_preset(preset_name):
 
121
  def create_training_block():
122
  with gr.Column():
123
  gr.Markdown("# Training Config")
124
+ seq_len = NaturalNumber(label="Sequence Length", value=4096)
125
+ batch_size = NaturalNumber(label="Batch Size", info="If you are using gradient accumulation, enter microbatch size", value=1)
126
  with gr.Row():
127
+ gradient_checkpointing = gr.Checkbox(label="Gradient Checkpointing", value=True)
128
  grad_accumulation = gr.Checkbox(label="Gradient Accumulation", value=False)
129
+ precision = gr.Dropdown(DType.values(), label="Precision", value=DType.BF16.value, interactive=True)
130
  mixed_precision = gr.Checkbox(label="Mixed Precision", value=False)
131
  param_dtype = gr.Dropdown(DType.values(), label="Parameter Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
132
  reduce_dtype = gr.Dropdown(DType.values(), label="Reduce Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
 
311
  with gr.Row():
312
  gr.Markdown(LIMITATIONS)
313
  gr.Markdown(DETAILS)
314
+ gr.Markdown("# Validation")
315
+ gr.Markdown(ACCURACY)
316
 
317
+ demo.launch(share=True)
calculator.py CHANGED
@@ -37,14 +37,13 @@ class MemoryCalculation:
37
  )
38
 
39
  # Attention
40
- # weights and biases = *2
41
- layer_norm_attn_in = 2 * h # not tp sharded
42
  qkv = 3 * h * h / tp
43
  attn_output_proj = (h * h + h) / tp
44
  attn = layer_norm_attn_in + qkv + attn_output_proj
45
 
46
  # MLP
47
- layer_norm_mlp_in = 2 * h # not tp sharded
48
  mlp_up_proj = (h * i + i) / tp
49
  mlp_gate_proj = (h * i + i) / tp
50
  mlp_down_proj = (i * h + h) / tp
@@ -77,7 +76,7 @@ class MemoryCalculation:
77
  unembedding = 0
78
  if not self.model.weight_tied_embeddings:
79
  unembedding = h * v / tp
80
- final_layer_norm = 2 * h # not tp sharded
81
  # hush linter
82
  total_params = 0
83
  if pp == 1:
 
37
  )
38
 
39
  # Attention
40
+ layer_norm_attn_in = h # not tp sharded
 
41
  qkv = 3 * h * h / tp
42
  attn_output_proj = (h * h + h) / tp
43
  attn = layer_norm_attn_in + qkv + attn_output_proj
44
 
45
  # MLP
46
+ layer_norm_mlp_in = h # not tp sharded
47
  mlp_up_proj = (h * i + i) / tp
48
  mlp_gate_proj = (h * i + i) / tp
49
  mlp_down_proj = (i * h + h) / tp
 
76
  unembedding = 0
77
  if not self.model.weight_tied_embeddings:
78
  unembedding = h * v / tp
79
+ final_layer_norm = h # not tp sharded
80
  # hush linter
81
  total_params = 0
82
  if pp == 1:
details.py CHANGED
@@ -38,4 +38,10 @@ LIMITATIONS = """
38
  - Kernel/framework overhead and intermediate memory
39
 
40
  For advanced configurations, results should be validated against profiling.
 
 
 
 
 
 
41
  """
 
38
  - Kernel/framework overhead and intermediate memory
39
 
40
  For advanced configurations, results should be validated against profiling.
41
+ """
42
+
43
+
44
+ ACCURACY = """
45
+ I validated this calculator against the projected memory usage in The Ultra-Scale Playbook to within 10%. Some overage is expected since the calculator makes pessimistic assumptions and looks for peak memory. Note that you could still OOM from intermediates!
46
+ I welcome any detailed memory-usage reports, along with configurations and framework details, to tune this further!
47
  """