Spaces:
Sleeping
Sleeping
Commit
·
64abcca
1
Parent(s):
15f0f2b
better defaults and validation section
Browse files
- app.py +18 -18
- calculator.py +3 -4
- details.py +6 -0
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import gradio as gr
|
|
| 5 |
import pandas as pd
|
| 6 |
from functools import partial
|
| 7 |
from defaults import DEFAULTS
|
| 8 |
-
from details import DETAILS, INSTRUCTIONS, LIMITATIONS
|
| 9 |
from state import Model, Parallelism, Training
|
| 10 |
from calculator import MemoryCalculation
|
| 11 |
from dtypes import DType
|
|
@@ -22,14 +22,12 @@ def create_parallelism_block():
|
|
| 22 |
cp = NaturalNumber(label="Context Parallelism", value=1)
|
| 23 |
ep = NaturalNumber(label="Expert Parallelism", value=1)
|
| 24 |
|
| 25 |
-
fsdp_enabled = gr.Checkbox(label="FSDP (Fully Sharded Data Parallel)", value=
|
| 26 |
-
fsdp_parallelism = NaturalNumber(label="FSDP Parallelism", value=
|
| 27 |
fsdp_strategy = gr.Radio(
|
| 28 |
choices=["Zero-1", "Zero-2", "Zero-3"],
|
| 29 |
label="FSDP Strategy",
|
| 30 |
-
value="Zero-
|
| 31 |
-
interactive=False,
|
| 32 |
-
elem_classes="disabled-field"
|
| 33 |
)
|
| 34 |
|
| 35 |
# Toggle FSDP fields interactivity based on FSDP checkbox
|
|
@@ -48,13 +46,13 @@ def create_parallelism_block():
|
|
| 48 |
def create_model_block():
|
| 49 |
with gr.Column():
|
| 50 |
gr.Markdown("# Model Architecture")
|
| 51 |
-
layers = NaturalNumber(label="Number of Layers", value=
|
| 52 |
-
vocab = NaturalNumber(label="Vocab Size", value=
|
| 53 |
-
hidden = NaturalNumber(label="Hidden Dim", value=
|
| 54 |
-
intermediate = NaturalNumber(label="Intermediate Dim", value=
|
| 55 |
is_moe = gr.Checkbox(label="Mixture of Experts (MoE)", value=False)
|
| 56 |
-
active_experts = NaturalNumber(label="Active Experts", value=
|
| 57 |
-
total_experts = NaturalNumber(label="Total Experts", value=
|
| 58 |
weight_tied_embeddings = gr.Checkbox(label="Weight Tied Embeddings", value=True)
|
| 59 |
|
| 60 |
# Toggle expert fields interactivity based on MoE checkbox
|
|
@@ -67,7 +65,7 @@ def create_model_block():
|
|
| 67 |
outputs=[active_experts, total_experts]
|
| 68 |
)
|
| 69 |
|
| 70 |
-
presets = gr.Dropdown(["Custom"] + list(DEFAULTS.keys()), label="Presets", value="
|
| 71 |
|
| 72 |
# Populate model parameters when preset is selected
|
| 73 |
def populate_from_preset(preset_name):
|
|
@@ -123,12 +121,12 @@ def create_model_block():
|
|
| 123 |
def create_training_block():
|
| 124 |
with gr.Column():
|
| 125 |
gr.Markdown("# Training Config")
|
| 126 |
-
seq_len = NaturalNumber(label="Sequence Length", value=
|
| 127 |
-
batch_size = NaturalNumber(label="Batch Size", info="If you are using gradient accumulation, enter microbatch size", value=
|
| 128 |
with gr.Row():
|
| 129 |
-
gradient_checkpointing = gr.Checkbox(label="Gradient Checkpointing", value=
|
| 130 |
grad_accumulation = gr.Checkbox(label="Gradient Accumulation", value=False)
|
| 131 |
-
precision = gr.Dropdown(DType.values(), label="Precision", value=DType.
|
| 132 |
mixed_precision = gr.Checkbox(label="Mixed Precision", value=False)
|
| 133 |
param_dtype = gr.Dropdown(DType.values(), label="Parameter Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
|
| 134 |
reduce_dtype = gr.Dropdown(DType.values(), label="Reduce Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
|
|
@@ -313,5 +311,7 @@ with gr.Blocks(theme='Default', css=css) as demo:
|
|
| 313 |
with gr.Row():
|
| 314 |
gr.Markdown(LIMITATIONS)
|
| 315 |
gr.Markdown(DETAILS)
|
|
|
|
|
|
|
| 316 |
|
| 317 |
-
demo.launch()
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
from functools import partial
|
| 7 |
from defaults import DEFAULTS
|
| 8 |
+
from details import ACCURACY, DETAILS, INSTRUCTIONS, LIMITATIONS
|
| 9 |
from state import Model, Parallelism, Training
|
| 10 |
from calculator import MemoryCalculation
|
| 11 |
from dtypes import DType
|
|
|
|
| 22 |
cp = NaturalNumber(label="Context Parallelism", value=1)
|
| 23 |
ep = NaturalNumber(label="Expert Parallelism", value=1)
|
| 24 |
|
| 25 |
+
fsdp_enabled = gr.Checkbox(label="FSDP (Fully Sharded Data Parallel)", value=True)
|
| 26 |
+
fsdp_parallelism = NaturalNumber(label="FSDP Parallelism", value=8)
|
| 27 |
fsdp_strategy = gr.Radio(
|
| 28 |
choices=["Zero-1", "Zero-2", "Zero-3"],
|
| 29 |
label="FSDP Strategy",
|
| 30 |
+
value="Zero-3"
|
|
|
|
|
|
|
| 31 |
)
|
| 32 |
|
| 33 |
# Toggle FSDP fields interactivity based on FSDP checkbox
|
|
|
|
| 46 |
def create_model_block():
|
| 47 |
with gr.Column():
|
| 48 |
gr.Markdown("# Model Architecture")
|
| 49 |
+
layers = NaturalNumber(label="Number of Layers", value=32)
|
| 50 |
+
vocab = NaturalNumber(label="Vocab Size", value=128256)
|
| 51 |
+
hidden = NaturalNumber(label="Hidden Dim", value=4096)
|
| 52 |
+
intermediate = NaturalNumber(label="Intermediate Dim", value=14336)
|
| 53 |
is_moe = gr.Checkbox(label="Mixture of Experts (MoE)", value=False)
|
| 54 |
+
active_experts = NaturalNumber(label="Active Experts", value=1, interactive=False, elem_classes="disabled-field")
|
| 55 |
+
total_experts = NaturalNumber(label="Total Experts", value=1, interactive=False, elem_classes="disabled-field")
|
| 56 |
weight_tied_embeddings = gr.Checkbox(label="Weight Tied Embeddings", value=True)
|
| 57 |
|
| 58 |
# Toggle expert fields interactivity based on MoE checkbox
|
|
|
|
| 65 |
outputs=[active_experts, total_experts]
|
| 66 |
)
|
| 67 |
|
| 68 |
+
presets = gr.Dropdown(["Custom"] + list(DEFAULTS.keys()), label="Presets", value="Llama3 8B", interactive=True)
|
| 69 |
|
| 70 |
# Populate model parameters when preset is selected
|
| 71 |
def populate_from_preset(preset_name):
|
|
|
|
| 121 |
def create_training_block():
|
| 122 |
with gr.Column():
|
| 123 |
gr.Markdown("# Training Config")
|
| 124 |
+
seq_len = NaturalNumber(label="Sequence Length", value=4096)
|
| 125 |
+
batch_size = NaturalNumber(label="Batch Size", info="If you are using gradient accumulation, enter microbatch size", value=1)
|
| 126 |
with gr.Row():
|
| 127 |
+
gradient_checkpointing = gr.Checkbox(label="Gradient Checkpointing", value=True)
|
| 128 |
grad_accumulation = gr.Checkbox(label="Gradient Accumulation", value=False)
|
| 129 |
+
precision = gr.Dropdown(DType.values(), label="Precision", value=DType.BF16.value, interactive=True)
|
| 130 |
mixed_precision = gr.Checkbox(label="Mixed Precision", value=False)
|
| 131 |
param_dtype = gr.Dropdown(DType.values(), label="Parameter Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
|
| 132 |
reduce_dtype = gr.Dropdown(DType.values(), label="Reduce Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
|
|
|
|
| 311 |
with gr.Row():
|
| 312 |
gr.Markdown(LIMITATIONS)
|
| 313 |
gr.Markdown(DETAILS)
|
| 314 |
+
gr.Markdown("# Validation")
|
| 315 |
+
gr.Markdown(ACCURACY)
|
| 316 |
|
| 317 |
+
demo.launch(share=True)
|
calculator.py
CHANGED
|
@@ -37,14 +37,13 @@ class MemoryCalculation:
|
|
| 37 |
)
|
| 38 |
|
| 39 |
# Attention
|
| 40 |
-
|
| 41 |
-
layer_norm_attn_in = 2 * h # not tp sharded
|
| 42 |
qkv = 3 * h * h / tp
|
| 43 |
attn_output_proj = (h * h + h) / tp
|
| 44 |
attn = layer_norm_attn_in + qkv + attn_output_proj
|
| 45 |
|
| 46 |
# MLP
|
| 47 |
-
layer_norm_mlp_in =
|
| 48 |
mlp_up_proj = (h * i + i) / tp
|
| 49 |
mlp_gate_proj = (h * i + i) / tp
|
| 50 |
mlp_down_proj = (i * h + h) / tp
|
|
@@ -77,7 +76,7 @@ class MemoryCalculation:
|
|
| 77 |
unembedding = 0
|
| 78 |
if not self.model.weight_tied_embeddings:
|
| 79 |
unembedding = h * v / tp
|
| 80 |
-
final_layer_norm =
|
| 81 |
# hush linter
|
| 82 |
total_params = 0
|
| 83 |
if pp == 1:
|
|
|
|
| 37 |
)
|
| 38 |
|
| 39 |
# Attention
|
| 40 |
+
layer_norm_attn_in = h # not tp sharded
|
|
|
|
| 41 |
qkv = 3 * h * h / tp
|
| 42 |
attn_output_proj = (h * h + h) / tp
|
| 43 |
attn = layer_norm_attn_in + qkv + attn_output_proj
|
| 44 |
|
| 45 |
# MLP
|
| 46 |
+
layer_norm_mlp_in = h # not tp sharded
|
| 47 |
mlp_up_proj = (h * i + i) / tp
|
| 48 |
mlp_gate_proj = (h * i + i) / tp
|
| 49 |
mlp_down_proj = (i * h + h) / tp
|
|
|
|
| 76 |
unembedding = 0
|
| 77 |
if not self.model.weight_tied_embeddings:
|
| 78 |
unembedding = h * v / tp
|
| 79 |
+
final_layer_norm = h # not tp sharded
|
| 80 |
# hush linter
|
| 81 |
total_params = 0
|
| 82 |
if pp == 1:
|
details.py
CHANGED
|
@@ -38,4 +38,10 @@ LIMITATIONS = """
|
|
| 38 |
- Kernel/framework overhead and intermediate memory
|
| 39 |
|
| 40 |
For advanced configurations, results should be validated against profiling.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
"""
|
|
|
|
| 38 |
- Kernel/framework overhead and intermediate memory
|
| 39 |
|
| 40 |
For advanced configurations, results should be validated against profiling.
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
ACCURACY = """
|
| 45 |
+
I validated this calculator against the projected memory usage in The Ultra-Scale Playbook w/in 10%. Some overage is expected since the calculator makes pessimistic assumptions and looks for peak memory. Note that you could still OOM from intermediates!
|
| 46 |
+
Welcome any detailed memory usage reports along with configurations and framework details to tune this further!
|
| 47 |
"""
|