Spaces:

rubenaghayan
/

llm_memory_visualizer

Sleeping

App Files Files Community

rubenaghayan committed on Oct 13, 2025

Commit

64abcca

1 Parent(s): 15f0f2b

better defaults and validation section

Browse files

Files changed (3) hide show

app.py +18 -18
calculator.py +3 -4
details.py +6 -0

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import gradio as gr
 import pandas as pd
 from functools import partial
 from defaults import DEFAULTS
-from details import DETAILS, INSTRUCTIONS, LIMITATIONS
 from state import Model, Parallelism, Training
 from calculator import MemoryCalculation
 from dtypes import DType
@@ -22,14 +22,12 @@ def create_parallelism_block():
             cp = NaturalNumber(label="Context Parallelism", value=1)
             ep = NaturalNumber(label="Expert Parallelism", value=1)
-            fsdp_enabled = gr.Checkbox(label="FSDP (Fully Sharded Data Parallel)", value=False)
-            fsdp_parallelism = NaturalNumber(label="FSDP Parallelism", value=1, interactive=False, elem_classes="disabled-field")
             fsdp_strategy = gr.Radio(
                 choices=["Zero-1", "Zero-2", "Zero-3"],
                 label="FSDP Strategy",
-                value="Zero-1",
-                interactive=False,
-                elem_classes="disabled-field"
             )
             # Toggle FSDP fields interactivity based on FSDP checkbox
@@ -48,13 +46,13 @@ def create_parallelism_block():
 def create_model_block():
     with gr.Column():
         gr.Markdown("# Model Architecture")
-        layers = NaturalNumber(label="Number of Layers", value=48)
-        vocab = NaturalNumber(label="Vocab Size", value=262144)
-        hidden = NaturalNumber(label="Hidden Dim", value=3840)
-        intermediate = NaturalNumber(label="Intermediate Dim", value=15360)
         is_moe = gr.Checkbox(label="Mixture of Experts (MoE)", value=False)
-        active_experts = NaturalNumber(label="Active Experts", value=2, interactive=False, elem_classes="disabled-field")
-        total_experts = NaturalNumber(label="Total Experts", value=8, interactive=False, elem_classes="disabled-field")
         weight_tied_embeddings = gr.Checkbox(label="Weight Tied Embeddings", value=True)
         # Toggle expert fields interactivity based on MoE checkbox
@@ -67,7 +65,7 @@ def create_model_block():
             outputs=[active_experts, total_experts]
         )
-        presets = gr.Dropdown(["Custom"] + list(DEFAULTS.keys()), label="Presets", value="Gemma3 12B", interactive=True)
         # Populate model parameters when preset is selected
         def populate_from_preset(preset_name):
@@ -123,12 +121,12 @@ def create_model_block():
 def create_training_block():
     with gr.Column():
         gr.Markdown("# Training Config")
-        seq_len = NaturalNumber(label="Sequence Length", value=8192)
-        batch_size = NaturalNumber(label="Batch Size", info="If you are using gradient accumulation, enter microbatch size", value=8)
         with gr.Row():
-            gradient_checkpointing = gr.Checkbox(label="Gradient Checkpointing", value=False)
             grad_accumulation = gr.Checkbox(label="Gradient Accumulation", value=False)
-        precision = gr.Dropdown(DType.values(), label="Precision", value=DType.FP32.value, interactive=True)
         mixed_precision = gr.Checkbox(label="Mixed Precision", value=False)
         param_dtype = gr.Dropdown(DType.values(), label="Parameter Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
         reduce_dtype = gr.Dropdown(DType.values(), label="Reduce Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
@@ -313,5 +311,7 @@ with gr.Blocks(theme='Default', css=css) as demo:
         with gr.Row():
             gr.Markdown(LIMITATIONS)
             gr.Markdown(DETAILS)
-demo.launch()

 import pandas as pd
 from functools import partial
 from defaults import DEFAULTS
+from details import ACCURACY, DETAILS, INSTRUCTIONS, LIMITATIONS
 from state import Model, Parallelism, Training
 from calculator import MemoryCalculation
 from dtypes import DType
             cp = NaturalNumber(label="Context Parallelism", value=1)
             ep = NaturalNumber(label="Expert Parallelism", value=1)
+            fsdp_enabled = gr.Checkbox(label="FSDP (Fully Sharded Data Parallel)", value=True)
+            fsdp_parallelism = NaturalNumber(label="FSDP Parallelism", value=8)
             fsdp_strategy = gr.Radio(
                 choices=["Zero-1", "Zero-2", "Zero-3"],
                 label="FSDP Strategy",
+                value="Zero-3"
             )
             # Toggle FSDP fields interactivity based on FSDP checkbox
 def create_model_block():
     with gr.Column():
         gr.Markdown("# Model Architecture")
+        layers = NaturalNumber(label="Number of Layers", value=32)
+        vocab = NaturalNumber(label="Vocab Size", value=128256)
+        hidden = NaturalNumber(label="Hidden Dim", value=4096)
+        intermediate = NaturalNumber(label="Intermediate Dim", value=14336)
         is_moe = gr.Checkbox(label="Mixture of Experts (MoE)", value=False)
+        active_experts = NaturalNumber(label="Active Experts", value=1, interactive=False, elem_classes="disabled-field")
+        total_experts = NaturalNumber(label="Total Experts", value=1, interactive=False, elem_classes="disabled-field")
         weight_tied_embeddings = gr.Checkbox(label="Weight Tied Embeddings", value=True)
         # Toggle expert fields interactivity based on MoE checkbox
             outputs=[active_experts, total_experts]
         )
+        presets = gr.Dropdown(["Custom"] + list(DEFAULTS.keys()), label="Presets", value="Llama3 8B", interactive=True)
         # Populate model parameters when preset is selected
         def populate_from_preset(preset_name):
 def create_training_block():
     with gr.Column():
         gr.Markdown("# Training Config")
+        seq_len = NaturalNumber(label="Sequence Length", value=4096)
+        batch_size = NaturalNumber(label="Batch Size", info="If you are using gradient accumulation, enter microbatch size", value=1)
         with gr.Row():
+            gradient_checkpointing = gr.Checkbox(label="Gradient Checkpointing", value=True)
             grad_accumulation = gr.Checkbox(label="Gradient Accumulation", value=False)
+        precision = gr.Dropdown(DType.values(), label="Precision", value=DType.BF16.value, interactive=True)
         mixed_precision = gr.Checkbox(label="Mixed Precision", value=False)
         param_dtype = gr.Dropdown(DType.values(), label="Parameter Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
         reduce_dtype = gr.Dropdown(DType.values(), label="Reduce Dtype", value=DType.FP32.value, interactive=False, elem_classes="disabled-field")
         with gr.Row():
             gr.Markdown(LIMITATIONS)
             gr.Markdown(DETAILS)
+        gr.Markdown("# Validation")
+        gr.Markdown(ACCURACY)
+demo.launch(share=True)

calculator.py CHANGED Viewed

@@ -37,14 +37,13 @@ class MemoryCalculation:
         )
         # Attention
-        # weights and biases = *2
-        layer_norm_attn_in = 2 * h  # not tp sharded
         qkv = 3 * h * h / tp
         attn_output_proj = (h * h + h) / tp
         attn = layer_norm_attn_in + qkv + attn_output_proj
         # MLP
-        layer_norm_mlp_in = 2 * h  # not tp sharded
         mlp_up_proj = (h * i + i) / tp
         mlp_gate_proj = (h * i + i) / tp
         mlp_down_proj = (i * h + h) / tp
@@ -77,7 +76,7 @@ class MemoryCalculation:
         unembedding = 0
         if not self.model.weight_tied_embeddings:
             unembedding = h * v / tp
-        final_layer_norm = 2 * h  # not tp sharded
         # hush linter
         total_params = 0
         if pp == 1:

         )
         # Attention
+        layer_norm_attn_in = h  # not tp sharded
         qkv = 3 * h * h / tp
         attn_output_proj = (h * h + h) / tp
         attn = layer_norm_attn_in + qkv + attn_output_proj
         # MLP
+        layer_norm_mlp_in = h  # not tp sharded
         mlp_up_proj = (h * i + i) / tp
         mlp_gate_proj = (h * i + i) / tp
         mlp_down_proj = (i * h + h) / tp
         unembedding = 0
         if not self.model.weight_tied_embeddings:
             unembedding = h * v / tp
+        final_layer_norm = h  # not tp sharded
         # hush linter
         total_params = 0
         if pp == 1:

details.py CHANGED Viewed

@@ -38,4 +38,10 @@ LIMITATIONS = """
 - Kernel/framework overhead and intermediate memory
 For advanced configurations, results should be validated against profiling.
 """

 - Kernel/framework overhead and intermediate memory
 For advanced configurations, results should be validated against profiling.
+"""
+ACCURACY = """
+I validated this calculator against the projected memory usage in The Ultra-Scale Playbook w/in 10%. Some overage is expected since the calculator makes pessimistic assumptions and looks for peak memory. Note that you could still OOM from intermediates!
+Welcome any detailed memory usage reports along with configurations and framework details to tune this further!
 """