Spaces:

rubenaghayan
/

llm_memory_visualizer

Sleeping

rubenaghayan committed on Oct 12, 2025

Commit

c3f56bc

1 Parent(s): 8e1d528

details clarifications

Files changed (3) hide show

app.py CHANGED Viewed

@@ -5,12 +5,10 @@ import gradio as gr
 import pandas as pd
 from functools import partial
 from defaults import DEFAULTS
-from details import DETAILS, INSTRUCTIONS
 from state import Model, Parallelism, Training
 from calculator import MemoryCalculation
 from dtypes import DType
-from gradio.themes import Base
-from limitations import LIMITATIONS
 # Create a Number component for natural numbers (positive integers)
 NaturalNumber = partial(gr.Number, minimum=1, step=1, precision=0, interactive=True)

 import pandas as pd
 from functools import partial
 from defaults import DEFAULTS
+from details import DETAILS, INSTRUCTIONS, LIMITATIONS
 from state import Model, Parallelism, Training
 from calculator import MemoryCalculation
 from dtypes import DType
 # Create a Number component for natural numbers (positive integers)
 NaturalNumber = partial(gr.Number, minimum=1, step=1, precision=0, interactive=True)

details.py CHANGED Viewed

@@ -16,8 +16,26 @@ Helpful resources used while building this:
 """
 INSTRUCTIONS = """
             ## How to Use
             1. Use Presets OR Adjust the parallelism, model, and training panels to match your run.
             2. Press **Calculate** to refresh the memory breakdown chart.
             3. Review the details and references below for context on the estimates.
-            """

 """
 INSTRUCTIONS = """
+            This calculator estimates the memory used per GPU during training (excluding intermediate memory).
             ## How to Use
             1. Use Presets OR Adjust the parallelism, model, and training panels to match your run.
             2. Press **Calculate** to refresh the memory breakdown chart.
             3. Review the details and references below for context on the estimates.
+            """
+LIMITATIONS = """
+### Key Assumptions:
+- Standard transformer architecture with homogeneous layers
+- Adam optimizer
+- Mixed precision keeps master weights copy
+- Tensor parallelism includes sequence parallelism
+- Pipeline parallelism maintains consistent activation memory due to the schedule
+### Not Currently Supported:
+- Non-standard architectures (alternating dense/sparse layers, custom attention)
+- Multi-modal models with vision layers
+- Non-homogeneous parameter dtypes (e.g. BF16 & MXFP4 in GPT-OSS). Mixed Precision is supported.
+- Kernel/framework overhead and intermediate memory
+
+For advanced configurations, results should be validated against profiling.
+"""

limitations.py DELETED Viewed

@@ -1,15 +0,0 @@
-LIMITATIONS = """
-### Key Assumptions:
-- Standard transformer architecture with homogeneous layers
-- Adam optimizer with mixed precision training (master weights copy)
-- Tensor parallelism includes sequence parallelism
-- Pipeline parallelism maintains consistent activation memory
-### Not Currently Supported:
-- Non-standard architectures (alternating dense/sparse layers, custom attention)
-- Multi-modal models with vision layers
-- Mixed dtype training (e.g., MXFP4)
-- Kernel/framework overhead and intermediate memory
-For advanced configurations, results should be validated against profiling.
-"""