rubenaghayan committed
Commit c3f56bc · 1 Parent(s): 8e1d528

details clarifications

Files changed (3)
  1. app.py +1 -3
  2. details.py +19 -1
  3. limitations.py +0 -15
app.py CHANGED
@@ -5,12 +5,10 @@ import gradio as gr
 import pandas as pd
 from functools import partial
 from defaults import DEFAULTS
-from details import DETAILS, INSTRUCTIONS
+from details import DETAILS, INSTRUCTIONS, LIMITATIONS
 from state import Model, Parallelism, Training
 from calculator import MemoryCalculation
 from dtypes import DType
-from gradio.themes import Base
-from limitations import LIMITATIONS
 
 # Create a Number component for natural numbers (positive integers)
 NaturalNumber = partial(gr.Number, minimum=1, step=1, precision=0, interactive=True)
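This change only consolidates the imports; how app.py actually lays out the DETAILS, INSTRUCTIONS, and LIMITATIONS strings is not part of the diff. A minimal sketch of one way the strings could be rendered, assuming a plain gr.Blocks layout (the component placement below is illustrative, not the real app):

```python
import gradio as gr

from details import DETAILS, INSTRUCTIONS, LIMITATIONS

# Illustrative layout only: the real app.py arrangement is not shown in this diff.
with gr.Blocks() as demo:
    gr.Markdown(INSTRUCTIONS)   # usage steps above the input panels
    # ... parallelism / model / training inputs and the Calculate button go here ...
    gr.Markdown(DETAILS)        # background on how the estimates are derived
    gr.Markdown(LIMITATIONS)    # key assumptions and unsupported cases

if __name__ == "__main__":
    demo.launch()
```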
details.py CHANGED
@@ -16,8 +16,26 @@ Helpful resources used while building this:
 """
 
 INSTRUCTIONS = """
+This calculator estimates the memory used per GPU during training (excluding intermediates).
 ## How to Use
 1. Use Presets OR adjust the parallelism, model, and training panels to match your run.
 2. Press **Calculate** to refresh the memory breakdown chart.
 3. Review the details and references below for context on the estimates.
-"""
+"""
+
+LIMITATIONS = """
+### Key Assumptions:
+- Standard transformer architecture with homogeneous layers
+- Adam optimizer
+- Mixed precision keeps a master weights copy
+- Tensor parallelism includes sequence parallelism
+- Pipeline parallelism maintains consistent activation memory due to the schedule
+
+### Not Currently Supported:
+- Non-standard architectures (alternating dense/sparse layers, custom attention)
+- Multi-modal models with vision layers
+- Non-homogeneous parameter dtypes (e.g. BF16 & MXFP4 in GPT-OSS); mixed precision itself is supported
+- Kernel/framework overhead and intermediate memory
+
+For advanced configurations, results should be validated against profiling.
+"""
limitations.py DELETED
@@ -1,15 +0,0 @@
-LIMITATIONS = """
-### Key Assumptions:
-- Standard transformer architecture with homogeneous layers
-- Adam optimizer with mixed precision training (master weights copy)
-- Tensor parallelism includes sequence parallelism
-- Pipeline parallelism maintains consistent activation memory
-
-### Not Currently Supported:
-- Non-standard architectures (alternating dense/sparse layers, custom attention)
-- Multi-modal models with vision layers
-- Mixed dtype training (e.g., MXFP4)
-- Kernel/framework overhead and intermediate memory
-
-For advanced configurations, results should be validated against profiling.
-"""