Spaces:

xxyux
/

GPU_memory_calculator_LLMTraining

Runtime error

App Files Files Community

xxyux committed on Jun 12, 2024

Commit

deacdbd

verified ·

1 Parent(s): 9baaa6b

Update app.py

Browse files

Files changed (1) hide show

app.py +221 -144

app.py CHANGED Viewed

@@ -1,11 +1,22 @@
 import gradio as gr
 import pandas as pd
-col=['Layer number', 'Hidden size', 'FFN Hidden size', 'Sequence length', 'Head number', 'Group number',
-        'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8', 'Model parameters', 'Model_states', 'Activation', 'Total']
-# # global data
-# table_data = pd.DataFrame(columns=col)
 def Get_GigaByte(memory):
     return memory / 1024**3
@@ -46,7 +57,7 @@ def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func,  tp):
     num_parameters_mlp = 2 * hidden_size
     # mlp1 weight: h*ffn/tp, bias: ffn/tp
     # mlp2 weight: ffn*h/tp, bias: h
-    if act_func == "True":
         num_parameters_mlp += hidden_size * ffn_size * 3 / tp
         if is_bias == "True":
             num_parameters_mlp += ffn_size * 2 / tp + hidden_size
@@ -178,7 +189,7 @@ def compute_activation_memory_mlp(activation_dtype, seq_length, b, hidden_size,
     if is_sp == "False":
         activation_mem_mlp_fc1 *= tp
     # Act 8bsh
-    if act_func == "Swiglu":
         activation_mem_mlp_act = seq_length * b * ffn_size * 2 * 2
     else:
         activation_mem_mlp_act = seq_length * b * ffn_size * 2
@@ -207,21 +218,21 @@ def compute_activation_memory_output(seq_length, b, hidden_size, vocab_size):
     # Inputs to output layer and CE loss(bf16, fp32 * 2).
     return 2 * seq_length * b * hidden_size + (2 + 4 + 4) * seq_length * b * vocab_size
-def compute_activation_memory_pp(activation_memory, is_ip, vp, pp, num_microbatches):
     # Multiply by interleaved PP memory factor.
-    if is_ip == "True":
         interleaved_schedule_memory_penalty = 1 + (pp - 1) / (pp * vp)
         activation_memory *= interleaved_schedule_memory_penalty
     # If using non-interleaved schedule, number of microbatches in pipeline can be less than pp_size,
     # so discount accordingly.
-    if is_ip == "False" and pp > 1:
         if num_microbatches > 1:
             activation_memory *= min(1, num_microbatches / pp)
     return activation_memory
-def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, is_ip, vp):
     # Using formula in Table 2 of https://arxiv.org/pdf/2205.05198.pdf.
     # We are trying to compute the maximum activation footprint, so all calculations in this function
     # are for the first pipeline stage.
@@ -252,7 +263,7 @@ def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, he
     # get num_microbatches
     num_microbatches = b_global / b / dp / cp
-    activation_memory = compute_activation_memory_pp(activation_memory, is_ip, vp, pp, num_microbatches)
     if pp == 1:
         # Inputs to output layer and CE loss(fp32).
@@ -267,13 +278,22 @@ def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, he
 # compute_btn.click.function
 def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, act_func,
-        dp, tp, pp, cp, is_sp, is_ip, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
     # get model states
     numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(vocab_size, layer_num, hidden_size,
         ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
     # get activation memory
-    activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, is_ip, vp)
     # get model parameters
     numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
@@ -289,7 +309,7 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
     # record
     new_row = pd.DataFrame([[layer_num, hidden_size, ffn_size, seq_length, head_num, group_query_num, dp, tp, pp, cp, gpu_num, b, is_fp8,
-                            numParametersTotal, model_states_memory, activation_memory, Total]],
                             columns=col)
     if count == 1:
         record_df = new_row
@@ -300,8 +320,8 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
     # return str(gpu_num), str(model_states) + " GB", str(activation) + " GB", str(total) + " GB", table_data
     return f"""
                 GPU numbers = {str(gpu_num)}, \n
-                Total model parameters = {str(numParametersTotal)} B, \n
-                Model parameters = {str(numParameters)} B, \n
                 Model_states = {str(model_states_memory)} GB, \n
                 Activation = {str(activation_memory)} GB, \n
                 Total memory consumption = {str(Total)} GB \n
@@ -317,71 +337,36 @@ def generate_csv(record_df):
 # formula string
 formula = r"""
-        > **Note**🔑: In this formula, we assume LLM training with FP32 Gradient and Optimizer state, and bias = False, Zero1 = False, SP = True.
-        <!-- parameters: -->
         $$
-        P_{input} = \frac{HV}{tp}, \quad
-        P_{output} = 2H \\\\
-        P_{attn} = 2H + \frac{2H^2 + 2H_{KV} \times H}{tp}, \quad
-        P_{MLP} = 2H +
-        \\begin{cases}
-        \frac{3H \times FFN}{tp},  & \text{if }GLU\text{ is True} \\\\
-        \frac{2H \times FFN}{tp}, & \text{if }GLU\text{ is False}
-        \\end{cases} \\\\
-        P_{middle} = \frac{(P_{attn} + P_{MLP}) \times L}{pp} \\\\
-        P = P_{input} + P_{middle} +
-        \\begin{cases}
-        P_{output},  & \text{if }pp = 1 \\\\
-        0, & \text{if }pp > 1
-        \\end{cases} \\\\
         {Total\ Model\ parameters} =
-        \\begin{cases}
-        P,  & \text{set tp = 1, pp = 1} \\\\
-        2HV + 2H + (4H + 2H^2 + 2H_{KV} \times H + 3FFN \times H) \times L, & \text{general formula}
-        \\end{cases} \\\\
-        {Model\ states} = {Model\ weight} + {Gradient} + {Optimizer\ state} + {Master\ weight} =
-        \\begin{cases}
-        18P,  & \text{BF16 training} \\\\
-        18P, & \text{FP8 training with FP8 Init} \\\\
-        20P, & \text{FP8 training w/o FP8 Init}
-        \\end{cases} \\\\
         $$
         ***
-        <!-- activations: -->
         $$
-        A_{input} = (8SB + SBH) \times pp, \quad
-        A_{output} = 2SBH +
-        \\begin{cases}
-        10SBV,  & \text{if }pp\text{ = 1} \\\\
-        0, & \text{if }pp\text{ > 1}
-        \\end{cases} \\\\
-        A_{attn} = 5SBH + 4SB \times H_{KV} +
-        \\begin{cases}
-        2SBH, & \text{if } FP8  \text{ is True} \\\\
-        4SBH, & \text{if } FP8  \text{ is False}
-        \\end{cases} \\\\
-        A_{MLP} = 3SBH +
-        \\begin{cases}
-        SBH + SB \times FFN + 4SB \times FFN, & \text{if }FP8 \text{ is True and }GLU \text{ is True} \\\\
-        2SBH + 2SB \times FFN + 4SB \times FFN, & \text{if }FP8 \text{ is False and }GLU \text{ is True} \\\\
-        SBH + SB \times FFN + 2SB \times FFN, & \text{if }FP8 \text{ is True and }GLU \text{ is False} \\\\
-        2SBH + 2SB \times FFN + 2SB \times FFN, & \text{if }FP8 \text{ is False and }GLU \text{ is False}
-        \\end{cases} \\\\
-        A_{middle} = (A_{attn} + A_{MLP}) \times L \\\\
-        A_{ip} = (A_{input} + A_{middle}) \times
-        \\begin{cases}
-        (1 + \frac{pp - 1}{pp \times vp}), & \text{if } Interleaved\ Pipeline  \text{ is True} \\\\
-        min(1, \frac{microbatch}{pp}), & \text{if } Interleaved\ Pipeline \text{ is False and pp > 1} \\\\
-        1, & \text{other}
-        \\end{cases} \\\\
-        Activation =
-        \\begin{cases}
-        \frac{A_{ip} + A_{output}}{tp \times cp}, & \text{if pp = 1} \\\\
-        \frac{A_{ip} + 2BSH}{tp \times cp}, & \text{if pp > 1}
-        \\end{cases}
         $$
         ***
@@ -394,6 +379,76 @@ formula = r"""
         $$
         """
 with gr.Blocks() as demo:
     with gr.Row():
         # Text
@@ -406,64 +461,92 @@ with gr.Blocks() as demo:
             """
         )
-    with gr.Column():
-        # Input 1.[Model Parameters]
-        gr.Markdown(
-            """
-            <h1>Model Parameters:</h1>
-            """
-        )
-        with gr.Accordion("Model Parameters"):
-            act_func = gr.Radio(["True", "False"], value="True", label="Model type", info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
-            vocab_size = gr.Number(label="Vocab size", value=32000)
-            layer_num = gr.Number(label="Layer number", value=32)
-            hidden_size = gr.Number(label="Hidden size", value=4096)
-            ffn_size = gr.Number(label="FFN Hidden size", value=11008)
-            sequence_len = gr.Number(label="Sequence length", value=1024)
-            head_num = gr.Number(label="Number of Attention Heads", value=32)
-            with gr.Row():
-                is_group_query = gr.Radio(["True", "False"], value="True", label="Use Group Query Attention")
-                group_query_num = gr.Number(label="Number of Query Groups", value=96)
-            is_bias = gr.Radio(["True", "False"], value="False", label="Use Bias")
-        # Input 2.[Parallelism]
-        gr.Markdown(
-            """
-            <h1>Parallelism config:</h1>
-            """
-        )
-        with gr.Accordion("Parallelism config"):
-            dp = gr.Number(label="Data parallelism", value=1)
-            tp = gr.Number(label="Tensor parallelism", value=2)
-            pp = gr.Number(label="Pipeline parallelism", value=2)
-            cp = gr.Number(label="Context parallelism", value=2)
-            is_sp = gr.Radio(["True", "False"], value="True", label="Sequence parallelism")
-            with gr.Row():
-                is_ip = gr.Radio(["True", "False"], value="False", label="Use Interleaved Pipeline")
                 vp = gr.Number(label="Virtual Pipeline Size")
-            is_dist_opt = gr.Radio(["True", "False"], value="True", label="Use Distributed Optimizer(Zero1)")
-        # Input 3.[Training Settings]
-        gr.Markdown(
-            """
-            <h1>Training Config:</h1>
-            """
-        )
-        with gr.Accordion("Training Config"):
-            b = gr.Number(label="Micro Batch size", value=4)
-            b_global = gr.Number(label="Global Batch size", value=64)
-            gr.Checkbox(label="True", value=True, info="BF16 Training")
-            is_fp8 = gr.Radio(["True", "False"], value="True", label="FP8 Training")
-            is_fp8_init = gr.Radio(["True", "False"], value="True", label="FP8 Initialization(will reduce memory)")
-            g_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Gradients Dtype")
-            o_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Optimizer State Dtype")
-    with gr.Column():
-        gr.Markdown(
-            """
-            <h1>Output Data:</h1>
-            """
-        )
         formula = formula
         gr.Markdown(
@@ -471,25 +554,19 @@ with gr.Blocks() as demo:
             , latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
         )
-        output_text = gr.Textbox(
-            label="Compute result",
-            interactive=False,
-        )
-    # Button
-    with gr.Row():
-        compute_btn = gr.Button("Compute")
-        download_btn = gr.Button("Download")
     record_df = gr.Dataframe(
         label="Record Table",
-        headers=col
     )
     count = gr.Number(label="Row count", value=1, visible=False)
     compute_btn.click(
         fn=Compute_ALL_Model_memory,
         inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, act_func,
-                dp, tp, pp, cp, is_sp, is_ip, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count],
         outputs=[output_text, record_df, count]
     )
@@ -503,4 +580,4 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 import pandas as pd
+# col=['Layer number', 'Hidden size', 'FFN Hidden size', 'Sequence length', 'Head number', 'Group number',
+#         'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8', 'Model parameters', 'Model_states', 'Activation', 'Total']
+col=['L', 'H', 'FFN', 'S', 'A', 'G',  # abbreviated headers; used as DataFrame columns and as the record table headers
+        'dp', 'tp', 'pp', 'cp', 'GPU number', 'Batch size', 'FP8', 'Model parameters', 'Model states', 'Activation', 'Total']  # the `abbr` Markdown below spells out the short names
+abbr = """
+    <div align="center">
+    > **Abbreviations of symbols:**
+    |Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|
+    |---|---|---|---|---|---|---|---|---|---|---|---|
+    |L|Layer number|H|Hidden size|FFN|FFN Hidden size|S|Sequence length|A|Head number|G|Group number|
+    </div>
+    """
 def Get_GigaByte(memory):
     """Return ``memory`` divided by 1024**3 (presumably bytes -> GiB; confirm the unit with callers)."""
     return memory / 1024**3
     num_parameters_mlp = 2 * hidden_size
     # mlp1 weight: h*ffn/tp, bias: ffn/tp
     # mlp2 weight: ffn*h/tp, bias: h
+    if act_func == "LLaMA":
         num_parameters_mlp += hidden_size * ffn_size * 3 / tp
         if is_bias == "True":
             num_parameters_mlp += ffn_size * 2 / tp + hidden_size
     if is_sp == "False":
         activation_mem_mlp_fc1 *= tp
     # Act 8bsh
+    if act_func == "LLaMA":
         activation_mem_mlp_act = seq_length * b * ffn_size * 2 * 2
     else:
         activation_mem_mlp_act = seq_length * b * ffn_size * 2
     # Inputs to output layer and CE loss(bf16, fp32 * 2).
     return 2 * seq_length * b * hidden_size + (2 + 4 + 4) * seq_length * b * vocab_size
def compute_activation_memory_pp(activation_memory, vp, pp, num_microbatches):
    """Scale an activation-memory figure for the pipeline-parallel schedule.

    Args:
        activation_memory: Activation memory before the schedule adjustment.
        vp: Virtual pipeline size. A value > 0 selects the interleaved
            schedule; 0 or None selects the non-interleaved schedule.
        pp: Pipeline-parallel size.
        num_microbatches: Number of microbatches per pipeline.

    Returns:
        The adjusted activation memory.
    """
    # The UI creates the "Virtual Pipeline Size" field without a default, so
    # the value may be None when left empty; `None > 0` would raise TypeError.
    if vp is None:
        vp = 0

    if vp > 0:
        # Multiply by interleaved PP memory factor.
        interleaved_schedule_memory_penalty = 1 + (pp - 1) / (pp * vp)
        return activation_memory * interleaved_schedule_memory_penalty

    # If using non-interleaved schedule, number of microbatches in pipeline can
    # be less than pp_size, so discount accordingly.
    if vp == 0 and pp > 1 and num_microbatches > 1:
        return activation_memory * min(1, num_microbatches / pp)

    return activation_memory
+def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp):
     # Using formula in Table 2 of https://arxiv.org/pdf/2205.05198.pdf.
     # We are trying to compute the maximum activation footprint, so all calculations in this function
     # are for the first pipeline stage.
     # get num_microbatches
     num_microbatches = b_global / b / dp / cp
+    activation_memory = compute_activation_memory_pp(activation_memory, vp, pp, num_microbatches)
     if pp == 1:
         # Inputs to output layer and CE loss(fp32).
 # compute_btn.click.function
 def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, act_func,
+        dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
+    # data type trans
+    if is_group_query == "True":
+        group_query_num = int(group_query_num)
+    # check input
+    [result, Error_message] = check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global)
+    if result == False:
+        return Error_message, record_df, count
     # get model states
     numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(vocab_size, layer_num, hidden_size,
         ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
     # get activation memory
+    activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
     # get model parameters
     numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
     # record
     new_row = pd.DataFrame([[layer_num, hidden_size, ffn_size, seq_length, head_num, group_query_num, dp, tp, pp, cp, gpu_num, b, is_fp8,
+                            numParameters, model_states_memory, activation_memory, Total]],
                             columns=col)
     if count == 1:
         record_df = new_row
     # return str(gpu_num), str(model_states) + " GB", str(activation) + " GB", str(total) + " GB", table_data
     return f"""
                 GPU numbers = {str(gpu_num)}, \n
+                Model parameters = {str(numParametersTotal)} B, \n
+                Model parameters on each device = {str(numParameters)} B, \n
                 Model_states = {str(model_states_memory)} GB, \n
                 Activation = {str(activation_memory)} GB, \n
                 Total memory consumption = {str(Total)} GB \n
 # formula string
 formula = r"""
+        > **Note**🔑: In this formula, we assume LLM training with FP8 training.
+        > 1. Interleaved pipeline.
+        > 2. bias = False.
+        > 3. SP = True.
+        <div align="center">
+        <img src=file/T1.jpg width=50%/>
+        </div>
         $$
         {Total\ Model\ parameters} =
+        HV + HS + (4H^2 + 3H \times FFN + 2H) \times L + 2H + HV
         $$
         ***
+        <div align="center">
+        <img src=file/ms.png width=40%/>
+        </div>
+        $$
+        {Model\ states} =
+        (6 + \frac{12}{dp}) \times
+        (\frac{(\frac{4h^2 + 3H \times FFN}{tp} + 2H) \times L}{pp} + \frac{HV}{tp} + HS)
+        $$
         $$
+        {Activation} =
+        (1 + \frac{pp-1}{pp \times vp}) \times
+        \frac{(8BS + BSH) \times pp + 15BSH + 5BS \times FFN}{tp \times cp}
         $$
         ***
         $$
         """
def check_tp(tp, head_num):
    """Check that the attention heads split evenly across tensor-parallel ranks.

    Args:
        tp: Tensor-parallel size.
        head_num: Number of attention heads.

    Returns:
        True if ``tp`` is a positive number dividing ``head_num`` exactly,
        False otherwise (including a missing, zero or negative ``tp``).
    """
    # A missing or non-positive degree can never be a valid divisor; report it
    # as invalid instead of letting `%` raise ZeroDivisionError/TypeError.
    if tp is None or head_num is None or tp <= 0:
        return False
    return head_num % tp == 0
def check_pp(pp, layer_num):
    """Check that the layers split evenly across pipeline-parallel stages.

    Args:
        pp: Pipeline-parallel size.
        layer_num: Number of transformer layers.

    Returns:
        True if ``pp`` is a positive number dividing ``layer_num`` exactly,
        False otherwise (including a missing, zero or negative ``pp``).
    """
    # Guard the divisor so bad UI input yields False rather than an exception.
    if pp is None or layer_num is None or pp <= 0:
        return False
    return layer_num % pp == 0
def check_cp(cp, seq_length):
    """Check that the sequence splits evenly across context-parallel ranks.

    Args:
        cp: Context-parallel size.
        seq_length: Sequence length.

    Returns:
        True if ``cp`` is a positive number dividing ``seq_length`` exactly,
        False otherwise (including a missing, zero or negative ``cp``).
    """
    # Guard the divisor so bad UI input yields False rather than an exception.
    if cp is None or seq_length is None or cp <= 0:
        return False
    return seq_length % cp == 0
def check_hidden(hidden_size, head_num):
    """Check that the hidden size is an exact multiple of the head count.

    Args:
        hidden_size: Model hidden size.
        head_num: Number of attention heads.

    Returns:
        True if ``head_num`` is a positive number dividing ``hidden_size``
        exactly, False otherwise (including a missing, zero or negative
        ``head_num``).
    """
    # Guard the divisor so bad UI input yields False rather than an exception.
    if head_num is None or hidden_size is None or head_num <= 0:
        return False
    return hidden_size % head_num == 0
def check_b_global(b_global, b, dp, cp):
    """Check that the global batch is a whole number of ``b * dp * cp`` chunks.

    Args:
        b_global: Global batch size.
        b: Micro batch size.
        dp: Data-parallel size.
        cp: Context-parallel size.

    Returns:
        True if ``b``, ``dp`` and ``cp`` are all positive and their product
        divides ``b_global`` exactly; False otherwise.
    """
    if any(value is None for value in (b_global, b, dp, cp)):
        return False
    # Any zero/negative factor would make the divisor meaningless (and a zero
    # product would raise ZeroDivisionError).
    if b <= 0 or dp <= 0 or cp <= 0:
        return False
    return b_global % (b * dp * cp) == 0
def check_num_microbatch(layer_num, vp, pp, num_microbatches):
    """Validate the pipeline schedule against layer and microbatch counts.

    With an interleaved schedule (``vp > 0``) the layer count must be divisible
    by ``pp * vp``. With a non-interleaved schedule (``vp == 0``) and ``pp > 1``,
    more than one microbatch must come in a multiple of ``pp``.

    Args:
        layer_num: Number of transformer layers.
        vp: Virtual pipeline size; None is treated as 0 (non-interleaved).
        pp: Pipeline-parallel size.
        num_microbatches: Number of microbatches per pipeline.

    Returns:
        True if the configuration passes the checks above, False otherwise.
    """
    # The virtual-pipeline field has no default in the UI, so it may be None;
    # comparing None with an int would raise TypeError.
    if vp is None:
        vp = 0
    if pp is None:
        return False

    if vp > 0:
        chunks = pp * vp
        # A non-positive product cannot divide the layers (0 would raise).
        if layer_num is None or chunks <= 0:
            return False
        return layer_num % chunks == 0

    if vp == 0 and pp > 1 and num_microbatches > 1:
        return num_microbatches % pp == 0
    return True
def check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global):
    """Validate the user-supplied model/parallelism configuration.

    Args:
        dp, tp, pp, cp: Data/tensor/pipeline/context parallel sizes.
        hidden_size: Model hidden size.
        head_num: Number of attention heads.
        layer_num: Number of transformer layers.
        seq_length: Sequence length.
        vp: Virtual pipeline size; None is treated as 0.
        b: Micro batch size.
        b_global: Global batch size.

    Returns:
        A ``(result, Error_message)`` tuple: ``result`` is True when every
        check passes, and ``Error_message`` concatenates one line per failed
        check (empty string on success).
    """
    # Every value here is later used as a divisor or dividend. Reject missing
    # or non-positive values up front so the divisibility checks and the
    # microbatch computation below cannot raise ZeroDivisionError/TypeError.
    required = {
        "dp": dp, "tp": tp, "pp": pp, "cp": cp,
        "hidden_size": hidden_size, "head_num": head_num,
        "layer_num": layer_num, "seq_length": seq_length,
        "batch_size": b, "b_global": b_global,
    }
    invalid = [name for name, value in required.items() if value is None or value <= 0]
    if invalid:
        return False, "Error message: Please set " + ", ".join(invalid) + " to values greater than 0. \n"

    # An empty virtual-pipeline field means "no interleaving".
    if vp is None:
        vp = 0

    num_microbatches = b_global / b / dp / cp
    checks = [
        (check_tp(tp, head_num),
         "Error message: Please reset Tensor parallelism or head_num, make head_num % tp = 0. \n"),
        (check_pp(pp, layer_num),
         "Error message: Please reset Pipeline parallelism or layer_num, make layer_num % pp = 0. \n"),
        (check_cp(cp, seq_length),
         "Error message: Please reset Context parallelism or seq_length, make seq_length % cp = 0. \n"),
        (check_hidden(hidden_size, head_num),
         "Error message: Please reset hidden_size or head_num, make hidden_size % head_num = 0. \n"),
        (check_b_global(b_global, b, dp, cp),
         "Error message: Please reset b_global or batch_size, make b_global % (batch_size * dp * cp) = 0. \n"),
        (check_num_microbatch(layer_num, vp, pp, num_microbatches),
         "Error message: Please reset b_global or batch_size or layer_num or Virtual Pipeline Size, make layer_num % (pp * vp) = 0, num_microbatches % pp = 0. \n"),
    ]
    failures = [message for passed, message in checks if not passed]
    return not failures, "".join(failures)
 with gr.Blocks() as demo:
     with gr.Row():
         # Text
             """
         )
+    with gr.Row():
+        with gr.Column():
+            # Input 1.[Model Parameters]
+            gr.Markdown(
+                """
+                <h1>Model Parameters:</h1>
+                """
+            )
+            with gr.Accordion("Model Parameters"):
+                # with gr.Row():
+                act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type") #, info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
+                with gr.Row():
+                    vocab_size = gr.Number(label="Vocab size", value=32000)
+                    layer_num = gr.Number(label="Layer number", value=32)
+                with gr.Row():
+                    hidden_size = gr.Number(label="Hidden size", value=4096)
+                    ffn_size = gr.Number(label="FFN Hidden size", value=11008)
+                with gr.Row():
+                    sequence_len = gr.Number(label="Sequence length", value=2048)
+                    head_num = gr.Number(label="Number of Attention Heads", value=32)
+                with gr.Row():
+                    is_group_query = gr.Radio(["True", "False"], value="False", label="Use Group Query Attention")
+                    group_query_num = gr.Textbox(label="Number of Query Groups", max_lines=1, value=None, interactive=False)
+                is_bias = gr.Radio(["True", "False"], value="False", label="Use Bias")
+                # change editable function
+                def toggle_textbox_editable(radio_value):  # callback for is_group_query.change (wired up right after this def)
+                    # Make the textbox editable (prefilled with "96") only when radio_value is "True"; otherwise lock and clear it
+                    if radio_value == "True":
+                        return gr.update(interactive=True, value="96")
+                    else:
+                        return gr.update(interactive=False, value="")
+                # Connect changes of the radio component to the function
+                is_group_query.change(toggle_textbox_editable, inputs=is_group_query, outputs=group_query_num)
+        with gr.Column():
+            # Input 2.[Parallelism]
+            gr.Markdown(
+                """
+                <h1>Parallelism config:</h1>
+                """
+            )
+            with gr.Accordion("Parallelism config"):
+                # with gr.Row():
+                dp = gr.Number(label="Data parallelism", value=1)
+                tp = gr.Number(label="Tensor parallelism", value=2)
+                pp = gr.Number(label="Pipeline parallelism", value=2)
+                cp = gr.Number(label="Context parallelism", value=2)
+                # with gr.Row():
+                is_sp = gr.Radio(["True", "False"], value="True", label="Sequence parallelism")
                 vp = gr.Number(label="Virtual Pipeline Size")
+                is_dist_opt = gr.Radio(["True", "False"], value="True", label="Use Distributed Optimizer(Zero1)")
+        with gr.Column():
+            # Input 3.[Training Settings]
+            gr.Markdown(
+                """
+                <h1>Training Config:</h1>
+                """
+            )
+            with gr.Accordion("Training Config"):
+                # with gr.Row():
+                b = gr.Number(label="Micro Batch size", value=4)
+                b_global = gr.Number(label="Global Batch size", value=64)
+                # with gr.Row():
+                gr.Checkbox(label="True", value=True, info="BF16 Training")
+                is_fp8 = gr.Radio(["True", "False"], value="True", label="FP8 Training")
+                is_fp8_init = gr.Radio(["True", "False"], value="True", label="FP8 Initialization(will reduce memory)")
+                # with gr.Row():
+                g_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Gradients Dtype")
+                o_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Optimizer State Dtype")
+    compute_btn = gr.Button("Compute")
+    with gr.Tab("Output"):
+        with gr.Column():
+            gr.Markdown(
+                """
+                <h1>Output Data:</h1>
+                """
+            )
+            output_text = gr.Textbox(
+                label="Compute result",
+                interactive=False,
+            )
+    with gr.Tab("Formula"):
         formula = formula
         gr.Markdown(
             , latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
         )
+    gr.Markdown(abbr)
     record_df = gr.Dataframe(
         label="Record Table",
+        headers=col,
+        interactive=False
     )
+    download_btn = gr.Button("Download")
     count = gr.Number(label="Row count", value=1, visible=False)
     compute_btn.click(
         fn=Compute_ALL_Model_memory,
         inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, act_func,
+                dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count],
         outputs=[output_text, record_df, count]
     )
 if __name__ == "__main__":
+    demo.launch(allowed_paths=["/"])