Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,11 +1,22 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
-
col=['Layer number', 'Hidden size', 'FFN Hidden size', 'Sequence length', 'Head number', 'Group number',
|
| 5 |
-
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
def Get_GigaByte(memory):
|
| 11 |
return memory / 1024**3
|
|
@@ -46,7 +57,7 @@ def Compute_Parameters_mlp(hidden_size, ffn_size, is_bias, act_func, tp):
|
|
| 46 |
num_parameters_mlp = 2 * hidden_size
|
| 47 |
# mlp1 weight: h*ffn/tp, bias: ffn/tp
|
| 48 |
# mlp2 weight: ffn*h/tp, bias: h
|
| 49 |
-
if act_func == "
|
| 50 |
num_parameters_mlp += hidden_size * ffn_size * 3 / tp
|
| 51 |
if is_bias == "True":
|
| 52 |
num_parameters_mlp += ffn_size * 2 / tp + hidden_size
|
|
@@ -178,7 +189,7 @@ def compute_activation_memory_mlp(activation_dtype, seq_length, b, hidden_size,
|
|
| 178 |
if is_sp == "False":
|
| 179 |
activation_mem_mlp_fc1 *= tp
|
| 180 |
# Act 8bsh
|
| 181 |
-
if act_func == "
|
| 182 |
activation_mem_mlp_act = seq_length * b * ffn_size * 2 * 2
|
| 183 |
else:
|
| 184 |
activation_mem_mlp_act = seq_length * b * ffn_size * 2
|
|
@@ -207,21 +218,21 @@ def compute_activation_memory_output(seq_length, b, hidden_size, vocab_size):
|
|
| 207 |
# Inputs to output layer and CE loss(bf16, fp32 * 2).
|
| 208 |
return 2 * seq_length * b * hidden_size + (2 + 4 + 4) * seq_length * b * vocab_size
|
| 209 |
|
| 210 |
-
def compute_activation_memory_pp(activation_memory,
|
| 211 |
# Multiply by interleaved PP memory factor.
|
| 212 |
-
if
|
| 213 |
interleaved_schedule_memory_penalty = 1 + (pp - 1) / (pp * vp)
|
| 214 |
activation_memory *= interleaved_schedule_memory_penalty
|
| 215 |
|
| 216 |
# If using non-interleaved schedule, number of microbatches in pipeline can be less than pp_size,
|
| 217 |
# so discount accordingly.
|
| 218 |
-
if
|
| 219 |
if num_microbatches > 1:
|
| 220 |
activation_memory *= min(1, num_microbatches / pp)
|
| 221 |
|
| 222 |
return activation_memory
|
| 223 |
|
| 224 |
-
def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp,
|
| 225 |
# Using formula in Table 2 of https://arxiv.org/pdf/2205.05198.pdf.
|
| 226 |
# We are trying to compute the maximum activation footprint, so all calculations in this function
|
| 227 |
# are for the first pipeline stage.
|
|
@@ -252,7 +263,7 @@ def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, he
|
|
| 252 |
|
| 253 |
# get num_microbatches
|
| 254 |
num_microbatches = b_global / b / dp / cp
|
| 255 |
-
activation_memory = compute_activation_memory_pp(activation_memory,
|
| 256 |
|
| 257 |
if pp == 1:
|
| 258 |
# Inputs to output layer and CE loss(fp32).
|
|
@@ -267,13 +278,22 @@ def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, he
|
|
| 267 |
|
| 268 |
# compute_btn.click.function
|
| 269 |
def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, act_func,
|
| 270 |
-
dp, tp, pp, cp, is_sp,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
# get model states
|
| 272 |
numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(vocab_size, layer_num, hidden_size,
|
| 273 |
ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
|
| 274 |
|
| 275 |
# get activation memory
|
| 276 |
-
activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp,
|
| 277 |
|
| 278 |
# get model parameters
|
| 279 |
numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
|
|
@@ -289,7 +309,7 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
|
|
| 289 |
|
| 290 |
# record
|
| 291 |
new_row = pd.DataFrame([[layer_num, hidden_size, ffn_size, seq_length, head_num, group_query_num, dp, tp, pp, cp, gpu_num, b, is_fp8,
|
| 292 |
-
|
| 293 |
columns=col)
|
| 294 |
if count == 1:
|
| 295 |
record_df = new_row
|
|
@@ -300,8 +320,8 @@ def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_l
|
|
| 300 |
# return str(gpu_num), str(model_states) + " GB", str(activation) + " GB", str(total) + " GB", table_data
|
| 301 |
return f"""
|
| 302 |
GPU numbers = {str(gpu_num)}, \n
|
| 303 |
-
|
| 304 |
-
Model parameters = {str(numParameters)} B, \n
|
| 305 |
Model_states = {str(model_states_memory)} GB, \n
|
| 306 |
Activation = {str(activation_memory)} GB, \n
|
| 307 |
Total memory consumption = {str(Total)} GB \n
|
|
@@ -317,71 +337,36 @@ def generate_csv(record_df):
|
|
| 317 |
|
| 318 |
# formula string
|
| 319 |
formula = r"""
|
| 320 |
-
> **Note**🔑: In this formula, we assume LLM training with
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
$$
|
| 324 |
-
P_{input} = \frac{HV}{tp}, \quad
|
| 325 |
-
P_{output} = 2H \\\\
|
| 326 |
-
P_{attn} = 2H + \frac{2H^2 + 2H_{KV} \times H}{tp}, \quad
|
| 327 |
-
P_{MLP} = 2H +
|
| 328 |
-
\\begin{cases}
|
| 329 |
-
\frac{3H \times FFN}{tp}, & \text{if }GLU\text{ is True} \\\\
|
| 330 |
-
\frac{2H \times FFN}{tp}, & \text{if }GLU\text{ is False}
|
| 331 |
-
\\end{cases} \\\\
|
| 332 |
-
P_{middle} = \frac{(P_{attn} + P_{MLP}) \times L}{pp} \\\\
|
| 333 |
-
P = P_{input} + P_{middle} +
|
| 334 |
-
\\begin{cases}
|
| 335 |
-
P_{output}, & \text{if }pp = 1 \\\\
|
| 336 |
-
0, & \text{if }pp > 1
|
| 337 |
-
\\end{cases} \\\\
|
| 338 |
{Total\ Model\ parameters} =
|
| 339 |
-
|
| 340 |
-
P, & \text{set tp = 1, pp = 1} \\\\
|
| 341 |
-
2HV + 2H + (4H + 2H^2 + 2H_{KV} \times H + 3FFN \times H) \times L, & \text{general formula}
|
| 342 |
-
\\end{cases} \\\\
|
| 343 |
-
{Model\ states} = {Model\ weight} + {Gradient} + {Optimizer\ state} + {Master\ weight} =
|
| 344 |
-
\\begin{cases}
|
| 345 |
-
18P, & \text{BF16 training} \\\\
|
| 346 |
-
18P, & \text{FP8 training with FP8 Init} \\\\
|
| 347 |
-
20P, & \text{FP8 training w/o FP8 Init}
|
| 348 |
-
\\end{cases} \\\\
|
| 349 |
$$
|
| 350 |
-
|
| 351 |
***
|
| 352 |
|
| 353 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 354 |
$$
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
10SBV, & \text{if }pp\text{ = 1} \\\\
|
| 359 |
-
0, & \text{if }pp\text{ > 1}
|
| 360 |
-
\\end{cases} \\\\
|
| 361 |
-
A_{attn} = 5SBH + 4SB \times H_{KV} +
|
| 362 |
-
\\begin{cases}
|
| 363 |
-
2SBH, & \text{if } FP8 \text{ is True} \\\\
|
| 364 |
-
4SBH, & \text{if } FP8 \text{ is False}
|
| 365 |
-
\\end{cases} \\\\
|
| 366 |
-
A_{MLP} = 3SBH +
|
| 367 |
-
\\begin{cases}
|
| 368 |
-
SBH + SB \times FFN + 4SB \times FFN, & \text{if }FP8 \text{ is True and }GLU \text{ is True} \\\\
|
| 369 |
-
2SBH + 2SB \times FFN + 4SB \times FFN, & \text{if }FP8 \text{ is False and }GLU \text{ is True} \\\\
|
| 370 |
-
SBH + SB \times FFN + 2SB \times FFN, & \text{if }FP8 \text{ is True and }GLU \text{ is False} \\\\
|
| 371 |
-
2SBH + 2SB \times FFN + 2SB \times FFN, & \text{if }FP8 \text{ is False and }GLU \text{ is False}
|
| 372 |
-
\\end{cases} \\\\
|
| 373 |
-
A_{middle} = (A_{attn} + A_{MLP}) \times L \\\\
|
| 374 |
-
A_{ip} = (A_{input} + A_{middle}) \times
|
| 375 |
-
\\begin{cases}
|
| 376 |
-
(1 + \frac{pp - 1}{pp \times vp}), & \text{if } Interleaved\ Pipeline \text{ is True} \\\\
|
| 377 |
-
min(1, \frac{microbatch}{pp}), & \text{if } Interleaved\ Pipeline \text{ is False and pp > 1} \\\\
|
| 378 |
-
1, & \text{other}
|
| 379 |
-
\\end{cases} \\\\
|
| 380 |
-
Activation =
|
| 381 |
-
\\begin{cases}
|
| 382 |
-
\frac{A_{ip} + A_{output}}{tp \times cp}, & \text{if pp = 1} \\\\
|
| 383 |
-
\frac{A_{ip} + 2BSH}{tp \times cp}, & \text{if pp > 1}
|
| 384 |
-
\\end{cases}
|
| 385 |
$$
|
| 386 |
|
| 387 |
***
|
|
@@ -394,6 +379,76 @@ formula = r"""
|
|
| 394 |
$$
|
| 395 |
"""
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
with gr.Blocks() as demo:
|
| 398 |
with gr.Row():
|
| 399 |
# Text
|
|
@@ -406,64 +461,92 @@ with gr.Blocks() as demo:
|
|
| 406 |
"""
|
| 407 |
)
|
| 408 |
|
| 409 |
-
with gr.
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
vp = gr.Number(label="Virtual Pipeline Size")
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 467 |
formula = formula
|
| 468 |
|
| 469 |
gr.Markdown(
|
|
@@ -471,25 +554,19 @@ with gr.Blocks() as demo:
|
|
| 471 |
, latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
|
| 472 |
)
|
| 473 |
|
| 474 |
-
|
| 475 |
-
label="Compute result",
|
| 476 |
-
interactive=False,
|
| 477 |
-
)
|
| 478 |
|
| 479 |
-
# Button
|
| 480 |
-
with gr.Row():
|
| 481 |
-
compute_btn = gr.Button("Compute")
|
| 482 |
-
download_btn = gr.Button("Download")
|
| 483 |
-
|
| 484 |
record_df = gr.Dataframe(
|
| 485 |
label="Record Table",
|
| 486 |
-
headers=col
|
|
|
|
| 487 |
)
|
|
|
|
| 488 |
count = gr.Number(label="Row count", value=1, visible=False)
|
| 489 |
compute_btn.click(
|
| 490 |
fn=Compute_ALL_Model_memory,
|
| 491 |
inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, act_func,
|
| 492 |
-
dp, tp, pp, cp, is_sp,
|
| 493 |
outputs=[output_text, record_df, count]
|
| 494 |
)
|
| 495 |
|
|
@@ -503,4 +580,4 @@ with gr.Blocks() as demo:
|
|
| 503 |
|
| 504 |
|
| 505 |
if __name__ == "__main__":
|
| 506 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pandas as pd
|
| 3 |
|
| 4 |
+
# col=['Layer number', 'Hidden size', 'FFN Hidden size', 'Sequence length', 'Head number', 'Group number',
|
| 5 |
+
# 'dp', 'tp', 'pp', 'cp', 'GPU numbers', 'Batch size', 'FP8', 'Model parameters', 'Model_states', 'Activation', 'Total']
|
| 6 |
|
| 7 |
+
# Column headers of the record table. The model-shape columns use the short
# symbols that the ``abbr`` legend below spells out.
col = [
    'L', 'H', 'FFN', 'S', 'A', 'G',
    'dp', 'tp', 'pp', 'cp',
    'GPU number', 'Batch size', 'FP8',
    'Model parameters', 'Model states', 'Activation', 'Total',
]

# Markdown legend mapping each abbreviated column header to its full name.
abbr = """
<div align="center">

> **Abbreviations of symbols:**
|Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|Abbr|Full name|
|---|---|---|---|---|---|---|---|---|---|---|---|
|L|Layer number|H|Hidden size|FFN|FFN Hidden size|S|Sequence length|A|Head number|G|Group number|

</div>
"""
|
| 20 |
|
| 21 |
def Get_GigaByte(memory):
    """Convert a size in bytes to units of 1024**3 bytes.

    Args:
        memory: Size in bytes.

    Returns:
        ``memory`` divided by ``1024**3``.
    """
    bytes_per_unit = 1024 * 1024 * 1024
    return memory / bytes_per_unit
|
|
|
|
| 57 |
num_parameters_mlp = 2 * hidden_size
|
| 58 |
# mlp1 weight: h*ffn/tp, bias: ffn/tp
|
| 59 |
# mlp2 weight: ffn*h/tp, bias: h
|
| 60 |
+
if act_func == "LLaMA":
|
| 61 |
num_parameters_mlp += hidden_size * ffn_size * 3 / tp
|
| 62 |
if is_bias == "True":
|
| 63 |
num_parameters_mlp += ffn_size * 2 / tp + hidden_size
|
|
|
|
| 189 |
if is_sp == "False":
|
| 190 |
activation_mem_mlp_fc1 *= tp
|
| 191 |
# Act 8bsh
|
| 192 |
+
if act_func == "LLaMA":
|
| 193 |
activation_mem_mlp_act = seq_length * b * ffn_size * 2 * 2
|
| 194 |
else:
|
| 195 |
activation_mem_mlp_act = seq_length * b * ffn_size * 2
|
|
|
|
| 218 |
# Inputs to output layer and CE loss(bf16, fp32 * 2).
|
| 219 |
return 2 * seq_length * b * hidden_size + (2 + 4 + 4) * seq_length * b * vocab_size
|
| 220 |
|
| 221 |
+
def compute_activation_memory_pp(activation_memory, vp, pp, num_microbatches):
    """Scale an activation-memory estimate for the pipeline schedule in use.

    Args:
        activation_memory: Activation memory estimate to adjust.
        vp: Virtual pipeline size. A positive value selects the interleaved
            branch; ``0`` or ``None`` selects the non-interleaved branch.
        pp: Pipeline-parallel size.
        num_microbatches: Number of microbatches in the pipeline.

    Returns:
        The adjusted activation memory.
    """
    # The "Virtual Pipeline Size" field has no default value, so it can be
    # None here; comparing None with 0 would raise TypeError. Treat it as
    # "no virtual pipeline".
    if vp is None:
        vp = 0

    # Multiply by interleaved PP memory factor.
    if vp > 0:
        interleaved_schedule_memory_penalty = 1 + (pp - 1) / (pp * vp)
        activation_memory *= interleaved_schedule_memory_penalty

    # If using non-interleaved schedule, number of microbatches in pipeline can be less than pp_size,
    # so discount accordingly.
    if vp == 0 and pp > 1 and num_microbatches > 1:
        activation_memory *= min(1, num_microbatches / pp)

    return activation_memory
|
| 234 |
|
| 235 |
+
def compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp):
|
| 236 |
# Using formula in Table 2 of https://arxiv.org/pdf/2205.05198.pdf.
|
| 237 |
# We are trying to compute the maximum activation footprint, so all calculations in this function
|
| 238 |
# are for the first pipeline stage.
|
|
|
|
| 263 |
|
| 264 |
# get num_microbatches
|
| 265 |
num_microbatches = b_global / b / dp / cp
|
| 266 |
+
activation_memory = compute_activation_memory_pp(activation_memory, vp, pp, num_microbatches)
|
| 267 |
|
| 268 |
if pp == 1:
|
| 269 |
# Inputs to output layer and CE loss(fp32).
|
|
|
|
| 278 |
|
| 279 |
# compute_btn.click.function
|
| 280 |
def Compute_ALL_Model_memory(vocab_size, layer_num, hidden_size, ffn_size, seq_length, head_num, is_group_query, group_query_num, is_bias, act_func,
|
| 281 |
+
dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count):
|
| 282 |
+
# data type trans
|
| 283 |
+
if is_group_query == "True":
|
| 284 |
+
group_query_num = int(group_query_num)
|
| 285 |
+
|
| 286 |
+
# check input
|
| 287 |
+
[result, Error_message] = check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global)
|
| 288 |
+
if result == False:
|
| 289 |
+
return Error_message, record_df, count
|
| 290 |
+
|
| 291 |
# get model states
|
| 292 |
numParameters, weight_memory, gradient_memory, optimizer_memory, master_weight_memory, model_states_memory = Compute_Model_states(vocab_size, layer_num, hidden_size,
|
| 293 |
ffn_size, head_num, is_group_query, group_query_num, is_bias, act_func, dp, tp, pp, cp, is_dist_opt, is_fp8, is_fp8_init, g_ty, o_ty)
|
| 294 |
|
| 295 |
# get activation memory
|
| 296 |
+
activation_memory = compute_activation_memory(vocab_size, seq_length, layer_num, b, b_global, head_num, hidden_size, ffn_size, act_func, is_fp8, is_sp, is_group_query, group_query_num, tp, pp, dp, cp, vp)
|
| 297 |
|
| 298 |
# get model parameters
|
| 299 |
numParametersTotal = Compute_Parameters(vocab_size, layer_num, hidden_size, ffn_size, is_group_query, group_query_num, is_bias, act_func, head_num, 1, 1)
|
|
|
|
| 309 |
|
| 310 |
# record
|
| 311 |
new_row = pd.DataFrame([[layer_num, hidden_size, ffn_size, seq_length, head_num, group_query_num, dp, tp, pp, cp, gpu_num, b, is_fp8,
|
| 312 |
+
numParameters, model_states_memory, activation_memory, Total]],
|
| 313 |
columns=col)
|
| 314 |
if count == 1:
|
| 315 |
record_df = new_row
|
|
|
|
| 320 |
# return str(gpu_num), str(model_states) + " GB", str(activation) + " GB", str(total) + " GB", table_data
|
| 321 |
return f"""
|
| 322 |
GPU numbers = {str(gpu_num)}, \n
|
| 323 |
+
Model parameters = {str(numParametersTotal)} B, \n
|
| 324 |
+
Model parameters on each device = {str(numParameters)} B, \n
|
| 325 |
Model_states = {str(model_states_memory)} GB, \n
|
| 326 |
Activation = {str(activation_memory)} GB, \n
|
| 327 |
Total memory consumption = {str(Total)} GB \n
|
|
|
|
| 337 |
|
| 338 |
# formula string
|
| 339 |
formula = r"""
|
| 340 |
+
> **Note**🔑: In this formula, we assume LLM training with FP8 training.
|
| 341 |
+
> 1. Interleaved pipeline.
|
| 342 |
+
> 2. bias = False.
|
| 343 |
+
> 3. SP = True.
|
| 344 |
+
|
| 345 |
+
<div align="center">
|
| 346 |
+
<img src=file/T1.jpg width=50%/>
|
| 347 |
+
</div>
|
| 348 |
+
|
| 349 |
$$
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
{Total\ Model\ parameters} =
|
| 351 |
+
HV + HS + (4H^2 + 3H \times FFN + 2H) \times L + 2H + HV
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
$$
|
| 353 |
+
|
| 354 |
***
|
| 355 |
|
| 356 |
+
<div align="center">
|
| 357 |
+
<img src=file/ms.png width=40%/>
|
| 358 |
+
</div>
|
| 359 |
+
|
| 360 |
+
$$
|
| 361 |
+
{Model\ states} =
|
| 362 |
+
(6 + \frac{12}{dp}) \times
|
| 363 |
+
(\frac{(\frac{4h^2 + 3H \times FFN}{tp} + 2H) \times L}{pp} + \frac{HV}{tp} + HS)
|
| 364 |
+
$$
|
| 365 |
+
|
| 366 |
$$
|
| 367 |
+
{Activation} =
|
| 368 |
+
(1 + \frac{pp-1}{pp \times vp}) \times
|
| 369 |
+
\frac{(8BS + BSH) \times pp + 15BSH + 5BS \times FFN}{tp \times cp}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 370 |
$$
|
| 371 |
|
| 372 |
***
|
|
|
|
| 379 |
$$
|
| 380 |
"""
|
| 381 |
|
| 382 |
+
def check_tp(tp, head_num):
    """Check that the attention heads divide evenly by the tensor-parallel size.

    Args:
        tp: Tensor-parallel size.
        head_num: Number of attention heads.

    Returns:
        bool: ``True`` if ``head_num % tp == 0``. ``False`` otherwise,
        including when ``tp`` is zero or missing or ``head_num`` is missing.
    """
    # A zero or empty field cannot be used as a modulus; report it as an
    # invalid configuration instead of raising ZeroDivisionError/TypeError.
    if not tp or head_num is None:
        return False
    return head_num % tp == 0
|
| 387 |
+
|
| 388 |
+
def check_pp(pp, layer_num):
    """Check that the layers divide evenly by the pipeline-parallel size.

    Args:
        pp: Pipeline-parallel size.
        layer_num: Number of layers.

    Returns:
        bool: ``True`` if ``layer_num % pp == 0``. ``False`` otherwise,
        including when ``pp`` is zero or missing or ``layer_num`` is missing.
    """
    # A zero or empty field cannot be used as a modulus; report it as an
    # invalid configuration instead of raising ZeroDivisionError/TypeError.
    if not pp or layer_num is None:
        return False
    return layer_num % pp == 0
|
| 393 |
+
|
| 394 |
+
def check_cp(cp, seq_length):
    """Check that the sequence length divides evenly by the context-parallel size.

    Args:
        cp: Context-parallel size.
        seq_length: Sequence length.

    Returns:
        bool: ``True`` if ``seq_length % cp == 0``. ``False`` otherwise,
        including when ``cp`` is zero or missing or ``seq_length`` is missing.
    """
    # A zero or empty field cannot be used as a modulus; report it as an
    # invalid configuration instead of raising ZeroDivisionError/TypeError.
    if not cp or seq_length is None:
        return False
    return seq_length % cp == 0
|
| 399 |
+
|
| 400 |
+
def check_hidden(hidden_size, head_num):
    """Check that the hidden size divides evenly by the number of attention heads.

    Args:
        hidden_size: Hidden size.
        head_num: Number of attention heads.

    Returns:
        bool: ``True`` if ``hidden_size % head_num == 0``. ``False``
        otherwise, including when ``head_num`` is zero or missing or
        ``hidden_size`` is missing.
    """
    # A zero or empty head count cannot be used as a modulus; report it as
    # an invalid configuration instead of raising ZeroDivisionError/TypeError.
    if not head_num or hidden_size is None:
        return False
    return hidden_size % head_num == 0
|
| 405 |
+
|
| 406 |
+
def check_b_global(b_global, b, dp, cp):
    """Check that the global batch size is a multiple of ``b * dp * cp``.

    Args:
        b_global: Global batch size.
        b: Micro batch size.
        dp: Data-parallel size.
        cp: Context-parallel size.

    Returns:
        bool: ``True`` if ``b_global % (b * dp * cp) == 0``. ``False``
        otherwise, including when any argument is missing or the product
        ``b * dp * cp`` is zero.
    """
    # Missing fields cannot be multiplied; treat them as an invalid
    # configuration rather than raising TypeError.
    if None in (b_global, b, dp, cp):
        return False
    samples_per_step = b * dp * cp
    # A zero product would raise ZeroDivisionError in the modulo below.
    if samples_per_step == 0:
        return False
    return b_global % samples_per_step == 0
|
| 411 |
+
|
| 412 |
+
def check_num_microbatch(layer_num, vp, pp, num_microbatches):
    """Validate layer and microbatch counts against the pipeline schedule.

    With a positive ``vp`` the layer count must be a multiple of ``pp * vp``.
    With ``vp == 0`` and ``pp > 1``, a microbatch count greater than 1 must be
    a multiple of ``pp``. Every other combination is accepted.

    Args:
        layer_num: Number of layers.
        vp: Virtual pipeline size; ``None`` is treated like ``0``.
        pp: Pipeline-parallel size.
        num_microbatches: Number of microbatches.

    Returns:
        bool: ``True`` if the configuration passes the applicable check.
    """
    # The "Virtual Pipeline Size" field has no default value, so it can be
    # None here; comparing None with 0 would raise TypeError.
    if vp is None:
        vp = 0

    if vp > 0:
        # A zero or missing pp makes the modulus zero or undefined; report
        # the configuration as invalid instead of raising.
        if not pp:
            return False
        return layer_num % (pp * vp) == 0

    if vp == 0 and pp > 1 and num_microbatches > 1:
        return num_microbatches % pp == 0

    return True
|
| 426 |
+
|
| 427 |
+
|
| 428 |
+
def check_input(dp, tp, pp, cp, hidden_size, head_num, layer_num, seq_length, vp, b, b_global):
    """Run every configuration check and collect the failure messages.

    Args:
        dp: Data-parallel size.
        tp: Tensor-parallel size.
        pp: Pipeline-parallel size.
        cp: Context-parallel size.
        hidden_size: Hidden size.
        head_num: Number of attention heads.
        layer_num: Number of layers.
        seq_length: Sequence length.
        vp: Virtual pipeline size; ``None`` is treated like ``0``.
        b: Micro batch size.
        b_global: Global batch size.

    Returns:
        tuple: ``(result, Error_message)``. ``result`` is ``True`` when every
        check passes; ``Error_message`` concatenates one line per failed check
        and is empty on success.
    """
    # dp, tp, pp, cp, head_num and b are all used as divisors below (directly
    # or inside the check_* helpers). A zero or empty field would raise
    # ZeroDivisionError/TypeError before any message could be produced, so
    # reject those up front with a message of their own.
    if b_global is None or not all((dp, tp, pp, cp, head_num, b)):
        return False, ("Error message: Please fill in b_global and set dp, tp, pp, cp, "
                       "head_num and batch_size to non-zero values. \n")

    # An empty "Virtual Pipeline Size" field is treated as no virtual pipeline.
    if vp is None:
        vp = 0

    num_microbatches = b_global / b / dp / cp

    # (check passed?, message to report when it did not), in display order.
    checks = [
        (check_tp(tp, head_num),
         "Error message: Please reset Tensor parallelism or head_num, make head_num % tp = 0. \n"),
        (check_pp(pp, layer_num),
         "Error message: Please reset Pipeline parallelism or layer_num, make layer_num % pp = 0. \n"),
        (check_cp(cp, seq_length),
         "Error message: Please reset Context parallelism or seq_length, make seq_length % cp = 0. \n"),
        (check_hidden(hidden_size, head_num),
         "Error message: Please reset hidden_size or head_num, make hidden_size % head_num = 0. \n"),
        (check_b_global(b_global, b, dp, cp),
         "Error message: Please reset b_global or batch_size, make b_global % (batch_size * dp * cp) = 0. \n"),
        (check_num_microbatch(layer_num, vp, pp, num_microbatches),
         "Error message: Please reset b_global or batch_size or layer_num or Virtual Pipeline Size, make layer_num % (pp * vp) = 0, num_microbatches % pp = 0. \n"),
    ]

    Error_message = "".join(message for passed, message in checks if not passed)
    result = not Error_message
    return result, Error_message
|
| 451 |
+
|
| 452 |
with gr.Blocks() as demo:
|
| 453 |
with gr.Row():
|
| 454 |
# Text
|
|
|
|
| 461 |
"""
|
| 462 |
)
|
| 463 |
|
| 464 |
+
with gr.Row():
|
| 465 |
+
with gr.Column():
|
| 466 |
+
# Input 1.[Model Parameters]
|
| 467 |
+
gr.Markdown(
|
| 468 |
+
"""
|
| 469 |
+
<h1>Model Parameters:</h1>
|
| 470 |
+
"""
|
| 471 |
+
)
|
| 472 |
+
with gr.Accordion("Model Parameters"):
|
| 473 |
+
# with gr.Row():
|
| 474 |
+
act_func = gr.Radio(["LLaMA", "GPT"], value="LLaMA", label="Model type") #, info="Action Function in MLP, whether to use GLU (Gated Linear Unit). [e.g \"True\" for LlaMA, \"False\" for GPT.]")
|
| 475 |
+
with gr.Row():
|
| 476 |
+
vocab_size = gr.Number(label="Vocab size", value=32000)
|
| 477 |
+
layer_num = gr.Number(label="Layer number", value=32)
|
| 478 |
+
with gr.Row():
|
| 479 |
+
hidden_size = gr.Number(label="Hidden size", value=4096)
|
| 480 |
+
ffn_size = gr.Number(label="FFN Hidden size", value=11008)
|
| 481 |
+
with gr.Row():
|
| 482 |
+
sequence_len = gr.Number(label="Sequence length", value=2048)
|
| 483 |
+
head_num = gr.Number(label="Number of Attention Heads", value=32)
|
| 484 |
+
with gr.Row():
|
| 485 |
+
is_group_query = gr.Radio(["True", "False"], value="False", label="Use Group Query Attention")
|
| 486 |
+
group_query_num = gr.Textbox(label="Number of Query Groups", max_lines=1, value=None, interactive=False)
|
| 487 |
+
is_bias = gr.Radio(["True", "False"], value="False", label="Use Bias")
|
| 488 |
+
|
| 489 |
+
# change editable function
|
| 490 |
+
def toggle_textbox_editable(radio_value):
    """Build the component update that enables or disables the textbox.

    Args:
        radio_value: Current value of the radio button, ``"True"`` or ``"False"``.

    Returns:
        A ``gr.update`` that makes the textbox interactive with value ``"96"``
        when ``radio_value`` is ``"True"``, and non-interactive with an empty
        value otherwise.
    """
    # Whether the textbox is editable follows the value of the radio button.
    editable = radio_value == "True"
    return gr.update(interactive=editable, value="96" if editable else "")
|
| 496 |
+
# Connect the radio component's change event to the function
|
| 497 |
+
is_group_query.change(toggle_textbox_editable, inputs=is_group_query, outputs=group_query_num)
|
| 498 |
+
|
| 499 |
+
with gr.Column():
|
| 500 |
+
# Input 2.[Parallelism]
|
| 501 |
+
gr.Markdown(
|
| 502 |
+
"""
|
| 503 |
+
<h1>Parallelism config:</h1>
|
| 504 |
+
"""
|
| 505 |
+
)
|
| 506 |
+
with gr.Accordion("Parallelism config"):
|
| 507 |
+
# with gr.Row():
|
| 508 |
+
dp = gr.Number(label="Data parallelism", value=1)
|
| 509 |
+
tp = gr.Number(label="Tensor parallelism", value=2)
|
| 510 |
+
pp = gr.Number(label="Pipeline parallelism", value=2)
|
| 511 |
+
cp = gr.Number(label="Context parallelism", value=2)
|
| 512 |
+
# with gr.Row():
|
| 513 |
+
is_sp = gr.Radio(["True", "False"], value="True", label="Sequence parallelism")
|
| 514 |
vp = gr.Number(label="Virtual Pipeline Size")
|
| 515 |
+
is_dist_opt = gr.Radio(["True", "False"], value="True", label="Use Distributed Optimizer(Zero1)")
|
| 516 |
+
|
| 517 |
+
with gr.Column():
|
| 518 |
+
# Input 3.[Training Settings]
|
| 519 |
+
gr.Markdown(
|
| 520 |
+
"""
|
| 521 |
+
<h1>Training Config:</h1>
|
| 522 |
+
"""
|
| 523 |
+
)
|
| 524 |
+
with gr.Accordion("Training Config"):
|
| 525 |
+
# with gr.Row():
|
| 526 |
+
b = gr.Number(label="Micro Batch size", value=4)
|
| 527 |
+
b_global = gr.Number(label="Global Batch size", value=64)
|
| 528 |
+
# with gr.Row():
|
| 529 |
+
gr.Checkbox(label="True", value=True, info="BF16 Training")
|
| 530 |
+
is_fp8 = gr.Radio(["True", "False"], value="True", label="FP8 Training")
|
| 531 |
+
is_fp8_init = gr.Radio(["True", "False"], value="True", label="FP8 Initialization(will reduce memory)")
|
| 532 |
+
# with gr.Row():
|
| 533 |
+
g_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Gradients Dtype")
|
| 534 |
+
o_ty = gr.Dropdown(["FP32", "BF16"], value="FP32", label="Optimizer State Dtype")
|
| 535 |
+
|
| 536 |
+
compute_btn = gr.Button("Compute")
|
| 537 |
+
with gr.Tab("Output"):
|
| 538 |
+
with gr.Column():
|
| 539 |
+
gr.Markdown(
|
| 540 |
+
"""
|
| 541 |
+
<h1>Output Data:</h1>
|
| 542 |
+
"""
|
| 543 |
+
)
|
| 544 |
+
output_text = gr.Textbox(
|
| 545 |
+
label="Compute result",
|
| 546 |
+
interactive=False,
|
| 547 |
+
)
|
| 548 |
+
|
| 549 |
+
with gr.Tab("Formula"):
|
| 550 |
formula = formula
|
| 551 |
|
| 552 |
gr.Markdown(
|
|
|
|
| 554 |
, latex_delimiters=[{ "left": "$$", "right": "$$", "display": True }]
|
| 555 |
)
|
| 556 |
|
| 557 |
+
gr.Markdown(abbr)
|
|
|
|
|
|
|
|
|
|
| 558 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
record_df = gr.Dataframe(
|
| 560 |
label="Record Table",
|
| 561 |
+
headers=col,
|
| 562 |
+
interactive=False
|
| 563 |
)
|
| 564 |
+
download_btn = gr.Button("Download")
|
| 565 |
count = gr.Number(label="Row count", value=1, visible=False)
|
| 566 |
compute_btn.click(
|
| 567 |
fn=Compute_ALL_Model_memory,
|
| 568 |
inputs=[vocab_size, layer_num, hidden_size, ffn_size, sequence_len, head_num, is_group_query, group_query_num, is_bias, act_func,
|
| 569 |
+
dp, tp, pp, cp, is_sp, vp, is_dist_opt, b, b_global, is_fp8, is_fp8_init, g_ty, o_ty, record_df, count],
|
| 570 |
outputs=[output_text, record_df, count]
|
| 571 |
)
|
| 572 |
|
|
|
|
| 580 |
|
| 581 |
|
| 582 |
if __name__ == "__main__":
|
| 583 |
+
demo.launch(allowed_paths=["/"])
|