Spaces:
Running
Running
Commit ·
d51b632
1
Parent(s): 5f67cc3
small fixes
Browse files- app.py +8 -4
- requirements.txt +1 -0
- utils.py +50 -5
app.py
CHANGED
|
@@ -28,7 +28,9 @@ def load_config_from_yaml_content(yaml_content):
|
|
| 28 |
'pp': parallelism['pp'],
|
| 29 |
'dp': parallelism['dp'],
|
| 30 |
'zero_stage': optimizer['zero_stage'],
|
| 31 |
-
'tie_word_embeddings': model_config['tie_word_embeddings']
|
|
|
|
|
|
|
| 32 |
}
|
| 33 |
except Exception as e:
|
| 34 |
raise gr.Error(f"Error parsing YAML: {str(e)}")
|
|
@@ -46,7 +48,7 @@ def format_config_display(config):
|
|
| 46 |
sections = {
|
| 47 |
"Model Architecture": [
|
| 48 |
"hidden_size", "num_layers", "vocab_size",
|
| 49 |
-
"intermediate_size", "tie_word_embeddings"
|
| 50 |
],
|
| 51 |
"Training Configuration": [
|
| 52 |
"seq_len", "mbs", "batch_accum"
|
|
@@ -89,6 +91,8 @@ with gr.Blocks() as demo:
|
|
| 89 |
vocab_size = gr.Number(50432, label="Vocabulary Size")
|
| 90 |
intermediate_size = gr.Number(11008, label="Intermediate Size")
|
| 91 |
tie_word_embeddings = gr.Checkbox(True, label="Tie Word Embeddings")
|
|
|
|
|
|
|
| 92 |
|
| 93 |
with gr.Accordion("Training Configuration", open=True):
|
| 94 |
seq_len = gr.Number(2048, label="Sequence Length")
|
|
@@ -127,14 +131,14 @@ with gr.Blocks() as demo:
|
|
| 127 |
config = dict(zip([
|
| 128 |
'hidden_size', 'num_layers', 'vocab_size', 'intermediate_size',
|
| 129 |
'seq_len', 'mbs', 'batch_accum', 'tp', 'pp', 'dp', 'zero_stage',
|
| 130 |
-
'tie_word_embeddings'
|
| 131 |
], args))
|
| 132 |
return process_yaml_and_plot(config)
|
| 133 |
|
| 134 |
manual_submit.click(
|
| 135 |
manual_input_to_config,
|
| 136 |
inputs=[
|
| 137 |
-
hidden_size, num_layers, vocab_size, intermediate_size,
|
| 138 |
seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
|
| 139 |
tie_word_embeddings
|
| 140 |
],
|
|
|
|
| 28 |
'pp': parallelism['pp'],
|
| 29 |
'dp': parallelism['dp'],
|
| 30 |
'zero_stage': optimizer['zero_stage'],
|
| 31 |
+
'tie_word_embeddings': model_config['tie_word_embeddings'],
|
| 32 |
+
'num_attention_heads': model_config['num_attention_heads'],
|
| 33 |
+
'num_key_value_heads': model_config.get('num_key_value_heads', model_config['num_attention_heads'])
|
| 34 |
}
|
| 35 |
except Exception as e:
|
| 36 |
raise gr.Error(f"Error parsing YAML: {str(e)}")
|
|
|
|
| 48 |
sections = {
|
| 49 |
"Model Architecture": [
|
| 50 |
"hidden_size", "num_layers", "vocab_size",
|
| 51 |
+
"intermediate_size", "tie_word_embeddings", "num_attention_heads", "num_key_value_heads"
|
| 52 |
],
|
| 53 |
"Training Configuration": [
|
| 54 |
"seq_len", "mbs", "batch_accum"
|
|
|
|
| 91 |
vocab_size = gr.Number(50432, label="Vocabulary Size")
|
| 92 |
intermediate_size = gr.Number(11008, label="Intermediate Size")
|
| 93 |
tie_word_embeddings = gr.Checkbox(True, label="Tie Word Embeddings")
|
| 94 |
+
num_attention_heads = gr.Number(32, label="Number of Attention Heads")
|
| 95 |
+
num_key_value_heads = gr.Number(32, label="Number of Key Value Heads")
|
| 96 |
|
| 97 |
with gr.Accordion("Training Configuration", open=True):
|
| 98 |
seq_len = gr.Number(2048, label="Sequence Length")
|
|
|
|
| 131 |
config = dict(zip([
|
| 132 |
'hidden_size', 'num_layers', 'vocab_size', 'intermediate_size',
|
| 133 |
'seq_len', 'mbs', 'batch_accum', 'tp', 'pp', 'dp', 'zero_stage',
|
| 134 |
+
'tie_word_embeddings', 'num_attention_heads', 'num_key_value_heads'
|
| 135 |
], args))
|
| 136 |
return process_yaml_and_plot(config)
|
| 137 |
|
| 138 |
manual_submit.click(
|
| 139 |
manual_input_to_config,
|
| 140 |
inputs=[
|
| 141 |
+
hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
|
| 142 |
seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
|
| 143 |
tie_word_embeddings
|
| 144 |
],
|
requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
matplotlib
|
utils.py
CHANGED
|
@@ -1,19 +1,64 @@
|
|
| 1 |
|
| 2 |
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
def calculate_memory_components(
|
| 5 |
-
hidden_size, num_layers, vocab_size, intermediate_size,
|
| 6 |
seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
|
| 7 |
tie_word_embeddings
|
| 8 |
):
|
| 9 |
# Calculate base components first
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# Model BF16 calculation
|
| 13 |
vocab_embeddings = vocab_size * hidden_size * (2 if (not tie_word_embeddings and pp==1) else 1)
|
| 14 |
|
| 15 |
layer_params = (
|
| 16 |
-
(hidden_size *
|
| 17 |
+ (hidden_size * hidden_size) # out_proj
|
| 18 |
+ (hidden_size * 2 * intermediate_size) # gate_up_proj
|
| 19 |
+ (intermediate_size * hidden_size) # down_proj
|
|
@@ -84,12 +129,12 @@ def calculate_memory_components(
|
|
| 84 |
}
|
| 85 |
|
| 86 |
def plot_memory_breakdown(
|
| 87 |
-
hidden_size, num_layers, vocab_size, intermediate_size,
|
| 88 |
seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
|
| 89 |
tie_word_embeddings
|
| 90 |
):
|
| 91 |
results = calculate_memory_components(
|
| 92 |
-
hidden_size, num_layers, vocab_size, intermediate_size,
|
| 93 |
seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
|
| 94 |
tie_word_embeddings
|
| 95 |
)
|
|
|
|
| 1 |
|
| 2 |
import matplotlib.pyplot as plt
|
| 3 |
+
def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp_size):
    """Return how many decoder layers are assigned to pipeline rank 0.

    The model is treated as an ordered list of blocks: one embedding block
    (cost 0), ``num_layers`` identical decoder blocks, and one LM-head block.
    Blocks are added to rank 0 in order until either

    * its accumulated cost reaches ``total_cost / pp_size``, or
    * there are later ranks and the number of non-zero-cost blocks still
      unassigned is no larger than the number of later ranks.

    Args:
        hidden_size: Model hidden dimension.
        num_layers: Number of decoder layers in the whole model.
        vocab_size: Vocabulary size (sets the LM-head cost).
        intermediate_size: MLP intermediate dimension.
        num_attention_heads: Number of attention heads; must be non-zero
            because it is used as a divisor.
        pp_size: Pipeline-parallel degree; must be non-zero because it is
            used as a divisor.

    Returns:
        The number of decoder layers placed on rank 0. The value never
        exceeds the number of decoder blocks, so with ``pp_size == 1`` the
        result is ``num_layers`` (the LM head is not counted as a layer).
    """
    # Per-block costs in pipeline order; the embedding is counted as free.
    decoder_cost = (
        4 * num_attention_heads * (hidden_size // num_attention_heads) * hidden_size
        + 3 * intermediate_size * hidden_size
    )
    block_costs = [0] + [decoder_cost] * num_layers + [vocab_size * hidden_size]
    num_decoder_blocks = len(block_costs) - 2  # everything but embedding and LM head

    target_cost_per_rank = sum(block_costs) / pp_size
    remaining_ranks = pp_size - 1  # -1 because we're calculating for rank 0

    # Running count of non-zero-cost blocks after the current one; kept
    # incrementally instead of re-scanning the tail of the list every step.
    remaining_nonzero_blocks = sum(1 for cost in block_costs if cost > 0)

    blocks_in_rank0 = 0
    current_cost = 0
    for block_cost in block_costs:
        current_cost += block_cost
        blocks_in_rank0 += 1
        if block_cost > 0:
            remaining_nonzero_blocks -= 1

        # Stop once later ranks need every remaining non-zero block, or once
        # rank 0 has reached its share of the total cost.
        if (remaining_ranks > 0 and remaining_nonzero_blocks <= remaining_ranks) or (
            current_cost >= target_cost_per_rank
        ):
            break

    # Drop the embedding block from the count. Clamp to the decoder count so
    # that a walk reaching the LM head (e.g. pp_size == 1) does not report the
    # LM head as an extra hidden layer.
    num_hidden_layers_in_pp = min(blocks_in_rank0 - 1, num_decoder_blocks)
    print("num_hidden_layers_in_pp", num_hidden_layers_in_pp)
    return num_hidden_layers_in_pp
|
| 44 |
|
| 45 |
def calculate_memory_components(
|
| 46 |
+
hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
|
| 47 |
seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
|
| 48 |
tie_word_embeddings
|
| 49 |
):
|
| 50 |
# Calculate base components first
|
| 51 |
+
if pp == 1:
|
| 52 |
+
num_hidden_layers_in_pp = num_layers
|
| 53 |
+
else:
|
| 54 |
+
# num_hidden_layers_in_pp = num_layers // pp
|
| 55 |
+
num_hidden_layers_in_pp = get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp)
|
| 56 |
|
| 57 |
# Model BF16 calculation
|
| 58 |
vocab_embeddings = vocab_size * hidden_size * (2 if (not tie_word_embeddings and pp==1) else 1)
|
| 59 |
|
| 60 |
layer_params = (
|
| 61 |
+
(hidden_size * hidden_size * (1 + 2*num_key_value_heads/num_attention_heads)) # qkv_proj
|
| 62 |
+ (hidden_size * hidden_size) # out_proj
|
| 63 |
+ (hidden_size * 2 * intermediate_size) # gate_up_proj
|
| 64 |
+ (intermediate_size * hidden_size) # down_proj
|
|
|
|
| 129 |
}
|
| 130 |
|
| 131 |
def plot_memory_breakdown(
|
| 132 |
+
hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
|
| 133 |
seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
|
| 134 |
tie_word_embeddings
|
| 135 |
):
|
| 136 |
results = calculate_memory_components(
|
| 137 |
+
hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
|
| 138 |
seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
|
| 139 |
tie_word_embeddings
|
| 140 |
)
|