nouamanetazi HF Staff committed on
Commit
d51b632
·
1 Parent(s): 5f67cc3

small fixes

Browse files
Files changed (3) hide show
  1. app.py +8 -4
  2. requirements.txt +1 -0
  3. utils.py +50 -5
app.py CHANGED
@@ -28,7 +28,9 @@ def load_config_from_yaml_content(yaml_content):
28
  'pp': parallelism['pp'],
29
  'dp': parallelism['dp'],
30
  'zero_stage': optimizer['zero_stage'],
31
- 'tie_word_embeddings': model_config['tie_word_embeddings']
 
 
32
  }
33
  except Exception as e:
34
  raise gr.Error(f"Error parsing YAML: {str(e)}")
@@ -46,7 +48,7 @@ def format_config_display(config):
46
  sections = {
47
  "Model Architecture": [
48
  "hidden_size", "num_layers", "vocab_size",
49
- "intermediate_size", "tie_word_embeddings"
50
  ],
51
  "Training Configuration": [
52
  "seq_len", "mbs", "batch_accum"
@@ -89,6 +91,8 @@ with gr.Blocks() as demo:
89
  vocab_size = gr.Number(50432, label="Vocabulary Size")
90
  intermediate_size = gr.Number(11008, label="Intermediate Size")
91
  tie_word_embeddings = gr.Checkbox(True, label="Tie Word Embeddings")
 
 
92
 
93
  with gr.Accordion("Training Configuration", open=True):
94
  seq_len = gr.Number(2048, label="Sequence Length")
@@ -127,14 +131,14 @@ with gr.Blocks() as demo:
127
  config = dict(zip([
128
  'hidden_size', 'num_layers', 'vocab_size', 'intermediate_size',
129
  'seq_len', 'mbs', 'batch_accum', 'tp', 'pp', 'dp', 'zero_stage',
130
- 'tie_word_embeddings'
131
  ], args))
132
  return process_yaml_and_plot(config)
133
 
134
  manual_submit.click(
135
  manual_input_to_config,
136
  inputs=[
137
- hidden_size, num_layers, vocab_size, intermediate_size,
138
  seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
139
  tie_word_embeddings
140
  ],
 
28
  'pp': parallelism['pp'],
29
  'dp': parallelism['dp'],
30
  'zero_stage': optimizer['zero_stage'],
31
+ 'tie_word_embeddings': model_config['tie_word_embeddings'],
32
+ 'num_attention_heads': model_config['num_attention_heads'],
33
+ 'num_key_value_heads': model_config.get('num_key_value_heads', model_config['num_attention_heads'])
34
  }
35
  except Exception as e:
36
  raise gr.Error(f"Error parsing YAML: {str(e)}")
 
48
  sections = {
49
  "Model Architecture": [
50
  "hidden_size", "num_layers", "vocab_size",
51
+ "intermediate_size", "tie_word_embeddings", "num_attention_heads", "num_key_value_heads"
52
  ],
53
  "Training Configuration": [
54
  "seq_len", "mbs", "batch_accum"
 
91
  vocab_size = gr.Number(50432, label="Vocabulary Size")
92
  intermediate_size = gr.Number(11008, label="Intermediate Size")
93
  tie_word_embeddings = gr.Checkbox(True, label="Tie Word Embeddings")
94
+ num_attention_heads = gr.Number(32, label="Number of Attention Heads")
95
+ num_key_value_heads = gr.Number(32, label="Number of Key Value Heads")
96
 
97
  with gr.Accordion("Training Configuration", open=True):
98
  seq_len = gr.Number(2048, label="Sequence Length")
 
131
  config = dict(zip([
132
  'hidden_size', 'num_layers', 'vocab_size', 'intermediate_size',
133
  'seq_len', 'mbs', 'batch_accum', 'tp', 'pp', 'dp', 'zero_stage',
134
+ 'tie_word_embeddings', 'num_attention_heads', 'num_key_value_heads'
135
  ], args))
136
  return process_yaml_and_plot(config)
137
 
138
  manual_submit.click(
139
  manual_input_to_config,
140
  inputs=[
141
+ hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
142
  seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
143
  tie_word_embeddings
144
  ],
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ matplotlib
utils.py CHANGED
@@ -1,19 +1,64 @@
1
 
2
  import matplotlib.pyplot as plt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def calculate_memory_components(
5
- hidden_size, num_layers, vocab_size, intermediate_size,
6
  seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
7
  tie_word_embeddings
8
  ):
9
  # Calculate base components first
10
- num_hidden_layers_in_pp = num_layers // pp
 
 
 
 
11
 
12
  # Model BF16 calculation
13
  vocab_embeddings = vocab_size * hidden_size * (2 if (not tie_word_embeddings and pp==1) else 1)
14
 
15
  layer_params = (
16
- (hidden_size * 3 * hidden_size) # qkv_proj
17
  + (hidden_size * hidden_size) # out_proj
18
  + (hidden_size * 2 * intermediate_size) # gate_up_proj
19
  + (intermediate_size * hidden_size) # down_proj
@@ -84,12 +129,12 @@ def calculate_memory_components(
84
  }
85
 
86
  def plot_memory_breakdown(
87
- hidden_size, num_layers, vocab_size, intermediate_size,
88
  seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
89
  tie_word_embeddings
90
  ):
91
  results = calculate_memory_components(
92
- hidden_size, num_layers, vocab_size, intermediate_size,
93
  seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
94
  tie_word_embeddings
95
  )
 
1
 
2
  import matplotlib.pyplot as plt
3
def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp_size):
    """Return how many decoder layers are assigned to pipeline rank 0.

    The model is viewed as an ordered list of blocks: one embedding block
    (cost 0), ``num_layers`` decoder blocks, and one LM-head block. Blocks
    are handed to rank 0 in order until either its accumulated cost reaches
    the per-rank target (total cost / ``pp_size``) or there are only enough
    non-zero-cost blocks left to give each remaining rank one.

    Args:
        hidden_size: Model hidden dimension.
        num_layers: Number of decoder layers in the whole model.
        vocab_size: Vocabulary size (drives the LM-head cost).
        intermediate_size: MLP intermediate dimension.
        num_attention_heads: Number of attention heads; must be positive.
        pp_size: Pipeline-parallel degree; must be at least 1.

    Returns:
        The number of decoder layers on rank 0, between 0 and ``num_layers``.

    Raises:
        ValueError: If ``pp_size`` is below 1 or ``num_attention_heads`` is
            not positive (both would otherwise end in a division by zero).
    """
    if pp_size < 1:
        raise ValueError(f"pp_size must be >= 1, got {pp_size}")
    if num_attention_heads <= 0:
        raise ValueError(
            f"num_attention_heads must be positive, got {num_attention_heads}"
        )

    num_decoder_blocks = max(num_layers, 0)

    # Per-block cost: four attention projections plus three MLP projections.
    head_dim = hidden_size // num_attention_heads
    decoder_cost = (
        4 * num_attention_heads * head_dim * hidden_size
        + 3 * intermediate_size * hidden_size
    )
    lm_head_cost = vocab_size * hidden_size

    # Block order: embedding (zero cost), decoder layers, LM head.
    block_costs = [0] + [decoder_cost] * num_decoder_blocks + [lm_head_cost]

    target_cost_per_rank = sum(block_costs) / pp_size
    remaining_ranks = pp_size - 1  # ranks after rank 0

    # Track the non-zero blocks still unassigned with a running counter
    # instead of re-scanning the tail of the list on every iteration.
    remaining_nonzero_blocks = sum(1 for cost in block_costs if cost > 0)

    blocks_in_rank0 = 0
    current_cost = 0
    for block_cost in block_costs:
        current_cost += block_cost
        blocks_in_rank0 += 1
        if block_cost > 0:
            remaining_nonzero_blocks -= 1

        # Stop when the later ranks need every remaining non-zero block,
        # or when rank 0 has reached its share of the total cost.
        later_ranks_need_rest = (
            remaining_ranks > 0 and remaining_nonzero_blocks <= remaining_ranks
        )
        if later_ranks_need_rest or current_cost >= target_cost_per_rank:
            break

    # Drop the embedding block from the count. The cap keeps the LM head from
    # being counted as a hidden layer when every block lands on rank 0
    # (pp_size == 1).
    return min(blocks_in_rank0 - 1, num_decoder_blocks)
44
 
45
  def calculate_memory_components(
46
+ hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
47
  seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
48
  tie_word_embeddings
49
  ):
50
  # Calculate base components first
51
+ if pp == 1:
52
+ num_hidden_layers_in_pp = num_layers
53
+ else:
54
+ # num_hidden_layers_in_pp = num_layers // pp
55
+ num_hidden_layers_in_pp = get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp)
56
 
57
  # Model BF16 calculation
58
  vocab_embeddings = vocab_size * hidden_size * (2 if (not tie_word_embeddings and pp==1) else 1)
59
 
60
  layer_params = (
61
+ (hidden_size * hidden_size * (1 + 2*num_key_value_heads/num_attention_heads)) # qkv_proj
62
  + (hidden_size * hidden_size) # out_proj
63
  + (hidden_size * 2 * intermediate_size) # gate_up_proj
64
  + (intermediate_size * hidden_size) # down_proj
 
129
  }
130
 
131
  def plot_memory_breakdown(
132
+ hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
133
  seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
134
  tie_word_embeddings
135
  ):
136
  results = calculate_memory_components(
137
+ hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
138
  seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
139
  tie_word_embeddings
140
  )