Spaces:

nanotron
/

predict_memory

Running

App Files Files Community

nouamanetazi HF Staff committed on Jan 21, 2025

Commit

d51b632

1 Parent(s): 5f67cc3

small fixes

Browse files

Files changed (3) hide show

app.py +8 -4
requirements.txt +1 -0
utils.py +50 -5

app.py CHANGED Viewed

@@ -28,7 +28,9 @@ def load_config_from_yaml_content(yaml_content):
             'pp': parallelism['pp'],
             'dp': parallelism['dp'],
             'zero_stage': optimizer['zero_stage'],
-            'tie_word_embeddings': model_config['tie_word_embeddings']
         }
     except Exception as e:
         raise gr.Error(f"Error parsing YAML: {str(e)}")
@@ -46,7 +48,7 @@ def format_config_display(config):
     sections = {
         "Model Architecture": [
             "hidden_size", "num_layers", "vocab_size",
-            "intermediate_size", "tie_word_embeddings"
         ],
         "Training Configuration": [
             "seq_len", "mbs", "batch_accum"
@@ -89,6 +91,8 @@ with gr.Blocks() as demo:
                     vocab_size = gr.Number(50432, label="Vocabulary Size")
                     intermediate_size = gr.Number(11008, label="Intermediate Size")
                     tie_word_embeddings = gr.Checkbox(True, label="Tie Word Embeddings")
                 with gr.Accordion("Training Configuration", open=True):
                     seq_len = gr.Number(2048, label="Sequence Length")
@@ -127,14 +131,14 @@ with gr.Blocks() as demo:
         config = dict(zip([
             'hidden_size', 'num_layers', 'vocab_size', 'intermediate_size',
             'seq_len', 'mbs', 'batch_accum', 'tp', 'pp', 'dp', 'zero_stage',
-            'tie_word_embeddings'
         ], args))
         return process_yaml_and_plot(config)
     manual_submit.click(
         manual_input_to_config,
         inputs=[
-            hidden_size, num_layers, vocab_size, intermediate_size,
             seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
             tie_word_embeddings
         ],

             'pp': parallelism['pp'],
             'dp': parallelism['dp'],
             'zero_stage': optimizer['zero_stage'],
+            'tie_word_embeddings': model_config['tie_word_embeddings'],
+            'num_attention_heads': model_config['num_attention_heads'],
+            'num_key_value_heads': model_config.get('num_key_value_heads', model_config['num_attention_heads'])
         }
     except Exception as e:
         raise gr.Error(f"Error parsing YAML: {str(e)}")
     sections = {
         "Model Architecture": [
             "hidden_size", "num_layers", "vocab_size",
+            "intermediate_size", "tie_word_embeddings", "num_attention_heads", "num_key_value_heads"
         ],
         "Training Configuration": [
             "seq_len", "mbs", "batch_accum"
                     vocab_size = gr.Number(50432, label="Vocabulary Size")
                     intermediate_size = gr.Number(11008, label="Intermediate Size")
                     tie_word_embeddings = gr.Checkbox(True, label="Tie Word Embeddings")
+                    num_attention_heads = gr.Number(32, label="Number of Attention Heads")
+                    num_key_value_heads = gr.Number(32, label="Number of Key Value Heads")
                 with gr.Accordion("Training Configuration", open=True):
                     seq_len = gr.Number(2048, label="Sequence Length")
         config = dict(zip([
             'hidden_size', 'num_layers', 'vocab_size', 'intermediate_size',
             'seq_len', 'mbs', 'batch_accum', 'tp', 'pp', 'dp', 'zero_stage',
+            'tie_word_embeddings', 'num_attention_heads', 'num_key_value_heads'
         ], args))
         return process_yaml_and_plot(config)
     manual_submit.click(
         manual_input_to_config,
         inputs=[
+            hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
             seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
             tie_word_embeddings
         ],

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ matplotlib

utils.py CHANGED Viewed

@@ -1,19 +1,64 @@
 import matplotlib.pyplot as plt
 def calculate_memory_components(
-    hidden_size, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
     tie_word_embeddings
 ):
     # Calculate base components first
-    num_hidden_layers_in_pp = num_layers // pp
     # Model BF16 calculation
     vocab_embeddings = vocab_size * hidden_size * (2 if (not tie_word_embeddings and pp==1) else 1)
     layer_params = (
-        (hidden_size * 3 * hidden_size)  # qkv_proj
         + (hidden_size * hidden_size)     # out_proj
         + (hidden_size * 2 * intermediate_size)  # gate_up_proj
         + (intermediate_size * hidden_size)      # down_proj
@@ -84,12 +129,12 @@ def calculate_memory_components(
     }
 def plot_memory_breakdown(
-    hidden_size, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
     tie_word_embeddings
 ):
     results = calculate_memory_components(
-        hidden_size, num_layers, vocab_size, intermediate_size,
         seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
         tie_word_embeddings
     )

 import matplotlib.pyplot as plt
def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp_size):
    """Estimate how many decoder layers land on pipeline rank 0.

    The model is viewed as a sequence of blocks (embedding, ``num_layers``
    decoder layers, LM head), each with a compute cost. Blocks are assigned
    to rank 0 greedily until either its accumulated cost reaches an even
    share of the total (``total_cost / pp_size``) or the number of costly
    blocks left is no larger than the number of ranks still to be filled.

    NOTE(review): the cost model and stopping rule are described by the
    original author as following "the original implementation" -- presumably
    the training framework's pipeline partitioner; verify against it.

    Args:
        hidden_size: Model hidden dimension.
        num_layers: Number of decoder layers (must be an int).
        vocab_size: Vocabulary size; determines the LM head cost.
        intermediate_size: MLP intermediate dimension.
        num_attention_heads: Number of attention heads (must be non-zero).
        pp_size: Pipeline-parallel size (must be non-zero).

    Returns:
        The number of decoder layers assigned to rank 0. Only decoder blocks
        are counted, so the result never exceeds ``num_layers`` even when the
        scan runs through the LM head (e.g. ``pp_size == 1``).

    Raises:
        ZeroDivisionError: If ``num_attention_heads`` or ``pp_size`` is 0.
    """
    head_dim = hidden_size // num_attention_heads
    decoder_cost = (
        4 * num_attention_heads * head_dim * hidden_size
        + 3 * intermediate_size * hidden_size
    )

    # (kind, cost) per pipeline block, in model order.
    # The embedding is treated as zero cost in the original implementation.
    blocks = (
        [("embedding", 0)]
        + [("decoder", decoder_cost)] * num_layers
        + [("lm_head", vocab_size * hidden_size)]
    )

    total_cost = sum(cost for _, cost in blocks)
    target_cost_per_rank = total_cost / pp_size
    remaining_ranks = pp_size - 1  # -1 because we're calculating for rank 0

    # Running count of costly blocks that come *after* the current one;
    # maintained incrementally instead of re-scanning the tail every step.
    remaining_nonzero_blocks = sum(1 for _, cost in blocks if cost > 0)

    current_cost = 0
    num_hidden_layers_in_pp = 0
    for kind, cost in blocks:
        current_cost += cost
        if kind == "decoder":
            # Count decoder layers only: neither the embedding nor the LM
            # head is a hidden layer.
            num_hidden_layers_in_pp += 1
        if cost > 0:
            remaining_nonzero_blocks -= 1

        # Stop once the other ranks need every remaining costly block, or
        # rank 0 has reached its even share of the total cost.
        must_leave_rest = remaining_ranks > 0 and remaining_nonzero_blocks <= remaining_ranks
        if must_leave_rest or current_cost >= target_cost_per_rank:
            break

    print("num_hidden_layers_in_pp", num_hidden_layers_in_pp)
    return num_hidden_layers_in_pp
 def calculate_memory_components(
+    hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
     tie_word_embeddings
 ):
     # Calculate base components first
+    if pp == 1:
+        num_hidden_layers_in_pp = num_layers
+    else:
+        # num_hidden_layers_in_pp = num_layers // pp
+        num_hidden_layers_in_pp = get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp)
     # Model BF16 calculation
     vocab_embeddings = vocab_size * hidden_size * (2 if (not tie_word_embeddings and pp==1) else 1)
     layer_params = (
+        (hidden_size * hidden_size * (1 + 2*num_key_value_heads/num_attention_heads))  # qkv_proj
         + (hidden_size * hidden_size)     # out_proj
         + (hidden_size * 2 * intermediate_size)  # gate_up_proj
         + (intermediate_size * hidden_size)      # down_proj
     }
 def plot_memory_breakdown(
+    hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
     tie_word_embeddings
 ):
     results = calculate_memory_components(
+        hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
         seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
         tie_word_embeddings
     )