Spaces:

nanotron
/

predict_memory

Running

App Files Files Community

nouamanetazi HF Staff committed on Jan 22, 2025

Commit

4921bbf

1 Parent(s): c68510e

.

Browse files

Files changed (1) hide show

utils.py +19 -16

utils.py CHANGED Viewed

@@ -1,7 +1,12 @@
 import matplotlib.pyplot as plt
 import numpy as np
 def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp_size):
     # Get list of pipeline blocks and their costs
     pipeline_blocks = []
     block_costs = []
@@ -40,9 +45,9 @@ def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediat
             break
     num_hidden_layers_in_pp = blocks_in_rank0 - 1 # We exclude first rank as it's the embedding layer
-    print("num_hidden_layers_in_pp", num_hidden_layers_in_pp)
     return num_hidden_layers_in_pp
 def calculate_memory_components(
     hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
@@ -77,9 +82,9 @@ def calculate_memory_components(
     overhead = 72 + 32 * mbs
     # Activations
-    # decoder_layer_mib = (seq_len * mbs * hidden_size/tp) * (2/1024/1024) * (4*intermediate_size/hidden_size + 10)
     is_mha = num_key_value_heads == num_attention_heads
-    decoder_layer_mib = (seq_len * mbs * hidden_size/tp) * (2/1024/1024) * (4*intermediate_size/hidden_size + 12 + 2*num_key_value_heads/num_attention_heads + (2 if is_mha else 0))
     if pp > 1:
         activs = min(pp, batch_accum) * num_hidden_layers_in_pp * decoder_layer_mib
@@ -144,7 +149,7 @@ def plot_memory_breakdown(
     # Create figure for components plot
     plt.close('all')
-    fig1 = plt.figure(figsize=(10, 6))
     ax1 = fig1.add_subplot(1, 1, 1)
     # Plot components
@@ -152,7 +157,10 @@ def plot_memory_breakdown(
     names = list(components.keys())
     values = list(components.values())
-    bars1 = ax1.bar(range(len(components)), values)
     # Add value labels with better positioning
     for bar in bars1:
@@ -171,7 +179,7 @@ def plot_memory_breakdown(
     plt.tight_layout()
     # Create figure for timeline plot
-    fig2 = plt.figure(figsize=(12, 6))
     ax2 = fig2.add_subplot(1, 1, 1)
     # Define timeline steps and their components
@@ -194,12 +202,6 @@ def plot_memory_breakdown(
             ("FP32 Gradients", c["FP32 Gradients"]),
             ("Activations", c["Activations"])
         ],
-        "After Fwd-Bwd": [
-            ("Model BF16", c["Model BF16"]),
-            ("DDP Gradient Buffers", c["DDP Gradient Buffers"]),
-            ("FP32 Parameters", c["FP32 Parameters"]),
-            ("FP32 Gradients", c["FP32 Gradients"])
-        ],
         "Optimizer Step": [
             ("Model BF16", c["Model BF16"]),
             ("FP32 Parameters", c["FP32 Parameters"]),
@@ -225,8 +227,7 @@ def plot_memory_breakdown(
     # Plot timeline
     x = range(len(timeline_steps))
     bottom = np.zeros(len(timeline_steps))
-    colors = plt.cm.Set3(np.linspace(0, 1, len(c)))
-    color_map = dict(zip(c.keys(), colors))
     for component in c.keys():
         heights = []
@@ -245,7 +246,7 @@ def plot_memory_breakdown(
     ax2.set_xticklabels(timeline_steps.keys(), rotation=45, ha='right')
     ax2.set_ylabel('Memory (MiB)')
     ax2.set_title('Memory Timeline', pad=20)
-    ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
     # Add total memory labels on top of each bar
     for i, total in enumerate(bottom):
@@ -253,9 +254,11 @@ def plot_memory_breakdown(
     # Adjust layout
     plt.tight_layout()
     # Set y-axis limit
     max_y_value = max(bottom)
     ax2.set_ylim(0, max(80000, max_y_value))
     return fig1, fig2

 import matplotlib.pyplot as plt
 import numpy as np
+import functools
+@functools.lru_cache(maxsize=None)
 def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp_size):
+    if pp_size == 1:
+        return num_layers
     # Get list of pipeline blocks and their costs
     pipeline_blocks = []
     block_costs = []
             break
     num_hidden_layers_in_pp = blocks_in_rank0 - 1 # We exclude first rank as it's the embedding layer
     return num_hidden_layers_in_pp
+@functools.lru_cache(maxsize=None)
 def calculate_memory_components(
     hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
     overhead = 72 + 32 * mbs
     # Activations
     is_mha = num_key_value_heads == num_attention_heads
+    decoder_layer_mib = (seq_len * mbs * hidden_size/tp) * (2/1024/1024) * (4*intermediate_size/hidden_size + 6 + 2*num_key_value_heads/num_attention_heads + 2)
+    # decoder_layer_mib = (seq_len * mbs * hidden_size/tp) * (2/1024/1024) * (4*intermediate_size/hidden_size + 12 + 2*num_key_value_heads/num_attention_heads + (2 if is_mha else 0))
     if pp > 1:
         activs = min(pp, batch_accum) * num_hidden_layers_in_pp * decoder_layer_mib
     # Create figure for components plot
     plt.close('all')
+    fig1 = plt.figure(figsize=(10, 5))
     ax1 = fig1.add_subplot(1, 1, 1)
     # Plot components
     names = list(components.keys())
     values = list(components.values())
+    colors = plt.cm.Set3(np.linspace(0, 1, len(components)))
+    color_map = dict(zip(names, colors))
+    bars1 = ax1.bar(range(len(components)), values, color=colors)
     # Add value labels with better positioning
     for bar in bars1:
     plt.tight_layout()
     # Create figure for timeline plot
+    fig2 = plt.figure(figsize=(10, 6))
     ax2 = fig2.add_subplot(1, 1, 1)
     # Define timeline steps and their components
             ("FP32 Gradients", c["FP32 Gradients"]),
             ("Activations", c["Activations"])
         ],
         "Optimizer Step": [
             ("Model BF16", c["Model BF16"]),
             ("FP32 Parameters", c["FP32 Parameters"]),
     # Plot timeline
     x = range(len(timeline_steps))
     bottom = np.zeros(len(timeline_steps))
     for component in c.keys():
         heights = []
     ax2.set_xticklabels(timeline_steps.keys(), rotation=45, ha='right')
     ax2.set_ylabel('Memory (MiB)')
     ax2.set_title('Memory Timeline', pad=20)
     # Add total memory labels on top of each bar
     for i, total in enumerate(bottom):
     # Adjust layout
     plt.tight_layout()
     # Set y-axis limit
     max_y_value = max(bottom)
     ax2.set_ylim(0, max(80000, max_y_value))
+    # Add legend below the plot
+    # plt.subplots_adjust(bottom=0.8)
+    ax2.legend(loc='lower center', bbox_to_anchor=(0.5, -1.5), ncol=3)
     return fig1, fig2