Spaces:

nanotron
/

predict_memory

Running

App Files Community

nouamanetazi HF Staff committed on Feb 4, 2025

Commit

1a15aaa

1 Parent(s): 2fa84c8

EXPERIMENTAL: add fsdp_checkpointing

Browse files

Files changed (2) hide show

app.py +12 -7
utils.py +11 -5

app.py CHANGED Viewed

@@ -27,7 +27,8 @@ def load_config_from_content(content):
                 'zero_stage': 0,  # Default value
                 'tie_word_embeddings': config.get('tie_word_embeddings', True),
                 'num_attention_heads': config['num_attention_heads'],
-                'num_key_value_heads': config.get('num_key_value_heads', config['num_attention_heads'])
             }
         except json.JSONDecodeError:
             # If not JSON, try YAML
@@ -53,7 +54,8 @@ def load_config_from_content(content):
                 'zero_stage': optimizer['zero_stage'],
                 'tie_word_embeddings': model_config['tie_word_embeddings'],
                 'num_attention_heads': model_config['num_attention_heads'],
-                'num_key_value_heads': model_config.get('num_key_value_heads', model_config['num_attention_heads'])
             }
     except Exception as e:
         raise gr.Error(f"Error parsing configuration: {str(e)}")
@@ -77,7 +79,7 @@ def format_config_display(config):
             "seq_len", "mbs", "batch_accum"
         ],
         "Parallelism": [
-            "tp", "pp", "dp", "zero_stage"
         ]
     }
@@ -134,6 +136,7 @@ with gr.Blocks() as demo:
                         pp = gr.Number(1, label="Pipeline Parallelism")
                         dp = gr.Number(1, label="Data Parallelism")
                     zero_stage = gr.Radio([0, 1, 2, 3], value=0, label="ZeRO Stage")
                 manual_submit = gr.Button("Calculate Memory (Manual Input)")
         with gr.Column(scale=2):
@@ -150,7 +153,7 @@ with gr.Blocks() as demo:
             plot1, plot2, config_display, oom_display,
             hidden_size, num_attention_heads, num_key_value_heads, num_layers,
             vocab_size, intermediate_size, seq_len, mbs, batch_accum,
-            tp, pp, dp, zero_stage, tie_word_embeddings
         ]
     )
@@ -180,7 +183,8 @@ with gr.Blocks() as demo:
             config['pp'],
             config['dp'],
             config['zero_stage'],
-            config['tie_word_embeddings']
         ]
     # Handle manual input
@@ -199,7 +203,8 @@ with gr.Blocks() as demo:
             'zero_stage': args[12],
             'tie_word_embeddings': args[13],
             'num_attention_heads': args[1],
-            'num_key_value_heads': args[2]
         }
         return process_yaml_and_update_ui(config)
@@ -208,7 +213,7 @@ with gr.Blocks() as demo:
         inputs=[
             hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
             seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
-            tie_word_embeddings
         ],
         outputs=[plot1, plot2, config_display, oom_display]
     )

                 'zero_stage': 0,  # Default value
                 'tie_word_embeddings': config.get('tie_word_embeddings', True),
                 'num_attention_heads': config['num_attention_heads'],
+                'num_key_value_heads': config.get('num_key_value_heads', config['num_attention_heads']),
+                'fsdp_checkpointing': False  # Default value
             }
         except json.JSONDecodeError:
             # If not JSON, try YAML
                 'zero_stage': optimizer['zero_stage'],
                 'tie_word_embeddings': model_config['tie_word_embeddings'],
                 'num_attention_heads': model_config['num_attention_heads'],
+                'num_key_value_heads': model_config.get('num_key_value_heads', model_config['num_attention_heads']),
+                'fsdp_checkpointing': optimizer.get('fsdp_checkpointing', False)  # Add FSDP checkpointing from config
             }
     except Exception as e:
         raise gr.Error(f"Error parsing configuration: {str(e)}")
             "seq_len", "mbs", "batch_accum"
         ],
         "Parallelism": [
+            "tp", "pp", "dp", "zero_stage", "fsdp_checkpointing"
         ]
     }
                         pp = gr.Number(1, label="Pipeline Parallelism")
                         dp = gr.Number(1, label="Data Parallelism")
                     zero_stage = gr.Radio([0, 1, 2, 3], value=0, label="ZeRO Stage")
+                    fsdp_checkpointing = gr.Checkbox(False, label="FSDP Activation Checkpointing")
                 manual_submit = gr.Button("Calculate Memory (Manual Input)")
         with gr.Column(scale=2):
             plot1, plot2, config_display, oom_display,
             hidden_size, num_attention_heads, num_key_value_heads, num_layers,
             vocab_size, intermediate_size, seq_len, mbs, batch_accum,
+            tp, pp, dp, zero_stage, tie_word_embeddings, fsdp_checkpointing
         ]
     )
             config['pp'],
             config['dp'],
             config['zero_stage'],
+            config['tie_word_embeddings'],
+            config['fsdp_checkpointing']
         ]
     # Handle manual input
             'zero_stage': args[12],
             'tie_word_embeddings': args[13],
             'num_attention_heads': args[1],
+            'num_key_value_heads': args[2],
+            'fsdp_checkpointing': args[14]  # Add FSDP checkpointing
         }
         return process_yaml_and_update_ui(config)
         inputs=[
             hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
             seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
+            tie_word_embeddings, fsdp_checkpointing  # Add FSDP checkpointing
         ],
         outputs=[plot1, plot2, config_display, oom_display]
     )

utils.py CHANGED Viewed

@@ -51,7 +51,7 @@ def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediat
 def calculate_memory_components(
     hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
-    tie_word_embeddings
 ):
     # Calculate base components first
     if pp == 1:
@@ -93,7 +93,7 @@ def calculate_memory_components(
     ddp_grads_buffers = model_bf16 if use_ddp else 0
     overhead = 72 + 32 * mbs
-    # Activations
     is_mha = num_key_value_heads == num_attention_heads
     decoder_layer_mib = (seq_len * mbs * hidden_size/tp) * (2/1024/1024) * (4*intermediate_size/hidden_size + 6 + 2*num_key_value_heads/num_attention_heads + 2)
@@ -101,7 +101,13 @@ def calculate_memory_components(
         activs = min(pp, batch_accum) * num_hidden_layers_in_pp * decoder_layer_mib
     else:
         cast_to_fp32 = sharded_cross_entropy = seq_len * mbs * vocab_size * (2 / 1024 / 1024) * 2 / tp
-        activs = num_layers * decoder_layer_mib + cast_to_fp32 + sharded_cross_entropy
     # Calculate aggregate metrics
     memory_usage_after_optimstates = (
@@ -154,12 +160,12 @@ def calculate_memory_components(
 def plot_memory_breakdown(
     hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
-    tie_word_embeddings
 ):
     results = calculate_memory_components(
         hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
         seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
-        tie_word_embeddings
     )
     memory_usage_peak_tbi = results["Aggregates"]["Peak Memory (TBI)"]

 def calculate_memory_components(
     hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
+    tie_word_embeddings, fsdp_checkpointing=False
 ):
     # Calculate base components first
     if pp == 1:
     ddp_grads_buffers = model_bf16 if use_ddp else 0
     overhead = 72 + 32 * mbs
+    # Activations calculation with FSDP checkpointing support
     is_mha = num_key_value_heads == num_attention_heads
     decoder_layer_mib = (seq_len * mbs * hidden_size/tp) * (2/1024/1024) * (4*intermediate_size/hidden_size + 6 + 2*num_key_value_heads/num_attention_heads + 2)
         activs = min(pp, batch_accum) * num_hidden_layers_in_pp * decoder_layer_mib
     else:
         cast_to_fp32 = sharded_cross_entropy = seq_len * mbs * vocab_size * (2 / 1024 / 1024) * 2 / tp
+        base_activs = num_layers * decoder_layer_mib + cast_to_fp32 + sharded_cross_entropy
+        # Apply activation reduction for FSDP checkpointing in ZeRO-3
+        if zero_stage == 3 and fsdp_checkpointing:
+            activs = base_activs / dp  # Activation memory is reduced by dp factor with checkpointing
+        else:
+            activs = base_activs
     # Calculate aggregate metrics
     memory_usage_after_optimstates = (
 def plot_memory_breakdown(
     hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
     seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
+    tie_word_embeddings, fsdp_checkpointing=False
 ):
     results = calculate_memory_components(
         hidden_size, num_attention_heads, num_key_value_heads, num_layers, vocab_size, intermediate_size,
         seq_len, mbs, batch_accum, tp, pp, dp, zero_stage,
+        tie_word_embeddings, fsdp_checkpointing
     )
     memory_usage_peak_tbi = results["Aggregates"]["Peak Memory (TBI)"]