nouamanetazi HF Staff committed on
Commit
0b99db3
·
1 Parent(s): 1a15aaa
Files changed (2) hide show
  1. app.py +22 -4
  2. utils.py +5 -1
app.py CHANGED
@@ -70,10 +70,23 @@ def format_config_display(config):
70
  if not config:
71
  return "No configuration loaded"
72
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  sections = {
74
  "Model Architecture": [
75
- "hidden_size", "num_layers", "vocab_size",
76
- "intermediate_size", "tie_word_embeddings", "num_attention_heads", "num_key_value_heads"
 
77
  ],
78
  "Training Configuration": [
79
  "seq_len", "mbs", "batch_accum"
@@ -87,8 +100,13 @@ def format_config_display(config):
87
  for section_name, params in sections.items():
88
  output += f"<div style='flex: 1; padding-right: 20px;'><h3>{section_name}</h3>"
89
  for param in params:
90
- value = config.get(param, 'N/A')
91
- output += f"<b>{param}</b>: {value}<br>"
 
 
 
 
 
92
  output += "</div>"
93
  output += "</div>"
94
  return output
 
70
  if not config:
71
  return "No configuration loaded"
72
 
73
+ # Calculate number of parameters
74
+ vocab_embeddings = config['vocab_size'] * config['hidden_size'] * (1 if config['tie_word_embeddings'] else 2)
75
+
76
+ layer_params = (
77
+ (config['hidden_size'] * config['hidden_size'] * (1 + 2*config['num_key_value_heads']/config['num_attention_heads'])) # qkv_proj
78
+ + (config['hidden_size'] * config['hidden_size']) # out_proj
79
+ + (config['hidden_size'] * 2 * config['intermediate_size']) # gate_up_proj
80
+ + (config['intermediate_size'] * config['hidden_size']) # down_proj
81
+ )
82
+ total_params = (vocab_embeddings + config['num_layers'] * layer_params) / config['tp']
83
+ params_billions = total_params / 1_000_000_000
84
+
85
  sections = {
86
  "Model Architecture": [
87
+ "hidden_size", "num_layers", "vocab_size",
88
+ "intermediate_size", "tie_word_embeddings", "num_attention_heads", "num_key_value_heads",
89
+ ("num_params", f"{params_billions:.2f}B") # Show params in billions
90
  ],
91
  "Training Configuration": [
92
  "seq_len", "mbs", "batch_accum"
 
100
  for section_name, params in sections.items():
101
  output += f"<div style='flex: 1; padding-right: 20px;'><h3>{section_name}</h3>"
102
  for param in params:
103
+ if isinstance(param, tuple):
104
+ # Handle custom parameter display
105
+ param_name, value = param
106
+ output += f"<b>{param_name}</b>: {value}<br>"
107
+ else:
108
+ value = config.get(param, 'N/A')
109
+ output += f"<b>{param}</b>: {value}<br>"
110
  output += "</div>"
111
  output += "</div>"
112
  return output
utils.py CHANGED
@@ -71,6 +71,9 @@ def calculate_memory_components(
71
 
72
  model_bf16_full = (vocab_embeddings + num_hidden_layers_in_pp * layer_params) * (2 / 1024 / 1024) / tp
73
 
 
 
 
74
  # Adjust model components based on ZeRO stage
75
  if zero_stage == 3:
76
  # In ZeRO-3, model parameters are sharded across dp ranks
@@ -148,7 +151,8 @@ def calculate_memory_components(
148
  "DDP Gradient Buffers": ddp_grads_buffers,
149
  "ZeRO-3 Buffers": zero3_buffers,
150
  "Overhead": overhead,
151
- "Activations": activs
 
152
  },
153
  "Aggregates": {
154
  "Memory Before Optimizer States": memory_usage_before_optimstates,
 
71
 
72
  model_bf16_full = (vocab_embeddings + num_hidden_layers_in_pp * layer_params) * (2 / 1024 / 1024) / tp
73
 
74
+ # Calculate number of parameters in billions
75
+ num_params_in_B = (vocab_embeddings + num_layers*layer_params) / 1e9
76
+
77
  # Adjust model components based on ZeRO stage
78
  if zero_stage == 3:
79
  # In ZeRO-3, model parameters are sharded across dp ranks
 
151
  "DDP Gradient Buffers": ddp_grads_buffers,
152
  "ZeRO-3 Buffers": zero3_buffers,
153
  "Overhead": overhead,
154
+ "Activations": activs,
155
+ "num_params_in_B": num_params_in_B
156
  },
157
  "Aggregates": {
158
  "Memory Before Optimizer States": memory_usage_before_optimstates,