Spaces:
Running
Running
Commit ·
0b99db3
1
Parent(s): 1a15aaa
update
Browse files
app.py
CHANGED
|
@@ -70,10 +70,23 @@ def format_config_display(config):
|
|
| 70 |
if not config:
|
| 71 |
return "No configuration loaded"
|
| 72 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
sections = {
|
| 74 |
"Model Architecture": [
|
| 75 |
-
"hidden_size", "num_layers", "vocab_size",
|
| 76 |
-
"intermediate_size", "tie_word_embeddings", "num_attention_heads", "num_key_value_heads"
|
|
|
|
| 77 |
],
|
| 78 |
"Training Configuration": [
|
| 79 |
"seq_len", "mbs", "batch_accum"
|
|
@@ -87,8 +100,13 @@ def format_config_display(config):
|
|
| 87 |
for section_name, params in sections.items():
|
| 88 |
output += f"<div style='flex: 1; padding-right: 20px;'><h3>{section_name}</h3>"
|
| 89 |
for param in params:
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
output += "</div>"
|
| 93 |
output += "</div>"
|
| 94 |
return output
|
|
|
|
| 70 |
if not config:
|
| 71 |
return "No configuration loaded"
|
| 72 |
|
| 73 |
+
# Calculate number of parameters held by each tensor-parallel rank (the total is divided by tp below)
|
| 74 |
+
vocab_embeddings = config['vocab_size'] * config['hidden_size'] * (1 if config['tie_word_embeddings'] else 2)
|
| 75 |
+
|
| 76 |
+
layer_params = (
|
| 77 |
+
(config['hidden_size'] * config['hidden_size'] * (1 + 2*config['num_key_value_heads']/config['num_attention_heads'])) # qkv_proj
|
| 78 |
+
+ (config['hidden_size'] * config['hidden_size']) # out_proj
|
| 79 |
+
+ (config['hidden_size'] * 2 * config['intermediate_size']) # gate_up_proj
|
| 80 |
+
+ (config['intermediate_size'] * config['hidden_size']) # down_proj
|
| 81 |
+
)
|
| 82 |
+
total_params = (vocab_embeddings + config['num_layers'] * layer_params) / config['tp']
|
| 83 |
+
params_billions = total_params / 1_000_000_000
|
| 84 |
+
|
| 85 |
sections = {
|
| 86 |
"Model Architecture": [
|
| 87 |
+
"hidden_size", "num_layers", "vocab_size",
|
| 88 |
+
"intermediate_size", "tie_word_embeddings", "num_attention_heads", "num_key_value_heads",
|
| 89 |
+
("num_params", f"{params_billions:.2f}B") # Show params in billions
|
| 90 |
],
|
| 91 |
"Training Configuration": [
|
| 92 |
"seq_len", "mbs", "batch_accum"
|
|
|
|
| 100 |
for section_name, params in sections.items():
|
| 101 |
output += f"<div style='flex: 1; padding-right: 20px;'><h3>{section_name}</h3>"
|
| 102 |
for param in params:
|
| 103 |
+
if isinstance(param, tuple):
|
| 104 |
+
# Handle custom parameter display
|
| 105 |
+
param_name, value = param
|
| 106 |
+
output += f"<b>{param_name}</b>: {value}<br>"
|
| 107 |
+
else:
|
| 108 |
+
value = config.get(param, 'N/A')
|
| 109 |
+
output += f"<b>{param}</b>: {value}<br>"
|
| 110 |
output += "</div>"
|
| 111 |
output += "</div>"
|
| 112 |
return output
|
utils.py
CHANGED
|
@@ -71,6 +71,9 @@ def calculate_memory_components(
|
|
| 71 |
|
| 72 |
model_bf16_full = (vocab_embeddings + num_hidden_layers_in_pp * layer_params) * (2 / 1024 / 1024) / tp
|
| 73 |
|
|
|
|
|
|
|
|
|
|
| 74 |
# Adjust model components based on ZeRO stage
|
| 75 |
if zero_stage == 3:
|
| 76 |
# In ZeRO-3, model parameters are sharded across dp ranks
|
|
@@ -148,7 +151,8 @@ def calculate_memory_components(
|
|
| 148 |
"DDP Gradient Buffers": ddp_grads_buffers,
|
| 149 |
"ZeRO-3 Buffers": zero3_buffers,
|
| 150 |
"Overhead": overhead,
|
| 151 |
-
"Activations": activs
|
|
|
|
| 152 |
},
|
| 153 |
"Aggregates": {
|
| 154 |
"Memory Before Optimizer States": memory_usage_before_optimstates,
|
|
|
|
| 71 |
|
| 72 |
model_bf16_full = (vocab_embeddings + num_hidden_layers_in_pp * layer_params) * (2 / 1024 / 1024) / tp
|
| 73 |
|
| 74 |
+
# Calculate number of parameters in billions
|
| 75 |
+
num_params_in_B = (vocab_embeddings + num_layers*layer_params) / 1e9
|
| 76 |
+
|
| 77 |
# Adjust model components based on ZeRO stage
|
| 78 |
if zero_stage == 3:
|
| 79 |
# In ZeRO-3, model parameters are sharded across dp ranks
|
|
|
|
| 151 |
"DDP Gradient Buffers": ddp_grads_buffers,
|
| 152 |
"ZeRO-3 Buffers": zero3_buffers,
|
| 153 |
"Overhead": overhead,
|
| 154 |
+
"Activations": activs,
|
| 155 |
+
"num_params_in_B": num_params_in_B
|
| 156 |
},
|
| 157 |
"Aggregates": {
|
| 158 |
"Memory Before Optimizer States": memory_usage_before_optimstates,
|