Spaces:
Running
Running
Commit ·
c68510e
1
Parent(s): d435c48
timeline view
Browse files
utils.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
-
|
| 2 |
import matplotlib.pyplot as plt
|
|
|
|
|
|
|
| 3 |
def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp_size):
|
| 4 |
# Get list of pipeline blocks and their costs
|
| 5 |
pipeline_blocks = []
|
|
@@ -169,33 +170,92 @@ def plot_memory_breakdown(
|
|
| 169 |
|
| 170 |
plt.tight_layout()
|
| 171 |
|
| 172 |
-
# Create figure for
|
| 173 |
-
fig2 = plt.figure(figsize=(
|
| 174 |
ax2 = fig2.add_subplot(1, 1, 1)
|
| 175 |
|
| 176 |
-
#
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
ax2.set_ylabel('Memory (MiB)')
|
| 194 |
-
ax2.set_title('
|
| 195 |
-
|
| 196 |
-
ax2.set_ylim(0, max(80000, max_y_value))
|
| 197 |
|
| 198 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
plt.tight_layout()
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
return fig1, fig2
|
|
|
|
|
|
|
| 1 |
import matplotlib.pyplot as plt
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
def get_num_hidden_layers_in_pp(hidden_size, num_layers, vocab_size, intermediate_size, num_attention_heads, pp_size):
|
| 5 |
# Get list of pipeline blocks and their costs
|
| 6 |
pipeline_blocks = []
|
|
|
|
| 170 |
|
| 171 |
plt.tight_layout()
|
| 172 |
|
| 173 |
+
# Create figure for timeline plot
|
| 174 |
+
fig2 = plt.figure(figsize=(12, 6))
|
| 175 |
ax2 = fig2.add_subplot(1, 1, 1)
|
| 176 |
|
| 177 |
+
# Define timeline steps and their components
|
| 178 |
+
c = results["Components"]
|
| 179 |
+
timeline_steps = {
|
| 180 |
+
"Model Init": [
|
| 181 |
+
("Model BF16", c["Model BF16"]),
|
| 182 |
+
("DDP Gradient Buffers", c["DDP Gradient Buffers"]),
|
| 183 |
+
],
|
| 184 |
+
"Gradient Accumulator Init": [
|
| 185 |
+
("Model BF16", c["Model BF16"]),
|
| 186 |
+
("DDP Gradient Buffers", c["DDP Gradient Buffers"]),
|
| 187 |
+
("FP32 Parameters", c["FP32 Parameters"]),
|
| 188 |
+
("FP32 Gradients", c["FP32 Gradients"])
|
| 189 |
+
],
|
| 190 |
+
"Fwd-Bwd Peak": [
|
| 191 |
+
("Model BF16", c["Model BF16"]),
|
| 192 |
+
("DDP Gradient Buffers", c["DDP Gradient Buffers"]),
|
| 193 |
+
("FP32 Parameters", c["FP32 Parameters"]),
|
| 194 |
+
("FP32 Gradients", c["FP32 Gradients"]),
|
| 195 |
+
("Activations", c["Activations"])
|
| 196 |
+
],
|
| 197 |
+
"After Fwd-Bwd": [
|
| 198 |
+
("Model BF16", c["Model BF16"]),
|
| 199 |
+
("DDP Gradient Buffers", c["DDP Gradient Buffers"]),
|
| 200 |
+
("FP32 Parameters", c["FP32 Parameters"]),
|
| 201 |
+
("FP32 Gradients", c["FP32 Gradients"])
|
| 202 |
+
],
|
| 203 |
+
"Optimizer Step": [
|
| 204 |
+
("Model BF16", c["Model BF16"]),
|
| 205 |
+
("FP32 Parameters", c["FP32 Parameters"]),
|
| 206 |
+
("FP32 Gradients", c["FP32 Gradients"]),
|
| 207 |
+
("Optimizer States", c["Optimizer States"])
|
| 208 |
+
],
|
| 209 |
+
"2nd Fwd-Bwd Peak": [
|
| 210 |
+
("Model BF16", c["Model BF16"]),
|
| 211 |
+
("FP32 Parameters", c["FP32 Parameters"]),
|
| 212 |
+
("FP32 Gradients", c["FP32 Gradients"]),
|
| 213 |
+
("Optimizer States", c["Optimizer States"]),
|
| 214 |
+
("DDP Gradient Buffers", c["DDP Gradient Buffers"]),
|
| 215 |
+
("Activations", c["Activations"])
|
| 216 |
+
],
|
| 217 |
+
"2nd Optimizer Step": [
|
| 218 |
+
("Model BF16", c["Model BF16"]),
|
| 219 |
+
("FP32 Parameters", c["FP32 Parameters"]),
|
| 220 |
+
("FP32 Gradients", c["FP32 Gradients"]),
|
| 221 |
+
("Optimizer States", c["Optimizer States"]),
|
| 222 |
+
("DDP Gradient Buffers", c["DDP Gradient Buffers"])
|
| 223 |
+
]
|
| 224 |
+
}
|
| 225 |
+
# Plot timeline
|
| 226 |
+
x = range(len(timeline_steps))
|
| 227 |
+
bottom = np.zeros(len(timeline_steps))
|
| 228 |
+
colors = plt.cm.Set3(np.linspace(0, 1, len(c)))
|
| 229 |
+
color_map = dict(zip(c.keys(), colors))
|
| 230 |
|
| 231 |
+
for component in c.keys():
|
| 232 |
+
heights = []
|
| 233 |
+
for step_components in timeline_steps.values():
|
| 234 |
+
height = 0
|
| 235 |
+
for comp_name, comp_value in step_components:
|
| 236 |
+
if comp_name == component:
|
| 237 |
+
height = comp_value
|
| 238 |
+
heights.append(height)
|
| 239 |
+
|
| 240 |
+
ax2.bar(x, heights, bottom=bottom, label=component, color=color_map[component])
|
| 241 |
+
bottom += heights
|
| 242 |
+
|
| 243 |
+
# Customize the timeline plot
|
| 244 |
+
ax2.set_xticks(x)
|
| 245 |
+
ax2.set_xticklabels(timeline_steps.keys(), rotation=45, ha='right')
|
| 246 |
ax2.set_ylabel('Memory (MiB)')
|
| 247 |
+
ax2.set_title('Memory Timeline', pad=20)
|
| 248 |
+
ax2.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
|
|
|
|
| 249 |
|
| 250 |
+
# Add total memory labels on top of each bar
|
| 251 |
+
for i, total in enumerate(bottom):
|
| 252 |
+
ax2.text(i, total, f'{total:.1f} MiB', ha='center', va='bottom')
|
| 253 |
+
|
| 254 |
+
# Adjust layout
|
| 255 |
plt.tight_layout()
|
| 256 |
|
| 257 |
+
# Set y-axis limit
|
| 258 |
+
max_y_value = max(bottom)
|
| 259 |
+
ax2.set_ylim(0, max(80000, max_y_value))
|
| 260 |
+
|
| 261 |
return fig1, fig2
|