Spaces:
Running
Running
mismatch default value and choice list of gradio dropdown
Browse files
app.py
CHANGED
|
@@ -16,6 +16,7 @@ PRECISION_TO_BYTES = {"float32": 4,
|
|
| 16 |
"int8": 1}
|
| 17 |
|
| 18 |
ZERO_STAGES = [0, 1, 2, 3]
|
|
|
|
| 19 |
OPTIMIZERS = ["adam", "adamw", "sgd"]
|
| 20 |
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
|
| 21 |
|
|
@@ -151,33 +152,32 @@ def activations_memory(num_layers, sequence_length, micro_batch_size, hidden_siz
|
|
| 151 |
|
| 152 |
def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
|
| 153 |
# Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/
|
|
|
|
| 154 |
model_vram = model_memory(model_size, mixed_precision=mixed_precision)
|
| 155 |
gradients_vram = gradients_memory(model_size)
|
| 156 |
optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)
|
| 157 |
|
| 158 |
# Baseline
|
| 159 |
if zero_stage == 0:
|
| 160 |
-
|
| 161 |
# Optimizer state partitioning
|
| 162 |
-
if zero_stage =
|
| 163 |
-
|
| 164 |
# Gradient + Optimizer state partitioning
|
| 165 |
-
if zero_stage =
|
| 166 |
-
|
| 167 |
# Parameter partitioning + Gradient + Optimizer partitioning
|
| 168 |
if zero_stage == 3:
|
| 169 |
-
aggregated_vram =
|
| 170 |
|
| 171 |
-
|
| 172 |
|
| 173 |
activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
|
| 174 |
if gradient_checkpointing:
|
| 175 |
-
activations_vram = activations_vram ** 0.5
|
| 176 |
|
| 177 |
-
print(f"Activations require {activations_vram} GB with gradient checkpointing: {gradient_checkpointing}")
|
| 178 |
total_vram = aggregated_vram + activations_vram
|
| 179 |
-
|
| 180 |
-
return total_vram
|
| 181 |
|
| 182 |
def build_interface(estimate_vram_fn):
|
| 183 |
training_params = []
|
|
@@ -190,11 +190,11 @@ def build_interface(estimate_vram_fn):
|
|
| 190 |
|
| 191 |
|
| 192 |
with gr.Row(visible=False) as model_params_row:
|
| 193 |
-
model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=
|
| 194 |
gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
|
| 195 |
-
gr.Slider(label="Sequence length", minimum=256, maximum=
|
| 196 |
-
gr.Slider(label="Num layers", minimum=
|
| 197 |
-
gr.Slider(label="Num heads", minimum=
|
| 198 |
]
|
| 199 |
|
| 200 |
|
|
@@ -212,16 +212,17 @@ def build_interface(estimate_vram_fn):
|
|
| 212 |
|
| 213 |
|
| 214 |
with gr.Row(equal_height=True):
|
| 215 |
-
training_params = [gr.Dropdown(label="Micro batch size", choices=
|
| 216 |
gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
|
| 217 |
-
gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=
|
| 218 |
-
gr.Dropdown(label="Mixed
|
| 219 |
gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
|
| 220 |
gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
|
| 221 |
gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
|
| 222 |
]
|
| 223 |
|
| 224 |
submit_btn = gr.Button("Estimate!")
|
|
|
|
| 225 |
output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
|
| 226 |
|
| 227 |
submit_btn.click(
|
|
@@ -235,22 +236,24 @@ def build_interface(estimate_vram_fn):
|
|
| 235 |
|
| 236 |
def estimate_vram(arg_keys, *args):
|
| 237 |
params = dict(zip(arg_keys, args))
|
| 238 |
-
print(params)
|
| 239 |
|
| 240 |
model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
|
| 241 |
training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])
|
| 242 |
-
if params["repo_id"]:
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
| 251 |
|
| 252 |
-
|
| 253 |
-
|
|
|
|
| 254 |
|
| 255 |
if __name__ == "__main__":
|
| 256 |
parser = parse_args()
|
|
@@ -276,4 +279,5 @@ if __name__ == "__main__":
|
|
| 276 |
config = scrape_config_from_hub(args.repo_id)
|
| 277 |
model_config.overwrite_with_hf_config(config)
|
| 278 |
|
| 279 |
-
|
|
|
|
|
|
| 16 |
"int8": 1}
|
| 17 |
|
| 18 |
ZERO_STAGES = [0, 1, 2, 3]
|
| 19 |
+
BATCH_SIZES = [1,2,4,8,16,32,64]
|
| 20 |
OPTIMIZERS = ["adam", "adamw", "sgd"]
|
| 21 |
HUGGINGFACE_URL_CONFIG = "https://huggingface.co/{}/resolve/main/config.json"
|
| 22 |
|
|
|
|
| 152 |
|
| 153 |
def vram_required(model_size, hidden_size, sequence_length, num_layers, num_heads, micro_batch_size, num_gpus, optimizer, zero_stage, gradient_checkpointing, mixed_precision):
    """Estimate the per-device VRAM needed for training, broken down by component.

    Model, gradient and optimizer memory are obtained from ``model_memory``,
    ``gradients_memory`` and ``optimizer_memory``; activation memory comes from
    ``activations_memory``. ZeRO stages are cumulative: each stage divides one
    more component by ``num_gpus``.

    Args:
        model_size: Model size, forwarded to the memory helpers.
        hidden_size: Hidden size, forwarded to ``activations_memory``.
        sequence_length: Sequence length, forwarded to ``activations_memory``.
        num_layers: Number of layers, forwarded to ``activations_memory``.
        num_heads: Number of attention heads, forwarded to ``activations_memory``.
        micro_batch_size: Batch size per device, forwarded to ``activations_memory``.
        num_gpus: Number of devices the ZeRO-partitioned state is split across.
        optimizer: Optimizer name, forwarded to ``optimizer_memory``.
        zero_stage: ZeRO stage (0 = no partitioning, 1 = optimizer state,
            2 = + gradients, 3 = + parameters).
        gradient_checkpointing: If truthy, activation memory is reduced to its
            square root.
        mixed_precision: Forwarded to ``model_memory``.

    Returns:
        dict with keys ``"total"``, ``"model"``, ``"gradients"``,
        ``"optimizer"`` and ``"activations"``. Values are in whatever unit the
        helper functions return (callers report them as GB). The per-component
        values are the per-device amounts after ZeRO partitioning.
    """
    # Reference: https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/

    model_vram = model_memory(model_size, mixed_precision=mixed_precision)
    gradients_vram = gradients_memory(model_size)
    optimizer_vram = optimizer_memory(model_size, optimizer=optimizer)

    # Stage 0 is the baseline: nothing is partitioned, so there is nothing to do.
    # Stage >= 1: optimizer state partitioning
    if zero_stage >= 1:
        optimizer_vram = optimizer_vram / num_gpus
    # Stage >= 2: gradient + optimizer state partitioning
    if zero_stage >= 2:
        gradients_vram = gradients_vram / num_gpus
    # Stage 3: parameter + gradient + optimizer state partitioning.
    # Partition model_vram itself: assigning the quotient to aggregated_vram
    # would be overwritten by the sum below and have no effect on the result.
    if zero_stage == 3:
        model_vram = model_vram / num_gpus

    aggregated_vram = round(model_vram, 2) + gradients_vram + optimizer_vram

    activations_vram = activations_memory(num_layers, sequence_length, micro_batch_size, hidden_size, num_heads)
    if gradient_checkpointing:
        # Square-root heuristic for the activation footprint with checkpointing.
        activations_vram = round(activations_vram ** 0.5, 2)

    total_vram = aggregated_vram + activations_vram
    return {"total": total_vram, "model": model_vram, "gradients": gradients_vram, "optimizer": optimizer_vram, "activations": activations_vram}
|
|
|
|
| 181 |
|
| 182 |
def build_interface(estimate_vram_fn):
|
| 183 |
training_params = []
|
|
|
|
| 190 |
|
| 191 |
|
| 192 |
with gr.Row(visible=False) as model_params_row:
|
| 193 |
+
model_params = [gr.Slider(label="Model Size", minimum=0.1, maximum=400, step=0.1, value=7, info="Model size (in billion parameters)"),
|
| 194 |
gr.Slider(label="Hidden size", minimum=256, maximum=8192, step=128, value=4096, info="Hidden size"),
|
| 195 |
+
gr.Slider(label="Sequence length", minimum=256, maximum=128_000, step=256, value=8192, info="Sequence length"),
|
| 196 |
+
gr.Slider(label="Num layers", minimum=8, maximum=64, step=1, value=32, info="Number of layers"),
|
| 197 |
+
gr.Slider(label="Num heads", minimum=8, maximum=64, step=1, value=32, info="Number of attention heads")
|
| 198 |
]
|
| 199 |
|
| 200 |
|
|
|
|
| 212 |
|
| 213 |
|
| 214 |
with gr.Row(equal_height=True):
|
| 215 |
+
training_params = [gr.Dropdown(label="Micro batch size", choices=BATCH_SIZES, value=4, info="Micro batch size (batch size per device/GPU)"),
|
| 216 |
gr.Dropdown(label="ZeRO stage", choices=ZERO_STAGES, value=0, info="ZeRO optimization stage"),
|
| 217 |
+
gr.Dropdown(label="Gradient checkpointing", choices=[True, False], value=True, info="Enable gradient checkpointing"),
|
| 218 |
+
gr.Dropdown(label="Mixed precision", choices=[False, True], value=False, info="Enable mixed precision for model training"),
|
| 219 |
gr.Dropdown(label="Optimizer", choices=OPTIMIZERS, value="adamw", info="Type of optimizer"),
|
| 220 |
gr.Slider(label="Num GPUs", minimum=1, maximum=64, step=1, value=4, info="Number of GPUs. Necessary for estimating ZeRO stages"),
|
| 221 |
gr.Textbox(label="Cache dir", value=None, placeholder=".huggingface_configs", info="HuggingFace cache directory to download config from")
|
| 222 |
]
|
| 223 |
|
| 224 |
submit_btn = gr.Button("Estimate!")
|
| 225 |
+
|
| 226 |
output = gr.Textbox(label="Total estimated VRAM per device/GPU (in GB)")
|
| 227 |
|
| 228 |
submit_btn.click(
|
|
|
|
| 236 |
|
| 237 |
def estimate_vram(arg_keys, *args):
    """Build the VRAM estimate string for the values submitted from the UI.

    Args:
        arg_keys: Names of the submitted parameters, in the same order as ``args``.
        *args: Parameter values; paired positionally with ``arg_keys``.

    Returns:
        ``"No model selected!"`` when ``repo_id`` is empty; otherwise a
        human-readable breakdown of the estimate returned by ``vram_required``.
    """
    params = dict(zip(arg_keys, args))
    print("Parameters: ", params)

    # Bail out before building any config objects: without a repo there is
    # no config to fetch and nothing to estimate.
    if not params["repo_id"]:
        return "No model selected!"

    model_config = ModelConfig(params["model_size"], params["hidden_size"], params["sequence_length"], params["num_layers"], params["num_heads"])
    training_config = TrainingConfig(params["micro_batch_size"], params["num_gpus"], params["optimizer"], params["zero_stage"], params["gradient_checkpointing"], params["mixed_precision"])

    # If cache directory set, then download config into it
    if params["cache_dir"]:
        config = download_config_from_hub(params["repo_id"], params["cache_dir"])
        model_config.overwrite_with_hf_config(config.to_dict())
    # By default, scrape config.json from hub
    else:
        config = scrape_config_from_hub(params["repo_id"])
        model_config.overwrite_with_hf_config(config)

    total_vram_dict = vram_required(**vars(model_config), **vars(training_config))
    output_str = f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations"
    return output_str
|
| 257 |
|
| 258 |
if __name__ == "__main__":
|
| 259 |
parser = parse_args()
|
|
|
|
| 279 |
config = scrape_config_from_hub(args.repo_id)
|
| 280 |
model_config.overwrite_with_hf_config(config)
|
| 281 |
|
| 282 |
+
total_vram_dict = vram_required(**vars(model_config), **vars(training_config))
|
| 283 |
+
print(f"Total {total_vram_dict['total']}GB = {total_vram_dict['model']}GB (model) + {total_vram_dict['gradients']}GB (gradients) + {total_vram_dict['optimizer']}GB (optimizer) + {total_vram_dict['activations']}GB activations")
|