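"""Run the NeMo AutoConfigurator over a small pretraining recipe.

The script first generates candidate configurations (here the search grid only varies
the micro batch size), then either trains the configuration selected by --run_number
(1-based) for a small number of steps, or, when --get_results is passed, collects the
results from the logs written to --log_dir.

Example invocations (the script filename below is an assumption):
    python auto_config.py --model_type llama --run_number 1 --log_dir /path/to/logs
    python auto_config.py --model_type llama --log_dir /path/to/logs --get_results
"""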
import argparse
import os
from dataclasses import dataclass
from functools import partial

import fiddle as fdl
import nemo_run as run

from nemo.collections import llm
from nemo.collections.llm.gpt.model.llama import Llama3Config, LlamaModel
from nemo.collections.llm.tools.auto_configurator import AutoConfigurator, generate_configs, get_results


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_type", type=str, choices=["llama", "bert", "t5"], help="Model type to run")
    parser.add_argument("--run_number", type=int, help="Number of config to run")
    parser.add_argument("--log_dir", type=str, help="Path where to save training logs")
    parser.add_argument("--get_results", action="store_true", help="Collect results from finished runs instead of training")
    parser.add_argument("--extra_metrics", action="store_true", help="Add extra monitoring callbacks to the training run")

    return parser.parse_args()


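# A reduced Llama 3 configuration (~145M parameters, per the class name) used as the
# search target for the "llama" case below.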
@dataclass
class Llama3Config145M(Llama3Config):
    num_layers: int = 12
    hidden_size: int = 768
    num_attention_heads: int = 16
    num_query_groups: int = 8
    ffn_hidden_size: int = 2688


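# Recipe factory for the 145M model: reuse the trainer, data, logging, and optimizer
# settings from the llama3_8b recipe, but swap in the small model config above.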
@run.cli.factory(target=llm.pretrain, name="llama3_145m")
def llama3_145m(num_nodes=1, num_gpus_per_node=1):
    recipe = partial(llm.llama3_8b.pretrain_recipe, num_nodes=num_nodes, num_gpus_per_node=num_gpus_per_node)()
    recipe = run.Partial(
        llm.pretrain,
        model=run.Config(LlamaModel, config=run.Config(Llama3Config145M)),
        trainer=recipe.trainer,
        data=recipe.data,
        log=recipe.log,
        optim=recipe.optim,
        resume=None,
    )

    return recipe


def train_config(args):
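    # Build a small base pretraining recipe for the requested model type.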
    calculate_model_size = False
    if args.model_type == "llama":
        recipe = partial(llama3_145m)()
        recipe.data.seq_length = recipe.model.config.seq_length = 2048
    elif args.model_type == "bert":
        recipe = partial(llm.bert_110m.pretrain_recipe, num_nodes=1, num_gpus_per_node=1)()
    elif args.model_type == "t5":
        recipe = partial(llm.t5_220m.pretrain_recipe, num_nodes=1, num_gpus_per_node=1)()
        calculate_model_size = True
    else:
        raise ValueError(f"Unsupported model type for this script: {args.model_type}")
    recipe.data.global_batch_size = 16

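    # Describe the search space and constraints for AutoConfigurator. With single-value
    # tensor/pipeline parallel lists and three micro batch sizes, this grid should yield
    # three candidate configurations, one per micro batch size.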
    runner = AutoConfigurator(
        recipe=recipe,
        gpu_memory_gb=40,
        tensor_parallel_sizes=[1],
        pipeline_parallel_sizes=[1],
        micro_batch_sizes=[1, 2, 4],
        max_training_days=1,
        max_steps_per_run=10,
        num_tokens_in_b=10,
        vocab_size=32000,
        path_to_logs=args.log_dir,
        calculate_model_size=calculate_model_size,
    )

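    # generate_configs returns the base config plus a name -> config mapping of candidates.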
    base_cfg, configs = generate_configs(runner)
    if not args.get_results:
        partials = list(configs.values())
        names = list(configs.keys())

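        # --run_number is 1-based: run N trains the N-th generated config.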
        pretrain_cfg = partials[args.run_number - 1]
        if args.extra_metrics:
            from nemo.lightning.pytorch.callbacks import (
                MemoryMonitor,
                OptimizerMonitor,
                RuntimeEstimator,
                SpeedMonitor,
            )

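            # Attach extra monitoring callbacks (speed, estimated runtime, optimizer
            # state, and memory usage) to the selected training config.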
            pretrain_cfg.trainer.callbacks.append(run.Config(SpeedMonitor, window_size=5))
            pretrain_cfg.trainer.callbacks.append(run.Config(RuntimeEstimator))
            pretrain_cfg.trainer.callbacks.append(run.Config(OptimizerMonitor))
            pretrain_cfg.trainer.callbacks.append(run.Config(MemoryMonitor))

        pretrain = fdl.build(pretrain_cfg)
        pretrain()
    else:
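        # Results pass: collect the outcomes of the completed runs from the logs under --log_dir.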
        get_results(base_cfg, runner, args.log_dir, log_file_prefix="nemo_error")
        print(f"The results were successfully saved to {args.log_dir}.")


def main():
    args = get_args()
    train_config(args)


if __name__ == '__main__':
    main()
|
|