import os

from ...utils import (
    ComputeEnvironment,
    DistributedType,
    is_deepspeed_available,
    is_fp8_available,
    is_hpu_available,
    is_mlu_available,
    is_mps_available,
    is_msamp_available,
    is_musa_available,
    is_npu_available,
    is_sdaa_available,
    is_transformer_engine_available,
    is_transformers_available,
    is_xpu_available,
)
from ...utils.constants import (
    DEEPSPEED_MULTINODE_LAUNCHERS,
    FSDP2_STATE_DICT_TYPE,
    FSDP_AUTO_WRAP_POLICY,
    FSDP_BACKWARD_PREFETCH,
    FSDP_SHARDING_STRATEGY,
    FSDP_STATE_DICT_TYPE,
    TORCH_DYNAMO_MODES,
)
from .config_args import ClusterConfig
from .config_utils import (
    DYNAMO_BACKENDS,
    _ask_field,
    _ask_options,
    _convert_distributed_mode,
    _convert_dynamo_backend,
    _convert_fp8_backend,
    _convert_mixed_precision,
    _convert_yes_no_to_bool,
)


def get_cluster_input():
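    """Interactively walk the user through configuring a cluster run and return the resulting `ClusterConfig`."""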
    distributed_type = _ask_options(
        "Which type of machine are you using?",
        [
            "No distributed training",
            "multi-CPU",
            "multi-XPU",
            "multi-HPU",
            "multi-GPU",
            "multi-NPU",
            "multi-MLU",
            "multi-SDAA",
            "multi-MUSA",
            "TPU",
        ],
        _convert_distributed_mode,
    )

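    # Single-machine defaults; the multi-node questions below override these.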
    machine_rank = 0
    num_machines = 1
    num_processes = 1
    gpu_ids = None
    main_process_ip = None
    main_process_port = None
    rdzv_backend = "static"
    same_network = True
    debug = False

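    # Multi-device setups: ask for the machine count, and for multi-node runs the rendezvous details.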
    if distributed_type in [
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_CPU,
        DistributedType.MULTI_HPU,
    ]:
        num_machines = _ask_field(
            "How many different machines will you use (use more than 1 for multi-node training)? [1]: ",
            int,
            default=1,
        )
        if num_machines > 1:
            machine_rank = _ask_options(
                "What is the rank of this machine?",
                list(range(num_machines)),
                int,
            )
            main_process_ip = _ask_field(
                "What is the IP address of the machine that will host the main process? ",
            )
            main_process_port = _ask_field(
                "What is the port you will use to communicate with the main process? ",
                int,
            )
            same_network = _ask_field(
                "Are all the machines on the same local network? Answer `no` if nodes are on the cloud and/or on different network hosts [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )
            if not same_network:
                rdzv_backend = _ask_field(
                    "What rendezvous backend will you use? ('static', 'c10d', ...): ", default="static"
                )
        debug = _ask_field(
            "Should distributed operations be checked while running for errors? This can avoid timeout issues but will be slower. [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

    if distributed_type == DistributedType.NO:
        use_cpu = _ask_field(
            "Do you want to run your training on CPU only (even if a GPU / Apple Silicon / Ascend NPU device is available)? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
    elif distributed_type == DistributedType.MULTI_CPU:
        use_cpu = True
    else:
        use_cpu = False

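    # Intel-specific options: IPEX on CPU/XPU, and optionally letting accelerate drive mpirun for multi-CPU.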
    ipex_config = {}
    mpirun_config = {}
    if use_cpu or is_xpu_available():
        ipex_config["ipex"] = _ask_field(
            "Do you want to use Intel PyTorch Extension (IPEX) to speed up training on CPU/XPU? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

    if use_cpu:
        if distributed_type == DistributedType.MULTI_CPU:
            use_mpirun = _ask_field(
                "Do you want accelerate to launch mpirun? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if use_mpirun:
                mpirun_hostfile = _ask_field(
                    "Please enter the path to the hostfile to use with mpirun [~/hostfile]: ",
                    str,
                    default="~/hostfile",
                )
                mpirun_config["mpirun_hostfile"] = os.path.expanduser(mpirun_hostfile.strip())
                mpirun_config["mpirun_ccl"] = _ask_field("Enter the number of oneCCL worker threads [1]: ", default=1)

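    # torch.compile (dynamo) options: backend choice plus optional mode/fullgraph/dynamic/regional overrides.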
    dynamo_config = {}
    use_dynamo = _ask_field(
        "Do you wish to optimize your script with torch dynamo? [yes/NO]: ",
        _convert_yes_no_to_bool,
        default=False,
        error_message="Please enter yes or no.",
    )
    if use_dynamo:
        prefix = "dynamo_"
        dynamo_config[prefix + "backend"] = _ask_options(
            "Which dynamo backend would you like to use?",
            [x.lower() for x in DYNAMO_BACKENDS],
            _convert_dynamo_backend,
            default=2,
        )
        use_custom_options = _ask_field(
            "Do you want to customize the defaults sent to torch.compile? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

        if use_custom_options:
            dynamo_config[prefix + "mode"] = _ask_options(
                "Which mode do you want to use?",
                TORCH_DYNAMO_MODES,
                lambda x: TORCH_DYNAMO_MODES[int(x)],
                default=0,
            )
            dynamo_config[prefix + "use_fullgraph"] = _ask_field(
                "Do you want to use fullgraph mode, or is it OK to break the model into several subgraphs? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            dynamo_config[prefix + "use_dynamic"] = _ask_field(
                "Do you want to enable dynamic shape tracing? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            dynamo_config[prefix + "use_regional_compilation"] = _ask_field(
                "Do you want to enable regional compilation? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )

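    # DeepSpeed is offered for single-node and multi-accelerator setups (not MPS); choosing it switches the distributed type.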
    use_mps = not use_cpu and is_mps_available()
    deepspeed_config = {}
    if (
        distributed_type
        in [
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_HPU,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_SDAA,
            DistributedType.MULTI_MUSA,
            DistributedType.NO,
        ]
        and not use_mps
    ):
        use_deepspeed = _ask_field(
            "Do you want to use DeepSpeed? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_deepspeed:
            distributed_type = DistributedType.DEEPSPEED
            assert is_deepspeed_available(), (
                "DeepSpeed is not installed => run `pip3 install deepspeed` or build it from source"
            )

    if distributed_type == DistributedType.DEEPSPEED:
        use_deepspeed_config = _ask_field(
            "Do you want to specify a json file to a DeepSpeed config? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_deepspeed_config:
            deepspeed_config["deepspeed_config_file"] = _ask_field(
                "Please enter the path to the json DeepSpeed config file: ",
                str,
                default="none",
            )
        else:
            deepspeed_config["zero_stage"] = _ask_options(
                "What should be your DeepSpeed's ZeRO optimization stage?",
                [0, 1, 2, 3],
                int,
                default=2,
            )

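            # ZeRO stage 2+ can offload optimizer state and parameters to CPU or NVMe.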
            deepspeed_devices = ["none", "cpu", "nvme"]
            if deepspeed_config["zero_stage"] >= 2:
                deepspeed_config["offload_optimizer_device"] = _ask_options(
                    "Where to offload optimizer states?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
                )
                deepspeed_config["offload_param_device"] = _ask_options(
                    "Where to offload parameters?", deepspeed_devices, lambda x: deepspeed_devices[int(x)]
                )
                if deepspeed_config["offload_param_device"] == "nvme":
                    deepspeed_config["offload_param_nvme_path"] = _ask_field(
                        "NVMe path to offload parameters? [/nvme]: ",
                        str,
                        default="/nvme",
                    )
                if deepspeed_config["offload_optimizer_device"] == "nvme":
                    deepspeed_config["offload_optimizer_nvme_path"] = _ask_field(
                        "NVMe path to offload optimizer states? [/nvme]: ",
                        str,
                        default="/nvme",
                    )
            deepspeed_config["gradient_accumulation_steps"] = _ask_field(
                "How many gradient accumulation steps are you passing in your script? [1]: ",
                int,
                default=1,
            )
            use_gradient_clipping = _ask_field(
                "Do you want to use gradient clipping? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if use_gradient_clipping:
                deepspeed_config["gradient_clipping"] = _ask_field(
                    "What is the gradient clipping value? [1.0]: ",
                    float,
                    default=1.0,
                )
            if deepspeed_config["zero_stage"] == 3:
                deepspeed_config["zero3_save_16bit_model"] = _ask_field(
                    "Do you want to save 16-bit model weights when using ZeRO Stage-3? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
        deepspeed_config["zero3_init_flag"] = _ask_field(
            "Do you want to enable `deepspeed.zero.Init` when using ZeRO Stage-3 for constructing massive models? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if deepspeed_config["zero3_init_flag"]:
            if not is_transformers_available():
                raise Exception(
                    "When `zero3_init_flag` is set, it requires Transformers to be installed. "
                    "Please run `pip3 install transformers`."
                )
        use_moe = _ask_field(
            "Do you want to enable Mixture-of-Experts training (MoE)? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_moe:
            deepspeed_config["deepspeed_moe_layer_cls_names"] = _ask_field(
                "Specify the comma-separated list of transformers MoE layer class names (case-sensitive), e.g.: "
                "`MixtralSparseMoeBlock`, `Qwen2MoeSparseMoeBlock`, `JetMoEAttention,JetMoEBlock`...: ",
                str,
            )

        if num_machines > 1:
            launcher_query = "Which type of launcher do you want to use?"
            deepspeed_config["deepspeed_multinode_launcher"] = _ask_options(
                launcher_query,
                DEEPSPEED_MULTINODE_LAUNCHERS,
                lambda x: DEEPSPEED_MULTINODE_LAUNCHERS[int(x)],
            )

            if deepspeed_config["deepspeed_multinode_launcher"] != DEEPSPEED_MULTINODE_LAUNCHERS[1]:
                deepspeed_config["deepspeed_hostfile"] = _ask_field(
                    "DeepSpeed configures multi-node compute resources with a hostfile. "
                    "Each row is of the format `hostname slots=[num_gpus]`, e.g., `localhost slots=2`; "
                    "for more information please refer to the official [documentation]"
                    "(https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node). "
                    "Please specify the location of the hostfile: ",
                    str,
                )

                is_exclusion_filter = _ask_field(
                    "Do you want to specify an exclusion filter string? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if is_exclusion_filter:
                    deepspeed_config["deepspeed_exclusion_filter"] = _ask_field(
                        "DeepSpeed exclusion filter string: ",
                        str,
                    )

                is_inclusion_filter = _ask_field(
                    "Do you want to specify an inclusion filter string? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if is_inclusion_filter:
                    deepspeed_config["deepspeed_inclusion_filter"] = _ask_field(
                        "DeepSpeed inclusion filter string: ",
                        str,
                    )

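    # FSDP is offered for the multi-accelerator setups; choosing it switches the distributed type.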
    fsdp_config = {}

    if distributed_type in [
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_NPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_HPU,
    ]:
        use_fsdp = _ask_field(
            "Do you want to use FullyShardedDataParallel? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_fsdp:
            distributed_type = DistributedType.FSDP
    if distributed_type == DistributedType.FSDP:
        fsdp_config["fsdp_version"] = _ask_options(
            "What should be your FSDP version? [2]: ",
            [1, 2],
            lambda x: int(x) + 1,
            default=1,
        )
        fsdp_version = fsdp_config["fsdp_version"]

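        # FSDP1 picks a named sharding strategy; FSDP2 folds this into a reshard-after-forward toggle.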
        if fsdp_version == 1:
            sharding_strategy_query = "What should be your sharding strategy?"
            fsdp_config["fsdp_reshard_after_forward"] = _ask_options(
                sharding_strategy_query,
                FSDP_SHARDING_STRATEGY,
                lambda x: FSDP_SHARDING_STRATEGY[int(x)],
            )
        else:
            fsdp_config["fsdp_reshard_after_forward"] = _ask_field(
                "Do you want to enable resharding after forward? [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )

        fsdp_config["fsdp_offload_params"] = _ask_field(
            "Do you want to offload parameters and gradients to CPU? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

        fsdp_wrap_query = "What should be your auto wrap policy?"
        fsdp_config["fsdp_auto_wrap_policy"] = _ask_options(
            fsdp_wrap_query,
            FSDP_AUTO_WRAP_POLICY,
            lambda x: FSDP_AUTO_WRAP_POLICY[int(x)],
        )
        if fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[0]:
            use_no_split_modules = _ask_field(
                "Do you want to use the model's `_no_split_modules` to wrap? Only applicable for 🤗 Transformers [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if not use_no_split_modules:
                fsdp_config["fsdp_transformer_layer_cls_to_wrap"] = _ask_field(
                    "Specify the comma-separated list of transformer layer class names (case-sensitive) to wrap, e.g.: "
                    "`BertLayer`, `GPTJBlock`, `T5Block`, `BertLayer,BertEmbeddings,BertSelfOutput`...: ",
                    str,
                )
        elif fsdp_config["fsdp_auto_wrap_policy"] == FSDP_AUTO_WRAP_POLICY[1]:
            fsdp_config["fsdp_min_num_params"] = _ask_field(
                "What should be your FSDP's minimum number of parameters for Default Auto Wrapping Policy? [1e8]: ",
                int,
                default=100000000,
            )

        if fsdp_version == 1:
            fsdp_backward_prefetch_query = "What should be your FSDP's backward prefetch policy?"
            fsdp_config["fsdp_backward_prefetch"] = _ask_options(
                fsdp_backward_prefetch_query,
                FSDP_BACKWARD_PREFETCH,
                lambda x: FSDP_BACKWARD_PREFETCH[int(x)],
            )

        fsdp_state_dict_type_query = "What should be your FSDP's state dict type?"
        fsdp_config["fsdp_state_dict_type"] = _ask_options(
            fsdp_state_dict_type_query,
            FSDP_STATE_DICT_TYPE if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE,
            lambda x: FSDP_STATE_DICT_TYPE[int(x)] if fsdp_version == 1 else FSDP2_STATE_DICT_TYPE[int(x)],
            default=0,
        )

        if fsdp_version == 1:
            fsdp_config["fsdp_forward_prefetch"] = _ask_field(
                "Do you want to enable FSDP's forward prefetch policy? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )

        if fsdp_version == 1:
            fsdp_config["fsdp_use_orig_params"] = _ask_field(
                "Do you want to enable FSDP's `use_orig_params` feature? [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )

        fsdp_config["fsdp_cpu_ram_efficient_loading"] = _ask_field(
            "Do you want to enable CPU RAM efficient model loading? Only applicable for 🤗 Transformers models. [YES/no]: ",
            _convert_yes_no_to_bool,
            default=True,
            error_message="Please enter yes or no.",
        )

        if fsdp_version == 1:
            # RAM-efficient loading requires broadcasting module states from rank 0, so syncing is forced on.
            if fsdp_config["fsdp_cpu_ram_efficient_loading"]:
                fsdp_config["fsdp_sync_module_states"] = True
            else:
                fsdp_config["fsdp_sync_module_states"] = _ask_field(
                    "Do you want each individually wrapped FSDP unit to broadcast module parameters from rank 0 at the start? [YES/no]: ",
                    _convert_yes_no_to_bool,
                    default=True,
                    error_message="Please enter yes or no.",
                )
        fsdp_config["fsdp_activation_checkpointing"] = _ask_field(
            "Do you want to enable FSDP activation checkpointing? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

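    # ParallelismConfig (FSDP2 only): compose data-parallel replicate/shard, tensor- and context-parallel sizes.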
    parallelism_config = {}

    if fsdp_config.get("fsdp_version", 1) == 2:
        use_parallelism_config = _ask_field(
            "Do you want to use the parallelism config? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

        if use_parallelism_config:
            prefix = "parallelism_config_"
            parallelism_config[prefix + "dp_replicate_size"] = _ask_field(
                "What is the data parallelism replicate size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )

            parallelism_config[prefix + "dp_shard_size"] = _ask_field(
                "What is the FSDP shard size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )

            parallelism_config[prefix + "tp_size"] = _ask_field(
                "What is the tensor parallelism size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )

            parallelism_config[prefix + "cp_size"] = _ask_field(
                "What is the context parallelism size? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )
            if parallelism_config[prefix + "cp_size"] > 1:
                parallelism_config[prefix + "cp_comm_strategy"] = _ask_options(
                    "What is the context parallelism communication strategy?",
                    ["allgather", "alltoall"],
                    lambda x: ["allgather", "alltoall"][int(x)],
                    default=0,
                )

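    # Megatron-LM (multi-GPU only), with its own TP/PP/sequence-parallelism and optimizer questions.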
    megatron_lm_config = {}
    if distributed_type in [DistributedType.MULTI_GPU]:
        use_megatron_lm = _ask_field(
            "Do you want to use Megatron-LM? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if use_megatron_lm:
            distributed_type = DistributedType.MEGATRON_LM
    if distributed_type == DistributedType.MEGATRON_LM:
        prefix = "megatron_lm_"
        megatron_lm_config[prefix + "tp_degree"] = _ask_field(
            "What is the Tensor Parallelism degree/size? [1]: ",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
        if megatron_lm_config[prefix + "tp_degree"] > 1:
            megatron_lm_config[prefix + "sequence_parallelism"] = _ask_field(
                "Do you want to enable Sequence Parallelism? [YES/no]: ",
                _convert_yes_no_to_bool,
                default=True,
                error_message="Please enter yes or no.",
            )

        megatron_lm_config[prefix + "pp_degree"] = _ask_field(
            "What is the Pipeline Parallelism degree/size? [1]: ",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
        if megatron_lm_config[prefix + "pp_degree"] > 1:
            megatron_lm_config[prefix + "num_micro_batches"] = _ask_field(
                "What is the number of micro-batches? [1]: ",
                int,
                default=1,
                error_message="Please enter an integer.",
            )

        megatron_lm_config[prefix + "recompute_activations"] = _ask_field(
            "Do you want to enable selective activation recomputation? [YES/no]: ",
            _convert_yes_no_to_bool,
            default=True,
            error_message="Please enter yes or no.",
        )

        megatron_lm_config[prefix + "use_distributed_optimizer"] = _ask_field(
            "Do you want to use the distributed optimizer, "
            "which shards optimizer state and gradients across data-parallel ranks? [YES/no]: ",
            _convert_yes_no_to_bool,
            default=True,
            error_message="Please enter yes or no.",
        )

        megatron_lm_config[prefix + "gradient_clipping"] = _ask_field(
            "What is the gradient clipping value based on global L2 Norm (0 to disable)? [1.0]: ",
            float,
            default=1.0,
        )

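    # TPU defaults; only filled in below when an XLA/TPU setup is selected.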
    tpu_commands = None
    tpu_command_file = None
    tpu_downcast_bf16 = "no"
    tpu_env = []
    tpu_name = None
    tpu_vm = None
    tpu_zone = None
    tpu_use_sudo = False
    tpu_use_cluster = False

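    # Determine how many processes to launch, phrased in terms of the hardware in use.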
    if distributed_type in [
        DistributedType.MULTI_CPU,
        DistributedType.MULTI_XPU,
        DistributedType.MULTI_HPU,
        DistributedType.MULTI_GPU,
        DistributedType.MULTI_MLU,
        DistributedType.MULTI_SDAA,
        DistributedType.MULTI_MUSA,
        DistributedType.MULTI_NPU,
        DistributedType.XLA,
    ]:
        machine_type = str(distributed_type).split(".")[1].replace("MULTI_", "")
        if machine_type == "TPU":
            machine_type += " cores"
        elif machine_type == "CPU":
            machine_type = "processes"
        else:
            machine_type += "(s)"
        num_processes = _ask_field(
            f"How many {machine_type} should be used for distributed training? [1]: ",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
    elif distributed_type in [DistributedType.FSDP, DistributedType.DEEPSPEED, DistributedType.MEGATRON_LM]:
        num_processes = _ask_field(
            "How many GPU(s) should be used for distributed training? [1]: ",
            int,
            default=1,
            error_message="Please enter an integer.",
        )
    else:
        num_processes = 1

    if (distributed_type == DistributedType.MULTI_GPU) and (num_machines == 1) and (num_processes == 1):
        raise ValueError(
            f"Specified distributed type {distributed_type} but only using 1 GPU on a single machine. Please select `No distributed training` for the type of machine you are using."
        )

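    # Ask which accelerator ids to expose, naming the device family that is actually present.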
    if (
        distributed_type
        in [
            DistributedType.MULTI_GPU,
            DistributedType.MULTI_MLU,
            DistributedType.MULTI_SDAA,
            DistributedType.MULTI_MUSA,
            DistributedType.MULTI_NPU,
            DistributedType.MULTI_XPU,
            DistributedType.MULTI_HPU,
            DistributedType.NO,
        ]
        and not use_cpu
        and not use_mps
    ):
        if is_npu_available():
            machine_type = "NPU(s)"
        elif is_mlu_available():
            machine_type = "MLU(s)"
        elif is_sdaa_available():
            machine_type = "SDAA(s)"
        elif is_musa_available():
            machine_type = "MUSA(s)"
        elif is_xpu_available():
            machine_type = "XPU(s)"
        elif is_hpu_available():
            machine_type = "HPU(s)"
        else:
            machine_type = "GPU(s)"
        gpu_ids = _ask_field(
            f"What {machine_type} (by id) should be used for training on this machine as a comma-separated list? [all]: ",
            default="all",
        )

    enable_cpu_affinity = False
    if distributed_type in (DistributedType.NO, DistributedType.MULTI_GPU) and not use_cpu and not use_mps:
        enable_cpu_affinity = _ask_field(
            "Would you like to enable NUMA efficiency? (Currently only supported on NVIDIA hardware). [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )

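    # XLA/TPU runs get their own cluster questions; every other setup picks a mixed-precision mode (and FP8 details).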
    fp8_config = None
    if distributed_type == DistributedType.XLA:
        mixed_precision = "no"
        main_training_function = _ask_field(
            "What is the name of the function in your script that should be launched in all parallel scripts? [main]: ",
            default="main",
        )
        tpu_use_cluster = _ask_field(
            "Are you using a TPU cluster? [yes/NO]: ",
            _convert_yes_no_to_bool,
            default=False,
            error_message="Please enter yes or no.",
        )
        if tpu_use_cluster:
            tpu_name = _ask_field(
                "What is the name of your TPU cluster? ",
                default=None,
                error_message="Please enter the name of your TPU cluster.",
            )
            tpu_zone = _ask_field(
                "What is the zone of your TPU cluster? ",
                default=None,
                error_message="Please enter the zone of your TPU cluster.",
            )
            tpu_use_sudo = _ask_field(
                "To run a python script in a TPU pod, should `sudo` be used? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            run_commands = _ask_field(
                "Do you have code you wish to run on startup in each pod? [yes/NO]: ",
                _convert_yes_no_to_bool,
                default=False,
                error_message="Please enter yes or no.",
            )
            if run_commands:
                use_command_file = _ask_field(
                    "Is this code located in a bash script? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                    error_message="Please enter yes or no.",
                )
                if use_command_file:
                    tpu_command_file = _ask_field(
                        "What is the path to your bash script? ",
                        default=None,
                        error_message="Please enter the path to your bash script.",
                    )
                    tpu_command_file = os.path.abspath(tpu_command_file)
                else:
                    print("Please enter each command you wish to run on startup in each pod, one at a time.")
                    tpu_commands = []
                    another_command = True
                    while another_command:
                        tpu_commands.append(
                            _ask_field(
                                "Please enter a single command to be run ",
                                default=None,
                                error_message="Please enter the commands you wish to run on startup in each pod as a single string.",
                            )
                        )
                        another_command = _ask_field(
                            "Do you wish to add another command? [yes/NO]: ",
                            _convert_yes_no_to_bool,
                            default=False,
                            error_message="Please enter yes or no.",
                        )
            tpu_vm = _ask_field(
                "If not using an instance group, what are the names of the Compute VM instances to be used, separated by a comma: ",
                default="",
            ).split(",")
            tpu_env = _ask_field(
                "What environment variables do you wish to set in each pod, separated by a comma: ",
                default="",
            ).split(",")

    else:
        main_training_function = "main"
        if distributed_type == DistributedType.DEEPSPEED and use_deepspeed_config:
            mixed_precision = None
        else:
            mixed_precision = _ask_options(
                "Do you wish to use mixed precision?",
                ["no", "fp16", "bf16", "fp8"],
                _convert_mixed_precision,
            )
        if mixed_precision == "fp8":
            if not is_fp8_available():
                raise ValueError("FP8 (either Transformer Engine or MSAMP) is not installed on this machine.")
            fp8_config = {}
            fp8_config["backend"] = _ask_options(
                "Which FP8 backend do you want to use?",
                ["te", "msamp"],
                _convert_fp8_backend,
            )
            if fp8_config["backend"] == "TE":
                if not is_transformer_engine_available():
                    raise ValueError("TransformerEngine was selected, but it is not installed on this machine.")
                fp8_config["use_autocast_during_eval"] = _ask_field(
                    "Do you want to use FP8 autocast during eval mode? Generally better metrics are found when this is disabled [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                )
                fp8_config["margin"] = _ask_field(
                    "What margin should be used for gradient scaling? [0]: ",
                    int,
                    default=0,
                )
                fp8_config["interval"] = _ask_field(
                    "What interval should be used for how often the scaling factor is recomputed? [1]: ",
                    int,
                    default=1,
                )
                fp8_config["fp8_format"] = _ask_options(
                    "Which weight format should be used?",
                    ["HYBRID", "E4M3", "E5M2"],
                    lambda i: ["HYBRID", "E4M3", "E5M2"][i],
                    default=0,
                )
                fp8_config["amax_history_length"] = _ask_field(
                    "What length of history should be used for the amax scaling factor computation? [1024]: ",
                    int,
                    default=1024,
                )
                fp8_config["amax_compute_algorithm"] = _ask_options(
                    "Which algorithm should be used for the amax scaling factor computation?",
                    ["max", "most_recent"],
                    lambda x: "max" if x == 0 else "most_recent",
                    default=0,
                )
                fp8_config["override_linear_precision"] = _ask_field(
                    "Do you want to execute `fprop`, `dgrad`, and `wgrad` GEMMs in higher precision? [yes/NO]: ",
                    _convert_yes_no_to_bool,
                    default=False,
                )
                if fp8_config["override_linear_precision"]:
                    fprop = _ask_field(
                        "Should `fprop` be executed in higher precision? [yes/NO]: ",
                        _convert_yes_no_to_bool,
                        default=False,
                    )
                    dgrad = _ask_field(
                        "Should `dgrad` be executed in higher precision? [yes/NO]: ",
                        _convert_yes_no_to_bool,
                        default=False,
                    )
                    wgrad = _ask_field(
                        "Should `wgrad` be executed in higher precision? [yes/NO]: ",
                        _convert_yes_no_to_bool,
                        default=False,
                    )
                    fp8_config["override_linear_precision"] = (fprop, dgrad, wgrad)
                else:
                    fp8_config["override_linear_precision"] = (False, False, False)

            elif fp8_config["backend"] == "MSAMP":
                if not is_msamp_available():
                    raise ValueError("MSAMP was selected, but it is not installed on this machine.")
                fp8_config["optimization_level"] = _ask_options(
                    "Which optimization level should be used?",
                    ["O1", "O2"],
                    lambda x: "O1" if x == 0 else "O2",
                    default=1,
                )

    if use_dynamo and mixed_precision == "no" and not use_cpu:
        print(
            "Torch dynamo used without mixed precision requires TF32 to be efficient. Accelerate will enable it by default when launching your scripts."
        )

    if distributed_type == DistributedType.XLA and mixed_precision == "bf16":
        tpu_downcast_bf16 = _ask_field(
            "Should `torch.float` be cast as `bfloat16` and `torch.double` remain `float32` on TPUs? [no]: ", default="no"
        )

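    # Bundle every answer into the ClusterConfig that `accelerate launch` reads back.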
    return ClusterConfig(
        compute_environment=ComputeEnvironment.LOCAL_MACHINE,
        distributed_type=distributed_type,
        num_processes=num_processes,
        gpu_ids=gpu_ids,
        mixed_precision=mixed_precision,
        downcast_bf16=tpu_downcast_bf16,
        machine_rank=machine_rank,
        num_machines=num_machines,
        main_process_ip=main_process_ip,
        main_process_port=main_process_port,
        main_training_function=main_training_function,
        fp8_config=fp8_config,
        deepspeed_config=deepspeed_config,
        fsdp_config=fsdp_config,
        parallelism_config=parallelism_config,
        megatron_lm_config=megatron_lm_config,
        ipex_config=ipex_config,
        mpirun_config=mpirun_config,
        use_cpu=use_cpu,
        rdzv_backend=rdzv_backend,
        same_network=same_network,
        commands=tpu_commands,
        command_file=tpu_command_file,
        tpu_env=tpu_env,
        tpu_name=tpu_name,
        tpu_vm=tpu_vm,
        tpu_zone=tpu_zone,
        tpu_use_sudo=tpu_use_sudo,
        tpu_use_cluster=tpu_use_cluster,
        dynamo_config=dynamo_config,
        debug=debug,
        enable_cpu_affinity=enable_cpu_affinity,
    )