"""Module-level constants: checkpoint file names, version gates and option lists.

Everything here is a plain value computed at import time. The only
dependencies are ``operator`` (for the comparison dispatch table) and
``torch`` (for the tuple of layer classes at the bottom of the file).
"""

import operator as op

import torch


# ---------------------------------------------------------------------------
# File and directory names
# ---------------------------------------------------------------------------
SCALER_NAME = "scaler.pt"
MODEL_NAME = "pytorch_model"
SAFE_MODEL_NAME = "model"
RNG_STATE_NAME = "random_states"
OPTIMIZER_NAME = "optimizer"
SCHEDULER_NAME = "scheduler"
SAMPLER_NAME = "sampler"

# The ``*_PATTERN_NAME`` values are ``str.format`` templates that expect a
# ``suffix`` keyword, e.g. ``WEIGHTS_PATTERN_NAME.format(suffix="_1")``.
PROFILE_PATTERN_NAME = "profile_{suffix}.json"
WEIGHTS_NAME = f"{MODEL_NAME}.bin"
WEIGHTS_PATTERN_NAME = "pytorch_model{suffix}.bin"
# The index names are built from the *full* weights file name, so they keep
# the weights extension: "pytorch_model.bin.index.json" and
# "model.safetensors.index.json".
WEIGHTS_INDEX_NAME = f"{WEIGHTS_NAME}.index.json"
SAFE_WEIGHTS_NAME = f"{SAFE_MODEL_NAME}.safetensors"
SAFE_WEIGHTS_PATTERN_NAME = "model{suffix}.safetensors"
SAFE_WEIGHTS_INDEX_NAME = f"{SAFE_WEIGHTS_NAME}.index.json"

# ---------------------------------------------------------------------------
# SageMaker
# ---------------------------------------------------------------------------
SAGEMAKER_PYTORCH_VERSION = "1.10.2"
SAGEMAKER_PYTHON_VERSION = "py38"
SAGEMAKER_TRANSFORMERS_VERSION = "4.17.0"
# NOTE(review): "ml.p4dn.24xlarge" may be a misspelling of "ml.p4d.24xlarge";
# left unchanged because consumers may compare against this exact string.
SAGEMAKER_PARALLEL_EC2_INSTANCES = ["ml.p3.16xlarge", "ml.p3dn.24xlarge", "ml.p4dn.24xlarge"]

# ---------------------------------------------------------------------------
# FSDP
# ---------------------------------------------------------------------------
FSDP_SHARDING_STRATEGY = ["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD", "HYBRID_SHARD_ZERO2"]
FSDP_AUTO_WRAP_POLICY = ["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP", "NO_WRAP"]
FSDP_BACKWARD_PREFETCH = ["BACKWARD_PRE", "BACKWARD_POST", "NO_PREFETCH"]
FSDP_STATE_DICT_TYPE = ["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"]
FSDP2_STATE_DICT_TYPE = ["SHARDED_STATE_DICT", "FULL_STATE_DICT"]
# Deliberately a pre-release build string (with a local "+<hash>" part) rather
# than a plain "2.1.0".
FSDP_PYTORCH_VERSION = "2.1.0.a0+32f93b1"
FSDP2_PYTORCH_VERSION = "2.6.0"
FSDP_MODEL_NAME = "pytorch_model_fsdp"

# ---------------------------------------------------------------------------
# Launchers, compilation modes and version gates
# ---------------------------------------------------------------------------
DEEPSPEED_MULTINODE_LAUNCHERS = ["pdsh", "standard", "openmpi", "mvapich", "mpich", "nossh", "slurm"]
TORCH_DYNAMO_MODES = ["default", "reduce-overhead", "max-autotune"]
ELASTIC_LOG_LINE_PREFIX_TEMPLATE_PYTORCH_VERSION = "2.2.0"
XPU_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.4.0"
# NOTE(review): "MITA" may be a transposition of "MTIA"; the name is kept
# as-is because other modules may import it under this spelling.
MITA_PROFILING_AVAILABLE_PYTORCH_VERSION = "2.1.0"
BETA_TP_AVAILABLE_PYTORCH_VERSION = "2.3.0"

BETA_TP_AVAILABLE_TRANSFORMERS_VERSION = "4.52.0"
BETA_CP_AVAILABLE_PYTORCH_VERSION = "2.6.0"
BETA_SP_AVAILABLE_DEEPSPEED_VERSION = "0.18.2"

# Maps a comparison operator written as text to the matching two-argument
# function from ``operator``, e.g. ``STR_OPERATION_TO_FUNC[">="](a, b)``.
STR_OPERATION_TO_FUNC = {">": op.gt, ">=": op.ge, "==": op.eq, "!=": op.ne, "<=": op.le, "<": op.lt}

# Option names written without leading dashes.
# NOTE(review): presumably the options forwarded to the torch distributed
# launcher -- confirm against the code that consumes this list.
TORCH_LAUNCH_PARAMS = [
    "nnodes",
    "nproc_per_node",
    "rdzv_backend",
    "rdzv_endpoint",
    "rdzv_id",
    "rdzv_conf",
    "standalone",
    "max_restarts",
    "monitor_interval",
    "start_method",
    "role",
    "module",
    "m",
    "no_python",
    "run_path",
    "log_dir",
    "r",
    "redirects",
    "t",
    "tee",
    "node_rank",
    "master_addr",
    "master_port",
]

# ---------------------------------------------------------------------------
# Distributed types
# ---------------------------------------------------------------------------
CUDA_DISTRIBUTED_TYPES = ["DEEPSPEED", "MULTI_GPU", "FSDP", "MEGATRON_LM", "TP"]
# ``+`` builds a new list, so CUDA_DISTRIBUTED_TYPES itself is not mutated;
# the CUDA entries come first, followed by the other device back ends.
TORCH_DISTRIBUTED_OPERATION_TYPES = CUDA_DISTRIBUTED_TYPES + [
    "MULTI_NPU",
    "MULTI_MLU",
    "MULTI_SDAA",
    "MULTI_MUSA",
    "MULTI_XPU",
    "MULTI_CPU",
    "MULTI_HPU",
]

# A tuple of classes, so it can be passed directly as the second argument of
# ``isinstance``.
SUPPORTED_PYTORCH_LAYERS_FOR_UPCASTING = (
    torch.nn.Conv1d,
    torch.nn.Conv2d,
    torch.nn.Conv3d,
    torch.nn.ConvTranspose1d,
    torch.nn.ConvTranspose2d,
    torch.nn.ConvTranspose3d,
    torch.nn.Linear,
)