| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
# Fail fast: abort on the first error (including failures inside pipelines,
# via pipefail) and trace every command for debuggability of training logs.
set -ex -o pipefail
| |
|
| | |
| | |
| |
|
# ----------------------------------------------------------------------------
# User-tunable configuration. Every knob is read from an HL_* environment
# variable and falls back to the default shown here.
# ----------------------------------------------------------------------------

# Dataset / tokenizer inputs.
DATA_DIR=${HL_DATA_DIR_ROOT:-/data/datasets/oscar}
DATA_CACHE_DIR=${HL_DATA_CACHE_DIR:-}
DATA_FILE_PREFIX=${HL_DATA_FILE_PREFIX:-oscar}
TOKENIZER_TYPE=${HL_TOKENIZER_TYPE:-GPTSentencePieceTokenizer}
TOKENIZER_MODEL=${HL_TOKENIZER_MODEL:-}

# Cluster topology / parallelism degrees.
NUM_NODES=${HL_NUM_NODES:-1}
DP=${HL_DP:-4}    # data parallel
TP=${HL_TP:-2}    # tensor parallel
PP=${HL_PP:-1}    # pipeline parallel
NO_PIPELINE_PARALLEL=${HL_NO_PIPELINE_PARALLEL:-0}
MICRO_BATCH=${HL_MICRO_BATCH:-1}
EXIT_INTERVAL=${HL_EXIT_INTERVAL:-0}

# Output / checkpoint locations.
OUTPUT_DIR=${HL_RESULTS_DIR:-}
OUTPUT_DIR_PREFIX=${HL_RESULTS_DIR_PREFIX:-.}
CHECKPOINT_SAVE=${HL_SAVE:-1}
SAVE_INTERVAL=${HL_SAVE_INTERVAL:-2000}
CHECKPOINTS_DIR=${HL_CHECKPOINTS_DIR:-}
CHECKPOINT_LOAD_TAG=${HL_CHECKPOINT_LOAD_TAG:-}
TENSORBOARD_DIR=${HL_TENSORBOARD_DIR:-}
KILL_SWITCH_FILE=${HL_KILL_SWITCH:-}
HOSTSFILE=${HL_HOSTSFILE:-}
CKP_ACT=${HL_CKP_ACT:-0}          # 0=off, 1=full recompute, 2=selective
UNIV_CP=${HL_UNIV_CP:-0}          # universal checkpoint (rejected below)
VERIFY_CP=${HL_VERIFY_CP:-0}      # checkpoint verification (rejected below)
QNPU_DIR=${HL_QNPU_DIR:-}
LOG_INTERVAL=${HL_LOG_INTERVAL:-10}

# Model / optimizer selection.
MIXTRAL_MODEL=${HL_MIXTRAL_MODEL:-8x7b}
DEVICES_PER_NODE=${HL_DEVICES_PER_NODE:-8}
ZERO_STAGE=${HL_ZERO_STAGE:-0}
SEQ_PARALLEL=${HL_SEQ_PARALLEL:-0}
OPTIMIZER=${HL_OPTIMIZER:-fusedadamw}
DROPOUT=${HL_DROPOUT:-0.0}
TRAIN_ITERS=${HL_TRAIN_ITERS:-250000}
LR_WARMUP_ITERS=${HL_LR_WARMUP_ITERS:-2000}
EVAL_ITERS=${HL_EVAL_ITERS:-100}
EVAL_INTERVAL=${HL_EVAL_INTERVAL:-1000}
PROFILE=${HL_PROFILE:-}
PROFILE_STEPS=${HL_PROFILE_STEPS:-"3,4"}

# Mixture-of-Experts tuning.
MOE_NUM_CAPACITY_BINS=${HL_MOE_NUM_CAPACITY_BINS:-0}
MOE_CAPACITY_BINS=${HL_MOE_CAPACITY_BINS:-}
# NOTE(review): this env var lacks the MOE_ prefix its siblings use
# (HL_CAPACITY_BINS_EXP_BASE, not HL_MOE_...); kept as-is for backward
# compatibility with existing launch configs.
MOE_CAPACITY_BINS_EXP_BASE=${HL_CAPACITY_BINS_EXP_BASE:-1.5}
MOE_CAPACITY_BINS_ALIGNMENT=${HL_MOE_CAPACITY_BINS_ALIGNMENT:-64}
MOE_CAPACITY_BINS_OPTIMIZE_INTERVAL=${HL_MOE_CAPACITY_BINS_OPTIMIZE_INTERVAL:-300}
MOE_CAPACITY_BINS_OPTIMIZE_MAX_GROUP=${HL_MOE_CAPACITY_BINS_OPTIMIZE_MAX_GROUP:-4}
MOE_MIN_CAP=${HL_MOE_MIN_CAP:-64}
MOE_ENABLE_EXPERT_TP=${HL_MOE_ENABLE_EXPERT_TP:-0}
MOE_EP=${HL_MOE_EP:-}             # expert parallel degree; derived below if empty
MOE_USE_DATA_BEFORE_EXPERT_PARALLEL=${HL_MOE_USE_DATA_BEFORE_EXPERT_PARALLEL:-0}

# Execution-mode / kernel toggles (Gaudi).
USE_LAZY_MODE=${HL_USE_LAZY_MODE:-1}
USE_TORCH_COMPILE=${HL_USE_TORCH_COMPILE:-0}
USE_FUSED_SDPA=${HL_USE_FUSED_SDPA:-1}
USE_FUSED_SDPA_WITH_RECOMPUTE=${HL_USE_FUSED_SDPA_WITH_RECOMPUTE:-0}
PARTITIONED_MODE=${HL_PARTITIONED_MODE:-false}

# Transformer Engine / FP8 settings (only used when enabled below).
USE_TRANSFORMER_ENGINE=${HL_USE_TRANSFORMER_ENGINE:-0}
USE_CACHE_FP8_WEIGHT=${HL_USE_CACHE_FP8_WEIGHT:-0}
USE_CACHE_FP8_WEIGHT_FWD=${HL_USE_CACHE_FP8_WEIGHT_FWD:-0}
FP8_FORMAT=${HL_FP8_FORMAT:-hybrid}
# Fixed: was ${HL_GRAD_ACCUM_DTYPE} with no :- default, inconsistent with
# every other knob and fatal under `set -u`.
GRAD_ACCUM_DTYPE=${HL_GRAD_ACCUM_DTYPE:-}
FP8_MARGIN=${HL_FP8_MARGIN:-0}
FP8_AMAX_RECOMPUTE_ALGO=${HL_FP8_AMAX_RECOMPUTE_ALGO:-max}
USE_FUSED_RMSNORM=${HL_USE_FUSED_RMSNORM:-1}
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| |
|
# ----------------------------------------------------------------------------
# Configuration validation -- fail early on unsupported combinations.
# ----------------------------------------------------------------------------

# PARTITIONED_MODE is a string ("true"/"false"), so compare it as a string.
# The original used the numeric operator -ne, which always errors on a
# non-integer operand and silently skipped this check.
if [[ "$PARTITIONED_MODE" != "false" ]]; then
  echo "Currently PipelineEngine does not support partitioning of 2+ outputs from MoE; Configured with HL_PARTITIONED_MODE=${HL_PARTITIONED_MODE}"
  exit 1
fi

# When TP is used, the MoE experts must be tensor-parallel as well.
if [[ $MOE_ENABLE_EXPERT_TP -eq 0 && $TP -ne 1 ]]; then
  echo "When using TP, MOE must also be configured with TP"
  exit 1
fi

# Universal-checkpoint loading is not supported by this script.
if [[ $UNIV_CP -ne 0 ]]; then
  echo "No support for loading from universal checkpoint; Configured with HL_UNIV_CP=${HL_UNIV_CP}"
  exit 1
fi

# Checkpoint verification is not supported by this script.
if [[ $VERIFY_CP -ne 0 ]]; then
  echo "No support for checkpoint verification; Configured with HL_VERIFY_CP=${HL_VERIFY_CP}"
  exit 1
fi

# The DPxTPxPP grid must exactly cover all available devices.
NUM_DEVICES=$((DP * TP * PP))
NUM_DEVICES_2=$((DEVICES_PER_NODE * NUM_NODES))
if [[ $NUM_DEVICES -ne $NUM_DEVICES_2 ]]; then
  echo "Bad devices configuration. DPxTPxPP=${NUM_DEVICES} != N_NODES*N_DEVICES_PER_NODE=${NUM_DEVICES_2}"
  exit 1
fi
| |
|
| | |
| | |
| |
|
# ----------------------------------------------------------------------------
# Model-size presets. "8x7b" is the full Mixtral-8x7B configuration; "small"
# is a reduced configuration for quick functional runs.
# Quoted [[ ]] comparison: the original unquoted `[ $VAR == ... ]` breaks if
# the variable is empty or contains glob characters.
# ----------------------------------------------------------------------------
if [[ "$MIXTRAL_MODEL" == "8x7b" ]]; then
  MOE_NUM_EXPERTS=${HL_MOE_NUM_EXPERTS:-8}
  N_LAYERS=${HL_NUM_LAYERS:-32}
  SEQ_LEN=${HL_SEQ_LEN:-32768}
  NHIDDEN=4096
  FFN_HIDDEN_SIZE=14336
  NHEADS=32
  NUM_KV_HEADS=8
  LR=3e-4
  MIN_LR=3e-6
elif [[ "$MIXTRAL_MODEL" == "small" ]]; then
  MOE_NUM_EXPERTS=${HL_MOE_NUM_EXPERTS:-4}
  N_LAYERS=${HL_NUM_LAYERS:-8}
  SEQ_LEN=${HL_SEQ_LEN:-256}
  NHIDDEN=768
  FFN_HIDDEN_SIZE=3072
  NHEADS=16
  NUM_KV_HEADS=8
  LR=3e-4
  MIN_LR=3e-6
else
  echo "Unsupported HL_MIXTRAL_MODEL=$MIXTRAL_MODEL"
  exit 1
fi
| |
|
# Derive the MoE expert-parallel degree when the user did not set it:
# one expert-parallel rank per expert, capped at the total device count.
if [[ -z "${MOE_EP}" ]]; then
  if [[ ${MOE_NUM_EXPERTS} -le ${NUM_DEVICES} ]]; then
    MOE_EP=${MOE_NUM_EXPERTS}
  else
    MOE_EP=${NUM_DEVICES}
  fi
fi
echo "Using Num Experts=${MOE_NUM_EXPERTS} with MoE EP=${MOE_EP}"
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
# Global batch sizing: target ~2^22 (4M) tokens per global batch regardless
# of the configured sequence length; HL_GBS overrides the derived value.
TOKENS_IN_BATCH=$(( 2 ** 22 ))
CALCULATED_GBS=$(( TOKENS_IN_BATCH / SEQ_LEN ))
GLOBAL_BATCH=${HL_GBS:-$CALCULATED_GBS}
# NOTE(review): the token budget is pinned to 250000 iterations, which equals
# the HL_TRAIN_ITERS default but does NOT track an overridden value --
# confirm this is intentional.
TOTAL_TOKENS=$(( 250000 * TOKENS_IN_BATCH ))
| |
|
| | |
| | |
| |
|
# Locate the Megatron-DeepSpeed repository root (one level above this script)
# unless the caller provided it. Expansions are quoted so paths containing
# spaces survive (the original `realpath $(dirname $0)/../` word-split them).
if [[ -z "$MEGATRON_DEEPSPEED_ROOT" ]]; then
  MEGATRON_DEEPSPEED_ROOT=$(realpath "$(dirname "$0")/..")
fi
| |
|
# Megatron dataset prefix: directory + file prefix (no extension).
DATA_PATH=${DATA_DIR}/${DATA_FILE_PREFIX}

# Timestamp keeps output directories of separate runs apart.
# ($(...) replaces the legacy backtick form.)
RUNTIME=$(date +"%Y%m%d_%H%M")

# Default output directory encodes the main run parameters for traceability.
if [[ -z "$OUTPUT_DIR" ]]; then
  if [[ -z "$EXP_NAME" ]]; then
    EXP_NAME="default"
  fi
  OUTPUT_DIR=${OUTPUT_DIR_PREFIX}/out/mixtral_${MIXTRAL_MODEL}/ds_${EXP_NAME}_z${ZERO_STAGE}_nl${N_LAYERS}_hs${NHIDDEN}_ffn${FFN_HIDDEN_SIZE}_moe_exp${MOE_NUM_EXPERTS}_gb${GLOBAL_BATCH}_mb${MICRO_BATCH}_sp${SEQ_PARALLEL}_D${DP}_T${TP}_P${PP}_E${MOE_EP}_moeT${MOE_ENABLE_EXPERT_TP}_devices${NUM_DEVICES}_${RUNTIME}
fi

if [[ -z "$CHECKPOINTS_DIR" ]]; then
  CHECKPOINTS_DIR=$OUTPUT_DIR/checkpoints
fi

if [[ -z "$TENSORBOARD_DIR" ]]; then
  TENSORBOARD_DIR=$OUTPUT_DIR/tensorboard
fi

# Quoted: OUTPUT_DIR embeds the user-supplied EXP_NAME, which may contain
# spaces; unquoted mkdir would create multiple wrong directories.
mkdir -p "${OUTPUT_DIR}"
mkdir -p "${TENSORBOARD_DIR}"
| |
|
| | |
| | |
| |
|
# Sequence parallelism is incompatible with DeepSpeed pipe/grad partitioning.
# [[ ]] is used so an empty variable evaluates as 0 instead of making the
# original `[ $VAR -eq 1 ]` error out (status 2) and skip silently.
if [[ $SEQ_PARALLEL -eq 1 ]]; then
  PARTITIONED_MODE="false"
fi

# With MoE (>1 expert) and pipeline parallelism, PipelineEngine cannot
# partition the multiple MoE outputs -- force partitioning off.
if [[ ${MOE_NUM_EXPERTS} -gt 1 && ${PP} -ne 1 ]]; then
  PARTITIONED_MODE="false"
fi
| |
|
# ----------------------------------------------------------------------------
# Generate the DeepSpeed JSON config consumed via --deepspeed_config.
#  * BF16 with immediate_grad_update is used; FP16 loss scaling is disabled.
#  * pipe/grad partitioning follows the PARTITIONED_MODE value computed above.
#  * The heredoc delimiter is intentionally unquoted so the $-variables are
#    expanded into the JSON.
# ----------------------------------------------------------------------------
DS_CONFIG=${OUTPUT_DIR}/ds_config.json
cat << EOT > $DS_CONFIG
{
  "train_batch_size" : $GLOBAL_BATCH,
  "train_micro_batch_size_per_gpu": $MICRO_BATCH,
  "steps_per_print": $LOG_INTERVAL,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "zero_allow_untested_optimizer": true,
  "bf16": {
    "enabled": true,
    "immediate_grad_update": true
  },
  "fp16": {"enabled": false},
  "wall_clock_breakdown": false,
  "pipeline": {
    "pipe_partitioned": $PARTITIONED_MODE,
    "grad_partitioned": $PARTITIONED_MODE
  },
  "use_data_before_expert_parallelism": $MOE_USE_DATA_BEFORE_EXPERT_PARALLEL
}
EOT
| |
|
| | |
| | |
| |
|
| | |
# For multi-node runs, pass the hostfile to the deepspeed launcher and use
# the first host as master. The sed expression (now quoted -- it contains
# glob-active characters) strips the trailing " slots=N" from the first
# hostfile line. `[[ ... && ... ]]` replaces the deprecated `[ ... -a ... ]`.
MULTINODE_CMD=""
if [[ "$NUM_NODES" -ne 1 && -f "$HOSTSFILE" ]]; then
  MULTINODE_CMD="--hostfile=$HOSTSFILE \
                 --master_addr $(head -n 1 "$HOSTSFILE" | sed -n 's/[[:space:]]slots.*//p') "
fi
| |
|
| | |
# Optional qnpu environment: source its activate script inside the launched
# shell (CMD is later executed via `bash -c`).
CMD=""
if [[ -n "$QNPU_DIR" ]]; then
  CMD="source ${QNPU_DIR}/activate ;"
fi

# Eager vs lazy execution mode. Lazy mode (the default) conflicts with
# torch.compile, so reject that combination explicitly. String comparison
# replaces the original unquoted `[ $USE_LAZY_MODE -eq 0 ]`, which errors
# out when the variable is empty.
if [[ "$USE_LAZY_MODE" == "0" ]]; then
  CMD="${CMD} PT_HPU_LAZY_MODE=0"
else
  LOWER_CASE_USE_TORCH_COMPILE=$(echo "$USE_TORCH_COMPILE" | tr '[:upper:]' '[:lower:]')
  if [[ "$LOWER_CASE_USE_TORCH_COMPILE" == "true" || "$LOWER_CASE_USE_TORCH_COMPILE" == "1" ]]; then
    echo "Cannot use lazy(HL_USE_LAZY_MODE) and torch.compile(HL_USE_TORCH_COMPILE) modes together"
    exit 1
  fi
fi
| |
|
# ----------------------------------------------------------------------------
# Core training command: Megatron-DeepSpeed GPT pretraining entry point with
# the Mixtral architecture flags (separate KV head count, SwiGLU, RMSNorm,
# rotary embeddings with theta=1e6, untied embeddings, no linear biases).
# Assembled as one string and executed later via `bash -c "$CMD"`.
# ----------------------------------------------------------------------------
CMD="${CMD} \
    python3 -u ${MEGATRON_DEEPSPEED_ROOT}/pretrain_gpt.py \
    --bf16 \
    --deepspeed \
    --tensor-model-parallel-size ${TP} \
    --pipeline-model-parallel-size ${PP} \
    --num-layers ${N_LAYERS} \
    --hidden-size ${NHIDDEN} \
    --ffn-hidden-size ${FFN_HIDDEN_SIZE} \
    --num-attention-heads ${NHEADS} \
    --num-key-value-heads ${NUM_KV_HEADS} \
    --seq-length ${SEQ_LEN} \
    --micro-batch-size ${MICRO_BATCH} \
    --global-batch-size ${GLOBAL_BATCH} \
    --train-iters ${TRAIN_ITERS} \
    --log-interval ${LOG_INTERVAL} \
    --eval-iters ${EVAL_ITERS} \
    --eval-interval ${EVAL_INTERVAL} \
    --data-path ${DATA_PATH} \
    --optimizer ${OPTIMIZER} \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-8 \
    --lr ${LR} \
    --min-lr ${MIN_LR} \
    --lr-decay-style cosine \
    --lr-warmup-iters ${LR_WARMUP_ITERS} \
    --clip-grad 1.0 \
    --weight-decay 0.1 \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --log-validation-ppl-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-timers-to-tensorboard \
    --load ${CHECKPOINTS_DIR} \
    --deepspeed_config=${DS_CONFIG} \
    --use-torch-compile=${USE_TORCH_COMPILE} \
    --zero-stage=${ZERO_STAGE} \
    --exit-interval ${EXIT_INTERVAL} \
    --no-masked-softmax-fusion \
    --no-bias-gelu-fusion \
    --no-bias-dropout-fusion \
    --no-gradient-accumulation-fusion \
    --max-position-embeddings ${SEQ_LEN} \
    --use-rotary-position-embeddings \
    --rotary-position-embeddings-theta 1000000 \
    --untie-embeddings-and-output-weights \
    --swiglu \
    --normalization rmsnorm \
    --disable-bias-linear \
    --no-query-key-layer-scaling \
    --attention-dropout ${DROPOUT} \
    --hidden-dropout ${DROPOUT} \
    --use-fused-sdpa ${USE_FUSED_SDPA} \
    --use-fused-sdpa-with-recompute ${USE_FUSED_SDPA_WITH_RECOMPUTE} \
    --use-fused-rmsnorm $USE_FUSED_RMSNORM"
| |
|
| | |
| | |
| | |
# Base Mixture-of-Experts arguments: top-2 routing, no token dropping,
# capacity factor 1.0 for both train and eval.
MOE_ARGS="--num-experts ${MOE_NUM_EXPERTS}"
MOE_ARGS+=" --moe-expert-parallel-size ${MOE_EP}"
MOE_ARGS+=" --topk 2"
MOE_ARGS+=" --disable-moe-token-dropping"
MOE_ARGS+=" --moe-loss-coeff 0.02"
MOE_ARGS+=" --expert-interval 1"
MOE_ARGS+=" --moe-train-capacity-factor 1.0"
MOE_ARGS+=" --moe-eval-capacity-factor 1.0"
MOE_ARGS+=" --moe-min-capacity ${MOE_MIN_CAP}"

# Give experts their own optimizer parameter group when MoE is active.
if (( MOE_NUM_EXPERTS > 1 )); then
  MOE_ARGS+=" --create-moe-param-group"
fi

# Apply tensor parallelism inside the expert layers as well.
if (( MOE_ENABLE_EXPERT_TP > 0 )); then
  MOE_ARGS+=" --enable-expert-tensor-parallelism"
fi
| |
|
| | |
| | |
| | |
| |
|
# Capacity-bins tuning arguments; only appended to MOE_ARGS when bins are
# enabled (MOE_NUM_CAPACITY_BINS > 0).
MOE_CAPACITY_BINS_ARGS="--moe-num-capacity-bins ${MOE_NUM_CAPACITY_BINS}"
MOE_CAPACITY_BINS_ARGS+=" --moe-capacity-bins-exp-base ${MOE_CAPACITY_BINS_EXP_BASE}"
MOE_CAPACITY_BINS_ARGS+=" --moe-capacity-bins-alignment ${MOE_CAPACITY_BINS_ALIGNMENT}"
MOE_CAPACITY_BINS_ARGS+=" --moe-capacity-bins-optimize-interval ${MOE_CAPACITY_BINS_OPTIMIZE_INTERVAL}"
MOE_CAPACITY_BINS_ARGS+=" --moe-capacity-bins-optimize-max-group ${MOE_CAPACITY_BINS_OPTIMIZE_MAX_GROUP}"

# Explicit bin edges, when the user provided them.
if [[ -n "$MOE_CAPACITY_BINS" ]]; then
  MOE_CAPACITY_BINS_ARGS+=" --moe-capacity-bins ${MOE_CAPACITY_BINS}"
fi

if [[ $MOE_NUM_CAPACITY_BINS -gt 0 ]]; then
  MOE_ARGS+=" ${MOE_CAPACITY_BINS_ARGS}"
fi

CMD="${CMD} ${MOE_ARGS}"
| |
|
| | |
| | |
| | |
# ----------------------------------------------------------------------------
# Transformer Engine / FP8 configuration; active only when
# HL_USE_TRANSFORMER_ENGINE=1. [[ ]] replaces the original unquoted `[ ]`
# numeric tests, which error out (and silently skip) on empty variables.
# ----------------------------------------------------------------------------
if [[ $USE_TRANSFORMER_ENGINE -eq 1 ]]; then
  CMD="${CMD} --transformer-impl transformer_engine"

  if [[ $USE_CACHE_FP8_WEIGHT -eq 1 ]]; then
    CMD="${CMD} --cache-fp8-weight"
  fi

  # Default measurement interval / amax history length:
  # GLOBAL_BATCH / MICRO_BATCH / DP -- presumably the number of
  # gradient-accumulation micro-steps per optimizer step; confirm.
  FP8_MEASURE_INTERVAL=${HL_FP8_MEASURE_INTERVAL:-$(( GLOBAL_BATCH / MICRO_BATCH / DP ))}
  FP8_AMAX_HISTORY_LEN=${HL_FP8_AMAX_HISTORY_LEN:-$(( GLOBAL_BATCH / MICRO_BATCH / DP ))}
  FP8_AMAX_REDUCE=${HL_FP8_AMAX_REDUCE:-1}

  CMD="${CMD} --cache-fp8-weight-fwd $USE_CACHE_FP8_WEIGHT_FWD"
  CMD="${CMD} --fp8-interval $FP8_MEASURE_INTERVAL"
  CMD="${CMD} --fp8-margin $FP8_MARGIN"
  CMD="${CMD} --fp8-amax-compute-algo $FP8_AMAX_RECOMPUTE_ALGO"
  CMD="${CMD} --fp8-amax-history-len $FP8_AMAX_HISTORY_LEN"

  # FP8 number format: e5m2 everywhere, or the hybrid scheme otherwise.
  if [[ "$FP8_FORMAT" == "e5m2" ]]; then
    CMD="${CMD} --fp8-e5m2"
  else
    CMD="${CMD} --fp8-hybrid"
  fi

  if [[ $FP8_AMAX_REDUCE -eq 1 ]]; then
    CMD="${CMD} --fp8-amax-reduce"
  fi
fi
| |
|
| | |
| | |
| | |
| |
|
# Tokenizer selection. SentencePiece needs a .model file (defaulting to
# DATA_DIR/tokenizer.model); GPT-2 BPE needs vocab + merges files under
# DATA_DIR. Anything else is rejected.
case "$TOKENIZER_TYPE" in
  GPTSentencePieceTokenizer)
    CMD="${CMD} --tokenizer-type GPTSentencePieceTokenizer"
    if [[ -z "$TOKENIZER_MODEL" ]]; then
      TOKENIZER_MODEL="${DATA_DIR}/tokenizer.model"
    fi
    CMD="${CMD} --tokenizer-model $TOKENIZER_MODEL"
    ;;
  GPT2BPETokenizer)
    CMD="${CMD} --tokenizer-type GPT2BPETokenizer"
    CMD="${CMD} --vocab-file $DATA_DIR/gpt2-vocab.json"
    CMD="${CMD} --merge-file $DATA_DIR/gpt2-merges.txt"
    ;;
  *)
    echo "incorrect HL_TOKENIZER_TYPE=$TOKENIZER_TYPE is set"
    exit 1
    ;;
esac
| |
|
# ----------------------------------------------------------------------------
# Optional features appended to the command line. `[[ ]]` with `-n` replaces
# the original `[ ! -z ... ]` / unquoted `[ $VAR -eq 1 ]` forms; the latter
# error out (status 2) whenever the variable is empty.
# ----------------------------------------------------------------------------

# Specific checkpoint tag to load (otherwise DeepSpeed picks its default).
if [[ -n "$CHECKPOINT_LOAD_TAG" ]]; then
  CMD="${CMD} --load-tag ${CHECKPOINT_LOAD_TAG}"
fi

# Cooperative shutdown: training stops when this file appears.
if [[ -n "$KILL_SWITCH_FILE" ]]; then
  CMD="${CMD} --kill-switch-path $KILL_SWITCH_FILE"
fi

if [[ -n "$DATA_CACHE_DIR" ]]; then
  CMD="${CMD} --data-cache-path ${DATA_CACHE_DIR}"
fi

if [[ $SEQ_PARALLEL -eq 1 ]]; then
  CMD="${CMD} --sequence-parallel"
fi

if [[ $NO_PIPELINE_PARALLEL -eq 1 ]]; then
  CMD="${CMD} --no-pipeline-parallel"
fi

# NOTE: currently unreachable -- HL_UNIV_CP != 0 is rejected by the
# validation section near the top of this script.
if [[ $UNIV_CP -eq 1 ]]; then
  echo "Loading Universal Checkpoint from ${CHECKPOINTS_DIR}"
  CMD="${CMD} --universal-checkpoint"
fi

if [[ $CHECKPOINT_SAVE -eq 1 ]]; then
  mkdir -p "${CHECKPOINTS_DIR}"
  CMD="${CMD} --save $CHECKPOINTS_DIR --save-interval $SAVE_INTERVAL"

  # NOTE: currently unreachable -- HL_VERIFY_CP != 0 is rejected by the
  # validation section near the top of this script.
  if [[ $VERIFY_CP -eq 1 ]]; then
    CMD="${CMD} --verify-checkpoint --verify-checkpoint-model-type LLAMA"
  fi
fi

# Activation checkpointing: 1 = full uniform recompute, 2 = selective.
if [[ $CKP_ACT -eq 1 ]]; then
  CMD="${CMD} --deepspeed-activation-checkpointing --recompute-granularity=full --recompute-method uniform"
elif [[ $CKP_ACT -eq 2 ]]; then
  CMD="${CMD} --deepspeed-activation-checkpointing --recompute-granularity=selective"
fi

# Optional profiler (e.g. pt/pt-full), limited to the given steps.
if [[ -n "$PROFILE" ]]; then
  CMD="${CMD} --profile ${PROFILE}"
  CMD="${CMD} --profile-steps ${PROFILE_STEPS}"
fi
| |
|
# Under qnpu, propagate the current LD_LIBRARY_PATH to all ranks via
# DeepSpeed's per-user env file, recreated from scratch each run.
# ($HOME is quoted -- the original `rm -rf $HOME/...` would misbehave if
# HOME contained spaces.)
if [[ -n "$QNPU_DIR" ]]; then
  rm -rf "$HOME/.deepspeed_env"
  echo "LD_LIBRARY_PATH=$LD_LIBRARY_PATH" >> "$HOME/.deepspeed_env"
fi

# Launch. CMD is executed through `bash -c` so the optional
# "source .../activate ;" prefix and PT_HPU_LAZY_MODE assignment take effect.
# MULTINODE_CMD is intentionally left unquoted: it expands to zero or more
# whitespace-separated launcher arguments.
deepspeed --num_nodes ${NUM_NODES} \
          --num_gpus ${DEVICES_PER_NODE} \
          --no_local_rank \
          --no_python \
          $MULTINODE_CMD \
          /usr/bin/bash -c "$CMD"
| |
|