#!/usr/bin/env bash
#
# Submit a multi-node run of NAVSIM's `run_training.py` as an NGC batch job
# (`ngc batch run` with the MPI array type).
#
# Usage: run this script as-is; every setting is one of the variables below.
# Requires the `ngc` CLI on PATH.

# Abort on a misspelled/unset variable rather than submitting a job whose
# Hydra overrides are silently empty.
set -u

agent="hydra_pe_temporal"      # passed as Hydra override `agent=`
bs=8                           # passed as `dataloader.params.batch_size=`
lr=0.0002                      # passed as `agent.lr=`
cache="null"                   # passed verbatim as `cache_path=null`
config="competition_training"  # passed as `--config-name`
epoch=20                       # passed as `trainer.params.max_epochs=`
# Number of nodes: used for `--replicas`, `mpirun -np` and
# `trainer.params.num_nodes`.
replicas=8
# Used both as `experiment_name` and as `agent.config.ckpt_path`.
# NOTE(review): ckpt_path receives the experiment name, not an obvious
# filesystem path -- confirm this is what the agent config expects.
dir="${agent}_vov_fixedpading_pe_temporal_modifyself_bs${bs}x${replicas}_ckpt"

# Command line executed inside the job.
#
# Quoting layers:
#   * Plain ${var} references are expanded HERE, when the string is built.
#   * \${VAR} references are left literal so that the `bash -c` started by
#     mpirun expands them at run time.
#   * A backslash-newline inside double quotes is removed, so the training
#     invocation below reaches the remote shell as one single line.
#   * "diffusers[torch]" and "~trainer.params.strategy" are double-quoted for
#     the remote shell so they can never be treated as a glob pattern or as a
#     tilde (home directory) prefix.
#
# `git pull` and `pip install` are separated by `;`, so training still starts
# if either of them fails (best-effort, kept as in the original command).
commandline="
mpirun --allow-run-as-root -np ${replicas} -npernode 1 bash -c '
git pull;
pip install --upgrade \"diffusers[torch]\";
MASTER_PORT=29500 MASTER_ADDR=launcher-svc-\${NGC_JOB_ID} WORLD_SIZE=\${NGC_ARRAY_SIZE} NODE_RANK=\${NGC_ARRAY_INDEX} \
python \${NAVSIM_DEVKIT_ROOT}/navsim/planning/script/run_training.py \
--config-name ${config} \
agent=${agent} \
trainer.params.num_nodes=${replicas} \
\"~trainer.params.strategy\" \
trainer.params.max_epochs=${epoch} \
dataloader.params.batch_size=${bs} \
experiment_name=${dir} \
cache_path=${cache} \
agent.config.ckpt_path=${dir} \
agent.lr=${lr} \
split=trainval \
scene_filter=navtrain;
'
"

# Submit the job; one array replica is requested per node.
ngc batch run \
  -in dgx1v.32g.8.norm \
  --ace nv-us-west-2 \
  --label _wl___computer_vision \
  -n ml-model.lkl_train._wl___computer_vision \
  --result /result \
  -i nvcr.io/nvidian/swaiinf/lzx-navsim \
  --workspace q-2TlPKESo62ktTxOc8rYg:/zhenxinl_nuplan \
  --port 6007 \
  --array-type "MPI" \
  --replicas "${replicas}" \
  --total-runtime "4D" \
  --commandline "${commandline}"