| [base] |
| # Job type identifier for this submission. |
| type = ml-vision |
|
|
| [resource] |
| # User group and scheduler queue the job is submitted under. |
| # The queue name ends in h800_vi_sp; the worker role below requests the matching gcoresh800-80g resource. |
| usergroup = hadoop-camera3d |
| queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp |
|
|
| [dataset] |
| # All dataset keys are intentionally left empty in this file. |
| dataset_name = |
| dataset_type = |
| dataset_path = |
|
|
| [job_track] |
| # Job tracking metadata. Only demand_id is set; the job id and directory keys are left empty. |
| demand_id = 91369190 |
| upstream_jobid = |
| input_dir = |
| output_dir = |
| log_dir = |
|
|
| [user_args] |
| # No user arguments are defined here; launch arguments are passed inline in worker.script under [roles]. |
|
|
| [roles] |
| # Worker role: instance count, per-worker resources, and the launch command. |
| workers = 4 |
| # Units for worker.memory are not stated in this file - presumably MB; TODO confirm. |
| worker.memory = 1920000 |
| worker.vcore = 128 |
| worker.gcoresh800-80g = 8 |
| # worker.script arguments, in order: repository directory, Python interpreter path, training entry script, then "4 8". |
| # NOTE(review): the trailing "4 8" appears to mirror workers = 4 and worker.gcoresh800-80g = 8 - confirm, and keep them in sync when resizing the job. |
| worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_mergestep_multi_v2.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_hps_clip_merge.py 4 8 |
|
|
| worker.ports = 1 |
|
|
| [am] |
| # Resource setting for the "am" component; the .mb suffix suggests megabytes - TODO confirm. |
| afo.app.am.resource.mb = 4096 |
|
|
| [tensorboard] |
| # TensorBoard flag, set to false for this job. |
| with.tensor.board = false |
|
|
| [docker] |
| # Container image for the job; the image tag names cuda12.4 and nccl2.21.5. |
| afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d |
|
|
|
|
| [data] |
| # Data prefetch flag, set to false for this job. |
| afo.data.prefetch = false |
|
|
| [failover] |
| # Engine failover support flag, set to true for this job. |
| afo.app.support.engine.failover = true |
|
|
| [conda] |
| # All conda keys are left empty; worker.script under [roles] instead passes an absolute path to a conda environment's python binary. |
| afo.conda.env.name = |
| afo.conda.env.path = |
| afo.conda.store.type = |
|
|
| [distribute] |
| # GPU driver version setting for the worker role. |
| # NOTE(review): the docker image tag names cuda12.4 and the worker role requests h800 resources - confirm that 470.103.01 is still the intended driver version for this combination. |
| afo.role.worker.gpu_driver_version = 470.103.01 |
|
|
| [others] |
| # Miscellaneous platform settings: container environment, notifications, scheduling limits, storage mounts, node exclusions. |
| # Value passed as YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES (640000000000 bytes = 640 GB decimal). |
| afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000 |
| afo.xm.notice.receivers.account = zhangshengjun02 |
| with_requirements = false |
| # NOTE(review): the key name says seconds; 3600000 s is about 41.7 days - confirm this is not a millisecond value. |
| afo.app.yarn.allocate.timeout.seconds = 3600000 |
| afo.app.blacklist.fail_times = 16 |
| # Worker task retry limit is currently 1; the earlier setting of 16 is kept commented out below for reference. |
| #afo.role.worker.task.attempt.max.retry = 16 |
| afo.role.worker.task.attempt.max.retry = 1 |
| # NOTE(review): hadoop-videogen-hl has no ":true" suffix while the other two entries do - confirm the omission is intentional. |
| afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true |
| # HDFS FUSE settings; the subpath value begins with ":" followed by /mnt/hdfs. |
| afo.use.hdfs.fuse = true |
| afo.use.hdfs.fuse.subpath = :/mnt/hdfs |
| afo.use.hdfs.fuse.readonly = false |
| # Comma-separated node names for the worker and am "not.node_name" keys - presumably nodes to exclude from placement; TODO confirm. |
| afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt |
| afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt |