# egrpo/hope/finetune_mergestep_multi_v2.hope
# Repository page metadata (not configuration): studyOverflow's picture
# Commit message: Add files using upload-large-folder tool
# Commit: 771ecfe (verified)
# Basic job descriptor.
[base]
# Job type tag; presumably selects the platform's vision-training job template — confirm.
type = ml-vision
# Which user group and scheduler queue the job is submitted under.
[resource]
usergroup = hadoop-camera3d
# Dotted queue path; the leaf name suggests an H800 GPU pool (verify with the cluster admin).
queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
# Dataset registration fields; all three are intentionally present but empty in this job.
[dataset]
dataset_name =
dataset_type =
dataset_path =
# Job tracking metadata. Only demand_id is set; the remaining fields are left empty.
[job_track]
# Numeric demand/ticket identifier this job is tracked under (meaning defined by the platform).
demand_id = 91369190
upstream_jobid =
input_dir =
output_dir =
log_dir =
# No user arguments are defined here; all script arguments are passed inline in worker.script under [roles].
[user_args]
# Worker role sizing and the command each worker runs.
[roles]
# Number of worker containers.
workers = 4
# Per-worker memory; unit is presumably MB (cf. afo.app.am.resource.mb) — TODO confirm.
worker.memory = 1920000
worker.vcore = 128
# GPUs per worker; the key suffix names the card type (h800-80g).
worker.gcoresh800-80g = 8
# Launcher command: sh <launcher.sh> <directory> <python interpreter> <training script> 4 8
# The trailing "4 8" match workers = 4 and worker.gcoresh800-80g = 8 above; presumably
# node count and GPUs per node — keep them in sync when resizing the job.
# NOTE(review): the launcher lives under Granular-GRPO but the directory argument points
# at DanceGRPO; confirm this is the intended checkout for train_g2rpo_hps_clip_merge.py.
worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_mergestep_multi_v2.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_hps_clip_merge.py 4 8
worker.ports = 1
# Resources for the "am" role (presumably the YARN application master — confirm).
[am]
# Memory in MB, per the key name.
afo.app.am.resource.mb = 4096
# TensorBoard integration switch; disabled for this job.
[tensorboard]
with.tensor.board = false
# Container image the job runs in.
[docker]
# The image tag encodes its toolchain: cuda12.4, nccl2.21.5.
afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
# Data prefetch switch; disabled for this job.
[data]
afo.data.prefetch = false
# Engine failover support is enabled; exact recovery behavior is defined by the platform.
[failover]
afo.app.support.engine.failover = true
# Platform-managed conda settings are left empty; worker.script under [roles] instead
# invokes a Python interpreter by absolute path (.../conda-envs/dancegrpo-v2/bin/python).
[conda]
afo.conda.env.name =
afo.conda.env.path =
afo.conda.store.type =
# Worker placement constraint on the GPU driver version.
[distribute]
# NOTE(review): 470.103.01 is an R470-branch driver, while the image under [docker] is
# tagged cuda12.4 and the queue targets H800 cards; CUDA 12.x normally needs a newer
# driver branch. Confirm whether the platform honors this value or it is stale.
afo.role.worker.gpu_driver_version = 470.103.01
# Miscellaneous platform options: container environment, notifications, scheduling
# limits, storage mounts, and node exclusions.
[others]
# Shared-memory size exported into the container, in bytes (640000000000 = 640 GB decimal).
afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
# Account that receives job notices (per the key name).
afo.xm.notice.receivers.account = zhangshengjun02
with_requirements = false
# NOTE(review): the key is expressed in seconds, so 3600000 is 1000 hours (~41 days).
# This looks like a milliseconds value for one hour; confirm the intended wait.
afo.app.yarn.allocate.timeout.seconds = 3600000
afo.app.blacklist.fail_times = 16
# Earlier retry limit of 16, left disabled for reference; the active limit below is 1.
#afo.role.worker.task.attempt.max.retry = 16
afo.role.worker.task.attempt.max.retry = 1
# Comma-separated list of additional user groups for DolphinFS access.
# NOTE(review): the first entry has no ":true" suffix, unlike the other two — confirm
# whether that is intentional.
afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
# HDFS FUSE mount: enabled, writable, with subpath value ":/mnt/hdfs".
afo.use.hdfs.fuse = true
afo.use.hdfs.fuse.subpath = :/mnt/hdfs
afo.use.hdfs.fuse.readonly = false
# Comma-separated hosts that worker containers must not be placed on.
afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
# Host that the "am" role must not be placed on.
afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt