studyOverflow committed on
Commit
771ecfe
·
verified ·
1 Parent(s): 3f04536

Add files using upload-large-folder tool

Browse files
data/qwenimage_rl_embeddings/prompt_embed/27214.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e6e1d92bf4691cf95d4000bf0a6cb6112ce5943b629f2deb6f25494a81feb31
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/27548.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55f9e80c9788f581a087b97b27e38426c69d39b03c088503a52b9ba1582f398e
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/28269.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec71dd4914a29f331f0e4e25ba03ea8992f411797e6224f83def3aa69e7789ce
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/35020.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6604247e246bdc9e66602d3db3b4611b38733c07362894220e78659a5357afb
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/51199.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4bb38e0af3d106c6988c980db6ed16f8b91602966a3edfc7adb0db2afc153685
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/70606.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a06b6d21c0cd9e05a829ac4ec4a1151f148c10663df240568b1b0d6c5d2ec504
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/76836.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f38a891cb79fc0386d497068c2d1dcda5b8607d50c35f46c3b6f37f1a4012d54
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/80700.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bce0abc6601a8b348551e7f766fb0f5136ec991eb9364681331f98588ae01474
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/81549.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:235d43ccd91e1a4327a120a814cc52566fb1d8502c37d703a50229c97b78feaf
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/94872.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13e8c6013dc5c3fc477936ff6b8708dd110c207087ab629db02e52a84d81e529
3
+ size 7341531
data/qwenimage_rl_embeddings/prompt_embed/98552.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a84711584506cfb03f7c3584af40c10dc3583814df7428110090b46143faf2b9
3
+ size 7341531
hope/finetune_intervalstep.hope ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [base]
2
+ type = ml-vision
3
+
4
+ [resource]
5
+ usergroup = hadoop-camera3d
6
+ queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
7
+
8
+ [dataset]
9
+ dataset_name =
10
+ dataset_type =
11
+ dataset_path =
12
+
13
+ [job_track]
14
+ demand_id = 91369190
15
+ upstream_jobid =
16
+ input_dir =
17
+ output_dir =
18
+ log_dir =
19
+
20
+ [user_args]
21
+
22
+ [roles]
23
+ workers = 2
24
+ worker.memory = 1920000
25
+ worker.vcore = 128
26
+ worker.gcoresh800-80g = 8
27
+ worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_intervalstep.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_hps.py 2 8
28
+
29
+ worker.ports = 1
30
+
31
+ [am]
32
+ afo.app.am.resource.mb = 4096
33
+
34
+ [tensorboard]
35
+ with.tensor.board = false
36
+
37
+ [docker]
38
+ afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
39
+
40
+
41
+ [data]
42
+ afo.data.prefetch = false
43
+
44
+ [failover]
45
+ afo.app.support.engine.failover = true
46
+
47
+ [conda]
48
+ afo.conda.env.name =
49
+ afo.conda.env.path =
50
+ afo.conda.store.type =
51
+
52
+ [distribute]
53
+ afo.role.worker.gpu_driver_version = 470.103.01
54
+
55
+ [others]
56
+ afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
57
+ afo.xm.notice.receivers.account = zhangshengjun02
58
+ with_requirements = false
59
+ afo.app.yarn.allocate.timeout.seconds = 3600000
60
+ afo.app.blacklist.fail_times = 16
61
+ #afo.role.worker.task.attempt.max.retry = 16
62
+ afo.role.worker.task.attempt.max.retry = 1
63
+ afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
64
+ afo.use.hdfs.fuse = true
65
+ afo.use.hdfs.fuse.subpath = :/mnt/hdfs
66
+ afo.use.hdfs.fuse.readonly = false
67
+ afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
68
+ afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
hope/finetune_mergestep.sh ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # cluster_spec='{"am":["psx2s7cxrbvmlcvk-am-0.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local"],"index":"0","role":"worker","worker":["psx2s7cxrbvmlcvk-worker-0.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local:3400","psx2s7cxrbvmlcvk-worker-1.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local:3400"]}'
3
+ # echo "cluster spec is $cluster_spec"
4
+ WORK_DIR=$1
5
+ PYTHON_BIN=$2
6
+ SCRIPT=$3
7
+ NNODES=$4
8
+ NPROC_PER_NODE=$5
9
+
10
+ echo "WORK_DIR is $WORK_DIR"
11
+ echo "PYTHON_BIN is $PYTHON_BIN"
12
+ echo "SCRIPT is $SCRIPT"
13
+ echo "NNODES is $NNODES"
14
+ echo "NPROC_PER_NODE is $NPROC_PER_NODE"
15
+
16
+ PORT=${PORT:-29509}
17
+ PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
18
+
19
+ cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
20
+ echo "cluster spec is $cluster_spec"
21
+ # Assuming worker_list contains the JSON string (it's already been parsed)
22
+ worker_list_command="import json_parser; data = json_parser.parse('$cluster_spec'); print(data['worker'])"
23
+ worker_list=$($PYTHON_BIN -c "$worker_list_command")
24
+
25
+ # Remove the square brackets and quotes from worker_list
26
+ worker_list_cleaned=$(echo $worker_list | tr -d '[]' | tr -d "'")
27
+
28
+ # Convert the cleaned worker list into an array by splitting by commas
29
+ worker_strs=($(echo $worker_list_cleaned | tr ',' '\n'))
30
+
31
+ # Extract the master (first worker)
32
+ master=${worker_strs[0]}
33
+
34
+ # Extract master address and port
35
+ master_addr=$(echo $master | cut -d ':' -f1)
36
+ master_port=$(echo $master | cut -d ':' -f2)
37
+
38
+ # Output the master information without brackets and quotes
39
+ echo "worker list is $worker_list_cleaned"
40
+ echo "master is $master"
41
+ echo "master address is $master_addr"
42
+ echo "master port is $master_port"
43
+
44
+ worker_list_command="import json_parser; data = json_parser.parse('$cluster_spec'); print(data['index'])"
45
+ node_rank=$($PYTHON_BIN -c "$worker_list_command")
46
+ echo "node rank is $node_rank"
47
+ dist_url="tcp://$master_addr:$master_port"
48
+ echo "dist url is $dist_url"
49
+
50
+ export TOKENIZERS_PARALLELISM=false
51
+ export OMP_NUM_THREADS=1
52
+ export NCCL_DEBUG=INFO
53
+ export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
54
+ export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
55
+ export TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1
56
+
57
+ ### launch with DDP (multi-machines-multi-gpus)
58
+ source scl_source enable devtoolset-7
59
+ ifconfig
60
+ cd $WORK_DIR=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/json_parse_test.sh
61
+ $PYTHON_BIN -m torch.distributed.run \
62
+ --nnodes=$NNODES --nproc_per_node=$NPROC_PER_NODE --node_rank=$node_rank --master_addr=$master_addr --master_port=$PORT \
63
+ $SCRIPT \
64
+ --seed 42 \
65
+ --pretrained_model_name_or_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/flux \
66
+ --hps_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/hps/HPS_v2.1_compressed.pt \
67
+ --hps_clip_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin \
68
+ --data_json_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/rl_embeddings/videos2caption.json \
69
+ --gradient_checkpointing \
70
+ --train_batch_size 1 \
71
+ --num_latent_t 1 \
72
+ --sp_size 1 \
73
+ --train_sp_batch_size 1 \
74
+ --dataloader_num_workers 4 \
75
+ --max_train_steps 301 \
76
+ --learning_rate 2e-6 \
77
+ --mixed_precision bf16 \
78
+ --checkpointing_steps 50 \
79
+ --cfg 0.0 \
80
+ --output_dir /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/save_exp/hps_merge_step_2_0 \
81
+ --h 1024 \
82
+ --w 1024 \
83
+ --t 1 \
84
+ --sampling_steps 16 \
85
+ --eta 0.7 \
86
+ --lr_warmup_steps 0 \
87
+ --sampler_seed 1223627 \
88
+ --max_grad_norm 1.0 \
89
+ --weight_decay 0.0001 \
90
+ --num_generations 12 \
91
+ --shift 3 \
92
+ --init_same_noise \
93
+ --clip_range 1e-4 \
94
+ --adv_clip_max 5.0 \
95
+ --eta_step_list 0 1 2 3 4 5 6 7 \
96
+ --eta_step_merge_list 1 1 1 2 2 2 3 3 \
97
+ --granular_list 1 \
hope/finetune_mergestep_multi.sh ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# finetune_mergestep_multi.sh — launch distributed G2RPO (HPS + CLIP reward)
# fine-tuning, resuming from a prior checkpoint, on an AFO/YARN cluster.
# Invoked by the matching hope/*.hope worker.script as:
#   sh finetune_mergestep_multi.sh WORK_DIR PYTHON_BIN SCRIPT NNODES NPROC_PER_NODE
# Rendezvous info (worker list, node index) comes from the AFO-provided
# AFO_ENV_CLUSTER_SPEC environment variable (a JSON cluster spec), e.g.:
#   cluster_spec='{"am":["...-am-0...local"],"index":"0","role":"worker","worker":["...-worker-0...:3400","...-worker-1...:3400"]}'

WORK_DIR=$1         # repo root to run the training from
PYTHON_BIN=$2       # absolute path to the python interpreter
SCRIPT=$3           # training entry point, relative to WORK_DIR
NNODES=$4           # number of worker nodes
NPROC_PER_NODE=$5   # GPUs per node

echo "WORK_DIR is $WORK_DIR"
echo "PYTHON_BIN is $PYTHON_BIN"
echo "SCRIPT is $SCRIPT"
echo "NNODES is $NNODES"
echo "NPROC_PER_NODE is $NPROC_PER_NODE"

# torch.distributed master port (overridable from the environment).
PORT=${PORT:-29509}
# BUGFIX: this was previously written as `PYTHONPATH=... \` followed by a blank
# line — the dangling continuation made it an unexported no-op assignment, so
# the value never reached the python subprocess. Export it explicitly.
export PYTHONPATH="$(dirname $0)/..":$PYTHONPATH

# Escape double quotes so the JSON spec survives interpolation into the -c string.
cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
echo "cluster spec is $cluster_spec"

# NOTE(review): `json_parser` is assumed to be a project-local helper importable
# by $PYTHON_BIN — confirm; the stdlib equivalent would be `import json; json.loads(...)`.
worker_list=$($PYTHON_BIN -c "import json_parser; data = json_parser.parse('$cluster_spec'); print(data['worker'])")

# Strip the printed Python list's brackets/quotes, then split on commas.
worker_list_cleaned=$(echo $worker_list | tr -d '[]' | tr -d "'")
worker_strs=($(echo $worker_list_cleaned | tr ',' '\n'))

# The first worker acts as the rendezvous master.
master=${worker_strs[0]}
master_addr=$(echo $master | cut -d ':' -f1)
master_port=$(echo $master | cut -d ':' -f2)

echo "worker list is $worker_list_cleaned"
echo "master is $master"
echo "master address is $master_addr"
echo "master port is $master_port"

# This node's rank within the cluster spec.
node_rank=$($PYTHON_BIN -c "import json_parser; data = json_parser.parse('$cluster_spec'); print(data['index'])")
echo "node rank is $node_rank"
dist_url="tcp://$master_addr:$master_port"
echo "dist url is $dist_url"

export TOKENIZERS_PARALLELISM=false
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1

### launch with DDP (multi-machines-multi-gpus)
source scl_source enable devtoolset-7
ifconfig
# BUGFIX: was `cd $WORK_DIR=/mnt/.../Granular-GRPO`, which cd's to a nonexistent
# path (the `=...` suffix is appended to the expanded variable) and silently
# left the script running from the wrong directory.
cd "$WORK_DIR" || exit 1
$PYTHON_BIN -m torch.distributed.run \
    --nnodes=$NNODES --nproc_per_node=$NPROC_PER_NODE --node_rank=$node_rank --master_addr=$master_addr --master_port=$PORT \
    $SCRIPT \
    --seed 42 \
    --pretrained_model_name_or_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/flux \
    --resume_ckpt /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/save_exp/hps_clip_merge_step/ckpt/checkpoint-200-0 \
    --hps_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/hps/HPS_v2.1_compressed.pt \
    --hps_clip_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin \
    --clip_score_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/clip_score \
    --data_json_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/rl_embeddings/videos2caption.json \
    --train_batch_size 1 \
    --num_latent_t 1 \
    --sp_size 1 \
    --train_sp_batch_size 1 \
    --dataloader_num_workers 4 \
    --max_train_steps 401 \
    --init_steps 200 \
    --learning_rate 2e-6 \
    --mixed_precision bf16 \
    --checkpointing_steps 10 \
    --cfg 0.0 \
    --output_dir /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/save_exp/hps_clip_merge_resume_200 \
    --h 1024 \
    --w 1024 \
    --t 1 \
    --sampling_steps 16 \
    --eta 0.7 \
    --lr_warmup_steps 0 \
    --sampler_seed 1223627 \
    --max_grad_norm 1.0 \
    --weight_decay 0.0001 \
    --num_generations 12 \
    --shift 3 \
    --init_same_noise \
    --clip_range 1e-4 \
    --adv_clip_max 5.0 \
    --eta_step_list 0 1 2 3 4 5 6 7 \
    --eta_step_merge_list 1 1 1 2 2 2 3 3 \
    --granular_list 1
hope/finetune_mergestep_multi_v2.hope ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [base]
2
+ type = ml-vision
3
+
4
+ [resource]
5
+ usergroup = hadoop-camera3d
6
+ queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
7
+
8
+ [dataset]
9
+ dataset_name =
10
+ dataset_type =
11
+ dataset_path =
12
+
13
+ [job_track]
14
+ demand_id = 91369190
15
+ upstream_jobid =
16
+ input_dir =
17
+ output_dir =
18
+ log_dir =
19
+
20
+ [user_args]
21
+
22
+ [roles]
23
+ workers = 4
24
+ worker.memory = 1920000
25
+ worker.vcore = 128
26
+ worker.gcoresh800-80g = 8
27
+ worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_mergestep_multi_v2.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_hps_clip_merge.py 4 8
28
+
29
+ worker.ports = 1
30
+
31
+ [am]
32
+ afo.app.am.resource.mb = 4096
33
+
34
+ [tensorboard]
35
+ with.tensor.board = false
36
+
37
+ [docker]
38
+ afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
39
+
40
+
41
+ [data]
42
+ afo.data.prefetch = false
43
+
44
+ [failover]
45
+ afo.app.support.engine.failover = true
46
+
47
+ [conda]
48
+ afo.conda.env.name =
49
+ afo.conda.env.path =
50
+ afo.conda.store.type =
51
+
52
+ [distribute]
53
+ afo.role.worker.gpu_driver_version = 470.103.01
54
+
55
+ [others]
56
+ afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
57
+ afo.xm.notice.receivers.account = zhangshengjun02
58
+ with_requirements = false
59
+ afo.app.yarn.allocate.timeout.seconds = 3600000
60
+ afo.app.blacklist.fail_times = 16
61
+ #afo.role.worker.task.attempt.max.retry = 16
62
+ afo.role.worker.task.attempt.max.retry = 1
63
+ afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
64
+ afo.use.hdfs.fuse = true
65
+ afo.use.hdfs.fuse.subpath = :/mnt/hdfs
66
+ afo.use.hdfs.fuse.readonly = false
67
+ afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
68
+ afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
hope/finetune_multistep.hope ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [base]
2
+ type = ml-vision
3
+
4
+ [resource]
5
+ usergroup = hadoop-camera3d
6
+ queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
7
+
8
+ [dataset]
9
+ dataset_name =
10
+ dataset_type =
11
+ dataset_path =
12
+
13
+ [job_track]
14
+ demand_id = 91369190
15
+ upstream_jobid =
16
+ input_dir =
17
+ output_dir =
18
+ log_dir =
19
+
20
+ [user_args]
21
+
22
+ [roles]
23
+ workers = 2
24
+ worker.memory = 1920000
25
+ worker.vcore = 128
26
+ worker.gcoresh800-80g = 8
27
+ worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/json_parse_test.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_hps.py 2 8
28
+
29
+ worker.ports = 1
30
+
31
+ [am]
32
+ afo.app.am.resource.mb = 4096
33
+
34
+ [tensorboard]
35
+ with.tensor.board = false
36
+
37
+ [docker]
38
+ afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
39
+
40
+
41
+ [data]
42
+ afo.data.prefetch = false
43
+
44
+ [failover]
45
+ afo.app.support.engine.failover = true
46
+
47
+ [conda]
48
+ afo.conda.env.name =
49
+ afo.conda.env.path =
50
+ afo.conda.store.type =
51
+
52
+ [distribute]
53
+ afo.role.worker.gpu_driver_version = 470.103.01
54
+
55
+ [others]
56
+ afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
57
+ afo.xm.notice.receivers.account = zhangshengjun02
58
+ with_requirements = false
59
+ afo.app.yarn.allocate.timeout.seconds = 3600000
60
+ afo.app.blacklist.fail_times = 16
61
+ #afo.role.worker.task.attempt.max.retry = 16
62
+ afo.role.worker.task.attempt.max.retry = 1
63
+ afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
64
+ afo.use.hdfs.fuse=true
65
+ afo.use.hdfs.fuse.subpath=:/mnt/hdfs
66
+ afo.use.hdfs.fuse.readonly=false
67
+ afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
68
+ afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
hope/finetune_rlpt.hope ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [base]
2
+ type = ml-vision
3
+
4
+ [resource]
5
+ usergroup = hadoop-camera3d
6
+ queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
7
+
8
+ [dataset]
9
+ dataset_name =
10
+ dataset_type =
11
+ dataset_path =
12
+
13
+ [job_track]
14
+ demand_id = 91369190
15
+ upstream_jobid =
16
+ input_dir =
17
+ output_dir =
18
+ log_dir =
19
+
20
+ [user_args]
21
+
22
+ [roles]
23
+ workers = 4
24
+ worker.memory = 1920000
25
+ worker.vcore = 128
26
+ worker.gcoresh800-80g = 8
27
+ worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_rlpt.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_rlpt_dino.py 4 8
28
+
29
+ worker.ports = 1
30
+
31
+ [am]
32
+ afo.app.am.resource.mb = 4096
33
+
34
+ [tensorboard]
35
+ with.tensor.board = false
36
+
37
+ [docker]
38
+ afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
39
+
40
+
41
+ [data]
42
+ afo.data.prefetch = false
43
+
44
+ [failover]
45
+ afo.app.support.engine.failover = true
46
+
47
+ [conda]
48
+ afo.conda.env.name =
49
+ afo.conda.env.path =
50
+ afo.conda.store.type =
51
+
52
+ [distribute]
53
+ afo.role.worker.gpu_driver_version = 470.103.01
54
+
55
+ [others]
56
+ afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
57
+ afo.xm.notice.receivers.account = zhangshengjun02
58
+ with_requirements = false
59
+ afo.app.yarn.allocate.timeout.seconds = 3600000
60
+ afo.app.blacklist.fail_times = 16
61
+ #afo.role.worker.task.attempt.max.retry = 16
62
+ afo.role.worker.task.attempt.max.retry = 1
63
+ afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
64
+ afo.use.hdfs.fuse = true
65
+ afo.use.hdfs.fuse.subpath = :/mnt/hdfs
66
+ afo.use.hdfs.fuse.readonly = false
67
+ afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
68
+ afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
hope/finetune_rlpt.sh ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# finetune_rlpt.sh — launch distributed G2RPO RL-pretraining (DINO ground-truth
# reward) fine-tuning on an AFO/YARN cluster. Invoked by hope/finetune_rlpt.hope
# worker.script as:
#   sh finetune_rlpt.sh WORK_DIR PYTHON_BIN SCRIPT NNODES NPROC_PER_NODE
# Rendezvous info (worker list, node index) comes from the AFO-provided
# AFO_ENV_CLUSTER_SPEC environment variable (a JSON cluster spec), e.g.:
#   cluster_spec='{"am":["...-am-0...local"],"index":"0","role":"worker","worker":["...-worker-0...:3400","...-worker-1...:3400"]}'

WORK_DIR=$1         # repo root to run the training from
PYTHON_BIN=$2       # absolute path to the python interpreter
SCRIPT=$3           # training entry point, relative to WORK_DIR
NNODES=$4           # number of worker nodes
NPROC_PER_NODE=$5   # GPUs per node

echo "WORK_DIR is $WORK_DIR"
echo "PYTHON_BIN is $PYTHON_BIN"
echo "SCRIPT is $SCRIPT"
echo "NNODES is $NNODES"
echo "NPROC_PER_NODE is $NPROC_PER_NODE"

# torch.distributed master port (overridable from the environment).
PORT=${PORT:-29509}
# BUGFIX: this was previously written as `PYTHONPATH=... \` followed by a blank
# line — the dangling continuation made it an unexported no-op assignment, so
# the value never reached the python subprocess. Export it explicitly.
export PYTHONPATH="$(dirname $0)/..":$PYTHONPATH

# Escape double quotes so the JSON spec survives interpolation into the -c string.
cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
echo "cluster spec is $cluster_spec"

# NOTE(review): `json_parser` is assumed to be a project-local helper importable
# by $PYTHON_BIN — confirm; the stdlib equivalent would be `import json; json.loads(...)`.
worker_list=$($PYTHON_BIN -c "import json_parser; data = json_parser.parse('$cluster_spec'); print(data['worker'])")

# Strip the printed Python list's brackets/quotes, then split on commas.
worker_list_cleaned=$(echo $worker_list | tr -d '[]' | tr -d "'")
worker_strs=($(echo $worker_list_cleaned | tr ',' '\n'))

# The first worker acts as the rendezvous master.
master=${worker_strs[0]}
master_addr=$(echo $master | cut -d ':' -f1)
master_port=$(echo $master | cut -d ':' -f2)

echo "worker list is $worker_list_cleaned"
echo "master is $master"
echo "master address is $master_addr"
echo "master port is $master_port"

# This node's rank within the cluster spec.
node_rank=$($PYTHON_BIN -c "import json_parser; data = json_parser.parse('$cluster_spec'); print(data['index'])")
echo "node rank is $node_rank"
dist_url="tcp://$master_addr:$master_port"
echo "dist url is $dist_url"

export TOKENIZERS_PARALLELISM=false
export OMP_NUM_THREADS=1
export NCCL_DEBUG=INFO
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1

### launch with DDP (multi-machines-multi-gpus)
source scl_source enable devtoolset-7
ifconfig
# BUGFIX: was `cd $WORK_DIR=/mnt/.../Granular-GRPO`, which cd's to a nonexistent
# path (the `=...` suffix is appended to the expanded variable) and silently
# left the script running from the wrong directory.
cd "$WORK_DIR" || exit 1
$PYTHON_BIN -m torch.distributed.run \
    --nnodes=$NNODES --nproc_per_node=$NPROC_PER_NODE --node_rank=$node_rank --master_addr=$master_addr --master_port=$PORT \
    $SCRIPT \
    --seed 42 \
    --pretrained_model_name_or_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/flux \
    --hps_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/hps/HPS_v2.1_compressed.pt \
    --hps_clip_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin \
    --dino_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/dinov2 \
    --data_json_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/datasets/flux_rl_embeddings/videos2caption.json \
    --gradient_checkpointing \
    --train_batch_size 1 \
    --num_latent_t 1 \
    --sp_size 1 \
    --train_sp_batch_size 1 \
    --dataloader_num_workers 4 \
    --max_train_steps 151 \
    --learning_rate 2e-6 \
    --mixed_precision bf16 \
    --checkpointing_steps 30 \
    --cfg 0.0 \
    --output_dir /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/save_exp_rlpt/dino_gt_1022 \
    --h 1024 \
    --w 1024 \
    --t 1 \
    --sampling_steps 16 \
    --eta 0.7 \
    --lr_warmup_steps 0 \
    --sampler_seed 1223627 \
    --max_grad_norm 1.0 \
    --weight_decay 0.0001 \
    --num_generations 12 \
    --shift 3 \
    --init_same_noise \
    --clip_range 1e-4 \
    --adv_clip_max 5.0 \
    --eta_step_list 0 1 2 3 4 5 6 7 \
    --granular_list 1
hope/finetune_rlpt_from_noise.hope ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [base]
2
+ type = ml-vision
3
+
4
+ [resource]
5
+ usergroup = hadoop-camera3d
6
+ queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
7
+
8
+ [dataset]
9
+ dataset_name =
10
+ dataset_type =
11
+ dataset_path =
12
+
13
+ [job_track]
14
+ demand_id = 91369190
15
+ upstream_jobid =
16
+ input_dir =
17
+ output_dir =
18
+ log_dir =
19
+
20
+ [user_args]
21
+
22
+ [roles]
23
+ workers = 4
24
+ worker.memory = 1920000
25
+ worker.vcore = 128
26
+ worker.gcoresh800-80g = 8
27
+ worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_rlpt_from_noise.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_rlpt_from_noise.py 4 8
28
+
29
+ worker.ports = 1
30
+
31
+ [am]
32
+ afo.app.am.resource.mb = 4096
33
+
34
+ [tensorboard]
35
+ with.tensor.board = false
36
+
37
+ [docker]
38
+ afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
39
+
40
+
41
+ [data]
42
+ afo.data.prefetch = false
43
+
44
+ [failover]
45
+ afo.app.support.engine.failover = true
46
+
47
+ [conda]
48
+ afo.conda.env.name =
49
+ afo.conda.env.path =
50
+ afo.conda.store.type =
51
+
52
+ [distribute]
53
+ afo.role.worker.gpu_driver_version = 470.103.01
54
+
55
+ [others]
56
+ afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
57
+ afo.xm.notice.receivers.account = zhangshengjun02
58
+ with_requirements = false
59
+ afo.app.yarn.allocate.timeout.seconds = 3600000
60
+ afo.app.blacklist.fail_times = 16
61
+ #afo.role.worker.task.attempt.max.retry = 16
62
+ afo.role.worker.task.attempt.max.retry = 1
63
+ afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
64
+ afo.use.hdfs.fuse=true
65
+ afo.use.hdfs.fuse.subpath=:/mnt/hdfs
66
+ afo.use.hdfs.fuse.readonly=false
67
+ afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
68
+ afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
hope/finetune_tempflow_multi.hope ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [base]
2
+ type = ml-vision
3
+
4
+ [resource]
5
+ usergroup = hadoop-camera3d
6
+ queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
7
+
8
+ [dataset]
9
+ dataset_name =
10
+ dataset_type =
11
+ dataset_path =
12
+
13
+ [job_track]
14
+ demand_id = 91369190
15
+ upstream_jobid =
16
+ input_dir =
17
+ output_dir =
18
+ log_dir =
19
+
20
+ [user_args]
21
+
22
+ [roles]
23
+ workers = 1
24
+ worker.memory = 1920000
25
+ worker.vcore = 128
26
+ worker.gcoresh800-80g = 8
27
+ worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_tempflow_multi.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_tempflow_hps_clip.py 1 8
28
+
29
+ worker.ports = 1
30
+
31
+ [am]
32
+ afo.app.am.resource.mb = 4096
33
+
34
+ [tensorboard]
35
+ with.tensor.board = false
36
+
37
+ [docker]
38
+ afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
39
+
40
+
41
+ [data]
42
+ afo.data.prefetch = false
43
+
44
+ [failover]
45
+ afo.app.support.engine.failover = true
46
+
47
+ [conda]
48
+ afo.conda.env.name =
49
+ afo.conda.env.path =
50
+ afo.conda.store.type =
51
+
52
+ [distribute]
53
+ afo.role.worker.gpu_driver_version = 470.103.01
54
+
55
+ [others]
56
+ afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
57
+ afo.xm.notice.receivers.account = zhangshengjun02
58
+ with_requirements = false
59
+ afo.app.yarn.allocate.timeout.seconds = 3600000
60
+ afo.app.blacklist.fail_times = 16
61
+ #afo.role.worker.task.attempt.max.retry = 16
62
+ afo.role.worker.task.attempt.max.retry = 1
63
+ afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
64
+ afo.use.hdfs.fuse=true
65
+ afo.use.hdfs.fuse.subpath=:/mnt/hdfs
66
+ afo.use.hdfs.fuse.readonly=false
67
+ afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
68
+ afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt