Add files using upload-large-folder tool
Browse files- data/qwenimage_rl_embeddings/prompt_embed/27214.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/27548.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/28269.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/35020.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/51199.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/70606.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/76836.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/80700.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/81549.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/94872.pt +3 -0
- data/qwenimage_rl_embeddings/prompt_embed/98552.pt +3 -0
- hope/finetune_intervalstep.hope +68 -0
- hope/finetune_mergestep.sh +97 -0
- hope/finetune_mergestep_multi.sh +99 -0
- hope/finetune_mergestep_multi_v2.hope +68 -0
- hope/finetune_multistep.hope +68 -0
- hope/finetune_rlpt.hope +68 -0
- hope/finetune_rlpt.sh +97 -0
- hope/finetune_rlpt_from_noise.hope +68 -0
- hope/finetune_tempflow_multi.hope +68 -0
data/qwenimage_rl_embeddings/prompt_embed/27214.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0e6e1d92bf4691cf95d4000bf0a6cb6112ce5943b629f2deb6f25494a81feb31
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/27548.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:55f9e80c9788f581a087b97b27e38426c69d39b03c088503a52b9ba1582f398e
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/28269.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ec71dd4914a29f331f0e4e25ba03ea8992f411797e6224f83def3aa69e7789ce
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/35020.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a6604247e246bdc9e66602d3db3b4611b38733c07362894220e78659a5357afb
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/51199.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4bb38e0af3d106c6988c980db6ed16f8b91602966a3edfc7adb0db2afc153685
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/70606.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a06b6d21c0cd9e05a829ac4ec4a1151f148c10663df240568b1b0d6c5d2ec504
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/76836.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f38a891cb79fc0386d497068c2d1dcda5b8607d50c35f46c3b6f37f1a4012d54
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/80700.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bce0abc6601a8b348551e7f766fb0f5136ec991eb9364681331f98588ae01474
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/81549.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:235d43ccd91e1a4327a120a814cc52566fb1d8502c37d703a50229c97b78feaf
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/94872.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:13e8c6013dc5c3fc477936ff6b8708dd110c207087ab629db02e52a84d81e529
|
| 3 |
+
size 7341531
|
data/qwenimage_rl_embeddings/prompt_embed/98552.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a84711584506cfb03f7c3584af40c10dc3583814df7428110090b46143faf2b9
|
| 3 |
+
size 7341531
|
hope/finetune_intervalstep.hope
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[base]
|
| 2 |
+
type = ml-vision
|
| 3 |
+
|
| 4 |
+
[resource]
|
| 5 |
+
usergroup = hadoop-camera3d
|
| 6 |
+
queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
|
| 7 |
+
|
| 8 |
+
[dataset]
|
| 9 |
+
dataset_name =
|
| 10 |
+
dataset_type =
|
| 11 |
+
dataset_path =
|
| 12 |
+
|
| 13 |
+
[job_track]
|
| 14 |
+
demand_id = 91369190
|
| 15 |
+
upstream_jobid =
|
| 16 |
+
input_dir =
|
| 17 |
+
output_dir =
|
| 18 |
+
log_dir =
|
| 19 |
+
|
| 20 |
+
[user_args]
|
| 21 |
+
|
| 22 |
+
[roles]
|
| 23 |
+
workers = 2
|
| 24 |
+
worker.memory = 1920000
|
| 25 |
+
worker.vcore = 128
|
| 26 |
+
worker.gcoresh800-80g = 8
|
| 27 |
+
worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_intervalstep.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_hps.py 2 8
|
| 28 |
+
|
| 29 |
+
worker.ports = 1
|
| 30 |
+
|
| 31 |
+
[am]
|
| 32 |
+
afo.app.am.resource.mb = 4096
|
| 33 |
+
|
| 34 |
+
[tensorboard]
|
| 35 |
+
with.tensor.board = false
|
| 36 |
+
|
| 37 |
+
[docker]
|
| 38 |
+
afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
[data]
|
| 42 |
+
afo.data.prefetch = false
|
| 43 |
+
|
| 44 |
+
[failover]
|
| 45 |
+
afo.app.support.engine.failover = true
|
| 46 |
+
|
| 47 |
+
[conda]
|
| 48 |
+
afo.conda.env.name =
|
| 49 |
+
afo.conda.env.path =
|
| 50 |
+
afo.conda.store.type =
|
| 51 |
+
|
| 52 |
+
[distribute]
|
| 53 |
+
afo.role.worker.gpu_driver_version = 470.103.01
|
| 54 |
+
|
| 55 |
+
[others]
|
| 56 |
+
afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
|
| 57 |
+
afo.xm.notice.receivers.account = zhangshengjun02
|
| 58 |
+
with_requirements = false
|
| 59 |
+
afo.app.yarn.allocate.timeout.seconds = 3600000
|
| 60 |
+
afo.app.blacklist.fail_times = 16
|
| 61 |
+
#afo.role.worker.task.attempt.max.retry = 16
|
| 62 |
+
afo.role.worker.task.attempt.max.retry = 1
|
| 63 |
+
afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
|
| 64 |
+
afo.use.hdfs.fuse=true
|
| 65 |
+
afo.use.hdfs.fuse.subpath=:/mnt/hdfs
|
| 66 |
+
afo.use.hdfs.fuse.readonly=false
|
| 67 |
+
afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
|
| 68 |
+
afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
|
hope/finetune_mergestep.sh
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# cluster_spec='{"am":["psx2s7cxrbvmlcvk-am-0.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local"],"index":"0","role":"worker","worker":["psx2s7cxrbvmlcvk-worker-0.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local:3400","psx2s7cxrbvmlcvk-worker-1.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local:3400"]}'
|
| 3 |
+
# echo "cluster spec is $cluster_spec"
|
| 4 |
+
WORK_DIR=$1
|
| 5 |
+
PYTHON_BIN=$2
|
| 6 |
+
SCRIPT=$3
|
| 7 |
+
NNODES=$4
|
| 8 |
+
NPROC_PER_NODE=$5
|
| 9 |
+
|
| 10 |
+
echo "WORK_DIR is $WORK_DIR"
|
| 11 |
+
echo "PYTHON_BIN is $PYTHON_BIN"
|
| 12 |
+
echo "SCRIPT is $SCRIPT"
|
| 13 |
+
echo "NNODES is $NNODES"
|
| 14 |
+
echo "NPROC_PER_NODE is $NPROC_PER_NODE"
|
| 15 |
+
|
| 16 |
+
PORT=${PORT:-29509}
|
| 17 |
+
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
|
| 18 |
+
|
| 19 |
+
cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
|
| 20 |
+
echo "cluster spec is $cluster_spec"
|
| 21 |
+
# Assuming worker_list contains the JSON string (it's already been parsed)
|
| 22 |
+
worker_list_command="import json_parser; data = json_parser.parse('$cluster_spec'); print(data['worker'])"
|
| 23 |
+
worker_list=$($PYTHON_BIN -c "$worker_list_command")
|
| 24 |
+
|
| 25 |
+
# Remove the square brackets and quotes from worker_list
|
| 26 |
+
worker_list_cleaned=$(echo $worker_list | tr -d '[]' | tr -d "'")
|
| 27 |
+
|
| 28 |
+
# Convert the cleaned worker list into an array by splitting by commas
|
| 29 |
+
worker_strs=($(echo $worker_list_cleaned | tr ',' '\n'))
|
| 30 |
+
|
| 31 |
+
# Extract the master (first worker)
|
| 32 |
+
master=${worker_strs[0]}
|
| 33 |
+
|
| 34 |
+
# Extract master address and port
|
| 35 |
+
master_addr=$(echo $master | cut -d ':' -f1)
|
| 36 |
+
master_port=$(echo $master | cut -d ':' -f2)
|
| 37 |
+
|
| 38 |
+
# Output the master information without brackets and quotes
|
| 39 |
+
echo "worker list is $worker_list_cleaned"
|
| 40 |
+
echo "master is $master"
|
| 41 |
+
echo "master address is $master_addr"
|
| 42 |
+
echo "master port is $master_port"
|
| 43 |
+
|
| 44 |
+
worker_list_command="import json_parser; data = json_parser.parse('$cluster_spec'); print(data['index'])"
|
| 45 |
+
node_rank=$($PYTHON_BIN -c "$worker_list_command")
|
| 46 |
+
echo "node rank is $node_rank"
|
| 47 |
+
dist_url="tcp://$master_addr:$master_port"
|
| 48 |
+
echo "dist url is $dist_url"
|
| 49 |
+
|
| 50 |
+
export TOKENIZERS_PARALLELISM=false
|
| 51 |
+
export OMP_NUM_THREADS=1
|
| 52 |
+
export NCCL_DEBUG=INFO
|
| 53 |
+
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
|
| 54 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 55 |
+
export TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1
|
| 56 |
+
|
| 57 |
+
### launch with DDP (multi-machines-multi-gpus)
|
| 58 |
+
source scl_source enable devtoolset-7
|
| 59 |
+
ifconfig
|
| 60 |
+
cd $WORK_DIR=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/json_parse_test.sh
|
| 61 |
+
$PYTHON_BIN -m torch.distributed.run \
|
| 62 |
+
--nnodes=$NNODES --nproc_per_node=$NPROC_PER_NODE --node_rank=$node_rank --master_addr=$master_addr --master_port=$PORT \
|
| 63 |
+
$SCRIPT \
|
| 64 |
+
--seed 42 \
|
| 65 |
+
--pretrained_model_name_or_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/flux \
|
| 66 |
+
--hps_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/hps/HPS_v2.1_compressed.pt \
|
| 67 |
+
--hps_clip_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin \
|
| 68 |
+
--data_json_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/rl_embeddings/videos2caption.json \
|
| 69 |
+
--gradient_checkpointing \
|
| 70 |
+
--train_batch_size 1 \
|
| 71 |
+
--num_latent_t 1 \
|
| 72 |
+
--sp_size 1 \
|
| 73 |
+
--train_sp_batch_size 1 \
|
| 74 |
+
--dataloader_num_workers 4 \
|
| 75 |
+
--max_train_steps 301 \
|
| 76 |
+
--learning_rate 2e-6 \
|
| 77 |
+
--mixed_precision bf16 \
|
| 78 |
+
--checkpointing_steps 50 \
|
| 79 |
+
--cfg 0.0 \
|
| 80 |
+
--output_dir /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/save_exp/hps_merge_step_2_0 \
|
| 81 |
+
--h 1024 \
|
| 82 |
+
--w 1024 \
|
| 83 |
+
--t 1 \
|
| 84 |
+
--sampling_steps 16 \
|
| 85 |
+
--eta 0.7 \
|
| 86 |
+
--lr_warmup_steps 0 \
|
| 87 |
+
--sampler_seed 1223627 \
|
| 88 |
+
--max_grad_norm 1.0 \
|
| 89 |
+
--weight_decay 0.0001 \
|
| 90 |
+
--num_generations 12 \
|
| 91 |
+
--shift 3 \
|
| 92 |
+
--init_same_noise \
|
| 93 |
+
--clip_range 1e-4 \
|
| 94 |
+
--adv_clip_max 5.0 \
|
| 95 |
+
--eta_step_list 0 1 2 3 4 5 6 7 \
|
| 96 |
+
--eta_step_merge_list 1 1 1 2 2 2 3 3 \
|
| 97 |
+
--granular_list 1 \
|
hope/finetune_mergestep_multi.sh
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# cluster_spec='{"am":["psx2s7cxrbvmlcvk-am-0.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local"],"index":"0","role":"worker","worker":["psx2s7cxrbvmlcvk-worker-0.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local:3400","psx2s7cxrbvmlcvk-worker-1.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local:3400"]}'
|
| 3 |
+
# echo "cluster spec is $cluster_spec"
|
| 4 |
+
WORK_DIR=$1
|
| 5 |
+
PYTHON_BIN=$2
|
| 6 |
+
SCRIPT=$3
|
| 7 |
+
NNODES=$4
|
| 8 |
+
NPROC_PER_NODE=$5
|
| 9 |
+
|
| 10 |
+
echo "WORK_DIR is $WORK_DIR"
|
| 11 |
+
echo "PYTHON_BIN is $PYTHON_BIN"
|
| 12 |
+
echo "SCRIPT is $SCRIPT"
|
| 13 |
+
echo "NNODES is $NNODES"
|
| 14 |
+
echo "NPROC_PER_NODE is $NPROC_PER_NODE"
|
| 15 |
+
|
| 16 |
+
PORT=${PORT:-29509}
|
| 17 |
+
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
|
| 18 |
+
|
| 19 |
+
cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
|
| 20 |
+
echo "cluster spec is $cluster_spec"
|
| 21 |
+
# Assuming worker_list contains the JSON string (it's already been parsed)
|
| 22 |
+
worker_list_command="import json_parser; data = json_parser.parse('$cluster_spec'); print(data['worker'])"
|
| 23 |
+
worker_list=$($PYTHON_BIN -c "$worker_list_command")
|
| 24 |
+
|
| 25 |
+
# Remove the square brackets and quotes from worker_list
|
| 26 |
+
worker_list_cleaned=$(echo $worker_list | tr -d '[]' | tr -d "'")
|
| 27 |
+
|
| 28 |
+
# Convert the cleaned worker list into an array by splitting by commas
|
| 29 |
+
worker_strs=($(echo $worker_list_cleaned | tr ',' '\n'))
|
| 30 |
+
|
| 31 |
+
# Extract the master (first worker)
|
| 32 |
+
master=${worker_strs[0]}
|
| 33 |
+
|
| 34 |
+
# Extract master address and port
|
| 35 |
+
master_addr=$(echo $master | cut -d ':' -f1)
|
| 36 |
+
master_port=$(echo $master | cut -d ':' -f2)
|
| 37 |
+
|
| 38 |
+
# Output the master information without brackets and quotes
|
| 39 |
+
echo "worker list is $worker_list_cleaned"
|
| 40 |
+
echo "master is $master"
|
| 41 |
+
echo "master address is $master_addr"
|
| 42 |
+
echo "master port is $master_port"
|
| 43 |
+
|
| 44 |
+
worker_list_command="import json_parser; data = json_parser.parse('$cluster_spec'); print(data['index'])"
|
| 45 |
+
node_rank=$($PYTHON_BIN -c "$worker_list_command")
|
| 46 |
+
echo "node rank is $node_rank"
|
| 47 |
+
dist_url="tcp://$master_addr:$master_port"
|
| 48 |
+
echo "dist url is $dist_url"
|
| 49 |
+
|
| 50 |
+
export TOKENIZERS_PARALLELISM=false
|
| 51 |
+
export OMP_NUM_THREADS=1
|
| 52 |
+
export NCCL_DEBUG=INFO
|
| 53 |
+
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
|
| 54 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 55 |
+
export TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1
|
| 56 |
+
|
| 57 |
+
### launch with DDP (multi-machines-multi-gpus)
|
| 58 |
+
source scl_source enable devtoolset-7
|
| 59 |
+
ifconfig
|
| 60 |
+
cd $WORK_DIR=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO
|
| 61 |
+
$PYTHON_BIN -m torch.distributed.run \
|
| 62 |
+
--nnodes=$NNODES --nproc_per_node=$NPROC_PER_NODE --node_rank=$node_rank --master_addr=$master_addr --master_port=$PORT \
|
| 63 |
+
$SCRIPT \
|
| 64 |
+
--seed 42 \
|
| 65 |
+
--pretrained_model_name_or_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/flux \
|
| 66 |
+
--resume_ckpt /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/save_exp/hps_clip_merge_step/ckpt/checkpoint-200-0 \
|
| 67 |
+
--hps_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/hps/HPS_v2.1_compressed.pt \
|
| 68 |
+
--hps_clip_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin \
|
| 69 |
+
--clip_score_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/clip_score \
|
| 70 |
+
--data_json_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/rl_embeddings/videos2caption.json \
|
| 71 |
+
--train_batch_size 1 \
|
| 72 |
+
--num_latent_t 1 \
|
| 73 |
+
--sp_size 1 \
|
| 74 |
+
--train_sp_batch_size 1 \
|
| 75 |
+
--dataloader_num_workers 4 \
|
| 76 |
+
--max_train_steps 401 \
|
| 77 |
+
--init_steps 200 \
|
| 78 |
+
--learning_rate 2e-6 \
|
| 79 |
+
--mixed_precision bf16 \
|
| 80 |
+
--checkpointing_steps 10 \
|
| 81 |
+
--cfg 0.0 \
|
| 82 |
+
--output_dir /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/save_exp/hps_clip_merge_resume_200 \
|
| 83 |
+
--h 1024 \
|
| 84 |
+
--w 1024 \
|
| 85 |
+
--t 1 \
|
| 86 |
+
--sampling_steps 16 \
|
| 87 |
+
--eta 0.7 \
|
| 88 |
+
--lr_warmup_steps 0 \
|
| 89 |
+
--sampler_seed 1223627 \
|
| 90 |
+
--max_grad_norm 1.0 \
|
| 91 |
+
--weight_decay 0.0001 \
|
| 92 |
+
--num_generations 12 \
|
| 93 |
+
--shift 3 \
|
| 94 |
+
--init_same_noise \
|
| 95 |
+
--clip_range 1e-4 \
|
| 96 |
+
--adv_clip_max 5.0 \
|
| 97 |
+
--eta_step_list 0 1 2 3 4 5 6 7 \
|
| 98 |
+
--eta_step_merge_list 1 1 1 2 2 2 3 3 \
|
| 99 |
+
--granular_list 1 \
|
hope/finetune_mergestep_multi_v2.hope
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[base]
|
| 2 |
+
type = ml-vision
|
| 3 |
+
|
| 4 |
+
[resource]
|
| 5 |
+
usergroup = hadoop-camera3d
|
| 6 |
+
queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
|
| 7 |
+
|
| 8 |
+
[dataset]
|
| 9 |
+
dataset_name =
|
| 10 |
+
dataset_type =
|
| 11 |
+
dataset_path =
|
| 12 |
+
|
| 13 |
+
[job_track]
|
| 14 |
+
demand_id = 91369190
|
| 15 |
+
upstream_jobid =
|
| 16 |
+
input_dir =
|
| 17 |
+
output_dir =
|
| 18 |
+
log_dir =
|
| 19 |
+
|
| 20 |
+
[user_args]
|
| 21 |
+
|
| 22 |
+
[roles]
|
| 23 |
+
workers = 4
|
| 24 |
+
worker.memory = 1920000
|
| 25 |
+
worker.vcore = 128
|
| 26 |
+
worker.gcoresh800-80g = 8
|
| 27 |
+
worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_mergestep_multi_v2.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_hps_clip_merge.py 4 8
|
| 28 |
+
|
| 29 |
+
worker.ports = 1
|
| 30 |
+
|
| 31 |
+
[am]
|
| 32 |
+
afo.app.am.resource.mb = 4096
|
| 33 |
+
|
| 34 |
+
[tensorboard]
|
| 35 |
+
with.tensor.board = false
|
| 36 |
+
|
| 37 |
+
[docker]
|
| 38 |
+
afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
[data]
|
| 42 |
+
afo.data.prefetch = false
|
| 43 |
+
|
| 44 |
+
[failover]
|
| 45 |
+
afo.app.support.engine.failover = true
|
| 46 |
+
|
| 47 |
+
[conda]
|
| 48 |
+
afo.conda.env.name =
|
| 49 |
+
afo.conda.env.path =
|
| 50 |
+
afo.conda.store.type =
|
| 51 |
+
|
| 52 |
+
[distribute]
|
| 53 |
+
afo.role.worker.gpu_driver_version = 470.103.01
|
| 54 |
+
|
| 55 |
+
[others]
|
| 56 |
+
afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
|
| 57 |
+
afo.xm.notice.receivers.account = zhangshengjun02
|
| 58 |
+
with_requirements = false
|
| 59 |
+
afo.app.yarn.allocate.timeout.seconds = 3600000
|
| 60 |
+
afo.app.blacklist.fail_times = 16
|
| 61 |
+
#afo.role.worker.task.attempt.max.retry = 16
|
| 62 |
+
afo.role.worker.task.attempt.max.retry = 1
|
| 63 |
+
afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
|
| 64 |
+
afo.use.hdfs.fuse=true
|
| 65 |
+
afo.use.hdfs.fuse.subpath=:/mnt/hdfs
|
| 66 |
+
afo.use.hdfs.fuse.readonly=false
|
| 67 |
+
afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
|
| 68 |
+
afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
|
hope/finetune_multistep.hope
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[base]
|
| 2 |
+
type = ml-vision
|
| 3 |
+
|
| 4 |
+
[resource]
|
| 5 |
+
usergroup = hadoop-camera3d
|
| 6 |
+
queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
|
| 7 |
+
|
| 8 |
+
[dataset]
|
| 9 |
+
dataset_name =
|
| 10 |
+
dataset_type =
|
| 11 |
+
dataset_path =
|
| 12 |
+
|
| 13 |
+
[job_track]
|
| 14 |
+
demand_id = 91369190
|
| 15 |
+
upstream_jobid =
|
| 16 |
+
input_dir =
|
| 17 |
+
output_dir =
|
| 18 |
+
log_dir =
|
| 19 |
+
|
| 20 |
+
[user_args]
|
| 21 |
+
|
| 22 |
+
[roles]
|
| 23 |
+
workers = 2
|
| 24 |
+
worker.memory = 1920000
|
| 25 |
+
worker.vcore = 128
|
| 26 |
+
worker.gcoresh800-80g = 8
|
| 27 |
+
worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/json_parse_test.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_hps.py 2 8
|
| 28 |
+
|
| 29 |
+
worker.ports = 1
|
| 30 |
+
|
| 31 |
+
[am]
|
| 32 |
+
afo.app.am.resource.mb = 4096
|
| 33 |
+
|
| 34 |
+
[tensorboard]
|
| 35 |
+
with.tensor.board = false
|
| 36 |
+
|
| 37 |
+
[docker]
|
| 38 |
+
afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
[data]
|
| 42 |
+
afo.data.prefetch = false
|
| 43 |
+
|
| 44 |
+
[failover]
|
| 45 |
+
afo.app.support.engine.failover = true
|
| 46 |
+
|
| 47 |
+
[conda]
|
| 48 |
+
afo.conda.env.name =
|
| 49 |
+
afo.conda.env.path =
|
| 50 |
+
afo.conda.store.type =
|
| 51 |
+
|
| 52 |
+
[distribute]
|
| 53 |
+
afo.role.worker.gpu_driver_version = 470.103.01
|
| 54 |
+
|
| 55 |
+
[others]
|
| 56 |
+
afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
|
| 57 |
+
afo.xm.notice.receivers.account = zhangshengjun02
|
| 58 |
+
with_requirements = false
|
| 59 |
+
afo.app.yarn.allocate.timeout.seconds = 3600000
|
| 60 |
+
afo.app.blacklist.fail_times = 16
|
| 61 |
+
#afo.role.worker.task.attempt.max.retry = 16
|
| 62 |
+
afo.role.worker.task.attempt.max.retry = 1
|
| 63 |
+
afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
|
| 64 |
+
afo.use.hdfs.fuse=true
|
| 65 |
+
afo.use.hdfs.fuse.subpath=:/mnt/hdfs
|
| 66 |
+
afo.use.hdfs.fuse.readonly=false
|
| 67 |
+
afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
|
| 68 |
+
afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
|
hope/finetune_rlpt.hope
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[base]
|
| 2 |
+
type = ml-vision
|
| 3 |
+
|
| 4 |
+
[resource]
|
| 5 |
+
usergroup = hadoop-camera3d
|
| 6 |
+
queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
|
| 7 |
+
|
| 8 |
+
[dataset]
|
| 9 |
+
dataset_name =
|
| 10 |
+
dataset_type =
|
| 11 |
+
dataset_path =
|
| 12 |
+
|
| 13 |
+
[job_track]
|
| 14 |
+
demand_id = 91369190
|
| 15 |
+
upstream_jobid =
|
| 16 |
+
input_dir =
|
| 17 |
+
output_dir =
|
| 18 |
+
log_dir =
|
| 19 |
+
|
| 20 |
+
[user_args]
|
| 21 |
+
|
| 22 |
+
[roles]
|
| 23 |
+
workers = 4
|
| 24 |
+
worker.memory = 1920000
|
| 25 |
+
worker.vcore = 128
|
| 26 |
+
worker.gcoresh800-80g = 8
|
| 27 |
+
worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_rlpt.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_rlpt_dino.py 4 8
|
| 28 |
+
|
| 29 |
+
worker.ports = 1
|
| 30 |
+
|
| 31 |
+
[am]
|
| 32 |
+
afo.app.am.resource.mb = 4096
|
| 33 |
+
|
| 34 |
+
[tensorboard]
|
| 35 |
+
with.tensor.board = false
|
| 36 |
+
|
| 37 |
+
[docker]
|
| 38 |
+
afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
[data]
|
| 42 |
+
afo.data.prefetch = false
|
| 43 |
+
|
| 44 |
+
[failover]
|
| 45 |
+
afo.app.support.engine.failover = true
|
| 46 |
+
|
| 47 |
+
[conda]
|
| 48 |
+
afo.conda.env.name =
|
| 49 |
+
afo.conda.env.path =
|
| 50 |
+
afo.conda.store.type =
|
| 51 |
+
|
| 52 |
+
[distribute]
|
| 53 |
+
afo.role.worker.gpu_driver_version = 470.103.01
|
| 54 |
+
|
| 55 |
+
[others]
|
| 56 |
+
afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
|
| 57 |
+
afo.xm.notice.receivers.account = zhangshengjun02
|
| 58 |
+
with_requirements = false
|
| 59 |
+
afo.app.yarn.allocate.timeout.seconds = 3600000
|
| 60 |
+
afo.app.blacklist.fail_times = 16
|
| 61 |
+
#afo.role.worker.task.attempt.max.retry = 16
|
| 62 |
+
afo.role.worker.task.attempt.max.retry = 1
|
| 63 |
+
afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
|
| 64 |
+
afo.use.hdfs.fuse=true
|
| 65 |
+
afo.use.hdfs.fuse.subpath=:/mnt/hdfs
|
| 66 |
+
afo.use.hdfs.fuse.readonly=false
|
| 67 |
+
afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
|
| 68 |
+
afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
|
hope/finetune_rlpt.sh
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# cluster_spec='{"am":["psx2s7cxrbvmlcvk-am-0.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local"],"index":"0","role":"worker","worker":["psx2s7cxrbvmlcvk-worker-0.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local:3400","psx2s7cxrbvmlcvk-worker-1.psx2s7cxrbvmlcvk.hadoop-aipnlp.svc.cluster.local:3400"]}'
|
| 3 |
+
# echo "cluster spec is $cluster_spec"
|
| 4 |
+
WORK_DIR=$1
|
| 5 |
+
PYTHON_BIN=$2
|
| 6 |
+
SCRIPT=$3
|
| 7 |
+
NNODES=$4
|
| 8 |
+
NPROC_PER_NODE=$5
|
| 9 |
+
|
| 10 |
+
echo "WORK_DIR is $WORK_DIR"
|
| 11 |
+
echo "PYTHON_BIN is $PYTHON_BIN"
|
| 12 |
+
echo "SCRIPT is $SCRIPT"
|
| 13 |
+
echo "NNODES is $NNODES"
|
| 14 |
+
echo "NPROC_PER_NODE is $NPROC_PER_NODE"
|
| 15 |
+
|
| 16 |
+
PORT=${PORT:-29509}
|
| 17 |
+
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
|
| 18 |
+
|
| 19 |
+
cluster_spec=${AFO_ENV_CLUSTER_SPEC//\"/\\\"}
|
| 20 |
+
echo "cluster spec is $cluster_spec"
|
| 21 |
+
# Assuming worker_list contains the JSON string (it's already been parsed)
|
| 22 |
+
worker_list_command="import json_parser; data = json_parser.parse('$cluster_spec'); print(data['worker'])"
|
| 23 |
+
worker_list=$($PYTHON_BIN -c "$worker_list_command")
|
| 24 |
+
|
| 25 |
+
# Remove the square brackets and quotes from worker_list
|
| 26 |
+
worker_list_cleaned=$(echo $worker_list | tr -d '[]' | tr -d "'")
|
| 27 |
+
|
| 28 |
+
# Convert the cleaned worker list into an array by splitting by commas
|
| 29 |
+
worker_strs=($(echo $worker_list_cleaned | tr ',' '\n'))
|
| 30 |
+
|
| 31 |
+
# Extract the master (first worker)
|
| 32 |
+
master=${worker_strs[0]}
|
| 33 |
+
|
| 34 |
+
# Extract master address and port
|
| 35 |
+
master_addr=$(echo $master | cut -d ':' -f1)
|
| 36 |
+
master_port=$(echo $master | cut -d ':' -f2)
|
| 37 |
+
|
| 38 |
+
# Output the master information without brackets and quotes
|
| 39 |
+
echo "worker list is $worker_list_cleaned"
|
| 40 |
+
echo "master is $master"
|
| 41 |
+
echo "master address is $master_addr"
|
| 42 |
+
echo "master port is $master_port"
|
| 43 |
+
|
| 44 |
+
worker_list_command="import json_parser; data = json_parser.parse('$cluster_spec'); print(data['index'])"
|
| 45 |
+
node_rank=$($PYTHON_BIN -c "$worker_list_command")
|
| 46 |
+
echo "node rank is $node_rank"
|
| 47 |
+
dist_url="tcp://$master_addr:$master_port"
|
| 48 |
+
echo "dist url is $dist_url"
|
| 49 |
+
|
| 50 |
+
export TOKENIZERS_PARALLELISM=false
|
| 51 |
+
export OMP_NUM_THREADS=1
|
| 52 |
+
export NCCL_DEBUG=INFO
|
| 53 |
+
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
|
| 54 |
+
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
|
| 55 |
+
export TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD=1
|
| 56 |
+
|
| 57 |
+
### launch with DDP (multi-machines-multi-gpus)
|
| 58 |
+
source scl_source enable devtoolset-7
|
| 59 |
+
ifconfig
|
| 60 |
+
WORK_DIR=/mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO
cd $WORK_DIR
|
| 61 |
+
$PYTHON_BIN -m torch.distributed.run \
|
| 62 |
+
--nnodes=$NNODES --nproc_per_node=$NPROC_PER_NODE --node_rank=$node_rank --master_addr=$master_addr --master_port=$PORT \
|
| 63 |
+
$SCRIPT \
|
| 64 |
+
--seed 42 \
|
| 65 |
+
--pretrained_model_name_or_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/flux \
|
| 66 |
+
--hps_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/hps/HPS_v2.1_compressed.pt \
|
| 67 |
+
--hps_clip_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/ckpt/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin \
|
| 68 |
+
--dino_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/dinov2 \
|
| 69 |
+
--data_json_path /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/datasets/flux_rl_embeddings/videos2caption.json \
|
| 70 |
+
--gradient_checkpointing \
|
| 71 |
+
--train_batch_size 1 \
|
| 72 |
+
--num_latent_t 1 \
|
| 73 |
+
--sp_size 1 \
|
| 74 |
+
--train_sp_batch_size 1 \
|
| 75 |
+
--dataloader_num_workers 4 \
|
| 76 |
+
--max_train_steps 151 \
|
| 77 |
+
--learning_rate 2e-6 \
|
| 78 |
+
--mixed_precision bf16 \
|
| 79 |
+
--checkpointing_steps 30 \
|
| 80 |
+
--cfg 0.0 \
|
| 81 |
+
--output_dir /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/checkpoints/G2RPO/save_exp_rlpt/dino_gt_1022 \
|
| 82 |
+
--h 1024 \
|
| 83 |
+
--w 1024 \
|
| 84 |
+
--t 1 \
|
| 85 |
+
--sampling_steps 16 \
|
| 86 |
+
--eta 0.7 \
|
| 87 |
+
--lr_warmup_steps 0 \
|
| 88 |
+
--sampler_seed 1223627 \
|
| 89 |
+
--max_grad_norm 1.0 \
|
| 90 |
+
--weight_decay 0.0001 \
|
| 91 |
+
--num_generations 12 \
|
| 92 |
+
--shift 3 \
|
| 93 |
+
--init_same_noise \
|
| 94 |
+
--clip_range 1e-4 \
|
| 95 |
+
--adv_clip_max 5.0 \
|
| 96 |
+
--eta_step_list 0 1 2 3 4 5 6 7 \
|
| 97 |
+
--granular_list 1
|
hope/finetune_rlpt_from_noise.hope
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[base]
|
| 2 |
+
type = ml-vision
|
| 3 |
+
|
| 4 |
+
[resource]
|
| 5 |
+
usergroup = hadoop-camera3d
|
| 6 |
+
queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
|
| 7 |
+
|
| 8 |
+
[dataset]
|
| 9 |
+
dataset_name =
|
| 10 |
+
dataset_type =
|
| 11 |
+
dataset_path =
|
| 12 |
+
|
| 13 |
+
[job_track]
|
| 14 |
+
demand_id = 91369190
|
| 15 |
+
upstream_jobid =
|
| 16 |
+
input_dir =
|
| 17 |
+
output_dir =
|
| 18 |
+
log_dir =
|
| 19 |
+
|
| 20 |
+
[user_args]
|
| 21 |
+
|
| 22 |
+
[roles]
|
| 23 |
+
workers = 4
|
| 24 |
+
worker.memory = 1920000
|
| 25 |
+
worker.vcore = 128
|
| 26 |
+
worker.gcoresh800-80g = 8
|
| 27 |
+
worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_rlpt_from_noise.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_g2rpo_rlpt_from_noise.py 4 8
|
| 28 |
+
|
| 29 |
+
worker.ports = 1
|
| 30 |
+
|
| 31 |
+
[am]
|
| 32 |
+
afo.app.am.resource.mb = 4096
|
| 33 |
+
|
| 34 |
+
[tensorboard]
|
| 35 |
+
with.tensor.board = false
|
| 36 |
+
|
| 37 |
+
[docker]
|
| 38 |
+
afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
[data]
|
| 42 |
+
afo.data.prefetch = false
|
| 43 |
+
|
| 44 |
+
[failover]
|
| 45 |
+
afo.app.support.engine.failover = true
|
| 46 |
+
|
| 47 |
+
[conda]
|
| 48 |
+
afo.conda.env.name =
|
| 49 |
+
afo.conda.env.path =
|
| 50 |
+
afo.conda.store.type =
|
| 51 |
+
|
| 52 |
+
[distribute]
|
| 53 |
+
afo.role.worker.gpu_driver_version = 470.103.01
|
| 54 |
+
|
| 55 |
+
[others]
|
| 56 |
+
afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
|
| 57 |
+
afo.xm.notice.receivers.account = zhangshengjun02
|
| 58 |
+
with_requirements = false
|
| 59 |
+
afo.app.yarn.allocate.timeout.seconds = 3600000
|
| 60 |
+
afo.app.blacklist.fail_times = 16
|
| 61 |
+
#afo.role.worker.task.attempt.max.retry = 16
|
| 62 |
+
afo.role.worker.task.attempt.max.retry = 1
|
| 63 |
+
afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
|
| 64 |
+
afo.use.hdfs.fuse = true
|
| 65 |
+
afo.use.hdfs.fuse.subpath = :/mnt/hdfs
|
| 66 |
+
afo.use.hdfs.fuse.readonly = false
|
| 67 |
+
afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
|
| 68 |
+
afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
|
hope/finetune_tempflow_multi.hope
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[base]
|
| 2 |
+
type = ml-vision
|
| 3 |
+
|
| 4 |
+
[resource]
|
| 5 |
+
usergroup = hadoop-camera3d
|
| 6 |
+
queue = root.hldy_training_cluster.hadoop-aipnlp.h800_vi_sp
|
| 7 |
+
|
| 8 |
+
[dataset]
|
| 9 |
+
dataset_name =
|
| 10 |
+
dataset_type =
|
| 11 |
+
dataset_path =
|
| 12 |
+
|
| 13 |
+
[job_track]
|
| 14 |
+
demand_id = 91369190
|
| 15 |
+
upstream_jobid =
|
| 16 |
+
input_dir =
|
| 17 |
+
output_dir =
|
| 18 |
+
log_dir =
|
| 19 |
+
|
| 20 |
+
[user_args]
|
| 21 |
+
|
| 22 |
+
[roles]
|
| 23 |
+
workers = 1
|
| 24 |
+
worker.memory = 1920000
|
| 25 |
+
worker.vcore = 128
|
| 26 |
+
worker.gcoresh800-80g = 8
|
| 27 |
+
worker.script = sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/Granular-GRPO/hope/finetune_tempflow_multi.sh /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/DanceGRPO /mnt/dolphinfs/ssd_pool/docker/user/hadoop-videogen-hl/hadoop-camera3d/zhangshengjun/conda-envs/dancegrpo-v2/bin/python fastvideo/train_tempflow_hps_clip.py 1 8
|
| 28 |
+
|
| 29 |
+
worker.ports = 1
|
| 30 |
+
|
| 31 |
+
[am]
|
| 32 |
+
afo.app.am.resource.mb = 4096
|
| 33 |
+
|
| 34 |
+
[tensorboard]
|
| 35 |
+
with.tensor.board = false
|
| 36 |
+
|
| 37 |
+
[docker]
|
| 38 |
+
afo.docker.image.name = registryonline-hulk.sankuai.com/custom_prod/com.sankuai.data.hadoop.gpu/data-hadoop-camera3d_cuda12.4-nccl2.21.5-prod-10ab7b1d
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
[data]
|
| 42 |
+
afo.data.prefetch = false
|
| 43 |
+
|
| 44 |
+
[failover]
|
| 45 |
+
afo.app.support.engine.failover = true
|
| 46 |
+
|
| 47 |
+
[conda]
|
| 48 |
+
afo.conda.env.name =
|
| 49 |
+
afo.conda.env.path =
|
| 50 |
+
afo.conda.store.type =
|
| 51 |
+
|
| 52 |
+
[distribute]
|
| 53 |
+
afo.role.worker.gpu_driver_version = 470.103.01
|
| 54 |
+
|
| 55 |
+
[others]
|
| 56 |
+
afo.app.env.YARN_CONTAINER_RUNTIME_DOCKER_SHM_SIZE_BYTES = 640000000000
|
| 57 |
+
afo.xm.notice.receivers.account = zhangshengjun02
|
| 58 |
+
with_requirements = false
|
| 59 |
+
afo.app.yarn.allocate.timeout.seconds = 3600000
|
| 60 |
+
afo.app.blacklist.fail_times = 16
|
| 61 |
+
#afo.role.worker.task.attempt.max.retry = 16
|
| 62 |
+
afo.role.worker.task.attempt.max.retry = 1
|
| 63 |
+
afo.dolphinfs.otherusers = hadoop-videogen-hl,hadoop-imagen-hl:true,hadoop-vision-data:true
|
| 64 |
+
afo.use.hdfs.fuse = true
|
| 65 |
+
afo.use.hdfs.fuse.subpath = :/mnt/hdfs
|
| 66 |
+
afo.use.hdfs.fuse.readonly = false
|
| 67 |
+
afo.role.worker.not.node_name = hldy-data-k8s-gpu-h800-node0483.mt,hldy-data-k8s-gpu-h800-node0866.mt,hldy-data-k8s-gpu-h800-node0187.mt,hldy-data-k8s-gpu-h800-node0059.mt,hldy-data-k8s-gpu-h800-node0178.mt,hldy-data-k8s-gpu-h800-node0670.mt,hldy-data-k8s-gpu-h800-node0303.mt,hldy-data-k8s-gpu-h800-node0950.mt,hldy-data-k8s-gpu-h800-node0785.mt,hldy-data-k8s-gpu-h800-node0416.mt,hldy-data-k8s-gpu-h800-node0846.mt,hldy-data-k8s-gpu-h800-node0836.mt,hldy-data-k8s-gpu-h800-node0802.mt,hldy-data-k8s-gpu-h800-node0768.mt,hldy-data-k8s-gpu-h800-node1014.mt,hldy-data-k8s-gpu-h800-node0843.mt
|
| 68 |
+
afo.role.am.not.node_name = hlsc-data-k8s-node0187.mt
|