diff --git a/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml b/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ca8070b16293c0febbcceaf050adc9dc626de00 --- /dev/null +++ b/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml @@ -0,0 +1,36 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 224 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 48 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_224px.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml b/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1572d242729069413b3c61533e564befb37f3c49 --- /dev/null +++ b/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + 
TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + OPTIMIZER: + NAME: 'sgd' diff --git a/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml b/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6095ad8702fe71b93416b5ccc2d70cd798a076d6 --- /dev/null +++ b/VLMEvalKit_old/InternVL/classification/configs/attn_pooling_probing/attn_pooling_probing_intern_vit_6b_448px_v1_5_in1k_448_64gpu_imagenet_real.yaml @@ -0,0 +1,37 @@ +DATA: + IMG_ON_MEMORY: False + BATCH_SIZE: 16 # single GPU batch size + DATASET: 'imagenet-real' + TRANSFORM: 'build_transform_for_linear_probe' + DATA_PATH: './data/imagenet-1k' + IMG_SIZE: 448 +MODEL: + TYPE: intern_vit_6b + DROP_PATH_RATE: 0.0 + INTERN_VIT_6B: + FREEZE_VIT: True + PATCH_SIZE: 14 + PRETRAIN_SIZE: 448 + QKV_BIAS: False + EMBED_DIM: 3200 + NUM_HEADS: 25 + MLP_RATIO: 4 + INIT_VALUES: 0.1 + QK_NORMALIZATION: True + DEPTH: 45 + USE_FLASH_ATTN: True + PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_5.pth" + CLS_TARGET: 'attention_pooling' +TRAIN: + EMA: + ENABLE: True + DECAY: 0.998 + EPOCHS: 10 + WARMUP_EPOCHS: 1 + WEIGHT_DECAY: 0.0 + BASE_LR: 0.1 # 512 + WARMUP_LR: .0 + MIN_LR: .0 + LR_LAYER_DECAY: false + 
OPTIMIZER: + NAME: 'sgd' diff --git a/VLMEvalKit_old/InternVL/internvl_chat/examples/image1.jpg b/VLMEvalKit_old/InternVL/internvl_chat/examples/image1.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fd9891ef7e00774157a9dcd726b2ea9fa0c5ecff Binary files /dev/null and b/VLMEvalKit_old/InternVL/internvl_chat/examples/image1.jpg differ diff --git a/VLMEvalKit_old/InternVL/internvl_chat/examples/image3.jpg b/VLMEvalKit_old/InternVL/internvl_chat/examples/image3.jpg new file mode 100644 index 0000000000000000000000000000000000000000..972d5694d89803ac930ad6b34ba1bea7c7675fa1 Binary files /dev/null and b/VLMEvalKit_old/InternVL/internvl_chat/examples/image3.jpg differ diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/data/coco_caption.json b/VLMEvalKit_old/InternVL/internvl_chat/shell/data/coco_caption.json new file mode 100644 index 0000000000000000000000000000000000000000..86898235ae911bbf6d25913ac4f94e65ddc1d2f6 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/data/coco_caption.json @@ -0,0 +1,9 @@ +{ + "coco_karpathy_train_567k": { + "root": "data/coco/", + "annotation": "data/coco/annotations/coco_karpathy_train_567k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 566747 + } +} diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/data/internvl_1_2_finetune.json b/VLMEvalKit_old/InternVL/internvl_chat/shell/data/internvl_1_2_finetune.json new file mode 100644 index 0000000000000000000000000000000000000000..d98d69f8b80536a8d2010efd7a0d72b8543c84fc --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/data/internvl_1_2_finetune.json @@ -0,0 +1,65 @@ +{ + "sharegpt4v_instruct_gpt4-vision_cap100k": { + "root": "playground/data/", + "annotation": "playground/opensource/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 102025 + }, + "llava_instruct_150k_zh": { + "root": "playground/data/coco/", + "annotation": 
"playground/opensource/llava_instruct_150k_zh.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 157712 + }, + "sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k": { + "root": "playground/data/", + "annotation": "playground/opensource/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 665058 + }, + "dvqa_train_200k": { + "root": "playground/data/dvqa/", + "annotation": "playground/opensource/dvqa_train_200k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 200000 + }, + "chartqa_train_18k": { + "root": "playground/data/chartqa/", + "annotation": "playground/opensource/chartqa_train_18k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 18317 + }, + "ai2d_train_12k": { + "root": "playground/data/ai2d/", + "annotation": "playground/opensource/ai2d_train_12k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 12413 + }, + "docvqa_train_10k": { + "root": "playground/data/docvqa/", + "annotation": "playground/opensource/docvqa_train_10k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 10211 + }, + "geoqa+": { + "root": "playground/data/geoqa+/", + "annotation": "playground/opensource/geoqa+.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 72318 + }, + "synthdog_en": { + "root": "playground/data/synthdog-en/", + "annotation": "playground/opensource/synthdog_en.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 29765 + } +} diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/data/internvl_1_2_finetune_custom.json b/VLMEvalKit_old/InternVL/internvl_chat/shell/data/internvl_1_2_finetune_custom.json new file mode 100644 index 0000000000000000000000000000000000000000..d98d69f8b80536a8d2010efd7a0d72b8543c84fc --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/data/internvl_1_2_finetune_custom.json @@ -0,0 +1,65 @@ +{ + "sharegpt4v_instruct_gpt4-vision_cap100k": { + "root": 
"playground/data/", + "annotation": "playground/opensource/sharegpt4v_instruct_gpt4-vision_cap100k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 102025 + }, + "llava_instruct_150k_zh": { + "root": "playground/data/coco/", + "annotation": "playground/opensource/llava_instruct_150k_zh.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 157712 + }, + "sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k": { + "root": "playground/data/", + "annotation": "playground/opensource/sharegpt4v_mix665k_cap23k_coco-ap9k_lcs3k_sam9k_div2k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 665058 + }, + "dvqa_train_200k": { + "root": "playground/data/dvqa/", + "annotation": "playground/opensource/dvqa_train_200k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 200000 + }, + "chartqa_train_18k": { + "root": "playground/data/chartqa/", + "annotation": "playground/opensource/chartqa_train_18k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 18317 + }, + "ai2d_train_12k": { + "root": "playground/data/ai2d/", + "annotation": "playground/opensource/ai2d_train_12k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 12413 + }, + "docvqa_train_10k": { + "root": "playground/data/docvqa/", + "annotation": "playground/opensource/docvqa_train_10k.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 10211 + }, + "geoqa+": { + "root": "playground/data/geoqa+/", + "annotation": "playground/opensource/geoqa+.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 72318 + }, + "synthdog_en": { + "root": "playground/data/synthdog-en/", + "annotation": "playground/opensource/synthdog_en.jsonl", + "data_augment": false, + "repeat_time": 1, + "length": 29765 + } +} diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..20be1beec754b5fd6ccd85bf5047b3162a67265c --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.2/2nd_finetune/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full.sh @@ -0,0 +1,76 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-16} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_2/internvl_chat_v1_2_hermes2_yi34b_448_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 16 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 128 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL-Chat-V1-2-Plus" \ + --conv_style "Hermes-2" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 1 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 1e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 2048 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size False \ + --use_thumbnail False \ + --ps_version 'v1' \ + --deepspeed "zero_stage3_config_34b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.2/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.2/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..cdb611a2a5129d16a09aaebf77109be8f014da56 --- 
/dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.2/hermes2_yi34b/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune.sh @@ -0,0 +1,74 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-64} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-512} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_2/internvl_chat_v1_2_hermes2_yi34b_448_res_finetune' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 64 +# batch size per gpu: 8 +# gradient accumulation steps: 1 +# total batch size: 512 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --vision_path "./pretrained/InternViT-6B-448px-V1-2" \ + --mlp_path "./pretrained/InternViT-6B-448px-V1-2/mlp_projector/hermes_2_yi_34b.pth" \ + --llm_path "./pretrained/Nous-Hermes-2-Yi-34B" \ + --conv_style "Hermes-2" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 
200 \ + --save_total_limit 3 \ + --learning_rate 1e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 2048 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --deepspeed "zero_stage3_config_34b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..61ea3c646b8e4d2f0c770e0e7291b8f017fc8be1 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,68 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/Mini-InternVL-Chat-2B-V1-5" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..e929bd265690a1d8b4f7b596f59364f8e17928eb --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/Mini-InternVL-Chat-2B-V1-5" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed 
"zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..02439183eef81f9f1d2bf86c17d341815855aa85 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,68 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 2 +# gradient accumulation steps: 8 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL-Chat-V1-5" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..52a479862537fb1a8035334527cdb9536764b3c4 --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 2 +# gradient accumulation steps: 4 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL-Chat-V1-5" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed 
"zero_stage3_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..7eb1517e3c1a3934340ede6f6d08c58e3d1920fa --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,68 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/Mini-InternVL-Chat-4B-V1-5" \ + --conv_style "phi3-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..1e541bd3b6076b894368350165dbbd1db2b76c81 --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/2nd_finetune/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/Mini-InternVL-Chat-4B-V1-5" \ + --conv_style "phi3-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ 
+ --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..18026d86e7476881d8b0fcf5f57606b244b128a4 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune.sh @@ -0,0 +1,76 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-1024} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_finetune' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 256 +# batch size per gpu: 4 +# gradient accumulation steps: 1 +# total batch size: 1024 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./work_dirs/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain" \ + --conv_style "Hermes-2" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "path/to/finetune/data.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 3 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_34b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh new file mode 100644 index 
0000000000000000000000000000000000000000..3992907c5d5848e9ee33205caf184a492b780816 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/hermes2_yi34b/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain.sh @@ -0,0 +1,78 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-2048} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_hermes2_yi34b_dynamic_res_pretrain' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 256 +# batch size per gpu: 2 +# gradient accumulation steps: 4 +# total batch size: 2048 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-6B-448px-V1-5" \ + --mlp_path "./pretrained/InternViT-6B-448px-V1-2/mlp_projector/hermes_2_yi_34b.pth" \ + --llm_path "./pretrained/Nous-Hermes-2-Yi-34B" \ + --conv_style "Hermes-2" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "path/to/pretrain/data.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + 
--gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 3 \ + --learning_rate 1e-4 \ + --weight_decay 0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_34b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..c0f3677e22bac85d48d1d38d2247e8df215d8724 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune.sh @@ -0,0 +1,76 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-128} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-1024} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_finetune' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 128 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 1024 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./work_dirs/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "path/to/finetune/data.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 3 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh new file mode 100644 index 
0000000000000000000000000000000000000000..aa9d3ee9f78c1addc9b6eec0d4894a8d799defdb --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_1_8b/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-128} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-2048} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_1_8b_dynamic_res_pretrain' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 128 +# batch size per gpu: 8 +# gradient accumulation steps: 2 +# total batch size: 2048 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-300M-448px" \ + --llm_path "./pretrained/internlm2-chat-1_8b" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "path/to/pretrain/data.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + 
--save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 3 \ + --learning_rate 2e-5 \ + --weight_decay 0.01 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..11804be90132f06fc9f25ca54c1b577249308a72 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune.sh @@ -0,0 +1,76 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-1024} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_finetune' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 256 +# batch size per gpu: 4 +# gradient accumulation steps: 1 +# total batch size: 1024 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./work_dirs/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "path/to/finetune/data.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 3 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh new file mode 100644 index 
0000000000000000000000000000000000000000..8edc053c758778711e76687452503602cfc49b89 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/internlm2_20b/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-2048} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_internlm2_20b_dynamic_res_pretrain' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 256 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 2048 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-6B-448px-V1-5" \ + --llm_path "./pretrained/internlm2-chat-20b" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "path/to/pretrain/data.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.2 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + 
--save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 3 \ + --learning_rate 1e-5 \ + --weight_decay 0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh new file mode 100644 index 0000000000000000000000000000000000000000..b4c521edd0b2c65da05c60760c96a95c7c0e5d7a --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune.sh @@ -0,0 +1,76 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-128} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-1024} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 128 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 1024 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./work_dirs/internvl_chat_v1_5_phi3_3_8b_dynamic_res_finetune" \ + --conv_style "phi3-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "path/to/finetune/data.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 3 \ + --learning_rate 4e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh new file mode 100644 index 0000000000000000000000000000000000000000..f5b9126868836de3c88603c3ad585a48f40da54f --- 
/dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl1.5/phi3_3_8b/internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-128} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-2048} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-8} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v1_5/internvl_chat_v1_5_phi3_3_8b_dynamic_res_pretrain' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 128 +# batch size per gpu: 8 +# gradient accumulation steps: 2 +# total batch size: 2048 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-300M-448px" \ + --llm_path "./pretrained/Phi-3-mini-128k-instruct" \ + --conv_style "phi3-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "path/to/pretrain/data.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 12 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 3 \ + --learning_rate 2e-4 \ + --weight_decay 
0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..b67be7201e6a3e0db820be71fca65a973ea5e2a9 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,68 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_1b_qwen2_0_5b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL2-1B" \ + --conv_style "Hermes-2" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh new file mode 100644 index 0000000000000000000000000000000000000000..f68d68dabb0ac186b3962ca4cedd97306f4b9188 --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-512} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_2b_internlm2_1_8b_dynamic_res_2nd_finetune_lora_coco' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 16 +# total batch size: 512 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL2-2B" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/coco_caption.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 128 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to 
"tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..0fe9614cb62844338350d7cff96a76b4b81e419f --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,68 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL2-4B" \ + --conv_style "phi3-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..d4242b2144378ca39cc70c97bbe826947baf7fe0 --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_4b_phi3_3_8b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL2-4B" \ + --conv_style "phi3-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" 
\ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..45ce5b61c45c1cf2de5cb4c4d6d85c569f017f42 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,76 @@ +set -x + +PARTITION=${PARTITION:-"INTERN2"} +GPUS=${GPUS:-32} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 32 +# batch size per gpu: 1 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL2-Llama3-76B" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_100b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 
0000000000000000000000000000000000000000..9cd0175bdd44be6557b33904524993a53470ffd5 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_76b_hermes2_llama3_70b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 1 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL2-Llama3-76B" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + 
--dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_100b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..6fb447267f08639206ec8d31c55a86056432a663 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,68 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL2-8B" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..9fdb7559aa9257d59c590f815900b1319e4f4bd9 --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0/2nd_finetune/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_8b_internlm2_7b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "./pretrained/InternVL2-8B" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 4096 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to 
"tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0_mpo/README.md b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0_mpo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4c6825d20f2b28ab1e2fbc8104241378082c843b --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0_mpo/README.md @@ -0,0 +1,150 @@ +# Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization + +[\[📂 GitHub\]](https://github.com/OpenGVLab/InternVL/tree/main/internvl_chat/shell/internvl2.0_mpo) [\[🆕 Blog\]](https://internvl.github.io/blog/2024-11-14-InternVL-2.0-MPO/) [\[📜 Paper\]](https://arxiv.org/abs/2411.10442) [\[📖 Documents\]](https://internvl.readthedocs.io/en/latest/internvl2.0/preference_optimization.html) + +## Introduction + +Existing open-source multimodal large language models (MLLMs) generally follow a training process involving pre-training and supervised fine-tuning. However, these models suffer from distribution shifts, which limit their multimodal reasoning, particularly in the Chain-of-Thought (CoT) performance. + +To address this, we introduce a preference optimization (PO) process to enhance the multimodal reasoning capabilities of MLLMs. Specifically, (1) on the data side, we design an automated preference data construction pipeline to create [MMPR](https://huggingface.co/datasets/OpenGVLab/MMPR), a high-quality, large-scale multimodal reasoning preference dataset. and (2) on the model side, we explore integrating PO with MLLMs, developing a simple yet effective method, termed Mixed Preference Optimization (MPO), which boosts multimodal CoT performance. + +Our approach demonstrates improved performance across multiple benchmarks, particularly in multimodal reasoning tasks. 
Notably, our model, [InternVL2-8B-MPO](https://huggingface.co/OpenGVLab/InternVL2-8B-MPO), achieves an accuracy of 67.0 on MathVista, outperforming InternVL2-8B by 8.7 points and achieving performance comparable to the 10$`\times`$ larger InternVL2-76B. We hope this study could inspire further advancements in MLLMs. + + + +## MMPR Dataset + +MMPR is a large-scale and high-quality multimodal reasoning preference dataset. This dataset includes about 3 million samples. + + + + +To construct this dataset, we propose an efficient data construction pipeline. Specifically, we categorize the multimodal data into **samples with clear ground truths** and **samples without clear ground truths**. + +- **For samples with clear ground truths:** + the model is prompted to first provide the reasoning process and then give the final answer in the format like `Final Answer: ***`. + Responses matching the ground truth answer constitute the positive set $\\mathcal{Y}\_p$, while those that do not match make up the negative set $\\mathcal{Y}\_n$. Additionally, responses that fail to provide a clear final answer are also merged into $\\mathcal{Y}\_n$. + Given these responses labeled as positive or negative, we build the preference pairs by selecting a chosen response $y_c$ from $\\mathcal{Y}\_p$ and a negative response $y_r$ from $\\mathcal{Y}\_n$. + +- **For samples without clear ground truths:** + we propose a simple yet effective method: Dropout Next-Token Prediction (Dropout NTP). + Specifically, we use the responses generated by InternVL2-8B as chosen answers. + Given the chosen answer, we truncate it by half and then prompt InternVL2-8B to complete the remaining + portion of the truncated answer without access to the image input. + This generated completion serves as the rejected answer for the paired sample. 
+ It is worth noting that while the responses generated by InternVL2-8B may not be perfect, + the completions generated without the image input will introduce more hallucinations than those + generated with the image input. + Therefore, the partial order relationship between the chosen and rejected responses holds true. + +The data construction pipeline is open-sourced, see more details in our [document](https://internvl.readthedocs.io/en/latest/internvl2.0/preference_optimization.html#generate-additional-preference-data). + +## Mixed Preference Optimization + +The key insight behind MPO is that *an effective PO process should enable the model to learn the relative preference between pairs of responses, the absolute quality of individual responses, and the process for generating preferred responses.* We define the training objective as a combination of +preference loss $`\mathcal{L}_{\text{p}}`$, +quality loss $`\mathcal{L}_{\text{q}}`$, +and generation loss $`\mathcal{L}_{\text{g}}`$, +referred to as Mixed Preference Optimization: + +```math +\mathcal{L}=w_{p}\cdot\mathcal{L}_{\text{p}} + w_{q}\cdot\mathcal{L}_{\text{q}} + w_{g}\cdot\mathcal{L}_{\text{g}}, +``` + +where $w\_{\*}$ represents the weight assigned to each loss component. +In this work, we empirically compare different variants of preference loss. +Based on the experimental results, we use DPO as our preference loss and BCO as our quality loss. + +Specifically, the DPO serves as the preference loss to enable the model to learn the +relative preference between chosen and rejected responses. 
+This algorithm optimizes the following loss function: + +```math +\mathcal{L}_{\text{p}}=-\log \sigma\left(\beta \log \frac{\pi_\theta\left(y_c \mid x\right)}{\pi_0\left(y_c \mid x\right)}-\beta \log \frac{\pi_\theta\left(y_r \mid x\right)}{\pi_0\left(y_r \mid x\right)}\right), +``` + +where $\\beta$ is the KL penalty coefficient, and $x$, $y_c$, and $y_r$ are user query, chosen response, and rejected response, respectively. +The policy model $\\pi\_\\theta$ is initialized from model $\\pi_0$. + +Additionally, the BCO loss is employed as the quality loss, which helps the model to understand the absolute quality of individual responses. +The loss function is defined as: + +```math +\mathcal{L}_{\text{q}}=\mathcal{L}_{\text{q}}^+ + \mathcal{L}_{\text{q}}^-, +``` + +where $`\mathcal{L}_{\text{q}}^{+}`$ and $`\mathcal{L}_{\text{q}}^{-}`$ represent the loss for chosen and rejected responses, respectively. +Each response type's loss is calculated independently, requiring the model to differentiate the absolute quality of individual responses. The loss terms are given by: + +```math +\mathcal{L}_{\text{q}}^+=-\log \sigma\left(\beta \log \frac{\pi_\theta\left(y_c \mid x\right)}{\pi_0\left(y_c \mid x\right)} - \delta\right), +``` + +```math +\mathcal{L}_{\text{q}}^-=-\log \sigma\left(-\left(\beta \log \frac{\pi_\theta\left(y_r \mid x\right)}{\pi_0\left(y_r \mid x\right)} - \delta\right) \right), +``` + +where $\\delta$ represents the reward shift, calculated as the moving average of previous rewards to stabilize training. + +Finally, the SFT loss is used as the generation loss to help the model learn the generation process of preferred responses. +The loss function is defined as: + +```math +\mathcal{L}_{\text{gen}}=-\frac{\log\pi_\theta\left(y_c \mid x\right)}{\left| y_c \right|}. 
+``` + +## Models and Performance + +Our [InternVL2-8B-MPO](https://huggingface.co/OpenGVLab/InternVL2-8B-MPO) achieves superior performance across 8 benchmarks, particularly excelling in multimodal reasoning tasks. +**On the MathVista benchmark, our model achieves an accuracy of 67.0%**, outperforming InternVL2-8B by 8.7 points and achieving performance comparable to the 10$`\times`$ larger InternVL2-76B. +**On the MathVision benchmark, our model achieves an accuracy of 25.7%**, establishing a new state-of-the-art performance among open-source models. +These results demonstrate the effectiveness of our preference optimization approach in enhancing multimodal reasoning capabilities. + +Additionally, on the POPE benchmark, our model exhibits a 1.2-point improvement over InternVL2-8B, demonstrating the effectiveness of the perception data contained in our MMPR dataset to mitigate hallucinations. + +Furthermore, our model also shows superior performance compared to the InternVL2-8B on complex VQA benchmarks, indicating that the general abilities of our model are also improved, benefiting from enhanced reasoning abilities and mitigated hallucinations. 
+ +| Model Name | M3CoT | MathVista | MathVision MINI | MMVet (GPT4-Turbo) | LLaVA-Bench | POPE | CRPE | MMHalBench | +| ----------------------- | :---: | :-------: | :-------------: | :----------------: | :---------: | :--: | :--: | :--------: | +| Gemini-1.5-Pro | - | 63.9 | 19.2 | - | - | - | - | - | +| GPT-4o | 64.3 | 63.8 | 30.4 | 69.1 | 97.6 | 86.9 | 76.6 | 4.0 | +| GPT-4o-Mini | 61.9 | 52.4 | 27.3 | 66.9 | 95.4 | 85.1 | 73.1 | 3.6 | +| LLaVA-1.5-13B | 39.5 | 27.6 | 11.1 | 36.3 | 70.7 | 85.9 | 55.6 | 2.4 | +| Qwen2-VL-7B | 57.8 | 58.2 | 21.1 | 60.6 | 67.7 | 88.1 | 74.4 | 3.4 | +| MiniCPM-V-2-6-8B | 56.0 | 60.6 | 23.4 | 57.4 | 83.4 | 87.3 | 75.2 | 3.6 | +| LLaVA-OneVision-7B | 52.3 | 63.2 | 18.4 | 51.4 | 79.9 | 88.4 | 73.7 | 3.1 | +| InternVL2-26B | 58.2 | 59.4 | 23.4 | 62.1 | 92.3 | 88.0 | 75.6 | 3.7 | +| InternVL2-40B | 63.6 | 63.7 | 21.4 | 65.5 | 100.5 | 88.4 | 77.3 | 3.9 | +| InternVL2-76B | 65.4 | 67.5 | 23.7 | 65.7 | 99.3 | 89.0 | 77.8 | 3.8 | +| InternVL2-Pro | 65.6 | 66.3 | 18.8 | 69.4 | 99.5 | 88.2 | 77.6 | 3.7 | +| InternVL2-8B | 59.3 | 58.3 | 20.4 | 54.2 | 73.2 | 86.9 | 75.5 | 3.3 | +| InternVL2-8B-MPO (ours) | 79.2 | 67.0 | 25.7 | 56.2 | 76.7 | 88.1 | 75.4 | 3.5 | + +## Train + +Please refer to [our document](https://internvl.readthedocs.io/en/latest/internvl2.0/preference_optimization.html) for more details about how to train with our data. 
+ +## Citation + +If you find this project useful in your research, please consider citing: + +```BibTeX +@article{wang2024mpo, + title={Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization}, + author={Wang, Weiyun and Chen, Zhe and Wang, Wenhai and Cao, Yue and Liu, Yangzhou and Gao, Zhangwei and Zhu, Jinguo and Zhu, Xizhou and Lu, Lewei and Qiao, Yu and Dai, Jifeng}, + journal={arXiv preprint arXiv:2411.10442}, + year={2024} +} +@article{chen2023internvl, + title={InternVL: Scaling up Vision Foundation Models and Aligning for Generic Visual-Linguistic Tasks}, + author={Chen, Zhe and Wu, Jiannan and Wang, Wenhai and Su, Weijie and Chen, Guo and Xing, Sen and Zhong, Muyan and Zhang, Qinglong and Zhu, Xizhou and Lu, Lewei and Li, Bin and Luo, Ping and Lu, Tong and Qiao, Yu and Dai, Jifeng}, + journal={arXiv preprint arXiv:2312.14238}, + year={2023} +} +@article{chen2024far, + title={How Far Are We to GPT-4V? Closing the Gap to Commercial Multimodal Models with Open-Source Suites}, + author={Chen, Zhe and Wang, Weiyun and Tian, Hao and Ye, Shenglong and Gao, Zhangwei and Cui, Erfei and Tong, Wenwen and Hu, Kongzhi and Luo, Jiapeng and Ma, Zheng and others}, + journal={arXiv preprint arXiv:2404.16821}, + year={2024} +} +``` diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0_mpo/preference_optimization/internvl2_8b_internlm2_7b_dynamic_res_mpo_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0_mpo/preference_optimization/internvl2_8b_internlm2_7b_dynamic_res_mpo_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..c99289495f3f5d05c7c80b81bfe17c0a70ac0d47 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.0_mpo/preference_optimization/internvl2_8b_internlm2_7b_dynamic_res_mpo_full.sh @@ -0,0 +1,81 @@ +set -x + +PARTITION=${PARTITION:-"Intern5"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} 
+NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-256} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + +export PYTHONPATH="/mnt/petrelfs/wangweiyun/workspace_cz/InternVL/internvl_chat_dev/petrel-oss-python-sdk" +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_0/internvl2_8b_internlm2_7b_dynamic_res_mpo_full' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 64 +# batch size per gpu: ~4 +# gradient accumulation steps: 1 +# total batch size: ~256 +# epoch: 8 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_dpo.py \ + --model_name_or_path "ckpt/OpenGVLab/InternVL2-8B" \ + --conv_style "internlm2-chat" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "MMPR/meta.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --pad2square False \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --use_data_resampling False \ + --dataloader_num_workers 8 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --save_steps 100 \ + --save_total_limit 100 \ + --learning_rate 5e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 6144 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + 
--use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --loss_type sigmoid,bco_pair \ + --sigmoid_loss_weight 0.8 \ + --bco_pair_loss_weight 0.2 \ + --rpo_alpha 1 \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_1b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_1b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..94af8e6fee68475feaa9c7f9702ec29b3d36df6c --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_1b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_1b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-1B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_1b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_1b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..71c1b1e9aa7f724ef2573200acf63c8e392f56d1 --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_1b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,70 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_1b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-1B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + 
+ --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_26b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_26b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..b3199ec8b9a6ddc9d49d99c3d4e4b4afce098a00 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_26b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_26b_dynamic_res_2nd_finetune_full' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 2 +# gradient accumulation steps: 8 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-26B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ 
--evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_26b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_26b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..b8b812c45b4d53f98f31c8cbc1f2e3e778f13d80 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_26b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,70 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_26b_dynamic_res_2nd_finetune_lora' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 2 +# gradient accumulation steps: 4 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-26B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..fca861bdfd80e9a634f151b4b4886e6786cac0a2 --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_full' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-2B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ 
+ 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..0830a4d3675f4eb62985d00c9d9a375bfe1d908e --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,70 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-2B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + 
--save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_lora_coco.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_lora_coco.sh new file mode 100644 index 0000000000000000000000000000000000000000..43cd4db645fcac90a71503d527ee0895b719a508 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_2b_dynamic_res_2nd_finetune_lora_coco.sh @@ -0,0 +1,70 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-512} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_2b_dynamic_res_2nd_finetune_lora_coco' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 16 +# total batch size: 512 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-2B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/coco_caption.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 128 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_38b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_38b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..a31bc4460319cfba5d81d3dc297802c7e28a0a3e --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_38b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-16} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_38b_dynamic_res_2nd_finetune_full' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 16 +# batch size per gpu: 2 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-38B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + 
--lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_34b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_38b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_38b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..eedd273038f318c5bbad76c97f946896963fb23f --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_38b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,70 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-2} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_38b_dynamic_res_2nd_finetune_lora' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 2 +# gradient accumulation steps: 4 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-38B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_34b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_4b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_4b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..f51b46fd5590fe5d4cf31e313f6701c066eaec6b --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_4b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_4b_dynamic_res_2nd_finetune_full' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-4B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ 
+ 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_4b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_4b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..59fef84e008f4e5b1437f0f8711b4565188d90f7 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_4b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,70 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_4b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-4B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + 
--save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_78b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_78b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..f7ab24fcd75ee3983f0be47a0576eb81ae5bd72c --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_78b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-32} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_78b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 32 +# batch size per gpu: 1 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-78B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_100b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_78b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_78b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 
0000000000000000000000000000000000000000..4578248f083c4dbc107aef111646a759cda4937b --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_78b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,70 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_78b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 1 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-78B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size 
True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_100b.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_8b_dynamic_res_2nd_finetune_full.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_8b_dynamic_res_2nd_finetune_full.sh new file mode 100644 index 0000000000000000000000000000000000000000..ab3231b543b0600676ec6a362493aa90dedc1194 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_8b_dynamic_res_2nd_finetune_full.sh @@ -0,0 +1,69 @@ +set -x + +GPUS=${GPUS:-8} +BATCH_SIZE=${BATCH_SIZE:-128} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_8b_dynamic_res_2nd_finetune_full' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 8 +# batch size per gpu: 4 +# gradient accumulation steps: 4 +# total batch size: 128 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-8B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_8b_dynamic_res_2nd_finetune_lora.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_8b_dynamic_res_2nd_finetune_lora.sh new file mode 100644 index 0000000000000000000000000000000000000000..c740d53a7961efab076658a95e7a22e72190e913 --- /dev/null +++ 
b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/2nd_finetune/internvl2_5_8b_dynamic_res_2nd_finetune_lora.sh @@ -0,0 +1,70 @@ +set -x + +GPUS=${GPUS:-2} +BATCH_SIZE=${BATCH_SIZE:-16} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-4} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 +export LAUNCHER=pytorch + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_8b_dynamic_res_2nd_finetune_lora' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# number of gpus: 2 +# batch size per gpu: 4 +# gradient accumulation steps: 2 +# total batch size: 16 +# epoch: 1 +torchrun \ + --nnodes=1 \ + --node_rank=0 \ + --master_addr=127.0.0.1 \ + --nproc_per_node=${GPUS} \ + --master_port=${MASTER_PORT} \ + internvl/train/internvl_chat_finetune.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-8B" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./shell/data/internvl_1_2_finetune_custom.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --max_dynamic_patch 6 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --freeze_llm True \ + --freeze_mlp True \ + --freeze_backbone True \ + --use_llm_lora 16 \ + --vision_select_layer -1 \ + --dataloader_num_workers 4 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 1 \ + --learning_rate 4e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length True \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + 
--report_to "tensorboard" \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1.5/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1_5.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1.5/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1_5.sh new file mode 100644 index 0000000000000000000000000000000000000000..5afbe82c86ff8267013ac46ac464846f35ae8d13 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1.5/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1_5.sh @@ -0,0 +1,91 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1_5' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1.5 (ViT Incremental Learning) +# Architecture: InternViT-6B-448px-V1-5 + MLP + internlm2_5-20b-chat +# Trainable Components: ViT + MLP +# Number of GPUs: 512 +# Packed Batch Size: 1024 +# Learning Rate: 1e-5 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.4 +# Weight Decay: 0.05 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --model_name_or_path "./work_dirs/internvl_chat_v2_5/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1/" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 2 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 1e-5 \ + --weight_decay 0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + 
--replacement False \ + --allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1.5/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1_5.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1.5/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1_5.sh new file mode 100644 index 0000000000000000000000000000000000000000..356cfc8ed912646773a2aca61ceea1f72e2ac29c --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1.5/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1_5.sh @@ -0,0 +1,91 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1_5' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1.5 (ViT Incremental Learning) +# Architecture: InternViT-300M-448px + MLP + internlm2_5-7b-chat +# Trainable Components: ViT + MLP +# Number of GPUs: 512 +# Packed Batch Size: 1024 +# Learning Rate: 1e-5 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.1 +# Weight Decay: 0.05 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --model_name_or_path "./work_dirs/internvl_chat_v2_5/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1/" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 2 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 1e-5 \ + --weight_decay 0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + --replacement 
False \ + --allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage1.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage1.sh new file mode 100644 index 0000000000000000000000000000000000000000..4c9e1cc036bccd0a262e1d8d069534ffe91fe4f9 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage1.sh @@ -0,0 +1,92 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage1' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1 (MLP Warmup) +# Architecture: InternViT-300M-448px-V2_5 + MLP + Qwen2.5-0.5B-Instruct +# Trainable Components: MLP +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-4 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.0 +# Weight Decay: 0.01 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-300M-448px-V2_5" \ + --llm_path "./pretrained/Qwen2.5-0.5B-Instruct" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-4 \ + --weight_decay 0.01 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + --replacement False \ + 
--allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1.sh new file mode 100644 index 0000000000000000000000000000000000000000..7c79edc0f4170813fc94e69ddab6d88b75cb7fbe --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1.sh @@ -0,0 +1,92 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1 (MLP Warmup) +# Architecture: InternViT-6B-448px-V1-5 + MLP + internlm2_5-20b-chat +# Trainable Components: MLP +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-4 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.0 +# Weight Decay: 0.05 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-6B-448px-V1-5" \ + --llm_path "./pretrained/internlm2_5-20b-chat" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-4 \ + --weight_decay 0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + --replacement False \ + 
--allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage1.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage1.sh new file mode 100644 index 0000000000000000000000000000000000000000..b88d2c16b074df8558b0bcf88f399df71ba638ea --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage1.sh @@ -0,0 +1,92 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage1' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1 (MLP Warmup) +# Architecture: InternViT-300M-448px-V2_5 + MLP + internlm2_5-1_8b-chat +# Trainable Components: MLP +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-4 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.0 +# Weight Decay: 0.01 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-300M-448px-V2_5" \ + --llm_path "./pretrained/internlm2_5-1_8b-chat" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-4 \ + --weight_decay 0.01 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + --replacement False \ + 
--allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage1.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage1.sh new file mode 100644 index 0000000000000000000000000000000000000000..c08110195a0478f80dad548fe8e06ab46e2a7456 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage1.sh @@ -0,0 +1,92 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage1' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1 (MLP Warmup) +# Architecture: InternViT-6B-448px-V2_5 + MLP + Qwen2.5-32B-Instruct +# Trainable Components: MLP +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-4 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.0 +# Weight Decay: 0.05 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-6B-448px-V2_5" \ + --llm_path "./pretrained/Qwen2.5-32B-Instruct" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-4 \ + --weight_decay 0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_34b.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + --replacement False \ + 
--allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_4b_qwen2_5_3b_dynamic_res_stage1.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_4b_qwen2_5_3b_dynamic_res_stage1.sh new file mode 100644 index 0000000000000000000000000000000000000000..fae248e8afbae182f23977ed7e8d066bbf1815e0 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_4b_qwen2_5_3b_dynamic_res_stage1.sh @@ -0,0 +1,92 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_4b_qwen2_5_3b_dynamic_res_stage1' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1 (MLP Warmup) +# Architecture: InternViT-300M-448px-V2_5 + MLP + Qwen2.5-3B-Instruct +# Trainable Components: MLP +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-4 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.0 +# Weight Decay: 0.01 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-300M-448px-V2_5" \ + --llm_path "./pretrained/Qwen2.5-3B-Instruct" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-4 \ + --weight_decay 0.01 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + --replacement False \ + 
--allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_78b_qwen2_5_72b_dynamic_res_stage1.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_78b_qwen2_5_72b_dynamic_res_stage1.sh new file mode 100644 index 0000000000000000000000000000000000000000..08924d0df93a804a78b744f9406b34b24e1b47c9 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_78b_qwen2_5_72b_dynamic_res_stage1.sh @@ -0,0 +1,92 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_78b_qwen2_5_72b_dynamic_res_stage1' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1 (MLP Warmup) +# Architecture: InternViT-6B-448px-V2_5 + MLP + Qwen2.5-72B-Instruct +# Trainable Components: MLP +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-4 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.0 +# Weight Decay: 0.05 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-6B-448px-V2_5" \ + --llm_path "./pretrained/Qwen2.5-72B-Instruct" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-4 \ + --weight_decay 0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_70b.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + --replacement False \ + 
--allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1.sh new file mode 100644 index 0000000000000000000000000000000000000000..225f9dc98ac49f25294b7cc260aa18cae21cc842 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage1/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1.sh @@ -0,0 +1,92 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 1 (MLP Warmup) +# Architecture: InternViT-300M-448px + MLP + internlm2_5-7b-chat +# Trainable Components: MLP +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-4 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.0 +# Weight Decay: 0.05 +# Epoch: None +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --vision_path "./pretrained/InternViT-300M-448px" \ + --llm_path "./pretrained/internlm2_5-7b-chat" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/pretrain/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.0 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm True \ + --freeze_mlp False \ + --freeze_backbone True \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 100000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-4 \ + --weight_decay 0.05 \ + --warmup_steps 100 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + --replacement False \ + --allow_overflow 
False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage2.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage2.sh new file mode 100644 index 0000000000000000000000000000000000000000..d348c39104250356aa9b6970b64fcf4985c2097f --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage2.sh @@ -0,0 +1,91 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage2' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 2 (Full Model Instruction Tuning) +# Architecture: InternViT-300M-448px-V2_5 + MLP + Qwen2.5-0.5B-Instruct +# Trainable Components: ViT + MLP + LLM +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 4e-5 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.1 +# Weight Decay: 0.01 +# Epoch: 4 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --model_name_or_path "./work_dirs/internvl_chat_v2_5/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_stage1/" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/finetune/data/mixture_4x.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 22000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + 
--replacement False \ + --allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage2.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage2.sh new file mode 100644 index 0000000000000000000000000000000000000000..bbfb43230190242f1cc4c158a0bde708dd6bc184 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage2.sh @@ -0,0 +1,91 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage2' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 2 (Full Model Instruction Tuning) +# Architecture: InternViT-6B-448px-V2_5 + MLP + internlm2_5-20b-chat +# Trainable Components: ViT + MLP + LLM +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-5 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.4 +# Weight Decay: 0.05 +# Epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --model_name_or_path "./work_dirs/internvl_chat_v2_5/internvl2_5_26b_internlm2_5_20b_dynamic_res_stage1_5/" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/finetune/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 5500 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + 
--replacement False \ + --allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage2.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage2.sh new file mode 100644 index 0000000000000000000000000000000000000000..baa66aaf9c57469d942d3ac18ab00efd5994a60a --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage2.sh @@ -0,0 +1,91 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage2' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 2 (Full Model Instruction Tuning) +# Architecture: InternViT-300M-448px-V2_5 + MLP + internlm2_5-1_8b-chat +# Trainable Components: ViT + MLP + LLM +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 4e-5 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.1 +# Weight Decay: 0.01 +# Epoch: 4 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --model_name_or_path "./work_dirs/internvl_chat_v2_5/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_stage1/" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/finetune/data/mixture_4x.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 22000 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 4e-5 \ + --weight_decay 0.01 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False 
\ + --replacement False \ + --allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage2.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage2.sh new file mode 100644 index 0000000000000000000000000000000000000000..81f2c75f7c1a579cbdcb533f3dfac4bf7e5f7cda --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage2.sh @@ -0,0 +1,93 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export TRITON_CACHE_DIR=/tmp/triton_internvl/ +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage2' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 2 (Full Model Instruction Tuning) +# Architecture: InternViT-6B-448px-V2_5 + MLP + Qwen2.5-32B-Instruct +# Trainable Components: ViT + MLP + LLM +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 2e-5 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.4 +# Weight Decay: 0.05 +# Epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --model_name_or_path "./work_dirs/internvl_chat_v2_5/internvl2_5_38b_qwen2_5_32b_dynamic_res_stage1/" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --use_liger True \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/finetune/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 5500 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 2e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_100b.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + 
--strict_mode False \ + --replacement False \ + --allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage2.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage2.sh new file mode 100644 index 0000000000000000000000000000000000000000..18544c8daed0a6eb972cd60b49d2a5984b0b2c8c --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5/stage2/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage2.sh @@ -0,0 +1,91 @@ +set -x + +PARTITION=${PARTITION:-"VC5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage2' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +# Stage: Stage 2 (Full Model Instruction Tuning) +# Architecture: InternViT-300M-448px-V2_5 + MLP + internlm2_5-7b-chat +# Trainable Components: ViT + MLP + LLM +# Number of GPUs: 512 +# Packed Batch Size: 512 +# Learning Rate: 4e-5 +# Context Length: 16384 +# Image Tile Threshold: 48 +# ViT Drop Path: 0.1 +# Weight Decay: 0.05 +# Epoch: 1 +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_pretrain.py \ + --model_name_or_path "./work_dirs/internvl_chat_v2_5/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1_5/" \ + --conv_style "internvl2_5" \ + --use_fast_tokenizer False \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "./path/to/finetune/data/mixture.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --min_num_frame 8 \ + --max_num_frame 32 \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --dataloader_num_workers 8 \ + --bf16 True \ + --max_steps 5500 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 1 \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 100 \ + --save_total_limit 3 \ + --learning_rate 4e-5 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 16384 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --use_packed_ds True \ + --num_images_expected 48 \ + --max_packed_tokens 16384 \ + --max_buffer_size 20 \ + --log_freq 1000 \ + --strict_mode False \ + 
--replacement False \ + --allow_overflow False \ + --remove_unused_columns False \ + --loss_reduction "square" \ + --loss_reduction_all_gather True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_mpo.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_mpo.sh new file mode 100644 index 0000000000000000000000000000000000000000..25045f2249b98f3b721945deae12b090c95e1ea9 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_1b_qwen2_5_0_5b_dynamic_res_mpo.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"Intern5"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-256} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export TRITON_CACHE_DIR="/tmp/triton_wwy/" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5_mpo/Internvl2_5-1B-MPO' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_dpo.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-1B" \ + --conv_style "internvl2_5" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "MMPR-v1.1/meta.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --pad2square False \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --use_data_resampling False \ + --dataloader_num_workers 8 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --save_steps 100 \ + --save_total_limit 100 \ + --learning_rate 1e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --loss_type sigmoid,bco_pair \ + --sigmoid_loss_weight 0.8 \ + --bco_pair_loss_weight 0.2 \ + --rpo_alpha 1 \ + --use_liger True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_26b_internlm2_5_20b_dynamic_res_mpo.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_26b_internlm2_5_20b_dynamic_res_mpo.sh new file mode 100644 index 
0000000000000000000000000000000000000000..db34165a447c1b7822553548a9faecc401ea869e --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_26b_internlm2_5_20b_dynamic_res_mpo.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"Intern5"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-256} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export TRITON_CACHE_DIR="/tmp/triton_wwy/" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5_mpo/Internvl2_5-26B-MPO' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_dpo.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-26B" \ + --conv_style "internvl2_5" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "MMPR-v1.1/meta.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --pad2square False \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --use_data_resampling False \ + --dataloader_num_workers 8 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --save_steps 100 \ + --save_total_limit 100 \ + --learning_rate 1e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + 
--lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_70b.json" \ + --report_to "tensorboard" \ + --loss_type sigmoid,bco_pair \ + --sigmoid_loss_weight 0.8 \ + --bco_pair_loss_weight 0.2 \ + --rpo_alpha 1 \ + --use_liger True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_mpo.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_mpo.sh new file mode 100644 index 0000000000000000000000000000000000000000..282e65027083e2b4d08b2f1f218fe783ceb7e39e --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_2b_internlm2_5_1_8b_dynamic_res_mpo.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"Intern5"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-256} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export TRITON_CACHE_DIR="/tmp/triton_wwy/" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5_mpo/Internvl2_5-2B-MPO' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_dpo.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-2B" \ + --conv_style "internvl2_5" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "MMPR-v1.1/meta.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --pad2square False \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --use_data_resampling False \ + --dataloader_num_workers 8 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --save_steps 100 \ + --save_total_limit 100 \ + --learning_rate 1e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --loss_type sigmoid,bco_pair \ + --sigmoid_loss_weight 0.8 \ + --bco_pair_loss_weight 0.2 \ + --rpo_alpha 1 \ + --use_liger True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_38b_qwen2_5_32b_dynamic_res_mpo.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_38b_qwen2_5_32b_dynamic_res_mpo.sh new file mode 100644 index 
0000000000000000000000000000000000000000..edbf3f7c84161933a2465ef0399cd9bcc62553fe --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_38b_qwen2_5_32b_dynamic_res_mpo.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"Intern5"} +GPUS=${GPUS:-512} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-512} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export TRITON_CACHE_DIR="/tmp/triton_wwy/" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5_mpo/Internvl2_5-38B-MPO' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_dpo.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-38B" \ + --conv_style "internvl2_5" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "MMPR-v1.1/meta.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.4 \ + --pad2square False \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --use_data_resampling False \ + --dataloader_num_workers 8 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "steps" \ + --save_steps 200 \ + --save_total_limit 100 \ + --learning_rate 1e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + 
--lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage3_config_100b.json" \ + --report_to "tensorboard" \ + --loss_type sigmoid,bco_pair \ + --sigmoid_loss_weight 0.8 \ + --bco_pair_loss_weight 0.2 \ + --rpo_alpha 1 \ + --use_liger True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_4b_qwen2_5_3b_dynamic_res_mpo.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_4b_qwen2_5_3b_dynamic_res_mpo.sh new file mode 100644 index 0000000000000000000000000000000000000000..b79b54d98d77570af452c7ea7073d1b4da92b06a --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_4b_qwen2_5_3b_dynamic_res_mpo.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"Intern5"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-256} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export TRITON_CACHE_DIR="/tmp/triton_wwy/" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5_mpo/Internvl2_5-4B-MPO' + +if [ ! 
-d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_dpo.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-4B" \ + --conv_style "internvl2_5" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "MMPR-v1.1/meta.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --pad2square False \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --use_data_resampling False \ + --dataloader_num_workers 8 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --save_steps 100 \ + --save_total_limit 100 \ + --learning_rate 1e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + --lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --loss_type sigmoid,bco_pair \ + --sigmoid_loss_weight 0.8 \ + --bco_pair_loss_weight 0.2 \ + --rpo_alpha 1 \ + --use_liger True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_8b_internlm2_5_7b_dynamic_res_mpo.sh b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_8b_internlm2_5_7b_dynamic_res_mpo.sh new file mode 100644 index 
0000000000000000000000000000000000000000..57e849177c59a4f5f44759f8a1c7d78d9d906f86 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/internvl2.5_mpo/preference_optimization/internvl2_5_8b_internlm2_5_7b_dynamic_res_mpo.sh @@ -0,0 +1,77 @@ +set -x + +PARTITION=${PARTITION:-"Intern5"} +GPUS=${GPUS:-256} +GPUS_PER_NODE=${GPUS_PER_NODE:-8} +QUOTA_TYPE=${QUOTA_TYPE:-"reserved"} +NODES=$((GPUS / GPUS_PER_NODE)) +CPUS_PER_TASK=${CPUS_PER_TASK:-10} +SRUN_ARGS=${SRUN_ARGS:-""} +BATCH_SIZE=${BATCH_SIZE:-256} +PER_DEVICE_BATCH_SIZE=${PER_DEVICE_BATCH_SIZE:-1} +GRADIENT_ACC=$((BATCH_SIZE / PER_DEVICE_BATCH_SIZE / GPUS)) + +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +export TRITON_CACHE_DIR="/tmp/triton_wwy/" +export MASTER_PORT=34229 +export TF_CPP_MIN_LOG_LEVEL=3 + +OUTPUT_DIR='work_dirs/internvl_chat_v2_5_mpo/Internvl2_5-8B-MPO' + +if [ ! -d "$OUTPUT_DIR" ]; then + mkdir -p "$OUTPUT_DIR" +fi + +srun -p ${PARTITION} \ + --gres=gpu:${GPUS_PER_NODE} \ + --nodes=${NODES} \ + --ntasks=${GPUS} \ + --ntasks-per-node=${GPUS_PER_NODE} \ + --cpus-per-task=${CPUS_PER_TASK} \ + --kill-on-bad-exit=1 \ + --quotatype=${QUOTA_TYPE} \ + ${SRUN_ARGS} \ + python -u internvl/train/internvl_chat_dpo.py \ + --model_name_or_path "OpenGVLab/InternVL2_5-8B" \ + --conv_style "internvl2_5" \ + --output_dir ${OUTPUT_DIR} \ + --meta_path "MMPR-v1.1/meta.json" \ + --overwrite_output_dir True \ + --force_image_size 448 \ + --down_sample_ratio 0.5 \ + --drop_path_rate 0.1 \ + --pad2square False \ + --freeze_llm False \ + --freeze_mlp False \ + --freeze_backbone False \ + --vision_select_layer -1 \ + --use_data_resampling False \ + --dataloader_num_workers 8 \ + --bf16 True \ + --num_train_epochs 1 \ + --per_device_train_batch_size ${PER_DEVICE_BATCH_SIZE} \ + --gradient_accumulation_steps ${GRADIENT_ACC} \ + --evaluation_strategy "no" \ + --save_strategy "no" \ + --save_steps 100 \ + --save_total_limit 100 \ + --learning_rate 1e-6 \ + --weight_decay 0.05 \ + --warmup_ratio 0.03 \ + 
--lr_scheduler_type "cosine" \ + --logging_steps 1 \ + --max_seq_length 8192 \ + --do_train True \ + --grad_checkpoint True \ + --group_by_length False \ + --dynamic_image_size True \ + --use_thumbnail True \ + --ps_version 'v2' \ + --deepspeed "zero_stage1_config.json" \ + --report_to "tensorboard" \ + --loss_type sigmoid,bco_pair \ + --sigmoid_loss_weight 0.8 \ + --bco_pair_loss_weight 0.2 \ + --rpo_alpha 1 \ + --use_liger True \ + 2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt" diff --git a/VLMEvalKit_old/InternVL/internvl_chat/shell/mini_internvl/README.md b/VLMEvalKit_old/InternVL/internvl_chat/shell/mini_internvl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7f97f9fa0682c7cc30896ea3ae62e4d63d5de2c5 --- /dev/null +++ b/VLMEvalKit_old/InternVL/internvl_chat/shell/mini_internvl/README.md @@ -0,0 +1,102 @@ +# Mini-InternVL: A Flexible-Transfer Pocket Multimodal Model with 5% Parameters and 90% Performance + +## Introduction + +We introduce Mini-InternVL, a series of MLLMs with parameters ranging from 1B to 4B, which achieves 90% of the performance with only 5% of the parameters. +This significant improvement in efficiency and effectiveness makes our models more accessible and applicable in various real-world scenarios. + + + +- InternViT-300M + +We employ InternViT-300M as our visual encoder, a lightweight vision model that inherits the capabilities of a powerful vision encoder. We directly leverage InternViT-6B that has undergone generative training on diverse datasets to transfer knowledge to a lightweight vision model, CLIP-ViT-L-336px. + +- Adaptation for Mini-InternVL + +To further promote the adoption of our models, we develop a unified adaptation framework for Mini-InternVL, which enables our models to transfer and outperform specialized models in downstream tasks, including autonomous driving, medical images, and remote sensing. We hope to provide insights into the application of MLLMs. 
+ +## Models and Performance + +| Model | MMMU (val) | MathVista (testmini) | AI2D | ChartQA | DocVQA | InfoVQA | OCRBench | MMB-EN | MMB-CN | Avg. Score | +| :--------------------------------------------------------------------: | :--------: | :------------------: | :--: | :-----: | :----: | :-----: | :------: | :----: | -----: | :--------: | +| Claude3.5-Sonnet | 65.9 | 67.7 | 94.7 | 90.8 | 95.2 | - | 788 | 79.7 | 80.7 | 81.7 | +| InternVL2-Llama3-76B | 58.2 | 65.5 | 87.6 | 88.4 | 94.1 | 82.0 | 839 | 86.5 | 86.3 | 81.4 | +| Mini-InternVL-1B ([🤗](https://huggingface.co/OpenGVLab/InternVL2-1B)) | 36.7 | 37.7 | 64.1 | 72.9 | 81.7 | 50.9 | 754 | 65.4 | 60.7 | 60.6 (74%) | +| Mini-InternVL-2B ([🤗](https://huggingface.co/OpenGVLab/InternVL2-2B)) | 36.3 | 46.3 | 74.1 | 76.2 | 86.9 | 58.9 | 784 | 73.2 | 70.9 | 66.8 (82%) | +| Mini-InternVL-4B ([🤗](https://huggingface.co/OpenGVLab/InternVL2-4B)) | 48.3 | 58.6 | 78.9 | 81.5 | 89.2 | 67.0 | 788 | 78.6 | 73.9 | 72.8 (90%) | + +- We evaluate models using the InternVL and VLMEvalKit repositories. AI2D, ChartQA, DocVQA, InfoVQA, and MMBench are tested with InternVL, while MathVista and OCRBench use VLMEvalKit. For MMMU, we report scores from the OpenCompass leaderboard. + +- The Avg. Score is the average of the scores from all tested benchmarks, with the OCRBench score divided by 10. The values in parentheses represent the relative parameters and performance of Mini-InternVL compared to *InternVL2-Llama3-76B*, which is considered as 100%. + +## Domain Adaptation + +Visual tasks (*e.g.*, image classification, region perception, multi-view image tasks, video-related tasks, and visual grounding) can be formulated into VQA format. + + + +In the [document](https://internvl.readthedocs.io/en/latest/internvl2.0/domain_adaptation.html), we provide detailed information on the datasets and the fine-tuning process. + +### Adaptation models + +We have released the adaptation models for the following four domains.
The script for evaluation is in the [document](https://internvl.readthedocs.io/en/latest/internvl2.0/domain_adaptation.html#id3). + +
| Model Name | +HF Link | +Note | +
|---|---|---|
| Mini-InternVL2-DA-Drivelm | +🤗1B / 🤗2B / 🤗4B | +Adaptation for CVPR 2024 Autonomous Driving Challenge | +
| Mini-InternVL2-DA-BDD | +🤗1B / 🤗2B / 🤗4B | +Fine-tuning with data constructed by DriveGPT4 | +
| Mini-InternVL2-DA-RS | +🤗1B / 🤗2B / 🤗4B | +Adaptation for remote sensing domain | +
| Mini-InternVL2-DA-Medical | +🤗1B / 🤗2B / 🤗4B | +Fine-tuning using our medical data. | +