Wangpeng An committed on
Upload folder using huggingface_hub
Browse files
- config.json +4 -6
- finetune_all_multinode_stage4.sh +8 -10
- model-00001-of-00004.safetensors +1 -1
- model-00002-of-00004.safetensors +1 -1
- model-00003-of-00004.safetensors +1 -1
- model-00004-of-00004.safetensors +1 -1
- tokenizer_config.json +1 -1
- trainer_state.json +0 -0
- training_args.bin +1 -1
config.json
CHANGED
|
@@ -1,11 +1,8 @@
|
|
| 1 |
{
|
| 2 |
"X": [
|
| 3 |
-
"Audio_asr",
|
| 4 |
-
"Audio_caption",
|
| 5 |
-
"Video",
|
| 6 |
-
"Image"
|
| 7 |
],
|
| 8 |
-
"_name_or_path": "/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-main/checkpoints/OmniFusion-8B-stage3-1018",
|
| 9 |
"architectures": [
|
| 10 |
"LlavaLlamaForCausalLM"
|
| 11 |
],
|
|
@@ -24,6 +21,7 @@
|
|
| 24 |
"image_grid_pinpoints": null,
|
| 25 |
"initializer_range": 0.02,
|
| 26 |
"intermediate_size": 14336,
|
|
|
|
| 27 |
"max_position_embeddings": 131072,
|
| 28 |
"mlp_bias": false,
|
| 29 |
"mm_audio_caption_tower": "/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio",
|
|
@@ -51,7 +49,7 @@
|
|
| 51 |
},
|
| 52 |
"rope_theta": 500000.0,
|
| 53 |
"tie_word_embeddings": false,
|
| 54 |
-
"tokenizer_model_max_length":
|
| 55 |
"torch_dtype": "bfloat16",
|
| 56 |
"transformers_version": "4.43.1",
|
| 57 |
"tune_mm_mlp_adapter": false,
|
|
|
|
| 1 |
{
|
| 2 |
"X": [
|
| 3 |
+
"Video"
|
|
|
|
|
|
|
|
|
|
| 4 |
],
|
| 5 |
+
"_name_or_path": "/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/tmp/OmniFusion-main/checkpoints/OmniFusion-8B-stage3-1018",
|
| 6 |
"architectures": [
|
| 7 |
"LlavaLlamaForCausalLM"
|
| 8 |
],
|
|
|
|
| 21 |
"image_grid_pinpoints": null,
|
| 22 |
"initializer_range": 0.02,
|
| 23 |
"intermediate_size": 14336,
|
| 24 |
+
"is_fusion": true,
|
| 25 |
"max_position_embeddings": 131072,
|
| 26 |
"mlp_bias": false,
|
| 27 |
"mm_audio_caption_tower": "/mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio",
|
|
|
|
| 49 |
},
|
| 50 |
"rope_theta": 500000.0,
|
| 51 |
"tie_word_embeddings": false,
|
| 52 |
+
"tokenizer_model_max_length": 6144,
|
| 53 |
"torch_dtype": "bfloat16",
|
| 54 |
"transformers_version": "4.43.1",
|
| 55 |
"tune_mm_mlp_adapter": false,
|
finetune_all_multinode_stage4.sh
CHANGED
|
@@ -27,7 +27,7 @@ echo "master port: ${port}"
|
|
| 27 |
|
| 28 |
source /mnt/bn/tns-algo-video-public-my2/wangpeng.an/environment/anaconda3/bin/activate multimodal
|
| 29 |
|
| 30 |
-
cd /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-
|
| 31 |
|
| 32 |
# Install necessary packages
|
| 33 |
pip3 install requests
|
|
@@ -48,14 +48,11 @@ sudo chmod 777 /var/lib/fastrak -R
|
|
| 48 |
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=$ARNOLD_WORKER_NUM --node_rank=$ARNOLD_ID --master_addr=$METIS_WORKER_0_HOST --master_port=$port \
|
| 49 |
llava/train/train_mem.py \
|
| 50 |
--deepspeed ./scripts/zero2.json \
|
| 51 |
-
--model_name_or_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-main/checkpoints/OmniFusion-8B-stage3-1018 \
|
| 52 |
--version llama_3_1 \
|
| 53 |
-
--data_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/
|
| 54 |
-
--audio_asr_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data \
|
| 55 |
-
--audio_caption_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage2/audio_caption_data_tune/audio_caption_tune/audio_caption \
|
| 56 |
--video_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/Video-LLaVA \
|
| 57 |
-
--
|
| 58 |
-
--X "Audio_asr" "Audio_caption" "Video" "Image" \
|
| 59 |
--audio_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr \
|
| 60 |
--audio_caption_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio \
|
| 61 |
--video_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge \
|
|
@@ -64,10 +61,11 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=
|
|
| 64 |
--mm_vision_select_layer -2 \
|
| 65 |
--mm_use_x_start_end False \
|
| 66 |
--mm_use_x_patch_token False \
|
|
|
|
| 67 |
--image_aspect_ratio pad \
|
| 68 |
--group_by_modality_length True \
|
| 69 |
--bf16 True \
|
| 70 |
-
--output_dir ./checkpoints/OmniFusion-8B-stage4-
|
| 71 |
--num_train_epochs 1 \
|
| 72 |
--per_device_train_batch_size 8 \
|
| 73 |
--per_device_eval_batch_size 4 \
|
|
@@ -82,8 +80,8 @@ ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=
|
|
| 82 |
--lr_scheduler_type "cosine" \
|
| 83 |
--logging_steps 1 \
|
| 84 |
--tf32 True \
|
| 85 |
-
--model_max_length
|
| 86 |
-
--tokenizer_model_max_length
|
| 87 |
--gradient_checkpointing True \
|
| 88 |
--dataloader_num_workers 8 \
|
| 89 |
--lazy_preprocess True \
|
|
|
|
| 27 |
|
| 28 |
source /mnt/bn/tns-algo-video-public-my2/wangpeng.an/environment/anaconda3/bin/activate multimodal
|
| 29 |
|
| 30 |
+
cd /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/OmniFusion-main
|
| 31 |
|
| 32 |
# Install necessary packages
|
| 33 |
pip3 install requests
|
|
|
|
| 48 |
ACCELERATE_CPU_AFFINITY=1 torchrun --nproc_per_node=$ARNOLD_WORKER_GPU --nnodes=$ARNOLD_WORKER_NUM --node_rank=$ARNOLD_ID --master_addr=$METIS_WORKER_0_HOST --master_port=$port \
|
| 49 |
llava/train/train_mem.py \
|
| 50 |
--deepspeed ./scripts/zero2.json \
|
| 51 |
+
--model_name_or_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/tmp/OmniFusion-main/checkpoints/OmniFusion-8B-stage3-1018 \
|
| 52 |
--version llama_3_1 \
|
| 53 |
+
--data_path /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/stage4_1031/videochatgpt_tune_stage4.json \
|
|
|
|
|
|
|
| 54 |
--video_folder /mnt/bn/tns-algo-video-public-my2/wangpeng.an/data/Video-LLaVA \
|
| 55 |
+
--X "Video" \
|
|
|
|
| 56 |
--audio_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/LanguageBind_Audio_Asr \
|
| 57 |
--audio_caption_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Audio \
|
| 58 |
--video_tower /mnt/bn/tns-algo-video-public-my2/wangpeng.an/model/pretrained_model/LanguageBind_Video_merge \
|
|
|
|
| 61 |
--mm_vision_select_layer -2 \
|
| 62 |
--mm_use_x_start_end False \
|
| 63 |
--mm_use_x_patch_token False \
|
| 64 |
+
--is_fusion True \
|
| 65 |
--image_aspect_ratio pad \
|
| 66 |
--group_by_modality_length True \
|
| 67 |
--bf16 True \
|
| 68 |
+
--output_dir ./checkpoints/OmniFusion-8B-stage4-1031 \
|
| 69 |
--num_train_epochs 1 \
|
| 70 |
--per_device_train_batch_size 8 \
|
| 71 |
--per_device_eval_batch_size 4 \
|
|
|
|
| 80 |
--lr_scheduler_type "cosine" \
|
| 81 |
--logging_steps 1 \
|
| 82 |
--tf32 True \
|
| 83 |
+
--model_max_length 4096 \
|
| 84 |
+
--tokenizer_model_max_length 6144 \
|
| 85 |
--gradient_checkpointing True \
|
| 86 |
--dataloader_num_workers 8 \
|
| 87 |
--lazy_preprocess True \
|
model-00001-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4976698672
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:346f2705a5e9ae972033847876dea0bcedd1ee56116c9c23cfa72ded222e8214
|
| 3 |
size 4976698672
|
model-00002-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4999802720
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7bab15611dfc404180dd875368f2e343eef0be202143436f74781821fdb00be9
|
| 3 |
size 4999802720
|
model-00003-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 4915916176
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:134edcbecc2c4602a9f3276d0014ec68dfd9b379af4b1b750f6524cabb9703ab
|
| 3 |
size 4915916176
|
model-00004-of-00004.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 3851682320
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37ba906825d1bb6330573ed40c45cfa9dcb3e28a86ddf494f4f498321e6fbb86
|
| 3 |
size 3851682320
|
tokenizer_config.json
CHANGED
|
@@ -2057,7 +2057,7 @@
|
|
| 2057 |
"input_ids",
|
| 2058 |
"attention_mask"
|
| 2059 |
],
|
| 2060 |
-
"model_max_length":
|
| 2061 |
"pad_token": "<|finetune_right_pad_id|>",
|
| 2062 |
"padding_side": "right",
|
| 2063 |
"tokenizer_class": "PreTrainedTokenizerFast"
|
|
|
|
| 2057 |
"input_ids",
|
| 2058 |
"attention_mask"
|
| 2059 |
],
|
| 2060 |
+
"model_max_length": 4096,
|
| 2061 |
"pad_token": "<|finetune_right_pad_id|>",
|
| 2062 |
"padding_side": "right",
|
| 2063 |
"tokenizer_class": "PreTrainedTokenizerFast"
|
trainer_state.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 6776
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:145ed827bee3a57b7ae1ffa2e2548128c776658cc3e524c09ac1865e2a584bf7
|
| 3 |
size 6776
|