dreilly committed on
Commit
2ca7930
·
verified ·
1 Parent(s): ea75dd1

Upload folder using huggingface_hub

Browse files
viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+ ### Framework versions
7
+
8
+
9
+ - PEFT 0.4.0
viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "/home/dreilly1/Projects/VisCoP/base_vlm/videollama3-image_7b_local",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 64,
15
+ "revision": null,
16
+ "target_modules": ".*model\\.layers\\..*\\.(gate_proj|o_proj|q_proj|v_proj|up_proj|down_proj|k_proj)$",
17
+ "task_type": "CAUSAL_LM"
18
+ }
viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54b0d449306b48fc0193eeb2dec4280c5d91eb1180bc00faaf1e134edd727cbe
3
+ size 323097578
viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/config.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "/home/dreilly1/Projects/VisCoP/base_vlm/videollama3-image_7b_local",
4
+ "architectures": [
5
+ "Videollama3Qwen2ForCausalLM"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config",
10
+ "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM"
11
+ },
12
+ "bos_token_id": 151643,
13
+ "eos_token_id": 151645,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 3584,
16
+ "image_aspect_ratio": "square",
17
+ "image_size": -1,
18
+ "image_token_index": 151665,
19
+ "image_token_length": 1,
20
+ "include_visual_probes": true,
21
+ "include_visual_tokens": true,
22
+ "initializer_range": 0.02,
23
+ "interaction_module": "cross_attention",
24
+ "interaction_module_layers": null,
25
+ "intermediate_size": 18944,
26
+ "is_alignment": false,
27
+ "llm_lr": 1e-05,
28
+ "max_frames": 180,
29
+ "max_position_embeddings": 32768,
30
+ "max_window_layers": 28,
31
+ "mm_hidden_size": 1152,
32
+ "mm_projector_lr": 1e-05,
33
+ "mm_projector_type": "mlp2x_gelu",
34
+ "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
35
+ "mm_vision_select_feature": "patch",
36
+ "mm_vision_select_layer": -1,
37
+ "model_type": "viscop_qwen2",
38
+ "num_attention_heads": 28,
39
+ "num_hidden_layers": 28,
40
+ "num_key_value_heads": 4,
41
+ "num_visual_probes": 16,
42
+ "probe_token_index": 151668,
43
+ "rms_norm_eps": 1e-06,
44
+ "rope_scaling": null,
45
+ "rope_theta": 1000000.0,
46
+ "sliding_window": null,
47
+ "tie_word_embeddings": false,
48
+ "tokenizer_model_max_length": 16384,
49
+ "tokenizer_padding_side": "right",
50
+ "torch_dtype": "bfloat16",
51
+ "transformers_version": "4.46.3",
52
+ "use_cache": true,
53
+ "use_mm_proj": true,
54
+ "use_sliding_window": false,
55
+ "use_token_compression": false,
56
+ "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
57
+ "vision_encoder_config": {
58
+ "_attn_implementation_autoset": false,
59
+ "_name_or_path": "",
60
+ "add_cross_attention": false,
61
+ "architectures": null,
62
+ "attention_dropout": 0.0,
63
+ "bad_words_ids": null,
64
+ "begin_suppress_tokens": null,
65
+ "bos_token_id": null,
66
+ "chunk_size_feed_forward": 0,
67
+ "cross_attention_hidden_size": null,
68
+ "decoder_start_token_id": null,
69
+ "diversity_penalty": 0.0,
70
+ "do_sample": false,
71
+ "early_stopping": false,
72
+ "encoder_no_repeat_ngram_size": 0,
73
+ "eos_token_id": null,
74
+ "exponential_decay_length_penalty": null,
75
+ "finetuning_task": null,
76
+ "forced_bos_token_id": null,
77
+ "forced_eos_token_id": null,
78
+ "hidden_act": "gelu_pytorch_tanh",
79
+ "hidden_size": 1152,
80
+ "id2label": {
81
+ "0": "LABEL_0",
82
+ "1": "LABEL_1"
83
+ },
84
+ "intermediate_size": 4304,
85
+ "is_decoder": false,
86
+ "is_encoder_decoder": false,
87
+ "label2id": {
88
+ "LABEL_0": 0,
89
+ "LABEL_1": 1
90
+ },
91
+ "layer_norm_eps": 1e-06,
92
+ "length_penalty": 1.0,
93
+ "max_length": 20,
94
+ "min_length": 0,
95
+ "model_type": "videollama3_vision_encoder",
96
+ "no_repeat_ngram_size": 0,
97
+ "num_attention_heads": 16,
98
+ "num_beam_groups": 1,
99
+ "num_beams": 1,
100
+ "num_channels": 3,
101
+ "num_hidden_layers": 27,
102
+ "num_return_sequences": 1,
103
+ "output_attentions": false,
104
+ "output_hidden_states": false,
105
+ "output_scores": false,
106
+ "pad_token_id": null,
107
+ "patch_size": 14,
108
+ "prefix": null,
109
+ "problem_type": null,
110
+ "pruned_heads": {},
111
+ "remove_invalid_values": false,
112
+ "repetition_penalty": 1.0,
113
+ "return_dict": true,
114
+ "return_dict_in_generate": false,
115
+ "sep_token_id": null,
116
+ "suppress_tokens": null,
117
+ "task_specific_params": null,
118
+ "temperature": 1.0,
119
+ "tf_legacy_loss": false,
120
+ "tie_encoder_decoder": false,
121
+ "tie_word_embeddings": true,
122
+ "tokenizer_class": null,
123
+ "top_k": 50,
124
+ "top_p": 1.0,
125
+ "torch_dtype": null,
126
+ "torchscript": false,
127
+ "typical_p": 1.0,
128
+ "use_bfloat16": false
129
+ },
130
+ "vision_encoder_lr": null,
131
+ "vocab_size": 152064
132
+ }
viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/non_lora_trainables.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf012cd5517edf0873ba621833c3bb639f70191e9f9d733ff69e0bcdd71628be
3
+ size 354950700
viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/train_viscop.sh ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # Environment Variables
3
+ ARG_WORLD_SIZE=${1:-1}
4
+ ARG_NPROC_PER_NODE=${2:-8}
5
+
6
+ if [[ -v MASTER_ADDR_PASSED ]]; then
7
+ ARG_MASTER_ADDR=$MASTER_ADDR_PASSED # passed via slurm submission script
8
+ else
9
+ ARG_MASTER_ADDR=127.0.0.1 # for dev environments
10
+ fi
11
+ ARG_MASTER_PORT=12355
12
+ # ARG_RANK=$SLURM_NODEID
13
+ ARG_RANK=0
14
+
15
+ # Fall back to the script arguments (or their defaults) when WORLD_SIZE or NPROC_PER_NODE is unset or empty
16
+ if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
17
+ WORLD_SIZE=$ARG_WORLD_SIZE
18
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
19
+ fi
20
+
21
+ if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
22
+ MASTER_ADDR=$ARG_MASTER_ADDR
23
+ MASTER_PORT=$ARG_MASTER_PORT
24
+ RANK=$ARG_RANK
25
+ fi
26
+
27
+ echo "MASTER_ADDR: $MASTER_ADDR. MASTER_PORT: $MASTER_PORT. RANK: $RANK"
28
+ echo "WORLD_SIZE: $WORLD_SIZE"
29
+ echo "NPROC_PER_NODE: $NPROC_PER_NODE"
30
+
31
+ # Training Arguments
32
+ GLOBAL_BATCH_SIZE=128 # aka effective batch size
33
+ LOCAL_BATCH_SIZE=8 # batch size per GPU
34
+ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
35
+ echo $GRADIENT_ACCUMULATION_STEPS
36
+
37
+ INIT_MODEL=/home/dreilly1/Projects/VisCoP/base_vlm/videollama3-image_7b_local # path to base VLM (for VisCoP we use VideoLLaMA3 as the base VLM)
38
+
39
+ NUM_DATA_WORKERS=8
40
+ NUM_TRAIN_EPOCHS=3
41
+ LORA_TRAINING=True
42
+
43
+ # VisCoP Arguments
44
+ NUM_VISUAL_PROBES=16
45
+ INTERACTION_MODULE_POS=all
46
+ PASS_PROBES_TO_LLM=True
47
+ PASS_VIS_FEATURES_TO_LLM=True
48
+
49
+ # Logging Arguments
50
+ export WANDB_PROJECT=sony26_mm_viscop
51
+ REPORT_TO=wandb
52
+ OUTP_DIR=work_dirs/egoexo
53
+ RUN_NAME=viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer
54
+
55
+ # Data Arguments
56
+ # DATA_DIR=/home/dreilly1/Projects/paired_egoexo_dl/paired_videos_DATA/
57
+ DATA_DIR=/data/dreilly1/EgoExo4D_symlink/
58
+ TRAINING_JSON="/home/dreilly1/Projects/viscop_sony/training_jsons/train-instr_viscop_egoview_ALLDATA.json"
59
+
60
+ # if [[ $TRAINING_JSON == *"egoview"* ]]; then
61
+ # MAX_FRAMES=40 # use 40 frames for training on ego
62
+ # else
63
+ # MAX_FRAMES=180
64
+ # fi
65
+ MAX_FRAMES=180
66
+
67
+ # Optional Arguments. Set TESTING to 1 to quickly test the training script without logging or data workers, useful for debugging
68
+ TESTING=0
69
+ if [ $TESTING -eq 1 ]; then
70
+ NUM_DATA_WORKERS=0
71
+ REPORT_TO=none
72
+ RUN_NAME=TESTING
73
+ fi
74
+
75
+ mkdir -p "${OUTP_DIR}/${RUN_NAME}/"
76
+ cp "$0" "${OUTP_DIR}/${RUN_NAME}/"
77
+
78
+ torchrun --nnodes $WORLD_SIZE \
79
+ --nproc_per_node $NPROC_PER_NODE \
80
+ --master_addr=$MASTER_ADDR \
81
+ --master_port=$MASTER_PORT \
82
+ --node_rank $RANK \
83
+ viscop/train_viscop.py \
84
+ --interaction_module_layers $INTERACTION_MODULE_POS \
85
+ --lora_enable $LORA_TRAINING \
86
+ --num_train_epochs $NUM_TRAIN_EPOCHS \
87
+ --deepspeed scripts/zero2.json \
88
+ --model_type viscop_qwen2 \
89
+ --model_path $INIT_MODEL \
90
+ --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \
91
+ --mm_projector_type mlp2x_gelu \
92
+ --data_path $TRAINING_JSON \
93
+ --data_folder $DATA_DIR \
94
+ --image_merge_size 2 \
95
+ --video_merge_size 2 \
96
+ --fps 1 \
97
+ --max_frames $MAX_FRAMES \
98
+ --model_max_length 16384 \
99
+ --mm_max_length 10240 \
100
+ --bf16 True \
101
+ --tf32 True \
102
+ --fp16 False \
103
+ --output_dir ${OUTP_DIR}/${RUN_NAME} \
104
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
105
+ --per_device_eval_batch_size 2 \
106
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
107
+ --evaluation_strategy "no" \
108
+ --save_strategy "no" \
109
+ --save_steps 5000 \
110
+ --save_total_limit 1 \
111
+ --mm_projector_lr 1e-5 \
112
+ --llm_lr 1e-5 \
113
+ --weight_decay 0. \
114
+ --warmup_ratio 0.03 \
115
+ --lr_scheduler_type "cosine" \
116
+ --logging_steps 1 \
117
+ --gradient_checkpointing True \
118
+ --dataloader_num_workers $NUM_DATA_WORKERS \
119
+ --report_to $REPORT_TO \
120
+ --run_name $RUN_NAME \
121
+ --dataset_cache_dir /home/dreilly1/.cache/viscop_datasetcache \
122
+ --include_visual_tokens $PASS_VIS_FEATURES_TO_LLM \
123
+ --include_visual_probes $PASS_PROBES_TO_LLM \
124
+ --num_visual_probes $NUM_VISUAL_PROBES
viscop_qwen2.5_7b_EgoExo4D-EGOvideos_train-VisCoP_projector-LLM_LoRA_CorrectedOptimizer/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff