dreilly committed on
Commit
94c170f
·
verified ·
1 Parent(s): 25aacfd

Upload folder using huggingface_hub

Browse files
viscop_qwen2.5_7b_viscop-lora_depth-expert/README.md ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+ ### Framework versions
7
+
8
+ - PEFT 0.4.0
9
+ - PEFT 0.4.0
10
+
11
+ - PEFT 0.4.0
viscop_qwen2.5_7b_viscop-lora_depth-expert/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "/scratch/10580/dreilly1/videollama3-image_7b_local",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 64,
15
+ "revision": null,
16
+ "target_modules": ".*model\\.layers\\..*\\.(q_proj|gate_proj|v_proj|o_proj|up_proj|k_proj|down_proj)$",
17
+ "task_type": "CAUSAL_LM"
18
+ }
viscop_qwen2.5_7b_viscop-lora_depth-expert/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:475e86a97f1411f5d610d6a8121ab967176c306a32fcad23f5e523bdab331211
3
+ size 323097987
viscop_qwen2.5_7b_viscop-lora_depth-expert/config.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "./base_vlm/videollama3-image_7b_local/",
4
+ "architectures": [
5
+ "Videollama3Qwen2ForCausalLM"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config",
10
+ "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM"
11
+ },
12
+ "bos_token_id": 151643,
13
+ "ca_module_version": "v1",
14
+ "eos_token_id": 151645,
15
+ "hidden_act": "silu",
16
+ "hidden_size": 3584,
17
+ "image_aspect_ratio": "square",
18
+ "image_size": -1,
19
+ "image_token_index": 151665,
20
+ "image_token_length": 1,
21
+ "include_visual_tokens": true,
22
+ "initializer_range": 0.02,
23
+ "intermediate_size": 18944,
24
+ "is_alignment": false,
25
+ "llm_lr": 1e-05,
26
+ "max_frames": 180,
27
+ "max_position_embeddings": 32768,
28
+ "max_window_layers": 28,
29
+ "mm_hidden_size": 1152,
30
+ "mm_projector_lr": 1e-05,
31
+ "mm_projector_type": "mlp2x_gelu",
32
+ "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
33
+ "mm_vision_select_feature": "patch",
34
+ "mm_vision_select_layer": -1,
35
+ "model_type": "viscop_qwen2",
36
+ "num_attention_heads": 28,
37
+ "num_hidden_layers": 28,
38
+ "num_key_value_heads": 4,
39
+ "query_ca_layers": null,
40
+ "rms_norm_eps": 1e-06,
41
+ "rope_scaling": null,
42
+ "rope_theta": 1000000.0,
43
+ "sliding_window": null,
44
+ "tie_word_embeddings": false,
45
+ "tokenizer_model_max_length": 16384,
46
+ "tokenizer_padding_side": "right",
47
+ "torch_dtype": "bfloat16",
48
+ "transformers_version": "4.46.3",
49
+ "use_cache": true,
50
+ "use_learnable_queries": true,
51
+ "use_mm_proj": true,
52
+ "use_sliding_window": false,
53
+ "use_token_compression": false,
54
+ "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
55
+ "vision_encoder_config": {
56
+ "_attn_implementation_autoset": false,
57
+ "_name_or_path": "",
58
+ "add_cross_attention": false,
59
+ "architectures": null,
60
+ "attention_dropout": 0.0,
61
+ "bad_words_ids": null,
62
+ "begin_suppress_tokens": null,
63
+ "bos_token_id": null,
64
+ "chunk_size_feed_forward": 0,
65
+ "cross_attention_hidden_size": null,
66
+ "decoder_start_token_id": null,
67
+ "diversity_penalty": 0.0,
68
+ "do_sample": false,
69
+ "early_stopping": false,
70
+ "encoder_no_repeat_ngram_size": 0,
71
+ "eos_token_id": null,
72
+ "exponential_decay_length_penalty": null,
73
+ "finetuning_task": null,
74
+ "forced_bos_token_id": null,
75
+ "forced_eos_token_id": null,
76
+ "hidden_act": "gelu_pytorch_tanh",
77
+ "hidden_size": 1152,
78
+ "id2label": {
79
+ "0": "LABEL_0",
80
+ "1": "LABEL_1"
81
+ },
82
+ "intermediate_size": 4304,
83
+ "is_decoder": false,
84
+ "is_encoder_decoder": false,
85
+ "label2id": {
86
+ "LABEL_0": 0,
87
+ "LABEL_1": 1
88
+ },
89
+ "layer_norm_eps": 1e-06,
90
+ "length_penalty": 1.0,
91
+ "max_length": 20,
92
+ "min_length": 0,
93
+ "model_type": "videollama3_vision_encoder",
94
+ "no_repeat_ngram_size": 0,
95
+ "num_attention_heads": 16,
96
+ "num_beam_groups": 1,
97
+ "num_beams": 1,
98
+ "num_channels": 3,
99
+ "num_hidden_layers": 27,
100
+ "num_return_sequences": 1,
101
+ "output_attentions": false,
102
+ "output_hidden_states": false,
103
+ "output_scores": false,
104
+ "pad_token_id": null,
105
+ "patch_size": 14,
106
+ "prefix": null,
107
+ "problem_type": null,
108
+ "pruned_heads": {},
109
+ "remove_invalid_values": false,
110
+ "repetition_penalty": 1.0,
111
+ "return_dict": true,
112
+ "return_dict_in_generate": false,
113
+ "sep_token_id": null,
114
+ "suppress_tokens": null,
115
+ "task_specific_params": null,
116
+ "temperature": 1.0,
117
+ "tf_legacy_loss": false,
118
+ "tie_encoder_decoder": false,
119
+ "tie_word_embeddings": true,
120
+ "tokenizer_class": null,
121
+ "top_k": 50,
122
+ "top_p": 1.0,
123
+ "torch_dtype": null,
124
+ "torchscript": false,
125
+ "typical_p": 1.0,
126
+ "use_bfloat16": false
127
+ },
128
+ "vision_encoder_lr": null,
129
+ "vocab_size": 152064,
130
+ "include_visual_probes": true,
131
+ "interaction_module": "cross_attention",
132
+ "interaction_module_layers": null,
133
+ "num_visual_probes": 16
134
+ }
viscop_qwen2.5_7b_viscop-lora_depth-expert/cross_view_queries.sh ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # nohup bash -c 'while true; do nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv > nvidia_log.txt; sleep 1; done' &
3
+ export PATH=/scratch/10580/dreilly1/ffmpeg_link/:$PATH
4
+
5
+ # Launch settings: $1 = number of nodes (default 1), $2 = processes per node (default 8)
6
+ ARG_WORLD_SIZE=${1:-1}
7
+ ARG_NPROC_PER_NODE=${2:-8}
8
+
9
+ if [[ -v MASTER_ADDR_PASSED ]]; then
10
+ ARG_MASTER_ADDR=$MASTER_ADDR_PASSED # passed via slurm submission script
11
+ else
12
+ ARG_MASTER_ADDR=127.0.0.1 # for dev environments
13
+ fi
14
+ ARG_MASTER_PORT=12355
15
+ ARG_RANK=$SLURM_NODEID
16
+
17
+ # Fall back to the ARG_* defaults above when the launch variables are unset or empty
18
+ if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
19
+ WORLD_SIZE=$ARG_WORLD_SIZE
20
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
21
+ fi
22
+
23
+ if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
24
+ MASTER_ADDR=$ARG_MASTER_ADDR
25
+ MASTER_PORT=$ARG_MASTER_PORT
26
+ RANK=$ARG_RANK
27
+ fi
28
+
29
+ echo "MASTER_ADDR: $MASTER_ADDR. MASTER_PORT: $MASTER_PORT. RANK: $RANK"
30
+ echo "WORLD_SIZE: $WORLD_SIZE"
31
+ echo "NPROC_PER_NODE: $NPROC_PER_NODE"
32
+
33
+ # Training Arguments
34
+ GLOBAL_BATCH_SIZE=128 # 128
35
+ LOCAL_BATCH_SIZE=4
36
+ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
37
+ echo $GRADIENT_ACCUMULATION_STEPS
38
+
39
+ NUM_DATA_WORKERS=8
40
+ NUM_TRAIN_EPOCHS=3
41
+
42
+ # Log Arguments
43
+ export WANDB_PROJECT=egoexo
44
+ REPORT_TO=none
45
+ DATA_DIR=/scratch/10580/dreilly1/EgoExo4D/keystep_segments_depth
46
+ OUTP_DIR=work_dirs/egoexo
47
+
48
+ CA_LAYER_VERSION=v1
49
+ RUN_NAME=videollama3_qwen2.5_7b_DEPTHEgoExo4D-S4-Captioned-EXOonly_train-projector-CA$CA_LAYER_VERSION-Queries-LLM_LoRAr16_16global_NODIST
50
+
51
+ # TRAINING_JSON="training_jsons/egoexo4d-allactions-exoviews.json"
52
+ # TRAINING_JSON="training_jsons/egoexo4d-allactions-egoview.json"
53
+
54
+ TRAINING_JSON="training_jsons/egoexo_vllama3-S4_caption-inst/egoexo4d-vllama3-S4cap-exoviews.json"
55
+ # TRAINING_JSON="training_jsons/egoexo_vllama3-S4_caption-inst/egoexo4d-vllama3-S4cap-egoview.json"
56
+
57
+ LORA_TRAINING=True # > Must pass llm_lr if doing LoRA training
58
+
59
+ # > Cross view options
60
+ NUM_LATENT_QUERIES=16
61
+
62
+ if [[ $TRAINING_JSON == *"exoviews"* ]]; then
63
+ MAX_FRAMES=180
64
+ # DIST_DIR="/work/dreilly1/VideoLLaMA3/saved_dist_tokens/16global-EgoExo4D-S4-Captioned_ego-trained_visquery-to-LLM" # set to "none" to disable
65
+ DIST_DIR="none"
66
+ PASS_VIS=True
67
+ PASS_QUERY=True
68
+ elif [[ $TRAINING_JSON == *"egoview"* ]]; then
69
+ MAX_FRAMES=40
70
+ DIST_DIR="none"
71
+ PASS_VIS=False
72
+ PASS_QUERY=True
73
+ fi
74
+
75
+ PASS_VIS=True # ! unconditionally overrides the PASS_VIS value chosen in the if/elif above
76
+
77
+ TESTING=0
78
+ if [ $TESTING -eq 1 ]; then
79
+ NUM_DATA_WORKERS=0
80
+ REPORT_TO=none
81
+ RUN_NAME=TESTING
82
+ fi
83
+
84
+ mkdir -p "${OUTP_DIR}/${RUN_NAME}/"
85
+ cp "$0" "${OUTP_DIR}/${RUN_NAME}/"
86
+
87
+ INIT_MODEL=/scratch/10580/dreilly1/videollama3-image_7b_local # ! videollama3-image_7b_local
88
+
89
+ # Layers: 3,8,17,26
90
+ # Layers: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
91
+ torchrun --nnodes $WORLD_SIZE \
92
+ --nproc_per_node $NPROC_PER_NODE \
93
+ --master_addr=$MASTER_ADDR \
94
+ --master_port=$MASTER_PORT \
95
+ --node_rank $RANK \
96
+ videollama3/train_crossview_queries.py \
97
+ --query_ca_layers None \
98
+ --ca_module_version $CA_LAYER_VERSION \
99
+ --distillation_dir $DIST_DIR \
100
+ --lora_enable $LORA_TRAINING \
101
+ --num_train_epochs $NUM_TRAIN_EPOCHS \
102
+ --deepspeed scripts/zero1.json \
103
+ --model_type videollama3_qwen2_crossview_queries \
104
+ --model_path $INIT_MODEL \
105
+ --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \
106
+ --mm_projector_type mlp2x_gelu \
107
+ --data_path $TRAINING_JSON \
108
+ --data_folder ${DATA_DIR} \
109
+ --image_merge_size 2 \
110
+ --video_merge_size 2 \
111
+ --fps 1 \
112
+ --max_frames $MAX_FRAMES \
113
+ --model_max_length 16384 \
114
+ --mm_max_length 10240 \
115
+ --bf16 True \
116
+ --tf32 True \
117
+ --fp16 False \
118
+ --output_dir ${OUTP_DIR}/${RUN_NAME} \
119
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
120
+ --per_device_eval_batch_size 2 \
121
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
122
+ --evaluation_strategy "no" \
123
+ --save_strategy "no" \
124
+ --save_steps 5000 \
125
+ --save_total_limit 1 \
126
+ --mm_projector_lr 1e-5 \
127
+ --llm_lr 1e-5 \
128
+ --weight_decay 0. \
129
+ --warmup_ratio 0.03 \
130
+ --lr_scheduler_type "cosine" \
131
+ --logging_steps 1 \
132
+ --gradient_checkpointing True \
133
+ --dataloader_num_workers $NUM_DATA_WORKERS \
134
+ --report_to $REPORT_TO \
135
+ --run_name $RUN_NAME \
136
+ --dataset_cache_dir /scratch/10580/dreilly1/.cache/vllama3_datasetcache \
137
+ --include_visual_tokens $PASS_VIS \
138
+ --include_query_tokens $PASS_QUERY \
139
+ --num_latent_queries $NUM_LATENT_QUERIES
140
+ # --use_batch_flattening
141
+ # --llm_lr 1e-5 \
142
+ # --vision_encoder_lr 2e-6 \
143
+ # --mm_projector_lr 1e-5 \
viscop_qwen2.5_7b_viscop-lora_depth-expert/non_lora_trainables.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63b928f9670dbcffb1d9de5f63893d728f04ad6c005e67c38482b9ceee51f453
3
+ size 354951121
viscop_qwen2.5_7b_viscop-lora_depth-expert/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ ---
4
+ ## Training procedure
5
+
6
+ ### Framework versions
7
+
8
+
9
+ - PEFT 0.4.0
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/adapter_config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_mapping": null,
3
+ "base_model_name_or_path": "/work/dreilly1/videollama3-image_7b_local",
4
+ "bias": "none",
5
+ "fan_in_fan_out": false,
6
+ "inference_mode": true,
7
+ "init_lora_weights": true,
8
+ "layers_pattern": null,
9
+ "layers_to_transform": null,
10
+ "lora_alpha": 16,
11
+ "lora_dropout": 0.05,
12
+ "modules_to_save": null,
13
+ "peft_type": "LORA",
14
+ "r": 64,
15
+ "revision": null,
16
+ "target_modules": [
17
+ "k_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "up_proj",
21
+ "q_proj",
22
+ "v_proj",
23
+ "down_proj"
24
+ ],
25
+ "task_type": "CAUSAL_LM"
26
+ }
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:344525498c4532729f1abef575830691bde3db0846f3b0c1fdfa5bee74986da6
3
+ size 323097987
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/config.json ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "_name_or_path": "./base_vlm/videollama3-image_7b_local/",
4
+ "architectures": [
5
+ "Videollama3Qwen2ForCausalLM"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config",
10
+ "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM"
11
+ },
12
+ "bos_token_id": 151643,
13
+ "eos_token_id": 151645,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 3584,
16
+ "image_aspect_ratio": "square",
17
+ "image_size": -1,
18
+ "image_token_index": 151665,
19
+ "image_token_length": 1,
20
+ "include_visual_tokens": true,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 18944,
23
+ "is_alignment": false,
24
+ "llm_lr": 1e-05,
25
+ "max_frames": 40,
26
+ "max_position_embeddings": 32768,
27
+ "max_window_layers": 28,
28
+ "mm_hidden_size": 1152,
29
+ "mm_projector_lr": 1e-05,
30
+ "mm_projector_type": "mlp2x_gelu",
31
+ "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
32
+ "mm_vision_select_feature": "patch",
33
+ "mm_vision_select_layer": -1,
34
+ "model_type": "viscop_qwen2",
35
+ "num_attention_heads": 28,
36
+ "num_hidden_layers": 28,
37
+ "num_key_value_heads": 4,
38
+ "rms_norm_eps": 1e-06,
39
+ "rope_scaling": null,
40
+ "rope_theta": 1000000.0,
41
+ "sliding_window": null,
42
+ "tie_word_embeddings": false,
43
+ "tokenizer_model_max_length": 16384,
44
+ "tokenizer_padding_side": "right",
45
+ "torch_dtype": "bfloat16",
46
+ "transformers_version": "4.46.3",
47
+ "use_cache": true,
48
+ "use_learnable_queries": true,
49
+ "use_mm_proj": true,
50
+ "use_sliding_window": false,
51
+ "use_token_compression": false,
52
+ "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
53
+ "vision_encoder_config": {
54
+ "_attn_implementation_autoset": false,
55
+ "_name_or_path": "",
56
+ "add_cross_attention": false,
57
+ "architectures": null,
58
+ "attention_dropout": 0.0,
59
+ "bad_words_ids": null,
60
+ "begin_suppress_tokens": null,
61
+ "bos_token_id": null,
62
+ "chunk_size_feed_forward": 0,
63
+ "cross_attention_hidden_size": null,
64
+ "decoder_start_token_id": null,
65
+ "diversity_penalty": 0.0,
66
+ "do_sample": false,
67
+ "early_stopping": false,
68
+ "encoder_no_repeat_ngram_size": 0,
69
+ "eos_token_id": null,
70
+ "exponential_decay_length_penalty": null,
71
+ "finetuning_task": null,
72
+ "forced_bos_token_id": null,
73
+ "forced_eos_token_id": null,
74
+ "hidden_act": "gelu_pytorch_tanh",
75
+ "hidden_size": 1152,
76
+ "id2label": {
77
+ "0": "LABEL_0",
78
+ "1": "LABEL_1"
79
+ },
80
+ "intermediate_size": 4304,
81
+ "is_decoder": false,
82
+ "is_encoder_decoder": false,
83
+ "label2id": {
84
+ "LABEL_0": 0,
85
+ "LABEL_1": 1
86
+ },
87
+ "layer_norm_eps": 1e-06,
88
+ "length_penalty": 1.0,
89
+ "max_length": 20,
90
+ "min_length": 0,
91
+ "model_type": "videollama3_vision_encoder",
92
+ "no_repeat_ngram_size": 0,
93
+ "num_attention_heads": 16,
94
+ "num_beam_groups": 1,
95
+ "num_beams": 1,
96
+ "num_channels": 3,
97
+ "num_hidden_layers": 27,
98
+ "num_return_sequences": 1,
99
+ "output_attentions": false,
100
+ "output_hidden_states": false,
101
+ "output_scores": false,
102
+ "pad_token_id": null,
103
+ "patch_size": 14,
104
+ "prefix": null,
105
+ "problem_type": null,
106
+ "pruned_heads": {},
107
+ "remove_invalid_values": false,
108
+ "repetition_penalty": 1.0,
109
+ "return_dict": true,
110
+ "return_dict_in_generate": false,
111
+ "sep_token_id": null,
112
+ "suppress_tokens": null,
113
+ "task_specific_params": null,
114
+ "temperature": 1.0,
115
+ "tf_legacy_loss": false,
116
+ "tie_encoder_decoder": false,
117
+ "tie_word_embeddings": true,
118
+ "tokenizer_class": null,
119
+ "top_k": 50,
120
+ "top_p": 1.0,
121
+ "torch_dtype": null,
122
+ "torchscript": false,
123
+ "typical_p": 1.0,
124
+ "use_bfloat16": false
125
+ },
126
+ "vision_encoder_lr": null,
127
+ "vocab_size": 152064,
128
+ "include_visual_probes": true,
129
+ "interaction_module": "cross_attention",
130
+ "interaction_module_layers": null,
131
+ "num_visual_probes": 16
132
+ }
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/cross_view_queries.sh ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ # nohup bash -c 'while true; do nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv > nvidia_log.txt; sleep 1; done' &
3
+
4
+ # Launch settings: $1 = number of nodes (default 1), $2 = processes per node (default 8)
5
+ ARG_WORLD_SIZE=${1:-1}
6
+ ARG_NPROC_PER_NODE=${2:-8}
7
+ ARG_MASTER_ADDR="127.0.0.1"
8
+ ARG_MASTER_PORT=16668
9
+ ARG_RANK=0
10
+
11
+ # Fall back to the ARG_* defaults above when the launch variables are unset or empty
12
+ if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
13
+ WORLD_SIZE=$ARG_WORLD_SIZE
14
+ NPROC_PER_NODE=$ARG_NPROC_PER_NODE
15
+ fi
16
+ if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
17
+ MASTER_ADDR=$ARG_MASTER_ADDR
18
+ MASTER_PORT=$ARG_MASTER_PORT
19
+ RANK=$ARG_RANK
20
+ fi
21
+
22
+ echo "WORLD_SIZE: $WORLD_SIZE"
23
+ echo "NPROC_PER_NODE: $NPROC_PER_NODE"
24
+
25
+ # Training Arguments
26
+ GLOBAL_BATCH_SIZE=128 # 128
27
+ LOCAL_BATCH_SIZE=8
28
+ GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
29
+ echo $GRADIENT_ACCUMULATION_STEPS
30
+
31
+ NUM_DATA_WORKERS=8
32
+ NUM_TRAIN_EPOCHS=3
33
+
34
+ # Log Arguments
35
+ export WANDB_PROJECT=egoexo
36
+ REPORT_TO=wandb
37
+ DATA_DIR=/work/dreilly1/EgoExo4D/keystep_segments/
38
+ OUTP_DIR=work_dirs/egoexo
39
+
40
+ RUN_NAME=videollama3_qwen2.5_7b_EgoExo4D-S4-Captioned-EGOonly_train-projector-CA-Queries-LLM_LoRAr16_16global_NODIST
41
+
42
+ # TRAINING_JSON="training_jsons/egoexo4d-allactions-exoviews.json"
43
+ # TRAINING_JSON="training_jsons/egoexo4d-allactions-egoview.json"
44
+
45
+ # TRAINING_JSON="training_jsons/egoexo_vllama3-S4_caption-inst/egoexo4d-vllama3-S4cap-exoviews.json"
46
+ TRAINING_JSON="training_jsons/egoexo_vllama3-S4_caption-inst/egoexo4d-vllama3-S4cap-egoview.json"
47
+
48
+ LORA_TRAINING=True
49
+
50
+ # > Cross view options
51
+ NUM_LATENT_QUERIES=16
52
+
53
+ if [[ $TRAINING_JSON == *"exoviews"* ]]; then
54
+ MAX_FRAMES=180
55
+ # DIST_DIR="/work/dreilly1/VideoLLaMA3/saved_dist_tokens/16global-queries_ego-trained_visquery-to-LLM/" # set to "none" to disable
56
+ DIST_DIR="none"
57
+ PASS_VIS=True
58
+ PASS_QUERY=True
59
+ elif [[ $TRAINING_JSON == *"egoview"* ]]; then
60
+ MAX_FRAMES=40
61
+ DIST_DIR="none"
62
+ PASS_VIS=False
63
+ PASS_QUERY=True
64
+ fi
65
+
66
+ PASS_VIS=True # ! unconditionally overrides the PASS_VIS value chosen in the if/elif above (False for egoview)
67
+
68
+ TESTING=0
69
+ if [ $TESTING -eq 1 ]; then
70
+ NUM_DATA_WORKERS=0
71
+ REPORT_TO=none
72
+ RUN_NAME=TESTING
73
+ fi
74
+
75
+ mkdir -p "${OUTP_DIR}/${RUN_NAME}/"
76
+ cp "$0" "${OUTP_DIR}/${RUN_NAME}/"
77
+
78
+ torchrun --nnodes $WORLD_SIZE \
79
+ --nproc_per_node $NPROC_PER_NODE \
80
+ --master_addr=$MASTER_ADDR \
81
+ --master_port=$MASTER_PORT \
82
+ --node_rank $RANK \
83
+ videollama3/train_crossview_queries.py \
84
+ --distillation_dir $DIST_DIR \
85
+ --lora_enable $LORA_TRAINING \
86
+ --num_train_epochs $NUM_TRAIN_EPOCHS \
87
+ --deepspeed scripts/zero1.json \
88
+ --model_type videollama3_qwen2_crossview_queries \
89
+ --model_path /work/dreilly1/videollama3-image_7b_local \
90
+ --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \
91
+ --mm_projector_type mlp2x_gelu \
92
+ --data_path $TRAINING_JSON \
93
+ --data_folder ${DATA_DIR} \
94
+ --image_merge_size 2 \
95
+ --video_merge_size 2 \
96
+ --fps 1 \
97
+ --max_frames $MAX_FRAMES \
98
+ --model_max_length 16384 \
99
+ --mm_max_length 10240 \
100
+ --bf16 True \
101
+ --tf32 True \
102
+ --fp16 False \
103
+ --output_dir ${OUTP_DIR}/${RUN_NAME} \
104
+ --per_device_train_batch_size $LOCAL_BATCH_SIZE \
105
+ --per_device_eval_batch_size 2 \
106
+ --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
107
+ --evaluation_strategy "no" \
108
+ --save_strategy "no" \
109
+ --save_steps 5000 \
110
+ --save_total_limit 1 \
111
+ --mm_projector_lr 1e-5 \
112
+ --llm_lr 1e-5 \
113
+ --weight_decay 0. \
114
+ --warmup_ratio 0.03 \
115
+ --lr_scheduler_type "cosine" \
116
+ --logging_steps 1 \
117
+ --gradient_checkpointing True \
118
+ --dataloader_num_workers $NUM_DATA_WORKERS \
119
+ --report_to $REPORT_TO \
120
+ --run_name $RUN_NAME \
121
+ --dataset_cache_dir /work/dreilly1/.cache/vllama3_datasetcache \
122
+ --include_visual_tokens $PASS_VIS \
123
+ --include_query_tokens $PASS_QUERY \
124
+ --num_latent_queries $NUM_LATENT_QUERIES
125
+ # --use_batch_flattening
126
+
127
+ # --llm_lr 1e-5 \
128
+ # --vision_encoder_lr 2e-6 \
129
+ # --mm_projector_lr 1e-5 \
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/non_lora_trainables.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b695fe812e07081c5af2397195980a2c50dd394a2cb705ec3beb768e8e359d4
3
+ size 354951121
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff