Upload folder using huggingface_hub

Browse files

Files changed (14) hide show

viscop_qwen2.5_7b_viscop-lora_depth-expert/README.md +11 -0
viscop_qwen2.5_7b_viscop-lora_depth-expert/adapter_config.json +18 -0
viscop_qwen2.5_7b_viscop-lora_depth-expert/adapter_model.bin +3 -0
viscop_qwen2.5_7b_viscop-lora_depth-expert/config.json +134 -0
viscop_qwen2.5_7b_viscop-lora_depth-expert/cross_view_queries.sh +143 -0
viscop_qwen2.5_7b_viscop-lora_depth-expert/non_lora_trainables.bin +3 -0
viscop_qwen2.5_7b_viscop-lora_depth-expert/trainer_state.json +0 -0
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/README.md +9 -0
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/adapter_config.json +26 -0
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/adapter_model.bin +3 -0
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/config.json +132 -0
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/cross_view_queries.sh +129 -0
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/non_lora_trainables.bin +3 -0
viscop_qwen2.5_7b_viscop-lora_egocentric-expert/trainer_state.json +0 -0

viscop_qwen2.5_7b_viscop-lora_depth-expert/README.md ADDED Viewed

	@@ -0,0 +1,11 @@

+---
+library_name: peft
+---
+## Training procedure
+### Framework versions
+- PEFT 0.4.0

viscop_qwen2.5_7b_viscop-lora_depth-expert/adapter_config.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "/scratch/10580/dreilly1/videollama3-image_7b_local",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "revision": null,
+  "target_modules": ".*model\\.layers\\..*\\.(q_proj|gate_proj|v_proj|o_proj|up_proj|k_proj|down_proj)$",
+  "task_type": "CAUSAL_LM"
+}

viscop_qwen2.5_7b_viscop-lora_depth-expert/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:475e86a97f1411f5d610d6a8121ab967176c306a32fcad23f5e523bdab331211
+size 323097987

viscop_qwen2.5_7b_viscop-lora_depth-expert/config.json ADDED Viewed

	@@ -0,0 +1,134 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "./base_vlm/videollama3-image_7b_local/",
+  "architectures": [
+    "Videollama3Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config",
+    "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM"
+  },
+  "bos_token_id": 151643,
+  "ca_module_version": "v1",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "image_aspect_ratio": "square",
+  "image_size": -1,
+  "image_token_index": 151665,
+  "image_token_length": 1,
+  "include_visual_tokens": true,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "is_alignment": false,
+  "llm_lr": 1e-05,
+  "max_frames": 180,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "mm_hidden_size": 1152,
+  "mm_projector_lr": 1e-05,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -1,
+  "model_type": "viscop_qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "query_ca_layers": null,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 16384,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "use_learnable_queries": true,
+  "use_mm_proj": true,
+  "use_sliding_window": false,
+  "use_token_compression": false,
+  "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
+  "vision_encoder_config": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "intermediate_size": 4304,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-06,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "videollama3_vision_encoder",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "vision_encoder_lr": null,
+  "vocab_size": 152064,
+  "include_visual_probes": true,
+  "interaction_module": "cross_attention",
+  "interaction_module_layers": null,
+  "num_visual_probes": 16
+}

viscop_qwen2.5_7b_viscop-lora_depth-expert/cross_view_queries.sh ADDED Viewed

	@@ -0,0 +1,143 @@

+#!/bin/bash
+# nohup bash -c 'while true; do nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv > nvidia_log.txt; sleep 1; done' &
+export PATH=/scratch/10580/dreilly1/ffmpeg_link/:$PATH
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+if [[ -v MASTER_ADDR_PASSED ]]; then
+    ARG_MASTER_ADDR=$MASTER_ADDR_PASSED # passed via slurm submission script
+else
+    ARG_MASTER_ADDR=127.0.0.1 # for dev environments
+fi
+ARG_MASTER_PORT=12355
+ARG_RANK=$SLURM_NODEID
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+    WORLD_SIZE=$ARG_WORLD_SIZE
+    NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+    MASTER_ADDR=$ARG_MASTER_ADDR
+    MASTER_PORT=$ARG_MASTER_PORT
+    RANK=$ARG_RANK
+fi
+echo "MASTER_ADDR: $MASTER_ADDR. MASTER_PORT: $MASTER_PORT. RANK: $RANK"
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+# Training Arguments
+GLOBAL_BATCH_SIZE=128 # 128
+LOCAL_BATCH_SIZE=4
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+NUM_DATA_WORKERS=8
+NUM_TRAIN_EPOCHS=3
+# Log Arguments
+export WANDB_PROJECT=egoexo
+REPORT_TO=none
+DATA_DIR=/scratch/10580/dreilly1/EgoExo4D/keystep_segments_depth
+OUTP_DIR=work_dirs/egoexo
+CA_LAYER_VERSION=v1
+RUN_NAME=videollama3_qwen2.5_7b_DEPTHEgoExo4D-S4-Captioned-EXOonly_train-projector-CA$CA_LAYER_VERSION-Queries-LLM_LoRAr16_16global_NODIST
+# TRAINING_JSON="training_jsons/egoexo4d-allactions-exoviews.json"
+# TRAINING_JSON="training_jsons/egoexo4d-allactions-egoview.json"
+TRAINING_JSON="training_jsons/egoexo_vllama3-S4_caption-inst/egoexo4d-vllama3-S4cap-exoviews.json"
+# TRAINING_JSON="training_jsons/egoexo_vllama3-S4_caption-inst/egoexo4d-vllama3-S4cap-egoview.json"
+LORA_TRAINING=True # > Must pass llm_lr if doing LoRA training
+# > Cross view options
+NUM_LATENT_QUERIES=16
+if [[ $TRAINING_JSON == *"exoviews"* ]]; then
+    MAX_FRAMES=180
+    # DIST_DIR="/work/dreilly1/VideoLLaMA3/saved_dist_tokens/16global-EgoExo4D-S4-Captioned_ego-trained_visquery-to-LLM" # set to "none" to disable
+    DIST_DIR="none"
+    PASS_VIS=True
+    PASS_QUERY=True
+elif [[ $TRAINING_JSON == *"egoview"* ]]; then
+    MAX_FRAMES=40
+    DIST_DIR="none"
+    PASS_VIS=False
+    PASS_QUERY=True
+fi
+PASS_VIS=True # !
+TESTING=0
+if [ $TESTING -eq 1 ]; then
+    NUM_DATA_WORKERS=0
+    REPORT_TO=none
+    RUN_NAME=TESTING
+fi
+mkdir -p "${OUTP_DIR}/${RUN_NAME}/"
+cp "$0" "${OUTP_DIR}/${RUN_NAME}/"
+INIT_MODEL=/scratch/10580/dreilly1/videollama3-image_7b_local # ! videollama3-image_7b_local
+# Layers: 3,8,17,26
+# Layers: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
+torchrun --nnodes $WORLD_SIZE \
+    --nproc_per_node $NPROC_PER_NODE \
+    --master_addr=$MASTER_ADDR \
+    --master_port=$MASTER_PORT \
+    --node_rank $RANK \
+    videollama3/train_crossview_queries.py \
+    --query_ca_layers None \
+    --ca_module_version $CA_LAYER_VERSION \
+    --distillation_dir $DIST_DIR \
+    --lora_enable $LORA_TRAINING \
+    --num_train_epochs $NUM_TRAIN_EPOCHS \
+    --deepspeed scripts/zero1.json \
+    --model_type videollama3_qwen2_crossview_queries \
+    --model_path $INIT_MODEL \
+    --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \
+    --mm_projector_type mlp2x_gelu \
+    --data_path $TRAINING_JSON \
+    --data_folder ${DATA_DIR} \
+    --image_merge_size 2 \
+    --video_merge_size 2 \
+    --fps 1 \
+    --max_frames $MAX_FRAMES \
+    --model_max_length 16384 \
+    --mm_max_length 10240 \
+    --bf16 True \
+    --tf32 True \
+    --fp16 False \
+    --output_dir ${OUTP_DIR}/${RUN_NAME} \
+    --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+    --per_device_eval_batch_size 2 \
+    --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+    --evaluation_strategy "no" \
+    --save_strategy "no" \
+    --save_steps 5000 \
+    --save_total_limit 1 \
+    --mm_projector_lr 1e-5 \
+    --llm_lr 1e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers $NUM_DATA_WORKERS \
+    --report_to $REPORT_TO \
+    --run_name $RUN_NAME \
+    --dataset_cache_dir /scratch/10580/dreilly1/.cache/vllama3_datasetcache \
+    --include_visual_tokens $PASS_VIS \
+    --include_query_tokens $PASS_QUERY \
+    --num_latent_queries $NUM_LATENT_QUERIES
+    # --use_batch_flattening
+    # --llm_lr 1e-5 \
+    # --vision_encoder_lr 2e-6 \
+    # --mm_projector_lr 1e-5 \

viscop_qwen2.5_7b_viscop-lora_depth-expert/non_lora_trainables.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:63b928f9670dbcffb1d9de5f63893d728f04ad6c005e67c38482b9ceee51f453
+size 354951121

viscop_qwen2.5_7b_viscop-lora_depth-expert/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

viscop_qwen2.5_7b_viscop-lora_egocentric-expert/README.md ADDED Viewed

	@@ -0,0 +1,9 @@

+---
+library_name: peft
+---
+## Training procedure
+### Framework versions
+- PEFT 0.4.0

viscop_qwen2.5_7b_viscop-lora_egocentric-expert/adapter_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "/work/dreilly1/videollama3-image_7b_local",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 16,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 64,
+  "revision": null,
+  "target_modules": [
+    "k_proj",
+    "o_proj",
+    "gate_proj",
+    "up_proj",
+    "q_proj",
+    "v_proj",
+    "down_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}

viscop_qwen2.5_7b_viscop-lora_egocentric-expert/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:344525498c4532729f1abef575830691bde3db0846f3b0c1fdfa5bee74986da6
+size 323097987

viscop_qwen2.5_7b_viscop-lora_egocentric-expert/config.json ADDED Viewed

	@@ -0,0 +1,132 @@

+{
+  "_attn_implementation_autoset": true,
+  "_name_or_path": "./base_vlm/videollama3-image_7b_local/",
+  "architectures": [
+    "Videollama3Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoConfig": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--configuration_videollama3.Videollama3Qwen2Config",
+    "AutoModelForCausalLM": "DAMO-NLP-SG/VideoLLaMA3-7B-Image--modeling_videollama3.Videollama3Qwen2ForCausalLM"
+  },
+  "bos_token_id": 151643,
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 3584,
+  "image_aspect_ratio": "square",
+  "image_size": -1,
+  "image_token_index": 151665,
+  "image_token_length": 1,
+  "include_visual_tokens": true,
+  "initializer_range": 0.02,
+  "intermediate_size": 18944,
+  "is_alignment": false,
+  "llm_lr": 1e-05,
+  "max_frames": 40,
+  "max_position_embeddings": 32768,
+  "max_window_layers": 28,
+  "mm_hidden_size": 1152,
+  "mm_projector_lr": 1e-05,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -1,
+  "model_type": "viscop_qwen2",
+  "num_attention_heads": 28,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 4,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "tokenizer_model_max_length": 16384,
+  "tokenizer_padding_side": "right",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.46.3",
+  "use_cache": true,
+  "use_learnable_queries": true,
+  "use_mm_proj": true,
+  "use_sliding_window": false,
+  "use_token_compression": false,
+  "vision_encoder": "DAMO-NLP-SG/SigLIP-NaViT",
+  "vision_encoder_config": {
+    "_attn_implementation_autoset": false,
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu_pytorch_tanh",
+    "hidden_size": 1152,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "intermediate_size": 4304,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-06,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "videollama3_vision_encoder",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 27,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "vision_encoder_lr": null,
+  "vocab_size": 152064,
+  "include_visual_probes": true,
+  "interaction_module": "cross_attention",
+  "interaction_module_layers": null,
+  "num_visual_probes": 16
+}

viscop_qwen2.5_7b_viscop-lora_egocentric-expert/cross_view_queries.sh ADDED Viewed

	@@ -0,0 +1,129 @@

+#!/bin/bash
+# nohup bash -c 'while true; do nvidia-smi --query-gpu=index,memory.used,memory.total --format=csv > nvidia_log.txt; sleep 1; done' &
+# Environment Variables
+ARG_WORLD_SIZE=${1:-1}
+ARG_NPROC_PER_NODE=${2:-8}
+ARG_MASTER_ADDR="127.0.0.1"
+ARG_MASTER_PORT=16668
+ARG_RANK=0
+# Multiple conditions
+if [ ! -n "$WORLD_SIZE" ] || [ ! -n "$NPROC_PER_NODE" ]; then
+    WORLD_SIZE=$ARG_WORLD_SIZE
+    NPROC_PER_NODE=$ARG_NPROC_PER_NODE
+fi
+if [ ! -n "$MASTER_ADDR" ] || [ ! -n "$MASTER_PORT" ] || [ ! -n "$RANK" ]; then
+    MASTER_ADDR=$ARG_MASTER_ADDR
+    MASTER_PORT=$ARG_MASTER_PORT
+    RANK=$ARG_RANK
+fi
+echo "WORLD_SIZE: $WORLD_SIZE"
+echo "NPROC_PER_NODE: $NPROC_PER_NODE"
+# Training Arguments
+GLOBAL_BATCH_SIZE=128 # 128
+LOCAL_BATCH_SIZE=8
+GRADIENT_ACCUMULATION_STEPS=$[$GLOBAL_BATCH_SIZE/($WORLD_SIZE*$NPROC_PER_NODE*$LOCAL_BATCH_SIZE)]
+echo $GRADIENT_ACCUMULATION_STEPS
+NUM_DATA_WORKERS=8
+NUM_TRAIN_EPOCHS=3
+# Log Arguments
+export WANDB_PROJECT=egoexo
+REPORT_TO=wandb
+DATA_DIR=/work/dreilly1/EgoExo4D/keystep_segments/
+OUTP_DIR=work_dirs/egoexo
+RUN_NAME=videollama3_qwen2.5_7b_EgoExo4D-S4-Captioned-EGOonly_train-projector-CA-Queries-LLM_LoRAr16_16global_NODIST
+# TRAINING_JSON="training_jsons/egoexo4d-allactions-exoviews.json"
+# TRAINING_JSON="training_jsons/egoexo4d-allactions-egoview.json"
+# TRAINING_JSON="training_jsons/egoexo_vllama3-S4_caption-inst/egoexo4d-vllama3-S4cap-exoviews.json"
+TRAINING_JSON="training_jsons/egoexo_vllama3-S4_caption-inst/egoexo4d-vllama3-S4cap-egoview.json"
+LORA_TRAINING=True
+# > Cross view options
+NUM_LATENT_QUERIES=16
+if [[ $TRAINING_JSON == *"exoviews"* ]]; then
+    MAX_FRAMES=180
+    # DIST_DIR="/work/dreilly1/VideoLLaMA3/saved_dist_tokens/16global-queries_ego-trained_visquery-to-LLM/" # set to "none" to disable
+    DIST_DIR="none"
+    PASS_VIS=True
+    PASS_QUERY=True
+elif [[ $TRAINING_JSON == *"egoview"* ]]; then
+    MAX_FRAMES=40
+    DIST_DIR="none"
+    PASS_VIS=False
+    PASS_QUERY=True
+fi
+PASS_VIS=True # !
+TESTING=0
+if [ $TESTING -eq 1 ]; then
+    NUM_DATA_WORKERS=0
+    REPORT_TO=none
+    RUN_NAME=TESTING
+fi
+mkdir -p "${OUTP_DIR}/${RUN_NAME}/"
+cp "$0" "${OUTP_DIR}/${RUN_NAME}/"
+torchrun --nnodes $WORLD_SIZE \
+    --nproc_per_node $NPROC_PER_NODE \
+    --master_addr=$MASTER_ADDR \
+    --master_port=$MASTER_PORT \
+    --node_rank $RANK \
+    videollama3/train_crossview_queries.py \
+    --distillation_dir $DIST_DIR \
+    --lora_enable $LORA_TRAINING \
+    --num_train_epochs $NUM_TRAIN_EPOCHS \
+    --deepspeed scripts/zero1.json \
+    --model_type videollama3_qwen2_crossview_queries \
+    --model_path /work/dreilly1/videollama3-image_7b_local \
+    --vision_encoder DAMO-NLP-SG/SigLIP-NaViT \
+    --mm_projector_type mlp2x_gelu \
+    --data_path $TRAINING_JSON \
+    --data_folder ${DATA_DIR} \
+    --image_merge_size 2 \
+    --video_merge_size 2 \
+    --fps 1 \
+    --max_frames $MAX_FRAMES \
+    --model_max_length 16384 \
+    --mm_max_length 10240 \
+    --bf16 True \
+    --tf32 True \
+    --fp16 False \
+    --output_dir ${OUTP_DIR}/${RUN_NAME} \
+    --per_device_train_batch_size $LOCAL_BATCH_SIZE \
+    --per_device_eval_batch_size 2 \
+    --gradient_accumulation_steps $GRADIENT_ACCUMULATION_STEPS \
+    --evaluation_strategy "no" \
+    --save_strategy "no" \
+    --save_steps 5000 \
+    --save_total_limit 1 \
+    --mm_projector_lr 1e-5 \
+    --llm_lr 1e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type "cosine" \
+    --logging_steps 1 \
+    --gradient_checkpointing True \
+    --dataloader_num_workers $NUM_DATA_WORKERS \
+    --report_to $REPORT_TO \
+    --run_name $RUN_NAME \
+    --dataset_cache_dir /work/dreilly1/.cache/vllama3_datasetcache \
+    --include_visual_tokens $PASS_VIS \
+    --include_query_tokens $PASS_QUERY \
+    --num_latent_queries $NUM_LATENT_QUERIES
+    # --use_batch_flattening
+    # --llm_lr 1e-5 \
+    # --vision_encoder_lr 2e-6 \
+    # --mm_projector_lr 1e-5 \

viscop_qwen2.5_7b_viscop-lora_egocentric-expert/non_lora_trainables.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9b695fe812e07081c5af2397195980a2c50dd394a2cb705ec3beb768e8e359d4
+size 354951121

viscop_qwen2.5_7b_viscop-lora_egocentric-expert/trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff