| dataset: |
| align_stage_components: |
| - download/videollava/valley_llavaimage.json |
| - download/videollava |
| dataset_id: videollava |
| dataset_root_dir: data |
| finetune_stage_components: |
| - download/videollava/videochatgpt_llavaimage_tune.json |
| - download/videollava |
| type: videollava |
| dataset_class: FinetuneVideoDataset |
| hf_token: .hf_token |
| model: |
| align_epochs: 1 |
| align_global_batch_size: 256 |
| align_learning_rate: 0.001 |
| align_lr_scheduler_type: linear-warmup+cosine-decay |
| align_max_grad_norm: 1.0 |
| align_max_steps: null |
| align_per_device_batch_size: 16 |
| align_train_strategy: fsdp-shard-grad-op |
| align_warmup_ratio: 0.03 |
| align_weight_decay: 0.0 |
| arch_specifier: no-align+linear |
| enable_gradient_checkpointing: true |
| enable_mixed_precision_training: true |
| feature_fusion: first |
| finetune_epochs: 1 |
| finetune_global_batch_size: 128 |
| finetune_learning_rate: 2.0e-05 |
| finetune_lr_scheduler_type: linear-warmup+cosine-decay |
| finetune_max_grad_norm: 1.0 |
| finetune_max_steps: null |
| finetune_per_device_batch_size: 4 |
| finetune_train_strategy: fsdp-full-shard |
| finetune_warmup_ratio: 0.03 |
| finetune_weight_decay: 0.1 |
| image_resize_strategy: resize-naive |
| llm_backbone_id: llama2-7b-pure |
| llm_max_length: 2048 |
| model_id: languagebind-single |
| num_frames: |
| - 16 |
| projector_token_length: 64 |
| reduce_in_full_precision: false |
| type: languagebind-single |
| video_backbone_ids: |
| - languagebind-video |
| visual_feature_length: 4112 |
| pretrained_checkpoint: null |
| run_id: languagebind-single |
| run_root_dir: runs |
| seed: 7 |
| slurm_id: '441692' |
| stage: finetune |
| trackers: |
| - jsonl |
| - wandb |
|
|