# Training configuration (Oumi-style) for SFT of Qwen2.5-VL-7B-Instruct on the
# WaltonMultimodalColdStart-random-5000-1 vision-language dataset.
# NOTE(review): this file had been mangled (table-pipe wrapping, indentation
# stripped); the nesting below is reconstructed from the key groupings — confirm
# against the config schema before relying on section placement.
data:
  train:
    datasets:
      - dataset_name: hf_vision
        dataset_path: null
        subset: null
        split: train
        # Arguments forwarded to the HF vision dataset loader: which HF dataset
        # to pull and which columns hold the image / question / answer fields.
        dataset_kwargs:
          hf_dataset_path: yosubshin/WaltonMultimodalColdStart-random-5000-1
          image_column: image
          question_column: problem
          answer_column: solution
          return_tensors: true
          processor_name: Qwen/Qwen2.5-VL-7B-Instruct
          return_conversations: true
        sample_count: null
        mixture_proportion: null
        shuffle: true
        seed: 42
        shuffle_buffer_size: 1000
        trust_remote_code: true
        transform_num_workers: auto
    collator_name: vision_language_sft
    collator_kwargs:
      process_individually: true
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: true
  test:
    datasets: []
    collator_name: null
    collator_kwargs: {}
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: null
  validation:
    datasets: []
    collator_name: null
    collator_kwargs: {}
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: null
model:
  model_name: Qwen/Qwen2.5-VL-7B-Instruct
  adapter_model: null
  tokenizer_name: null
  tokenizer_pad_token: null
  tokenizer_kwargs: {}
  processor_kwargs: {}
  model_max_length: 10000
  load_pretrained_weights: true
  trust_remote_code: true
  torch_dtype_str: bfloat16
  compile: false
  chat_template: qwen2-vl-instruct
  chat_template_kwargs: null
  attn_implementation: flash_attention_2
  device_map: auto
  model_kwargs: {}
  enable_liger_kernel: false
  shard_for_eval: false
  freeze_layers: []
  model_revision: null
training:
  use_peft: false
  trainer_type: TRL_SFT
  enable_gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  output_dir: /content/qwen2_5_vl_7b_walton_random_5000_1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 1
  max_steps: -1
  num_train_epochs: 1
  save_epoch: false
  save_steps: 0
  save_final_model: true
  seed: 42
  data_seed: 42
  use_deterministic: false
  full_determinism: false
  run_name: null
  metrics_function: null
  reward_functions: null
  # GRPO-specific settings (unused for TRL_SFT; kept for schema completeness).
  grpo:
    model_init_kwargs: {}
    max_prompt_length: null
    max_completion_length: null
    num_generations: null
    temperature: 0.9
    remove_unused_columns: false
    repetition_penalty: 1.0
    use_vllm: false
    vllm_mode: null
    vllm_gpu_memory_utilization: 0.9
    epsilon: 0.2
    log_completions: false
    rollout_function: null
  # GKD (generalized knowledge distillation) settings — inactive without a teacher.
  gkd:
    teacher_model_name_or_path: null
    teacher_model_init_kwargs:
      dtype: auto
    temperature: 0.9
    lmbda: 0.5
    beta: 0.5
    max_new_tokens: 128
    disable_dropout: true
    seq_kd: false
  log_level: info
  dep_log_level: warning
  log_examples: false
  enable_wandb: true
  enable_mlflow: false
  enable_tensorboard: true
  logging_strategy: steps
  logging_dir: null
  logging_steps: 5
  logging_first_step: false
  # Quoted deliberately: an unquoted `no` parses as boolean false in YAML 1.1.
  eval_strategy: 'no'
  eval_steps: 500
  learning_rate: 2.0e-05
  lr_scheduler_type: cosine
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.03
  warmup_steps: null
  optimizer: adamw_torch_fused
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  sgd_momentum: 0.0
  mixed_precision_dtype: NONE
  compile: false
  include_performance_metrics: true
  include_alternative_mfu_metrics: false
  log_model_summary: false
  resume_from_checkpoint: null
  try_resume_from_last_checkpoint: false
  dataloader_num_workers: 2
  dataloader_persistent_workers: false
  dataloader_prefetch_factor: 8
  dataloader_main_process_only: false
  ddp_find_unused_parameters: false
  max_grad_norm: 1.0
  # Extra keyword arguments forwarded verbatim to the underlying TRL trainer.
  trainer_kwargs:
    max_length: 10000
    remove_unused_columns: false
    dataset_kwargs:
      skip_prepare_dataset: true
  verl_config_overrides: {}
  profiler:
    save_dir: null
    enable_cpu_profiling: false
    enable_cuda_profiling: false
    record_shapes: false
    profile_memory: false
    with_stack: false
    with_flops: false
    with_modules: false
    row_limit: 50
    schedule:
      enable_schedule: false
      wait: 0
      warmup: 1
      active: 3
      repeat: 1
      skip_first: 1
  telemetry:
    telemetry_dir: telemetry
    collect_telemetry_for_all_ranks: false
    track_gpu_temperature: false
  empty_device_cache_steps: 1
  nccl_default_timeout_minutes: null
  label_ignore_index: null
# LoRA/QLoRA settings (inactive because training.use_peft is false).
peft:
  lora_r: 8
  lora_alpha: 8
  lora_dropout: 0.0
  lora_target_modules: null
  lora_target_parameters: null
  lora_modules_to_save: null
  lora_bias: none
  lora_init_weights: DEFAULT
  lora_task_type: CAUSAL_LM
  q_lora: false
  q_lora_bits: 4
  bnb_4bit_quant_type: fp4
  llm_int8_skip_modules: null
  use_bnb_nested_quant: false
  bnb_4bit_quant_storage: uint8
  bnb_4bit_compute_dtype: float32
  peft_save_mode: ADAPTER_ONLY
fsdp:
  enable_fsdp: true
  sharding_strategy: HYBRID_SHARD
  cpu_offload: false
  mixed_precision: bf16
  backward_prefetch: BACKWARD_PRE
  forward_prefetch: true
  use_orig_params: null
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: SIZE_BASED_WRAP
  min_num_params: 100000
  transformer_layer_cls: null
  sync_module_states: true
# DeepSpeed settings (inactive: enable_deepspeed is false; FSDP is used instead).
deepspeed:
  enable_deepspeed: false
  deepspeed_config_path: null
  zero_stage: ZERO_0
  offload_optimizer: null
  offload_param: null
  precision: null
  overlap_comm: false
  contiguous_gradients: true
  reduce_bucket_size: 500000000
  allgather_bucket_size: 500000000
  allgather_partitions: true
  reduce_scatter: true
  round_robin_gradients: false
  stage3_prefetch_bucket_size: 50000000
  stage3_param_persistence_threshold: 100000
  stage3_max_live_parameters: 1000000000
  stage3_max_reuse_distance: 1000000000
  stage3_gather_16bit_weights_on_model_save: false
  sub_group_size: 1000000000
  train_batch_size: auto
  train_micro_batch_size_per_gpu: auto
  gradient_accumulation_steps: auto
  gradient_clipping: auto
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: true
  activation_checkpointing: {}
  memory_efficient_linear: false
  steps_per_print: 10
  wall_clock_breakdown: false