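# Supervised fine-tuning (SFT) config for Qwen/Qwen2.5-VL-7B-Instruct on the
# yosubshin/WaltonMultimodalColdStart-random-5000-1 dataset.
# The field layout matches the Oumi framework's TrainingConfig schema (an
# inference from keys like trainer_type: TRL_SFT and
# collator_name: vision_language_sft; the file itself does not say so).
# A minimal launch sketch under that assumption, with a hypothetical filename:
#   oumi train -c qwen2_5_vl_7b_walton_random_5000_1.yaml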
data:
  train:
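    # One training dataset: the generic hf_vision loader, mapped onto this
    # dataset's image/problem/solution columns via dataset_kwargs below.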
    datasets:
      - dataset_name: hf_vision
        dataset_path: null
        subset: null
        split: train
        dataset_kwargs:
          hf_dataset_path: yosubshin/WaltonMultimodalColdStart-random-5000-1
          image_column: image
          question_column: problem
          answer_column: solution
          return_tensors: true
          processor_name: Qwen/Qwen2.5-VL-7B-Instruct
          return_conversations: true
        sample_count: null
        mixture_proportion: null
        shuffle: true
        seed: 42
        shuffle_buffer_size: 1000
        trust_remote_code: true
        transform_num_workers: auto
    collator_name: vision_language_sft
    collator_kwargs:
      process_individually: true
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: true
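  # Test and validation splits are left empty; evaluation is also switched
  # off below (eval_strategy: 'no').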
  test:
    datasets: []
    collator_name: null
    collator_kwargs: {}
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: null
  validation:
    datasets: []
    collator_name: null
    collator_kwargs: {}
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: null
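# Base model: loaded in bfloat16 with FlashAttention-2 and the
# qwen2-vl-instruct chat template; sequences capped at 10000 tokens.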
model:
  model_name: Qwen/Qwen2.5-VL-7B-Instruct
  adapter_model: null
  tokenizer_name: null
  tokenizer_pad_token: null
  tokenizer_kwargs: {}
  processor_kwargs: {}
  model_max_length: 10000
  load_pretrained_weights: true
  trust_remote_code: true
  torch_dtype_str: bfloat16
  compile: false
  chat_template: qwen2-vl-instruct
  chat_template_kwargs: null
  attn_implementation: flash_attention_2
  device_map: auto
  model_kwargs: {}
  enable_liger_kernel: false
  shard_for_eval: false
  freeze_layers: []
  model_revision: null
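# Full-parameter SFT via the TRL SFT trainer: one epoch, per-device batch
# size 1 with no gradient accumulation, and non-reentrant gradient
# checkpointing to trade compute for memory.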
training:
  use_peft: false
  trainer_type: TRL_SFT
  enable_gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  output_dir: /content/qwen2_5_vl_7b_walton_random_5000_1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 1
  max_steps: -1
  num_train_epochs: 1
  save_epoch: false
  save_steps: 0
  save_final_model: true
  seed: 42
  data_seed: 42
  use_deterministic: false
  full_determinism: false
  run_name: null
  metrics_function: null
  reward_functions: null
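  # The grpo and gkd blocks below appear to be dumped defaults; with
  # trainer_type: TRL_SFT they should have no effect on this run.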
  grpo:
    model_init_kwargs: {}
    max_prompt_length: null
    max_completion_length: null
    num_generations: null
    temperature: 0.9
    remove_unused_columns: false
    repetition_penalty: 1.0
    use_vllm: false
    vllm_mode: null
    vllm_gpu_memory_utilization: 0.9
    epsilon: 0.2
    log_completions: false
    rollout_function: null
  gkd:
    teacher_model_name_or_path: null
    teacher_model_init_kwargs:
      dtype: auto
    temperature: 0.9
    lmbda: 0.5
    beta: 0.5
    max_new_tokens: 128
    disable_dropout: true
    seq_kd: false
  log_level: info
  dep_log_level: warning
  log_examples: false
  enable_wandb: true
  enable_mlflow: false
  enable_tensorboard: true
  logging_strategy: steps
  logging_dir: null
  logging_steps: 5
  logging_first_step: false
  eval_strategy: 'no'
  eval_steps: 500
  learning_rate: 2.0e-05
  lr_scheduler_type: cosine
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.03
  warmup_steps: null
  optimizer: adamw_torch_fused
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  sgd_momentum: 0.0
  mixed_precision_dtype: NONE
  compile: false
  include_performance_metrics: true
  include_alternative_mfu_metrics: false
  log_model_summary: false
  resume_from_checkpoint: null
  try_resume_from_last_checkpoint: false
  dataloader_num_workers: 2
  dataloader_persistent_workers: false
  dataloader_prefetch_factor: 8
  dataloader_main_process_only: false
  ddp_find_unused_parameters: false
  max_grad_norm: 1.0
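  # Extra kwargs passed through to the underlying trainer (presumably TRL's
  # SFTTrainer/SFTConfig); skip_prepare_dataset defers preprocessing to the
  # vision_language_sft collator above.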
  trainer_kwargs:
    max_length: 10000
    remove_unused_columns: false
    dataset_kwargs:
      skip_prepare_dataset: true
  verl_config_overrides: {}
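  # Profiling and telemetry are left disabled.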
  profiler:
    save_dir: null
    enable_cpu_profiling: false
    enable_cuda_profiling: false
    record_shapes: false
    profile_memory: false
    with_stack: false
    with_flops: false
    with_modules: false
    row_limit: 50
    schedule:
      enable_schedule: false
      wait: 0
      warmup: 1
      active: 3
      repeat: 1
      skip_first: 1
  telemetry:
    telemetry_dir: telemetry
    collect_telemetry_for_all_ranks: false
    track_gpu_temperature: false
  empty_device_cache_steps: 1
  nccl_default_timeout_minutes: null
  label_ignore_index: null
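# LoRA/QLoRA settings below are inert for this run, since use_peft is false.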
peft:
  lora_r: 8
  lora_alpha: 8
  lora_dropout: 0.0
  lora_target_modules: null
  lora_target_parameters: null
  lora_modules_to_save: null
  lora_bias: none
  lora_init_weights: DEFAULT
  lora_task_type: CAUSAL_LM
  q_lora: false
  q_lora_bits: 4
  bnb_4bit_quant_type: fp4
  llm_int8_skip_modules: null
  use_bnb_nested_quant: false
  bnb_4bit_quant_storage: uint8
  bnb_4bit_compute_dtype: float32
  peft_save_mode: ADAPTER_ONLY
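# FSDP is the active distributed strategy: hybrid sharding, bf16 mixed
# precision, and size-based auto-wrapping of modules above 100k parameters.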
fsdp:
  enable_fsdp: true
  sharding_strategy: HYBRID_SHARD
  cpu_offload: false
  mixed_precision: bf16
  backward_prefetch: BACKWARD_PRE
  forward_prefetch: true
  use_orig_params: null
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: SIZE_BASED_WRAP
  min_num_params: 100000
  transformer_layer_cls: null
  sync_module_states: true
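# DeepSpeed is disabled (enable_deepspeed: false); the values below look like
# untouched defaults.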
deepspeed:
  enable_deepspeed: false
  deepspeed_config_path: null
  zero_stage: ZERO_0
  offload_optimizer: null
  offload_param: null
  precision: null
  overlap_comm: false
  contiguous_gradients: true
  reduce_bucket_size: 500000000
  allgather_bucket_size: 500000000
  allgather_partitions: true
  reduce_scatter: true
  round_robin_gradients: false
  stage3_prefetch_bucket_size: 50000000
  stage3_param_persistence_threshold: 100000
  stage3_max_live_parameters: 1000000000
  stage3_max_reuse_distance: 1000000000
  stage3_gather_16bit_weights_on_model_save: false
  sub_group_size: 1000000000
  train_batch_size: auto
  train_micro_batch_size_per_gpu: auto
  gradient_accumulation_steps: auto
  gradient_clipping: auto
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: true
  activation_checkpointing: {}
  memory_efficient_linear: false
  steps_per_print: 10
  wall_clock_breakdown: false