# Training configuration (Oumi-style) for SFT of Qwen2.5-VL-7B-Instruct on the
# WaltonMultimodalColdStart-random-5000-1 vision-language dataset.
# NOTE(review): this file had been mangled (table-pipe wrapping, indentation
# stripped); the nesting below is reconstructed from the key groupings — confirm
# against the config schema before relying on section placement.
data:
  train:
    datasets:
      - dataset_name: hf_vision
        dataset_path: null
        subset: null
        split: train
        # Arguments forwarded to the HF vision dataset loader: which HF dataset
        # to pull and which columns hold the image / question / answer fields.
        dataset_kwargs:
          hf_dataset_path: yosubshin/WaltonMultimodalColdStart-random-5000-1
          image_column: image
          question_column: problem
          answer_column: solution
          return_tensors: true
          processor_name: Qwen/Qwen2.5-VL-7B-Instruct
          return_conversations: true
        sample_count: null
        mixture_proportion: null
        shuffle: true
        seed: 42
        shuffle_buffer_size: 1000
        trust_remote_code: true
        transform_num_workers: auto
    collator_name: vision_language_sft
    collator_kwargs:
      process_individually: true
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: true
  test:
    datasets: []
    collator_name: null
    collator_kwargs: {}
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: null
  validation:
    datasets: []
    collator_name: null
    collator_kwargs: {}
    pack: false
    stream: false
    target_col: null
    mixture_strategy: first_exhausted
    seed: null
    use_torchdata: null
model:
  model_name: Qwen/Qwen2.5-VL-7B-Instruct
  adapter_model: null
  tokenizer_name: null
  tokenizer_pad_token: null
  tokenizer_kwargs: {}
  processor_kwargs: {}
  model_max_length: 10000
  load_pretrained_weights: true
  trust_remote_code: true
  torch_dtype_str: bfloat16
  compile: false
  chat_template: qwen2-vl-instruct
  chat_template_kwargs: null
  attn_implementation: flash_attention_2
  device_map: auto
  model_kwargs: {}
  enable_liger_kernel: false
  shard_for_eval: false
  freeze_layers: []
  model_revision: null
training:
  use_peft: false
  trainer_type: TRL_SFT
  enable_gradient_checkpointing: true
  gradient_checkpointing_kwargs:
    use_reentrant: false
  output_dir: /content/qwen2_5_vl_7b_walton_random_5000_1
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 8
  gradient_accumulation_steps: 1
  max_steps: -1
  num_train_epochs: 1
  save_epoch: false
  save_steps: 0
  save_final_model: true
  seed: 42
  data_seed: 42
  use_deterministic: false
  full_determinism: false
  run_name: null
  metrics_function: null
  reward_functions: null
  # GRPO-specific settings (unused for TRL_SFT; kept for schema completeness).
  grpo:
    model_init_kwargs: {}
    max_prompt_length: null
    max_completion_length: null
    num_generations: null
    temperature: 0.9
    remove_unused_columns: false
    repetition_penalty: 1.0
    use_vllm: false
    vllm_mode: null
    vllm_gpu_memory_utilization: 0.9
    epsilon: 0.2
    log_completions: false
    rollout_function: null
  # GKD (generalized knowledge distillation) settings — inactive without a teacher.
  gkd:
    teacher_model_name_or_path: null
    teacher_model_init_kwargs:
      dtype: auto
    temperature: 0.9
    lmbda: 0.5
    beta: 0.5
    max_new_tokens: 128
    disable_dropout: true
    seq_kd: false
  log_level: info
  dep_log_level: warning
  log_examples: false
  enable_wandb: true
  enable_mlflow: false
  enable_tensorboard: true
  logging_strategy: steps
  logging_dir: null
  logging_steps: 5
  logging_first_step: false
  # Quoted deliberately: an unquoted `no` parses as boolean false in YAML 1.1.
  eval_strategy: 'no'
  eval_steps: 500
  learning_rate: 2.0e-05
  lr_scheduler_type: cosine
  lr_scheduler_kwargs: {}
  warmup_ratio: 0.03
  warmup_steps: null
  optimizer: adamw_torch_fused
  weight_decay: 0.01
  adam_beta1: 0.9
  adam_beta2: 0.999
  adam_epsilon: 1.0e-08
  sgd_momentum: 0.0
  mixed_precision_dtype: NONE
  compile: false
  include_performance_metrics: true
  include_alternative_mfu_metrics: false
  log_model_summary: false
  resume_from_checkpoint: null
  try_resume_from_last_checkpoint: false
  dataloader_num_workers: 2
  dataloader_persistent_workers: false
  dataloader_prefetch_factor: 8
  dataloader_main_process_only: false
  ddp_find_unused_parameters: false
  max_grad_norm: 1.0
  # Extra keyword arguments forwarded verbatim to the underlying TRL trainer.
  trainer_kwargs:
    max_length: 10000
    remove_unused_columns: false
    dataset_kwargs:
      skip_prepare_dataset: true
  verl_config_overrides: {}
  profiler:
    save_dir: null
    enable_cpu_profiling: false
    enable_cuda_profiling: false
    record_shapes: false
    profile_memory: false
    with_stack: false
    with_flops: false
    with_modules: false
    row_limit: 50
    schedule:
      enable_schedule: false
      wait: 0
      warmup: 1
      active: 3
      repeat: 1
      skip_first: 1
  telemetry:
    telemetry_dir: telemetry
    collect_telemetry_for_all_ranks: false
    track_gpu_temperature: false
  empty_device_cache_steps: 1
  nccl_default_timeout_minutes: null
  label_ignore_index: null
# LoRA/QLoRA settings (inactive because training.use_peft is false).
peft:
  lora_r: 8
  lora_alpha: 8
  lora_dropout: 0.0
  lora_target_modules: null
  lora_target_parameters: null
  lora_modules_to_save: null
  lora_bias: none
  lora_init_weights: DEFAULT
  lora_task_type: CAUSAL_LM
  q_lora: false
  q_lora_bits: 4
  bnb_4bit_quant_type: fp4
  llm_int8_skip_modules: null
  use_bnb_nested_quant: false
  bnb_4bit_quant_storage: uint8
  bnb_4bit_compute_dtype: float32
  peft_save_mode: ADAPTER_ONLY
fsdp:
  enable_fsdp: true
  sharding_strategy: HYBRID_SHARD
  cpu_offload: false
  mixed_precision: bf16
  backward_prefetch: BACKWARD_PRE
  forward_prefetch: true
  use_orig_params: null
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: SIZE_BASED_WRAP
  min_num_params: 100000
  transformer_layer_cls: null
  sync_module_states: true
# DeepSpeed settings (inactive: enable_deepspeed is false; FSDP is used instead).
deepspeed:
  enable_deepspeed: false
  deepspeed_config_path: null
  zero_stage: ZERO_0
  offload_optimizer: null
  offload_param: null
  precision: null
  overlap_comm: false
  contiguous_gradients: true
  reduce_bucket_size: 500000000
  allgather_bucket_size: 500000000
  allgather_partitions: true
  reduce_scatter: true
  round_robin_gradients: false
  stage3_prefetch_bucket_size: 50000000
  stage3_param_persistence_threshold: 100000
  stage3_max_live_parameters: 1000000000
  stage3_max_reuse_distance: 1000000000
  stage3_gather_16bit_weights_on_model_save: false
  sub_group_size: 1000000000
  train_batch_size: auto
  train_micro_batch_size_per_gpu: auto
  gradient_accumulation_steps: auto
  gradient_clipping: auto
  zero_allow_untested_optimizer: true
  zero_force_ds_cpu_optimizer: true
  activation_checkpointing: {}
  memory_efficient_linear: false
  steps_per_print: 10
  wall_clock_breakdown: false