run:
  run_dir: ./runs/cpt_run_v1
  seed: 42
model:
  repo_id: /workspace/Models/Devstral-Small-2-24B-Instruct-2512
  revision: null
  base_local_dir: base_model
  trust_remote_code: true
  tokenizer_use_fast: true
  device_map: auto
  torch_dtype: bfloat16
  use_4bit: false
  bnb_4bit_quant_type: nf4
  bnb_4bit_use_double_quant: false
  bnb_4bit_compute_dtype: bfloat16
  attn_implementation: null
data:
  train_jsonl: /workspace/all_data_with_descriptions.jsonl
  eval_jsonl: null
  eval_split_ratio: 0.1
  text_field: text
  block_size: 4096
  shuffle: true
  num_proc: 4
  pack_mode: pad
peft:
  enabled: true
  r: 64
  lora_alpha: 128
  lora_dropout: 0.05
  bias: none
  target_modules: auto
train:
  num_train_epochs: 2
  per_device_train_batch_size: 1
  per_device_eval_batch_size: 1
  gradient_accumulation_steps: 16
  learning_rate: 2e-5
  weight_decay: 0.0
  warmup_ratio: 0.1
  lr_scheduler_type: cosine
  optim: paged_adamw_8bit
  max_grad_norm: 1.0
  gradient_checkpointing: true
  logging_steps: 1
  save_strategy: steps
  save_steps: 100
  save_total_limit: 4
  evaluation_strategy: steps
  eval_steps: 50
  load_best_model_at_end: true
  resume_from_checkpoint: auto
merge:
  enabled: true
  merged_dtype: float16
  max_shard_size: 2GB
  output_dir: ./merged_24b_cpt_lora