juspay
/

task2file-llm

Model card Files Files and versions

task2file-llm / trainer-kit /CPT /config.yaml

SirajRLX's picture

Add Training Scripts

e527a65 verified about 2 months ago

history blame contribute delete

1.71 kB

	run:
	run_dir: "./runs/cpt_run_v1"
	seed: 42

	model:
	# Local model path (no download)
	repo_id: "/workspace/Models/Devstral-Small-2-24B-Instruct-2512"
	revision: null

	# Used only when repo_id is a HF repo (not a local path)
	base_local_dir: "base_model"

	trust_remote_code: true
	tokenizer_use_fast: true
	device_map: "auto"

	torch_dtype: "bfloat16" # "float16" \| "bfloat16" \| "float32"

	# QLoRA
	use_4bit: false
	bnb_4bit_quant_type: "nf4"
	bnb_4bit_use_double_quant: false
	bnb_4bit_compute_dtype: "bfloat16"

	# optional: "flash_attention_2" \| "sdpa" \| null
	attn_implementation: null

	data:
	train_jsonl: "/workspace/all_data_with_descriptions.jsonl"
	eval_jsonl: null
	eval_split_ratio: 0.1
	text_field: "text"
	block_size: 4096
	shuffle: true
	num_proc: 4

	# ✅ NEW: packing behavior
	# "drop" = strict CPT (drop remainder)
	# "pad" = pad remainder to block_size + loss mask (-100) + attention_mask=0
	pack_mode: "pad"

	peft:
	enabled: true
	r: 64
	lora_alpha: 128
	lora_dropout: 0.05
	bias: "none"
	target_modules: "auto"

	train:
	#max_steps: 1000
	num_train_epochs: 2

	per_device_train_batch_size: 1
	per_device_eval_batch_size: 1
	gradient_accumulation_steps: 16

	learning_rate: 2e-5
	weight_decay: 0.0
	warmup_ratio: 0.1
	lr_scheduler_type: "cosine"

	optim: "paged_adamw_8bit"
	max_grad_norm: 1.0
	gradient_checkpointing: true

	logging_steps: 1
	save_strategy: "steps"
	save_steps: 100
	save_total_limit: 4

	evaluation_strategy: "steps"
	eval_steps: 50
	load_best_model_at_end: true

	resume_from_checkpoint: "auto"

	merge:
	enabled: true
	merged_dtype: "float16"
	max_shard_size: "2GB"
	output_dir: "./merged_24b_cpt_lora"