juspay
/

task2file-llm

Model card Files Files and versions

task2file-llm / trainer-kit /CPT-14b /config.yaml

SirajRLX's picture

Add Training Scripts

e527a65 verified 4 months ago

history blame contribute delete

2.1 kB

	# Run-level settings: the run directory path and the random seed.
	run:
	  run_dir: "./runs/cpt_run_14b"
	  seed: 42

	# WandB integration for experiment tracking.
	# All keys below belong to the `wandb` mapping; `enabled` here is distinct
	# from the `enabled` flags of the `peft` and `merge` sections.
	wandb:
	  enabled: true  # Set to true to enable wandb logging
	  project: "cpt-training"  # WandB project name
	  entity: null  # WandB entity/team (optional)
	  name: null  # Run name (optional, will auto-generate if null)
	  # List of tags for the run (e.g., ["lora", "qlora", "experiment-1"])
	  tags: ["cpt-lora", "sft-14b"]
	  notes: null  # Run description/notes (optional)

	# Base model location, loading options and optional 4-bit quantization.
	model:
	  # Local model path (no download)
	  repo_id: "/workspace/Models/Qwen2.5-Coder-14B"
	  revision: null

	  # Used only when repo_id is a HF repo (not a local path)
	  base_local_dir: "base_model"

	  trust_remote_code: true
	  tokenizer_use_fast: true
	  device_map: "auto"

	  torch_dtype: "bfloat16"  # "float16" | "bfloat16" | "float32"

	  # QLoRA (4-bit loading is currently disabled).
	  # NOTE(review): the bnb_4bit_* values presumably only take effect when
	  # use_4bit is true; confirm against the model loader.
	  use_4bit: false
	  bnb_4bit_quant_type: "nf4"
	  bnb_4bit_use_double_quant: false
	  bnb_4bit_compute_dtype: "bfloat16"

	  # optional: "flash_attention_2" | "sdpa" | null
	  attn_implementation: null

	# Dataset files, evaluation split and sequence packing.
	data:
	  train_jsonl: "all_data_with_descriptions.jsonl"
	  # NOTE(review): with eval_jsonl null, eval_split_ratio presumably carves
	  # the evaluation set out of train_jsonl; confirm in the data loader.
	  eval_jsonl: null
	  eval_split_ratio: 0.1
	  text_field: "text"
	  block_size: 4096
	  shuffle: true
	  num_proc: 4

	  # Packing behavior:
	  #   "drop" = strict CPT (drop remainder)
	  #   "pad"  = pad remainder to block_size + loss mask (-100) + attention_mask=0
	  pack_mode: "pad"

	# LoRA adapter (PEFT) hyperparameters.
	# `enabled` belongs to this mapping, not to the document root.
	peft:
	  enabled: true
	  r: 32
	  lora_alpha: 64
	  lora_dropout: 0.05
	  bias: "none"
	  target_modules: "auto"

	# Optimisation, checkpointing and evaluation schedule.
	# NOTE(review): key names match Hugging Face TrainingArguments and are
	# presumably forwarded to it; confirm in the training script.
	train:
	  # max_steps: 1000
	  num_train_epochs: 2

	  per_device_train_batch_size: 1
	  per_device_eval_batch_size: 1
	  gradient_accumulation_steps: 16

	  # Written with an explicit decimal point: YAML 1.1 loaders such as PyYAML
	  # resolve a bare `2e-5` as the string "2e-5" instead of a float.
	  learning_rate: 2.0e-5
	  weight_decay: 0.0
	  warmup_ratio: 0.1
	  lr_scheduler_type: "cosine"

	  optim: "paged_adamw_8bit"
	  max_grad_norm: 1.0
	  gradient_checkpointing: true

	  logging_steps: 1
	  save_strategy: "steps"
	  save_steps: 100
	  save_total_limit: 7

	  # NOTE(review): newer transformers releases name this option
	  # `eval_strategy`; keep whichever key the training script reads.
	  evaluation_strategy: "steps"
	  eval_steps: 50
	  # NOTE(review): if these map to HF TrainingArguments, save_steps must stay
	  # a round multiple of eval_steps while this is true (100 / 50 satisfies it).
	  load_best_model_at_end: true

	  resume_from_checkpoint: "auto"

	# Settings for the merged-model export.
	# `enabled` belongs to this mapping, not to the document root.
	merge:
	  enabled: true
	  # NOTE(review): model.torch_dtype is "bfloat16" while the merged output is
	  # "float16"; confirm the dtype change is intentional.
	  merged_dtype: "float16"
	  max_shard_size: "2GB"
	  output_dir: "./merged_14b_cpt_lora"