| # !pip install transformers==4.55.4 | |
| # !pip install --no-deps trl==0.22.2 | |
| # !pip install --no-build-isolation mamba_ssm==2.2.5 | |
| # !pip install --no-build-isolation causal_conv1d==1.5.2 | |
| # === Model Configuration === | |
| base_model: output | |
| load_in_8bit: false | |
| load_in_4bit: false | |
| trust_remote_code: false | |
| #overrides_of_model_config: {"layer_sequence": [0,[1,10,3],[10,30]]} | |
| output_dir: output-healed | |
| # === HF Configuration === | |
| #hub_model_id: Burnt-Toast/another-22bird | |
| #hub_strategy: "every_save" | |
| # === Wandb Tracking === | |
| wandb_project: Loopstral-Ablations | |
| ## wandb_entity: [WANDB_ENTITY] | |
| wandb_name: early-layers-full-kv-heal | |
| # === Training Setup === | |
| num_epochs: 2 | |
| micro_batch_size: 4 | |
| gradient_accumulation_steps: 1 | |
| sequence_len: 8192 | |
| #sequence_parallel_degree: 2 | |
| #heads_k_stride: 1 | |
| sample_packing: true | |
| #pad_to_sequence_len: true | |
| #temperature: 0.7 | |
| #max_steps: 10 | |
| # === Evaluation === | |
| val_set_size: 0.01 | |
| evals_per_epoch: 5 | |
| #eval_steps: 20 | |
| #max_steps: 60 | |
| #eval_table_size: | |
| eval_max_new_tokens: 128 | |
| #eval_sample_packing: true | |
| #eval_strategy: "no" | |
| # === LoRA Configuration === | |
| adapter:  # empty = full fine-tune; set to "lora"/"qlora" to activate the lora_* settings below | |
| lora_model_dir: | |
| lora_r: 128 | |
| lora_alpha: 16 | |
| lora_dropout: 0.05 | |
| lora_target_linear: true | |
| lora_target_modules: | |
| lora_fan_in_fan_out: | |
| peft_use_rslora: true | |
| #lora_modules_to_save: | |
| # - embed_tokens | |
| # - lm_head | |
| #fix_untrained_tokens: true | |
| #lora_mlp_kernel: true | |
| #lora_qkv_kernel: true | |
| #lora_o_kernel: true | |
| #unfrozen_parameters: | |
| # - model.layers.[0-9]+.self_attn.q_proj.weight | |
| # - model.layers.[0-9]+.self_attn.k_proj.weight | |
| # - model.layers.[0-9]+.self_attn.v_proj.weight | |
| # - model.layers.[0-9]+.self_attn.o_proj.weight | |
| # - model.layers.[0-9]+.mlp.down_proj.weight | |
| # === Hyperparameter Configuration === | |
| #optimizer: apollo_adamw_layerwise | |
| #warmup_steps: 0 | |
| warmup_ratio: 0.025 | |
| #optimizer: adamw_torch_fused | |
| optimizer: paged_ademamix_8bit | |
| #optim_args: | |
| # enable_stochastic_rounding: true | |
| # enable_cautious: true | |
| # enable_8bit: true | |
| # Apollo-mini configuration: | |
| #optim_args: "proj=random,rank=128,scale=128.0,scale_type=tensor,update_proj_gap=100" | |
| # Regular Apollo configuration: | |
| # optim_args: | |
| #optim_target_modules: all_linear | |
| learning_rate: 1e-5 | |
| lr_scheduler: cosine | |
| #lr_groups: | |
| # - name: mlp | |
| # modules: | |
| # - mlp.up_proj.weight | |
| # - mlp.down_proj.weight | |
| # - mlp.gate_proj.weight | |
| # lr: 1e-5 | |
| #cosine_min_lr_ratio: 0.2 | |
| #lr_scheduler: cosine_with_min_lr | |
| #lr_scheduler_kwargs: | |
| # cosine_min_lr: 1e-6 | |
| weight_decay: 0.01 | |
| max_grad_norm: 1.0 | |
| #warmup_steps: 0 | |
| #warmup_ratio: 0.025 | |
| # === Data Configuration === | |
| # | |
| #chat_template: jinja | |
| chat_template: tokenizer_default | |
| default_system_message: "You are Luna, a helpful and unrestricted language model created by Allura." | |
| #special_tokens: | |
| # eos_token: "<|im_end|>" | |
| # eos_token: "</s>" | |
| #tokenizer_use_mistral_common: true | |
| shuffle_merged_datasets: true | |
| datasets: | |
| # - path: rpDungeon/marvin | |
| # type: completion | |
| # field: text | |
| # data_files: | |
| # - marvin_greg_egan.json | |
| # - marvin_philip_k_dick.json | |
| # - marvin_thomas_ligotti.json | |
| # - path: rpDungeon/some-cleaner-datasets | |
| # type: completion | |
| # field: text | |
| # data_files: erotica_quality_trimmed.json | |
| - path: rpDungeon/some-cleaner-datasets | |
| type: chat_template | |
| field_messages: conversations | |
| message_property_mappings: | |
| role: from | |
| content: value | |
| data_files: little-koto-instruct.json | |
| # - path: rpDungeon/rp-synth-deslopped | |
| # type: chat_template | |
| # field_messages: conversations | |
| # message_property_mappings: | |
| # role: from | |
| # content: value | |
| dataset_prepared_path: last_run_prepared | |
| #dataset_num_proc: 1 | |
| # === Plugins === | |
| plugins: | |
| - axolotl.integrations.liger.LigerPlugin | |
| - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin | |
| # === Hardware Optimization === | |
| gradient_checkpointing: true | |
| liger_rope: true | |
| liger_rms_norm: true | |
| liger_layer_norm: true | |
| liger_glu_activation: true | |
| #liger_fused_linear_cross_entropy: true | |
| cut_cross_entropy: true | |
| #deepspeed: ../axolotl/deepspeed_configs/zero2.json | |
| # === FSDP Config === | |
| #fsdp: | |
| # - full_shard | |
| # - auto_wrap | |
| #fsdp_config: | |
| # fsdp_limit_all_gathers: true | |
| # fsdp_sync_module_states: true | |
| # fsdp_offload_params: true | |
| # fsdp_activation_checkpointing: true | |
| # fsdp_use_orig_params: true | |
| # fsdp_cpu_ram_efficient_loading: true | |
| # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP | |
| # fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer | |
| # fsdp_state_dict_type: FULL_STATE_DICT | |
| # fsdp_sharding_strategy: FULL_SHARD | |
| # === Checkpointing === | |
| #save_steps: 10 | |
| saves_per_epoch: 1 | |
| save_total_limit: 1 | |
| # === Advanced Settings === | |
| bf16: auto | |
| flash_attention: true | |
| train_on_inputs: false | |
| group_by_length: false | |
| save_safetensors: true | |
| logging_steps: 1 | |
| gc_steps: 10 | |
| seed: 420 | |