| # !pip install transformers==4.55.4 | |
| # !pip install --no-deps trl==0.22.2 | |
| # !pip install --no-build-isolation mamba_ssm==2.2.5 | |
| # !pip install --no-build-isolation causal_conv1d==1.5.2 | |
| # === Model Configuration === | |
| base_model: output | |
| load_in_8bit: false | |
| load_in_4bit: false | |
| trust_remote_code: false | |
| #overrides_of_model_config: {"layer_sequence": [0,[1,10,3],[10,30]]} | |
| output_dir: output-healed | |
| # === HF Configuration === | |
| #hub_model_id: Burnt-Toast/another-22bird | |
| #hub_strategy: "every_save" | |
| # === Wandb Tracking === | |
| wandb_project: Loopstral-Ablations | |
| ## wandb_entity: [WANDB_ENTITY] | |
| wandb_name: early-layers-full-kv-heal | |
| # === Training Setup === | |
| num_epochs: 2 | |
| micro_batch_size: 4 | |
| gradient_accumulation_steps: 1 | |
| sequence_len: 8192 | |
| #sequence_parallel_degree: 2 | |
| #heads_k_stride: 1 | |
| sample_packing: true | |
| #pad_to_sequence_len: true | |
| #temperature: 0.7 | |
| #max_steps: 10 | |
| # === Evaluation === | |
| val_set_size: 0.01 | |
| evals_per_epoch: 5 | |
| #eval_steps: 20 | |
| #max_steps: 60 | |
| #eval_table_size: | |
| eval_max_new_tokens: 128 | |
| #eval_sample_packing: true | |
| #eval_strategy: "no" | |
| # === LoRA Configuration === | |
| adapter:  # empty = full fine-tune; set to "lora"/"qlora" to activate the lora_* settings below | |
| lora_model_dir: | |
| lora_r: 128 | |
| lora_alpha: 16 | |
| lora_dropout: 0.05 | |
| lora_target_linear: true | |
| lora_target_modules: | |
| lora_fan_in_fan_out: | |
| peft_use_rslora: true | |
| #lora_modules_to_save: | |
| # - embed_tokens | |
| # - lm_head | |
| #fix_untrained_tokens: true | |
| #lora_mlp_kernel: true | |
| #lora_qkv_kernel: true | |
| #lora_o_kernel: true | |
| #unfrozen_parameters: | |
| # - model.layers.[0-9]+.self_attn.q_proj.weight | |
| # - model.layers.[0-9]+.self_attn.k_proj.weight | |
| # - model.layers.[0-9]+.self_attn.v_proj.weight | |
| # - model.layers.[0-9]+.self_attn.o_proj.weight | |
| # - model.layers.[0-9]+.mlp.down_proj.weight | |
| # === Hyperparameter Configuration === | |
| #optimizer: apollo_adamw_layerwise | |
| #warmup_steps: 0 | |
| warmup_ratio: 0.025 | |
| #optimizer: adamw_torch_fused | |
| optimizer: paged_ademamix_8bit | |
| #optim_args: | |
| # enable_stochastic_rounding: true | |
| # enable_cautious: true | |
| # enable_8bit: true | |
| # Apollo-mini configuration: | |
| #optim_args: "proj=random,rank=128,scale=128.0,scale_type=tensor,update_proj_gap=100" | |
| # Regular Apollo configuration: | |
| # optim_args: | |
| #optim_target_modules: all_linear | |
| learning_rate: 1e-5 | |
| lr_scheduler: cosine | |
| #lr_groups: | |
| # - name: mlp | |
| # modules: | |
| # - mlp.up_proj.weight | |
| # - mlp.down_proj.weight | |
| # - mlp.gate_proj.weight | |
| # lr: 1e-5 | |
| #cosine_min_lr_ratio: 0.2 | |
| #lr_scheduler: cosine_with_min_lr | |
| #lr_scheduler_kwargs: | |
| # cosine_min_lr: 1e-6 | |
| weight_decay: 0.01 | |
| max_grad_norm: 1.0 | |
| #warmup_steps: 0 | |
| #warmup_ratio: 0.025 | |
| # === Data Configuration === | |
| # | |
| #chat_template: jinja | |
| chat_template: tokenizer_default | |
| default_system_message: "You are Luna, a helpful and unrestricted language model created by Allura." | |
| #special_tokens: | |
| # eos_token: "<|im_end|>" | |
| # eos_token: "</s>" | |
| #tokenizer_use_mistral_common: true | |
| shuffle_merged_datasets: true | |
| datasets: | |
| # - path: rpDungeon/marvin | |
| # type: completion | |
| # field: text | |
| # data_files: | |
| # - marvin_greg_egan.json | |
| # - marvin_philip_k_dick.json | |
| # - marvin_thomas_ligotti.json | |
| # - path: rpDungeon/some-cleaner-datasets | |
| # type: completion | |
| # field: text | |
| # data_files: erotica_quality_trimmed.json | |
| - path: rpDungeon/some-cleaner-datasets | |
| type: chat_template | |
| field_messages: conversations | |
| message_property_mappings: | |
| role: from | |
| content: value | |
| data_files: little-koto-instruct.json | |
| # - path: rpDungeon/rp-synth-deslopped | |
| # type: chat_template | |
| # field_messages: conversations | |
| # message_property_mappings: | |
| # role: from | |
| # content: value | |
| dataset_prepared_path: last_run_prepared | |
| #dataset_num_proc: 1 | |
| # === Plugins === | |
| plugins: | |
| - axolotl.integrations.liger.LigerPlugin | |
| - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin | |
| # === Hardware Optimization === | |
| gradient_checkpointing: true | |
| liger_rope: true | |
| liger_rms_norm: true | |
| liger_layer_norm: true | |
| liger_glu_activation: true | |
| #liger_fused_linear_cross_entropy: true | |
| cut_cross_entropy: true | |
| #deepspeed: ../axolotl/deepspeed_configs/zero2.json | |
| # === FSDP Config === | |
| #fsdp: | |
| # - full_shard | |
| # - auto_wrap | |
| #fsdp_config: | |
| # fsdp_limit_all_gathers: true | |
| # fsdp_sync_module_states: true | |
| # fsdp_offload_params: true | |
| # fsdp_activation_checkpointing: true | |
| # fsdp_use_orig_params: true | |
| # fsdp_cpu_ram_efficient_loading: true | |
| # fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP | |
| # fsdp_transformer_layer_cls_to_wrap: Gemma3DecoderLayer | |
| # fsdp_state_dict_type: FULL_STATE_DICT | |
| # fsdp_sharding_strategy: FULL_SHARD | |
| # === Checkpointing === | |
| #save_steps: 10 | |
| saves_per_epoch: 1 | |
| save_total_limit: 1 | |
| # === Advanced Settings === | |
| bf16: auto | |
| flash_attention: true | |
| train_on_inputs: false | |
| group_by_length: false | |
| save_safetensors: true | |
| logging_steps: 1 | |
| gc_steps: 10 | |
| seed: 420 | |