---
# Axolotl fine-tuning configuration; results are written to ./MS3.2-SFT-V2
# and the run is logged to Weights & Biases as "SFT-KTO-V2".
#
# NOTE(review): this file was recovered from a pipe-delimited table dump in
# which all indentation had been lost. The nesting below (entries under
# `datasets` and `plugins`, `pad_token` under `special_tokens`) is
# reconstructed -- confirm against the original config before launching.

# --- Model and tokenizer ---------------------------------------------------
base_model: /raid/Mango/axolotl/24B-KTO/merged
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# --- Weight loading ----------------------------------------------------------
# Both quantized-loading switches are off.
load_in_8bit: false
load_in_4bit: false
strict: false

# --- Data --------------------------------------------------------------------
# Every dataset uses the same `dan-chat-advanced` prompt type.
datasets:
  - path: PocketDoc/Dans-Codemaxx-LeetCode
    type: dan-chat-advanced
  - path: Nitral-AI/ARES-ShareGPT
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Logicmaxx-FI-VeriMed
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Prosemaxx-Cowriter-3-XS
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Toolmaxx-Functions-apigen
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Toolmaxx-Functions-Toolbench
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Logicmaxx-SAT-AP
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Logicmaxx-Skunkworks
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Taskmaxx-ConcurrentQA-Reworked
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Toolmaxx-Agent
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Toolmaxx-ShellCommands
    type: dan-chat-advanced
  - path: PocketDoc/Dans-MemoryCore-CoreCurriculum-Small
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Codemaxx-CodeFeedback-Conversations
    type: dan-chat-advanced
  - path: Delta-Vector/Orion-Praxis-Co-Writer
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Prosemaxx-Instructwriter-Long
    type: dan-chat-advanced
  # NOTE(review): this dataset is already listed above, so it is loaded
  # twice. Kept as-is in case the repetition is deliberate upsampling;
  # remove this entry if it is not.
  - path: PocketDoc/Dans-Prosemaxx-Cowriter-3-XS
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Prosemaxx-InstructWriter-ZeroShot
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Benchmaxx-COT
    type: dan-chat-advanced
  - path: PocketDoc/Dans-Benchmaxx
    type: dan-chat-advanced
  - path: NewEden/xlam-function-calling-60k-shareGPT
    type: dan-chat-advanced
shuffle_merged_datasets: true
dataset_prepared_path: dataset_prepared
val_set_size: 0.002
output_dir: ./MS3.2-SFT-V2

# --- Plugins and kernel switches ---------------------------------------------
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
# Liger's fused linear cross-entropy is off while cut_cross_entropy is on;
# presumably the two are alternative loss kernels -- TODO confirm.
liger_fused_linear_cross_entropy: false
cut_cross_entropy: true

# --- Sequence handling -------------------------------------------------------
sequence_len: 16000
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

# --- Gradient clipping -------------------------------------------------------
# NOTE(review): 0.001 is a very small clipping threshold -- confirm it is
# intentional and not a typo for a larger value.
max_grad_norm: 0.001

# --- Weights & Biases logging ------------------------------------------------
wandb_project: Training-A100
wandb_entity: null
wandb_watch: null
wandb_name: SFT-KTO-V2
wandb_log_model: null

# --- Evaluation --------------------------------------------------------------
evals_per_epoch: 4
eval_table_size: null
eval_max_new_tokens: 128

# --- Optimisation ------------------------------------------------------------
gradient_accumulation_steps: 6
micro_batch_size: 6
num_epochs: 4
optimizer: paged_ademamix_8bit
optim_args: "beta1=0.9,beta2=0.999,beta3=0.999,alpha=5"
lr_scheduler: rex
# Written with an explicit mantissa dot so that YAML 1.1 loaders (such as
# PyYAML) resolve it as a float instead of the string "1e-6".
learning_rate: 1.0e-6
warmup_ratio: 0.1
weight_decay: 0.0

# --- Loss masking and precision ----------------------------------------------
train_on_inputs: false
group_by_length: false
bf16: auto
fp16: null
tf32: false

# --- Runtime, checkpointing and distributed setup ----------------------------
gradient_checkpointing: true
early_stopping_patience: null
resume_from_checkpoint: null
local_rank: null
logging_steps: 1
xformers_attention: null
flash_attention: true
s2_attention: null
saves_per_epoch: 2
debug: null
# DeepSpeed is configured from the JSON file below; the FSDP keys are unset.
deepspeed: ./deepspeed_configs/zero3_bf16.json
fsdp: null
fsdp_config: null

# --- Tokenizer overrides -----------------------------------------------------
special_tokens:
  pad_token: '<pad>'