{
"dataset": "/root/lang_mix.jsonl",
"tokenizer": "/root/tokenizers",
"out_dir": "/root/models/run_cen_V",
"use_hf": false,
"hf_name": null,
"hf_subset": null,
"hf_split": "train",
"hf_streaming": true,
"hf_text_field": "text",
"hf_messages_field": "messages",
"shuffle_buffer_size": 10000,
"seed": 1337,
"init_from_checkpoint": null,
"finetune_from": null,
"strict_vocab_match": false,
"save_tag": null,
"save_every_n": 2000,
"keep_last_k": 3,
"save_on_improve": false,
"improve_delta": 0.0,
"resume_from_checkpoint": null,
"log_loss_to_csv": false,
"dataset_text_field": "text",
"min_sample_token_length": 8,
"stream_local_dataset": false,
"local_dataset_shuffle_buffer": 2048,
"block_count_sample_fraction": 0.02,
"block_count_min_sample_megabytes": 32,
"block_count_max_sample_megabytes": 512,
"precision": "bf16",
"optimizer": "adamw",
"lr": 0.0003,
"weight_decay": 0.06,
"beta2": 0.98,
"adam_eps": 1e-08,
"grad_clip": 0.8,
"optim_eps": 1e-08,
"lr_scheduler": "cosine",
"min_lr_ratio": 0.05,
"muon_lr": null,
"muon_momentum": 0.95,
"muon_exclude_embeddings": true,
"muon_beta1": null,
"muon_beta2": null,
"muon_eps": null,
"muon_bias_correction": true,
"muon_clip_by_layer": false,
"muon_lr_correction": true,
"batch_size": 4,
"accum_steps": 16,
"epochs": 2,
"warmup_frac": 0.05,
"log_every_n": 10,
"overfit_subset": null,
"use_gradient_checkpoint": false,
"num_workers": 2,
"pin_memory": true,
"persistent_workers": true,
"sft_mode": "lora",
"lora_r": 64,
"lora_alpha": 96,
"lora_dropout": 0.05,
"include_agent_end": true,
"include_eos": false,
"mask_user_queries": true
}