{
  "dataset": "/root/lang_mix.jsonl",
  "tokenizer": "/root/tokenizers",
  "out_dir": "/root/models/run_cen_V",
  "use_hf": false,
  "hf_name": null,
  "hf_subset": null,
  "hf_split": "train",
  "hf_streaming": true,
  "hf_text_field": "text",
  "hf_messages_field": "messages",
  "shuffle_buffer_size": 10000,
  "seed": 1337,
  "init_from_checkpoint": null,
  "finetune_from": null,
  "strict_vocab_match": false,
  "save_tag": null,
  "save_every_n": 2000,
  "keep_last_k": 3,
  "save_on_improve": false,
  "improve_delta": 0.0,
  "resume_from_checkpoint": null,
  "log_loss_to_csv": false,
  "dataset_text_field": "text",
  "min_sample_token_length": 8,
  "stream_local_dataset": false,
  "local_dataset_shuffle_buffer": 2048,
  "block_count_sample_fraction": 0.02,
  "block_count_min_sample_megabytes": 32,
  "block_count_max_sample_megabytes": 512,
  "precision": "bf16",
  "optimizer": "adamw",
  "lr": 0.0003,
  "weight_decay": 0.06,
  "beta2": 0.98,
  "adam_eps": 1e-08,
  "grad_clip": 0.8,
  "optim_eps": 1e-08,
  "lr_scheduler": "cosine",
  "min_lr_ratio": 0.05,
  "muon_lr": null,
  "muon_momentum": 0.95,
  "muon_exclude_embeddings": true,
  "muon_beta1": null,
  "muon_beta2": null,
  "muon_eps": null,
  "muon_bias_correction": true,
  "muon_clip_by_layer": false,
  "muon_lr_correction": true,
  "batch_size": 4,
  "accum_steps": 16,
  "epochs": 2,
  "warmup_frac": 0.05,
  "log_every_n": 10,
  "overfit_subset": null,
  "use_gradient_checkpoint": false,
  "num_workers": 2,
  "pin_memory": true,
  "persistent_workers": true,
  "sft_mode": "lora",
  "lora_r": 64,
  "lora_alpha": 96,
  "lora_dropout": 0.05,
  "include_agent_end": true,
  "include_eos": false,
  "mask_user_queries": true
}