{
  "seed": 42,
  "outdir": "/workspace/checkpoints_v2_30L_1280_700M/",
  "use_wandb": false,
  "resume_from_checkpoint": false,
  "force_next_epoch": false,
  "manual_batch_override": true,
  "manual_batch_size": 8,
  "manual_accum_steps": 16,
  "manual_gradient_checkpointing": false,
  "moving_avg_beta": 0.98,
  "save_every_epochs": 1,
  "eval_every_epochs": 1,
  "vocab_name": "mistralai/Mistral-7B-v0.1",
  "doc_eos_token": "<|endoftext|>",
  "vocab_pad_multiple": 64,
  "tokenizer_download_timeout": 30,
  "d_model": 1280,
  "n_layers": 30,
  "n_heads": 10,
  "q_lora_rank": 640,
  "kv_lora_rank": 320,
  "qk_nope_head_dim": 64,
  "qk_rope_head_dim": 64,
  "v_head_dim": 128,
  "ff_mult": 3.5,
  "qk_norm": true,
  "use_compile": true,
  "attn_dropout": 0.05,
  "resid_dropout": 0.05,
  "emb_dropout": 0.05,
  "label_smoothing": 0.05,
  "dataset_name": "AlgoDriveAI/Cosmopedia_Math_v2_Alpha",
  "context_len": 2048,
  "batch_size": 8,
  "accum_steps": 16,
  "num_epochs": 2,
  "max_lr": 0.00015,
  "min_lr": 1.5e-05,
  "warmup_ratio": 0.05,
  "decay_ratio": 0.15,
  "weight_decay": 0.1,
  "beta1": 0.9,
  "beta2": 0.95,
  "clip_norm": 1.0,
  "decay_embeddings": false,
  "num_workers": 16,
  "prefetch_factor": 4,
  "pin_memory": true,
  "use_dataset_cache": false,
  "dataset_cache_dir": "/workspace/tokenized_cache_v2b",
  "use_double_eos": false,
  "use_memmap_token_cache": true,
  "memmap_cache_dir": "/workspace/memmap_cache_v2b",
  "memmap_write_workers": 8,
  "use_gradient_checkpointing": false,
  "run_dataloader_diagnostic": true,
  "diagnostic_batches": 50,
  "min_batch_size": 1,
  "max_batch_size": 64,
  "min_effective_batch": 128,
  "max_accum_steps": 64,
  "enable_dynamic_oom_recovery": true,
  "max_oom_retries": 5,
  "oom_batch_size_reduction_factor": 0.7,
  "fallback_context_lens": [
    2048,
    1024,
    512
  ],
  "memory_warning_threshold": 0.95,
  "reshuffle_each_epoch": true,
  "shuffle_before_packing": true,
  "num_packing_offsets": 1
}