| { |
| "step": 2000, |
| "tokens_consumed": 1048576000, |
| "sequences_consumed": 512000, |
| "config": { |
| "vocab_size": 129280, |
| "dim": 1536, |
| "n_layers": 16, |
| "n_heads": 12, |
| "head_dim": 128, |
| "rope_head_dim": 32, |
| "q_lora_rank": 384, |
| "o_groups": 4, |
| "o_lora_rank": 256, |
| "norm_eps": 1e-06, |
| "n_routed_experts": 12, |
| "n_shared_experts": 1, |
| "n_activated_experts": 2, |
| "moe_inter_dim": 1408, |
| "score_func": "sqrtsoftplus", |
| "route_scale": 2.5, |
| "swiglu_limit": 10.0, |
| "n_dense_layers": 2, |
| "load_balance_alpha": 0.0001, |
| "hc_mult": 3, |
| "hc_sinkhorn_iters": 8, |
| "hc_eps": 1e-06, |
| "window_size": 128, |
| "compress_ratio": 4, |
| "n_mtp_layers": 1, |
| "mtp_loss_weight": 0.3, |
| "rope_theta": 10000.0, |
| "rope_factor": 1.0, |
| "original_seq_len": 0, |
| "beta_fast": 32, |
| "beta_slow": 1, |
| "ctx_phase_boundaries": [ |
| 60000, |
| 80000, |
| 100000 |
| ], |
| "ctx_phase_lengths": [ |
| 2048, |
| 4096, |
| 8192 |
| ], |
| "yarn_scale": 40.0, |
| "yarn_alpha": 1.0, |
| "yarn_beta": 32.0, |
| "yarn_original_seq_len": 4096, |
| "micro_batch_size": 1, |
| "gradient_accumulation_steps": 32, |
| "max_lr": 0.00042, |
| "min_lr": 4.2e-05, |
| "warmup_steps": 2000, |
| "total_steps": 200000, |
| "decay_steps": 180000, |
| "weight_decay": 0.1, |
| "muon_beta": 0.95, |
| "muon_ns_steps": 5, |
| "adam_beta1": 0.9, |
| "adam_beta2": 0.95, |
| "grad_clip_norm": 1.0, |
| "init_std": 0.006, |
| "dtype": "bfloat16", |
| "fineweb_config": "default", |
| "approx_tokens_per_fineweb_doc": 800, |
| "approx_tokens_per_code_doc": 1200, |
| "approx_tokens_per_math_doc": 600, |
| "data_mix_probs": [ |
| 0.6, |
| 0.25, |
| 0.15 |
| ], |
| "save_every_steps": 400, |
| "log_every_steps": 10, |
| "push_every_steps": 400, |
| "checkpoint_dir": "/kaggle/working/zenyxv3checkpoints" |
| } |
| } |