{ "step": 2000, "tokens_consumed": 1048576000, "sequences_consumed": 512000, "config": { "vocab_size": 129280, "dim": 1536, "n_layers": 16, "n_heads": 12, "head_dim": 128, "rope_head_dim": 32, "q_lora_rank": 384, "o_groups": 4, "o_lora_rank": 256, "norm_eps": 1e-06, "n_routed_experts": 12, "n_shared_experts": 1, "n_activated_experts": 2, "moe_inter_dim": 1408, "score_func": "sqrtsoftplus", "route_scale": 2.5, "swiglu_limit": 10.0, "n_dense_layers": 2, "load_balance_alpha": 0.0001, "hc_mult": 3, "hc_sinkhorn_iters": 8, "hc_eps": 1e-06, "window_size": 128, "compress_ratio": 4, "n_mtp_layers": 1, "mtp_loss_weight": 0.3, "rope_theta": 10000.0, "rope_factor": 1.0, "original_seq_len": 0, "beta_fast": 32, "beta_slow": 1, "ctx_phase_boundaries": [ 60000, 80000, 100000 ], "ctx_phase_lengths": [ 2048, 4096, 8192 ], "yarn_scale": 40.0, "yarn_alpha": 1.0, "yarn_beta": 32.0, "yarn_original_seq_len": 4096, "micro_batch_size": 1, "gradient_accumulation_steps": 32, "max_lr": 0.00042, "min_lr": 4.2e-05, "warmup_steps": 2000, "total_steps": 200000, "decay_steps": 180000, "weight_decay": 0.1, "muon_beta": 0.95, "muon_ns_steps": 5, "adam_beta1": 0.9, "adam_beta2": 0.95, "grad_clip_norm": 1.0, "init_std": 0.006, "dtype": "bfloat16", "fineweb_config": "default", "approx_tokens_per_fineweb_doc": 800, "approx_tokens_per_code_doc": 1200, "approx_tokens_per_math_doc": 600, "data_mix_probs": [ 0.6, 0.25, 0.15 ], "save_every_steps": 400, "log_every_steps": 10, "push_every_steps": 400, "checkpoint_dir": "/kaggle/working/zenyxv3checkpoints" } }