| { |
| "step": 100, |
| "mean_loss": 12.659629821777344, |
| "best_loss": 12.208006858825684, |
| "topk_fkl": 4.3213653564453125, |
| "rkl_tail": 9.042220115661621, |
| "skew_fkl": 1.0191670060157776, |
| "fkl_tail": -0.008757250383496284, |
| "validator_tail_bucket_kl": 4.312608480453491, |
| "teacher_topk_mass": 0.9973499476909637, |
| "student_topk_mass": 0.5712139010429382, |
| "selective_weight_mean": 1.0, |
| "quasar_aux": 0.7987381517887115, |
| "kl_loss": 11.860891819000244, |
| "args": { |
| "student": "/home/claudeuser/ember", |
| "teacher": "/home/claudeuser/Qwen3.5-4B", |
| "out_dir": "/home/claudeuser/ember-greedy-v2", |
| "teacher_cache": "/home/claudeuser/_teacher_cache_climbmix.pt", |
| "skip_cache_gen": true, |
| "cache_only": false, |
| "n_prompts": 256, |
| "max_new": 1580, |
| "max_prompt_tokens": 256, |
| "topk": 128, |
| "shards_per_epoch": 6, |
| "prompts_per_shard": 48, |
| "block_hashes_file": null, |
| "w_topk_fkl": 1.0, |
| "w_rkl_tail": 0.8, |
| "w_skew": 0.3, |
| "w_fkl_tail": 0.0, |
| "skew_alpha": 0.2, |
| "w_quasar_internal": 1.0, |
| "selective_weighting": false, |
| "thread_memory_state": false, |
| "layers_from": 17, |
| "layers_to": 23, |
| "no_lm_head": false, |
| "no_final_norm": false, |
| "extra_unfreeze_layers": "", |
| "max_steps": 500, |
| "lr": 2e-05, |
| "weight_decay": 0.0, |
| "warmup_steps": 30, |
| "micro_batch": 12, |
| "grad_accum": 2, |
| "grad_clip": 1.0, |
| "save_every": 50, |
| "log_every": 5, |
| "no_grad_ckpt": false, |
| "seed": 42 |
| } |
| } |