happyQuasar6 / train_meta.json
Bittoby1040's picture
Thank you first commit
2d04cb8 verified
{
"step": 100,
"mean_loss": 12.659629821777344,
"best_loss": 12.208006858825684,
"topk_fkl": 4.3213653564453125,
"rkl_tail": 9.042220115661621,
"skew_fkl": 1.0191670060157776,
"fkl_tail": -0.008757250383496284,
"validator_tail_bucket_kl": 4.312608480453491,
"teacher_topk_mass": 0.9973499476909637,
"student_topk_mass": 0.5712139010429382,
"selective_weight_mean": 1.0,
"quasar_aux": 0.7987381517887115,
"kl_loss": 11.860891819000244,
"args": {
"student": "/home/claudeuser/ember",
"teacher": "/home/claudeuser/Qwen3.5-4B",
"out_dir": "/home/claudeuser/ember-greedy-v2",
"teacher_cache": "/home/claudeuser/_teacher_cache_climbmix.pt",
"skip_cache_gen": true,
"cache_only": false,
"n_prompts": 256,
"max_new": 1580,
"max_prompt_tokens": 256,
"topk": 128,
"shards_per_epoch": 6,
"prompts_per_shard": 48,
"block_hashes_file": null,
"w_topk_fkl": 1.0,
"w_rkl_tail": 0.8,
"w_skew": 0.3,
"w_fkl_tail": 0.0,
"skew_alpha": 0.2,
"w_quasar_internal": 1.0,
"selective_weighting": false,
"thread_memory_state": false,
"layers_from": 17,
"layers_to": 23,
"no_lm_head": false,
"no_final_norm": false,
"extra_unfreeze_layers": "",
"max_steps": 500,
"lr": 2e-05,
"weight_decay": 0.0,
"warmup_steps": 30,
"micro_batch": 12,
"grad_accum": 2,
"grad_clip": 1.0,
"save_every": 50,
"log_every": 5,
"no_grad_ckpt": false,
"seed": 42
}
}