| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.6006006006006006, | |
| "eval_steps": 50, | |
| "global_step": 400, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02002002002002002, | |
| "grad_norm": 0.05460292845964432, | |
| "grpo_mean_advantage": -1.3560057254835556e-07, | |
| "grpo_mean_group_score": 0.5922331809997559, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 3.0318567496578908e-06, | |
| "learning_rate": 8.000000000000001e-07, | |
| "loss": 0.007, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.04004004004004004, | |
| "grad_norm": 0.0679207444190979, | |
| "grpo_mean_advantage": 3.6619603633880615e-06, | |
| "grpo_mean_group_score": 0.5561589002609253, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.6246918676188216e-05, | |
| "learning_rate": 1.8000000000000001e-06, | |
| "loss": 0.0107, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06006006006006006, | |
| "grad_norm": 0.05788416787981987, | |
| "grpo_mean_advantage": -1.0654330395709621e-07, | |
| "grpo_mean_group_score": 0.5759152173995972, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 5.399440965447866e-07, | |
| "learning_rate": 2.8000000000000003e-06, | |
| "loss": 0.007, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.08008008008008008, | |
| "grad_norm": 0.0746568813920021, | |
| "grpo_mean_advantage": -5.871057737749652e-07, | |
| "grpo_mean_group_score": 0.5127314329147339, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.6951597646984737e-06, | |
| "learning_rate": 3.8000000000000005e-06, | |
| "loss": 0.0246, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.1001001001001001, | |
| "grad_norm": 0.11442846059799194, | |
| "grpo_mean_advantage": 6.370246410369873e-07, | |
| "grpo_mean_group_score": 0.539706826210022, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.8908377771585947e-06, | |
| "learning_rate": 4.800000000000001e-06, | |
| "loss": 0.0337, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.12012012012012012, | |
| "grad_norm": 0.05778791010379791, | |
| "grpo_mean_advantage": 6.705522359595761e-09, | |
| "grpo_mean_group_score": 0.5812538862228394, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.189450800775376e-07, | |
| "learning_rate": 4.999125183044924e-06, | |
| "loss": 0.0171, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.14014014014014015, | |
| "grad_norm": 0.05819695070385933, | |
| "grpo_mean_advantage": 3.859400692363124e-07, | |
| "grpo_mean_group_score": 0.5909844636917114, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.6833292875162442e-06, | |
| "learning_rate": 4.995572288443412e-06, | |
| "loss": 0.0145, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.16016016016016016, | |
| "grad_norm": 0.07968433201313019, | |
| "grpo_mean_advantage": 2.600252742013254e-07, | |
| "grpo_mean_group_score": 0.5630953907966614, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.4095899132371414e-06, | |
| "learning_rate": 4.98929052218411e-06, | |
| "loss": 0.0196, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.18018018018018017, | |
| "grad_norm": 0.0733402892947197, | |
| "grpo_mean_advantage": -1.2591480924584175e-07, | |
| "grpo_mean_group_score": 0.5604403614997864, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.0309080380466185e-06, | |
| "learning_rate": 4.980286753286196e-06, | |
| "loss": 0.0186, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.2002002002002002, | |
| "grad_norm": 0.07136482000350952, | |
| "grpo_mean_advantage": -2.808868941883702e-07, | |
| "grpo_mean_group_score": 0.5971035957336426, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.5696078889959608e-06, | |
| "learning_rate": 4.9685708272387645e-06, | |
| "loss": 0.0286, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22022022022022023, | |
| "grad_norm": 0.08851475268602371, | |
| "grpo_mean_advantage": 2.6822089438383045e-08, | |
| "grpo_mean_group_score": 0.5892971754074097, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 3.7878271541558206e-07, | |
| "learning_rate": 4.9541555552349404e-06, | |
| "loss": 0.0054, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.24024024024024024, | |
| "grad_norm": 0.07778509706258774, | |
| "grpo_mean_advantage": -5.662441182607836e-08, | |
| "grpo_mean_group_score": 0.564322292804718, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.128998393251095e-07, | |
| "learning_rate": 4.9370567001630155e-06, | |
| "loss": -0.0074, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.2602602602602603, | |
| "grad_norm": 0.08740051090717316, | |
| "grpo_mean_advantage": -1.5944242193199898e-07, | |
| "grpo_mean_group_score": 0.562497615814209, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.6374274309782777e-06, | |
| "learning_rate": 4.917292959369968e-06, | |
| "loss": 0.0145, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.2802802802802803, | |
| "grad_norm": 0.19070060551166534, | |
| "grpo_mean_advantage": 1.6838312433264946e-07, | |
| "grpo_mean_group_score": 0.5904761552810669, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 8.536571272088622e-07, | |
| "learning_rate": 4.8948859442161876e-06, | |
| "loss": 0.0257, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.3003003003003003, | |
| "grad_norm": 0.07321271300315857, | |
| "grpo_mean_advantage": 1.1175870895385742e-07, | |
| "grpo_mean_group_score": 0.5765624046325684, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.451961667153228e-07, | |
| "learning_rate": 4.869860156443768e-06, | |
| "loss": 0.0024, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.3203203203203203, | |
| "grad_norm": 0.07126748561859131, | |
| "grpo_mean_advantage": -1.4603138254187797e-07, | |
| "grpo_mean_group_score": 0.5858271718025208, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.1309343790344428e-06, | |
| "learning_rate": 4.842242961384211e-06, | |
| "loss": 0.0277, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.34034034034034033, | |
| "grad_norm": 0.08629189431667328, | |
| "grpo_mean_advantage": -1.817941665649414e-06, | |
| "grpo_mean_group_score": 0.5871662497520447, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.1141768482048064e-05, | |
| "learning_rate": 4.812064558034847e-06, | |
| "loss": 0.0246, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.36036036036036034, | |
| "grad_norm": 0.0998779758810997, | |
| "grpo_mean_advantage": 1.8179416372277046e-07, | |
| "grpo_mean_group_score": 0.5330992937088013, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.210335072864837e-07, | |
| "learning_rate": 4.779357946036662e-06, | |
| "loss": 0.0056, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.38038038038038036, | |
| "grad_norm": 0.10614689439535141, | |
| "grpo_mean_advantage": -2.972781771859445e-07, | |
| "grpo_mean_group_score": 0.5265295505523682, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 3.1582342217006953e-06, | |
| "learning_rate": 4.74415888958968e-06, | |
| "loss": 0.0053, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.4004004004004004, | |
| "grad_norm": 0.10345634073019028, | |
| "grpo_mean_advantage": -7.033348197182931e-07, | |
| "grpo_mean_group_score": 0.5660771131515503, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 4.245831405569334e-06, | |
| "learning_rate": 4.706505878345343e-06, | |
| "loss": 0.0134, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.42042042042042044, | |
| "grad_norm": 0.10077933222055435, | |
| "grpo_mean_advantage": 1.1920928955078125e-07, | |
| "grpo_mean_group_score": 0.57631915807724, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 3.2809634831210133e-07, | |
| "learning_rate": 4.666440085318626e-06, | |
| "loss": 0.0004, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.44044044044044045, | |
| "grad_norm": 0.09548182785511017, | |
| "grpo_mean_advantage": -4.0978193283081055e-07, | |
| "grpo_mean_group_score": 0.546563982963562, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.0397578636184335e-06, | |
| "learning_rate": 4.624005321865968e-06, | |
| "loss": 0.0033, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.46046046046046046, | |
| "grad_norm": 0.09417816251516342, | |
| "grpo_mean_advantage": -1.467764434437413e-07, | |
| "grpo_mean_group_score": 0.5519219636917114, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.2689375782647403e-06, | |
| "learning_rate": 4.57924798977818e-06, | |
| "loss": 0.0095, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.4804804804804805, | |
| "grad_norm": 0.10022275149822235, | |
| "grpo_mean_advantage": -5.215406329028838e-09, | |
| "grpo_mean_group_score": 0.5490407943725586, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 7.929010621410271e-07, | |
| "learning_rate": 4.532217030540781e-06, | |
| "loss": 0.0006, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5005005005005005, | |
| "grad_norm": 0.14057794213294983, | |
| "grpo_mean_advantage": -5.7369469175228005e-08, | |
| "grpo_mean_group_score": 0.5646580457687378, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.2823379620385822e-06, | |
| "learning_rate": 4.482963871817195e-06, | |
| "loss": -0.0046, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5205205205205206, | |
| "grad_norm": 0.12420658767223358, | |
| "grpo_mean_advantage": 2.9876827056796174e-07, | |
| "grpo_mean_group_score": 0.6111599802970886, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.0496698905626545e-06, | |
| "learning_rate": 4.4315423712133595e-06, | |
| "loss": -0.003, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5405405405405406, | |
| "grad_norm": 0.14342808723449707, | |
| "grpo_mean_advantage": 1.5869736103013565e-07, | |
| "grpo_mean_group_score": 0.5619662404060364, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.2748531617035042e-06, | |
| "learning_rate": 4.378008757385222e-06, | |
| "loss": 0.0154, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.5605605605605606, | |
| "grad_norm": 0.14729444682598114, | |
| "grpo_mean_advantage": 3.0100346748440643e-07, | |
| "grpo_mean_group_score": 0.5795454978942871, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.4499684059264837e-06, | |
| "learning_rate": 4.322421568553529e-06, | |
| "loss": -0.0262, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5805805805805806, | |
| "grad_norm": 0.15249410271644592, | |
| "grpo_mean_advantage": -3.233552092751779e-07, | |
| "grpo_mean_group_score": 0.5804953575134277, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.248456669600273e-06, | |
| "learning_rate": 4.2648415884931476e-06, | |
| "loss": 0.0018, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.6006006006006006, | |
| "grad_norm": 0.1841023564338684, | |
| "grpo_mean_advantage": 3.2261013416245987e-07, | |
| "grpo_mean_group_score": 0.5628539323806763, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.4773489738217904e-06, | |
| "learning_rate": 4.205331780066892e-06, | |
| "loss": -0.017, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6206206206206206, | |
| "grad_norm": 0.18597163259983063, | |
| "grpo_mean_advantage": -2.5331974029541016e-07, | |
| "grpo_mean_group_score": 0.5727725625038147, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.5092309695319273e-06, | |
| "learning_rate": 4.1439572163765615e-06, | |
| "loss": 0.0044, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.6406406406406406, | |
| "grad_norm": 0.18310388922691345, | |
| "grpo_mean_advantage": -6.780028627417778e-08, | |
| "grpo_mean_group_score": 0.5833909511566162, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 8.550978805033083e-07, | |
| "learning_rate": 4.0807850096064605e-06, | |
| "loss": -0.005, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6606606606606606, | |
| "grad_norm": 0.2192923128604889, | |
| "grpo_mean_advantage": -5.587935447692871e-08, | |
| "grpo_mean_group_score": 0.5742615461349487, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 3.564579174053506e-07, | |
| "learning_rate": 4.015884237637206e-06, | |
| "loss": -0.015, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.6806806806806807, | |
| "grad_norm": 0.16708803176879883, | |
| "grpo_mean_advantage": -5.327165126800537e-07, | |
| "grpo_mean_group_score": 0.5758188962936401, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.309018327650847e-06, | |
| "learning_rate": 3.949325868510083e-06, | |
| "loss": -0.0314, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7007007007007007, | |
| "grad_norm": 0.3401262164115906, | |
| "grpo_mean_advantage": 5.863606702405377e-07, | |
| "grpo_mean_group_score": 0.5767683982849121, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.4449204829579685e-06, | |
| "learning_rate": 3.881182682824534e-06, | |
| "loss": -0.0441, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7207207207207207, | |
| "grad_norm": 0.1931898146867752, | |
| "grpo_mean_advantage": 3.2186508747145126e-07, | |
| "grpo_mean_group_score": 0.586772084236145, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.293551688126172e-06, | |
| "learning_rate": 3.811529194153635e-06, | |
| "loss": -0.0162, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7407407407407407, | |
| "grad_norm": 0.2537969648838043, | |
| "grpo_mean_advantage": -4.470348358154297e-08, | |
| "grpo_mean_group_score": 0.549396276473999, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 3.7067667335577426e-07, | |
| "learning_rate": 3.7404415675646054e-06, | |
| "loss": -0.0386, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.7607607607607607, | |
| "grad_norm": 0.20326584577560425, | |
| "grpo_mean_advantage": -2.1010637851759384e-07, | |
| "grpo_mean_group_score": 0.5798425078392029, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.1695076409523608e-06, | |
| "learning_rate": 3.667997536333424e-06, | |
| "loss": -0.037, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.7807807807807807, | |
| "grad_norm": 0.25048357248306274, | |
| "grpo_mean_advantage": 1.765787658314366e-07, | |
| "grpo_mean_group_score": 0.5584167838096619, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.429934738756856e-06, | |
| "learning_rate": 3.59427631694463e-06, | |
| "loss": -0.0292, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8008008008008008, | |
| "grad_norm": 0.2687569260597229, | |
| "grpo_mean_advantage": 1.6540289493605087e-07, | |
| "grpo_mean_group_score": 0.5676193237304688, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.6342788714828203e-06, | |
| "learning_rate": 3.5193585224692595e-06, | |
| "loss": -0.0454, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8208208208208209, | |
| "grad_norm": 0.22301620244979858, | |
| "grpo_mean_advantage": -1.0944902442133753e-06, | |
| "grpo_mean_group_score": 0.5669739842414856, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 5.346942998585291e-06, | |
| "learning_rate": 3.44332607441564e-06, | |
| "loss": -0.0423, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8408408408408409, | |
| "grad_norm": 0.3040211498737335, | |
| "grpo_mean_advantage": 2.4065374759629776e-07, | |
| "grpo_mean_group_score": 0.5922158360481262, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.6327536513927043e-06, | |
| "learning_rate": 3.3662621131494204e-06, | |
| "loss": -0.0857, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.8608608608608609, | |
| "grad_norm": 0.27231141924858093, | |
| "grpo_mean_advantage": -5.21540641784668e-08, | |
| "grpo_mean_group_score": 0.5473950505256653, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 5.847922466273303e-07, | |
| "learning_rate": 3.2882509069808044e-06, | |
| "loss": -0.0278, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.8808808808808809, | |
| "grad_norm": 0.3571636378765106, | |
| "grpo_mean_advantage": 6.541609991472797e-07, | |
| "grpo_mean_group_score": 0.5880032777786255, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 4.072162937518442e-06, | |
| "learning_rate": 3.2093777600183873e-06, | |
| "loss": -0.0727, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9009009009009009, | |
| "grad_norm": 0.306273490190506, | |
| "grpo_mean_advantage": -1.2218951894737984e-07, | |
| "grpo_mean_group_score": 0.5835092663764954, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 4.386006935419573e-07, | |
| "learning_rate": 3.1297289188903705e-06, | |
| "loss": -0.0464, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.9209209209209209, | |
| "grad_norm": 0.2700377106666565, | |
| "grpo_mean_advantage": 1.7605722177904681e-06, | |
| "grpo_mean_group_score": 0.5394966006278992, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 8.007580618141219e-06, | |
| "learning_rate": 3.049391478435133e-06, | |
| "loss": -0.0295, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.9409409409409409, | |
| "grad_norm": 0.39531761407852173, | |
| "grpo_mean_advantage": -3.3080578987210174e-07, | |
| "grpo_mean_group_score": 0.5687432289123535, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.551636614749441e-06, | |
| "learning_rate": 2.9684532864643123e-06, | |
| "loss": -0.031, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.960960960960961, | |
| "grad_norm": 0.5987040996551514, | |
| "grpo_mean_advantage": 2.712011450967111e-07, | |
| "grpo_mean_group_score": 0.5550583600997925, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.4400844747797237e-06, | |
| "learning_rate": 2.887002847702504e-06, | |
| "loss": -0.0789, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.980980980980981, | |
| "grad_norm": 0.5680716037750244, | |
| "grpo_mean_advantage": -3.2857059295565705e-07, | |
| "grpo_mean_group_score": 0.558111310005188, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.105091425619321e-06, | |
| "learning_rate": 2.8051292270086506e-06, | |
| "loss": -0.1131, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.6204046010971069, | |
| "grpo_mean_advantage": 4.470348358154297e-08, | |
| "grpo_mean_group_score": 0.6196198463439941, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 5.315724820320611e-07, | |
| "learning_rate": 2.722921951984927e-06, | |
| "loss": -0.2232, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.02002002002002, | |
| "grad_norm": 0.8389026522636414, | |
| "grpo_mean_advantage": 9.290873776990338e-07, | |
| "grpo_mean_group_score": 0.582168459892273, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 4.219644324621186e-06, | |
| "learning_rate": 2.640470915079614e-06, | |
| "loss": -0.1363, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.04004004004004, | |
| "grad_norm": 0.9067686796188354, | |
| "grpo_mean_advantage": 2.533197474008375e-08, | |
| "grpo_mean_group_score": 0.5551307797431946, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.6600588992332632e-07, | |
| "learning_rate": 2.557866275291035e-06, | |
| "loss": -0.1868, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.06006006006006, | |
| "grad_norm": 0.9277902841567993, | |
| "grpo_mean_advantage": -5.662441182607836e-08, | |
| "grpo_mean_group_score": 0.535040020942688, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.0909400316450046e-06, | |
| "learning_rate": 2.4751983595800093e-06, | |
| "loss": -0.1792, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.08008008008008, | |
| "grad_norm": 1.0715463161468506, | |
| "grpo_mean_advantage": -9.536743306171047e-08, | |
| "grpo_mean_group_score": 0.5673571825027466, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 5.838213610331877e-07, | |
| "learning_rate": 2.392557564098649e-06, | |
| "loss": -0.1691, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.1001001001001, | |
| "grad_norm": 0.7759184837341309, | |
| "grpo_mean_advantage": 3.278255533700758e-08, | |
| "grpo_mean_group_score": 0.5874732732772827, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 9.317170679423725e-07, | |
| "learning_rate": 2.3100342553434924e-06, | |
| "loss": -0.1655, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.12012012012012, | |
| "grad_norm": 0.9387398958206177, | |
| "grpo_mean_advantage": -1.206994113545079e-07, | |
| "grpo_mean_group_score": 0.5569106340408325, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.201085511747806e-07, | |
| "learning_rate": 2.2277186713410688e-06, | |
| "loss": -0.1821, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.14014014014014, | |
| "grad_norm": 1.6132302284240723, | |
| "grpo_mean_advantage": 4.470348358154297e-08, | |
| "grpo_mean_group_score": 0.5578873157501221, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.115651558502577e-07, | |
| "learning_rate": 2.1457008229739395e-06, | |
| "loss": -0.2102, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.16016016016016, | |
| "grad_norm": 0.8679026961326599, | |
| "grpo_mean_advantage": -3.3453108017056365e-07, | |
| "grpo_mean_group_score": 0.5735999345779419, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 3.5326345368957845e-06, | |
| "learning_rate": 2.0640703955551214e-06, | |
| "loss": -0.2937, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.1801801801801801, | |
| "grad_norm": 1.0550166368484497, | |
| "grpo_mean_advantage": -1.110136480519941e-07, | |
| "grpo_mean_group_score": 0.5626259446144104, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 4.731904823529476e-07, | |
| "learning_rate": 1.9829166507585084e-06, | |
| "loss": -0.2598, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.2002002002002001, | |
| "grad_norm": 1.2819372415542603, | |
| "grpo_mean_advantage": -5.08874677507265e-07, | |
| "grpo_mean_group_score": 0.5463050603866577, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.840126174101897e-06, | |
| "learning_rate": 1.90232832901255e-06, | |
| "loss": -0.2546, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.2202202202202201, | |
| "grad_norm": 1.0188143253326416, | |
| "grpo_mean_advantage": 1.01327898960335e-07, | |
| "grpo_mean_group_score": 0.5352144241333008, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 7.798533943059738e-07, | |
| "learning_rate": 1.82239355246389e-06, | |
| "loss": -0.1809, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.2402402402402402, | |
| "grad_norm": 2.0709052085876465, | |
| "grpo_mean_advantage": 1.341104507446289e-07, | |
| "grpo_mean_group_score": 0.5547868013381958, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 7.821902840987605e-07, | |
| "learning_rate": 1.7431997286170923e-06, | |
| "loss": -0.3559, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.2602602602602602, | |
| "grad_norm": 1.8516215085983276, | |
| "grpo_mean_advantage": 9.015202806494926e-08, | |
| "grpo_mean_group_score": 0.5859472751617432, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.0693488547985908e-06, | |
| "learning_rate": 1.6648334547558227e-06, | |
| "loss": -0.3874, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.2802802802802802, | |
| "grad_norm": 1.283104419708252, | |
| "grpo_mean_advantage": -2.443790378947597e-07, | |
| "grpo_mean_group_score": 0.5751550793647766, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.183122208203713e-06, | |
| "learning_rate": 1.5873804232499862e-06, | |
| "loss": -0.3467, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.3003003003003002, | |
| "grad_norm": 1.4108576774597168, | |
| "grpo_mean_advantage": -6.705522537231445e-08, | |
| "grpo_mean_group_score": 0.5497723817825317, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.109748937888071e-07, | |
| "learning_rate": 1.51092532785238e-06, | |
| "loss": -0.1703, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.3203203203203202, | |
| "grad_norm": 1.0421361923217773, | |
| "grpo_mean_advantage": -1.639127766850379e-08, | |
| "grpo_mean_group_score": 0.55989670753479, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 5.529495297196263e-07, | |
| "learning_rate": 1.4355517710873184e-06, | |
| "loss": -0.2918, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.3403403403403402, | |
| "grad_norm": 1.3465828895568848, | |
| "grpo_mean_advantage": 4.418194237132411e-07, | |
| "grpo_mean_group_score": 0.5809233784675598, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.9275292945385445e-06, | |
| "learning_rate": 1.361342172832502e-06, | |
| "loss": -0.3069, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.3603603603603602, | |
| "grad_norm": 1.1959459781646729, | |
| "grpo_mean_advantage": 9.685754776000977e-08, | |
| "grpo_mean_group_score": 0.5568087100982666, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 3.754235251562932e-07, | |
| "learning_rate": 1.2883776801940884e-06, | |
| "loss": -0.5594, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.3803803803803802, | |
| "grad_norm": 1.8967422246932983, | |
| "grpo_mean_advantage": -2.384185791015625e-07, | |
| "grpo_mean_group_score": 0.5655568838119507, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 6.821086913078034e-07, | |
| "learning_rate": 1.216738078773522e-06, | |
| "loss": -0.4102, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.4004004004004005, | |
| "grad_norm": 2.221132755279541, | |
| "grpo_mean_advantage": -8.717179156292332e-08, | |
| "grpo_mean_group_score": 0.6089578866958618, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.500940354366321e-06, | |
| "learning_rate": 1.146501705423155e-06, | |
| "loss": -0.338, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.4204204204204205, | |
| "grad_norm": 2.3640377521514893, | |
| "grpo_mean_advantage": 2.1606683731079102e-07, | |
| "grpo_mean_group_score": 0.6129671335220337, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.4568390724889468e-06, | |
| "learning_rate": 1.0777453625860474e-06, | |
| "loss": -0.4985, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.4404404404404405, | |
| "grad_norm": 1.9084734916687012, | |
| "grpo_mean_advantage": -3.725290298461914e-09, | |
| "grpo_mean_group_score": 0.5562310814857483, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.965894054796081e-06, | |
| "learning_rate": 1.0105442343136184e-06, | |
| "loss": -0.4347, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.4604604604604605, | |
| "grad_norm": 1.6063904762268066, | |
| "grpo_mean_advantage": 4.313886279305734e-07, | |
| "grpo_mean_group_score": 0.5884170532226562, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.9621948013082147e-06, | |
| "learning_rate": 9.449718040529987e-07, | |
| "loss": -0.6217, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.4804804804804805, | |
| "grad_norm": 2.114664077758789, | |
| "grpo_mean_advantage": 2.0489096641540527e-07, | |
| "grpo_mean_group_score": 0.5795440673828125, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.0235522722723545e-06, | |
| "learning_rate": 8.810997742939531e-07, | |
| "loss": -0.5364, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.5005005005005005, | |
| "grad_norm": 1.8450465202331543, | |
| "grpo_mean_advantage": -1.4185905001795618e-06, | |
| "grpo_mean_group_score": 0.5607603788375854, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.0947338523692451e-05, | |
| "learning_rate": 8.189979881632634e-07, | |
| "loss": -0.4798, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.5205205205205206, | |
| "grad_norm": 2.673438787460327, | |
| "grpo_mean_advantage": -1.758337049295733e-07, | |
| "grpo_mean_group_score": 0.5381432771682739, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 9.663675655247062e-07, | |
| "learning_rate": 7.587343530522945e-07, | |
| "loss": -0.4805, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.5405405405405406, | |
| "grad_norm": 2.2263550758361816, | |
| "grpo_mean_advantage": -6.973743325033865e-07, | |
| "grpo_mean_group_score": 0.5528443455696106, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 4.341973180999048e-06, | |
| "learning_rate": 7.003747663612581e-07, | |
| "loss": -0.433, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.5605605605605606, | |
| "grad_norm": 2.3657093048095703, | |
| "grpo_mean_advantage": 1.7881394143159923e-08, | |
| "grpo_mean_group_score": 0.6091476678848267, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 1.3004198251564958e-07, | |
| "learning_rate": 6.439830434413754e-07, | |
| "loss": -0.6021, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.5805805805805806, | |
| "grad_norm": 1.9847129583358765, | |
| "grpo_mean_advantage": 3.4868716625169327e-07, | |
| "grpo_mean_group_score": 0.5397372245788574, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 2.059372718576924e-06, | |
| "learning_rate": 5.896208478137222e-07, | |
| "loss": -0.5595, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.6006006006006006, | |
| "grad_norm": 2.922114133834839, | |
| "grpo_mean_advantage": -2.1636485598719446e-06, | |
| "grpo_mean_group_score": 0.5873125195503235, | |
| "grpo_mean_kl_div": 0.0, | |
| "grpo_std_advantage": 9.725940799398813e-06, | |
| "learning_rate": 5.373476237410808e-07, | |
| "loss": -0.5592, | |
| "step": 400 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |