diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -4,5789 +4,5831 @@ "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, - "global_step": 825, + "global_step": 831, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0036363636363636364, - "grad_norm": 47.79498291015625, + "epoch": 0.0036101083032490976, + "grad_norm": 53.0424919128418, "learning_rate": 5e-05, - "loss": 20.1422, + "loss": 20.1881, "step": 1 }, { - "epoch": 0.007272727272727273, - "grad_norm": 45.03392028808594, - "learning_rate": 4.993939393939394e-05, - "loss": 19.7063, + "epoch": 0.007220216606498195, + "grad_norm": 40.154483795166016, + "learning_rate": 4.9939831528279186e-05, + "loss": 18.4212, "step": 2 }, { - "epoch": 0.01090909090909091, - "grad_norm": 41.21010971069336, - "learning_rate": 4.987878787878788e-05, - "loss": 18.7882, + "epoch": 0.010830324909747292, + "grad_norm": 41.030975341796875, + "learning_rate": 4.987966305655837e-05, + "loss": 18.1343, "step": 3 }, { - "epoch": 0.014545454545454545, - "grad_norm": 41.21604919433594, - "learning_rate": 4.981818181818182e-05, - "loss": 18.4652, + "epoch": 0.01444043321299639, + "grad_norm": 38.574737548828125, + "learning_rate": 4.981949458483755e-05, + "loss": 17.831, "step": 4 }, { - "epoch": 0.01818181818181818, - "grad_norm": 36.21095657348633, - "learning_rate": 4.9757575757575756e-05, - "loss": 17.261, + "epoch": 0.018050541516245487, + "grad_norm": 34.99953079223633, + "learning_rate": 4.9759326113116724e-05, + "loss": 16.8969, "step": 5 }, { - "epoch": 0.02181818181818182, - "grad_norm": 33.17779541015625, - "learning_rate": 4.9696969696969694e-05, - "loss": 17.3998, + "epoch": 0.021660649819494584, + "grad_norm": 36.076698303222656, + "learning_rate": 4.969915764139591e-05, + "loss": 17.5522, "step": 6 }, { - "epoch": 0.025454545454545455, - "grad_norm": 27.718563079833984, - "learning_rate": 4.963636363636364e-05, - "loss": 16.7747, + "epoch": 0.02527075812274368, + "grad_norm": 40.10169982910156, + "learning_rate": 4.963898916967509e-05, + "loss": 17.2688, "step": 7 }, { - "epoch": 0.02909090909090909, - "grad_norm": 29.30256462097168, - "learning_rate": 4.957575757575758e-05, - "loss": 16.5601, + "epoch": 0.02888086642599278, + "grad_norm": 30.18881607055664, + "learning_rate": 4.9578820697954276e-05, + "loss": 16.2351, "step": 8 }, { - "epoch": 0.03272727272727273, - "grad_norm": 29.383529663085938, - "learning_rate": 4.951515151515152e-05, - "loss": 17.3504, + "epoch": 0.032490974729241874, + "grad_norm": 30.67852020263672, + "learning_rate": 4.951865222623346e-05, + "loss": 17.0894, "step": 9 }, { - "epoch": 0.03636363636363636, - "grad_norm": 23.729032516479492, - "learning_rate": 4.945454545454546e-05, - "loss": 16.3259, + "epoch": 0.036101083032490974, + "grad_norm": 27.552501678466797, + "learning_rate": 4.945848375451264e-05, + "loss": 16.7366, "step": 10 }, { - "epoch": 0.04, - "grad_norm": 25.69147300720215, - "learning_rate": 4.93939393939394e-05, - "loss": 17.0484, + "epoch": 0.039711191335740074, + "grad_norm": 26.315004348754883, + "learning_rate": 4.939831528279182e-05, + "loss": 16.6955, "step": 11 }, { - "epoch": 0.04363636363636364, - "grad_norm": 32.13129425048828, - "learning_rate": 4.933333333333334e-05, - "loss": 16.2835, + "epoch": 0.04332129963898917, + "grad_norm": 27.678667068481445, + "learning_rate": 4.9338146811071e-05, + "loss": 16.1629, "step": 12 }, { - "epoch": 0.04727272727272727, - "grad_norm": 33.38395690917969, - "learning_rate": 4.9272727272727276e-05, - "loss": 16.2865, + "epoch": 0.04693140794223827, + "grad_norm": 39.11885452270508, + "learning_rate": 4.927797833935018e-05, + "loss": 15.8583, "step": 13 }, { - "epoch": 0.05090909090909091, - "grad_norm": 47.96619415283203, - "learning_rate": 4.9212121212121214e-05, - "loss": 16.763, + "epoch": 0.05054151624548736, + "grad_norm": 44.302581787109375, + "learning_rate": 4.9217809867629366e-05, + "loss": 16.4332, "step": 14 }, { - "epoch": 0.05454545454545454, - "grad_norm": 40.4400520324707, - "learning_rate": 4.915151515151515e-05, - "loss": 15.6799, + "epoch": 0.05415162454873646, + "grad_norm": 48.368934631347656, + "learning_rate": 4.915764139590854e-05, + "loss": 15.6681, "step": 15 }, { - "epoch": 0.05818181818181818, - "grad_norm": 32.743865966796875, - "learning_rate": 4.909090909090909e-05, - "loss": 15.7916, + "epoch": 0.05776173285198556, + "grad_norm": 62.672847747802734, + "learning_rate": 4.909747292418773e-05, + "loss": 15.9232, "step": 16 }, { - "epoch": 0.06181818181818182, - "grad_norm": 56.1345329284668, - "learning_rate": 4.903030303030303e-05, - "loss": 15.7303, + "epoch": 0.061371841155234655, + "grad_norm": 56.098548889160156, + "learning_rate": 4.903730445246691e-05, + "loss": 15.6061, "step": 17 }, { - "epoch": 0.06545454545454546, - "grad_norm": 43.6757698059082, - "learning_rate": 4.896969696969697e-05, - "loss": 15.492, + "epoch": 0.06498194945848375, + "grad_norm": 47.98686218261719, + "learning_rate": 4.8977135980746095e-05, + "loss": 15.2327, "step": 18 }, { - "epoch": 0.06909090909090909, - "grad_norm": 31.908369064331055, - "learning_rate": 4.890909090909091e-05, - "loss": 15.177, + "epoch": 0.06859205776173286, + "grad_norm": 49.93760299682617, + "learning_rate": 4.891696750902527e-05, + "loss": 15.6251, "step": 19 }, { - "epoch": 0.07272727272727272, - "grad_norm": 29.7440185546875, - "learning_rate": 4.884848484848485e-05, - "loss": 15.2295, + "epoch": 0.07220216606498195, + "grad_norm": 34.30894470214844, + "learning_rate": 4.8856799037304456e-05, + "loss": 14.8196, "step": 20 }, { - "epoch": 0.07636363636363637, - "grad_norm": 24.505130767822266, - "learning_rate": 4.878787878787879e-05, - "loss": 14.9099, + "epoch": 0.07581227436823104, + "grad_norm": 33.243125915527344, + "learning_rate": 4.879663056558363e-05, + "loss": 14.5626, "step": 21 }, { - "epoch": 0.08, - "grad_norm": 23.924467086791992, - "learning_rate": 4.872727272727273e-05, - "loss": 15.062, + "epoch": 0.07942238267148015, + "grad_norm": 27.303117752075195, + "learning_rate": 4.873646209386282e-05, + "loss": 14.1914, "step": 22 }, { - "epoch": 0.08363636363636363, - "grad_norm": 25.439834594726562, - "learning_rate": 4.866666666666667e-05, - "loss": 14.1283, + "epoch": 0.08303249097472924, + "grad_norm": 30.314437866210938, + "learning_rate": 4.8676293622142e-05, + "loss": 15.2119, "step": 23 }, { - "epoch": 0.08727272727272728, - "grad_norm": 21.35331153869629, - "learning_rate": 4.860606060606061e-05, - "loss": 13.9236, + "epoch": 0.08664259927797834, + "grad_norm": 25.56382942199707, + "learning_rate": 4.8616125150421185e-05, + "loss": 14.5087, "step": 24 }, { - "epoch": 0.09090909090909091, - "grad_norm": 26.658445358276367, - "learning_rate": 4.854545454545455e-05, - "loss": 14.4044, + "epoch": 0.09025270758122744, + "grad_norm": 28.333757400512695, + "learning_rate": 4.855595667870036e-05, + "loss": 14.3963, "step": 25 }, { - "epoch": 0.09454545454545454, - "grad_norm": 25.977502822875977, - "learning_rate": 4.848484848484849e-05, - "loss": 13.7121, + "epoch": 0.09386281588447654, + "grad_norm": 22.039608001708984, + "learning_rate": 4.8495788206979546e-05, + "loss": 14.4985, "step": 26 }, { - "epoch": 0.09818181818181818, - "grad_norm": 32.09416580200195, - "learning_rate": 4.8424242424242426e-05, - "loss": 13.8631, + "epoch": 0.09747292418772563, + "grad_norm": 27.54795265197754, + "learning_rate": 4.843561973525872e-05, + "loss": 13.7732, "step": 27 }, { - "epoch": 0.10181818181818182, - "grad_norm": 23.397937774658203, - "learning_rate": 4.8363636363636364e-05, - "loss": 13.3994, + "epoch": 0.10108303249097472, + "grad_norm": 21.023685455322266, + "learning_rate": 4.837545126353791e-05, + "loss": 13.5031, "step": 28 }, { - "epoch": 0.10545454545454545, - "grad_norm": 28.557424545288086, - "learning_rate": 4.83030303030303e-05, - "loss": 13.83, + "epoch": 0.10469314079422383, + "grad_norm": 21.715377807617188, + "learning_rate": 4.831528279181709e-05, + "loss": 13.9961, "step": 29 }, { - "epoch": 0.10909090909090909, - "grad_norm": 17.376995086669922, - "learning_rate": 4.824242424242425e-05, - "loss": 13.0998, + "epoch": 0.10830324909747292, + "grad_norm": 18.735675811767578, + "learning_rate": 4.8255114320096275e-05, + "loss": 13.2735, "step": 30 }, { - "epoch": 0.11272727272727273, - "grad_norm": 26.38998031616211, - "learning_rate": 4.8181818181818186e-05, - "loss": 13.134, + "epoch": 0.11191335740072202, + "grad_norm": 19.733570098876953, + "learning_rate": 4.819494584837546e-05, + "loss": 13.2835, "step": 31 }, { - "epoch": 0.11636363636363636, - "grad_norm": 17.54689598083496, - "learning_rate": 4.8121212121212125e-05, - "loss": 12.8039, + "epoch": 0.11552346570397112, + "grad_norm": 18.386388778686523, + "learning_rate": 4.8134777376654636e-05, + "loss": 13.3851, "step": 32 }, { - "epoch": 0.12, - "grad_norm": 18.970720291137695, - "learning_rate": 4.806060606060606e-05, - "loss": 13.0586, + "epoch": 0.11913357400722022, + "grad_norm": 18.647968292236328, + "learning_rate": 4.807460890493381e-05, + "loss": 12.7988, "step": 33 }, { - "epoch": 0.12363636363636364, - "grad_norm": 19.773618698120117, - "learning_rate": 4.8e-05, - "loss": 13.0412, + "epoch": 0.12274368231046931, + "grad_norm": 17.047182083129883, + "learning_rate": 4.8014440433213e-05, + "loss": 12.8917, "step": 34 }, { - "epoch": 0.12727272727272726, - "grad_norm": 22.805259704589844, - "learning_rate": 4.793939393939394e-05, - "loss": 12.8489, + "epoch": 0.1263537906137184, + "grad_norm": 17.250154495239258, + "learning_rate": 4.795427196149218e-05, + "loss": 12.9744, "step": 35 }, { - "epoch": 0.13090909090909092, - "grad_norm": 23.569717407226562, - "learning_rate": 4.787878787878788e-05, - "loss": 12.616, + "epoch": 0.1299638989169675, + "grad_norm": 19.833730697631836, + "learning_rate": 4.7894103489771365e-05, + "loss": 13.001, "step": 36 }, { - "epoch": 0.13454545454545455, - "grad_norm": 22.83664321899414, - "learning_rate": 4.781818181818182e-05, - "loss": 11.8258, + "epoch": 0.13357400722021662, + "grad_norm": 16.90082550048828, + "learning_rate": 4.783393501805055e-05, + "loss": 12.4627, "step": 37 }, { - "epoch": 0.13818181818181818, - "grad_norm": 26.738494873046875, - "learning_rate": 4.775757575757576e-05, - "loss": 12.1916, + "epoch": 0.1371841155234657, + "grad_norm": 15.739155769348145, + "learning_rate": 4.7773766546329726e-05, + "loss": 12.4672, "step": 38 }, { - "epoch": 0.14181818181818182, - "grad_norm": 18.36771583557129, - "learning_rate": 4.76969696969697e-05, - "loss": 12.2002, + "epoch": 0.1407942238267148, + "grad_norm": 16.412572860717773, + "learning_rate": 4.771359807460891e-05, + "loss": 11.8982, "step": 39 }, { - "epoch": 0.14545454545454545, - "grad_norm": 15.442994117736816, - "learning_rate": 4.763636363636364e-05, - "loss": 11.7265, + "epoch": 0.1444043321299639, + "grad_norm": 15.304023742675781, + "learning_rate": 4.765342960288809e-05, + "loss": 12.2821, "step": 40 }, { - "epoch": 0.14909090909090908, - "grad_norm": 17.741811752319336, - "learning_rate": 4.7575757575757576e-05, - "loss": 12.4528, + "epoch": 0.148014440433213, + "grad_norm": 15.000272750854492, + "learning_rate": 4.759326113116727e-05, + "loss": 11.3867, "step": 41 }, { - "epoch": 0.15272727272727274, - "grad_norm": 14.73928451538086, - "learning_rate": 4.751515151515152e-05, - "loss": 11.5017, + "epoch": 0.15162454873646208, + "grad_norm": 14.771124839782715, + "learning_rate": 4.7533092659446455e-05, + "loss": 11.7266, "step": 42 }, { - "epoch": 0.15636363636363637, - "grad_norm": 17.848480224609375, - "learning_rate": 4.745454545454546e-05, - "loss": 11.8774, + "epoch": 0.1552346570397112, + "grad_norm": 15.111937522888184, + "learning_rate": 4.747292418772563e-05, + "loss": 11.378, "step": 43 }, { - "epoch": 0.16, - "grad_norm": 17.191818237304688, - "learning_rate": 4.73939393939394e-05, - "loss": 11.0546, + "epoch": 0.1588447653429603, + "grad_norm": 14.241510391235352, + "learning_rate": 4.7412755716004816e-05, + "loss": 11.4672, "step": 44 }, { - "epoch": 0.16363636363636364, - "grad_norm": 15.763983726501465, - "learning_rate": 4.7333333333333336e-05, - "loss": 11.2831, + "epoch": 0.1624548736462094, + "grad_norm": 14.913339614868164, + "learning_rate": 4.7352587244284e-05, + "loss": 11.2229, "step": 45 }, { - "epoch": 0.16727272727272727, - "grad_norm": 16.031883239746094, - "learning_rate": 4.7272727272727275e-05, - "loss": 11.0432, + "epoch": 0.16606498194945848, + "grad_norm": 14.72385025024414, + "learning_rate": 4.7292418772563177e-05, + "loss": 10.8122, "step": 46 }, { - "epoch": 0.1709090909090909, - "grad_norm": 14.979501724243164, - "learning_rate": 4.721212121212121e-05, - "loss": 11.0545, + "epoch": 0.16967509025270758, + "grad_norm": 13.52721881866455, + "learning_rate": 4.723225030084236e-05, + "loss": 10.9708, "step": 47 }, { - "epoch": 0.17454545454545456, - "grad_norm": 15.489818572998047, - "learning_rate": 4.715151515151515e-05, - "loss": 11.3819, + "epoch": 0.17328519855595667, + "grad_norm": 13.60007095336914, + "learning_rate": 4.7172081829121544e-05, + "loss": 10.807, "step": 48 }, { - "epoch": 0.1781818181818182, - "grad_norm": 14.921260833740234, - "learning_rate": 4.709090909090909e-05, - "loss": 11.057, + "epoch": 0.17689530685920576, + "grad_norm": 13.957290649414062, + "learning_rate": 4.711191335740072e-05, + "loss": 10.7637, "step": 49 }, { - "epoch": 0.18181818181818182, - "grad_norm": 14.218314170837402, - "learning_rate": 4.703030303030303e-05, - "loss": 10.7186, + "epoch": 0.18050541516245489, + "grad_norm": 14.956080436706543, + "learning_rate": 4.7051744885679905e-05, + "loss": 10.9521, "step": 50 }, { - "epoch": 0.18545454545454546, - "grad_norm": 13.922761917114258, - "learning_rate": 4.696969696969697e-05, - "loss": 10.7164, + "epoch": 0.18411552346570398, + "grad_norm": 13.179266929626465, + "learning_rate": 4.699157641395909e-05, + "loss": 10.5118, "step": 51 }, { - "epoch": 0.1890909090909091, - "grad_norm": 14.378653526306152, - "learning_rate": 4.690909090909091e-05, - "loss": 10.6261, + "epoch": 0.18772563176895307, + "grad_norm": 16.000741958618164, + "learning_rate": 4.693140794223827e-05, + "loss": 10.7588, "step": 52 }, { - "epoch": 0.19272727272727272, - "grad_norm": 13.454336166381836, - "learning_rate": 4.684848484848485e-05, - "loss": 10.3784, + "epoch": 0.19133574007220217, + "grad_norm": 21.888368606567383, + "learning_rate": 4.687123947051745e-05, + "loss": 10.6924, "step": 53 }, { - "epoch": 0.19636363636363635, - "grad_norm": 14.680737495422363, - "learning_rate": 4.6787878787878795e-05, - "loss": 10.3173, + "epoch": 0.19494584837545126, + "grad_norm": 14.187573432922363, + "learning_rate": 4.681107099879663e-05, + "loss": 10.3625, "step": 54 }, { - "epoch": 0.2, - "grad_norm": 15.582632064819336, - "learning_rate": 4.672727272727273e-05, - "loss": 10.1981, + "epoch": 0.19855595667870035, + "grad_norm": 13.989165306091309, + "learning_rate": 4.675090252707581e-05, + "loss": 9.9838, "step": 55 }, { - "epoch": 0.20363636363636364, - "grad_norm": 13.507526397705078, - "learning_rate": 4.666666666666667e-05, - "loss": 10.1057, + "epoch": 0.20216606498194944, + "grad_norm": 17.044803619384766, + "learning_rate": 4.6690734055354995e-05, + "loss": 10.3739, "step": 56 }, { - "epoch": 0.20727272727272728, - "grad_norm": 12.818306922912598, - "learning_rate": 4.660606060606061e-05, - "loss": 9.9707, + "epoch": 0.20577617328519857, + "grad_norm": 14.635570526123047, + "learning_rate": 4.663056558363418e-05, + "loss": 9.9174, "step": 57 }, { - "epoch": 0.2109090909090909, - "grad_norm": 21.36762809753418, - "learning_rate": 4.654545454545455e-05, - "loss": 10.374, + "epoch": 0.20938628158844766, + "grad_norm": 13.500565528869629, + "learning_rate": 4.657039711191336e-05, + "loss": 9.8747, "step": 58 }, { - "epoch": 0.21454545454545454, - "grad_norm": 12.67024040222168, - "learning_rate": 4.6484848484848487e-05, - "loss": 9.8168, + "epoch": 0.21299638989169675, + "grad_norm": 13.646149635314941, + "learning_rate": 4.651022864019254e-05, + "loss": 9.7799, "step": 59 }, { - "epoch": 0.21818181818181817, - "grad_norm": 17.809871673583984, - "learning_rate": 4.6424242424242425e-05, - "loss": 9.5406, + "epoch": 0.21660649819494585, + "grad_norm": 13.92541217803955, + "learning_rate": 4.645006016847172e-05, + "loss": 9.5702, "step": 60 }, { - "epoch": 0.22181818181818183, - "grad_norm": 16.511991500854492, - "learning_rate": 4.636363636363636e-05, - "loss": 10.0524, + "epoch": 0.22021660649819494, + "grad_norm": 12.247454643249512, + "learning_rate": 4.63898916967509e-05, + "loss": 9.5437, "step": 61 }, { - "epoch": 0.22545454545454546, - "grad_norm": 14.556917190551758, - "learning_rate": 4.63030303030303e-05, - "loss": 9.8776, + "epoch": 0.22382671480144403, + "grad_norm": 13.187448501586914, + "learning_rate": 4.6329723225030085e-05, + "loss": 9.3673, "step": 62 }, { - "epoch": 0.2290909090909091, - "grad_norm": 21.906808853149414, - "learning_rate": 4.624242424242424e-05, - "loss": 9.2781, + "epoch": 0.22743682310469315, + "grad_norm": 16.612987518310547, + "learning_rate": 4.626955475330927e-05, + "loss": 9.4453, "step": 63 }, { - "epoch": 0.23272727272727273, - "grad_norm": 13.430219650268555, - "learning_rate": 4.618181818181818e-05, - "loss": 9.3972, + "epoch": 0.23104693140794225, + "grad_norm": 12.24003791809082, + "learning_rate": 4.620938628158845e-05, + "loss": 9.2878, "step": 64 }, { - "epoch": 0.23636363636363636, - "grad_norm": 16.371122360229492, - "learning_rate": 4.612121212121212e-05, - "loss": 9.2046, + "epoch": 0.23465703971119134, + "grad_norm": 12.949747085571289, + "learning_rate": 4.614921780986764e-05, + "loss": 9.2656, "step": 65 }, { - "epoch": 0.24, - "grad_norm": 17.033885955810547, - "learning_rate": 4.606060606060607e-05, - "loss": 9.4568, + "epoch": 0.23826714801444043, + "grad_norm": 15.98535442352295, + "learning_rate": 4.6089049338146814e-05, + "loss": 9.1888, "step": 66 }, { - "epoch": 0.24363636363636362, - "grad_norm": 19.499509811401367, - "learning_rate": 4.600000000000001e-05, - "loss": 9.3437, + "epoch": 0.24187725631768953, + "grad_norm": 12.989156723022461, + "learning_rate": 4.602888086642599e-05, + "loss": 8.9948, "step": 67 }, { - "epoch": 0.24727272727272728, - "grad_norm": 13.871404647827148, - "learning_rate": 4.5939393939393945e-05, - "loss": 9.2115, + "epoch": 0.24548736462093862, + "grad_norm": 19.851245880126953, + "learning_rate": 4.5968712394705175e-05, + "loss": 9.4233, "step": 68 }, { - "epoch": 0.2509090909090909, - "grad_norm": 20.010080337524414, - "learning_rate": 4.5878787878787883e-05, - "loss": 9.2831, + "epoch": 0.2490974729241877, + "grad_norm": 12.913300514221191, + "learning_rate": 4.590854392298436e-05, + "loss": 9.0834, "step": 69 }, { - "epoch": 0.2545454545454545, - "grad_norm": 17.872783660888672, - "learning_rate": 4.581818181818182e-05, - "loss": 9.3362, + "epoch": 0.2527075812274368, + "grad_norm": 14.333419799804688, + "learning_rate": 4.584837545126354e-05, + "loss": 9.1562, "step": 70 }, { - "epoch": 0.2581818181818182, - "grad_norm": 18.30128288269043, - "learning_rate": 4.575757575757576e-05, - "loss": 8.7413, + "epoch": 0.2563176895306859, + "grad_norm": 13.390801429748535, + "learning_rate": 4.578820697954272e-05, + "loss": 9.0329, "step": 71 }, { - "epoch": 0.26181818181818184, - "grad_norm": 21.37891387939453, - "learning_rate": 4.56969696969697e-05, - "loss": 8.6968, + "epoch": 0.259927797833935, + "grad_norm": 26.999013900756836, + "learning_rate": 4.5728038507821904e-05, + "loss": 8.702, "step": 72 }, { - "epoch": 0.26545454545454544, - "grad_norm": 20.191150665283203, - "learning_rate": 4.563636363636364e-05, - "loss": 8.7501, + "epoch": 0.26353790613718414, + "grad_norm": 15.850932121276855, + "learning_rate": 4.566787003610109e-05, + "loss": 8.9099, "step": 73 }, { - "epoch": 0.2690909090909091, - "grad_norm": 17.527233123779297, - "learning_rate": 4.5575757575757575e-05, - "loss": 8.589, + "epoch": 0.26714801444043323, + "grad_norm": 22.490720748901367, + "learning_rate": 4.5607701564380265e-05, + "loss": 8.6662, "step": 74 }, { - "epoch": 0.2727272727272727, - "grad_norm": 12.172165870666504, - "learning_rate": 4.5515151515151513e-05, - "loss": 8.5487, + "epoch": 0.27075812274368233, + "grad_norm": 21.35521125793457, + "learning_rate": 4.554753309265945e-05, + "loss": 8.3598, "step": 75 }, { - "epoch": 0.27636363636363637, - "grad_norm": 12.90199089050293, - "learning_rate": 4.545454545454546e-05, - "loss": 8.4269, + "epoch": 0.2743682310469314, + "grad_norm": 14.975168228149414, + "learning_rate": 4.548736462093863e-05, + "loss": 8.2639, "step": 76 }, { - "epoch": 0.28, - "grad_norm": 14.662657737731934, - "learning_rate": 4.53939393939394e-05, - "loss": 8.47, + "epoch": 0.2779783393501805, + "grad_norm": 14.411498069763184, + "learning_rate": 4.542719614921781e-05, + "loss": 8.2173, "step": 77 }, { - "epoch": 0.28363636363636363, - "grad_norm": 12.177051544189453, - "learning_rate": 4.5333333333333335e-05, - "loss": 8.1216, + "epoch": 0.2815884476534296, + "grad_norm": 15.177107810974121, + "learning_rate": 4.5367027677496994e-05, + "loss": 8.2124, "step": 78 }, { - "epoch": 0.2872727272727273, - "grad_norm": 15.531538009643555, - "learning_rate": 4.5272727272727274e-05, - "loss": 8.2657, + "epoch": 0.2851985559566787, + "grad_norm": 21.427085876464844, + "learning_rate": 4.530685920577618e-05, + "loss": 8.1488, "step": 79 }, { - "epoch": 0.2909090909090909, - "grad_norm": 42.798797607421875, - "learning_rate": 4.521212121212122e-05, - "loss": 8.5536, + "epoch": 0.2888086642599278, + "grad_norm": 18.792469024658203, + "learning_rate": 4.5246690734055355e-05, + "loss": 8.0504, "step": 80 }, { - "epoch": 0.29454545454545455, - "grad_norm": 14.968511581420898, - "learning_rate": 4.515151515151516e-05, - "loss": 8.0826, + "epoch": 0.2924187725631769, + "grad_norm": 13.447463989257812, + "learning_rate": 4.518652226233454e-05, + "loss": 7.7557, "step": 81 }, { - "epoch": 0.29818181818181816, - "grad_norm": 13.593378067016602, - "learning_rate": 4.5090909090909095e-05, - "loss": 8.0562, + "epoch": 0.296028880866426, + "grad_norm": 11.585929870605469, + "learning_rate": 4.5126353790613716e-05, + "loss": 7.7533, "step": 82 }, { - "epoch": 0.3018181818181818, - "grad_norm": 20.985275268554688, - "learning_rate": 4.5030303030303034e-05, - "loss": 7.7742, + "epoch": 0.2996389891696751, + "grad_norm": 16.441787719726562, + "learning_rate": 4.50661853188929e-05, + "loss": 7.6073, "step": 83 }, { - "epoch": 0.3054545454545455, - "grad_norm": 22.25298309326172, - "learning_rate": 4.496969696969697e-05, - "loss": 7.8028, + "epoch": 0.30324909747292417, + "grad_norm": 12.96184253692627, + "learning_rate": 4.5006016847172084e-05, + "loss": 7.7666, "step": 84 }, { - "epoch": 0.3090909090909091, - "grad_norm": 14.785983085632324, - "learning_rate": 4.490909090909091e-05, - "loss": 8.0702, + "epoch": 0.30685920577617326, + "grad_norm": 32.2669677734375, + "learning_rate": 4.494584837545127e-05, + "loss": 8.1308, "step": 85 }, { - "epoch": 0.31272727272727274, - "grad_norm": 19.781902313232422, - "learning_rate": 4.484848484848485e-05, - "loss": 7.6381, + "epoch": 0.3104693140794224, + "grad_norm": 18.520849227905273, + "learning_rate": 4.488567990373045e-05, + "loss": 7.8578, "step": 86 }, { - "epoch": 0.31636363636363635, - "grad_norm": 14.84749698638916, - "learning_rate": 4.478787878787879e-05, - "loss": 7.635, + "epoch": 0.3140794223826715, + "grad_norm": 18.274093627929688, + "learning_rate": 4.482551143200963e-05, + "loss": 7.8863, "step": 87 }, { - "epoch": 0.32, - "grad_norm": 15.319684982299805, - "learning_rate": 4.472727272727273e-05, - "loss": 7.3204, + "epoch": 0.3176895306859206, + "grad_norm": 20.193405151367188, + "learning_rate": 4.4765342960288806e-05, + "loss": 7.6916, "step": 88 }, { - "epoch": 0.3236363636363636, - "grad_norm": 17.83990478515625, - "learning_rate": 4.466666666666667e-05, - "loss": 7.4941, + "epoch": 0.3212996389891697, + "grad_norm": 27.587148666381836, + "learning_rate": 4.470517448856799e-05, + "loss": 7.3948, "step": 89 }, { - "epoch": 0.32727272727272727, - "grad_norm": 13.8068265914917, - "learning_rate": 4.460606060606061e-05, - "loss": 7.3864, + "epoch": 0.3249097472924188, + "grad_norm": 19.598400115966797, + "learning_rate": 4.4645006016847174e-05, + "loss": 7.6276, "step": 90 }, { - "epoch": 0.33090909090909093, - "grad_norm": 42.767478942871094, - "learning_rate": 4.454545454545455e-05, - "loss": 7.6903, + "epoch": 0.3285198555956679, + "grad_norm": 34.88701629638672, + "learning_rate": 4.458483754512636e-05, + "loss": 7.29, "step": 91 }, { - "epoch": 0.33454545454545453, - "grad_norm": 12.295734405517578, - "learning_rate": 4.4484848484848485e-05, - "loss": 7.14, + "epoch": 0.33212996389891697, + "grad_norm": 27.295541763305664, + "learning_rate": 4.452466907340554e-05, + "loss": 7.2514, "step": 92 }, { - "epoch": 0.3381818181818182, - "grad_norm": 13.037458419799805, - "learning_rate": 4.4424242424242424e-05, - "loss": 7.2247, + "epoch": 0.33574007220216606, + "grad_norm": 23.34226417541504, + "learning_rate": 4.4464500601684725e-05, + "loss": 7.0975, "step": 93 }, { - "epoch": 0.3418181818181818, - "grad_norm": 14.935813903808594, - "learning_rate": 4.436363636363637e-05, - "loss": 7.1593, + "epoch": 0.33935018050541516, + "grad_norm": 15.4843111038208, + "learning_rate": 4.44043321299639e-05, + "loss": 7.039, "step": 94 }, { - "epoch": 0.34545454545454546, - "grad_norm": 23.534366607666016, - "learning_rate": 4.430303030303031e-05, - "loss": 7.4126, + "epoch": 0.34296028880866425, + "grad_norm": 33.904781341552734, + "learning_rate": 4.434416365824308e-05, + "loss": 7.1548, "step": 95 }, { - "epoch": 0.3490909090909091, - "grad_norm": 15.437704086303711, - "learning_rate": 4.4242424242424246e-05, - "loss": 7.2912, + "epoch": 0.34657039711191334, + "grad_norm": 18.602880477905273, + "learning_rate": 4.4283995186522264e-05, + "loss": 6.866, "step": 96 }, { - "epoch": 0.3527272727272727, - "grad_norm": 19.122468948364258, - "learning_rate": 4.4181818181818184e-05, - "loss": 7.2778, + "epoch": 0.35018050541516244, + "grad_norm": 68.61976623535156, + "learning_rate": 4.422382671480145e-05, + "loss": 7.4221, "step": 97 }, { - "epoch": 0.3563636363636364, - "grad_norm": 29.849777221679688, - "learning_rate": 4.412121212121212e-05, - "loss": 6.9932, + "epoch": 0.35379061371841153, + "grad_norm": 29.8818416595459, + "learning_rate": 4.416365824308063e-05, + "loss": 6.8978, "step": 98 }, { - "epoch": 0.36, - "grad_norm": 31.232065200805664, - "learning_rate": 4.406060606060606e-05, - "loss": 6.9073, + "epoch": 0.3574007220216607, + "grad_norm": 19.478824615478516, + "learning_rate": 4.410348977135981e-05, + "loss": 6.7587, "step": 99 }, { - "epoch": 0.36363636363636365, - "grad_norm": 20.299890518188477, - "learning_rate": 4.4000000000000006e-05, - "loss": 6.7348, + "epoch": 0.36101083032490977, + "grad_norm": 18.693496704101562, + "learning_rate": 4.404332129963899e-05, + "loss": 6.8979, "step": 100 }, { - "epoch": 0.36727272727272725, - "grad_norm": 12.967691421508789, - "learning_rate": 4.3939393939393944e-05, - "loss": 6.7341, + "epoch": 0.36462093862815886, + "grad_norm": 20.177623748779297, + "learning_rate": 4.398315282791817e-05, + "loss": 6.6204, "step": 101 }, { - "epoch": 0.3709090909090909, - "grad_norm": 33.69974899291992, - "learning_rate": 4.387878787878788e-05, - "loss": 7.1418, + "epoch": 0.36823104693140796, + "grad_norm": 21.361373901367188, + "learning_rate": 4.3922984356197353e-05, + "loss": 6.685, "step": 102 }, { - "epoch": 0.37454545454545457, - "grad_norm": 16.100889205932617, - "learning_rate": 4.381818181818182e-05, - "loss": 6.6484, + "epoch": 0.37184115523465705, + "grad_norm": 22.163894653320312, + "learning_rate": 4.386281588447654e-05, + "loss": 6.6413, "step": 103 }, { - "epoch": 0.3781818181818182, - "grad_norm": 16.84703254699707, - "learning_rate": 4.375757575757576e-05, - "loss": 6.7367, + "epoch": 0.37545126353790614, + "grad_norm": 25.493696212768555, + "learning_rate": 4.380264741275572e-05, + "loss": 6.5075, "step": 104 }, { - "epoch": 0.38181818181818183, - "grad_norm": 12.23398494720459, - "learning_rate": 4.36969696969697e-05, - "loss": 6.5239, + "epoch": 0.37906137184115524, + "grad_norm": 21.208959579467773, + "learning_rate": 4.37424789410349e-05, + "loss": 6.5093, "step": 105 }, { - "epoch": 0.38545454545454544, - "grad_norm": 16.255165100097656, - "learning_rate": 4.3636363636363636e-05, - "loss": 6.409, + "epoch": 0.38267148014440433, + "grad_norm": 16.638938903808594, + "learning_rate": 4.368231046931408e-05, + "loss": 6.853, "step": 106 }, { - "epoch": 0.3890909090909091, - "grad_norm": 14.847028732299805, - "learning_rate": 4.3575757575757574e-05, - "loss": 6.4569, + "epoch": 0.3862815884476534, + "grad_norm": 14.78966999053955, + "learning_rate": 4.3622141997593266e-05, + "loss": 6.601, "step": 107 }, { - "epoch": 0.3927272727272727, - "grad_norm": 13.885218620300293, - "learning_rate": 4.351515151515152e-05, - "loss": 6.3969, + "epoch": 0.3898916967509025, + "grad_norm": 11.214503288269043, + "learning_rate": 4.356197352587244e-05, + "loss": 6.47, "step": 108 }, { - "epoch": 0.39636363636363636, - "grad_norm": 9.663321495056152, - "learning_rate": 4.345454545454546e-05, - "loss": 6.2821, + "epoch": 0.3935018050541516, + "grad_norm": 11.047112464904785, + "learning_rate": 4.350180505415163e-05, + "loss": 6.3487, "step": 109 }, { - "epoch": 0.4, - "grad_norm": 11.254612922668457, - "learning_rate": 4.3393939393939396e-05, - "loss": 6.2634, + "epoch": 0.3971119133574007, + "grad_norm": 10.862858772277832, + "learning_rate": 4.3441636582430804e-05, + "loss": 6.4521, "step": 110 }, { - "epoch": 0.4036363636363636, - "grad_norm": 30.378196716308594, - "learning_rate": 4.3333333333333334e-05, - "loss": 6.6234, + "epoch": 0.4007220216606498, + "grad_norm": 10.075233459472656, + "learning_rate": 4.338146811070999e-05, + "loss": 6.2256, "step": 111 }, { - "epoch": 0.4072727272727273, - "grad_norm": 16.96799087524414, - "learning_rate": 4.327272727272728e-05, - "loss": 6.3086, + "epoch": 0.4043321299638989, + "grad_norm": 8.835427284240723, + "learning_rate": 4.332129963898917e-05, + "loss": 6.252, "step": 112 }, { - "epoch": 0.4109090909090909, - "grad_norm": 8.183362007141113, - "learning_rate": 4.321212121212122e-05, - "loss": 6.1431, + "epoch": 0.40794223826714804, + "grad_norm": 8.498411178588867, + "learning_rate": 4.3261131167268356e-05, + "loss": 6.2111, "step": 113 }, { - "epoch": 0.41454545454545455, - "grad_norm": 9.962987899780273, - "learning_rate": 4.3151515151515156e-05, - "loss": 6.1345, + "epoch": 0.41155234657039713, + "grad_norm": 12.734055519104004, + "learning_rate": 4.320096269554753e-05, + "loss": 6.3314, "step": 114 }, { - "epoch": 0.41818181818181815, - "grad_norm": 15.649056434631348, - "learning_rate": 4.3090909090909094e-05, - "loss": 6.0388, + "epoch": 0.4151624548736462, + "grad_norm": 10.362188339233398, + "learning_rate": 4.314079422382672e-05, + "loss": 6.0026, "step": 115 }, { - "epoch": 0.4218181818181818, - "grad_norm": 11.750924110412598, - "learning_rate": 4.303030303030303e-05, - "loss": 6.1091, + "epoch": 0.4187725631768953, + "grad_norm": 8.478679656982422, + "learning_rate": 4.3080625752105894e-05, + "loss": 6.1003, "step": 116 }, { - "epoch": 0.4254545454545455, - "grad_norm": 11.011019706726074, - "learning_rate": 4.296969696969697e-05, - "loss": 5.9637, + "epoch": 0.4223826714801444, + "grad_norm": 9.426121711730957, + "learning_rate": 4.302045728038508e-05, + "loss": 5.9875, "step": 117 }, { - "epoch": 0.4290909090909091, - "grad_norm": 7.509008884429932, - "learning_rate": 4.290909090909091e-05, - "loss": 5.819, + "epoch": 0.4259927797833935, + "grad_norm": 7.937860012054443, + "learning_rate": 4.296028880866426e-05, + "loss": 5.9543, "step": 118 }, { - "epoch": 0.43272727272727274, - "grad_norm": 8.163488388061523, - "learning_rate": 4.284848484848485e-05, - "loss": 5.8944, + "epoch": 0.4296028880866426, + "grad_norm": 9.517463684082031, + "learning_rate": 4.2900120336943446e-05, + "loss": 5.9709, "step": 119 }, { - "epoch": 0.43636363636363634, - "grad_norm": 12.121551513671875, - "learning_rate": 4.2787878787878786e-05, - "loss": 5.8427, + "epoch": 0.4332129963898917, + "grad_norm": 5.840395450592041, + "learning_rate": 4.283995186522263e-05, + "loss": 5.8178, "step": 120 }, { - "epoch": 0.44, - "grad_norm": 10.191195487976074, - "learning_rate": 4.2727272727272724e-05, - "loss": 5.8268, + "epoch": 0.4368231046931408, + "grad_norm": 5.626274108886719, + "learning_rate": 4.277978339350181e-05, + "loss": 5.9067, "step": 121 }, { - "epoch": 0.44363636363636366, - "grad_norm": 6.44727087020874, - "learning_rate": 4.266666666666667e-05, - "loss": 5.8514, + "epoch": 0.4404332129963899, + "grad_norm": 4.778440952301025, + "learning_rate": 4.2719614921780984e-05, + "loss": 5.8001, "step": 122 }, { - "epoch": 0.44727272727272727, - "grad_norm": 5.603975296020508, - "learning_rate": 4.260606060606061e-05, - "loss": 5.7627, + "epoch": 0.44404332129963897, + "grad_norm": 8.221015930175781, + "learning_rate": 4.265944645006017e-05, + "loss": 6.0161, "step": 123 }, { - "epoch": 0.4509090909090909, - "grad_norm": 8.263181686401367, - "learning_rate": 4.254545454545455e-05, - "loss": 5.9339, + "epoch": 0.44765342960288806, + "grad_norm": 8.236863136291504, + "learning_rate": 4.259927797833935e-05, + "loss": 5.7349, "step": 124 }, { - "epoch": 0.45454545454545453, - "grad_norm": 9.458388328552246, - "learning_rate": 4.248484848484849e-05, - "loss": 5.623, + "epoch": 0.45126353790613716, + "grad_norm": 5.051707744598389, + "learning_rate": 4.2539109506618536e-05, + "loss": 5.8968, "step": 125 }, { - "epoch": 0.4581818181818182, - "grad_norm": 6.523182392120361, - "learning_rate": 4.242424242424243e-05, - "loss": 5.7246, + "epoch": 0.4548736462093863, + "grad_norm": 6.383734703063965, + "learning_rate": 4.247894103489772e-05, + "loss": 6.0475, "step": 126 }, { - "epoch": 0.4618181818181818, - "grad_norm": 4.975604057312012, - "learning_rate": 4.236363636363637e-05, - "loss": 5.7018, + "epoch": 0.4584837545126354, + "grad_norm": 8.41263484954834, + "learning_rate": 4.24187725631769e-05, + "loss": 5.7269, "step": 127 }, { - "epoch": 0.46545454545454545, - "grad_norm": 6.818673133850098, - "learning_rate": 4.2303030303030306e-05, - "loss": 5.9547, + "epoch": 0.4620938628158845, + "grad_norm": 7.515089511871338, + "learning_rate": 4.235860409145608e-05, + "loss": 5.6593, "step": 128 }, { - "epoch": 0.4690909090909091, - "grad_norm": 6.9742536544799805, - "learning_rate": 4.2242424242424244e-05, - "loss": 5.9003, + "epoch": 0.4657039711191336, + "grad_norm": 4.513574123382568, + "learning_rate": 4.229843561973526e-05, + "loss": 5.7707, "step": 129 }, { - "epoch": 0.4727272727272727, - "grad_norm": 3.8385305404663086, - "learning_rate": 4.218181818181818e-05, - "loss": 5.6588, + "epoch": 0.4693140794223827, + "grad_norm": 3.846007823944092, + "learning_rate": 4.223826714801444e-05, + "loss": 5.6836, "step": 130 }, { - "epoch": 0.4763636363636364, - "grad_norm": 6.935608863830566, - "learning_rate": 4.212121212121212e-05, - "loss": 5.7309, + "epoch": 0.4729241877256318, + "grad_norm": 4.019323825836182, + "learning_rate": 4.2178098676293626e-05, + "loss": 5.6789, "step": 131 }, { - "epoch": 0.48, - "grad_norm": 4.395711898803711, - "learning_rate": 4.206060606060606e-05, - "loss": 5.7787, + "epoch": 0.47653429602888087, + "grad_norm": 4.250916957855225, + "learning_rate": 4.211793020457281e-05, + "loss": 5.7052, "step": 132 }, { - "epoch": 0.48363636363636364, - "grad_norm": 6.357467174530029, - "learning_rate": 4.2e-05, - "loss": 5.5588, + "epoch": 0.48014440433212996, + "grad_norm": 3.1647579669952393, + "learning_rate": 4.205776173285199e-05, + "loss": 5.5919, "step": 133 }, { - "epoch": 0.48727272727272725, - "grad_norm": 8.066727638244629, - "learning_rate": 4.193939393939394e-05, - "loss": 5.5408, + "epoch": 0.48375451263537905, + "grad_norm": 3.269533157348633, + "learning_rate": 4.199759326113117e-05, + "loss": 5.7144, "step": 134 }, { - "epoch": 0.4909090909090909, - "grad_norm": 7.221047401428223, - "learning_rate": 4.187878787878788e-05, - "loss": 5.5937, + "epoch": 0.48736462093862815, + "grad_norm": 3.1203269958496094, + "learning_rate": 4.193742478941035e-05, + "loss": 5.6578, "step": 135 }, { - "epoch": 0.49454545454545457, - "grad_norm": 5.110551357269287, - "learning_rate": 4.181818181818182e-05, - "loss": 5.5971, + "epoch": 0.49097472924187724, + "grad_norm": 4.004848003387451, + "learning_rate": 4.187725631768953e-05, + "loss": 5.851, "step": 136 }, { - "epoch": 0.49818181818181817, - "grad_norm": 2.816383123397827, - "learning_rate": 4.1757575757575765e-05, - "loss": 5.563, + "epoch": 0.49458483754512633, + "grad_norm": 2.584901809692383, + "learning_rate": 4.1817087845968716e-05, + "loss": 5.5345, "step": 137 }, { - "epoch": 0.5018181818181818, - "grad_norm": 2.900914430618286, - "learning_rate": 4.16969696969697e-05, - "loss": 5.5362, + "epoch": 0.4981949458483754, + "grad_norm": 2.390103340148926, + "learning_rate": 4.175691937424789e-05, + "loss": 5.5518, "step": 138 }, { - "epoch": 0.5054545454545455, - "grad_norm": 4.6505560874938965, - "learning_rate": 4.163636363636364e-05, - "loss": 5.4366, + "epoch": 0.5018050541516246, + "grad_norm": 2.405592441558838, + "learning_rate": 4.169675090252708e-05, + "loss": 5.4639, "step": 139 }, { - "epoch": 0.509090909090909, - "grad_norm": 4.222754001617432, - "learning_rate": 4.157575757575758e-05, - "loss": 5.555, + "epoch": 0.5054151624548736, + "grad_norm": 2.404343605041504, + "learning_rate": 4.163658243080626e-05, + "loss": 5.3935, "step": 140 }, { - "epoch": 0.5127272727272727, - "grad_norm": 3.447328805923462, - "learning_rate": 4.151515151515152e-05, - "loss": 5.6128, + "epoch": 0.5090252707581228, + "grad_norm": 2.314335584640503, + "learning_rate": 4.1576413959085445e-05, + "loss": 5.4667, "step": 141 }, { - "epoch": 0.5163636363636364, - "grad_norm": 2.5096065998077393, - "learning_rate": 4.1454545454545456e-05, - "loss": 5.3437, + "epoch": 0.5126353790613718, + "grad_norm": 5.040092468261719, + "learning_rate": 4.151624548736462e-05, + "loss": 5.7844, "step": 142 }, { - "epoch": 0.52, - "grad_norm": 3.392411231994629, - "learning_rate": 4.1393939393939395e-05, - "loss": 5.4153, + "epoch": 0.516245487364621, + "grad_norm": 2.73350191116333, + "learning_rate": 4.1456077015643806e-05, + "loss": 5.581, "step": 143 }, { - "epoch": 0.5236363636363637, - "grad_norm": 3.3837954998016357, - "learning_rate": 4.133333333333333e-05, - "loss": 5.5665, + "epoch": 0.51985559566787, + "grad_norm": 3.540738821029663, + "learning_rate": 4.139590854392298e-05, + "loss": 5.433, "step": 144 }, { - "epoch": 0.5272727272727272, - "grad_norm": 3.1204018592834473, - "learning_rate": 4.127272727272727e-05, - "loss": 5.3836, + "epoch": 0.5234657039711191, + "grad_norm": 2.8726115226745605, + "learning_rate": 4.1335740072202167e-05, + "loss": 5.4571, "step": 145 }, { - "epoch": 0.5309090909090909, - "grad_norm": 2.5219132900238037, - "learning_rate": 4.1212121212121216e-05, - "loss": 5.5756, + "epoch": 0.5270758122743683, + "grad_norm": 2.460242509841919, + "learning_rate": 4.127557160048135e-05, + "loss": 5.3747, "step": 146 }, { - "epoch": 0.5345454545454545, - "grad_norm": 4.97948694229126, - "learning_rate": 4.1151515151515155e-05, - "loss": 5.583, + "epoch": 0.5306859205776173, + "grad_norm": 2.26070499420166, + "learning_rate": 4.1215403128760534e-05, + "loss": 5.337, "step": 147 }, { - "epoch": 0.5381818181818182, - "grad_norm": 4.947720050811768, - "learning_rate": 4.109090909090909e-05, - "loss": 5.4612, + "epoch": 0.5342960288808665, + "grad_norm": 2.353187084197998, + "learning_rate": 4.115523465703972e-05, + "loss": 5.3909, "step": 148 }, { - "epoch": 0.5418181818181819, - "grad_norm": 3.0988149642944336, - "learning_rate": 4.103030303030303e-05, - "loss": 5.4593, + "epoch": 0.5379061371841155, + "grad_norm": 3.6039485931396484, + "learning_rate": 4.1095066185318895e-05, + "loss": 5.4302, "step": 149 }, { - "epoch": 0.5454545454545454, - "grad_norm": 2.3757989406585693, - "learning_rate": 4.096969696969697e-05, - "loss": 5.3723, + "epoch": 0.5415162454873647, + "grad_norm": 2.220996856689453, + "learning_rate": 4.103489771359807e-05, + "loss": 5.3922, "step": 150 }, { - "epoch": 0.5490909090909091, - "grad_norm": 2.393914222717285, - "learning_rate": 4.0909090909090915e-05, - "loss": 5.4201, + "epoch": 0.5451263537906137, + "grad_norm": 2.450697660446167, + "learning_rate": 4.0974729241877256e-05, + "loss": 5.3605, "step": 151 }, { - "epoch": 0.5527272727272727, - "grad_norm": 4.960287570953369, - "learning_rate": 4.084848484848485e-05, - "loss": 5.5827, + "epoch": 0.5487364620938628, + "grad_norm": 2.709074020385742, + "learning_rate": 4.091456077015644e-05, + "loss": 5.433, "step": 152 }, { - "epoch": 0.5563636363636364, - "grad_norm": 3.0634284019470215, - "learning_rate": 4.078787878787879e-05, - "loss": 5.5706, + "epoch": 0.5523465703971119, + "grad_norm": 2.338242769241333, + "learning_rate": 4.0854392298435624e-05, + "loss": 5.3792, "step": 153 }, { - "epoch": 0.56, - "grad_norm": 2.428879499435425, - "learning_rate": 4.072727272727273e-05, - "loss": 5.4515, + "epoch": 0.555956678700361, + "grad_norm": 1.8731917142868042, + "learning_rate": 4.079422382671481e-05, + "loss": 5.3199, "step": 154 }, { - "epoch": 0.5636363636363636, - "grad_norm": 3.948068857192993, - "learning_rate": 4.066666666666667e-05, - "loss": 5.4016, + "epoch": 0.5595667870036101, + "grad_norm": 2.192985773086548, + "learning_rate": 4.0734055354993985e-05, + "loss": 5.2468, "step": 155 }, { - "epoch": 0.5672727272727273, - "grad_norm": 3.6819639205932617, - "learning_rate": 4.0606060606060606e-05, - "loss": 5.4725, + "epoch": 0.5631768953068592, + "grad_norm": 3.6917717456817627, + "learning_rate": 4.067388688327316e-05, + "loss": 5.3494, "step": 156 }, { - "epoch": 0.5709090909090909, - "grad_norm": 2.238288164138794, - "learning_rate": 4.0545454545454545e-05, - "loss": 5.3492, + "epoch": 0.5667870036101083, + "grad_norm": 4.642220497131348, + "learning_rate": 4.0613718411552346e-05, + "loss": 5.6593, "step": 157 }, { - "epoch": 0.5745454545454546, - "grad_norm": 2.360812187194824, - "learning_rate": 4.048484848484849e-05, - "loss": 5.379, + "epoch": 0.5703971119133574, + "grad_norm": 2.366976261138916, + "learning_rate": 4.055354993983153e-05, + "loss": 5.3336, "step": 158 }, { - "epoch": 0.5781818181818181, - "grad_norm": 2.7471134662628174, - "learning_rate": 4.042424242424243e-05, - "loss": 5.2598, + "epoch": 0.5740072202166066, + "grad_norm": 3.0058817863464355, + "learning_rate": 4.0493381468110714e-05, + "loss": 5.4665, "step": 159 }, { - "epoch": 0.5818181818181818, - "grad_norm": 2.7425880432128906, - "learning_rate": 4.0363636363636367e-05, - "loss": 5.2575, + "epoch": 0.5776173285198556, + "grad_norm": 2.297910690307617, + "learning_rate": 4.043321299638989e-05, + "loss": 5.232, "step": 160 }, { - "epoch": 0.5854545454545454, - "grad_norm": 2.0691795349121094, - "learning_rate": 4.0303030303030305e-05, - "loss": 5.4382, + "epoch": 0.5812274368231047, + "grad_norm": 1.971007227897644, + "learning_rate": 4.0373044524669075e-05, + "loss": 5.2868, "step": 161 }, { - "epoch": 0.5890909090909091, - "grad_norm": 2.2511022090911865, - "learning_rate": 4.024242424242424e-05, - "loss": 5.5356, + "epoch": 0.5848375451263538, + "grad_norm": 2.3164796829223633, + "learning_rate": 4.031287605294826e-05, + "loss": 5.42, "step": 162 }, { - "epoch": 0.5927272727272728, - "grad_norm": 2.0557987689971924, - "learning_rate": 4.018181818181818e-05, - "loss": 5.4373, + "epoch": 0.5884476534296029, + "grad_norm": 2.2059273719787598, + "learning_rate": 4.0252707581227436e-05, + "loss": 5.2749, "step": 163 }, { - "epoch": 0.5963636363636363, - "grad_norm": 2.4520606994628906, - "learning_rate": 4.012121212121212e-05, - "loss": 5.2067, + "epoch": 0.592057761732852, + "grad_norm": 2.0889828205108643, + "learning_rate": 4.019253910950662e-05, + "loss": 5.4344, "step": 164 }, { - "epoch": 0.6, - "grad_norm": 1.9265565872192383, - "learning_rate": 4.0060606060606065e-05, - "loss": 5.3952, + "epoch": 0.5956678700361011, + "grad_norm": 2.5745677947998047, + "learning_rate": 4.0132370637785804e-05, + "loss": 5.5377, "step": 165 }, { - "epoch": 0.6036363636363636, - "grad_norm": 2.1164627075195312, - "learning_rate": 4e-05, - "loss": 5.134, + "epoch": 0.5992779783393501, + "grad_norm": 2.0397369861602783, + "learning_rate": 4.007220216606498e-05, + "loss": 5.2009, "step": 166 }, { - "epoch": 0.6072727272727273, - "grad_norm": 1.9591832160949707, - "learning_rate": 3.993939393939394e-05, - "loss": 5.2561, + "epoch": 0.6028880866425993, + "grad_norm": 2.9067304134368896, + "learning_rate": 4.0012033694344165e-05, + "loss": 5.2047, "step": 167 }, { - "epoch": 0.610909090909091, - "grad_norm": 1.9654712677001953, - "learning_rate": 3.987878787878788e-05, - "loss": 5.1538, + "epoch": 0.6064981949458483, + "grad_norm": 3.246255874633789, + "learning_rate": 3.995186522262335e-05, + "loss": 5.5057, "step": 168 }, { - "epoch": 0.6145454545454545, - "grad_norm": 1.8879733085632324, - "learning_rate": 3.981818181818182e-05, - "loss": 5.2923, + "epoch": 0.6101083032490975, + "grad_norm": 1.773790717124939, + "learning_rate": 3.989169675090253e-05, + "loss": 5.2973, "step": 169 }, { - "epoch": 0.6181818181818182, - "grad_norm": 2.7771313190460205, - "learning_rate": 3.975757575757576e-05, - "loss": 5.3083, + "epoch": 0.6137184115523465, + "grad_norm": 2.192772150039673, + "learning_rate": 3.983152827918171e-05, + "loss": 5.1046, "step": 170 }, { - "epoch": 0.6218181818181818, - "grad_norm": 2.2295308113098145, - "learning_rate": 3.96969696969697e-05, - "loss": 5.2658, + "epoch": 0.6173285198555957, + "grad_norm": 1.923920750617981, + "learning_rate": 3.9771359807460894e-05, + "loss": 5.2984, "step": 171 }, { - "epoch": 0.6254545454545455, - "grad_norm": 2.3644328117370605, - "learning_rate": 3.963636363636364e-05, - "loss": 5.1096, + "epoch": 0.6209386281588448, + "grad_norm": 2.1909148693084717, + "learning_rate": 3.971119133574007e-05, + "loss": 5.222, "step": 172 }, { - "epoch": 0.6290909090909091, - "grad_norm": 1.9340869188308716, - "learning_rate": 3.957575757575758e-05, - "loss": 5.1372, + "epoch": 0.6245487364620939, + "grad_norm": 1.6805565357208252, + "learning_rate": 3.9651022864019255e-05, + "loss": 5.1115, "step": 173 }, { - "epoch": 0.6327272727272727, - "grad_norm": 2.5413575172424316, - "learning_rate": 3.951515151515152e-05, - "loss": 5.085, + "epoch": 0.628158844765343, + "grad_norm": 1.7837809324264526, + "learning_rate": 3.959085439229844e-05, + "loss": 5.1649, "step": 174 }, { - "epoch": 0.6363636363636364, - "grad_norm": 2.119511842727661, - "learning_rate": 3.9454545454545455e-05, - "loss": 5.2124, + "epoch": 0.631768953068592, + "grad_norm": 1.7250087261199951, + "learning_rate": 3.953068592057762e-05, + "loss": 5.0921, "step": 175 }, { - "epoch": 0.64, - "grad_norm": 2.2437052726745605, - "learning_rate": 3.939393939393939e-05, - "loss": 5.1622, + "epoch": 0.6353790613718412, + "grad_norm": 1.9569655656814575, + "learning_rate": 3.94705174488568e-05, + "loss": 5.2318, "step": 176 }, { - "epoch": 0.6436363636363637, - "grad_norm": 4.928871154785156, - "learning_rate": 3.933333333333333e-05, - "loss": 5.1084, + "epoch": 0.6389891696750902, + "grad_norm": 1.8813940286636353, + "learning_rate": 3.941034897713598e-05, + "loss": 5.2228, "step": 177 }, { - "epoch": 0.6472727272727272, - "grad_norm": 4.648951053619385, - "learning_rate": 3.927272727272727e-05, - "loss": 5.36, + "epoch": 0.6425992779783394, + "grad_norm": 3.922642469406128, + "learning_rate": 3.935018050541516e-05, + "loss": 5.4068, "step": 178 }, { - "epoch": 0.6509090909090909, - "grad_norm": 2.124182939529419, - "learning_rate": 3.9212121212121215e-05, - "loss": 5.1618, + "epoch": 0.6462093862815884, + "grad_norm": 2.448709726333618, + "learning_rate": 3.9290012033694345e-05, + "loss": 5.2182, "step": 179 }, { - "epoch": 0.6545454545454545, - "grad_norm": 1.795525074005127, - "learning_rate": 3.9151515151515153e-05, - "loss": 5.1547, + "epoch": 0.6498194945848376, + "grad_norm": 2.0298540592193604, + "learning_rate": 3.922984356197353e-05, + "loss": 5.0192, "step": 180 }, { - "epoch": 0.6581818181818182, - "grad_norm": 7.342172145843506, - "learning_rate": 3.909090909090909e-05, - "loss": 5.1927, + "epoch": 0.6534296028880866, + "grad_norm": 2.1303651332855225, + "learning_rate": 3.916967509025271e-05, + "loss": 5.1934, "step": 181 }, { - "epoch": 0.6618181818181819, - "grad_norm": 7.812733173370361, - "learning_rate": 3.903030303030304e-05, - "loss": 5.0418, + "epoch": 0.6570397111913358, + "grad_norm": 1.6486132144927979, + "learning_rate": 3.91095066185319e-05, + "loss": 5.1027, "step": 182 }, { - "epoch": 0.6654545454545454, - "grad_norm": 7.93719482421875, - "learning_rate": 3.8969696969696975e-05, - "loss": 5.2858, + "epoch": 0.6606498194945848, + "grad_norm": 3.1995131969451904, + "learning_rate": 3.9049338146811074e-05, + "loss": 5.2469, "step": 183 }, { - "epoch": 0.6690909090909091, - "grad_norm": 2.118412971496582, - "learning_rate": 3.8909090909090914e-05, - "loss": 5.0045, + "epoch": 0.6642599277978339, + "grad_norm": 2.314117193222046, + "learning_rate": 3.898916967509025e-05, + "loss": 4.9994, "step": 184 }, { - "epoch": 0.6727272727272727, - "grad_norm": 2.4741883277893066, - "learning_rate": 3.884848484848485e-05, - "loss": 5.0554, + "epoch": 0.6678700361010831, + "grad_norm": 2.438479423522949, + "learning_rate": 3.8929001203369435e-05, + "loss": 5.1398, "step": 185 }, { - "epoch": 0.6763636363636364, - "grad_norm": 2.7295148372650146, - "learning_rate": 3.878787878787879e-05, - "loss": 5.0661, + "epoch": 0.6714801444043321, + "grad_norm": 2.5420024394989014, + "learning_rate": 3.886883273164862e-05, + "loss": 4.9167, "step": 186 }, { - "epoch": 0.68, - "grad_norm": 3.038841962814331, - "learning_rate": 3.872727272727273e-05, - "loss": 5.2243, + "epoch": 0.6750902527075813, + "grad_norm": 2.7453882694244385, + "learning_rate": 3.88086642599278e-05, + "loss": 5.1795, "step": 187 }, { - "epoch": 0.6836363636363636, - "grad_norm": 1.7481746673583984, - "learning_rate": 3.866666666666667e-05, - "loss": 5.1118, + "epoch": 0.6787003610108303, + "grad_norm": 2.1120471954345703, + "learning_rate": 3.874849578820698e-05, + "loss": 5.0016, "step": 188 }, { - "epoch": 0.6872727272727273, - "grad_norm": 1.9238699674606323, - "learning_rate": 3.8606060606060605e-05, - "loss": 5.0864, + "epoch": 0.6823104693140795, + "grad_norm": 1.9928067922592163, + "learning_rate": 3.8688327316486164e-05, + "loss": 5.0273, "step": 189 }, { - "epoch": 0.6909090909090909, - "grad_norm": 2.7689208984375, - "learning_rate": 3.8545454545454544e-05, - "loss": 5.165, + "epoch": 0.6859205776173285, + "grad_norm": 2.736645221710205, + "learning_rate": 3.862815884476535e-05, + "loss": 5.1239, "step": 190 }, { - "epoch": 0.6945454545454546, - "grad_norm": 1.9809805154800415, - "learning_rate": 3.848484848484848e-05, - "loss": 5.2885, + "epoch": 0.6895306859205776, + "grad_norm": 2.823556423187256, + "learning_rate": 3.8567990373044525e-05, + "loss": 5.1206, "step": 191 }, { - "epoch": 0.6981818181818182, - "grad_norm": 1.7771217823028564, - "learning_rate": 3.842424242424243e-05, - "loss": 5.0592, + "epoch": 0.6931407942238267, + "grad_norm": 2.5614190101623535, + "learning_rate": 3.850782190132371e-05, + "loss": 5.0496, "step": 192 }, { - "epoch": 0.7018181818181818, - "grad_norm": 2.0928571224212646, - "learning_rate": 3.8363636363636365e-05, - "loss": 5.0556, + "epoch": 0.6967509025270758, + "grad_norm": 3.6118035316467285, + "learning_rate": 3.844765342960289e-05, + "loss": 5.0111, "step": 193 }, { - "epoch": 0.7054545454545454, - "grad_norm": 1.9713197946548462, - "learning_rate": 3.830303030303031e-05, - "loss": 4.9863, + "epoch": 0.7003610108303249, + "grad_norm": 2.387235164642334, + "learning_rate": 3.838748495788207e-05, + "loss": 5.1377, "step": 194 }, { - "epoch": 0.7090909090909091, - "grad_norm": 1.7148576974868774, - "learning_rate": 3.824242424242425e-05, - "loss": 5.1573, + "epoch": 0.703971119133574, + "grad_norm": 2.7525899410247803, + "learning_rate": 3.8327316486161254e-05, + "loss": 4.9603, "step": 195 }, { - "epoch": 0.7127272727272728, - "grad_norm": 1.9550209045410156, - "learning_rate": 3.818181818181819e-05, - "loss": 5.1499, + "epoch": 0.7075812274368231, + "grad_norm": 2.7961931228637695, + "learning_rate": 3.826714801444044e-05, + "loss": 5.279, "step": 196 }, { - "epoch": 0.7163636363636363, - "grad_norm": 1.695175290107727, - "learning_rate": 3.8121212121212125e-05, - "loss": 5.0989, + "epoch": 0.7111913357400722, + "grad_norm": 2.0542795658111572, + "learning_rate": 3.8206979542719615e-05, + "loss": 4.8995, "step": 197 }, { - "epoch": 0.72, - "grad_norm": 3.0943408012390137, - "learning_rate": 3.8060606060606064e-05, - "loss": 5.0336, + "epoch": 0.7148014440433214, + "grad_norm": 3.907233953475952, + "learning_rate": 3.81468110709988e-05, + "loss": 4.9418, "step": 198 }, { - "epoch": 0.7236363636363636, - "grad_norm": 2.219883918762207, - "learning_rate": 3.8e-05, - "loss": 4.942, + "epoch": 0.7184115523465704, + "grad_norm": 4.625024795532227, + "learning_rate": 3.8086642599277976e-05, + "loss": 5.0614, "step": 199 }, { - "epoch": 0.7272727272727273, - "grad_norm": 3.4300808906555176, - "learning_rate": 3.793939393939394e-05, - "loss": 4.9841, + "epoch": 0.7220216606498195, + "grad_norm": 1.8027046918869019, + "learning_rate": 3.802647412755716e-05, + "loss": 4.8633, "step": 200 }, { - "epoch": 0.730909090909091, - "grad_norm": 2.2358174324035645, - "learning_rate": 3.787878787878788e-05, - "loss": 5.0942, + "epoch": 0.7256317689530686, + "grad_norm": 2.0051488876342773, + "learning_rate": 3.7966305655836343e-05, + "loss": 4.8928, "step": 201 }, { - "epoch": 0.7345454545454545, - "grad_norm": 2.1182949542999268, - "learning_rate": 3.781818181818182e-05, - "loss": 4.9778, + "epoch": 0.7292418772563177, + "grad_norm": 3.243194818496704, + "learning_rate": 3.790613718411553e-05, + "loss": 5.0681, "step": 202 }, { - "epoch": 0.7381818181818182, - "grad_norm": 2.5967414379119873, - "learning_rate": 3.7757575757575755e-05, - "loss": 4.8359, + "epoch": 0.7328519855595668, + "grad_norm": 2.727696418762207, + "learning_rate": 3.784596871239471e-05, + "loss": 5.1281, "step": 203 }, { - "epoch": 0.7418181818181818, - "grad_norm": 2.3327507972717285, - "learning_rate": 3.76969696969697e-05, - "loss": 5.0698, + "epoch": 0.7364620938628159, + "grad_norm": 1.5806025266647339, + "learning_rate": 3.778580024067389e-05, + "loss": 4.9068, "step": 204 }, { - "epoch": 0.7454545454545455, - "grad_norm": 2.0424389839172363, - "learning_rate": 3.763636363636364e-05, - "loss": 5.0554, + "epoch": 0.740072202166065, + "grad_norm": 4.9806318283081055, + "learning_rate": 3.7725631768953066e-05, + "loss": 5.0823, "step": 205 }, { - "epoch": 0.7490909090909091, - "grad_norm": 2.24621844291687, - "learning_rate": 3.757575757575758e-05, - "loss": 5.0766, + "epoch": 0.7436823104693141, + "grad_norm": 2.2012317180633545, + "learning_rate": 3.766546329723225e-05, + "loss": 4.8072, "step": 206 }, { - "epoch": 0.7527272727272727, - "grad_norm": 1.9795176982879639, - "learning_rate": 3.7515151515151516e-05, - "loss": 4.8173, + "epoch": 0.7472924187725631, + "grad_norm": 1.9060529470443726, + "learning_rate": 3.760529482551143e-05, + "loss": 4.9973, "step": 207 }, { - "epoch": 0.7563636363636363, - "grad_norm": 2.3668367862701416, - "learning_rate": 3.745454545454546e-05, - "loss": 4.8863, + "epoch": 0.7509025270758123, + "grad_norm": 1.9107259511947632, + "learning_rate": 3.754512635379062e-05, + "loss": 4.9668, "step": 208 }, { - "epoch": 0.76, - "grad_norm": 2.0429444313049316, - "learning_rate": 3.73939393939394e-05, - "loss": 4.8721, + "epoch": 0.7545126353790613, + "grad_norm": 1.9527552127838135, + "learning_rate": 3.74849578820698e-05, + "loss": 4.9307, "step": 209 }, { - "epoch": 0.7636363636363637, - "grad_norm": 2.5353472232818604, - "learning_rate": 3.733333333333334e-05, - "loss": 5.0184, + "epoch": 0.7581227436823105, + "grad_norm": 2.9125237464904785, + "learning_rate": 3.742478941034898e-05, + "loss": 4.8515, "step": 210 }, { - "epoch": 0.7672727272727272, - "grad_norm": 1.7212400436401367, - "learning_rate": 3.7272727272727276e-05, - "loss": 4.8016, + "epoch": 0.7617328519855595, + "grad_norm": 2.114863395690918, + "learning_rate": 3.7364620938628155e-05, + "loss": 4.9533, "step": 211 }, { - "epoch": 0.7709090909090909, - "grad_norm": 4.483168125152588, - "learning_rate": 3.7212121212121214e-05, - "loss": 4.9398, + "epoch": 0.7653429602888087, + "grad_norm": 3.5130767822265625, + "learning_rate": 3.730445246690734e-05, + "loss": 5.1674, "step": 212 }, { - "epoch": 0.7745454545454545, - "grad_norm": 2.3681108951568604, - "learning_rate": 3.715151515151515e-05, - "loss": 4.895, + "epoch": 0.7689530685920578, + "grad_norm": 3.569817304611206, + "learning_rate": 3.724428399518652e-05, + "loss": 4.78, "step": 213 }, { - "epoch": 0.7781818181818182, - "grad_norm": 3.49525785446167, - "learning_rate": 3.709090909090909e-05, - "loss": 4.8547, + "epoch": 0.7725631768953068, + "grad_norm": 3.456483840942383, + "learning_rate": 3.718411552346571e-05, + "loss": 4.6653, "step": 214 }, { - "epoch": 0.7818181818181819, - "grad_norm": 4.825103282928467, - "learning_rate": 3.703030303030303e-05, - "loss": 4.7677, + "epoch": 0.776173285198556, + "grad_norm": 2.1834640502929688, + "learning_rate": 3.712394705174489e-05, + "loss": 4.9685, "step": 215 }, { - "epoch": 0.7854545454545454, - "grad_norm": 3.3975582122802734, - "learning_rate": 3.6969696969696974e-05, - "loss": 5.0948, + "epoch": 0.779783393501805, + "grad_norm": 3.3436832427978516, + "learning_rate": 3.706377858002407e-05, + "loss": 4.9632, "step": 216 }, { - "epoch": 0.7890909090909091, - "grad_norm": 2.115818738937378, - "learning_rate": 3.690909090909091e-05, - "loss": 4.8962, + "epoch": 0.7833935018050542, + "grad_norm": 3.473731517791748, + "learning_rate": 3.700361010830325e-05, + "loss": 4.8625, "step": 217 }, { - "epoch": 0.7927272727272727, - "grad_norm": 3.3818588256835938, - "learning_rate": 3.684848484848485e-05, - "loss": 5.0175, + "epoch": 0.7870036101083032, + "grad_norm": 4.904114246368408, + "learning_rate": 3.694344163658243e-05, + "loss": 5.1025, "step": 218 }, { - "epoch": 0.7963636363636364, - "grad_norm": 2.1706228256225586, - "learning_rate": 3.678787878787879e-05, - "loss": 4.9794, + "epoch": 0.7906137184115524, + "grad_norm": 2.1981472969055176, + "learning_rate": 3.688327316486161e-05, + "loss": 4.7881, "step": 219 }, { - "epoch": 0.8, - "grad_norm": 1.8316903114318848, - "learning_rate": 3.672727272727273e-05, - "loss": 5.0241, + "epoch": 0.7942238267148014, + "grad_norm": 2.3624002933502197, + "learning_rate": 3.68231046931408e-05, + "loss": 5.0446, "step": 220 }, { - "epoch": 0.8036363636363636, - "grad_norm": 2.90144944190979, - "learning_rate": 3.6666666666666666e-05, - "loss": 4.7593, + "epoch": 0.7978339350180506, + "grad_norm": 4.91404390335083, + "learning_rate": 3.676293622141998e-05, + "loss": 4.7575, "step": 221 }, { - "epoch": 0.8072727272727273, - "grad_norm": 3.2332253456115723, - "learning_rate": 3.660606060606061e-05, - "loss": 4.8894, + "epoch": 0.8014440433212996, + "grad_norm": 2.7553951740264893, + "learning_rate": 3.670276774969916e-05, + "loss": 5.0162, "step": 222 }, { - "epoch": 0.8109090909090909, - "grad_norm": 2.5061445236206055, - "learning_rate": 3.654545454545455e-05, - "loss": 4.7913, + "epoch": 0.8050541516245487, + "grad_norm": 2.2965950965881348, + "learning_rate": 3.664259927797834e-05, + "loss": 4.7732, "step": 223 }, { - "epoch": 0.8145454545454546, - "grad_norm": 1.689694881439209, - "learning_rate": 3.648484848484849e-05, - "loss": 4.8167, + "epoch": 0.8086642599277978, + "grad_norm": 4.340023040771484, + "learning_rate": 3.6582430806257526e-05, + "loss": 4.6869, "step": 224 }, { - "epoch": 0.8181818181818182, - "grad_norm": 2.99468994140625, - "learning_rate": 3.6424242424242426e-05, - "loss": 5.0467, + "epoch": 0.8122743682310469, + "grad_norm": 4.003706932067871, + "learning_rate": 3.65222623345367e-05, + "loss": 4.7567, "step": 225 }, { - "epoch": 0.8218181818181818, - "grad_norm": 1.5727870464324951, - "learning_rate": 3.6363636363636364e-05, - "loss": 4.8491, + "epoch": 0.8158844765342961, + "grad_norm": 1.9011428356170654, + "learning_rate": 3.646209386281589e-05, + "loss": 4.889, "step": 226 }, { - "epoch": 0.8254545454545454, - "grad_norm": 2.4910600185394287, - "learning_rate": 3.63030303030303e-05, - "loss": 4.7987, + "epoch": 0.8194945848375451, + "grad_norm": 4.650227069854736, + "learning_rate": 3.6401925391095064e-05, + "loss": 4.8348, "step": 227 }, { - "epoch": 0.8290909090909091, - "grad_norm": 2.5647313594818115, - "learning_rate": 3.624242424242425e-05, - "loss": 4.9892, + "epoch": 0.8231046931407943, + "grad_norm": 3.565837860107422, + "learning_rate": 3.634175691937425e-05, + "loss": 4.9042, "step": 228 }, { - "epoch": 0.8327272727272728, - "grad_norm": 2.091198205947876, - "learning_rate": 3.6181818181818186e-05, - "loss": 4.7671, + "epoch": 0.8267148014440433, + "grad_norm": 2.047588586807251, + "learning_rate": 3.628158844765343e-05, + "loss": 4.8082, "step": 229 }, { - "epoch": 0.8363636363636363, - "grad_norm": 2.494631290435791, - "learning_rate": 3.6121212121212124e-05, - "loss": 5.0782, + "epoch": 0.8303249097472925, + "grad_norm": 3.465941905975342, + "learning_rate": 3.6221419975932616e-05, + "loss": 4.5621, "step": 230 }, { - "epoch": 0.84, - "grad_norm": 6.3343000411987305, - "learning_rate": 3.606060606060606e-05, - "loss": 4.92, + "epoch": 0.8339350180505415, + "grad_norm": 2.2322192192077637, + "learning_rate": 3.616125150421179e-05, + "loss": 4.7779, "step": 231 }, { - "epoch": 0.8436363636363636, - "grad_norm": 2.6743531227111816, - "learning_rate": 3.6e-05, - "loss": 4.7505, + "epoch": 0.8375451263537906, + "grad_norm": 3.8442447185516357, + "learning_rate": 3.610108303249098e-05, + "loss": 4.7807, "step": 232 }, { - "epoch": 0.8472727272727273, - "grad_norm": 2.4769270420074463, - "learning_rate": 3.593939393939394e-05, - "loss": 4.6388, + "epoch": 0.8411552346570397, + "grad_norm": 3.0969555377960205, + "learning_rate": 3.6040914560770154e-05, + "loss": 4.7886, "step": 233 }, { - "epoch": 0.850909090909091, - "grad_norm": 1.9747099876403809, - "learning_rate": 3.587878787878788e-05, - "loss": 4.9499, + "epoch": 0.8447653429602888, + "grad_norm": 1.886405348777771, + "learning_rate": 3.598074608904934e-05, + "loss": 4.7991, "step": 234 }, { - "epoch": 0.8545454545454545, - "grad_norm": 3.063316822052002, - "learning_rate": 3.5818181818181816e-05, - "loss": 5.1008, + "epoch": 0.8483754512635379, + "grad_norm": 4.009521961212158, + "learning_rate": 3.592057761732852e-05, + "loss": 5.0554, "step": 235 }, { - "epoch": 0.8581818181818182, - "grad_norm": 1.9509135484695435, - "learning_rate": 3.575757575757576e-05, - "loss": 5.0562, + "epoch": 0.851985559566787, + "grad_norm": 3.61405086517334, + "learning_rate": 3.5860409145607706e-05, + "loss": 4.6868, "step": 236 }, { - "epoch": 0.8618181818181818, - "grad_norm": 2.897395372390747, - "learning_rate": 3.56969696969697e-05, - "loss": 4.9302, + "epoch": 0.855595667870036, + "grad_norm": 2.804062604904175, + "learning_rate": 3.580024067388689e-05, + "loss": 4.7725, "step": 237 }, { - "epoch": 0.8654545454545455, - "grad_norm": 2.996319055557251, - "learning_rate": 3.563636363636364e-05, - "loss": 4.8001, + "epoch": 0.8592057761732852, + "grad_norm": 2.4844532012939453, + "learning_rate": 3.574007220216607e-05, + "loss": 5.0849, "step": 238 }, { - "epoch": 0.8690909090909091, - "grad_norm": 2.526507616043091, - "learning_rate": 3.5575757575757576e-05, - "loss": 4.8748, + "epoch": 0.8628158844765343, + "grad_norm": 4.291991710662842, + "learning_rate": 3.5679903730445244e-05, + "loss": 4.954, "step": 239 }, { - "epoch": 0.8727272727272727, - "grad_norm": 3.4848058223724365, - "learning_rate": 3.551515151515152e-05, - "loss": 4.707, + "epoch": 0.8664259927797834, + "grad_norm": 7.508516788482666, + "learning_rate": 3.561973525872443e-05, + "loss": 4.6777, "step": 240 }, { - "epoch": 0.8763636363636363, - "grad_norm": 1.839698076248169, - "learning_rate": 3.545454545454546e-05, - "loss": 4.7075, + "epoch": 0.8700361010830325, + "grad_norm": 3.0717663764953613, + "learning_rate": 3.555956678700361e-05, + "loss": 4.9312, "step": 241 }, { - "epoch": 0.88, - "grad_norm": 2.332421064376831, - "learning_rate": 3.53939393939394e-05, - "loss": 4.8433, + "epoch": 0.8736462093862816, + "grad_norm": 2.553285837173462, + "learning_rate": 3.5499398315282796e-05, + "loss": 4.753, "step": 242 }, { - "epoch": 0.8836363636363637, - "grad_norm": 2.877429485321045, - "learning_rate": 3.5333333333333336e-05, - "loss": 4.6706, + "epoch": 0.8772563176895307, + "grad_norm": 3.743809461593628, + "learning_rate": 3.543922984356198e-05, + "loss": 4.8038, "step": 243 }, { - "epoch": 0.8872727272727273, - "grad_norm": 4.581256866455078, - "learning_rate": 3.5272727272727274e-05, - "loss": 4.7334, + "epoch": 0.8808664259927798, + "grad_norm": 2.7291336059570312, + "learning_rate": 3.537906137184116e-05, + "loss": 4.7678, "step": 244 }, { - "epoch": 0.8909090909090909, - "grad_norm": 2.1481051445007324, - "learning_rate": 3.521212121212121e-05, - "loss": 4.7936, + "epoch": 0.8844765342960289, + "grad_norm": 2.182262420654297, + "learning_rate": 3.531889290012034e-05, + "loss": 4.7459, "step": 245 }, { - "epoch": 0.8945454545454545, - "grad_norm": 3.5877878665924072, - "learning_rate": 3.515151515151515e-05, - "loss": 4.9524, + "epoch": 0.8880866425992779, + "grad_norm": 2.953906297683716, + "learning_rate": 3.525872442839952e-05, + "loss": 4.762, "step": 246 }, { - "epoch": 0.8981818181818182, - "grad_norm": 2.7592244148254395, - "learning_rate": 3.509090909090909e-05, - "loss": 4.6404, + "epoch": 0.8916967509025271, + "grad_norm": 5.394567012786865, + "learning_rate": 3.51985559566787e-05, + "loss": 4.7177, "step": 247 }, { - "epoch": 0.9018181818181819, - "grad_norm": 3.172548532485962, - "learning_rate": 3.503030303030303e-05, - "loss": 4.9291, + "epoch": 0.8953068592057761, + "grad_norm": 2.277156352996826, + "learning_rate": 3.5138387484957885e-05, + "loss": 4.6974, "step": 248 }, { - "epoch": 0.9054545454545454, - "grad_norm": 2.158982515335083, - "learning_rate": 3.4969696969696966e-05, - "loss": 4.6853, + "epoch": 0.8989169675090253, + "grad_norm": 2.9844322204589844, + "learning_rate": 3.507821901323707e-05, + "loss": 4.8035, "step": 249 }, { - "epoch": 0.9090909090909091, - "grad_norm": 2.6028811931610107, - "learning_rate": 3.490909090909091e-05, - "loss": 4.5646, + "epoch": 0.9025270758122743, + "grad_norm": 3.6745762825012207, + "learning_rate": 3.5018050541516247e-05, + "loss": 4.6441, "step": 250 }, { - "epoch": 0.9127272727272727, - "grad_norm": 5.963568210601807, - "learning_rate": 3.484848484848485e-05, - "loss": 4.5695, + "epoch": 0.9061371841155235, + "grad_norm": 2.259817600250244, + "learning_rate": 3.495788206979543e-05, + "loss": 4.8853, "step": 251 }, { - "epoch": 0.9163636363636364, - "grad_norm": 6.336101055145264, - "learning_rate": 3.4787878787878795e-05, - "loss": 5.0168, + "epoch": 0.9097472924187726, + "grad_norm": 2.207033634185791, + "learning_rate": 3.489771359807461e-05, + "loss": 4.7622, "step": 252 }, { - "epoch": 0.92, - "grad_norm": 3.318384885787964, - "learning_rate": 3.472727272727273e-05, - "loss": 4.7873, + "epoch": 0.9133574007220217, + "grad_norm": 1.8033169507980347, + "learning_rate": 3.483754512635379e-05, + "loss": 4.6108, "step": 253 }, { - "epoch": 0.9236363636363636, - "grad_norm": 3.579904079437256, - "learning_rate": 3.466666666666667e-05, - "loss": 4.7128, + "epoch": 0.9169675090252708, + "grad_norm": 2.7010810375213623, + "learning_rate": 3.4777376654632975e-05, + "loss": 4.8258, "step": 254 }, { - "epoch": 0.9272727272727272, - "grad_norm": 3.7936673164367676, - "learning_rate": 3.460606060606061e-05, - "loss": 4.7375, + "epoch": 0.9205776173285198, + "grad_norm": 2.7489020824432373, + "learning_rate": 3.471720818291215e-05, + "loss": 4.6758, "step": 255 }, { - "epoch": 0.9309090909090909, - "grad_norm": 3.784524440765381, - "learning_rate": 3.454545454545455e-05, - "loss": 4.7575, + "epoch": 0.924187725631769, + "grad_norm": 3.4038209915161133, + "learning_rate": 3.4657039711191336e-05, + "loss": 4.596, "step": 256 }, { - "epoch": 0.9345454545454546, - "grad_norm": 2.5060670375823975, - "learning_rate": 3.4484848484848486e-05, - "loss": 4.7153, + "epoch": 0.927797833935018, + "grad_norm": 3.105119228363037, + "learning_rate": 3.459687123947052e-05, + "loss": 4.883, "step": 257 }, { - "epoch": 0.9381818181818182, - "grad_norm": 2.5724377632141113, - "learning_rate": 3.4424242424242425e-05, - "loss": 4.8637, + "epoch": 0.9314079422382672, + "grad_norm": 6.055860996246338, + "learning_rate": 3.4536702767749704e-05, + "loss": 4.6512, "step": 258 }, { - "epoch": 0.9418181818181818, - "grad_norm": 3.9936795234680176, - "learning_rate": 3.436363636363636e-05, - "loss": 4.9075, + "epoch": 0.9350180505415162, + "grad_norm": 2.115447521209717, + "learning_rate": 3.447653429602888e-05, + "loss": 4.5428, "step": 259 }, { - "epoch": 0.9454545454545454, - "grad_norm": 1.9491581916809082, - "learning_rate": 3.43030303030303e-05, - "loss": 4.8538, + "epoch": 0.9386281588447654, + "grad_norm": 6.520136833190918, + "learning_rate": 3.4416365824308065e-05, + "loss": 4.9671, "step": 260 }, { - "epoch": 0.9490909090909091, - "grad_norm": 1.8183727264404297, - "learning_rate": 3.424242424242424e-05, - "loss": 4.7095, + "epoch": 0.9422382671480144, + "grad_norm": 5.740654945373535, + "learning_rate": 3.435619735258724e-05, + "loss": 4.9137, "step": 261 }, { - "epoch": 0.9527272727272728, - "grad_norm": 2.8100154399871826, - "learning_rate": 3.4181818181818185e-05, - "loss": 4.7456, + "epoch": 0.9458483754512635, + "grad_norm": 2.9576783180236816, + "learning_rate": 3.4296028880866426e-05, + "loss": 4.8369, "step": 262 }, { - "epoch": 0.9563636363636364, - "grad_norm": 2.25604248046875, - "learning_rate": 3.412121212121212e-05, - "loss": 4.5881, + "epoch": 0.9494584837545126, + "grad_norm": 2.0835630893707275, + "learning_rate": 3.423586040914561e-05, + "loss": 4.6395, "step": 263 }, { - "epoch": 0.96, - "grad_norm": 3.7915985584259033, - "learning_rate": 3.406060606060606e-05, - "loss": 4.6032, + "epoch": 0.9530685920577617, + "grad_norm": 3.2203140258789062, + "learning_rate": 3.4175691937424794e-05, + "loss": 4.7584, "step": 264 }, { - "epoch": 0.9636363636363636, - "grad_norm": 4.647451400756836, - "learning_rate": 3.4000000000000007e-05, - "loss": 4.9148, + "epoch": 0.9566787003610109, + "grad_norm": 3.108306646347046, + "learning_rate": 3.411552346570397e-05, + "loss": 4.8148, "step": 265 }, { - "epoch": 0.9672727272727273, - "grad_norm": 4.613765239715576, - "learning_rate": 3.3939393939393945e-05, - "loss": 4.6888, + "epoch": 0.9602888086642599, + "grad_norm": 4.311677932739258, + "learning_rate": 3.4055354993983155e-05, + "loss": 5.0121, "step": 266 }, { - "epoch": 0.9709090909090909, - "grad_norm": 2.291818857192993, - "learning_rate": 3.387878787878788e-05, - "loss": 4.8835, + "epoch": 0.9638989169675091, + "grad_norm": 2.4522058963775635, + "learning_rate": 3.399518652226233e-05, + "loss": 4.6426, "step": 267 }, { - "epoch": 0.9745454545454545, - "grad_norm": 2.959836483001709, - "learning_rate": 3.381818181818182e-05, - "loss": 4.781, + "epoch": 0.9675090252707581, + "grad_norm": 6.1543426513671875, + "learning_rate": 3.3935018050541516e-05, + "loss": 4.7737, "step": 268 }, { - "epoch": 0.9781818181818182, - "grad_norm": 2.353412389755249, - "learning_rate": 3.375757575757576e-05, - "loss": 4.6777, + "epoch": 0.9711191335740073, + "grad_norm": 6.448587417602539, + "learning_rate": 3.38748495788207e-05, + "loss": 4.9805, "step": 269 }, { - "epoch": 0.9818181818181818, - "grad_norm": 2.019970178604126, - "learning_rate": 3.36969696969697e-05, - "loss": 4.6844, + "epoch": 0.9747292418772563, + "grad_norm": 3.016731023788452, + "learning_rate": 3.3814681107099884e-05, + "loss": 4.701, "step": 270 }, { - "epoch": 0.9854545454545455, - "grad_norm": 3.597709894180298, - "learning_rate": 3.3636363636363636e-05, - "loss": 4.8324, + "epoch": 0.9783393501805054, + "grad_norm": 4.361660003662109, + "learning_rate": 3.375451263537907e-05, + "loss": 4.5368, "step": 271 }, { - "epoch": 0.9890909090909091, - "grad_norm": 1.8851228952407837, - "learning_rate": 3.3575757575757575e-05, - "loss": 4.7522, + "epoch": 0.9819494584837545, + "grad_norm": 17.387479782104492, + "learning_rate": 3.3694344163658245e-05, + "loss": 4.6907, "step": 272 }, { - "epoch": 0.9927272727272727, - "grad_norm": 2.674484968185425, - "learning_rate": 3.351515151515151e-05, - "loss": 4.6662, + "epoch": 0.9855595667870036, + "grad_norm": 3.6502737998962402, + "learning_rate": 3.363417569193742e-05, + "loss": 4.6492, "step": 273 }, { - "epoch": 0.9963636363636363, - "grad_norm": 4.3509840965271, - "learning_rate": 3.345454545454546e-05, - "loss": 4.7466, + "epoch": 0.9891696750902527, + "grad_norm": 3.673269510269165, + "learning_rate": 3.3574007220216606e-05, + "loss": 4.8245, "step": 274 }, { - "epoch": 1.0, - "grad_norm": 3.465963363647461, - "learning_rate": 3.3393939393939397e-05, - "loss": 4.9775, + "epoch": 0.9927797833935018, + "grad_norm": 4.932320594787598, + "learning_rate": 3.351383874849579e-05, + "loss": 4.6652, "step": 275 }, { - "epoch": 1.0036363636363637, - "grad_norm": 3.604304313659668, - "learning_rate": 3.3333333333333335e-05, - "loss": 4.6624, + "epoch": 0.9963898916967509, + "grad_norm": 7.902830600738525, + "learning_rate": 3.3453670276774974e-05, + "loss": 4.6427, "step": 276 }, { - "epoch": 1.0072727272727273, - "grad_norm": 3.1077980995178223, - "learning_rate": 3.327272727272727e-05, - "loss": 4.5934, + "epoch": 1.0, + "grad_norm": 5.016904354095459, + "learning_rate": 3.339350180505416e-05, + "loss": 4.6971, "step": 277 }, { - "epoch": 1.010909090909091, - "grad_norm": 2.4643919467926025, - "learning_rate": 3.321212121212121e-05, - "loss": 4.7402, + "epoch": 1.0036101083032491, + "grad_norm": 4.204914569854736, + "learning_rate": 3.3333333333333335e-05, + "loss": 4.6376, "step": 278 }, { - "epoch": 1.0145454545454546, - "grad_norm": 2.7680652141571045, - "learning_rate": 3.315151515151516e-05, - "loss": 4.8989, + "epoch": 1.0072202166064983, + "grad_norm": 2.7867376804351807, + "learning_rate": 3.327316486161252e-05, + "loss": 4.6176, "step": 279 }, { - "epoch": 1.018181818181818, - "grad_norm": 2.3318872451782227, - "learning_rate": 3.3090909090909095e-05, - "loss": 4.7498, + "epoch": 1.0108303249097472, + "grad_norm": 9.475534439086914, + "learning_rate": 3.3212996389891696e-05, + "loss": 4.8113, "step": 280 }, { - "epoch": 1.0218181818181817, - "grad_norm": 2.1118509769439697, - "learning_rate": 3.303030303030303e-05, - "loss": 4.5331, + "epoch": 1.0144404332129964, + "grad_norm": 9.185867309570312, + "learning_rate": 3.315282791817088e-05, + "loss": 4.8703, "step": 281 }, { - "epoch": 1.0254545454545454, - "grad_norm": 3.2147281169891357, - "learning_rate": 3.296969696969697e-05, - "loss": 4.8356, + "epoch": 1.0180505415162455, + "grad_norm": 3.52699875831604, + "learning_rate": 3.3092659446450064e-05, + "loss": 4.8145, "step": 282 }, { - "epoch": 1.029090909090909, - "grad_norm": 2.104762315750122, - "learning_rate": 3.290909090909091e-05, - "loss": 4.6182, + "epoch": 1.0216606498194947, + "grad_norm": 3.920712947845459, + "learning_rate": 3.303249097472924e-05, + "loss": 4.6849, "step": 283 }, { - "epoch": 1.0327272727272727, - "grad_norm": 2.818783760070801, - "learning_rate": 3.284848484848485e-05, - "loss": 4.8216, + "epoch": 1.0252707581227436, + "grad_norm": 4.423035144805908, + "learning_rate": 3.2972322503008425e-05, + "loss": 4.5551, "step": 284 }, { - "epoch": 1.0363636363636364, - "grad_norm": 2.765343427658081, - "learning_rate": 3.278787878787879e-05, - "loss": 4.8029, + "epoch": 1.0288808664259927, + "grad_norm": 3.3100874423980713, + "learning_rate": 3.291215403128761e-05, + "loss": 4.8097, "step": 285 }, { - "epoch": 1.04, - "grad_norm": 1.492432713508606, - "learning_rate": 3.272727272727273e-05, - "loss": 4.7011, + "epoch": 1.032490974729242, + "grad_norm": 6.985757350921631, + "learning_rate": 3.2851985559566786e-05, + "loss": 4.7497, "step": 286 }, { - "epoch": 1.0436363636363637, - "grad_norm": 2.7354354858398438, - "learning_rate": 3.266666666666667e-05, - "loss": 4.5074, + "epoch": 1.036101083032491, + "grad_norm": 5.6586222648620605, + "learning_rate": 3.279181708784597e-05, + "loss": 4.7276, "step": 287 }, { - "epoch": 1.0472727272727274, - "grad_norm": 3.9838192462921143, - "learning_rate": 3.260606060606061e-05, - "loss": 4.7455, + "epoch": 1.03971119133574, + "grad_norm": 3.2179510593414307, + "learning_rate": 3.2731648616125154e-05, + "loss": 4.6136, "step": 288 }, { - "epoch": 1.050909090909091, - "grad_norm": 3.0941343307495117, - "learning_rate": 3.254545454545455e-05, - "loss": 4.5761, + "epoch": 1.0433212996389891, + "grad_norm": 3.0860912799835205, + "learning_rate": 3.267148014440433e-05, + "loss": 4.7697, "step": 289 }, { - "epoch": 1.0545454545454545, - "grad_norm": 2.4687376022338867, - "learning_rate": 3.2484848484848485e-05, - "loss": 4.5997, + "epoch": 1.0469314079422383, + "grad_norm": 2.0248677730560303, + "learning_rate": 3.2611311672683515e-05, + "loss": 4.8089, "step": 290 }, { - "epoch": 1.0581818181818181, - "grad_norm": 3.4283993244171143, - "learning_rate": 3.2424242424242423e-05, - "loss": 4.6079, + "epoch": 1.0505415162454874, + "grad_norm": 3.7353522777557373, + "learning_rate": 3.25511432009627e-05, + "loss": 4.6567, "step": 291 }, { - "epoch": 1.0618181818181818, - "grad_norm": 2.7953782081604004, - "learning_rate": 3.236363636363636e-05, - "loss": 4.6912, + "epoch": 1.0541516245487366, + "grad_norm": 2.9157674312591553, + "learning_rate": 3.249097472924188e-05, + "loss": 4.5186, "step": 292 }, { - "epoch": 1.0654545454545454, - "grad_norm": 2.2899701595306396, - "learning_rate": 3.230303030303031e-05, - "loss": 4.6359, + "epoch": 1.0577617328519855, + "grad_norm": 3.010720729827881, + "learning_rate": 3.243080625752106e-05, + "loss": 4.9587, "step": 293 }, { - "epoch": 1.069090909090909, - "grad_norm": 3.88470721244812, - "learning_rate": 3.2242424242424245e-05, - "loss": 4.7514, + "epoch": 1.0613718411552346, + "grad_norm": 2.433742046356201, + "learning_rate": 3.237063778580024e-05, + "loss": 4.5569, "step": 294 }, { - "epoch": 1.0727272727272728, - "grad_norm": 2.5786900520324707, - "learning_rate": 3.2181818181818184e-05, - "loss": 4.5935, + "epoch": 1.0649819494584838, + "grad_norm": 2.5127952098846436, + "learning_rate": 3.231046931407942e-05, + "loss": 4.6183, "step": 295 }, { - "epoch": 1.0763636363636364, - "grad_norm": 2.5727601051330566, - "learning_rate": 3.212121212121212e-05, - "loss": 4.6504, + "epoch": 1.068592057761733, + "grad_norm": 2.83199143409729, + "learning_rate": 3.2250300842358605e-05, + "loss": 4.4713, "step": 296 }, { - "epoch": 1.08, - "grad_norm": 2.643254280090332, - "learning_rate": 3.206060606060606e-05, - "loss": 4.7769, + "epoch": 1.0722021660649819, + "grad_norm": 3.9529902935028076, + "learning_rate": 3.219013237063779e-05, + "loss": 4.7303, "step": 297 }, { - "epoch": 1.0836363636363637, - "grad_norm": 2.8821048736572266, - "learning_rate": 3.2000000000000005e-05, - "loss": 4.7027, + "epoch": 1.075812274368231, + "grad_norm": 3.1393637657165527, + "learning_rate": 3.212996389891697e-05, + "loss": 4.8081, "step": 298 }, { - "epoch": 1.0872727272727274, - "grad_norm": 4.047679424285889, - "learning_rate": 3.1939393939393944e-05, - "loss": 4.8341, + "epoch": 1.0794223826714802, + "grad_norm": 2.7979230880737305, + "learning_rate": 3.2069795427196156e-05, + "loss": 4.6623, "step": 299 }, { - "epoch": 1.0909090909090908, - "grad_norm": 3.6683619022369385, - "learning_rate": 3.187878787878788e-05, - "loss": 4.7663, + "epoch": 1.0830324909747293, + "grad_norm": 5.9944024085998535, + "learning_rate": 3.2009626955475333e-05, + "loss": 4.7301, "step": 300 }, { - "epoch": 1.0945454545454545, - "grad_norm": 5.787856578826904, - "learning_rate": 3.181818181818182e-05, - "loss": 4.697, + "epoch": 1.0866425992779782, + "grad_norm": 4.5418901443481445, + "learning_rate": 3.194945848375451e-05, + "loss": 4.5683, "step": 301 }, { - "epoch": 1.0981818181818181, - "grad_norm": 7.743192672729492, - "learning_rate": 3.175757575757576e-05, - "loss": 4.7046, + "epoch": 1.0902527075812274, + "grad_norm": 6.022171497344971, + "learning_rate": 3.1889290012033695e-05, + "loss": 4.8128, "step": 302 }, { - "epoch": 1.1018181818181818, - "grad_norm": 3.313638925552368, - "learning_rate": 3.16969696969697e-05, - "loss": 4.6653, + "epoch": 1.0938628158844765, + "grad_norm": 4.565722465515137, + "learning_rate": 3.182912154031288e-05, + "loss": 4.5517, "step": 303 }, { - "epoch": 1.1054545454545455, - "grad_norm": 4.846319198608398, - "learning_rate": 3.1636363636363635e-05, - "loss": 4.6066, + "epoch": 1.0974729241877257, + "grad_norm": 5.111222267150879, + "learning_rate": 3.176895306859206e-05, + "loss": 4.5883, "step": 304 }, { - "epoch": 1.1090909090909091, - "grad_norm": 4.69194221496582, - "learning_rate": 3.1575757575757574e-05, - "loss": 4.7325, + "epoch": 1.1010830324909748, + "grad_norm": 3.97930645942688, + "learning_rate": 3.170878459687124e-05, + "loss": 4.7255, "step": 305 }, { - "epoch": 1.1127272727272728, - "grad_norm": 3.353827476501465, - "learning_rate": 3.151515151515151e-05, - "loss": 4.6744, + "epoch": 1.1046931407942238, + "grad_norm": 3.147520065307617, + "learning_rate": 3.164861612515042e-05, + "loss": 4.5757, "step": 306 }, { - "epoch": 1.1163636363636364, - "grad_norm": 3.623960256576538, - "learning_rate": 3.145454545454546e-05, - "loss": 4.8799, + "epoch": 1.108303249097473, + "grad_norm": 5.289313793182373, + "learning_rate": 3.15884476534296e-05, + "loss": 4.7255, "step": 307 }, { - "epoch": 1.12, - "grad_norm": 4.136066436767578, - "learning_rate": 3.1393939393939395e-05, - "loss": 4.715, + "epoch": 1.111913357400722, + "grad_norm": 4.879770278930664, + "learning_rate": 3.1528279181708784e-05, + "loss": 4.9087, "step": 308 }, { - "epoch": 1.1236363636363635, - "grad_norm": 4.029481410980225, - "learning_rate": 3.1333333333333334e-05, - "loss": 4.6084, + "epoch": 1.1155234657039712, + "grad_norm": 3.533722162246704, + "learning_rate": 3.146811070998797e-05, + "loss": 4.4174, "step": 309 }, { - "epoch": 1.1272727272727272, - "grad_norm": 4.471688270568848, - "learning_rate": 3.127272727272728e-05, - "loss": 4.5595, + "epoch": 1.1191335740072201, + "grad_norm": 2.7764930725097656, + "learning_rate": 3.140794223826715e-05, + "loss": 4.634, "step": 310 }, { - "epoch": 1.1309090909090909, - "grad_norm": 2.3349573612213135, - "learning_rate": 3.121212121212122e-05, - "loss": 4.5042, + "epoch": 1.1227436823104693, + "grad_norm": 3.450363874435425, + "learning_rate": 3.134777376654633e-05, + "loss": 4.5882, "step": 311 }, { - "epoch": 1.1345454545454545, - "grad_norm": 3.2584612369537354, - "learning_rate": 3.1151515151515156e-05, - "loss": 4.5667, + "epoch": 1.1263537906137184, + "grad_norm": 2.2029852867126465, + "learning_rate": 3.128760529482551e-05, + "loss": 4.6923, "step": 312 }, { - "epoch": 1.1381818181818182, - "grad_norm": 4.187669277191162, - "learning_rate": 3.1090909090909094e-05, - "loss": 4.591, + "epoch": 1.1299638989169676, + "grad_norm": 13.167424201965332, + "learning_rate": 3.12274368231047e-05, + "loss": 4.6935, "step": 313 }, { - "epoch": 1.1418181818181818, - "grad_norm": 2.7688863277435303, - "learning_rate": 3.103030303030303e-05, - "loss": 4.6028, + "epoch": 1.1335740072202167, + "grad_norm": 7.503760814666748, + "learning_rate": 3.1167268351383874e-05, + "loss": 4.6496, "step": 314 }, { - "epoch": 1.1454545454545455, - "grad_norm": 2.8685548305511475, - "learning_rate": 3.096969696969697e-05, - "loss": 4.8637, + "epoch": 1.1371841155234657, + "grad_norm": 4.865889549255371, + "learning_rate": 3.110709987966306e-05, + "loss": 4.5744, "step": 315 }, { - "epoch": 1.1490909090909092, - "grad_norm": 3.441349506378174, - "learning_rate": 3.090909090909091e-05, - "loss": 4.3877, + "epoch": 1.1407942238267148, + "grad_norm": 3.753988742828369, + "learning_rate": 3.104693140794224e-05, + "loss": 4.534, "step": 316 }, { - "epoch": 1.1527272727272728, - "grad_norm": 2.61682391166687, - "learning_rate": 3.084848484848485e-05, - "loss": 4.5628, + "epoch": 1.144404332129964, + "grad_norm": 4.719662666320801, + "learning_rate": 3.098676293622142e-05, + "loss": 4.5527, "step": 317 }, { - "epoch": 1.1563636363636363, - "grad_norm": 2.719127655029297, - "learning_rate": 3.0787878787878786e-05, - "loss": 4.6051, + "epoch": 1.1480144404332129, + "grad_norm": 4.489358425140381, + "learning_rate": 3.09265944645006e-05, + "loss": 4.6453, "step": 318 }, { - "epoch": 1.16, - "grad_norm": 3.737633466720581, - "learning_rate": 3.0727272727272724e-05, - "loss": 4.4883, + "epoch": 1.151624548736462, + "grad_norm": 2.6681582927703857, + "learning_rate": 3.086642599277979e-05, + "loss": 4.5538, "step": 319 }, { - "epoch": 1.1636363636363636, - "grad_norm": 2.2933197021484375, - "learning_rate": 3.066666666666667e-05, - "loss": 4.5372, + "epoch": 1.1552346570397112, + "grad_norm": 2.0293869972229004, + "learning_rate": 3.080625752105897e-05, + "loss": 4.5606, "step": 320 }, { - "epoch": 1.1672727272727272, - "grad_norm": 2.3944780826568604, - "learning_rate": 3.060606060606061e-05, - "loss": 4.6886, + "epoch": 1.1588447653429603, + "grad_norm": 15.385830879211426, + "learning_rate": 3.074608904933815e-05, + "loss": 4.8805, "step": 321 }, { - "epoch": 1.170909090909091, - "grad_norm": 2.4027483463287354, - "learning_rate": 3.054545454545455e-05, - "loss": 4.5962, + "epoch": 1.1624548736462095, + "grad_norm": 17.79530143737793, + "learning_rate": 3.0685920577617325e-05, + "loss": 4.5318, "step": 322 }, { - "epoch": 1.1745454545454546, - "grad_norm": 2.160446882247925, - "learning_rate": 3.0484848484848487e-05, - "loss": 4.7377, + "epoch": 1.1660649819494584, + "grad_norm": 9.239445686340332, + "learning_rate": 3.062575210589651e-05, + "loss": 4.5567, "step": 323 }, { - "epoch": 1.1781818181818182, - "grad_norm": 2.3969006538391113, - "learning_rate": 3.0424242424242426e-05, - "loss": 4.5241, + "epoch": 1.1696750902527075, + "grad_norm": 4.8387322425842285, + "learning_rate": 3.056558363417569e-05, + "loss": 4.6446, "step": 324 }, { - "epoch": 1.1818181818181819, - "grad_norm": 1.7160532474517822, - "learning_rate": 3.0363636363636367e-05, - "loss": 4.5891, + "epoch": 1.1732851985559567, + "grad_norm": 5.810051441192627, + "learning_rate": 3.0505415162454877e-05, + "loss": 4.5906, "step": 325 }, { - "epoch": 1.1854545454545455, - "grad_norm": 2.991136312484741, - "learning_rate": 3.0303030303030306e-05, - "loss": 4.5969, + "epoch": 1.1768953068592058, + "grad_norm": 6.55676794052124, + "learning_rate": 3.0445246690734057e-05, + "loss": 4.6755, "step": 326 }, { - "epoch": 1.189090909090909, - "grad_norm": 2.5131070613861084, - "learning_rate": 3.0242424242424244e-05, - "loss": 4.5446, + "epoch": 1.1805054151624548, + "grad_norm": 4.300464153289795, + "learning_rate": 3.038507821901324e-05, + "loss": 4.6347, "step": 327 }, { - "epoch": 1.1927272727272726, - "grad_norm": 2.6630470752716064, - "learning_rate": 3.0181818181818182e-05, - "loss": 4.7089, + "epoch": 1.184115523465704, + "grad_norm": 4.3174519538879395, + "learning_rate": 3.032490974729242e-05, + "loss": 4.7603, "step": 328 }, { - "epoch": 1.1963636363636363, - "grad_norm": 4.355704307556152, - "learning_rate": 3.012121212121212e-05, - "loss": 4.4316, + "epoch": 1.187725631768953, + "grad_norm": 5.036055564880371, + "learning_rate": 3.0264741275571602e-05, + "loss": 4.7223, "step": 329 }, { - "epoch": 1.2, - "grad_norm": 2.5602145195007324, - "learning_rate": 3.0060606060606062e-05, - "loss": 4.5123, + "epoch": 1.1913357400722022, + "grad_norm": 4.322726726531982, + "learning_rate": 3.0204572803850783e-05, + "loss": 4.6241, "step": 330 }, { - "epoch": 1.2036363636363636, - "grad_norm": 2.802565574645996, - "learning_rate": 3e-05, - "loss": 4.4404, + "epoch": 1.1949458483754514, + "grad_norm": 3.5698063373565674, + "learning_rate": 3.0144404332129967e-05, + "loss": 4.6906, "step": 331 }, { - "epoch": 1.2072727272727273, - "grad_norm": 1.707558274269104, - "learning_rate": 2.993939393939394e-05, - "loss": 4.5435, + "epoch": 1.1985559566787003, + "grad_norm": 4.076971530914307, + "learning_rate": 3.0084235860409147e-05, + "loss": 4.5428, "step": 332 }, { - "epoch": 1.210909090909091, - "grad_norm": 3.6090481281280518, - "learning_rate": 2.9878787878787877e-05, - "loss": 4.5652, + "epoch": 1.2021660649819494, + "grad_norm": 4.875138759613037, + "learning_rate": 3.0024067388688324e-05, + "loss": 4.7874, "step": 333 }, { - "epoch": 1.2145454545454546, - "grad_norm": 2.4650418758392334, - "learning_rate": 2.9818181818181816e-05, - "loss": 4.5316, + "epoch": 1.2057761732851986, + "grad_norm": 3.3777639865875244, + "learning_rate": 2.996389891696751e-05, + "loss": 4.4553, "step": 334 }, { - "epoch": 1.2181818181818183, - "grad_norm": 4.170000076293945, - "learning_rate": 2.9757575757575757e-05, - "loss": 4.5409, + "epoch": 1.2093862815884477, + "grad_norm": 8.022859573364258, + "learning_rate": 2.9903730445246692e-05, + "loss": 4.5464, "step": 335 }, { - "epoch": 1.221818181818182, - "grad_norm": 3.2201592922210693, - "learning_rate": 2.96969696969697e-05, - "loss": 4.5562, + "epoch": 1.2129963898916967, + "grad_norm": 4.7307047843933105, + "learning_rate": 2.9843561973525873e-05, + "loss": 4.4503, "step": 336 }, { - "epoch": 1.2254545454545456, - "grad_norm": 2.6030731201171875, - "learning_rate": 2.963636363636364e-05, - "loss": 4.5289, + "epoch": 1.2166064981949458, + "grad_norm": 2.6915078163146973, + "learning_rate": 2.9783393501805057e-05, + "loss": 4.76, "step": 337 }, { - "epoch": 1.229090909090909, - "grad_norm": 5.5631513595581055, - "learning_rate": 2.957575757575758e-05, - "loss": 4.5547, + "epoch": 1.220216606498195, + "grad_norm": 4.561704158782959, + "learning_rate": 2.972322503008424e-05, + "loss": 4.8931, "step": 338 }, { - "epoch": 1.2327272727272727, - "grad_norm": 5.190867900848389, - "learning_rate": 2.9515151515151518e-05, - "loss": 4.4717, + "epoch": 1.2238267148014441, + "grad_norm": 3.5802175998687744, + "learning_rate": 2.9663056558363418e-05, + "loss": 4.7736, "step": 339 }, { - "epoch": 1.2363636363636363, - "grad_norm": 4.258591651916504, - "learning_rate": 2.9454545454545456e-05, - "loss": 4.5684, + "epoch": 1.2274368231046933, + "grad_norm": 2.2672626972198486, + "learning_rate": 2.9602888086642598e-05, + "loss": 4.6361, "step": 340 }, { - "epoch": 1.24, - "grad_norm": 6.548985004425049, - "learning_rate": 2.9393939393939394e-05, - "loss": 4.4463, + "epoch": 1.2310469314079422, + "grad_norm": 3.247804641723633, + "learning_rate": 2.9542719614921782e-05, + "loss": 4.5714, "step": 341 }, { - "epoch": 1.2436363636363637, - "grad_norm": 2.304765462875366, - "learning_rate": 2.9333333333333336e-05, - "loss": 4.514, + "epoch": 1.2346570397111913, + "grad_norm": 2.9589133262634277, + "learning_rate": 2.9482551143200966e-05, + "loss": 4.6753, "step": 342 }, { - "epoch": 1.2472727272727273, - "grad_norm": 4.920679569244385, - "learning_rate": 2.9272727272727274e-05, - "loss": 4.6245, + "epoch": 1.2382671480144405, + "grad_norm": 3.426271677017212, + "learning_rate": 2.9422382671480147e-05, + "loss": 4.5655, "step": 343 }, { - "epoch": 1.250909090909091, - "grad_norm": 4.364790916442871, - "learning_rate": 2.9212121212121213e-05, - "loss": 4.377, + "epoch": 1.2418772563176894, + "grad_norm": 3.625859498977661, + "learning_rate": 2.9362214199759324e-05, + "loss": 4.5216, "step": 344 }, { - "epoch": 1.2545454545454544, - "grad_norm": 2.6548376083374023, - "learning_rate": 2.915151515151515e-05, - "loss": 4.673, + "epoch": 1.2454873646209386, + "grad_norm": 2.466961145401001, + "learning_rate": 2.9302045728038508e-05, + "loss": 4.544, "step": 345 }, { - "epoch": 1.2581818181818183, - "grad_norm": 4.800558567047119, - "learning_rate": 2.909090909090909e-05, - "loss": 4.6142, + "epoch": 1.2490974729241877, + "grad_norm": 3.262009859085083, + "learning_rate": 2.924187725631769e-05, + "loss": 4.7037, "step": 346 }, { - "epoch": 1.2618181818181817, - "grad_norm": 2.561149835586548, - "learning_rate": 2.903030303030303e-05, - "loss": 4.6248, + "epoch": 1.2527075812274369, + "grad_norm": 2.6443886756896973, + "learning_rate": 2.9181708784596872e-05, + "loss": 4.5689, "step": 347 }, { - "epoch": 1.2654545454545454, - "grad_norm": 2.2318789958953857, - "learning_rate": 2.896969696969697e-05, - "loss": 4.6913, + "epoch": 1.256317689530686, + "grad_norm": 2.4855597019195557, + "learning_rate": 2.9121540312876056e-05, + "loss": 4.6886, "step": 348 }, { - "epoch": 1.269090909090909, - "grad_norm": 3.8998968601226807, - "learning_rate": 2.8909090909090908e-05, - "loss": 4.6961, + "epoch": 1.259927797833935, + "grad_norm": 2.964390754699707, + "learning_rate": 2.906137184115524e-05, + "loss": 4.5719, "step": 349 }, { - "epoch": 1.2727272727272727, - "grad_norm": 3.6434195041656494, - "learning_rate": 2.8848484848484853e-05, - "loss": 4.6966, + "epoch": 1.263537906137184, + "grad_norm": 2.4611830711364746, + "learning_rate": 2.9001203369434417e-05, + "loss": 4.6705, "step": 350 }, { - "epoch": 1.2763636363636364, - "grad_norm": 2.5094985961914062, - "learning_rate": 2.878787878787879e-05, - "loss": 4.6748, + "epoch": 1.2671480144404332, + "grad_norm": 3.128156900405884, + "learning_rate": 2.8941034897713598e-05, + "loss": 4.6805, "step": 351 }, { - "epoch": 1.28, - "grad_norm": 5.040907382965088, - "learning_rate": 2.872727272727273e-05, - "loss": 4.4934, + "epoch": 1.2707581227436824, + "grad_norm": 2.996750831604004, + "learning_rate": 2.888086642599278e-05, + "loss": 4.5068, "step": 352 }, { - "epoch": 1.2836363636363637, - "grad_norm": 4.69912576675415, - "learning_rate": 2.8666666666666668e-05, - "loss": 4.4348, + "epoch": 1.2743682310469313, + "grad_norm": 3.2886064052581787, + "learning_rate": 2.8820697954271962e-05, + "loss": 4.4811, "step": 353 }, { - "epoch": 1.2872727272727273, - "grad_norm": 2.9167287349700928, - "learning_rate": 2.860606060606061e-05, - "loss": 4.4813, + "epoch": 1.2779783393501805, + "grad_norm": 2.363309144973755, + "learning_rate": 2.8760529482551146e-05, + "loss": 4.4984, "step": 354 }, { - "epoch": 1.290909090909091, - "grad_norm": 2.7324609756469727, - "learning_rate": 2.8545454545454548e-05, - "loss": 4.5087, + "epoch": 1.2815884476534296, + "grad_norm": 5.747439861297607, + "learning_rate": 2.870036101083033e-05, + "loss": 4.5855, "step": 355 }, { - "epoch": 1.2945454545454544, - "grad_norm": 4.106588363647461, - "learning_rate": 2.8484848484848486e-05, - "loss": 4.8083, + "epoch": 1.2851985559566788, + "grad_norm": 2.802114486694336, + "learning_rate": 2.8640192539109507e-05, + "loss": 4.5466, "step": 356 }, { - "epoch": 1.298181818181818, - "grad_norm": 3.12602162361145, - "learning_rate": 2.8424242424242424e-05, - "loss": 4.5605, + "epoch": 1.288808664259928, + "grad_norm": 3.1506760120391846, + "learning_rate": 2.8580024067388687e-05, + "loss": 4.4259, "step": 357 }, { - "epoch": 1.3018181818181818, - "grad_norm": 3.1429829597473145, - "learning_rate": 2.8363636363636363e-05, - "loss": 4.6312, + "epoch": 1.2924187725631768, + "grad_norm": 6.545141220092773, + "learning_rate": 2.851985559566787e-05, + "loss": 4.4247, "step": 358 }, { - "epoch": 1.3054545454545454, - "grad_norm": 3.179574966430664, - "learning_rate": 2.8303030303030305e-05, - "loss": 4.6001, + "epoch": 1.296028880866426, + "grad_norm": 11.535502433776855, + "learning_rate": 2.8459687123947055e-05, + "loss": 4.5342, "step": 359 }, { - "epoch": 1.309090909090909, - "grad_norm": 3.6335721015930176, - "learning_rate": 2.8242424242424243e-05, - "loss": 4.4829, + "epoch": 1.2996389891696751, + "grad_norm": 6.203153133392334, + "learning_rate": 2.8399518652226236e-05, + "loss": 4.5254, "step": 360 }, { - "epoch": 1.3127272727272727, - "grad_norm": 3.5006937980651855, - "learning_rate": 2.818181818181818e-05, - "loss": 4.5427, + "epoch": 1.303249097472924, + "grad_norm": 3.3538057804107666, + "learning_rate": 2.8339350180505413e-05, + "loss": 4.5488, "step": 361 }, { - "epoch": 1.3163636363636364, - "grad_norm": 5.941033363342285, - "learning_rate": 2.812121212121212e-05, - "loss": 4.8228, + "epoch": 1.3068592057761732, + "grad_norm": 7.0459675788879395, + "learning_rate": 2.8279181708784597e-05, + "loss": 4.8362, "step": 362 }, { - "epoch": 1.32, - "grad_norm": 3.8232581615448, - "learning_rate": 2.8060606060606058e-05, - "loss": 4.6344, + "epoch": 1.3104693140794224, + "grad_norm": 6.513792037963867, + "learning_rate": 2.821901323706378e-05, + "loss": 4.5027, "step": 363 }, { - "epoch": 1.3236363636363637, - "grad_norm": 3.8346667289733887, - "learning_rate": 2.8000000000000003e-05, - "loss": 4.5342, + "epoch": 1.3140794223826715, + "grad_norm": 4.375041484832764, + "learning_rate": 2.815884476534296e-05, + "loss": 4.5529, "step": 364 }, { - "epoch": 1.3272727272727272, - "grad_norm": 3.590717315673828, - "learning_rate": 2.7939393939393945e-05, - "loss": 4.4932, + "epoch": 1.3176895306859207, + "grad_norm": 2.8317453861236572, + "learning_rate": 2.8098676293622145e-05, + "loss": 4.5939, "step": 365 }, { - "epoch": 1.330909090909091, - "grad_norm": 5.545670509338379, - "learning_rate": 2.7878787878787883e-05, - "loss": 4.6162, + "epoch": 1.3212996389891698, + "grad_norm": 5.5963640213012695, + "learning_rate": 2.803850782190133e-05, + "loss": 4.6253, "step": 366 }, { - "epoch": 1.3345454545454545, - "grad_norm": 2.9127771854400635, - "learning_rate": 2.781818181818182e-05, - "loss": 4.3389, + "epoch": 1.3249097472924187, + "grad_norm": 4.876716136932373, + "learning_rate": 2.7978339350180506e-05, + "loss": 4.5127, "step": 367 }, { - "epoch": 1.3381818181818181, - "grad_norm": 5.067335605621338, - "learning_rate": 2.775757575757576e-05, - "loss": 4.7217, + "epoch": 1.3285198555956679, + "grad_norm": 3.072681188583374, + "learning_rate": 2.7918170878459687e-05, + "loss": 4.756, "step": 368 }, { - "epoch": 1.3418181818181818, - "grad_norm": 6.718608856201172, - "learning_rate": 2.7696969696969698e-05, - "loss": 4.5497, + "epoch": 1.332129963898917, + "grad_norm": 2.941751480102539, + "learning_rate": 2.785800240673887e-05, + "loss": 4.694, "step": 369 }, { - "epoch": 1.3454545454545455, - "grad_norm": 3.0536184310913086, - "learning_rate": 2.7636363636363636e-05, - "loss": 4.48, + "epoch": 1.335740072202166, + "grad_norm": 5.169521331787109, + "learning_rate": 2.779783393501805e-05, + "loss": 4.5207, "step": 370 }, { - "epoch": 1.3490909090909091, - "grad_norm": 3.9842278957366943, - "learning_rate": 2.7575757575757578e-05, - "loss": 4.6911, + "epoch": 1.339350180505415, + "grad_norm": 4.792356491088867, + "learning_rate": 2.7737665463297235e-05, + "loss": 4.5512, "step": 371 }, { - "epoch": 1.3527272727272728, - "grad_norm": 4.863982677459717, - "learning_rate": 2.7515151515151516e-05, - "loss": 4.692, + "epoch": 1.3429602888086642, + "grad_norm": 6.515016555786133, + "learning_rate": 2.7677496991576412e-05, + "loss": 4.8404, "step": 372 }, { - "epoch": 1.3563636363636364, - "grad_norm": 3.385343551635742, - "learning_rate": 2.7454545454545455e-05, - "loss": 4.6677, + "epoch": 1.3465703971119134, + "grad_norm": 4.51361608505249, + "learning_rate": 2.7617328519855596e-05, + "loss": 4.3583, "step": 373 }, { - "epoch": 1.3599999999999999, - "grad_norm": 4.236108779907227, - "learning_rate": 2.7393939393939393e-05, - "loss": 4.5584, + "epoch": 1.3501805054151625, + "grad_norm": 3.141707181930542, + "learning_rate": 2.7557160048134777e-05, + "loss": 4.548, "step": 374 }, { - "epoch": 1.3636363636363638, - "grad_norm": 4.658186435699463, - "learning_rate": 2.733333333333333e-05, - "loss": 4.7851, + "epoch": 1.3537906137184115, + "grad_norm": 2.3701894283294678, + "learning_rate": 2.749699157641396e-05, + "loss": 4.5083, "step": 375 }, { - "epoch": 1.3672727272727272, - "grad_norm": 2.5289180278778076, - "learning_rate": 2.7272727272727273e-05, - "loss": 4.6179, + "epoch": 1.3574007220216606, + "grad_norm": 6.5997772216796875, + "learning_rate": 2.7436823104693144e-05, + "loss": 4.4702, "step": 376 }, { - "epoch": 1.3709090909090909, - "grad_norm": 3.179882049560547, - "learning_rate": 2.721212121212121e-05, - "loss": 4.5029, + "epoch": 1.3610108303249098, + "grad_norm": 6.290494441986084, + "learning_rate": 2.7376654632972325e-05, + "loss": 4.5668, "step": 377 }, { - "epoch": 1.3745454545454545, - "grad_norm": 3.1706180572509766, - "learning_rate": 2.7151515151515157e-05, - "loss": 4.4334, + "epoch": 1.364620938628159, + "grad_norm": 2.408001661300659, + "learning_rate": 2.7316486161251502e-05, + "loss": 4.7025, "step": 378 }, { - "epoch": 1.3781818181818182, - "grad_norm": 3.649819850921631, - "learning_rate": 2.7090909090909095e-05, - "loss": 4.3529, + "epoch": 1.3682310469314078, + "grad_norm": 4.783012390136719, + "learning_rate": 2.7256317689530686e-05, + "loss": 4.4451, "step": 379 }, { - "epoch": 1.3818181818181818, - "grad_norm": 4.352628231048584, - "learning_rate": 2.7030303030303033e-05, - "loss": 4.6518, + "epoch": 1.371841155234657, + "grad_norm": 5.366835594177246, + "learning_rate": 2.719614921780987e-05, + "loss": 4.6915, "step": 380 }, { - "epoch": 1.3854545454545455, - "grad_norm": 3.875267744064331, - "learning_rate": 2.696969696969697e-05, - "loss": 4.6, + "epoch": 1.3754512635379061, + "grad_norm": 2.9238317012786865, + "learning_rate": 2.713598074608905e-05, + "loss": 4.6786, "step": 381 }, { - "epoch": 1.3890909090909092, - "grad_norm": 7.508423328399658, - "learning_rate": 2.6909090909090913e-05, - "loss": 4.6121, + "epoch": 1.3790613718411553, + "grad_norm": 4.001113414764404, + "learning_rate": 2.7075812274368234e-05, + "loss": 4.5797, "step": 382 }, { - "epoch": 1.3927272727272726, - "grad_norm": 4.307608604431152, - "learning_rate": 2.684848484848485e-05, - "loss": 4.6758, + "epoch": 1.3826714801444044, + "grad_norm": 3.905136823654175, + "learning_rate": 2.7015643802647418e-05, + "loss": 4.5265, "step": 383 }, { - "epoch": 1.3963636363636365, - "grad_norm": 2.5840773582458496, - "learning_rate": 2.678787878787879e-05, - "loss": 4.5082, + "epoch": 1.3862815884476534, + "grad_norm": 3.3477468490600586, + "learning_rate": 2.6955475330926595e-05, + "loss": 4.5869, "step": 384 }, { - "epoch": 1.4, - "grad_norm": 3.530118703842163, - "learning_rate": 2.6727272727272728e-05, - "loss": 4.3333, + "epoch": 1.3898916967509025, + "grad_norm": 2.695615768432617, + "learning_rate": 2.6895306859205776e-05, + "loss": 4.5366, "step": 385 }, { - "epoch": 1.4036363636363636, - "grad_norm": 6.304224491119385, - "learning_rate": 2.6666666666666667e-05, - "loss": 4.6398, + "epoch": 1.3935018050541517, + "grad_norm": 3.9516983032226562, + "learning_rate": 2.683513838748496e-05, + "loss": 4.5686, "step": 386 }, { - "epoch": 1.4072727272727272, - "grad_norm": 3.043519973754883, - "learning_rate": 2.6606060606060605e-05, - "loss": 4.4337, + "epoch": 1.3971119133574006, + "grad_norm": 5.092808723449707, + "learning_rate": 2.6774969915764144e-05, + "loss": 4.4119, "step": 387 }, { - "epoch": 1.410909090909091, - "grad_norm": 5.097027778625488, - "learning_rate": 2.6545454545454547e-05, - "loss": 4.6867, + "epoch": 1.4007220216606497, + "grad_norm": 4.993492126464844, + "learning_rate": 2.6714801444043324e-05, + "loss": 4.7174, "step": 388 }, { - "epoch": 1.4145454545454546, - "grad_norm": 5.522812366485596, - "learning_rate": 2.6484848484848485e-05, - "loss": 4.3685, + "epoch": 1.404332129963899, + "grad_norm": 3.519829750061035, + "learning_rate": 2.66546329723225e-05, + "loss": 4.7413, "step": 389 }, { - "epoch": 1.4181818181818182, - "grad_norm": 3.3053135871887207, - "learning_rate": 2.6424242424242423e-05, - "loss": 4.6564, + "epoch": 1.407942238267148, + "grad_norm": 3.6330387592315674, + "learning_rate": 2.6594464500601685e-05, + "loss": 4.5918, "step": 390 }, { - "epoch": 1.4218181818181819, - "grad_norm": 4.631997585296631, - "learning_rate": 2.636363636363636e-05, - "loss": 4.5055, + "epoch": 1.4115523465703972, + "grad_norm": 4.714821815490723, + "learning_rate": 2.6534296028880866e-05, + "loss": 4.4284, "step": 391 }, { - "epoch": 1.4254545454545455, - "grad_norm": 2.758570671081543, - "learning_rate": 2.63030303030303e-05, - "loss": 4.5271, + "epoch": 1.4151624548736463, + "grad_norm": 5.586686134338379, + "learning_rate": 2.647412755716005e-05, + "loss": 4.6257, "step": 392 }, { - "epoch": 1.4290909090909092, - "grad_norm": 2.726597785949707, - "learning_rate": 2.6242424242424245e-05, - "loss": 4.4968, + "epoch": 1.4187725631768953, + "grad_norm": 4.272150993347168, + "learning_rate": 2.6413959085439234e-05, + "loss": 4.7499, "step": 393 }, { - "epoch": 1.4327272727272726, - "grad_norm": 5.252767086029053, - "learning_rate": 2.6181818181818187e-05, - "loss": 4.8041, + "epoch": 1.4223826714801444, + "grad_norm": 3.671114683151245, + "learning_rate": 2.6353790613718414e-05, + "loss": 4.4255, "step": 394 }, { - "epoch": 1.4363636363636363, - "grad_norm": 3.7906062602996826, - "learning_rate": 2.6121212121212125e-05, - "loss": 4.3581, + "epoch": 1.4259927797833936, + "grad_norm": 3.9677507877349854, + "learning_rate": 2.629362214199759e-05, + "loss": 4.8504, "step": 395 }, { - "epoch": 1.44, - "grad_norm": 5.029653549194336, - "learning_rate": 2.6060606060606063e-05, - "loss": 4.4908, + "epoch": 1.4296028880866425, + "grad_norm": 3.9609200954437256, + "learning_rate": 2.6233453670276775e-05, + "loss": 4.6043, "step": 396 }, { - "epoch": 1.4436363636363636, - "grad_norm": 6.576402187347412, - "learning_rate": 2.6000000000000002e-05, - "loss": 4.4629, + "epoch": 1.4332129963898916, + "grad_norm": 3.998934030532837, + "learning_rate": 2.617328519855596e-05, + "loss": 4.4927, "step": 397 }, { - "epoch": 1.4472727272727273, - "grad_norm": 6.675891399383545, - "learning_rate": 2.593939393939394e-05, - "loss": 4.7604, + "epoch": 1.4368231046931408, + "grad_norm": 3.251746892929077, + "learning_rate": 2.611311672683514e-05, + "loss": 4.4543, "step": 398 }, { - "epoch": 1.450909090909091, - "grad_norm": 4.859799861907959, - "learning_rate": 2.587878787878788e-05, - "loss": 4.431, + "epoch": 1.44043321299639, + "grad_norm": 6.4932379722595215, + "learning_rate": 2.6052948255114323e-05, + "loss": 4.7533, "step": 399 }, { - "epoch": 1.4545454545454546, - "grad_norm": 4.277210712432861, - "learning_rate": 2.581818181818182e-05, - "loss": 4.4418, + "epoch": 1.444043321299639, + "grad_norm": 6.668980121612549, + "learning_rate": 2.59927797833935e-05, + "loss": 4.6553, "step": 400 }, { - "epoch": 1.4581818181818182, - "grad_norm": 3.4903981685638428, - "learning_rate": 2.575757575757576e-05, - "loss": 4.5139, + "epoch": 1.447653429602888, + "grad_norm": 3.2073581218719482, + "learning_rate": 2.5932611311672685e-05, + "loss": 4.711, "step": 401 }, { - "epoch": 1.461818181818182, - "grad_norm": 5.1570963859558105, - "learning_rate": 2.5696969696969697e-05, - "loss": 4.4146, + "epoch": 1.4512635379061372, + "grad_norm": 5.215062141418457, + "learning_rate": 2.5872442839951865e-05, + "loss": 4.519, "step": 402 }, { - "epoch": 1.4654545454545453, - "grad_norm": 7.002410411834717, - "learning_rate": 2.5636363636363635e-05, - "loss": 4.6652, + "epoch": 1.4548736462093863, + "grad_norm": 7.263779640197754, + "learning_rate": 2.581227436823105e-05, + "loss": 4.8241, "step": 403 }, { - "epoch": 1.4690909090909092, - "grad_norm": 3.277343273162842, - "learning_rate": 2.5575757575757573e-05, - "loss": 4.8023, + "epoch": 1.4584837545126355, + "grad_norm": 3.1585211753845215, + "learning_rate": 2.5752105896510233e-05, + "loss": 4.6324, "step": 404 }, { - "epoch": 1.4727272727272727, - "grad_norm": 4.8317341804504395, - "learning_rate": 2.5515151515151515e-05, - "loss": 4.7799, + "epoch": 1.4620938628158844, + "grad_norm": 8.619237899780273, + "learning_rate": 2.5691937424789413e-05, + "loss": 4.4658, "step": 405 }, { - "epoch": 1.4763636363636363, - "grad_norm": 3.615807056427002, - "learning_rate": 2.5454545454545454e-05, - "loss": 4.4125, + "epoch": 1.4657039711191335, + "grad_norm": 9.702713012695312, + "learning_rate": 2.563176895306859e-05, + "loss": 4.7969, "step": 406 }, { - "epoch": 1.48, - "grad_norm": 2.5169756412506104, - "learning_rate": 2.53939393939394e-05, - "loss": 4.6356, + "epoch": 1.4693140794223827, + "grad_norm": 6.16801118850708, + "learning_rate": 2.5571600481347774e-05, + "loss": 4.5924, "step": 407 }, { - "epoch": 1.4836363636363636, - "grad_norm": 5.736434459686279, - "learning_rate": 2.5333333333333337e-05, - "loss": 4.3922, + "epoch": 1.4729241877256318, + "grad_norm": 3.287752151489258, + "learning_rate": 2.5511432009626955e-05, + "loss": 4.8517, "step": 408 }, { - "epoch": 1.4872727272727273, - "grad_norm": 5.380589008331299, - "learning_rate": 2.5272727272727275e-05, - "loss": 4.4222, + "epoch": 1.476534296028881, + "grad_norm": 6.46322774887085, + "learning_rate": 2.545126353790614e-05, + "loss": 4.588, "step": 409 }, { - "epoch": 1.490909090909091, - "grad_norm": 4.923427581787109, - "learning_rate": 2.5212121212121214e-05, - "loss": 4.5611, + "epoch": 1.48014440433213, + "grad_norm": 5.652312755584717, + "learning_rate": 2.5391095066185323e-05, + "loss": 4.649, "step": 410 }, { - "epoch": 1.4945454545454546, - "grad_norm": 6.039291858673096, - "learning_rate": 2.5151515151515155e-05, - "loss": 4.4644, + "epoch": 1.483754512635379, + "grad_norm": 4.650730609893799, + "learning_rate": 2.53309265944645e-05, + "loss": 4.5696, "step": 411 }, { - "epoch": 1.498181818181818, - "grad_norm": 4.344960689544678, - "learning_rate": 2.5090909090909094e-05, - "loss": 4.6914, + "epoch": 1.4873646209386282, + "grad_norm": 3.9885752201080322, + "learning_rate": 2.527075812274368e-05, + "loss": 4.3658, "step": 412 }, { - "epoch": 1.501818181818182, - "grad_norm": 2.4840219020843506, - "learning_rate": 2.5030303030303032e-05, - "loss": 4.4634, + "epoch": 1.4909747292418771, + "grad_norm": 7.778469085693359, + "learning_rate": 2.5210589651022864e-05, + "loss": 4.5531, "step": 413 }, { - "epoch": 1.5054545454545454, - "grad_norm": 4.167297840118408, - "learning_rate": 2.496969696969697e-05, - "loss": 4.5234, + "epoch": 1.4945848375451263, + "grad_norm": 4.358502388000488, + "learning_rate": 2.5150421179302048e-05, + "loss": 4.8113, "step": 414 }, { - "epoch": 1.509090909090909, - "grad_norm": 3.043473482131958, - "learning_rate": 2.490909090909091e-05, - "loss": 4.5519, + "epoch": 1.4981949458483754, + "grad_norm": 3.1924219131469727, + "learning_rate": 2.509025270758123e-05, + "loss": 4.597, "step": 415 }, { - "epoch": 1.5127272727272727, - "grad_norm": 2.029343366622925, - "learning_rate": 2.4848484848484847e-05, - "loss": 4.4137, + "epoch": 1.5018050541516246, + "grad_norm": 2.0474438667297363, + "learning_rate": 2.5030084235860413e-05, + "loss": 4.6747, "step": 416 }, { - "epoch": 1.5163636363636364, - "grad_norm": 3.7380053997039795, - "learning_rate": 2.478787878787879e-05, - "loss": 4.7946, + "epoch": 1.5054151624548737, + "grad_norm": 3.988816261291504, + "learning_rate": 2.4969915764139593e-05, + "loss": 4.4763, "step": 417 }, { - "epoch": 1.52, - "grad_norm": 3.8255624771118164, - "learning_rate": 2.472727272727273e-05, - "loss": 4.6077, + "epoch": 1.5090252707581229, + "grad_norm": 4.126523971557617, + "learning_rate": 2.4909747292418774e-05, + "loss": 4.5102, "step": 418 }, { - "epoch": 1.5236363636363637, - "grad_norm": 3.6973726749420166, - "learning_rate": 2.466666666666667e-05, - "loss": 4.4438, + "epoch": 1.5126353790613718, + "grad_norm": 3.799755334854126, + "learning_rate": 2.4849578820697954e-05, + "loss": 4.3798, "step": 419 }, { - "epoch": 1.5272727272727273, - "grad_norm": 2.9415085315704346, - "learning_rate": 2.4606060606060607e-05, - "loss": 4.5242, + "epoch": 1.516245487364621, + "grad_norm": 5.314898490905762, + "learning_rate": 2.4789410348977138e-05, + "loss": 4.4691, "step": 420 }, { - "epoch": 1.5309090909090908, - "grad_norm": 2.4222817420959473, - "learning_rate": 2.4545454545454545e-05, - "loss": 4.4857, + "epoch": 1.5198555956678699, + "grad_norm": 4.028231620788574, + "learning_rate": 2.472924187725632e-05, + "loss": 4.5874, "step": 421 }, { - "epoch": 1.5345454545454547, - "grad_norm": 3.7326831817626953, - "learning_rate": 2.4484848484848484e-05, - "loss": 4.3882, + "epoch": 1.523465703971119, + "grad_norm": 3.6319522857666016, + "learning_rate": 2.46690734055355e-05, + "loss": 4.5248, "step": 422 }, { - "epoch": 1.538181818181818, - "grad_norm": 5.729616165161133, - "learning_rate": 2.4424242424242426e-05, - "loss": 4.6133, + "epoch": 1.5270758122743682, + "grad_norm": 6.449628829956055, + "learning_rate": 2.4608904933814683e-05, + "loss": 4.4882, "step": 423 }, { - "epoch": 1.541818181818182, - "grad_norm": 3.818107843399048, - "learning_rate": 2.4363636363636364e-05, - "loss": 4.5399, + "epoch": 1.5306859205776173, + "grad_norm": 7.905242919921875, + "learning_rate": 2.4548736462093864e-05, + "loss": 4.7121, "step": 424 }, { - "epoch": 1.5454545454545454, - "grad_norm": 6.078042984008789, - "learning_rate": 2.4303030303030306e-05, - "loss": 4.4185, + "epoch": 1.5342960288808665, + "grad_norm": 4.377833843231201, + "learning_rate": 2.4488567990373047e-05, + "loss": 4.5071, "step": 425 }, { - "epoch": 1.549090909090909, - "grad_norm": 3.017300844192505, - "learning_rate": 2.4242424242424244e-05, - "loss": 4.3699, + "epoch": 1.5379061371841156, + "grad_norm": 3.6568801403045654, + "learning_rate": 2.4428399518652228e-05, + "loss": 4.4466, "step": 426 }, { - "epoch": 1.5527272727272727, - "grad_norm": 2.781771421432495, - "learning_rate": 2.4181818181818182e-05, - "loss": 4.4404, + "epoch": 1.5415162454873648, + "grad_norm": 11.142749786376953, + "learning_rate": 2.436823104693141e-05, + "loss": 4.515, "step": 427 }, { - "epoch": 1.5563636363636364, - "grad_norm": 5.910660743713379, - "learning_rate": 2.4121212121212124e-05, - "loss": 4.2755, + "epoch": 1.5451263537906137, + "grad_norm": 5.1178083419799805, + "learning_rate": 2.4308062575210592e-05, + "loss": 4.4591, "step": 428 }, { - "epoch": 1.56, - "grad_norm": 6.957926273345947, - "learning_rate": 2.4060606060606062e-05, - "loss": 4.6575, + "epoch": 1.5487364620938628, + "grad_norm": 2.430281639099121, + "learning_rate": 2.4247894103489773e-05, + "loss": 4.5393, "step": 429 }, { - "epoch": 1.5636363636363635, - "grad_norm": 5.411463737487793, - "learning_rate": 2.4e-05, - "loss": 4.5792, + "epoch": 1.5523465703971118, + "grad_norm": 3.443852424621582, + "learning_rate": 2.4187725631768953e-05, + "loss": 4.5422, "step": 430 }, { - "epoch": 1.5672727272727274, - "grad_norm": 4.2124924659729, - "learning_rate": 2.393939393939394e-05, - "loss": 4.4813, + "epoch": 1.555956678700361, + "grad_norm": 4.571306228637695, + "learning_rate": 2.4127557160048137e-05, + "loss": 4.673, "step": 431 }, { - "epoch": 1.5709090909090908, - "grad_norm": 5.113527297973633, - "learning_rate": 2.387878787878788e-05, - "loss": 4.6248, + "epoch": 1.55956678700361, + "grad_norm": 3.1203410625457764, + "learning_rate": 2.4067388688327318e-05, + "loss": 4.6205, "step": 432 }, { - "epoch": 1.5745454545454547, - "grad_norm": 5.174093246459961, - "learning_rate": 2.381818181818182e-05, - "loss": 4.4863, + "epoch": 1.5631768953068592, + "grad_norm": 5.54405403137207, + "learning_rate": 2.40072202166065e-05, + "loss": 4.6493, "step": 433 }, { - "epoch": 1.5781818181818181, - "grad_norm": 4.3809027671813965, - "learning_rate": 2.375757575757576e-05, - "loss": 4.4467, + "epoch": 1.5667870036101084, + "grad_norm": 4.841291904449463, + "learning_rate": 2.3947051744885682e-05, + "loss": 4.4992, "step": 434 }, { - "epoch": 1.5818181818181818, - "grad_norm": 3.806034564971924, - "learning_rate": 2.36969696969697e-05, - "loss": 4.458, + "epoch": 1.5703971119133575, + "grad_norm": 5.931495189666748, + "learning_rate": 2.3886883273164863e-05, + "loss": 4.773, "step": 435 }, { - "epoch": 1.5854545454545454, - "grad_norm": 6.237285614013672, - "learning_rate": 2.3636363636363637e-05, - "loss": 4.584, + "epoch": 1.5740072202166067, + "grad_norm": 3.611367702484131, + "learning_rate": 2.3826714801444043e-05, + "loss": 4.5129, "step": 436 }, { - "epoch": 1.589090909090909, - "grad_norm": 8.469368934631348, - "learning_rate": 2.3575757575757576e-05, - "loss": 4.5677, + "epoch": 1.5776173285198556, + "grad_norm": 7.546535015106201, + "learning_rate": 2.3766546329723227e-05, + "loss": 4.4994, "step": 437 }, { - "epoch": 1.5927272727272728, - "grad_norm": 6.997206211090088, - "learning_rate": 2.3515151515151514e-05, - "loss": 4.4851, + "epoch": 1.5812274368231047, + "grad_norm": 5.833862781524658, + "learning_rate": 2.3706377858002408e-05, + "loss": 4.8386, "step": 438 }, { - "epoch": 1.5963636363636362, - "grad_norm": 3.9165611267089844, - "learning_rate": 2.3454545454545456e-05, - "loss": 4.6739, + "epoch": 1.5848375451263537, + "grad_norm": 3.0301411151885986, + "learning_rate": 2.3646209386281588e-05, + "loss": 4.3775, "step": 439 }, { - "epoch": 1.6, - "grad_norm": 4.296258449554443, - "learning_rate": 2.3393939393939397e-05, - "loss": 4.3954, + "epoch": 1.5884476534296028, + "grad_norm": 5.783769607543945, + "learning_rate": 2.3586040914560772e-05, + "loss": 4.6718, "step": 440 }, { - "epoch": 1.6036363636363635, - "grad_norm": 4.895616054534912, - "learning_rate": 2.3333333333333336e-05, - "loss": 4.6904, + "epoch": 1.592057761732852, + "grad_norm": 3.89277982711792, + "learning_rate": 2.3525872442839953e-05, + "loss": 4.4917, "step": 441 }, { - "epoch": 1.6072727272727274, - "grad_norm": 3.0008482933044434, - "learning_rate": 2.3272727272727274e-05, - "loss": 4.4606, + "epoch": 1.595667870036101, + "grad_norm": 2.7134242057800293, + "learning_rate": 2.3465703971119137e-05, + "loss": 4.5437, "step": 442 }, { - "epoch": 1.6109090909090908, - "grad_norm": 3.4518730640411377, - "learning_rate": 2.3212121212121212e-05, - "loss": 4.3835, + "epoch": 1.5992779783393503, + "grad_norm": 4.376542568206787, + "learning_rate": 2.3405535499398314e-05, + "loss": 4.5702, "step": 443 }, { - "epoch": 1.6145454545454545, - "grad_norm": 7.562982559204102, - "learning_rate": 2.315151515151515e-05, - "loss": 4.7344, + "epoch": 1.6028880866425994, + "grad_norm": 5.11259651184082, + "learning_rate": 2.3345367027677498e-05, + "loss": 4.3343, "step": 444 }, { - "epoch": 1.6181818181818182, - "grad_norm": 3.1272881031036377, - "learning_rate": 2.309090909090909e-05, - "loss": 4.6143, + "epoch": 1.6064981949458483, + "grad_norm": 3.1635055541992188, + "learning_rate": 2.328519855595668e-05, + "loss": 4.3862, "step": 445 }, { - "epoch": 1.6218181818181818, - "grad_norm": 2.709015369415283, - "learning_rate": 2.3030303030303034e-05, - "loss": 4.4113, + "epoch": 1.6101083032490975, + "grad_norm": 5.924020767211914, + "learning_rate": 2.322503008423586e-05, + "loss": 4.3922, "step": 446 }, { - "epoch": 1.6254545454545455, - "grad_norm": 3.9435224533081055, - "learning_rate": 2.2969696969696973e-05, - "loss": 4.4467, + "epoch": 1.6137184115523464, + "grad_norm": 6.799149513244629, + "learning_rate": 2.3164861612515043e-05, + "loss": 4.5604, "step": 447 }, { - "epoch": 1.6290909090909091, - "grad_norm": 6.414844989776611, - "learning_rate": 2.290909090909091e-05, - "loss": 4.8062, + "epoch": 1.6173285198555956, + "grad_norm": 3.7257330417633057, + "learning_rate": 2.3104693140794227e-05, + "loss": 4.468, "step": 448 }, { - "epoch": 1.6327272727272728, - "grad_norm": 3.1530921459198, - "learning_rate": 2.284848484848485e-05, - "loss": 4.4613, + "epoch": 1.6209386281588447, + "grad_norm": 5.105785369873047, + "learning_rate": 2.3044524669073407e-05, + "loss": 4.5101, "step": 449 }, { - "epoch": 1.6363636363636362, - "grad_norm": 3.0431370735168457, - "learning_rate": 2.2787878787878788e-05, - "loss": 4.3202, + "epoch": 1.6245487364620939, + "grad_norm": 7.325453758239746, + "learning_rate": 2.2984356197352588e-05, + "loss": 4.4697, "step": 450 }, { - "epoch": 1.6400000000000001, - "grad_norm": 4.826960563659668, - "learning_rate": 2.272727272727273e-05, - "loss": 4.5825, + "epoch": 1.628158844765343, + "grad_norm": 3.203767776489258, + "learning_rate": 2.292418772563177e-05, + "loss": 4.5219, "step": 451 }, { - "epoch": 1.6436363636363636, - "grad_norm": 8.548043251037598, - "learning_rate": 2.2666666666666668e-05, - "loss": 4.3152, + "epoch": 1.6317689530685922, + "grad_norm": 3.2828943729400635, + "learning_rate": 2.2864019253910952e-05, + "loss": 4.4196, "step": 452 }, { - "epoch": 1.6472727272727272, - "grad_norm": 4.511161804199219, - "learning_rate": 2.260606060606061e-05, - "loss": 4.4899, + "epoch": 1.6353790613718413, + "grad_norm": 4.563648700714111, + "learning_rate": 2.2803850782190133e-05, + "loss": 4.6709, "step": 453 }, { - "epoch": 1.6509090909090909, - "grad_norm": 3.710296154022217, - "learning_rate": 2.2545454545454548e-05, - "loss": 4.6325, + "epoch": 1.6389891696750902, + "grad_norm": 4.947784900665283, + "learning_rate": 2.2743682310469316e-05, + "loss": 4.7708, "step": 454 }, { - "epoch": 1.6545454545454545, - "grad_norm": 1.9863725900650024, - "learning_rate": 2.2484848484848486e-05, - "loss": 4.4654, + "epoch": 1.6425992779783394, + "grad_norm": 3.3646271228790283, + "learning_rate": 2.2683513838748497e-05, + "loss": 4.5654, "step": 455 }, { - "epoch": 1.6581818181818182, - "grad_norm": 4.258086204528809, - "learning_rate": 2.2424242424242424e-05, - "loss": 4.561, + "epoch": 1.6462093862815883, + "grad_norm": 3.2443368434906006, + "learning_rate": 2.2623345367027677e-05, + "loss": 4.5126, "step": 456 }, { - "epoch": 1.6618181818181819, - "grad_norm": 5.20453405380249, - "learning_rate": 2.2363636363636366e-05, - "loss": 4.4563, + "epoch": 1.6498194945848375, + "grad_norm": 2.8289527893066406, + "learning_rate": 2.2563176895306858e-05, + "loss": 4.5584, "step": 457 }, { - "epoch": 1.6654545454545455, - "grad_norm": 5.107168674468994, - "learning_rate": 2.2303030303030304e-05, - "loss": 4.6385, + "epoch": 1.6534296028880866, + "grad_norm": 4.497228622436523, + "learning_rate": 2.2503008423586042e-05, + "loss": 4.6282, "step": 458 }, { - "epoch": 1.669090909090909, - "grad_norm": 6.3191304206848145, - "learning_rate": 2.2242424242424243e-05, - "loss": 4.5206, + "epoch": 1.6570397111913358, + "grad_norm": 2.863171100616455, + "learning_rate": 2.2442839951865226e-05, + "loss": 4.4665, "step": 459 }, { - "epoch": 1.6727272727272728, - "grad_norm": 3.3349220752716064, - "learning_rate": 2.2181818181818184e-05, - "loss": 4.3053, + "epoch": 1.660649819494585, + "grad_norm": 2.3778388500213623, + "learning_rate": 2.2382671480144403e-05, + "loss": 4.6985, "step": 460 }, { - "epoch": 1.6763636363636363, - "grad_norm": 4.218853950500488, - "learning_rate": 2.2121212121212123e-05, - "loss": 4.5629, + "epoch": 1.664259927797834, + "grad_norm": 4.169567108154297, + "learning_rate": 2.2322503008423587e-05, + "loss": 4.8346, "step": 461 }, { - "epoch": 1.6800000000000002, - "grad_norm": 5.860531806945801, - "learning_rate": 2.206060606060606e-05, - "loss": 4.4272, + "epoch": 1.6678700361010832, + "grad_norm": 5.816603183746338, + "learning_rate": 2.226233453670277e-05, + "loss": 4.4637, "step": 462 }, { - "epoch": 1.6836363636363636, - "grad_norm": 4.101426601409912, - "learning_rate": 2.2000000000000003e-05, - "loss": 4.573, + "epoch": 1.6714801444043321, + "grad_norm": 4.912841796875, + "learning_rate": 2.220216606498195e-05, + "loss": 4.5214, "step": 463 }, { - "epoch": 1.6872727272727273, - "grad_norm": 3.375948429107666, - "learning_rate": 2.193939393939394e-05, - "loss": 4.3781, + "epoch": 1.6750902527075813, + "grad_norm": 3.094038724899292, + "learning_rate": 2.2141997593261132e-05, + "loss": 4.5753, "step": 464 }, { - "epoch": 1.690909090909091, - "grad_norm": 3.112006902694702, - "learning_rate": 2.187878787878788e-05, - "loss": 4.4511, + "epoch": 1.6787003610108302, + "grad_norm": 6.050639629364014, + "learning_rate": 2.2081829121540316e-05, + "loss": 4.5278, "step": 465 }, { - "epoch": 1.6945454545454546, - "grad_norm": 3.9260976314544678, - "learning_rate": 2.1818181818181818e-05, - "loss": 4.4448, + "epoch": 1.6823104693140793, + "grad_norm": 5.237624168395996, + "learning_rate": 2.2021660649819496e-05, + "loss": 4.4212, "step": 466 }, { - "epoch": 1.6981818181818182, - "grad_norm": 2.8007559776306152, - "learning_rate": 2.175757575757576e-05, - "loss": 4.4512, + "epoch": 1.6859205776173285, + "grad_norm": 2.529154062271118, + "learning_rate": 2.1961492178098677e-05, + "loss": 4.5208, "step": 467 }, { - "epoch": 1.7018181818181817, - "grad_norm": 3.95316743850708, - "learning_rate": 2.1696969696969698e-05, - "loss": 4.6404, + "epoch": 1.6895306859205776, + "grad_norm": 2.7793679237365723, + "learning_rate": 2.190132370637786e-05, + "loss": 4.458, "step": 468 }, { - "epoch": 1.7054545454545456, - "grad_norm": 3.3578267097473145, - "learning_rate": 2.163636363636364e-05, - "loss": 4.4527, + "epoch": 1.6931407942238268, + "grad_norm": 3.213160991668701, + "learning_rate": 2.184115523465704e-05, + "loss": 4.4169, "step": 469 }, { - "epoch": 1.709090909090909, - "grad_norm": 4.38985013961792, - "learning_rate": 2.1575757575757578e-05, - "loss": 4.531, + "epoch": 1.696750902527076, + "grad_norm": 6.508391380310059, + "learning_rate": 2.178098676293622e-05, + "loss": 4.55, "step": 470 }, { - "epoch": 1.7127272727272729, - "grad_norm": 3.9264490604400635, - "learning_rate": 2.1515151515151516e-05, - "loss": 4.495, + "epoch": 1.7003610108303249, + "grad_norm": 8.193519592285156, + "learning_rate": 2.1720818291215402e-05, + "loss": 4.6847, "step": 471 }, { - "epoch": 1.7163636363636363, - "grad_norm": 4.830046653747559, - "learning_rate": 2.1454545454545455e-05, - "loss": 4.4321, + "epoch": 1.703971119133574, + "grad_norm": 3.6836369037628174, + "learning_rate": 2.1660649819494586e-05, + "loss": 4.607, "step": 472 }, { - "epoch": 1.72, - "grad_norm": 4.556273937225342, - "learning_rate": 2.1393939393939393e-05, - "loss": 4.4907, + "epoch": 1.707581227436823, + "grad_norm": 5.093707084655762, + "learning_rate": 2.1600481347773767e-05, + "loss": 4.3498, "step": 473 }, { - "epoch": 1.7236363636363636, - "grad_norm": 5.617473125457764, - "learning_rate": 2.1333333333333335e-05, - "loss": 4.7541, + "epoch": 1.711191335740072, + "grad_norm": 5.675652980804443, + "learning_rate": 2.1540312876052947e-05, + "loss": 4.5075, "step": 474 }, { - "epoch": 1.7272727272727273, - "grad_norm": 3.0827252864837646, - "learning_rate": 2.1272727272727276e-05, - "loss": 4.3495, + "epoch": 1.7148014440433212, + "grad_norm": 4.682228088378906, + "learning_rate": 2.148014440433213e-05, + "loss": 4.7128, "step": 475 }, { - "epoch": 1.730909090909091, - "grad_norm": 3.5010452270507812, - "learning_rate": 2.1212121212121215e-05, - "loss": 4.6724, + "epoch": 1.7184115523465704, + "grad_norm": 4.707435607910156, + "learning_rate": 2.1419975932611315e-05, + "loss": 4.4863, "step": 476 }, { - "epoch": 1.7345454545454544, - "grad_norm": 3.626228094100952, - "learning_rate": 2.1151515151515153e-05, - "loss": 4.7117, + "epoch": 1.7220216606498195, + "grad_norm": 7.476378440856934, + "learning_rate": 2.1359807460890492e-05, + "loss": 4.7813, "step": 477 }, { - "epoch": 1.7381818181818183, - "grad_norm": 3.944805383682251, - "learning_rate": 2.109090909090909e-05, - "loss": 4.4103, + "epoch": 1.7256317689530687, + "grad_norm": 3.5324857234954834, + "learning_rate": 2.1299638989169676e-05, + "loss": 4.4661, "step": 478 }, { - "epoch": 1.7418181818181817, - "grad_norm": 2.4511728286743164, - "learning_rate": 2.103030303030303e-05, - "loss": 4.5316, + "epoch": 1.7292418772563178, + "grad_norm": 2.770537853240967, + "learning_rate": 2.123947051744886e-05, + "loss": 4.5247, "step": 479 }, { - "epoch": 1.7454545454545456, - "grad_norm": 6.276540756225586, - "learning_rate": 2.096969696969697e-05, - "loss": 4.4635, + "epoch": 1.7328519855595668, + "grad_norm": 2.7567989826202393, + "learning_rate": 2.117930204572804e-05, + "loss": 4.6836, "step": 480 }, { - "epoch": 1.749090909090909, - "grad_norm": 5.2754082679748535, - "learning_rate": 2.090909090909091e-05, - "loss": 4.5604, + "epoch": 1.736462093862816, + "grad_norm": 3.8367581367492676, + "learning_rate": 2.111913357400722e-05, + "loss": 4.7117, "step": 481 }, { - "epoch": 1.7527272727272727, - "grad_norm": 4.733824253082275, - "learning_rate": 2.084848484848485e-05, - "loss": 4.3724, + "epoch": 1.7400722021660648, + "grad_norm": 2.3137428760528564, + "learning_rate": 2.1058965102286405e-05, + "loss": 4.5121, "step": 482 }, { - "epoch": 1.7563636363636363, - "grad_norm": 4.327937126159668, - "learning_rate": 2.078787878787879e-05, - "loss": 4.6649, + "epoch": 1.743682310469314, + "grad_norm": 3.6655476093292236, + "learning_rate": 2.0998796630565585e-05, + "loss": 4.6839, "step": 483 }, { - "epoch": 1.76, - "grad_norm": 3.6761820316314697, - "learning_rate": 2.0727272727272728e-05, - "loss": 4.466, + "epoch": 1.7472924187725631, + "grad_norm": 2.493283987045288, + "learning_rate": 2.0938628158844766e-05, + "loss": 4.4769, "step": 484 }, { - "epoch": 1.7636363636363637, - "grad_norm": 3.9023444652557373, - "learning_rate": 2.0666666666666666e-05, - "loss": 4.5215, + "epoch": 1.7509025270758123, + "grad_norm": 6.7742919921875, + "learning_rate": 2.0878459687123946e-05, + "loss": 4.7192, "step": 485 }, { - "epoch": 1.767272727272727, - "grad_norm": 9.362757682800293, - "learning_rate": 2.0606060606060608e-05, - "loss": 4.31, + "epoch": 1.7545126353790614, + "grad_norm": 5.782177925109863, + "learning_rate": 2.081829121540313e-05, + "loss": 4.6141, "step": 486 }, { - "epoch": 1.770909090909091, - "grad_norm": 6.796205043792725, - "learning_rate": 2.0545454545454546e-05, - "loss": 4.4703, + "epoch": 1.7581227436823106, + "grad_norm": 3.9292287826538086, + "learning_rate": 2.075812274368231e-05, + "loss": 4.4728, "step": 487 }, { - "epoch": 1.7745454545454544, - "grad_norm": 5.368844509124756, - "learning_rate": 2.0484848484848485e-05, - "loss": 4.5281, + "epoch": 1.7617328519855595, + "grad_norm": 5.2916717529296875, + "learning_rate": 2.069795427196149e-05, + "loss": 4.5358, "step": 488 }, { - "epoch": 1.7781818181818183, - "grad_norm": 4.443175315856934, - "learning_rate": 2.0424242424242427e-05, - "loss": 4.5208, + "epoch": 1.7653429602888087, + "grad_norm": 9.929496765136719, + "learning_rate": 2.0637785800240675e-05, + "loss": 4.8103, "step": 489 }, { - "epoch": 1.7818181818181817, - "grad_norm": 4.688342094421387, - "learning_rate": 2.0363636363636365e-05, - "loss": 4.5241, + "epoch": 1.7689530685920578, + "grad_norm": 4.576100826263428, + "learning_rate": 2.057761732851986e-05, + "loss": 4.4048, "step": 490 }, { - "epoch": 1.7854545454545454, - "grad_norm": 5.826910972595215, - "learning_rate": 2.0303030303030303e-05, - "loss": 4.5866, + "epoch": 1.7725631768953067, + "grad_norm": 3.692178964614868, + "learning_rate": 2.0517448856799036e-05, + "loss": 4.6001, "step": 491 }, { - "epoch": 1.789090909090909, - "grad_norm": 4.945174217224121, - "learning_rate": 2.0242424242424245e-05, - "loss": 4.5894, + "epoch": 1.7761732851985559, + "grad_norm": 2.9772250652313232, + "learning_rate": 2.045728038507822e-05, + "loss": 4.345, "step": 492 }, { - "epoch": 1.7927272727272727, - "grad_norm": 3.059390068054199, - "learning_rate": 2.0181818181818183e-05, - "loss": 4.5118, + "epoch": 1.779783393501805, + "grad_norm": 4.7923407554626465, + "learning_rate": 2.0397111913357404e-05, + "loss": 4.5239, "step": 493 }, { - "epoch": 1.7963636363636364, - "grad_norm": 3.689739942550659, - "learning_rate": 2.012121212121212e-05, - "loss": 4.4408, + "epoch": 1.7833935018050542, + "grad_norm": 3.105140209197998, + "learning_rate": 2.033694344163658e-05, + "loss": 4.4268, "step": 494 }, { - "epoch": 1.8, - "grad_norm": 4.748984336853027, - "learning_rate": 2.006060606060606e-05, - "loss": 4.4197, + "epoch": 1.7870036101083033, + "grad_norm": 2.8429927825927734, + "learning_rate": 2.0276774969915765e-05, + "loss": 4.4264, "step": 495 }, { - "epoch": 1.8036363636363637, - "grad_norm": 2.963027000427246, - "learning_rate": 2e-05, - "loss": 4.4785, + "epoch": 1.7906137184115525, + "grad_norm": 3.325678825378418, + "learning_rate": 2.0216606498194946e-05, + "loss": 4.423, "step": 496 }, { - "epoch": 1.8072727272727271, - "grad_norm": 2.5443899631500244, - "learning_rate": 1.993939393939394e-05, - "loss": 4.5684, + "epoch": 1.7942238267148014, + "grad_norm": 4.431451320648193, + "learning_rate": 2.015643802647413e-05, + "loss": 4.6256, "step": 497 }, { - "epoch": 1.810909090909091, - "grad_norm": 4.136449813842773, - "learning_rate": 1.987878787878788e-05, - "loss": 4.3071, + "epoch": 1.7978339350180506, + "grad_norm": 6.648149490356445, + "learning_rate": 2.009626955475331e-05, + "loss": 4.4858, "step": 498 }, { - "epoch": 1.8145454545454545, - "grad_norm": 4.769286155700684, - "learning_rate": 1.981818181818182e-05, - "loss": 4.3815, + "epoch": 1.8014440433212995, + "grad_norm": 3.306225538253784, + "learning_rate": 2.003610108303249e-05, + "loss": 4.6129, "step": 499 }, { - "epoch": 1.8181818181818183, - "grad_norm": 5.660130500793457, - "learning_rate": 1.975757575757576e-05, - "loss": 4.4983, + "epoch": 1.8050541516245486, + "grad_norm": 3.573742389678955, + "learning_rate": 1.9975932611311675e-05, + "loss": 4.4521, "step": 500 }, { - "epoch": 1.8218181818181818, - "grad_norm": 3.919520616531372, - "learning_rate": 1.9696969696969697e-05, - "loss": 4.4176, + "epoch": 1.8086642599277978, + "grad_norm": 2.5690858364105225, + "learning_rate": 1.9915764139590855e-05, + "loss": 4.339, "step": 501 }, { - "epoch": 1.8254545454545454, - "grad_norm": 4.681278228759766, - "learning_rate": 1.9636363636363635e-05, - "loss": 4.4025, + "epoch": 1.812274368231047, + "grad_norm": 5.37463903427124, + "learning_rate": 1.9855595667870036e-05, + "loss": 4.5435, "step": 502 }, { - "epoch": 1.829090909090909, - "grad_norm": 3.8179829120635986, - "learning_rate": 1.9575757575757577e-05, - "loss": 4.4669, + "epoch": 1.815884476534296, + "grad_norm": 2.394864082336426, + "learning_rate": 1.979542719614922e-05, + "loss": 4.3385, "step": 503 }, { - "epoch": 1.8327272727272728, - "grad_norm": 3.794281005859375, - "learning_rate": 1.951515151515152e-05, - "loss": 4.4807, + "epoch": 1.8194945848375452, + "grad_norm": 3.3190903663635254, + "learning_rate": 1.97352587244284e-05, + "loss": 4.4239, "step": 504 }, { - "epoch": 1.8363636363636364, - "grad_norm": 2.3946359157562256, - "learning_rate": 1.9454545454545457e-05, - "loss": 4.4092, + "epoch": 1.8231046931407944, + "grad_norm": 4.373250484466553, + "learning_rate": 1.967509025270758e-05, + "loss": 4.4304, "step": 505 }, { - "epoch": 1.8399999999999999, - "grad_norm": 4.364572525024414, - "learning_rate": 1.9393939393939395e-05, - "loss": 4.3869, + "epoch": 1.8267148014440433, + "grad_norm": 2.63519287109375, + "learning_rate": 1.9614921780986764e-05, + "loss": 4.5574, "step": 506 }, { - "epoch": 1.8436363636363637, - "grad_norm": 2.095834493637085, - "learning_rate": 1.9333333333333333e-05, - "loss": 4.5834, + "epoch": 1.8303249097472925, + "grad_norm": 5.139864444732666, + "learning_rate": 1.955475330926595e-05, + "loss": 4.4422, "step": 507 }, { - "epoch": 1.8472727272727272, - "grad_norm": 5.8669114112854, - "learning_rate": 1.9272727272727272e-05, - "loss": 4.6442, + "epoch": 1.8339350180505414, + "grad_norm": 4.551270961761475, + "learning_rate": 1.9494584837545125e-05, + "loss": 4.6433, "step": 508 }, { - "epoch": 1.850909090909091, - "grad_norm": 3.7711021900177, - "learning_rate": 1.9212121212121213e-05, - "loss": 4.3915, + "epoch": 1.8375451263537905, + "grad_norm": 3.421534299850464, + "learning_rate": 1.943441636582431e-05, + "loss": 4.4937, "step": 509 }, { - "epoch": 1.8545454545454545, - "grad_norm": 3.1917080879211426, - "learning_rate": 1.9151515151515155e-05, - "loss": 4.4269, + "epoch": 1.8411552346570397, + "grad_norm": 3.374833822250366, + "learning_rate": 1.937424789410349e-05, + "loss": 4.544, "step": 510 }, { - "epoch": 1.8581818181818182, - "grad_norm": 3.4579732418060303, - "learning_rate": 1.9090909090909094e-05, - "loss": 4.248, + "epoch": 1.8447653429602888, + "grad_norm": 3.4238624572753906, + "learning_rate": 1.9314079422382674e-05, + "loss": 4.4343, "step": 511 }, { - "epoch": 1.8618181818181818, - "grad_norm": 2.2174642086029053, - "learning_rate": 1.9030303030303032e-05, - "loss": 4.2461, + "epoch": 1.848375451263538, + "grad_norm": 2.8447346687316895, + "learning_rate": 1.9253910950661854e-05, + "loss": 4.3405, "step": 512 }, { - "epoch": 1.8654545454545455, - "grad_norm": 9.098743438720703, - "learning_rate": 1.896969696969697e-05, - "loss": 4.3802, + "epoch": 1.8519855595667871, + "grad_norm": 2.3235862255096436, + "learning_rate": 1.9193742478941035e-05, + "loss": 4.3878, "step": 513 }, { - "epoch": 1.8690909090909091, - "grad_norm": 4.291934967041016, - "learning_rate": 1.890909090909091e-05, - "loss": 4.5362, + "epoch": 1.855595667870036, + "grad_norm": 2.4331719875335693, + "learning_rate": 1.913357400722022e-05, + "loss": 4.3489, "step": 514 }, { - "epoch": 1.8727272727272726, - "grad_norm": 4.486403942108154, - "learning_rate": 1.884848484848485e-05, - "loss": 4.4608, + "epoch": 1.8592057761732852, + "grad_norm": 3.433101177215576, + "learning_rate": 1.90734055354994e-05, + "loss": 4.5183, "step": 515 }, { - "epoch": 1.8763636363636365, - "grad_norm": 3.7184762954711914, - "learning_rate": 1.878787878787879e-05, - "loss": 4.4732, + "epoch": 1.8628158844765343, + "grad_norm": 4.398335933685303, + "learning_rate": 1.901323706377858e-05, + "loss": 4.6618, "step": 516 }, { - "epoch": 1.88, - "grad_norm": 3.1857259273529053, - "learning_rate": 1.872727272727273e-05, - "loss": 4.3346, + "epoch": 1.8664259927797833, + "grad_norm": 3.5990779399871826, + "learning_rate": 1.8953068592057764e-05, + "loss": 4.4411, "step": 517 }, { - "epoch": 1.8836363636363638, - "grad_norm": 5.259026050567627, - "learning_rate": 1.866666666666667e-05, - "loss": 4.5613, + "epoch": 1.8700361010830324, + "grad_norm": 2.9849636554718018, + "learning_rate": 1.8892900120336944e-05, + "loss": 4.5298, "step": 518 }, { - "epoch": 1.8872727272727272, - "grad_norm": 4.3462233543396, - "learning_rate": 1.8606060606060607e-05, - "loss": 4.5914, + "epoch": 1.8736462093862816, + "grad_norm": 3.6262741088867188, + "learning_rate": 1.8832731648616125e-05, + "loss": 4.514, "step": 519 }, { - "epoch": 1.8909090909090909, - "grad_norm": 2.839916944503784, - "learning_rate": 1.8545454545454545e-05, - "loss": 4.4223, + "epoch": 1.8772563176895307, + "grad_norm": 4.894564151763916, + "learning_rate": 1.877256317689531e-05, + "loss": 4.4753, "step": 520 }, { - "epoch": 1.8945454545454545, - "grad_norm": 3.4585793018341064, - "learning_rate": 1.8484848484848487e-05, - "loss": 4.4892, + "epoch": 1.8808664259927799, + "grad_norm": 4.75285530090332, + "learning_rate": 1.871239470517449e-05, + "loss": 4.4204, "step": 521 }, { - "epoch": 1.8981818181818182, - "grad_norm": 4.047860145568848, - "learning_rate": 1.8424242424242425e-05, - "loss": 4.4238, + "epoch": 1.884476534296029, + "grad_norm": 4.635976791381836, + "learning_rate": 1.865222623345367e-05, + "loss": 4.4292, "step": 522 }, { - "epoch": 1.9018181818181819, - "grad_norm": 3.6828198432922363, - "learning_rate": 1.8363636363636364e-05, - "loss": 4.4123, + "epoch": 1.888086642599278, + "grad_norm": 3.461372137069702, + "learning_rate": 1.8592057761732854e-05, + "loss": 4.2734, "step": 523 }, { - "epoch": 1.9054545454545453, - "grad_norm": 3.2874972820281982, - "learning_rate": 1.8303030303030305e-05, - "loss": 4.4527, + "epoch": 1.891696750902527, + "grad_norm": 7.233281135559082, + "learning_rate": 1.8531889290012034e-05, + "loss": 4.6022, "step": 524 }, { - "epoch": 1.9090909090909092, - "grad_norm": 2.8080523014068604, - "learning_rate": 1.8242424242424244e-05, - "loss": 4.3722, + "epoch": 1.895306859205776, + "grad_norm": 6.590332508087158, + "learning_rate": 1.8471720818291215e-05, + "loss": 4.5391, "step": 525 }, { - "epoch": 1.9127272727272726, - "grad_norm": 3.2030367851257324, - "learning_rate": 1.8181818181818182e-05, - "loss": 4.5094, + "epoch": 1.8989169675090252, + "grad_norm": 6.628108024597168, + "learning_rate": 1.84115523465704e-05, + "loss": 4.6294, "step": 526 }, { - "epoch": 1.9163636363636365, - "grad_norm": 4.297696590423584, - "learning_rate": 1.8121212121212124e-05, - "loss": 4.392, + "epoch": 1.9025270758122743, + "grad_norm": 6.2744460105896, + "learning_rate": 1.835138387484958e-05, + "loss": 4.5166, "step": 527 }, { - "epoch": 1.92, - "grad_norm": 6.101680755615234, - "learning_rate": 1.8060606060606062e-05, - "loss": 4.502, + "epoch": 1.9061371841155235, + "grad_norm": 8.246423721313477, + "learning_rate": 1.8291215403128763e-05, + "loss": 4.4344, "step": 528 }, { - "epoch": 1.9236363636363636, - "grad_norm": 4.8301215171813965, - "learning_rate": 1.8e-05, - "loss": 4.4693, + "epoch": 1.9097472924187726, + "grad_norm": 5.218391418457031, + "learning_rate": 1.8231046931407943e-05, + "loss": 4.4208, "step": 529 }, { - "epoch": 1.9272727272727272, - "grad_norm": 4.521356582641602, - "learning_rate": 1.793939393939394e-05, - "loss": 4.478, + "epoch": 1.9133574007220218, + "grad_norm": 2.6028380393981934, + "learning_rate": 1.8170878459687124e-05, + "loss": 4.4997, "step": 530 }, { - "epoch": 1.930909090909091, - "grad_norm": 3.493539571762085, - "learning_rate": 1.787878787878788e-05, - "loss": 4.4816, + "epoch": 1.916967509025271, + "grad_norm": 8.268327713012695, + "learning_rate": 1.8110709987966308e-05, + "loss": 4.3713, "step": 531 }, { - "epoch": 1.9345454545454546, - "grad_norm": 3.7430574893951416, - "learning_rate": 1.781818181818182e-05, - "loss": 4.5463, + "epoch": 1.9205776173285198, + "grad_norm": 4.178406238555908, + "learning_rate": 1.805054151624549e-05, + "loss": 4.4769, "step": 532 }, { - "epoch": 1.9381818181818182, - "grad_norm": 3.7508163452148438, - "learning_rate": 1.775757575757576e-05, - "loss": 4.293, + "epoch": 1.924187725631769, + "grad_norm": 3.8857920169830322, + "learning_rate": 1.799037304452467e-05, + "loss": 4.6029, "step": 533 }, { - "epoch": 1.9418181818181819, - "grad_norm": 6.832218170166016, - "learning_rate": 1.76969696969697e-05, - "loss": 4.5343, + "epoch": 1.927797833935018, + "grad_norm": 3.7289071083068848, + "learning_rate": 1.7930204572803853e-05, + "loss": 4.3475, "step": 534 }, { - "epoch": 1.9454545454545453, - "grad_norm": 3.9975361824035645, - "learning_rate": 1.7636363636363637e-05, - "loss": 4.3917, + "epoch": 1.931407942238267, + "grad_norm": 3.547478199005127, + "learning_rate": 1.7870036101083033e-05, + "loss": 4.4758, "step": 535 }, { - "epoch": 1.9490909090909092, - "grad_norm": 3.4691479206085205, - "learning_rate": 1.7575757575757576e-05, - "loss": 4.4656, + "epoch": 1.9350180505415162, + "grad_norm": 4.167591571807861, + "learning_rate": 1.7809867629362214e-05, + "loss": 4.4301, "step": 536 }, { - "epoch": 1.9527272727272726, - "grad_norm": 4.352736949920654, - "learning_rate": 1.7515151515151514e-05, - "loss": 4.3993, + "epoch": 1.9386281588447654, + "grad_norm": 3.6818253993988037, + "learning_rate": 1.7749699157641398e-05, + "loss": 4.3256, "step": 537 }, { - "epoch": 1.9563636363636365, - "grad_norm": 4.212103843688965, - "learning_rate": 1.7454545454545456e-05, - "loss": 4.3395, + "epoch": 1.9422382671480145, + "grad_norm": 6.802430152893066, + "learning_rate": 1.768953068592058e-05, + "loss": 4.4358, "step": 538 }, { - "epoch": 1.96, - "grad_norm": 4.381357192993164, - "learning_rate": 1.7393939393939397e-05, - "loss": 4.3113, + "epoch": 1.9458483754512637, + "grad_norm": 7.140280723571777, + "learning_rate": 1.762936221419976e-05, + "loss": 4.4219, "step": 539 }, { - "epoch": 1.9636363636363636, - "grad_norm": 4.0517706871032715, - "learning_rate": 1.7333333333333336e-05, - "loss": 4.4248, + "epoch": 1.9494584837545126, + "grad_norm": 7.254362106323242, + "learning_rate": 1.7569193742478943e-05, + "loss": 4.4482, "step": 540 }, { - "epoch": 1.9672727272727273, - "grad_norm": 3.827972650527954, - "learning_rate": 1.7272727272727274e-05, - "loss": 4.2242, + "epoch": 1.9530685920577617, + "grad_norm": 4.236722469329834, + "learning_rate": 1.7509025270758123e-05, + "loss": 4.624, "step": 541 }, { - "epoch": 1.970909090909091, - "grad_norm": 3.4165265560150146, - "learning_rate": 1.7212121212121212e-05, - "loss": 4.4312, + "epoch": 1.9566787003610109, + "grad_norm": 5.225027561187744, + "learning_rate": 1.7448856799037304e-05, + "loss": 4.3267, "step": 542 }, { - "epoch": 1.9745454545454546, - "grad_norm": 15.285648345947266, - "learning_rate": 1.715151515151515e-05, - "loss": 4.5811, + "epoch": 1.9602888086642598, + "grad_norm": 6.063048362731934, + "learning_rate": 1.7388688327316488e-05, + "loss": 4.2722, "step": 543 }, { - "epoch": 1.978181818181818, - "grad_norm": 19.892024993896484, - "learning_rate": 1.7090909090909092e-05, - "loss": 4.467, + "epoch": 1.963898916967509, + "grad_norm": 3.3228302001953125, + "learning_rate": 1.7328519855595668e-05, + "loss": 4.4031, "step": 544 }, { - "epoch": 1.981818181818182, - "grad_norm": 7.771790981292725, - "learning_rate": 1.703030303030303e-05, - "loss": 4.4484, + "epoch": 1.967509025270758, + "grad_norm": 5.654480934143066, + "learning_rate": 1.7268351383874852e-05, + "loss": 4.5082, "step": 545 }, { - "epoch": 1.9854545454545454, - "grad_norm": 4.017660140991211, - "learning_rate": 1.6969696969696972e-05, - "loss": 4.3429, + "epoch": 1.9711191335740073, + "grad_norm": 3.153670072555542, + "learning_rate": 1.7208182912154033e-05, + "loss": 4.547, "step": 546 }, { - "epoch": 1.9890909090909092, - "grad_norm": 5.259058475494385, - "learning_rate": 1.690909090909091e-05, - "loss": 4.4623, + "epoch": 1.9747292418772564, + "grad_norm": 3.0730340480804443, + "learning_rate": 1.7148014440433213e-05, + "loss": 4.3911, "step": 547 }, { - "epoch": 1.9927272727272727, - "grad_norm": 10.823943138122559, - "learning_rate": 1.684848484848485e-05, - "loss": 4.5293, + "epoch": 1.9783393501805056, + "grad_norm": 6.021193981170654, + "learning_rate": 1.7087845968712397e-05, + "loss": 4.7361, "step": 548 }, { - "epoch": 1.9963636363636363, - "grad_norm": 6.573860168457031, - "learning_rate": 1.6787878787878787e-05, - "loss": 4.6044, + "epoch": 1.9819494584837545, + "grad_norm": 4.3618550300598145, + "learning_rate": 1.7027677496991578e-05, + "loss": 4.4272, "step": 549 }, { - "epoch": 2.0, - "grad_norm": 5.922011375427246, - "learning_rate": 1.672727272727273e-05, - "loss": 4.3309, + "epoch": 1.9855595667870036, + "grad_norm": 6.582357883453369, + "learning_rate": 1.6967509025270758e-05, + "loss": 4.4994, "step": 550 }, { - "epoch": 2.0036363636363634, - "grad_norm": 3.63747501373291, - "learning_rate": 1.6666666666666667e-05, - "loss": 4.4179, + "epoch": 1.9891696750902526, + "grad_norm": 3.4479753971099854, + "learning_rate": 1.6907340553549942e-05, + "loss": 4.6324, "step": 551 }, { - "epoch": 2.0072727272727273, - "grad_norm": 6.124152183532715, - "learning_rate": 1.6606060606060606e-05, - "loss": 4.612, + "epoch": 1.9927797833935017, + "grad_norm": 6.284206867218018, + "learning_rate": 1.6847172081829123e-05, + "loss": 4.5219, "step": 552 }, { - "epoch": 2.0109090909090908, - "grad_norm": 4.838201522827148, - "learning_rate": 1.6545454545454548e-05, - "loss": 4.4896, + "epoch": 1.9963898916967509, + "grad_norm": 8.223616600036621, + "learning_rate": 1.6787003610108303e-05, + "loss": 4.4365, "step": 553 }, { - "epoch": 2.0145454545454546, - "grad_norm": 5.851705074310303, - "learning_rate": 1.6484848484848486e-05, - "loss": 4.3424, + "epoch": 2.0, + "grad_norm": 4.31699275970459, + "learning_rate": 1.6726835138387487e-05, + "loss": 4.2858, "step": 554 }, { - "epoch": 2.018181818181818, - "grad_norm": 4.73307991027832, - "learning_rate": 1.6424242424242424e-05, - "loss": 4.4876, + "epoch": 2.003610108303249, + "grad_norm": 4.425889015197754, + "learning_rate": 1.6666666666666667e-05, + "loss": 4.4514, "step": 555 }, { - "epoch": 2.021818181818182, - "grad_norm": 3.129345417022705, - "learning_rate": 1.6363636363636366e-05, - "loss": 4.4337, + "epoch": 2.0072202166064983, + "grad_norm": 4.568549633026123, + "learning_rate": 1.6606498194945848e-05, + "loss": 4.4904, "step": 556 }, { - "epoch": 2.0254545454545454, - "grad_norm": 4.917591094970703, - "learning_rate": 1.6303030303030304e-05, - "loss": 4.6596, + "epoch": 2.0108303249097474, + "grad_norm": 4.659919261932373, + "learning_rate": 1.6546329723225032e-05, + "loss": 4.3659, "step": 557 }, { - "epoch": 2.0290909090909093, - "grad_norm": 6.358683109283447, - "learning_rate": 1.6242424242424243e-05, - "loss": 4.5853, + "epoch": 2.0144404332129966, + "grad_norm": 3.3128418922424316, + "learning_rate": 1.6486161251504212e-05, + "loss": 4.5661, "step": 558 }, { - "epoch": 2.0327272727272727, - "grad_norm": 4.329516887664795, - "learning_rate": 1.618181818181818e-05, - "loss": 4.3207, + "epoch": 2.0180505415162453, + "grad_norm": 5.247345924377441, + "learning_rate": 1.6425992779783393e-05, + "loss": 4.5473, "step": 559 }, { - "epoch": 2.036363636363636, - "grad_norm": 2.4272570610046387, - "learning_rate": 1.6121212121212123e-05, - "loss": 4.3553, + "epoch": 2.0216606498194944, + "grad_norm": 2.952227830886841, + "learning_rate": 1.6365824308062577e-05, + "loss": 4.4017, "step": 560 }, { - "epoch": 2.04, - "grad_norm": 3.0999836921691895, - "learning_rate": 1.606060606060606e-05, - "loss": 4.5796, + "epoch": 2.0252707581227436, + "grad_norm": 4.103919506072998, + "learning_rate": 1.6305655836341757e-05, + "loss": 4.5626, "step": 561 }, { - "epoch": 2.0436363636363635, - "grad_norm": 4.398182392120361, - "learning_rate": 1.6000000000000003e-05, - "loss": 4.4053, + "epoch": 2.0288808664259927, + "grad_norm": 3.0998098850250244, + "learning_rate": 1.624548736462094e-05, + "loss": 4.6575, "step": 562 }, { - "epoch": 2.0472727272727274, - "grad_norm": 4.124515533447266, - "learning_rate": 1.593939393939394e-05, - "loss": 4.5866, + "epoch": 2.032490974729242, + "grad_norm": 3.9067933559417725, + "learning_rate": 1.618531889290012e-05, + "loss": 4.5003, "step": 563 }, { - "epoch": 2.050909090909091, - "grad_norm": 10.85811710357666, - "learning_rate": 1.587878787878788e-05, - "loss": 4.5517, + "epoch": 2.036101083032491, + "grad_norm": 5.084564208984375, + "learning_rate": 1.6125150421179302e-05, + "loss": 4.593, "step": 564 }, { - "epoch": 2.0545454545454547, - "grad_norm": 3.445768117904663, - "learning_rate": 1.5818181818181818e-05, - "loss": 4.3176, + "epoch": 2.03971119133574, + "grad_norm": 2.939908027648926, + "learning_rate": 1.6064981949458486e-05, + "loss": 4.4549, "step": 565 }, { - "epoch": 2.058181818181818, - "grad_norm": 3.5251712799072266, - "learning_rate": 1.5757575757575756e-05, - "loss": 4.3987, + "epoch": 2.0433212996389893, + "grad_norm": 4.826956272125244, + "learning_rate": 1.6004813477737667e-05, + "loss": 4.5792, "step": 566 }, { - "epoch": 2.061818181818182, - "grad_norm": 4.528774738311768, - "learning_rate": 1.5696969696969698e-05, - "loss": 4.5358, + "epoch": 2.046931407942238, + "grad_norm": 4.092348575592041, + "learning_rate": 1.5944645006016847e-05, + "loss": 4.5905, "step": 567 }, { - "epoch": 2.0654545454545454, - "grad_norm": 8.901700019836426, - "learning_rate": 1.563636363636364e-05, - "loss": 4.6986, + "epoch": 2.050541516245487, + "grad_norm": 5.71950626373291, + "learning_rate": 1.588447653429603e-05, + "loss": 4.429, "step": 568 }, { - "epoch": 2.0690909090909093, - "grad_norm": 4.355597019195557, - "learning_rate": 1.5575757575757578e-05, - "loss": 4.3603, + "epoch": 2.0541516245487363, + "grad_norm": 3.808638572692871, + "learning_rate": 1.582430806257521e-05, + "loss": 4.4797, "step": 569 }, { - "epoch": 2.0727272727272728, - "grad_norm": 3.189401149749756, - "learning_rate": 1.5515151515151516e-05, - "loss": 4.3137, + "epoch": 2.0577617328519855, + "grad_norm": 4.150345325469971, + "learning_rate": 1.5764139590854392e-05, + "loss": 4.5639, "step": 570 }, { - "epoch": 2.076363636363636, - "grad_norm": 5.457395076751709, - "learning_rate": 1.5454545454545454e-05, - "loss": 4.5963, + "epoch": 2.0613718411552346, + "grad_norm": 4.915743350982666, + "learning_rate": 1.5703971119133576e-05, + "loss": 4.4055, "step": 571 }, { - "epoch": 2.08, - "grad_norm": 5.1099934577941895, - "learning_rate": 1.5393939393939393e-05, - "loss": 4.3938, + "epoch": 2.064981949458484, + "grad_norm": 5.808039665222168, + "learning_rate": 1.5643802647412757e-05, + "loss": 4.521, "step": 572 }, { - "epoch": 2.0836363636363635, - "grad_norm": 4.458206653594971, - "learning_rate": 1.5333333333333334e-05, - "loss": 4.3399, + "epoch": 2.068592057761733, + "grad_norm": 5.758335113525391, + "learning_rate": 1.5583634175691937e-05, + "loss": 4.5548, "step": 573 }, { - "epoch": 2.0872727272727274, - "grad_norm": 3.7500548362731934, - "learning_rate": 1.5272727272727276e-05, - "loss": 4.2681, + "epoch": 2.072202166064982, + "grad_norm": 5.1010422706604, + "learning_rate": 1.552346570397112e-05, + "loss": 4.3622, "step": 574 }, { - "epoch": 2.090909090909091, - "grad_norm": 3.342282295227051, - "learning_rate": 1.5212121212121213e-05, - "loss": 4.4396, + "epoch": 2.0758122743682312, + "grad_norm": 4.795560836791992, + "learning_rate": 1.54632972322503e-05, + "loss": 4.5221, "step": 575 }, { - "epoch": 2.0945454545454547, - "grad_norm": 5.0651092529296875, - "learning_rate": 1.5151515151515153e-05, - "loss": 4.5387, + "epoch": 2.07942238267148, + "grad_norm": 3.5267210006713867, + "learning_rate": 1.5403128760529485e-05, + "loss": 4.4669, "step": 576 }, { - "epoch": 2.098181818181818, - "grad_norm": 4.204915523529053, - "learning_rate": 1.5090909090909091e-05, - "loss": 4.4134, + "epoch": 2.083032490974729, + "grad_norm": 4.934731960296631, + "learning_rate": 1.5342960288808663e-05, + "loss": 4.4068, "step": 577 }, { - "epoch": 2.101818181818182, - "grad_norm": 3.690837860107422, - "learning_rate": 1.5030303030303031e-05, - "loss": 4.3563, + "epoch": 2.0866425992779782, + "grad_norm": 2.6810762882232666, + "learning_rate": 1.5282791817087847e-05, + "loss": 4.6031, "step": 578 }, { - "epoch": 2.1054545454545455, - "grad_norm": 3.8858180046081543, - "learning_rate": 1.496969696969697e-05, - "loss": 4.5194, + "epoch": 2.0902527075812274, + "grad_norm": 3.5045320987701416, + "learning_rate": 1.5222623345367029e-05, + "loss": 4.4259, "step": 579 }, { - "epoch": 2.109090909090909, - "grad_norm": 4.591732501983643, - "learning_rate": 1.4909090909090908e-05, - "loss": 4.5678, + "epoch": 2.0938628158844765, + "grad_norm": 4.098846435546875, + "learning_rate": 1.516245487364621e-05, + "loss": 4.3841, "step": 580 }, { - "epoch": 2.112727272727273, - "grad_norm": 2.2091257572174072, - "learning_rate": 1.484848484848485e-05, - "loss": 4.4385, + "epoch": 2.0974729241877257, + "grad_norm": 2.795208215713501, + "learning_rate": 1.5102286401925391e-05, + "loss": 4.4416, "step": 581 }, { - "epoch": 2.1163636363636362, - "grad_norm": 4.992697715759277, - "learning_rate": 1.478787878787879e-05, - "loss": 4.3667, + "epoch": 2.101083032490975, + "grad_norm": 5.189493179321289, + "learning_rate": 1.5042117930204574e-05, + "loss": 4.454, "step": 582 }, { - "epoch": 2.12, - "grad_norm": 4.349731922149658, - "learning_rate": 1.4727272727272728e-05, - "loss": 4.4421, + "epoch": 2.104693140794224, + "grad_norm": 3.3796346187591553, + "learning_rate": 1.4981949458483754e-05, + "loss": 4.581, "step": 583 }, { - "epoch": 2.1236363636363635, - "grad_norm": 3.431900978088379, - "learning_rate": 1.4666666666666668e-05, - "loss": 4.6203, + "epoch": 2.108303249097473, + "grad_norm": 3.021270513534546, + "learning_rate": 1.4921780986762936e-05, + "loss": 4.5721, "step": 584 }, { - "epoch": 2.1272727272727274, - "grad_norm": 7.2620062828063965, - "learning_rate": 1.4606060606060606e-05, - "loss": 4.4006, + "epoch": 2.111913357400722, + "grad_norm": 4.579837799072266, + "learning_rate": 1.486161251504212e-05, + "loss": 4.3886, "step": 585 }, { - "epoch": 2.130909090909091, - "grad_norm": 3.902266263961792, - "learning_rate": 1.4545454545454545e-05, - "loss": 4.6113, + "epoch": 2.115523465703971, + "grad_norm": 5.8261566162109375, + "learning_rate": 1.4801444043321299e-05, + "loss": 4.4218, "step": 586 }, { - "epoch": 2.1345454545454547, - "grad_norm": 3.3681981563568115, - "learning_rate": 1.4484848484848485e-05, - "loss": 4.212, + "epoch": 2.11913357400722, + "grad_norm": 3.2095069885253906, + "learning_rate": 1.4741275571600483e-05, + "loss": 4.3596, "step": 587 }, { - "epoch": 2.138181818181818, - "grad_norm": 3.5023534297943115, - "learning_rate": 1.4424242424242426e-05, - "loss": 4.23, + "epoch": 2.1227436823104693, + "grad_norm": 3.292442560195923, + "learning_rate": 1.4681107099879662e-05, + "loss": 4.4458, "step": 588 }, { - "epoch": 2.1418181818181816, - "grad_norm": 4.29148530960083, - "learning_rate": 1.4363636363636365e-05, - "loss": 4.5829, + "epoch": 2.1263537906137184, + "grad_norm": 5.677742958068848, + "learning_rate": 1.4620938628158846e-05, + "loss": 4.6095, "step": 589 }, { - "epoch": 2.1454545454545455, - "grad_norm": 2.9856648445129395, - "learning_rate": 1.4303030303030305e-05, - "loss": 4.4755, + "epoch": 2.1299638989169676, + "grad_norm": 3.2105185985565186, + "learning_rate": 1.4560770156438028e-05, + "loss": 4.333, "step": 590 }, { - "epoch": 2.149090909090909, - "grad_norm": 2.5396695137023926, - "learning_rate": 1.4242424242424243e-05, - "loss": 4.3903, + "epoch": 2.1335740072202167, + "grad_norm": 3.4787943363189697, + "learning_rate": 1.4500601684717209e-05, + "loss": 4.6703, "step": 591 }, { - "epoch": 2.152727272727273, - "grad_norm": 2.6252663135528564, - "learning_rate": 1.4181818181818181e-05, - "loss": 4.2966, + "epoch": 2.137184115523466, + "grad_norm": 2.8096866607666016, + "learning_rate": 1.444043321299639e-05, + "loss": 4.312, "step": 592 }, { - "epoch": 2.1563636363636363, - "grad_norm": 2.7963287830352783, - "learning_rate": 1.4121212121212121e-05, - "loss": 4.4912, + "epoch": 2.140794223826715, + "grad_norm": 3.680739164352417, + "learning_rate": 1.4380264741275573e-05, + "loss": 4.438, "step": 593 }, { - "epoch": 2.16, - "grad_norm": 4.679149150848389, - "learning_rate": 1.406060606060606e-05, - "loss": 4.6129, + "epoch": 2.1444043321299637, + "grad_norm": 3.65415620803833, + "learning_rate": 1.4320096269554753e-05, + "loss": 4.4032, "step": 594 }, { - "epoch": 2.1636363636363636, - "grad_norm": 4.0482869148254395, - "learning_rate": 1.4000000000000001e-05, - "loss": 4.5545, + "epoch": 2.148014440433213, + "grad_norm": 3.834942102432251, + "learning_rate": 1.4259927797833936e-05, + "loss": 4.499, "step": 595 }, { - "epoch": 2.1672727272727275, - "grad_norm": 2.992244005203247, - "learning_rate": 1.3939393939393942e-05, - "loss": 4.3336, + "epoch": 2.151624548736462, + "grad_norm": 2.716878890991211, + "learning_rate": 1.4199759326113118e-05, + "loss": 4.4475, "step": 596 }, { - "epoch": 2.170909090909091, - "grad_norm": 3.422591209411621, - "learning_rate": 1.387878787878788e-05, - "loss": 4.3752, + "epoch": 2.155234657039711, + "grad_norm": 3.9994101524353027, + "learning_rate": 1.4139590854392298e-05, + "loss": 4.4935, "step": 597 }, { - "epoch": 2.174545454545455, - "grad_norm": 3.826042652130127, - "learning_rate": 1.3818181818181818e-05, - "loss": 4.2918, + "epoch": 2.1588447653429603, + "grad_norm": 7.912623882293701, + "learning_rate": 1.407942238267148e-05, + "loss": 4.5646, "step": 598 }, { - "epoch": 2.178181818181818, - "grad_norm": 3.569441795349121, - "learning_rate": 1.3757575757575758e-05, - "loss": 4.4492, + "epoch": 2.1624548736462095, + "grad_norm": 5.011529445648193, + "learning_rate": 1.4019253910950665e-05, + "loss": 4.4788, "step": 599 }, { - "epoch": 2.1818181818181817, - "grad_norm": 3.8144829273223877, - "learning_rate": 1.3696969696969697e-05, - "loss": 4.3355, + "epoch": 2.1660649819494586, + "grad_norm": 4.918196201324463, + "learning_rate": 1.3959085439229843e-05, + "loss": 4.5599, "step": 600 }, { - "epoch": 2.1854545454545455, - "grad_norm": 3.672569990158081, - "learning_rate": 1.3636363636363637e-05, - "loss": 4.4492, + "epoch": 2.1696750902527078, + "grad_norm": 3.1637089252471924, + "learning_rate": 1.3898916967509026e-05, + "loss": 4.6395, "step": 601 }, { - "epoch": 2.189090909090909, - "grad_norm": 3.1186609268188477, - "learning_rate": 1.3575757575757578e-05, - "loss": 4.4051, + "epoch": 2.1732851985559565, + "grad_norm": 4.921415328979492, + "learning_rate": 1.3838748495788206e-05, + "loss": 4.3406, "step": 602 }, { - "epoch": 2.192727272727273, - "grad_norm": 2.243263006210327, - "learning_rate": 1.3515151515151517e-05, - "loss": 4.3317, + "epoch": 2.1768953068592056, + "grad_norm": 3.5986857414245605, + "learning_rate": 1.3778580024067388e-05, + "loss": 4.6468, "step": 603 }, { - "epoch": 2.1963636363636363, - "grad_norm": 3.677445650100708, - "learning_rate": 1.3454545454545457e-05, - "loss": 4.3382, + "epoch": 2.1805054151624548, + "grad_norm": 4.023871898651123, + "learning_rate": 1.3718411552346572e-05, + "loss": 4.4085, "step": 604 }, { - "epoch": 2.2, - "grad_norm": 2.7815980911254883, - "learning_rate": 1.3393939393939395e-05, - "loss": 4.4145, + "epoch": 2.184115523465704, + "grad_norm": 3.7364799976348877, + "learning_rate": 1.3658243080625751e-05, + "loss": 4.3773, "step": 605 }, { - "epoch": 2.2036363636363636, - "grad_norm": 4.602207183837891, - "learning_rate": 1.3333333333333333e-05, - "loss": 4.6324, + "epoch": 2.187725631768953, + "grad_norm": 3.5880355834960938, + "learning_rate": 1.3598074608904935e-05, + "loss": 4.3806, "step": 606 }, { - "epoch": 2.207272727272727, - "grad_norm": 4.399771690368652, - "learning_rate": 1.3272727272727273e-05, - "loss": 4.5507, + "epoch": 2.191335740072202, + "grad_norm": 3.8788704872131348, + "learning_rate": 1.3537906137184117e-05, + "loss": 4.3904, "step": 607 }, { - "epoch": 2.210909090909091, - "grad_norm": 4.964302062988281, - "learning_rate": 1.3212121212121212e-05, - "loss": 4.5691, + "epoch": 2.1949458483754514, + "grad_norm": 4.633315086364746, + "learning_rate": 1.3477737665463298e-05, + "loss": 4.7456, "step": 608 }, { - "epoch": 2.2145454545454544, - "grad_norm": 6.282228946685791, - "learning_rate": 1.315151515151515e-05, - "loss": 4.5208, + "epoch": 2.1985559566787005, + "grad_norm": 5.435410499572754, + "learning_rate": 1.341756919374248e-05, + "loss": 4.5496, "step": 609 }, { - "epoch": 2.2181818181818183, - "grad_norm": 4.12927770614624, - "learning_rate": 1.3090909090909093e-05, - "loss": 4.4183, + "epoch": 2.2021660649819497, + "grad_norm": 5.3415985107421875, + "learning_rate": 1.3357400722021662e-05, + "loss": 4.6066, "step": 610 }, { - "epoch": 2.2218181818181817, - "grad_norm": 3.556312322616577, - "learning_rate": 1.3030303030303032e-05, - "loss": 4.3785, + "epoch": 2.2057761732851984, + "grad_norm": 3.8118133544921875, + "learning_rate": 1.3297232250300843e-05, + "loss": 4.4713, "step": 611 }, { - "epoch": 2.2254545454545456, - "grad_norm": 3.0980546474456787, - "learning_rate": 1.296969696969697e-05, - "loss": 4.4019, + "epoch": 2.2093862815884475, + "grad_norm": 4.059695720672607, + "learning_rate": 1.3237063778580025e-05, + "loss": 4.4897, "step": 612 }, { - "epoch": 2.229090909090909, - "grad_norm": 3.8354101181030273, - "learning_rate": 1.290909090909091e-05, - "loss": 4.424, + "epoch": 2.2129963898916967, + "grad_norm": 2.7088205814361572, + "learning_rate": 1.3176895306859207e-05, + "loss": 4.5617, "step": 613 }, { - "epoch": 2.232727272727273, - "grad_norm": 5.496599197387695, - "learning_rate": 1.2848484848484848e-05, - "loss": 4.5742, + "epoch": 2.216606498194946, + "grad_norm": 4.490012168884277, + "learning_rate": 1.3116726835138388e-05, + "loss": 4.4686, "step": 614 }, { - "epoch": 2.2363636363636363, - "grad_norm": 3.4373464584350586, - "learning_rate": 1.2787878787878787e-05, - "loss": 4.5655, + "epoch": 2.220216606498195, + "grad_norm": 3.641652822494507, + "learning_rate": 1.305655836341757e-05, + "loss": 4.3731, "step": 615 }, { - "epoch": 2.24, - "grad_norm": 2.642533779144287, - "learning_rate": 1.2727272727272727e-05, - "loss": 4.3463, + "epoch": 2.223826714801444, + "grad_norm": 3.919235944747925, + "learning_rate": 1.299638989169675e-05, + "loss": 4.3683, "step": 616 }, { - "epoch": 2.2436363636363637, - "grad_norm": 7.2525224685668945, - "learning_rate": 1.2666666666666668e-05, - "loss": 4.3718, + "epoch": 2.2274368231046933, + "grad_norm": 3.3261141777038574, + "learning_rate": 1.2936221419975933e-05, + "loss": 4.3962, "step": 617 }, { - "epoch": 2.247272727272727, - "grad_norm": 5.643246173858643, - "learning_rate": 1.2606060606060607e-05, - "loss": 4.5915, + "epoch": 2.2310469314079424, + "grad_norm": 5.451700687408447, + "learning_rate": 1.2876052948255116e-05, + "loss": 4.5434, "step": 618 }, { - "epoch": 2.250909090909091, - "grad_norm": 6.5598835945129395, - "learning_rate": 1.2545454545454547e-05, - "loss": 4.5327, + "epoch": 2.234657039711191, + "grad_norm": 4.056484699249268, + "learning_rate": 1.2815884476534295e-05, + "loss": 4.4165, "step": 619 }, { - "epoch": 2.2545454545454544, - "grad_norm": 3.6604361534118652, - "learning_rate": 1.2484848484848485e-05, - "loss": 4.3982, + "epoch": 2.2382671480144403, + "grad_norm": 2.935309648513794, + "learning_rate": 1.2755716004813477e-05, + "loss": 4.3765, "step": 620 }, { - "epoch": 2.2581818181818183, - "grad_norm": 4.660340309143066, - "learning_rate": 1.2424242424242424e-05, - "loss": 4.3316, + "epoch": 2.2418772563176894, + "grad_norm": 2.4981465339660645, + "learning_rate": 1.2695547533092661e-05, + "loss": 4.4208, "step": 621 }, { - "epoch": 2.2618181818181817, - "grad_norm": 2.6217048168182373, - "learning_rate": 1.2363636363636365e-05, - "loss": 4.4005, + "epoch": 2.2454873646209386, + "grad_norm": 4.763179302215576, + "learning_rate": 1.263537906137184e-05, + "loss": 4.3193, "step": 622 }, { - "epoch": 2.2654545454545456, - "grad_norm": 5.1133952140808105, - "learning_rate": 1.2303030303030304e-05, - "loss": 4.6143, + "epoch": 2.2490974729241877, + "grad_norm": 6.148734092712402, + "learning_rate": 1.2575210589651024e-05, + "loss": 4.4633, "step": 623 }, { - "epoch": 2.269090909090909, - "grad_norm": 4.391026020050049, - "learning_rate": 1.2242424242424242e-05, - "loss": 4.5866, + "epoch": 2.252707581227437, + "grad_norm": 4.5394744873046875, + "learning_rate": 1.2515042117930206e-05, + "loss": 4.376, "step": 624 }, { - "epoch": 2.2727272727272725, - "grad_norm": 3.473971366882324, - "learning_rate": 1.2181818181818182e-05, - "loss": 4.5554, + "epoch": 2.256317689530686, + "grad_norm": 4.363958358764648, + "learning_rate": 1.2454873646209387e-05, + "loss": 4.5333, "step": 625 }, { - "epoch": 2.2763636363636364, - "grad_norm": 3.6338303089141846, - "learning_rate": 1.2121212121212122e-05, - "loss": 4.5832, + "epoch": 2.259927797833935, + "grad_norm": 3.6951801776885986, + "learning_rate": 1.2394705174488569e-05, + "loss": 4.4411, "step": 626 }, { - "epoch": 2.2800000000000002, - "grad_norm": 3.404435157775879, - "learning_rate": 1.2060606060606062e-05, - "loss": 4.4887, + "epoch": 2.2635379061371843, + "grad_norm": 4.908221244812012, + "learning_rate": 1.233453670276775e-05, + "loss": 4.4036, "step": 627 }, { - "epoch": 2.2836363636363637, - "grad_norm": 5.865675449371338, - "learning_rate": 1.2e-05, - "loss": 4.5747, + "epoch": 2.2671480144404335, + "grad_norm": 6.928462505340576, + "learning_rate": 1.2274368231046932e-05, + "loss": 4.5572, "step": 628 }, { - "epoch": 2.287272727272727, - "grad_norm": 3.2075819969177246, - "learning_rate": 1.193939393939394e-05, - "loss": 4.3799, + "epoch": 2.270758122743682, + "grad_norm": 4.029715061187744, + "learning_rate": 1.2214199759326114e-05, + "loss": 4.5028, "step": 629 }, { - "epoch": 2.290909090909091, - "grad_norm": 4.172214031219482, - "learning_rate": 1.187878787878788e-05, - "loss": 4.5288, + "epoch": 2.2743682310469313, + "grad_norm": 4.783502101898193, + "learning_rate": 1.2154031287605296e-05, + "loss": 4.5988, "step": 630 }, { - "epoch": 2.2945454545454544, - "grad_norm": 5.905372142791748, - "learning_rate": 1.1818181818181819e-05, - "loss": 4.3566, + "epoch": 2.2779783393501805, + "grad_norm": 5.311758041381836, + "learning_rate": 1.2093862815884477e-05, + "loss": 4.3549, "step": 631 }, { - "epoch": 2.2981818181818183, - "grad_norm": 3.021374225616455, - "learning_rate": 1.1757575757575757e-05, - "loss": 4.2457, + "epoch": 2.2815884476534296, + "grad_norm": 5.8430094718933105, + "learning_rate": 1.2033694344163659e-05, + "loss": 4.4828, "step": 632 }, { - "epoch": 2.3018181818181818, - "grad_norm": 6.168385982513428, - "learning_rate": 1.1696969696969699e-05, - "loss": 4.5722, + "epoch": 2.2851985559566788, + "grad_norm": 2.834203004837036, + "learning_rate": 1.1973525872442841e-05, + "loss": 4.4867, "step": 633 }, { - "epoch": 2.3054545454545456, - "grad_norm": 4.84739351272583, - "learning_rate": 1.1636363636363637e-05, - "loss": 4.5149, + "epoch": 2.288808664259928, + "grad_norm": 3.2368834018707275, + "learning_rate": 1.1913357400722022e-05, + "loss": 4.5147, "step": 634 }, { - "epoch": 2.309090909090909, - "grad_norm": 5.882564067840576, - "learning_rate": 1.1575757575757575e-05, - "loss": 4.6114, + "epoch": 2.292418772563177, + "grad_norm": 4.3523359298706055, + "learning_rate": 1.1853188929001204e-05, + "loss": 4.3972, "step": 635 }, { - "epoch": 2.3127272727272725, - "grad_norm": 3.714647054672241, - "learning_rate": 1.1515151515151517e-05, - "loss": 4.5177, + "epoch": 2.2960288808664258, + "grad_norm": 4.379057884216309, + "learning_rate": 1.1793020457280386e-05, + "loss": 4.3681, "step": 636 }, { - "epoch": 2.3163636363636364, - "grad_norm": 3.449841260910034, - "learning_rate": 1.1454545454545455e-05, - "loss": 4.5713, + "epoch": 2.299638989169675, + "grad_norm": 6.070941925048828, + "learning_rate": 1.1732851985559568e-05, + "loss": 4.4543, "step": 637 }, { - "epoch": 2.32, - "grad_norm": 6.447396755218506, - "learning_rate": 1.1393939393939394e-05, - "loss": 4.501, + "epoch": 2.303249097472924, + "grad_norm": 4.886351585388184, + "learning_rate": 1.1672683513838749e-05, + "loss": 4.4359, "step": 638 }, { - "epoch": 2.3236363636363637, - "grad_norm": 3.258431911468506, - "learning_rate": 1.1333333333333334e-05, - "loss": 4.5533, + "epoch": 2.306859205776173, + "grad_norm": 3.127772808074951, + "learning_rate": 1.161251504211793e-05, + "loss": 4.463, "step": 639 }, { - "epoch": 2.327272727272727, - "grad_norm": 4.90877103805542, - "learning_rate": 1.1272727272727274e-05, - "loss": 4.3269, + "epoch": 2.3104693140794224, + "grad_norm": 3.3918819427490234, + "learning_rate": 1.1552346570397113e-05, + "loss": 4.5662, "step": 640 }, { - "epoch": 2.330909090909091, - "grad_norm": 4.926637172698975, - "learning_rate": 1.1212121212121212e-05, - "loss": 4.395, + "epoch": 2.3140794223826715, + "grad_norm": 3.3989920616149902, + "learning_rate": 1.1492178098676294e-05, + "loss": 4.4219, "step": 641 }, { - "epoch": 2.3345454545454545, - "grad_norm": 4.344498634338379, - "learning_rate": 1.1151515151515152e-05, - "loss": 4.2733, + "epoch": 2.3176895306859207, + "grad_norm": 3.8738067150115967, + "learning_rate": 1.1432009626955476e-05, + "loss": 4.5089, "step": 642 }, { - "epoch": 2.3381818181818184, - "grad_norm": 2.261861562728882, - "learning_rate": 1.1090909090909092e-05, - "loss": 4.3516, + "epoch": 2.32129963898917, + "grad_norm": 4.018595218658447, + "learning_rate": 1.1371841155234658e-05, + "loss": 4.6628, "step": 643 }, { - "epoch": 2.341818181818182, - "grad_norm": 3.712153434753418, - "learning_rate": 1.103030303030303e-05, - "loss": 4.2867, + "epoch": 2.324909747292419, + "grad_norm": 4.790584087371826, + "learning_rate": 1.1311672683513839e-05, + "loss": 4.5279, "step": 644 }, { - "epoch": 2.3454545454545457, - "grad_norm": 2.1240344047546387, - "learning_rate": 1.096969696969697e-05, - "loss": 4.4041, + "epoch": 2.328519855595668, + "grad_norm": 3.1995787620544434, + "learning_rate": 1.1251504211793021e-05, + "loss": 4.6064, "step": 645 }, { - "epoch": 2.349090909090909, - "grad_norm": 5.3182053565979, - "learning_rate": 1.0909090909090909e-05, - "loss": 4.5118, + "epoch": 2.332129963898917, + "grad_norm": 2.8555033206939697, + "learning_rate": 1.1191335740072201e-05, + "loss": 4.4816, "step": 646 }, { - "epoch": 2.3527272727272726, - "grad_norm": 3.5492019653320312, - "learning_rate": 1.0848484848484849e-05, - "loss": 4.5899, + "epoch": 2.335740072202166, + "grad_norm": 3.3136844635009766, + "learning_rate": 1.1131167268351385e-05, + "loss": 4.5662, "step": 647 }, { - "epoch": 2.3563636363636364, - "grad_norm": 3.8747949600219727, - "learning_rate": 1.0787878787878789e-05, - "loss": 4.3194, + "epoch": 2.339350180505415, + "grad_norm": 3.995872735977173, + "learning_rate": 1.1070998796630566e-05, + "loss": 4.3727, "step": 648 }, { - "epoch": 2.36, - "grad_norm": 2.2515571117401123, - "learning_rate": 1.0727272727272727e-05, - "loss": 4.4239, + "epoch": 2.3429602888086642, + "grad_norm": 4.499544143676758, + "learning_rate": 1.1010830324909748e-05, + "loss": 4.478, "step": 649 }, { - "epoch": 2.3636363636363638, - "grad_norm": 3.838587760925293, - "learning_rate": 1.0666666666666667e-05, - "loss": 4.5496, + "epoch": 2.3465703971119134, + "grad_norm": 3.3210017681121826, + "learning_rate": 1.095066185318893e-05, + "loss": 4.396, "step": 650 }, { - "epoch": 2.367272727272727, - "grad_norm": 2.633479595184326, - "learning_rate": 1.0606060606060607e-05, - "loss": 4.2623, + "epoch": 2.3501805054151625, + "grad_norm": 2.9872148036956787, + "learning_rate": 1.089049338146811e-05, + "loss": 4.4108, "step": 651 }, { - "epoch": 2.370909090909091, - "grad_norm": 3.465851306915283, - "learning_rate": 1.0545454545454546e-05, - "loss": 4.3402, + "epoch": 2.3537906137184117, + "grad_norm": 5.219914436340332, + "learning_rate": 1.0830324909747293e-05, + "loss": 4.4073, "step": 652 }, { - "epoch": 2.3745454545454545, - "grad_norm": 4.0814595222473145, - "learning_rate": 1.0484848484848486e-05, - "loss": 4.2627, + "epoch": 2.357400722021661, + "grad_norm": 3.3450937271118164, + "learning_rate": 1.0770156438026474e-05, + "loss": 4.2831, "step": 653 }, { - "epoch": 2.378181818181818, - "grad_norm": 1.8538293838500977, - "learning_rate": 1.0424242424242426e-05, - "loss": 4.3685, + "epoch": 2.3610108303249095, + "grad_norm": 4.793975353240967, + "learning_rate": 1.0709987966305657e-05, + "loss": 4.6626, "step": 654 }, { - "epoch": 2.381818181818182, - "grad_norm": 3.5009477138519287, - "learning_rate": 1.0363636363636364e-05, - "loss": 4.4817, + "epoch": 2.3646209386281587, + "grad_norm": 4.04519510269165, + "learning_rate": 1.0649819494584838e-05, + "loss": 4.3976, "step": 655 }, { - "epoch": 2.3854545454545453, - "grad_norm": 5.759241580963135, - "learning_rate": 1.0303030303030304e-05, - "loss": 4.229, + "epoch": 2.368231046931408, + "grad_norm": 3.0251882076263428, + "learning_rate": 1.058965102286402e-05, + "loss": 4.4383, "step": 656 }, { - "epoch": 2.389090909090909, - "grad_norm": 8.428985595703125, - "learning_rate": 1.0242424242424242e-05, - "loss": 4.7398, + "epoch": 2.371841155234657, + "grad_norm": 3.5139055252075195, + "learning_rate": 1.0529482551143202e-05, + "loss": 4.5331, "step": 657 }, { - "epoch": 2.3927272727272726, - "grad_norm": 4.199151515960693, - "learning_rate": 1.0181818181818182e-05, - "loss": 4.2961, + "epoch": 2.375451263537906, + "grad_norm": 3.8762271404266357, + "learning_rate": 1.0469314079422383e-05, + "loss": 4.4937, "step": 658 }, { - "epoch": 2.3963636363636365, - "grad_norm": 4.035223484039307, - "learning_rate": 1.0121212121212122e-05, - "loss": 4.491, + "epoch": 2.3790613718411553, + "grad_norm": 4.291984558105469, + "learning_rate": 1.0409145607701565e-05, + "loss": 4.366, "step": 659 }, { - "epoch": 2.4, - "grad_norm": 3.7876856327056885, - "learning_rate": 1.006060606060606e-05, - "loss": 4.4866, + "epoch": 2.3826714801444044, + "grad_norm": 3.6737313270568848, + "learning_rate": 1.0348977135980746e-05, + "loss": 4.3613, "step": 660 }, { - "epoch": 2.403636363636364, - "grad_norm": 4.28821325302124, - "learning_rate": 1e-05, - "loss": 4.3898, + "epoch": 2.3862815884476536, + "grad_norm": 4.335361003875732, + "learning_rate": 1.028880866425993e-05, + "loss": 4.3535, "step": 661 }, { - "epoch": 2.4072727272727272, - "grad_norm": 5.3424391746521, - "learning_rate": 9.93939393939394e-06, - "loss": 4.3163, + "epoch": 2.3898916967509027, + "grad_norm": 2.939689874649048, + "learning_rate": 1.022864019253911e-05, + "loss": 4.6187, "step": 662 }, { - "epoch": 2.410909090909091, - "grad_norm": 3.205174207687378, - "learning_rate": 9.87878787878788e-06, - "loss": 4.4189, + "epoch": 2.3935018050541514, + "grad_norm": 2.5564160346984863, + "learning_rate": 1.016847172081829e-05, + "loss": 4.5736, "step": 663 }, { - "epoch": 2.4145454545454546, - "grad_norm": 2.7838244438171387, - "learning_rate": 9.818181818181818e-06, - "loss": 4.3889, + "epoch": 2.3971119133574006, + "grad_norm": 7.256919860839844, + "learning_rate": 1.0108303249097473e-05, + "loss": 4.6371, "step": 664 }, { - "epoch": 2.418181818181818, - "grad_norm": 4.061773777008057, - "learning_rate": 9.75757575757576e-06, - "loss": 4.5237, + "epoch": 2.4007220216606497, + "grad_norm": 5.051013946533203, + "learning_rate": 1.0048134777376655e-05, + "loss": 4.5431, "step": 665 }, { - "epoch": 2.421818181818182, - "grad_norm": 5.466277122497559, - "learning_rate": 9.696969696969698e-06, - "loss": 4.5694, + "epoch": 2.404332129963899, + "grad_norm": 4.401142597198486, + "learning_rate": 9.987966305655837e-06, + "loss": 4.419, "step": 666 }, { - "epoch": 2.4254545454545453, - "grad_norm": 3.7190425395965576, - "learning_rate": 9.636363636363636e-06, - "loss": 4.4085, + "epoch": 2.407942238267148, + "grad_norm": 2.8643991947174072, + "learning_rate": 9.927797833935018e-06, + "loss": 4.3324, "step": 667 }, { - "epoch": 2.429090909090909, - "grad_norm": 5.519341468811035, - "learning_rate": 9.575757575757578e-06, - "loss": 4.5653, + "epoch": 2.411552346570397, + "grad_norm": 7.718422889709473, + "learning_rate": 9.8676293622142e-06, + "loss": 4.3999, "step": 668 }, { - "epoch": 2.4327272727272726, - "grad_norm": 3.7906625270843506, - "learning_rate": 9.515151515151516e-06, - "loss": 4.3678, + "epoch": 2.4151624548736463, + "grad_norm": 6.474762916564941, + "learning_rate": 9.807460890493382e-06, + "loss": 4.3473, "step": 669 }, { - "epoch": 2.4363636363636365, - "grad_norm": 4.553290843963623, - "learning_rate": 9.454545454545454e-06, - "loss": 4.6579, + "epoch": 2.4187725631768955, + "grad_norm": 5.406153678894043, + "learning_rate": 9.747292418772563e-06, + "loss": 4.4933, "step": 670 }, { - "epoch": 2.44, - "grad_norm": 4.562989711761475, - "learning_rate": 9.393939393939394e-06, - "loss": 4.3674, + "epoch": 2.422382671480144, + "grad_norm": 4.858792781829834, + "learning_rate": 9.687123947051745e-06, + "loss": 4.658, "step": 671 }, { - "epoch": 2.443636363636364, - "grad_norm": 4.022787094116211, - "learning_rate": 9.333333333333334e-06, - "loss": 4.3675, + "epoch": 2.4259927797833933, + "grad_norm": 5.371998310089111, + "learning_rate": 9.626955475330927e-06, + "loss": 4.478, "step": 672 }, { - "epoch": 2.4472727272727273, - "grad_norm": 2.7684528827667236, - "learning_rate": 9.272727272727273e-06, - "loss": 4.3793, + "epoch": 2.4296028880866425, + "grad_norm": 6.951625347137451, + "learning_rate": 9.56678700361011e-06, + "loss": 4.5687, "step": 673 }, { - "epoch": 2.450909090909091, - "grad_norm": 3.1357200145721436, - "learning_rate": 9.212121212121213e-06, - "loss": 4.2807, + "epoch": 2.4332129963898916, + "grad_norm": 2.8884341716766357, + "learning_rate": 9.50661853188929e-06, + "loss": 4.3574, "step": 674 }, { - "epoch": 2.4545454545454546, - "grad_norm": 2.534762144088745, - "learning_rate": 9.151515151515153e-06, - "loss": 4.3097, + "epoch": 2.436823104693141, + "grad_norm": 4.6059699058532715, + "learning_rate": 9.446450060168472e-06, + "loss": 4.5776, "step": 675 }, { - "epoch": 2.458181818181818, - "grad_norm": 3.3176114559173584, - "learning_rate": 9.090909090909091e-06, - "loss": 4.6301, + "epoch": 2.44043321299639, + "grad_norm": 4.730526924133301, + "learning_rate": 9.386281588447654e-06, + "loss": 4.592, "step": 676 }, { - "epoch": 2.461818181818182, - "grad_norm": 3.287240505218506, - "learning_rate": 9.030303030303031e-06, - "loss": 4.5127, + "epoch": 2.444043321299639, + "grad_norm": 3.3078360557556152, + "learning_rate": 9.326113116726835e-06, + "loss": 4.5253, "step": 677 }, { - "epoch": 2.4654545454545453, - "grad_norm": 3.7666358947753906, - "learning_rate": 8.96969696969697e-06, - "loss": 4.2394, + "epoch": 2.4476534296028882, + "grad_norm": 3.702000856399536, + "learning_rate": 9.265944645006017e-06, + "loss": 4.7267, "step": 678 }, { - "epoch": 2.4690909090909092, - "grad_norm": 2.5276758670806885, - "learning_rate": 8.90909090909091e-06, - "loss": 4.3502, + "epoch": 2.4512635379061374, + "grad_norm": 3.359959363937378, + "learning_rate": 9.2057761732852e-06, + "loss": 4.4877, "step": 679 }, { - "epoch": 2.4727272727272727, - "grad_norm": 3.2294423580169678, - "learning_rate": 8.84848484848485e-06, - "loss": 4.5801, + "epoch": 2.4548736462093865, + "grad_norm": 5.103507995605469, + "learning_rate": 9.145607701564381e-06, + "loss": 4.6424, "step": 680 }, { - "epoch": 2.4763636363636365, - "grad_norm": 5.495777130126953, - "learning_rate": 8.787878787878788e-06, - "loss": 4.7307, + "epoch": 2.4584837545126352, + "grad_norm": 5.196680068969727, + "learning_rate": 9.085439229843562e-06, + "loss": 4.4706, "step": 681 }, { - "epoch": 2.48, - "grad_norm": 7.053243637084961, - "learning_rate": 8.727272727272728e-06, - "loss": 4.3913, + "epoch": 2.4620938628158844, + "grad_norm": 3.826127052307129, + "learning_rate": 9.025270758122744e-06, + "loss": 4.381, "step": 682 }, { - "epoch": 2.4836363636363634, - "grad_norm": 8.957902908325195, - "learning_rate": 8.666666666666668e-06, - "loss": 4.4562, + "epoch": 2.4657039711191335, + "grad_norm": 2.5819101333618164, + "learning_rate": 8.965102286401926e-06, + "loss": 4.3937, "step": 683 }, { - "epoch": 2.4872727272727273, - "grad_norm": 4.16687536239624, - "learning_rate": 8.606060606060606e-06, - "loss": 4.3475, + "epoch": 2.4693140794223827, + "grad_norm": 3.1093242168426514, + "learning_rate": 8.904933814681107e-06, + "loss": 4.3896, "step": 684 }, { - "epoch": 2.4909090909090907, - "grad_norm": 9.852190971374512, - "learning_rate": 8.545454545454546e-06, - "loss": 4.5089, + "epoch": 2.472924187725632, + "grad_norm": 4.864704132080078, + "learning_rate": 8.84476534296029e-06, + "loss": 4.5112, "step": 685 }, { - "epoch": 2.4945454545454546, - "grad_norm": 12.333488464355469, - "learning_rate": 8.484848484848486e-06, - "loss": 4.4475, + "epoch": 2.476534296028881, + "grad_norm": 3.471543073654175, + "learning_rate": 8.784596871239471e-06, + "loss": 4.5498, "step": 686 }, { - "epoch": 2.498181818181818, - "grad_norm": 7.947030544281006, - "learning_rate": 8.424242424242425e-06, - "loss": 4.2607, + "epoch": 2.48014440433213, + "grad_norm": 4.402616500854492, + "learning_rate": 8.724428399518652e-06, + "loss": 4.3387, "step": 687 }, { - "epoch": 2.501818181818182, - "grad_norm": 4.91619348526001, - "learning_rate": 8.363636363636365e-06, - "loss": 4.3802, + "epoch": 2.483754512635379, + "grad_norm": 6.379435062408447, + "learning_rate": 8.664259927797834e-06, + "loss": 4.5544, "step": 688 }, { - "epoch": 2.5054545454545454, - "grad_norm": 4.144915580749512, - "learning_rate": 8.303030303030303e-06, - "loss": 4.6197, + "epoch": 2.487364620938628, + "grad_norm": 6.62040901184082, + "learning_rate": 8.604091456077016e-06, + "loss": 4.4874, "step": 689 }, { - "epoch": 2.509090909090909, - "grad_norm": 4.914970874786377, - "learning_rate": 8.242424242424243e-06, - "loss": 4.4078, + "epoch": 2.490974729241877, + "grad_norm": 5.348603248596191, + "learning_rate": 8.543922984356199e-06, + "loss": 4.3741, "step": 690 }, { - "epoch": 2.5127272727272727, - "grad_norm": 5.826870918273926, - "learning_rate": 8.181818181818183e-06, - "loss": 4.3835, + "epoch": 2.4945848375451263, + "grad_norm": 2.902223825454712, + "learning_rate": 8.483754512635379e-06, + "loss": 4.3933, "step": 691 }, { - "epoch": 2.5163636363636366, - "grad_norm": 4.456042766571045, - "learning_rate": 8.121212121212121e-06, - "loss": 4.4389, + "epoch": 2.4981949458483754, + "grad_norm": 3.2909133434295654, + "learning_rate": 8.423586040914561e-06, + "loss": 4.5229, "step": 692 }, { - "epoch": 2.52, - "grad_norm": 7.257143497467041, - "learning_rate": 8.060606060606061e-06, - "loss": 4.4145, + "epoch": 2.5018050541516246, + "grad_norm": 3.990521192550659, + "learning_rate": 8.363417569193743e-06, + "loss": 4.5907, "step": 693 }, { - "epoch": 2.5236363636363635, - "grad_norm": 6.106967926025391, - "learning_rate": 8.000000000000001e-06, - "loss": 4.3769, + "epoch": 2.5054151624548737, + "grad_norm": 5.001603126525879, + "learning_rate": 8.303249097472924e-06, + "loss": 4.4247, "step": 694 }, { - "epoch": 2.5272727272727273, - "grad_norm": 3.0552563667297363, - "learning_rate": 7.93939393939394e-06, - "loss": 4.4448, + "epoch": 2.509025270758123, + "grad_norm": 4.88585090637207, + "learning_rate": 8.243080625752106e-06, + "loss": 4.4019, "step": 695 }, { - "epoch": 2.5309090909090908, - "grad_norm": 4.240468502044678, - "learning_rate": 7.878787878787878e-06, - "loss": 4.6053, + "epoch": 2.512635379061372, + "grad_norm": 3.634761333465576, + "learning_rate": 8.182912154031288e-06, + "loss": 4.3335, "step": 696 }, { - "epoch": 2.5345454545454547, - "grad_norm": 4.00745153427124, - "learning_rate": 7.81818181818182e-06, - "loss": 4.715, + "epoch": 2.516245487364621, + "grad_norm": 4.154645919799805, + "learning_rate": 8.12274368231047e-06, + "loss": 4.4616, "step": 697 }, { - "epoch": 2.538181818181818, - "grad_norm": 7.360689640045166, - "learning_rate": 7.757575757575758e-06, - "loss": 4.2768, + "epoch": 2.51985559566787, + "grad_norm": 2.943950891494751, + "learning_rate": 8.062575210589651e-06, + "loss": 4.4305, "step": 698 }, { - "epoch": 2.541818181818182, - "grad_norm": 10.462032318115234, - "learning_rate": 7.696969696969696e-06, - "loss": 4.4312, + "epoch": 2.523465703971119, + "grad_norm": 5.359757900238037, + "learning_rate": 8.002406738868833e-06, + "loss": 4.4419, "step": 699 }, { - "epoch": 2.5454545454545454, - "grad_norm": 7.483114719390869, - "learning_rate": 7.636363636363638e-06, - "loss": 4.386, + "epoch": 2.527075812274368, + "grad_norm": 3.205974817276001, + "learning_rate": 7.942238267148016e-06, + "loss": 4.4804, "step": 700 }, { - "epoch": 2.549090909090909, - "grad_norm": 2.8339669704437256, - "learning_rate": 7.5757575757575764e-06, - "loss": 4.495, + "epoch": 2.5306859205776173, + "grad_norm": 3.1769614219665527, + "learning_rate": 7.882069795427196e-06, + "loss": 4.5115, "step": 701 }, { - "epoch": 2.5527272727272727, - "grad_norm": 4.9380340576171875, - "learning_rate": 7.515151515151516e-06, - "loss": 4.3816, + "epoch": 2.5342960288808665, + "grad_norm": 3.652250051498413, + "learning_rate": 7.821901323706378e-06, + "loss": 4.2911, "step": 702 }, { - "epoch": 2.5563636363636366, - "grad_norm": 5.354694366455078, - "learning_rate": 7.454545454545454e-06, - "loss": 4.3048, + "epoch": 2.5379061371841156, + "grad_norm": 6.6311235427856445, + "learning_rate": 7.76173285198556e-06, + "loss": 4.4701, "step": 703 }, { - "epoch": 2.56, - "grad_norm": 8.832294464111328, - "learning_rate": 7.393939393939395e-06, - "loss": 4.3648, + "epoch": 2.5415162454873648, + "grad_norm": 4.466462135314941, + "learning_rate": 7.701564380264743e-06, + "loss": 4.2806, "step": 704 }, { - "epoch": 2.5636363636363635, - "grad_norm": 5.9402971267700195, - "learning_rate": 7.333333333333334e-06, - "loss": 4.2456, + "epoch": 2.5451263537906135, + "grad_norm": 2.840956687927246, + "learning_rate": 7.641395908543923e-06, + "loss": 4.3817, "step": 705 }, { - "epoch": 2.5672727272727274, - "grad_norm": 4.423623561859131, - "learning_rate": 7.272727272727272e-06, - "loss": 4.629, + "epoch": 2.5487364620938626, + "grad_norm": 3.926969289779663, + "learning_rate": 7.581227436823105e-06, + "loss": 4.4675, "step": 706 }, { - "epoch": 2.570909090909091, - "grad_norm": 3.3913474082946777, - "learning_rate": 7.212121212121213e-06, - "loss": 4.3109, + "epoch": 2.5523465703971118, + "grad_norm": 7.378762245178223, + "learning_rate": 7.521058965102287e-06, + "loss": 4.2906, "step": 707 }, { - "epoch": 2.5745454545454547, - "grad_norm": 4.365673542022705, - "learning_rate": 7.151515151515152e-06, - "loss": 4.4317, + "epoch": 2.555956678700361, + "grad_norm": 6.083559036254883, + "learning_rate": 7.460890493381468e-06, + "loss": 4.5136, "step": 708 }, { - "epoch": 2.578181818181818, - "grad_norm": 5.201317310333252, - "learning_rate": 7.090909090909091e-06, - "loss": 4.361, + "epoch": 2.55956678700361, + "grad_norm": 3.145780324935913, + "learning_rate": 7.4007220216606496e-06, + "loss": 4.3858, "step": 709 }, { - "epoch": 2.581818181818182, - "grad_norm": 8.283641815185547, - "learning_rate": 7.03030303030303e-06, - "loss": 4.4209, + "epoch": 2.563176895306859, + "grad_norm": 3.72505784034729, + "learning_rate": 7.340553549939831e-06, + "loss": 4.5418, "step": 710 }, { - "epoch": 2.5854545454545454, - "grad_norm": 6.324493885040283, - "learning_rate": 6.969696969696971e-06, - "loss": 4.572, + "epoch": 2.5667870036101084, + "grad_norm": 4.158870697021484, + "learning_rate": 7.280385078219014e-06, + "loss": 4.5686, "step": 711 }, { - "epoch": 2.589090909090909, - "grad_norm": 4.416196823120117, - "learning_rate": 6.909090909090909e-06, - "loss": 4.4043, + "epoch": 2.5703971119133575, + "grad_norm": 6.947221279144287, + "learning_rate": 7.220216606498195e-06, + "loss": 4.2351, "step": 712 }, { - "epoch": 2.5927272727272728, - "grad_norm": 3.493542194366455, - "learning_rate": 6.848484848484848e-06, - "loss": 4.3647, + "epoch": 2.5740072202166067, + "grad_norm": 4.236288070678711, + "learning_rate": 7.160048134777377e-06, + "loss": 4.451, "step": 713 }, { - "epoch": 2.596363636363636, - "grad_norm": 3.4340717792510986, - "learning_rate": 6.787878787878789e-06, - "loss": 4.354, + "epoch": 2.577617328519856, + "grad_norm": 4.43533182144165, + "learning_rate": 7.099879663056559e-06, + "loss": 4.3672, "step": 714 }, { - "epoch": 2.6, - "grad_norm": 3.536820650100708, - "learning_rate": 6.727272727272728e-06, - "loss": 4.5004, + "epoch": 2.581227436823105, + "grad_norm": 2.0811402797698975, + "learning_rate": 7.03971119133574e-06, + "loss": 4.4889, "step": 715 }, { - "epoch": 2.6036363636363635, - "grad_norm": 4.423953533172607, - "learning_rate": 6.666666666666667e-06, - "loss": 4.2383, + "epoch": 2.5848375451263537, + "grad_norm": 3.2417030334472656, + "learning_rate": 6.979542719614922e-06, + "loss": 4.3176, "step": 716 }, { - "epoch": 2.6072727272727274, - "grad_norm": 3.8500802516937256, - "learning_rate": 6.606060606060606e-06, - "loss": 4.3031, + "epoch": 2.588447653429603, + "grad_norm": 4.629251956939697, + "learning_rate": 6.919374247894103e-06, + "loss": 4.6087, "step": 717 }, { - "epoch": 2.610909090909091, - "grad_norm": 5.0357747077941895, - "learning_rate": 6.545454545454547e-06, - "loss": 4.3652, + "epoch": 2.592057761732852, + "grad_norm": 3.3105053901672363, + "learning_rate": 6.859205776173286e-06, + "loss": 4.6441, "step": 718 }, { - "epoch": 2.6145454545454543, - "grad_norm": 4.3186492919921875, - "learning_rate": 6.484848484848485e-06, - "loss": 4.5205, + "epoch": 2.595667870036101, + "grad_norm": 3.484877586364746, + "learning_rate": 6.7990373044524675e-06, + "loss": 4.5328, "step": 719 }, { - "epoch": 2.618181818181818, - "grad_norm": 4.643374443054199, - "learning_rate": 6.424242424242424e-06, - "loss": 4.3247, + "epoch": 2.5992779783393503, + "grad_norm": 3.027569532394409, + "learning_rate": 6.738868832731649e-06, + "loss": 4.5395, "step": 720 }, { - "epoch": 2.621818181818182, - "grad_norm": 7.076257228851318, - "learning_rate": 6.363636363636363e-06, - "loss": 4.4034, + "epoch": 2.6028880866425994, + "grad_norm": 3.3761277198791504, + "learning_rate": 6.678700361010831e-06, + "loss": 4.4973, "step": 721 }, { - "epoch": 2.6254545454545455, - "grad_norm": 3.3600053787231445, - "learning_rate": 6.303030303030303e-06, - "loss": 4.3745, + "epoch": 2.606498194945848, + "grad_norm": 4.028012275695801, + "learning_rate": 6.618531889290012e-06, + "loss": 4.3345, "step": 722 }, { - "epoch": 2.629090909090909, - "grad_norm": 6.4890642166137695, - "learning_rate": 6.242424242424243e-06, - "loss": 4.3592, + "epoch": 2.6101083032490973, + "grad_norm": 3.6526448726654053, + "learning_rate": 6.558363417569194e-06, + "loss": 4.5195, "step": 723 }, { - "epoch": 2.632727272727273, - "grad_norm": 8.538745880126953, - "learning_rate": 6.181818181818183e-06, - "loss": 4.3828, + "epoch": 2.6137184115523464, + "grad_norm": 3.5621955394744873, + "learning_rate": 6.498194945848375e-06, + "loss": 4.3931, "step": 724 }, { - "epoch": 2.6363636363636362, - "grad_norm": 6.684054374694824, - "learning_rate": 6.121212121212121e-06, - "loss": 4.3507, + "epoch": 2.6173285198555956, + "grad_norm": 4.076904773712158, + "learning_rate": 6.438026474127558e-06, + "loss": 4.3022, "step": 725 }, { - "epoch": 2.64, - "grad_norm": 9.749906539916992, - "learning_rate": 6.060606060606061e-06, - "loss": 4.6677, + "epoch": 2.6209386281588447, + "grad_norm": 3.3727328777313232, + "learning_rate": 6.377858002406739e-06, + "loss": 4.5659, "step": 726 }, { - "epoch": 2.6436363636363636, - "grad_norm": 2.9998209476470947, - "learning_rate": 6e-06, - "loss": 4.4207, + "epoch": 2.624548736462094, + "grad_norm": 3.9651689529418945, + "learning_rate": 6.31768953068592e-06, + "loss": 4.4302, "step": 727 }, { - "epoch": 2.6472727272727274, - "grad_norm": 4.262009143829346, - "learning_rate": 5.93939393939394e-06, - "loss": 4.5837, + "epoch": 2.628158844765343, + "grad_norm": 5.752009391784668, + "learning_rate": 6.257521058965103e-06, + "loss": 4.6044, "step": 728 }, { - "epoch": 2.650909090909091, - "grad_norm": 5.290808200836182, - "learning_rate": 5.8787878787878785e-06, - "loss": 4.5531, + "epoch": 2.631768953068592, + "grad_norm": 4.023155689239502, + "learning_rate": 6.1973525872442845e-06, + "loss": 4.562, "step": 729 }, { - "epoch": 2.6545454545454543, - "grad_norm": 6.753263473510742, - "learning_rate": 5.8181818181818185e-06, - "loss": 4.4299, + "epoch": 2.6353790613718413, + "grad_norm": 5.812756538391113, + "learning_rate": 6.137184115523466e-06, + "loss": 4.4802, "step": 730 }, { - "epoch": 2.658181818181818, - "grad_norm": 4.626419544219971, - "learning_rate": 5.7575757575757586e-06, - "loss": 4.2893, + "epoch": 2.6389891696750905, + "grad_norm": 4.468845367431641, + "learning_rate": 6.077015643802648e-06, + "loss": 4.3375, "step": 731 }, { - "epoch": 2.661818181818182, - "grad_norm": 4.211031436920166, - "learning_rate": 5.696969696969697e-06, - "loss": 4.3628, + "epoch": 2.6425992779783396, + "grad_norm": 2.6663763523101807, + "learning_rate": 6.0168471720818295e-06, + "loss": 4.455, "step": 732 }, { - "epoch": 2.6654545454545455, - "grad_norm": 4.562239170074463, - "learning_rate": 5.636363636363637e-06, - "loss": 4.5267, + "epoch": 2.6462093862815883, + "grad_norm": 3.7850840091705322, + "learning_rate": 5.956678700361011e-06, + "loss": 4.5436, "step": 733 }, { - "epoch": 2.669090909090909, - "grad_norm": 3.194110155105591, - "learning_rate": 5.575757575757576e-06, - "loss": 4.2716, + "epoch": 2.6498194945848375, + "grad_norm": 3.1074018478393555, + "learning_rate": 5.896510228640193e-06, + "loss": 4.4719, "step": 734 }, { - "epoch": 2.672727272727273, - "grad_norm": 4.2862749099731445, - "learning_rate": 5.515151515151515e-06, - "loss": 4.4384, + "epoch": 2.6534296028880866, + "grad_norm": 2.576409339904785, + "learning_rate": 5.836341756919374e-06, + "loss": 4.5036, "step": 735 }, { - "epoch": 2.6763636363636363, - "grad_norm": 3.666160821914673, - "learning_rate": 5.4545454545454545e-06, - "loss": 4.4241, + "epoch": 2.6570397111913358, + "grad_norm": 5.051802158355713, + "learning_rate": 5.776173285198557e-06, + "loss": 4.3509, "step": 736 }, { - "epoch": 2.68, - "grad_norm": 6.476161003112793, - "learning_rate": 5.3939393939393945e-06, - "loss": 4.402, + "epoch": 2.660649819494585, + "grad_norm": 4.346753120422363, + "learning_rate": 5.716004813477738e-06, + "loss": 4.3802, "step": 737 }, { - "epoch": 2.6836363636363636, - "grad_norm": 6.069520950317383, - "learning_rate": 5.333333333333334e-06, - "loss": 4.5149, + "epoch": 2.664259927797834, + "grad_norm": 3.5581212043762207, + "learning_rate": 5.655836341756919e-06, + "loss": 4.6375, "step": 738 }, { - "epoch": 2.6872727272727275, - "grad_norm": 3.9702067375183105, - "learning_rate": 5.272727272727273e-06, - "loss": 4.2622, + "epoch": 2.667870036101083, + "grad_norm": 3.9020423889160156, + "learning_rate": 5.595667870036101e-06, + "loss": 4.4485, "step": 739 }, { - "epoch": 2.690909090909091, - "grad_norm": 4.099255561828613, - "learning_rate": 5.212121212121213e-06, - "loss": 4.5067, + "epoch": 2.671480144404332, + "grad_norm": 3.6418099403381348, + "learning_rate": 5.535499398315283e-06, + "loss": 4.4073, "step": 740 }, { - "epoch": 2.6945454545454544, - "grad_norm": 4.346926212310791, - "learning_rate": 5.151515151515152e-06, - "loss": 4.585, + "epoch": 2.675090252707581, + "grad_norm": 5.49989128112793, + "learning_rate": 5.475330926594465e-06, + "loss": 4.3374, "step": 741 }, { - "epoch": 2.6981818181818182, - "grad_norm": 4.047079086303711, - "learning_rate": 5.090909090909091e-06, - "loss": 4.4456, + "epoch": 2.67870036101083, + "grad_norm": 3.2897582054138184, + "learning_rate": 5.4151624548736465e-06, + "loss": 4.3568, "step": 742 }, { - "epoch": 2.7018181818181817, - "grad_norm": 3.6299378871917725, - "learning_rate": 5.03030303030303e-06, - "loss": 4.2703, + "epoch": 2.6823104693140793, + "grad_norm": 5.068946838378906, + "learning_rate": 5.354993983152829e-06, + "loss": 4.5797, "step": 743 }, { - "epoch": 2.7054545454545456, - "grad_norm": 6.5678534507751465, - "learning_rate": 4.96969696969697e-06, - "loss": 4.6659, + "epoch": 2.6859205776173285, + "grad_norm": 3.4122793674468994, + "learning_rate": 5.29482551143201e-06, + "loss": 4.4701, "step": 744 }, { - "epoch": 2.709090909090909, - "grad_norm": 5.289844989776611, - "learning_rate": 4.909090909090909e-06, - "loss": 4.4673, + "epoch": 2.6895306859205776, + "grad_norm": 4.280747890472412, + "learning_rate": 5.2346570397111915e-06, + "loss": 4.5973, "step": 745 }, { - "epoch": 2.712727272727273, - "grad_norm": 6.299747467041016, - "learning_rate": 4.848484848484849e-06, - "loss": 4.6312, + "epoch": 2.693140794223827, + "grad_norm": 5.901773452758789, + "learning_rate": 5.174488567990373e-06, + "loss": 4.4262, "step": 746 }, { - "epoch": 2.7163636363636363, - "grad_norm": 3.0065627098083496, - "learning_rate": 4.787878787878789e-06, - "loss": 4.4757, + "epoch": 2.696750902527076, + "grad_norm": 10.026396751403809, + "learning_rate": 5.114320096269555e-06, + "loss": 4.3797, "step": 747 }, { - "epoch": 2.7199999999999998, - "grad_norm": 3.4834516048431396, - "learning_rate": 4.727272727272727e-06, - "loss": 4.3239, + "epoch": 2.700361010830325, + "grad_norm": 8.616659164428711, + "learning_rate": 5.054151624548736e-06, + "loss": 4.3961, "step": 748 }, { - "epoch": 2.7236363636363636, - "grad_norm": 4.76310396194458, - "learning_rate": 4.666666666666667e-06, - "loss": 4.3299, + "epoch": 2.7039711191335742, + "grad_norm": 6.453322887420654, + "learning_rate": 4.993983152827919e-06, + "loss": 4.515, "step": 749 }, { - "epoch": 2.7272727272727275, - "grad_norm": 2.904684543609619, - "learning_rate": 4.606060606060606e-06, - "loss": 4.4176, + "epoch": 2.707581227436823, + "grad_norm": 4.593882083892822, + "learning_rate": 4.9338146811071e-06, + "loss": 4.1834, "step": 750 }, { - "epoch": 2.730909090909091, - "grad_norm": 4.35284423828125, - "learning_rate": 4.5454545454545455e-06, - "loss": 4.5483, + "epoch": 2.711191335740072, + "grad_norm": 2.6859915256500244, + "learning_rate": 4.873646209386281e-06, + "loss": 4.2954, "step": 751 }, { - "epoch": 2.7345454545454544, - "grad_norm": 4.889273166656494, - "learning_rate": 4.484848484848485e-06, - "loss": 4.4221, + "epoch": 2.7148014440433212, + "grad_norm": 2.83162522315979, + "learning_rate": 4.813477737665464e-06, + "loss": 4.4438, "step": 752 }, { - "epoch": 2.7381818181818183, - "grad_norm": 3.620983600616455, - "learning_rate": 4.424242424242425e-06, - "loss": 4.4576, + "epoch": 2.7184115523465704, + "grad_norm": 4.4429402351379395, + "learning_rate": 4.753309265944645e-06, + "loss": 4.4136, "step": 753 }, { - "epoch": 2.7418181818181817, - "grad_norm": 2.826190233230591, - "learning_rate": 4.363636363636364e-06, - "loss": 4.5104, + "epoch": 2.7220216606498195, + "grad_norm": 6.28361177444458, + "learning_rate": 4.693140794223827e-06, + "loss": 4.3655, "step": 754 }, { - "epoch": 2.7454545454545456, - "grad_norm": 2.733161687850952, - "learning_rate": 4.303030303030303e-06, - "loss": 4.5294, + "epoch": 2.7256317689530687, + "grad_norm": 5.667455673217773, + "learning_rate": 4.6329723225030085e-06, + "loss": 4.3888, "step": 755 }, { - "epoch": 2.749090909090909, - "grad_norm": 2.110649347305298, - "learning_rate": 4.242424242424243e-06, - "loss": 4.3034, + "epoch": 2.729241877256318, + "grad_norm": 3.5756163597106934, + "learning_rate": 4.572803850782191e-06, + "loss": 4.3934, "step": 756 }, { - "epoch": 2.752727272727273, - "grad_norm": 3.5162851810455322, - "learning_rate": 4.181818181818182e-06, - "loss": 4.3989, + "epoch": 2.7328519855595665, + "grad_norm": 4.108674049377441, + "learning_rate": 4.512635379061372e-06, + "loss": 4.3017, "step": 757 }, { - "epoch": 2.7563636363636363, - "grad_norm": 4.206791400909424, - "learning_rate": 4.1212121212121215e-06, - "loss": 4.5553, + "epoch": 2.7364620938628157, + "grad_norm": 2.353041648864746, + "learning_rate": 4.4524669073405535e-06, + "loss": 4.3216, "step": 758 }, { - "epoch": 2.76, - "grad_norm": 4.430637836456299, - "learning_rate": 4.060606060606061e-06, - "loss": 4.3779, + "epoch": 2.740072202166065, + "grad_norm": 5.459471225738525, + "learning_rate": 4.392298435619736e-06, + "loss": 4.4489, "step": 759 }, { - "epoch": 2.7636363636363637, - "grad_norm": 2.928250551223755, - "learning_rate": 4.000000000000001e-06, - "loss": 4.5767, + "epoch": 2.743682310469314, + "grad_norm": 3.5886504650115967, + "learning_rate": 4.332129963898917e-06, + "loss": 4.4121, "step": 760 }, { - "epoch": 2.767272727272727, - "grad_norm": 5.950421333312988, - "learning_rate": 3.939393939393939e-06, - "loss": 4.5452, + "epoch": 2.747292418772563, + "grad_norm": 6.772552967071533, + "learning_rate": 4.271961492178099e-06, + "loss": 4.4531, "step": 761 }, { - "epoch": 2.770909090909091, - "grad_norm": 4.050290107727051, - "learning_rate": 3.878787878787879e-06, - "loss": 4.3672, + "epoch": 2.7509025270758123, + "grad_norm": 3.167595148086548, + "learning_rate": 4.211793020457281e-06, + "loss": 4.6096, "step": 762 }, { - "epoch": 2.7745454545454544, - "grad_norm": 4.645779132843018, - "learning_rate": 3.818181818181819e-06, - "loss": 4.5808, + "epoch": 2.7545126353790614, + "grad_norm": 4.510582447052002, + "learning_rate": 4.151624548736462e-06, + "loss": 4.4543, "step": 763 }, { - "epoch": 2.7781818181818183, - "grad_norm": 2.2464754581451416, - "learning_rate": 3.757575757575758e-06, - "loss": 4.3538, + "epoch": 2.7581227436823106, + "grad_norm": 3.3659489154815674, + "learning_rate": 4.091456077015644e-06, + "loss": 4.2664, "step": 764 }, { - "epoch": 2.7818181818181817, - "grad_norm": 2.465804100036621, - "learning_rate": 3.6969696969696974e-06, - "loss": 4.5622, + "epoch": 2.7617328519855597, + "grad_norm": 2.412745237350464, + "learning_rate": 4.0312876052948256e-06, + "loss": 4.2983, "step": 765 }, { - "epoch": 2.785454545454545, - "grad_norm": 3.098994731903076, - "learning_rate": 3.636363636363636e-06, - "loss": 4.379, + "epoch": 2.765342960288809, + "grad_norm": 4.937220573425293, + "learning_rate": 3.971119133574008e-06, + "loss": 4.5444, "step": 766 }, { - "epoch": 2.789090909090909, - "grad_norm": 4.835888385772705, - "learning_rate": 3.575757575757576e-06, - "loss": 4.535, + "epoch": 2.768953068592058, + "grad_norm": 5.845764636993408, + "learning_rate": 3.910950661853189e-06, + "loss": 4.7001, "step": 767 }, { - "epoch": 2.792727272727273, - "grad_norm": 4.923304557800293, - "learning_rate": 3.515151515151515e-06, - "loss": 4.4813, + "epoch": 2.7725631768953067, + "grad_norm": 3.4525749683380127, + "learning_rate": 3.850782190132371e-06, + "loss": 4.2573, "step": 768 }, { - "epoch": 2.7963636363636364, - "grad_norm": 5.952007293701172, - "learning_rate": 3.4545454545454545e-06, - "loss": 4.7257, + "epoch": 2.776173285198556, + "grad_norm": 4.180619716644287, + "learning_rate": 3.7906137184115523e-06, + "loss": 4.6078, "step": 769 }, { - "epoch": 2.8, - "grad_norm": 2.3326032161712646, - "learning_rate": 3.3939393939393946e-06, - "loss": 4.4968, + "epoch": 2.779783393501805, + "grad_norm": 2.957162857055664, + "learning_rate": 3.730445246690734e-06, + "loss": 4.576, "step": 770 }, { - "epoch": 2.8036363636363637, - "grad_norm": 5.354976177215576, - "learning_rate": 3.3333333333333333e-06, - "loss": 4.1594, + "epoch": 2.783393501805054, + "grad_norm": 3.526676654815674, + "learning_rate": 3.6702767749699155e-06, + "loss": 4.3551, "step": 771 }, { - "epoch": 2.807272727272727, - "grad_norm": 3.175776481628418, - "learning_rate": 3.2727272727272733e-06, - "loss": 4.406, + "epoch": 2.7870036101083033, + "grad_norm": 3.771137237548828, + "learning_rate": 3.6101083032490977e-06, + "loss": 4.5584, "step": 772 }, { - "epoch": 2.810909090909091, - "grad_norm": 6.175566673278809, - "learning_rate": 3.212121212121212e-06, - "loss": 4.3835, + "epoch": 2.7906137184115525, + "grad_norm": 5.286788463592529, + "learning_rate": 3.5499398315282795e-06, + "loss": 4.5008, "step": 773 }, { - "epoch": 2.8145454545454545, - "grad_norm": 4.814353942871094, - "learning_rate": 3.1515151515151517e-06, - "loss": 4.3803, + "epoch": 2.794223826714801, + "grad_norm": 3.8384170532226562, + "learning_rate": 3.489771359807461e-06, + "loss": 4.6235, "step": 774 }, { - "epoch": 2.8181818181818183, - "grad_norm": 2.532165288925171, - "learning_rate": 3.0909090909090913e-06, - "loss": 4.6085, + "epoch": 2.7978339350180503, + "grad_norm": 5.631938457489014, + "learning_rate": 3.429602888086643e-06, + "loss": 4.7163, "step": 775 }, { - "epoch": 2.821818181818182, - "grad_norm": 3.019963264465332, - "learning_rate": 3.0303030303030305e-06, - "loss": 4.4147, + "epoch": 2.8014440433212995, + "grad_norm": 3.3230323791503906, + "learning_rate": 3.3694344163658244e-06, + "loss": 4.3598, "step": 776 }, { - "epoch": 2.825454545454545, - "grad_norm": 5.928140640258789, - "learning_rate": 2.96969696969697e-06, - "loss": 4.5954, + "epoch": 2.8050541516245486, + "grad_norm": 3.211805582046509, + "learning_rate": 3.309265944645006e-06, + "loss": 4.5694, "step": 777 }, { - "epoch": 2.829090909090909, - "grad_norm": 4.487292289733887, - "learning_rate": 2.9090909090909093e-06, - "loss": 4.5333, + "epoch": 2.808664259927798, + "grad_norm": 4.486380577087402, + "learning_rate": 3.2490974729241876e-06, + "loss": 4.5402, "step": 778 }, { - "epoch": 2.832727272727273, - "grad_norm": 7.2252349853515625, - "learning_rate": 2.8484848484848484e-06, - "loss": 4.3552, + "epoch": 2.812274368231047, + "grad_norm": 3.37566876411438, + "learning_rate": 3.1889290012033694e-06, + "loss": 4.3392, "step": 779 }, { - "epoch": 2.8363636363636364, - "grad_norm": 2.668553113937378, - "learning_rate": 2.787878787878788e-06, - "loss": 4.5654, + "epoch": 2.815884476534296, + "grad_norm": 3.2982022762298584, + "learning_rate": 3.1287605294825516e-06, + "loss": 4.2883, "step": 780 }, { - "epoch": 2.84, - "grad_norm": 2.857837438583374, - "learning_rate": 2.7272727272727272e-06, - "loss": 4.5263, + "epoch": 2.8194945848375452, + "grad_norm": 3.964442253112793, + "learning_rate": 3.068592057761733e-06, + "loss": 4.4602, "step": 781 }, { - "epoch": 2.8436363636363637, - "grad_norm": 2.993135929107666, - "learning_rate": 2.666666666666667e-06, - "loss": 4.6839, + "epoch": 2.8231046931407944, + "grad_norm": 2.6601409912109375, + "learning_rate": 3.0084235860409147e-06, + "loss": 4.5541, "step": 782 }, { - "epoch": 2.847272727272727, - "grad_norm": 5.227684020996094, - "learning_rate": 2.6060606060606064e-06, - "loss": 4.5512, + "epoch": 2.8267148014440435, + "grad_norm": 2.461411237716675, + "learning_rate": 2.9482551143200965e-06, + "loss": 4.2491, "step": 783 }, { - "epoch": 2.850909090909091, - "grad_norm": 3.6668307781219482, - "learning_rate": 2.5454545454545456e-06, - "loss": 4.4039, + "epoch": 2.8303249097472927, + "grad_norm": 3.935789108276367, + "learning_rate": 2.8880866425992783e-06, + "loss": 4.7376, "step": 784 }, { - "epoch": 2.8545454545454545, - "grad_norm": 2.9002740383148193, - "learning_rate": 2.484848484848485e-06, - "loss": 4.4902, + "epoch": 2.8339350180505414, + "grad_norm": 3.178452491760254, + "learning_rate": 2.8279181708784597e-06, + "loss": 4.2914, "step": 785 }, { - "epoch": 2.8581818181818184, - "grad_norm": 2.6871190071105957, - "learning_rate": 2.4242424242424244e-06, - "loss": 4.4637, + "epoch": 2.8375451263537905, + "grad_norm": 2.401991128921509, + "learning_rate": 2.7677496991576415e-06, + "loss": 4.3846, "step": 786 }, { - "epoch": 2.861818181818182, - "grad_norm": 2.605130672454834, - "learning_rate": 2.3636363636363636e-06, - "loss": 4.3348, + "epoch": 2.8411552346570397, + "grad_norm": 2.8064956665039062, + "learning_rate": 2.7075812274368233e-06, + "loss": 4.5398, "step": 787 }, { - "epoch": 2.8654545454545453, - "grad_norm": 2.137845754623413, - "learning_rate": 2.303030303030303e-06, - "loss": 4.3367, + "epoch": 2.844765342960289, + "grad_norm": 3.338486433029175, + "learning_rate": 2.647412755716005e-06, + "loss": 4.5519, "step": 788 }, { - "epoch": 2.869090909090909, - "grad_norm": 4.47348165512085, - "learning_rate": 2.2424242424242423e-06, - "loss": 4.3623, + "epoch": 2.848375451263538, + "grad_norm": 1.800193190574646, + "learning_rate": 2.5872442839951864e-06, + "loss": 4.4321, "step": 789 }, { - "epoch": 2.8727272727272726, - "grad_norm": 2.943335771560669, - "learning_rate": 2.181818181818182e-06, - "loss": 4.3887, + "epoch": 2.851985559566787, + "grad_norm": 2.2621030807495117, + "learning_rate": 2.527075812274368e-06, + "loss": 4.5175, "step": 790 }, { - "epoch": 2.8763636363636365, - "grad_norm": 3.716531753540039, - "learning_rate": 2.1212121212121216e-06, - "loss": 4.3412, + "epoch": 2.855595667870036, + "grad_norm": 3.4302749633789062, + "learning_rate": 2.46690734055355e-06, + "loss": 4.4695, "step": 791 }, { - "epoch": 2.88, - "grad_norm": 2.894035577774048, - "learning_rate": 2.0606060606060607e-06, - "loss": 4.3741, + "epoch": 2.859205776173285, + "grad_norm": 5.186544418334961, + "learning_rate": 2.406738868832732e-06, + "loss": 4.3755, "step": 792 }, { - "epoch": 2.8836363636363638, - "grad_norm": 3.7420289516448975, - "learning_rate": 2.0000000000000003e-06, - "loss": 4.5828, + "epoch": 2.862815884476534, + "grad_norm": 2.793041706085205, + "learning_rate": 2.3465703971119136e-06, + "loss": 4.2424, "step": 793 }, { - "epoch": 2.887272727272727, - "grad_norm": 2.9738407135009766, - "learning_rate": 1.9393939393939395e-06, - "loss": 4.4787, + "epoch": 2.8664259927797833, + "grad_norm": 2.6105570793151855, + "learning_rate": 2.2864019253910954e-06, + "loss": 4.441, "step": 794 }, { - "epoch": 2.8909090909090907, - "grad_norm": 2.441652297973633, - "learning_rate": 1.878787878787879e-06, - "loss": 4.51, + "epoch": 2.8700361010830324, + "grad_norm": 2.990581750869751, + "learning_rate": 2.2262334536702767e-06, + "loss": 4.4562, "step": 795 }, { - "epoch": 2.8945454545454545, - "grad_norm": 4.284375190734863, - "learning_rate": 1.818181818181818e-06, - "loss": 4.2759, + "epoch": 2.8736462093862816, + "grad_norm": 2.425126791000366, + "learning_rate": 2.1660649819494585e-06, + "loss": 4.3576, "step": 796 }, { - "epoch": 2.8981818181818184, - "grad_norm": 5.655579090118408, - "learning_rate": 1.7575757575757575e-06, - "loss": 4.4516, + "epoch": 2.8772563176895307, + "grad_norm": 3.0731112957000732, + "learning_rate": 2.1058965102286403e-06, + "loss": 4.4477, "step": 797 }, { - "epoch": 2.901818181818182, - "grad_norm": 3.4644057750701904, - "learning_rate": 1.6969696969696973e-06, - "loss": 4.3542, + "epoch": 2.88086642599278, + "grad_norm": 2.6812827587127686, + "learning_rate": 2.045728038507822e-06, + "loss": 4.536, "step": 798 }, { - "epoch": 2.9054545454545453, - "grad_norm": 2.9349281787872314, - "learning_rate": 1.6363636363636367e-06, - "loss": 4.4623, + "epoch": 2.884476534296029, + "grad_norm": 2.6943960189819336, + "learning_rate": 1.985559566787004e-06, + "loss": 4.4028, "step": 799 }, { - "epoch": 2.909090909090909, - "grad_norm": 2.0899038314819336, - "learning_rate": 1.5757575757575759e-06, - "loss": 4.3693, + "epoch": 2.888086642599278, + "grad_norm": 2.157318592071533, + "learning_rate": 1.9253910950661857e-06, + "loss": 4.4981, "step": 800 }, { - "epoch": 2.9127272727272726, - "grad_norm": 3.156378746032715, - "learning_rate": 1.5151515151515152e-06, - "loss": 4.4346, + "epoch": 2.8916967509025273, + "grad_norm": 3.369413375854492, + "learning_rate": 1.865222623345367e-06, + "loss": 4.3188, "step": 801 }, { - "epoch": 2.9163636363636365, - "grad_norm": 1.8763753175735474, - "learning_rate": 1.4545454545454546e-06, - "loss": 4.3416, + "epoch": 2.895306859205776, + "grad_norm": 2.947744607925415, + "learning_rate": 1.8050541516245488e-06, + "loss": 4.3764, "step": 802 }, { - "epoch": 2.92, - "grad_norm": 2.1865854263305664, - "learning_rate": 1.393939393939394e-06, - "loss": 4.4388, + "epoch": 2.898916967509025, + "grad_norm": 2.9001619815826416, + "learning_rate": 1.7448856799037304e-06, + "loss": 4.3952, "step": 803 }, { - "epoch": 2.923636363636364, - "grad_norm": 2.035835027694702, - "learning_rate": 1.3333333333333334e-06, - "loss": 4.3105, + "epoch": 2.9025270758122743, + "grad_norm": 3.887981653213501, + "learning_rate": 1.6847172081829122e-06, + "loss": 4.4897, "step": 804 }, { - "epoch": 2.9272727272727272, - "grad_norm": 2.976594924926758, - "learning_rate": 1.2727272727272728e-06, - "loss": 4.4205, + "epoch": 2.9061371841155235, + "grad_norm": 2.611881732940674, + "learning_rate": 1.6245487364620938e-06, + "loss": 4.4093, "step": 805 }, { - "epoch": 2.9309090909090907, - "grad_norm": 4.820901393890381, - "learning_rate": 1.2121212121212122e-06, - "loss": 4.2923, + "epoch": 2.9097472924187726, + "grad_norm": 3.8872454166412354, + "learning_rate": 1.5643802647412758e-06, + "loss": 4.3923, "step": 806 }, { - "epoch": 2.9345454545454546, - "grad_norm": 3.00866436958313, - "learning_rate": 1.1515151515151516e-06, - "loss": 4.5881, + "epoch": 2.9133574007220218, + "grad_norm": 2.8728034496307373, + "learning_rate": 1.5042117930204574e-06, + "loss": 4.4359, "step": 807 }, { - "epoch": 2.9381818181818184, - "grad_norm": 2.354827880859375, - "learning_rate": 1.090909090909091e-06, - "loss": 4.2351, + "epoch": 2.916967509025271, + "grad_norm": 1.9554171562194824, + "learning_rate": 1.4440433212996392e-06, + "loss": 4.2807, "step": 808 }, { - "epoch": 2.941818181818182, - "grad_norm": 2.7474520206451416, - "learning_rate": 1.0303030303030304e-06, - "loss": 4.4496, + "epoch": 2.9205776173285196, + "grad_norm": 2.1981494426727295, + "learning_rate": 1.3838748495788207e-06, + "loss": 4.4444, "step": 809 }, { - "epoch": 2.9454545454545453, - "grad_norm": 3.4006338119506836, - "learning_rate": 9.696969696969698e-07, - "loss": 4.5303, + "epoch": 2.9241877256317688, + "grad_norm": 2.9973559379577637, + "learning_rate": 1.3237063778580025e-06, + "loss": 4.4673, "step": 810 }, { - "epoch": 2.949090909090909, - "grad_norm": 2.1111907958984375, - "learning_rate": 9.09090909090909e-07, - "loss": 4.3686, + "epoch": 2.927797833935018, + "grad_norm": 2.8037450313568115, + "learning_rate": 1.263537906137184e-06, + "loss": 4.5387, "step": 811 }, { - "epoch": 2.9527272727272726, - "grad_norm": 2.6187009811401367, - "learning_rate": 8.484848484848486e-07, - "loss": 4.236, + "epoch": 2.931407942238267, + "grad_norm": 1.915396809577942, + "learning_rate": 1.203369434416366e-06, + "loss": 4.3888, "step": 812 }, { - "epoch": 2.9563636363636365, - "grad_norm": 7.847300052642822, - "learning_rate": 7.878787878787879e-07, - "loss": 4.3803, + "epoch": 2.935018050541516, + "grad_norm": 4.414287090301514, + "learning_rate": 1.1432009626955477e-06, + "loss": 4.3828, "step": 813 }, { - "epoch": 2.96, - "grad_norm": 2.505014419555664, - "learning_rate": 7.272727272727273e-07, - "loss": 4.4579, + "epoch": 2.9386281588447654, + "grad_norm": 2.6727025508880615, + "learning_rate": 1.0830324909747293e-06, + "loss": 4.4088, "step": 814 }, { - "epoch": 2.963636363636364, - "grad_norm": 2.3602616786956787, - "learning_rate": 6.666666666666667e-07, - "loss": 4.3183, + "epoch": 2.9422382671480145, + "grad_norm": 4.590742588043213, + "learning_rate": 1.022864019253911e-06, + "loss": 4.3833, "step": 815 }, { - "epoch": 2.9672727272727273, - "grad_norm": 2.405813694000244, - "learning_rate": 6.060606060606061e-07, - "loss": 4.3485, + "epoch": 2.9458483754512637, + "grad_norm": 4.188723087310791, + "learning_rate": 9.626955475330928e-07, + "loss": 4.2543, "step": 816 }, { - "epoch": 2.9709090909090907, - "grad_norm": 3.003232717514038, - "learning_rate": 5.454545454545455e-07, - "loss": 4.5807, + "epoch": 2.949458483754513, + "grad_norm": 1.739635944366455, + "learning_rate": 9.025270758122744e-07, + "loss": 4.2869, "step": 817 }, { - "epoch": 2.9745454545454546, - "grad_norm": 2.4426870346069336, - "learning_rate": 4.848484848484849e-07, - "loss": 4.3509, + "epoch": 2.953068592057762, + "grad_norm": 2.8158509731292725, + "learning_rate": 8.423586040914561e-07, + "loss": 4.5182, "step": 818 }, { - "epoch": 2.978181818181818, - "grad_norm": 3.0719637870788574, - "learning_rate": 4.242424242424243e-07, - "loss": 4.3542, + "epoch": 2.956678700361011, + "grad_norm": 3.629409074783325, + "learning_rate": 7.821901323706379e-07, + "loss": 4.6912, "step": 819 }, { - "epoch": 2.981818181818182, - "grad_norm": 2.4720072746276855, - "learning_rate": 3.6363636363636366e-07, - "loss": 4.4289, + "epoch": 2.96028880866426, + "grad_norm": 2.2690486907958984, + "learning_rate": 7.220216606498196e-07, + "loss": 4.55, "step": 820 }, { - "epoch": 2.9854545454545454, - "grad_norm": 3.5286593437194824, - "learning_rate": 3.0303030303030305e-07, - "loss": 4.6304, + "epoch": 2.963898916967509, + "grad_norm": 3.6004555225372314, + "learning_rate": 6.618531889290013e-07, + "loss": 4.6055, "step": 821 }, { - "epoch": 2.9890909090909092, - "grad_norm": 2.771474838256836, - "learning_rate": 2.4242424242424244e-07, - "loss": 4.4305, + "epoch": 2.967509025270758, + "grad_norm": 3.75754451751709, + "learning_rate": 6.01684717208183e-07, + "loss": 4.4194, "step": 822 }, { - "epoch": 2.9927272727272727, - "grad_norm": 1.9957259893417358, - "learning_rate": 1.8181818181818183e-07, - "loss": 4.3848, + "epoch": 2.9711191335740073, + "grad_norm": 5.5864691734313965, + "learning_rate": 5.415162454873646e-07, + "loss": 4.3831, "step": 823 }, { - "epoch": 2.996363636363636, - "grad_norm": 3.018747568130493, - "learning_rate": 1.2121212121212122e-07, - "loss": 4.3957, + "epoch": 2.9747292418772564, + "grad_norm": 2.2573602199554443, + "learning_rate": 4.813477737665464e-07, + "loss": 4.2861, "step": 824 }, { - "epoch": 3.0, - "grad_norm": 5.8460917472839355, - "learning_rate": 6.060606060606061e-08, - "loss": 4.7331, + "epoch": 2.9783393501805056, + "grad_norm": 2.9080207347869873, + "learning_rate": 4.2117930204572805e-07, + "loss": 4.6265, "step": 825 + }, + { + "epoch": 2.9819494584837543, + "grad_norm": 1.611305594444275, + "learning_rate": 3.610108303249098e-07, + "loss": 4.3987, + "step": 826 + }, + { + "epoch": 2.9855595667870034, + "grad_norm": 4.546231269836426, + "learning_rate": 3.008423586040915e-07, + "loss": 4.3495, + "step": 827 + }, + { + "epoch": 2.9891696750902526, + "grad_norm": 1.9480468034744263, + "learning_rate": 2.406738868832732e-07, + "loss": 4.3835, + "step": 828 + }, + { + "epoch": 2.9927797833935017, + "grad_norm": 2.429931879043579, + "learning_rate": 1.805054151624549e-07, + "loss": 4.3572, + "step": 829 + }, + { + "epoch": 2.996389891696751, + "grad_norm": 3.491809844970703, + "learning_rate": 1.203369434416366e-07, + "loss": 4.3179, + "step": 830 + }, + { + "epoch": 3.0, + "grad_norm": 2.282447576522827, + "learning_rate": 6.01684717208183e-08, + "loss": 4.4063, + "step": 831 } ], "logging_steps": 1, - "max_steps": 825, + "max_steps": 831, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, @@ -5802,7 +5844,7 @@ "attributes": {} } }, - "total_flos": 1.0032173593657344e+16, + "total_flos": 1.0123541294874624e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null