| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 5871, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005109862033725089, |
| "grad_norm": 0.33659857760959316, |
| "learning_rate": 5e-06, |
| "loss": 1.2266, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.010219724067450179, |
| "grad_norm": 0.27906848020280245, |
| "learning_rate": 5e-06, |
| "loss": 1.1927, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.015329586101175269, |
| "grad_norm": 0.17780612512031585, |
| "learning_rate": 5e-06, |
| "loss": 1.1325, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.020439448134900357, |
| "grad_norm": 0.1258477993946921, |
| "learning_rate": 5e-06, |
| "loss": 1.0755, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.025549310168625446, |
| "grad_norm": 0.10850896052962487, |
| "learning_rate": 5e-06, |
| "loss": 1.0559, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.030659172202350538, |
| "grad_norm": 0.1008214475411604, |
| "learning_rate": 5e-06, |
| "loss": 1.0381, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.03576903423607562, |
| "grad_norm": 0.09230130516181131, |
| "learning_rate": 5e-06, |
| "loss": 1.0359, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.040878896269800714, |
| "grad_norm": 0.12761405349230925, |
| "learning_rate": 5e-06, |
| "loss": 1.02, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.045988758303525806, |
| "grad_norm": 0.09657363359659779, |
| "learning_rate": 5e-06, |
| "loss": 1.0108, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.05109862033725089, |
| "grad_norm": 0.07985711883606804, |
| "learning_rate": 5e-06, |
| "loss": 1.0112, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.05620848237097598, |
| "grad_norm": 0.3841577988890222, |
| "learning_rate": 5e-06, |
| "loss": 1.004, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.061318344404701075, |
| "grad_norm": 0.07880791404523275, |
| "learning_rate": 5e-06, |
| "loss": 1.0005, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.06642820643842616, |
| "grad_norm": 0.07684646209763941, |
| "learning_rate": 5e-06, |
| "loss": 0.9969, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.07153806847215124, |
| "grad_norm": 0.07364942700770905, |
| "learning_rate": 5e-06, |
| "loss": 0.9817, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.07664793050587634, |
| "grad_norm": 0.07307840824030326, |
| "learning_rate": 5e-06, |
| "loss": 1.0017, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.08175779253960143, |
| "grad_norm": 0.07899180708739008, |
| "learning_rate": 5e-06, |
| "loss": 1.0064, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.08686765457332651, |
| "grad_norm": 0.13990750744492506, |
| "learning_rate": 5e-06, |
| "loss": 0.9664, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.09197751660705161, |
| "grad_norm": 0.07623610966974216, |
| "learning_rate": 5e-06, |
| "loss": 0.993, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.0970873786407767, |
| "grad_norm": 0.07177184337925457, |
| "learning_rate": 5e-06, |
| "loss": 0.9802, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.10219724067450178, |
| "grad_norm": 0.07599595168859198, |
| "learning_rate": 5e-06, |
| "loss": 0.9821, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.10730710270822688, |
| "grad_norm": 0.08156619926999734, |
| "learning_rate": 5e-06, |
| "loss": 0.9848, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.11241696474195197, |
| "grad_norm": 0.08647559353210667, |
| "learning_rate": 5e-06, |
| "loss": 0.9669, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.11752682677567705, |
| "grad_norm": 0.09175368496172068, |
| "learning_rate": 5e-06, |
| "loss": 0.9768, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.12263668880940215, |
| "grad_norm": 0.07679212882863759, |
| "learning_rate": 5e-06, |
| "loss": 0.9785, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.12774655084312725, |
| "grad_norm": 0.0876157967492165, |
| "learning_rate": 5e-06, |
| "loss": 0.97, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.13285641287685232, |
| "grad_norm": 0.07438658820336003, |
| "learning_rate": 5e-06, |
| "loss": 0.9642, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.13796627491057742, |
| "grad_norm": 0.11327800060102156, |
| "learning_rate": 5e-06, |
| "loss": 0.9566, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.1430761369443025, |
| "grad_norm": 0.07300739672589697, |
| "learning_rate": 5e-06, |
| "loss": 0.9641, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.1481859989780276, |
| "grad_norm": 0.0706224800129807, |
| "learning_rate": 5e-06, |
| "loss": 0.9608, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.1532958610117527, |
| "grad_norm": 0.10268101288716767, |
| "learning_rate": 5e-06, |
| "loss": 0.9594, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.15840572304547776, |
| "grad_norm": 0.10115126588406155, |
| "learning_rate": 5e-06, |
| "loss": 0.9497, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.16351558507920286, |
| "grad_norm": 0.07906545142322015, |
| "learning_rate": 5e-06, |
| "loss": 0.9708, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.16862544711292796, |
| "grad_norm": 0.08205445152944524, |
| "learning_rate": 5e-06, |
| "loss": 0.9578, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.17373530914665303, |
| "grad_norm": 0.08422630569633463, |
| "learning_rate": 5e-06, |
| "loss": 0.9596, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.17884517118037813, |
| "grad_norm": 0.08374924134366432, |
| "learning_rate": 5e-06, |
| "loss": 0.9538, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.18395503321410323, |
| "grad_norm": 0.07381594898486545, |
| "learning_rate": 5e-06, |
| "loss": 0.9632, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.1890648952478283, |
| "grad_norm": 0.0742024858441321, |
| "learning_rate": 5e-06, |
| "loss": 0.9574, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.1941747572815534, |
| "grad_norm": 0.07635133337670513, |
| "learning_rate": 5e-06, |
| "loss": 0.9642, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.1992846193152785, |
| "grad_norm": 0.07547026097910627, |
| "learning_rate": 5e-06, |
| "loss": 0.9559, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.20439448134900357, |
| "grad_norm": 0.08098346259777026, |
| "learning_rate": 5e-06, |
| "loss": 0.9608, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.20950434338272866, |
| "grad_norm": 0.0740110860457063, |
| "learning_rate": 5e-06, |
| "loss": 0.9626, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.21461420541645376, |
| "grad_norm": 0.06838098261628725, |
| "learning_rate": 5e-06, |
| "loss": 0.956, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.21972406745017883, |
| "grad_norm": 0.07627481328859255, |
| "learning_rate": 5e-06, |
| "loss": 0.947, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.22483392948390393, |
| "grad_norm": 0.07121602448305268, |
| "learning_rate": 5e-06, |
| "loss": 0.936, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.22994379151762903, |
| "grad_norm": 0.07130096221687535, |
| "learning_rate": 5e-06, |
| "loss": 0.9489, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.2350536535513541, |
| "grad_norm": 0.0705427638202678, |
| "learning_rate": 5e-06, |
| "loss": 0.9488, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.2401635155850792, |
| "grad_norm": 0.07263405103965026, |
| "learning_rate": 5e-06, |
| "loss": 0.9577, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.2452733776188043, |
| "grad_norm": 0.11012974887960399, |
| "learning_rate": 5e-06, |
| "loss": 0.943, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.2503832396525294, |
| "grad_norm": 0.10355809585077033, |
| "learning_rate": 5e-06, |
| "loss": 0.939, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.2554931016862545, |
| "grad_norm": 0.0793194619120587, |
| "learning_rate": 5e-06, |
| "loss": 0.9461, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.26060296371997954, |
| "grad_norm": 0.07305857929904291, |
| "learning_rate": 5e-06, |
| "loss": 0.94, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.26571282575370464, |
| "grad_norm": 0.07524998598879123, |
| "learning_rate": 5e-06, |
| "loss": 0.9346, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.27082268778742974, |
| "grad_norm": 0.07947372908760092, |
| "learning_rate": 5e-06, |
| "loss": 0.9469, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.27593254982115484, |
| "grad_norm": 0.1241601221912902, |
| "learning_rate": 5e-06, |
| "loss": 0.9439, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.28104241185487994, |
| "grad_norm": 0.09428433932371161, |
| "learning_rate": 5e-06, |
| "loss": 0.9364, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.286152273888605, |
| "grad_norm": 0.07643085121646655, |
| "learning_rate": 5e-06, |
| "loss": 0.9362, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.2912621359223301, |
| "grad_norm": 0.07053914919464828, |
| "learning_rate": 5e-06, |
| "loss": 0.9434, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.2963719979560552, |
| "grad_norm": 0.07770599024749027, |
| "learning_rate": 5e-06, |
| "loss": 0.9339, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.3014818599897803, |
| "grad_norm": 0.07178293782557839, |
| "learning_rate": 5e-06, |
| "loss": 0.9375, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.3065917220235054, |
| "grad_norm": 0.08584366089279562, |
| "learning_rate": 5e-06, |
| "loss": 0.9371, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.3117015840572305, |
| "grad_norm": 0.07944827441074408, |
| "learning_rate": 5e-06, |
| "loss": 0.9455, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.3168114460909555, |
| "grad_norm": 0.07201506395499371, |
| "learning_rate": 5e-06, |
| "loss": 0.9468, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.3219213081246806, |
| "grad_norm": 0.06802889338099032, |
| "learning_rate": 5e-06, |
| "loss": 0.9216, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.3270311701584057, |
| "grad_norm": 0.07330902217526364, |
| "learning_rate": 5e-06, |
| "loss": 0.9236, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.3321410321921308, |
| "grad_norm": 0.07252233851918731, |
| "learning_rate": 5e-06, |
| "loss": 0.9389, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.3372508942258559, |
| "grad_norm": 0.07496139269293961, |
| "learning_rate": 5e-06, |
| "loss": 0.9237, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.342360756259581, |
| "grad_norm": 0.07750656954616246, |
| "learning_rate": 5e-06, |
| "loss": 0.9369, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.34747061829330605, |
| "grad_norm": 0.07099579035430438, |
| "learning_rate": 5e-06, |
| "loss": 0.9305, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.35258048032703115, |
| "grad_norm": 0.07147696465885814, |
| "learning_rate": 5e-06, |
| "loss": 0.9278, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.35769034236075625, |
| "grad_norm": 0.06679094313753112, |
| "learning_rate": 5e-06, |
| "loss": 0.9209, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.36280020439448135, |
| "grad_norm": 0.0741205857711921, |
| "learning_rate": 5e-06, |
| "loss": 0.9239, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.36791006642820645, |
| "grad_norm": 0.07314565717664778, |
| "learning_rate": 5e-06, |
| "loss": 0.9228, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.37301992846193155, |
| "grad_norm": 0.06970796428185207, |
| "learning_rate": 5e-06, |
| "loss": 0.9261, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.3781297904956566, |
| "grad_norm": 0.08781289255732734, |
| "learning_rate": 5e-06, |
| "loss": 0.9184, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.3832396525293817, |
| "grad_norm": 0.0733298775244439, |
| "learning_rate": 5e-06, |
| "loss": 0.9239, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.3883495145631068, |
| "grad_norm": 0.07059494784250205, |
| "learning_rate": 5e-06, |
| "loss": 0.9256, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.3934593765968319, |
| "grad_norm": 0.07435764722394751, |
| "learning_rate": 5e-06, |
| "loss": 0.9317, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.398569238630557, |
| "grad_norm": 0.08580109544056579, |
| "learning_rate": 5e-06, |
| "loss": 0.915, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.4036791006642821, |
| "grad_norm": 0.13857712945248865, |
| "learning_rate": 5e-06, |
| "loss": 0.9266, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.40878896269800713, |
| "grad_norm": 0.09252960975773743, |
| "learning_rate": 5e-06, |
| "loss": 0.9292, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.41389882473173223, |
| "grad_norm": 0.0786939477268516, |
| "learning_rate": 5e-06, |
| "loss": 0.9229, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.4190086867654573, |
| "grad_norm": 0.07870146545651402, |
| "learning_rate": 5e-06, |
| "loss": 0.9222, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.4241185487991824, |
| "grad_norm": 0.22636518086425345, |
| "learning_rate": 5e-06, |
| "loss": 0.9198, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.4292284108329075, |
| "grad_norm": 0.07019267445206644, |
| "learning_rate": 5e-06, |
| "loss": 0.9175, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.4343382728666326, |
| "grad_norm": 0.1807350892295952, |
| "learning_rate": 5e-06, |
| "loss": 0.9196, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.43944813490035767, |
| "grad_norm": 0.0725849350335069, |
| "learning_rate": 5e-06, |
| "loss": 0.9125, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.44455799693408277, |
| "grad_norm": 0.07593721579585741, |
| "learning_rate": 5e-06, |
| "loss": 0.9281, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.44966785896780787, |
| "grad_norm": 0.07560334097743046, |
| "learning_rate": 5e-06, |
| "loss": 0.9135, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.45477772100153296, |
| "grad_norm": 0.06971504265848284, |
| "learning_rate": 5e-06, |
| "loss": 0.9237, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.45988758303525806, |
| "grad_norm": 0.07156759501755959, |
| "learning_rate": 5e-06, |
| "loss": 0.9138, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.46499744506898316, |
| "grad_norm": 0.07895160546618792, |
| "learning_rate": 5e-06, |
| "loss": 0.921, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.4701073071027082, |
| "grad_norm": 0.07451564044792575, |
| "learning_rate": 5e-06, |
| "loss": 0.9199, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.4752171691364333, |
| "grad_norm": 0.10378845787672986, |
| "learning_rate": 5e-06, |
| "loss": 0.9265, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.4803270311701584, |
| "grad_norm": 0.0667143142951708, |
| "learning_rate": 5e-06, |
| "loss": 0.9177, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.4854368932038835, |
| "grad_norm": 0.1158465430965177, |
| "learning_rate": 5e-06, |
| "loss": 0.9226, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.4905467552376086, |
| "grad_norm": 0.07122640133664004, |
| "learning_rate": 5e-06, |
| "loss": 0.9264, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.4956566172713337, |
| "grad_norm": 0.07221992051194193, |
| "learning_rate": 5e-06, |
| "loss": 0.9131, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.5007664793050588, |
| "grad_norm": 0.07924509176000209, |
| "learning_rate": 5e-06, |
| "loss": 0.9125, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.5058763413387839, |
| "grad_norm": 0.07672190213186483, |
| "learning_rate": 5e-06, |
| "loss": 0.9122, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.510986203372509, |
| "grad_norm": 0.07692062751941903, |
| "learning_rate": 5e-06, |
| "loss": 0.929, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.516096065406234, |
| "grad_norm": 0.11741801773911116, |
| "learning_rate": 5e-06, |
| "loss": 0.9159, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.5212059274399591, |
| "grad_norm": 0.07370462072026876, |
| "learning_rate": 5e-06, |
| "loss": 0.9297, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.5263157894736842, |
| "grad_norm": 0.07324740223664235, |
| "learning_rate": 5e-06, |
| "loss": 0.9102, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.5314256515074093, |
| "grad_norm": 0.06719441090965804, |
| "learning_rate": 5e-06, |
| "loss": 0.9095, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.5365355135411344, |
| "grad_norm": 0.0752101579557485, |
| "learning_rate": 5e-06, |
| "loss": 0.9118, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.5416453755748595, |
| "grad_norm": 0.07316438131338852, |
| "learning_rate": 5e-06, |
| "loss": 0.909, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.5467552376085846, |
| "grad_norm": 0.07217762056098373, |
| "learning_rate": 5e-06, |
| "loss": 0.9045, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.5518650996423097, |
| "grad_norm": 0.06882269438361814, |
| "learning_rate": 5e-06, |
| "loss": 0.9241, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.5569749616760348, |
| "grad_norm": 0.18147585225890778, |
| "learning_rate": 5e-06, |
| "loss": 0.9167, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.5620848237097599, |
| "grad_norm": 0.07945137709096421, |
| "learning_rate": 5e-06, |
| "loss": 0.9058, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.567194685743485, |
| "grad_norm": 0.06985941949554184, |
| "learning_rate": 5e-06, |
| "loss": 0.9159, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.57230454777721, |
| "grad_norm": 0.07626155133966736, |
| "learning_rate": 5e-06, |
| "loss": 0.9152, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.5774144098109351, |
| "grad_norm": 0.07367359682427968, |
| "learning_rate": 5e-06, |
| "loss": 0.9033, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.5825242718446602, |
| "grad_norm": 0.07246294215791606, |
| "learning_rate": 5e-06, |
| "loss": 0.9056, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.5876341338783853, |
| "grad_norm": 0.06636935667746374, |
| "learning_rate": 5e-06, |
| "loss": 0.9096, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.5927439959121104, |
| "grad_norm": 0.06914204310614962, |
| "learning_rate": 5e-06, |
| "loss": 0.9124, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.5978538579458355, |
| "grad_norm": 0.06622472549827609, |
| "learning_rate": 5e-06, |
| "loss": 0.9042, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.6029637199795606, |
| "grad_norm": 0.07526422643221314, |
| "learning_rate": 5e-06, |
| "loss": 0.9099, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.6080735820132857, |
| "grad_norm": 0.07865312982736179, |
| "learning_rate": 5e-06, |
| "loss": 0.9121, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.6131834440470108, |
| "grad_norm": 0.08100630197435015, |
| "learning_rate": 5e-06, |
| "loss": 0.9108, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.6182933060807358, |
| "grad_norm": 0.14439251719042823, |
| "learning_rate": 5e-06, |
| "loss": 0.9085, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.623403168114461, |
| "grad_norm": 0.07351842090072493, |
| "learning_rate": 5e-06, |
| "loss": 0.9102, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.628513030148186, |
| "grad_norm": 0.0710785086017881, |
| "learning_rate": 5e-06, |
| "loss": 0.9137, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.633622892181911, |
| "grad_norm": 0.07184247120200028, |
| "learning_rate": 5e-06, |
| "loss": 0.8997, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.6387327542156361, |
| "grad_norm": 0.07131702245604768, |
| "learning_rate": 5e-06, |
| "loss": 0.9087, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.6438426162493612, |
| "grad_norm": 0.0730402054534026, |
| "learning_rate": 5e-06, |
| "loss": 0.894, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.6489524782830863, |
| "grad_norm": 0.07322424351281255, |
| "learning_rate": 5e-06, |
| "loss": 0.9127, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.6540623403168114, |
| "grad_norm": 0.09436026180688377, |
| "learning_rate": 5e-06, |
| "loss": 0.919, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.6591722023505365, |
| "grad_norm": 0.07076855510548616, |
| "learning_rate": 5e-06, |
| "loss": 0.9044, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.6642820643842616, |
| "grad_norm": 0.07409183519590924, |
| "learning_rate": 5e-06, |
| "loss": 0.8925, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.6693919264179867, |
| "grad_norm": 0.3253569122085339, |
| "learning_rate": 5e-06, |
| "loss": 0.897, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.6745017884517118, |
| "grad_norm": 0.07002482080488048, |
| "learning_rate": 5e-06, |
| "loss": 0.8905, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.6796116504854369, |
| "grad_norm": 0.8542494508434101, |
| "learning_rate": 5e-06, |
| "loss": 0.9052, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.684721512519162, |
| "grad_norm": 0.09406257309495393, |
| "learning_rate": 5e-06, |
| "loss": 0.91, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.6898313745528871, |
| "grad_norm": 0.08661096377263379, |
| "learning_rate": 5e-06, |
| "loss": 0.9096, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.6949412365866121, |
| "grad_norm": 0.07544412418690473, |
| "learning_rate": 5e-06, |
| "loss": 0.9039, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.7000510986203372, |
| "grad_norm": 0.07803646465670126, |
| "learning_rate": 5e-06, |
| "loss": 0.8892, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.7051609606540623, |
| "grad_norm": 0.07088553834184458, |
| "learning_rate": 5e-06, |
| "loss": 0.8975, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.7102708226877874, |
| "grad_norm": 0.08931444692520309, |
| "learning_rate": 5e-06, |
| "loss": 0.882, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.7153806847215125, |
| "grad_norm": 0.07261368946056458, |
| "learning_rate": 5e-06, |
| "loss": 0.903, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.7204905467552376, |
| "grad_norm": 0.07364955770557434, |
| "learning_rate": 5e-06, |
| "loss": 0.9019, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.7256004087889627, |
| "grad_norm": 0.07232725672310655, |
| "learning_rate": 5e-06, |
| "loss": 0.9047, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.7307102708226878, |
| "grad_norm": 0.07284538520098455, |
| "learning_rate": 5e-06, |
| "loss": 0.8969, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.7358201328564129, |
| "grad_norm": 0.06804520794145767, |
| "learning_rate": 5e-06, |
| "loss": 0.8983, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.740929994890138, |
| "grad_norm": 0.07080981200832792, |
| "learning_rate": 5e-06, |
| "loss": 0.9034, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.7460398569238631, |
| "grad_norm": 0.07015236628919412, |
| "learning_rate": 5e-06, |
| "loss": 0.9033, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.7511497189575882, |
| "grad_norm": 0.10418032275009997, |
| "learning_rate": 5e-06, |
| "loss": 0.9008, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.7562595809913132, |
| "grad_norm": 0.07009703060030868, |
| "learning_rate": 5e-06, |
| "loss": 0.9006, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.7613694430250383, |
| "grad_norm": 0.07561374961635098, |
| "learning_rate": 5e-06, |
| "loss": 0.8992, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.7664793050587634, |
| "grad_norm": 0.07543087417873817, |
| "learning_rate": 5e-06, |
| "loss": 0.9022, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.7715891670924885, |
| "grad_norm": 0.07303222610905763, |
| "learning_rate": 5e-06, |
| "loss": 0.8876, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.7766990291262136, |
| "grad_norm": 0.12947977214304454, |
| "learning_rate": 5e-06, |
| "loss": 0.9017, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.7818088911599387, |
| "grad_norm": 0.07389484625510247, |
| "learning_rate": 5e-06, |
| "loss": 0.8972, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.7869187531936638, |
| "grad_norm": 0.07577895984597245, |
| "learning_rate": 5e-06, |
| "loss": 0.9029, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.7920286152273889, |
| "grad_norm": 0.07179725920965228, |
| "learning_rate": 5e-06, |
| "loss": 0.8973, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.797138477261114, |
| "grad_norm": 0.09446753813662215, |
| "learning_rate": 5e-06, |
| "loss": 0.8884, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.8022483392948391, |
| "grad_norm": 0.0694040937128912, |
| "learning_rate": 5e-06, |
| "loss": 0.8858, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.8073582013285642, |
| "grad_norm": 0.07119010100651514, |
| "learning_rate": 5e-06, |
| "loss": 0.891, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.8124680633622893, |
| "grad_norm": 0.06814136288038194, |
| "learning_rate": 5e-06, |
| "loss": 0.8882, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.8175779253960143, |
| "grad_norm": 0.14101206207855727, |
| "learning_rate": 5e-06, |
| "loss": 0.8901, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.8226877874297394, |
| "grad_norm": 0.06823593991844153, |
| "learning_rate": 5e-06, |
| "loss": 0.8969, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.8277976494634645, |
| "grad_norm": 0.0781844807225488, |
| "learning_rate": 5e-06, |
| "loss": 0.9092, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.8329075114971896, |
| "grad_norm": 0.07770931010296782, |
| "learning_rate": 5e-06, |
| "loss": 0.9099, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.8380173735309147, |
| "grad_norm": 0.07037044497495822, |
| "learning_rate": 5e-06, |
| "loss": 0.8955, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.8431272355646398, |
| "grad_norm": 0.0765067693249436, |
| "learning_rate": 5e-06, |
| "loss": 0.8839, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.8482370975983649, |
| "grad_norm": 0.09158943006728, |
| "learning_rate": 5e-06, |
| "loss": 0.9055, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.85334695963209, |
| "grad_norm": 0.07388296884455732, |
| "learning_rate": 5e-06, |
| "loss": 0.8829, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.858456821665815, |
| "grad_norm": 0.07364367251534173, |
| "learning_rate": 5e-06, |
| "loss": 0.8906, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.8635666836995401, |
| "grad_norm": 0.10472917075697648, |
| "learning_rate": 5e-06, |
| "loss": 0.8998, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.8686765457332652, |
| "grad_norm": 0.07209129300287077, |
| "learning_rate": 5e-06, |
| "loss": 0.9079, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.8737864077669902, |
| "grad_norm": 0.06988678080238575, |
| "learning_rate": 5e-06, |
| "loss": 0.903, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.8788962698007153, |
| "grad_norm": 0.07515978821834793, |
| "learning_rate": 5e-06, |
| "loss": 0.8904, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.8840061318344404, |
| "grad_norm": 0.07433094849872902, |
| "learning_rate": 5e-06, |
| "loss": 0.889, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.8891159938681655, |
| "grad_norm": 0.08433306671757082, |
| "learning_rate": 5e-06, |
| "loss": 0.8986, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.8942258559018906, |
| "grad_norm": 0.07396884075939439, |
| "learning_rate": 5e-06, |
| "loss": 0.8874, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.8993357179356157, |
| "grad_norm": 0.07751456977281067, |
| "learning_rate": 5e-06, |
| "loss": 0.8992, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.9044455799693408, |
| "grad_norm": 0.0789527788405623, |
| "learning_rate": 5e-06, |
| "loss": 0.8969, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.9095554420030659, |
| "grad_norm": 0.07602904790252976, |
| "learning_rate": 5e-06, |
| "loss": 0.8901, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.914665304036791, |
| "grad_norm": 0.06982248060839691, |
| "learning_rate": 5e-06, |
| "loss": 0.8968, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.9197751660705161, |
| "grad_norm": 0.07169359819101034, |
| "learning_rate": 5e-06, |
| "loss": 0.9087, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.9248850281042412, |
| "grad_norm": 0.07321342917964954, |
| "learning_rate": 5e-06, |
| "loss": 0.8969, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.9299948901379663, |
| "grad_norm": 0.07444618890892986, |
| "learning_rate": 5e-06, |
| "loss": 0.9045, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.9351047521716913, |
| "grad_norm": 0.11365069738490574, |
| "learning_rate": 5e-06, |
| "loss": 0.8909, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.9402146142054164, |
| "grad_norm": 0.380588282512115, |
| "learning_rate": 5e-06, |
| "loss": 0.8803, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.9453244762391415, |
| "grad_norm": 0.07262951461490495, |
| "learning_rate": 5e-06, |
| "loss": 0.8718, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.9504343382728666, |
| "grad_norm": 0.06829017808656876, |
| "learning_rate": 5e-06, |
| "loss": 0.8822, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.9555442003065917, |
| "grad_norm": 0.07878675404968587, |
| "learning_rate": 5e-06, |
| "loss": 0.878, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.9606540623403168, |
| "grad_norm": 0.0760531098736545, |
| "learning_rate": 5e-06, |
| "loss": 0.8808, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.9657639243740419, |
| "grad_norm": 0.12494712249352007, |
| "learning_rate": 5e-06, |
| "loss": 0.8829, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.970873786407767, |
| "grad_norm": 0.07219974763591076, |
| "learning_rate": 5e-06, |
| "loss": 0.8875, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.9759836484414921, |
| "grad_norm": 0.0678705623185834, |
| "learning_rate": 5e-06, |
| "loss": 0.9012, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.9810935104752172, |
| "grad_norm": 0.07156172069177699, |
| "learning_rate": 5e-06, |
| "loss": 0.898, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.9862033725089423, |
| "grad_norm": 0.0732677504535335, |
| "learning_rate": 5e-06, |
| "loss": 0.8962, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.9913132345426674, |
| "grad_norm": 0.07202912622478508, |
| "learning_rate": 5e-06, |
| "loss": 0.8951, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.9964230965763924, |
| "grad_norm": 0.10969747960973296, |
| "learning_rate": 5e-06, |
| "loss": 0.9097, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.0015329586101176, |
| "grad_norm": 0.06912267564528896, |
| "learning_rate": 5e-06, |
| "loss": 0.8839, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.0066428206438427, |
| "grad_norm": 0.07686255349334188, |
| "learning_rate": 5e-06, |
| "loss": 0.8895, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.0117526826775678, |
| "grad_norm": 0.07239494301457147, |
| "learning_rate": 5e-06, |
| "loss": 0.8802, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.016862544711293, |
| "grad_norm": 0.06896146275536877, |
| "learning_rate": 5e-06, |
| "loss": 0.8743, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.021972406745018, |
| "grad_norm": 0.06973937218730168, |
| "learning_rate": 5e-06, |
| "loss": 0.8858, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.0270822687787429, |
| "grad_norm": 0.07590849152549478, |
| "learning_rate": 5e-06, |
| "loss": 0.8702, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.032192130812468, |
| "grad_norm": 0.07727922335579492, |
| "learning_rate": 5e-06, |
| "loss": 0.8802, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.037301992846193, |
| "grad_norm": 0.07317945118774036, |
| "learning_rate": 5e-06, |
| "loss": 0.8764, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.0424118548799182, |
| "grad_norm": 0.0790764962189524, |
| "learning_rate": 5e-06, |
| "loss": 0.8675, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.0475217169136433, |
| "grad_norm": 0.14927111441309496, |
| "learning_rate": 5e-06, |
| "loss": 0.8795, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.0526315789473684, |
| "grad_norm": 0.09144412400190963, |
| "learning_rate": 5e-06, |
| "loss": 0.8787, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.0577414409810935, |
| "grad_norm": 0.07045354181303777, |
| "learning_rate": 5e-06, |
| "loss": 0.8797, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.0628513030148186, |
| "grad_norm": 0.07891724899455275, |
| "learning_rate": 5e-06, |
| "loss": 0.8905, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.0679611650485437, |
| "grad_norm": 0.06772214066866822, |
| "learning_rate": 5e-06, |
| "loss": 0.8872, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.0730710270822688, |
| "grad_norm": 0.0712589563903319, |
| "learning_rate": 5e-06, |
| "loss": 0.8855, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.0781808891159939, |
| "grad_norm": 0.06862698369349315, |
| "learning_rate": 5e-06, |
| "loss": 0.881, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.083290751149719, |
| "grad_norm": 0.0736606231100836, |
| "learning_rate": 5e-06, |
| "loss": 0.8674, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.088400613183444, |
| "grad_norm": 0.07106355054734692, |
| "learning_rate": 5e-06, |
| "loss": 0.8946, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.0935104752171692, |
| "grad_norm": 0.07255234123815231, |
| "learning_rate": 5e-06, |
| "loss": 0.882, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.0986203372508943, |
| "grad_norm": 0.0732742082692386, |
| "learning_rate": 5e-06, |
| "loss": 0.8676, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.1037301992846194, |
| "grad_norm": 0.06964362410343532, |
| "learning_rate": 5e-06, |
| "loss": 0.8774, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.1088400613183444, |
| "grad_norm": 0.06779365708748122, |
| "learning_rate": 5e-06, |
| "loss": 0.8746, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.1139499233520695, |
| "grad_norm": 0.1233845297748753, |
| "learning_rate": 5e-06, |
| "loss": 0.8765, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.1190597853857946, |
| "grad_norm": 0.08906300490745893, |
| "learning_rate": 5e-06, |
| "loss": 0.8887, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.1241696474195197, |
| "grad_norm": 0.09632061303323498, |
| "learning_rate": 5e-06, |
| "loss": 0.873, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.1292795094532448, |
| "grad_norm": 0.08521280891691754, |
| "learning_rate": 5e-06, |
| "loss": 0.8898, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.13438937148697, |
| "grad_norm": 0.08360918869150777, |
| "learning_rate": 5e-06, |
| "loss": 0.8744, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.139499233520695, |
| "grad_norm": 0.07235955845669943, |
| "learning_rate": 5e-06, |
| "loss": 0.871, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.14460909555442, |
| "grad_norm": 0.0730315588992174, |
| "learning_rate": 5e-06, |
| "loss": 0.8821, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.1497189575881452, |
| "grad_norm": 0.11078438063259888, |
| "learning_rate": 5e-06, |
| "loss": 0.8769, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.1548288196218701, |
| "grad_norm": 0.07056038440126992, |
| "learning_rate": 5e-06, |
| "loss": 0.8794, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.1599386816555952, |
| "grad_norm": 0.0708667161304354, |
| "learning_rate": 5e-06, |
| "loss": 0.8816, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.1650485436893203, |
| "grad_norm": 0.07609323526554022, |
| "learning_rate": 5e-06, |
| "loss": 0.8819, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.1701584057230454, |
| "grad_norm": 0.0735028145209297, |
| "learning_rate": 5e-06, |
| "loss": 0.8703, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.1752682677567705, |
| "grad_norm": 0.07069493332631555, |
| "learning_rate": 5e-06, |
| "loss": 0.8847, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.1803781297904956, |
| "grad_norm": 0.072283465693465, |
| "learning_rate": 5e-06, |
| "loss": 0.8821, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.1854879918242207, |
| "grad_norm": 0.0682393777040698, |
| "learning_rate": 5e-06, |
| "loss": 0.8783, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.1905978538579458, |
| "grad_norm": 0.07033706862496379, |
| "learning_rate": 5e-06, |
| "loss": 0.8803, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.195707715891671, |
| "grad_norm": 0.07616585487021542, |
| "learning_rate": 5e-06, |
| "loss": 0.8797, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.200817577925396, |
| "grad_norm": 0.09257368897028359, |
| "learning_rate": 5e-06, |
| "loss": 0.8848, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.205927439959121, |
| "grad_norm": 0.07218601192609732, |
| "learning_rate": 5e-06, |
| "loss": 0.88, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.2110373019928462, |
| "grad_norm": 0.06958174210341682, |
| "learning_rate": 5e-06, |
| "loss": 0.8736, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.2161471640265713, |
| "grad_norm": 0.07752486671959553, |
| "learning_rate": 5e-06, |
| "loss": 0.8872, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.2212570260602964, |
| "grad_norm": 0.06925884038134095, |
| "learning_rate": 5e-06, |
| "loss": 0.8723, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.2263668880940215, |
| "grad_norm": 0.08334609436821461, |
| "learning_rate": 5e-06, |
| "loss": 0.8733, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.2314767501277466, |
| "grad_norm": 0.07385728362174653, |
| "learning_rate": 5e-06, |
| "loss": 0.884, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.2365866121614717, |
| "grad_norm": 0.07450268605349716, |
| "learning_rate": 5e-06, |
| "loss": 0.8759, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.2416964741951968, |
| "grad_norm": 0.07354090310210888, |
| "learning_rate": 5e-06, |
| "loss": 0.8688, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.246806336228922, |
| "grad_norm": 0.07320978260247811, |
| "learning_rate": 5e-06, |
| "loss": 0.8714, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.2519161982626468, |
| "grad_norm": 0.11135661739097093, |
| "learning_rate": 5e-06, |
| "loss": 0.8739, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.257026060296372, |
| "grad_norm": 0.06689839605333785, |
| "learning_rate": 5e-06, |
| "loss": 0.8765, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.262135922330097, |
| "grad_norm": 0.06878455740414403, |
| "learning_rate": 5e-06, |
| "loss": 0.8689, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.2672457843638223, |
| "grad_norm": 0.07939944943381874, |
| "learning_rate": 5e-06, |
| "loss": 0.8746, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.2723556463975472, |
| "grad_norm": 0.0847486139125745, |
| "learning_rate": 5e-06, |
| "loss": 0.8559, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.2774655084312725, |
| "grad_norm": 0.07908001512097204, |
| "learning_rate": 5e-06, |
| "loss": 0.8837, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.2825753704649974, |
| "grad_norm": 0.07179831207428604, |
| "learning_rate": 5e-06, |
| "loss": 0.8719, |
| "step": 2510 |
| }, |
| { |
| "epoch": 1.2876852324987225, |
| "grad_norm": 0.06908942799455607, |
| "learning_rate": 5e-06, |
| "loss": 0.8719, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.2927950945324476, |
| "grad_norm": 0.18170225872460127, |
| "learning_rate": 5e-06, |
| "loss": 0.8716, |
| "step": 2530 |
| }, |
| { |
| "epoch": 1.2979049565661727, |
| "grad_norm": 0.07191218825096998, |
| "learning_rate": 5e-06, |
| "loss": 0.8722, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.3030148185998978, |
| "grad_norm": 0.07372060023260767, |
| "learning_rate": 5e-06, |
| "loss": 0.864, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.3081246806336229, |
| "grad_norm": 0.06857070499864196, |
| "learning_rate": 5e-06, |
| "loss": 0.8737, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.313234542667348, |
| "grad_norm": 0.07004154529738099, |
| "learning_rate": 5e-06, |
| "loss": 0.8813, |
| "step": 2570 |
| }, |
| { |
| "epoch": 1.318344404701073, |
| "grad_norm": 0.07646266858295366, |
| "learning_rate": 5e-06, |
| "loss": 0.8673, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.3234542667347982, |
| "grad_norm": 0.22621512935898874, |
| "learning_rate": 5e-06, |
| "loss": 0.874, |
| "step": 2590 |
| }, |
| { |
| "epoch": 1.3285641287685233, |
| "grad_norm": 0.07172127109965361, |
| "learning_rate": 5e-06, |
| "loss": 0.8702, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.3336739908022484, |
| "grad_norm": 0.07919820863816981, |
| "learning_rate": 5e-06, |
| "loss": 0.8714, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.3387838528359735, |
| "grad_norm": 0.0737384119314816, |
| "learning_rate": 5e-06, |
| "loss": 0.8596, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.3438937148696986, |
| "grad_norm": 0.07649678940149303, |
| "learning_rate": 5e-06, |
| "loss": 0.8762, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.3490035769034237, |
| "grad_norm": 0.07616587969512677, |
| "learning_rate": 5e-06, |
| "loss": 0.8752, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.3541134389371488, |
| "grad_norm": 0.07029295823641774, |
| "learning_rate": 5e-06, |
| "loss": 0.8873, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.3592233009708738, |
| "grad_norm": 0.09494349210733874, |
| "learning_rate": 5e-06, |
| "loss": 0.8711, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.364333163004599, |
| "grad_norm": 0.07259460873158792, |
| "learning_rate": 5e-06, |
| "loss": 0.8637, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.369443025038324, |
| "grad_norm": 0.07116222480426102, |
| "learning_rate": 5e-06, |
| "loss": 0.871, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.3745528870720491, |
| "grad_norm": 0.07020506586822424, |
| "learning_rate": 5e-06, |
| "loss": 0.8695, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.379662749105774, |
| "grad_norm": 0.07347815298194012, |
| "learning_rate": 5e-06, |
| "loss": 0.8635, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.3847726111394993, |
| "grad_norm": 0.07534096599250913, |
| "learning_rate": 5e-06, |
| "loss": 0.874, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.3898824731732242, |
| "grad_norm": 0.07312451287982565, |
| "learning_rate": 5e-06, |
| "loss": 0.8583, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.3949923352069495, |
| "grad_norm": 0.07656396202261084, |
| "learning_rate": 5e-06, |
| "loss": 0.8757, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.4001021972406744, |
| "grad_norm": 0.06967035802932788, |
| "learning_rate": 5e-06, |
| "loss": 0.8679, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.4052120592743995, |
| "grad_norm": 0.07040784275347066, |
| "learning_rate": 5e-06, |
| "loss": 0.8691, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.4103219213081246, |
| "grad_norm": 0.07613577722321895, |
| "learning_rate": 5e-06, |
| "loss": 0.8622, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.4154317833418497, |
| "grad_norm": 0.0864741566205661, |
| "learning_rate": 5e-06, |
| "loss": 0.8795, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.4205416453755748, |
| "grad_norm": 0.07310285802451263, |
| "learning_rate": 5e-06, |
| "loss": 0.8652, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.4256515074093, |
| "grad_norm": 0.09973189293248541, |
| "learning_rate": 5e-06, |
| "loss": 0.8588, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.430761369443025, |
| "grad_norm": 0.06701808614361114, |
| "learning_rate": 5e-06, |
| "loss": 0.8618, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.43587123147675, |
| "grad_norm": 0.07371045895798196, |
| "learning_rate": 5e-06, |
| "loss": 0.868, |
| "step": 2810 |
| }, |
| { |
| "epoch": 1.4409810935104752, |
| "grad_norm": 0.07317697928871572, |
| "learning_rate": 5e-06, |
| "loss": 0.8696, |
| "step": 2820 |
| }, |
| { |
| "epoch": 1.4460909555442003, |
| "grad_norm": 0.07564444911927448, |
| "learning_rate": 5e-06, |
| "loss": 0.8752, |
| "step": 2830 |
| }, |
| { |
| "epoch": 1.4512008175779254, |
| "grad_norm": 0.07293430197763458, |
| "learning_rate": 5e-06, |
| "loss": 0.8739, |
| "step": 2840 |
| }, |
| { |
| "epoch": 1.4563106796116505, |
| "grad_norm": 0.08105248343525902, |
| "learning_rate": 5e-06, |
| "loss": 0.8657, |
| "step": 2850 |
| }, |
| { |
| "epoch": 1.4614205416453756, |
| "grad_norm": 0.06997018312332681, |
| "learning_rate": 5e-06, |
| "loss": 0.8687, |
| "step": 2860 |
| }, |
| { |
| "epoch": 1.4665304036791007, |
| "grad_norm": 0.0747583962947945, |
| "learning_rate": 5e-06, |
| "loss": 0.876, |
| "step": 2870 |
| }, |
| { |
| "epoch": 1.4716402657128258, |
| "grad_norm": 0.07233657303691476, |
| "learning_rate": 5e-06, |
| "loss": 0.8737, |
| "step": 2880 |
| }, |
| { |
| "epoch": 1.476750127746551, |
| "grad_norm": 0.09587028399395618, |
| "learning_rate": 5e-06, |
| "loss": 0.8737, |
| "step": 2890 |
| }, |
| { |
| "epoch": 1.481859989780276, |
| "grad_norm": 0.07573225439316397, |
| "learning_rate": 5e-06, |
| "loss": 0.8768, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.486969851814001, |
| "grad_norm": 0.07814308912110395, |
| "learning_rate": 5e-06, |
| "loss": 0.8585, |
| "step": 2910 |
| }, |
| { |
| "epoch": 1.4920797138477262, |
| "grad_norm": 0.07563890740598028, |
| "learning_rate": 5e-06, |
| "loss": 0.8688, |
| "step": 2920 |
| }, |
| { |
| "epoch": 1.497189575881451, |
| "grad_norm": 0.09576148511380349, |
| "learning_rate": 5e-06, |
| "loss": 0.8664, |
| "step": 2930 |
| }, |
| { |
| "epoch": 1.5022994379151764, |
| "grad_norm": 0.07323289783838559, |
| "learning_rate": 5e-06, |
| "loss": 0.8635, |
| "step": 2940 |
| }, |
| { |
| "epoch": 1.5074092999489013, |
| "grad_norm": 0.06803171848149765, |
| "learning_rate": 5e-06, |
| "loss": 0.87, |
| "step": 2950 |
| }, |
| { |
| "epoch": 1.5125191619826266, |
| "grad_norm": 0.0750090128477362, |
| "learning_rate": 5e-06, |
| "loss": 0.8772, |
| "step": 2960 |
| }, |
| { |
| "epoch": 1.5176290240163515, |
| "grad_norm": 0.07137689878588128, |
| "learning_rate": 5e-06, |
| "loss": 0.8751, |
| "step": 2970 |
| }, |
| { |
| "epoch": 1.5227388860500768, |
| "grad_norm": 0.07023359972800564, |
| "learning_rate": 5e-06, |
| "loss": 0.8643, |
| "step": 2980 |
| }, |
| { |
| "epoch": 1.5278487480838017, |
| "grad_norm": 0.07455037859546688, |
| "learning_rate": 5e-06, |
| "loss": 0.861, |
| "step": 2990 |
| }, |
| { |
| "epoch": 1.532958610117527, |
| "grad_norm": 0.25433406248411833, |
| "learning_rate": 5e-06, |
| "loss": 0.8647, |
| "step": 3000 |
| }, |
| { |
| "epoch": 1.5380684721512519, |
| "grad_norm": 0.07418037183166513, |
| "learning_rate": 5e-06, |
| "loss": 0.8722, |
| "step": 3010 |
| }, |
| { |
| "epoch": 1.543178334184977, |
| "grad_norm": 0.527753296001325, |
| "learning_rate": 5e-06, |
| "loss": 0.8487, |
| "step": 3020 |
| }, |
| { |
| "epoch": 1.548288196218702, |
| "grad_norm": 0.08808550920766758, |
| "learning_rate": 5e-06, |
| "loss": 0.8496, |
| "step": 3030 |
| }, |
| { |
| "epoch": 1.5533980582524272, |
| "grad_norm": 0.2138159671824102, |
| "learning_rate": 5e-06, |
| "loss": 0.8589, |
| "step": 3040 |
| }, |
| { |
| "epoch": 1.5585079202861523, |
| "grad_norm": 0.0756879397945152, |
| "learning_rate": 5e-06, |
| "loss": 0.8611, |
| "step": 3050 |
| }, |
| { |
| "epoch": 1.5636177823198774, |
| "grad_norm": 0.07261770215074904, |
| "learning_rate": 5e-06, |
| "loss": 0.8653, |
| "step": 3060 |
| }, |
| { |
| "epoch": 1.5687276443536025, |
| "grad_norm": 0.0716298530862272, |
| "learning_rate": 5e-06, |
| "loss": 0.8742, |
| "step": 3070 |
| }, |
| { |
| "epoch": 1.5738375063873276, |
| "grad_norm": 0.07104512740332262, |
| "learning_rate": 5e-06, |
| "loss": 0.8639, |
| "step": 3080 |
| }, |
| { |
| "epoch": 1.5789473684210527, |
| "grad_norm": 0.0734502304224508, |
| "learning_rate": 5e-06, |
| "loss": 0.8654, |
| "step": 3090 |
| }, |
| { |
| "epoch": 1.5840572304547778, |
| "grad_norm": 0.0718119752165925, |
| "learning_rate": 5e-06, |
| "loss": 0.8829, |
| "step": 3100 |
| }, |
| { |
| "epoch": 1.5891670924885029, |
| "grad_norm": 0.09084312572124162, |
| "learning_rate": 5e-06, |
| "loss": 0.87, |
| "step": 3110 |
| }, |
| { |
| "epoch": 1.594276954522228, |
| "grad_norm": 0.0718386089812322, |
| "learning_rate": 5e-06, |
| "loss": 0.8674, |
| "step": 3120 |
| }, |
| { |
| "epoch": 1.599386816555953, |
| "grad_norm": 0.07237170282771108, |
| "learning_rate": 5e-06, |
| "loss": 0.8773, |
| "step": 3130 |
| }, |
| { |
| "epoch": 1.604496678589678, |
| "grad_norm": 0.08717441511772783, |
| "learning_rate": 5e-06, |
| "loss": 0.8596, |
| "step": 3140 |
| }, |
| { |
| "epoch": 1.6096065406234032, |
| "grad_norm": 0.07543204298441727, |
| "learning_rate": 5e-06, |
| "loss": 0.8662, |
| "step": 3150 |
| }, |
| { |
| "epoch": 1.6147164026571281, |
| "grad_norm": 0.16450122906983272, |
| "learning_rate": 5e-06, |
| "loss": 0.8521, |
| "step": 3160 |
| }, |
| { |
| "epoch": 1.6198262646908534, |
| "grad_norm": 0.0761301308978785, |
| "learning_rate": 5e-06, |
| "loss": 0.8676, |
| "step": 3170 |
| }, |
| { |
| "epoch": 1.6249361267245783, |
| "grad_norm": 0.07342038953957022, |
| "learning_rate": 5e-06, |
| "loss": 0.858, |
| "step": 3180 |
| }, |
| { |
| "epoch": 1.6300459887583036, |
| "grad_norm": 0.07763414058632681, |
| "learning_rate": 5e-06, |
| "loss": 0.8515, |
| "step": 3190 |
| }, |
| { |
| "epoch": 1.6351558507920285, |
| "grad_norm": 0.07214206206945292, |
| "learning_rate": 5e-06, |
| "loss": 0.8552, |
| "step": 3200 |
| }, |
| { |
| "epoch": 1.6402657128257538, |
| "grad_norm": 0.07217350290839333, |
| "learning_rate": 5e-06, |
| "loss": 0.8692, |
| "step": 3210 |
| }, |
| { |
| "epoch": 1.6453755748594787, |
| "grad_norm": 0.07049849978950093, |
| "learning_rate": 5e-06, |
| "loss": 0.8637, |
| "step": 3220 |
| }, |
| { |
| "epoch": 1.650485436893204, |
| "grad_norm": 0.07280794458006132, |
| "learning_rate": 5e-06, |
| "loss": 0.865, |
| "step": 3230 |
| }, |
| { |
| "epoch": 1.655595298926929, |
| "grad_norm": 0.0725642747319213, |
| "learning_rate": 5e-06, |
| "loss": 0.8707, |
| "step": 3240 |
| }, |
| { |
| "epoch": 1.660705160960654, |
| "grad_norm": 0.0833892314403624, |
| "learning_rate": 5e-06, |
| "loss": 0.8563, |
| "step": 3250 |
| }, |
| { |
| "epoch": 1.6658150229943791, |
| "grad_norm": 0.07584097737400752, |
| "learning_rate": 5e-06, |
| "loss": 0.8641, |
| "step": 3260 |
| }, |
| { |
| "epoch": 1.6709248850281042, |
| "grad_norm": 0.07213631314506445, |
| "learning_rate": 5e-06, |
| "loss": 0.8481, |
| "step": 3270 |
| }, |
| { |
| "epoch": 1.6760347470618293, |
| "grad_norm": 0.09381953559357656, |
| "learning_rate": 5e-06, |
| "loss": 0.8701, |
| "step": 3280 |
| }, |
| { |
| "epoch": 1.6811446090955544, |
| "grad_norm": 0.08666983262995791, |
| "learning_rate": 5e-06, |
| "loss": 0.8647, |
| "step": 3290 |
| }, |
| { |
| "epoch": 1.6862544711292795, |
| "grad_norm": 0.07241102137300287, |
| "learning_rate": 5e-06, |
| "loss": 0.8659, |
| "step": 3300 |
| }, |
| { |
| "epoch": 1.6913643331630046, |
| "grad_norm": 0.07570259861415368, |
| "learning_rate": 5e-06, |
| "loss": 0.857, |
| "step": 3310 |
| }, |
| { |
| "epoch": 1.6964741951967297, |
| "grad_norm": 0.0712132785838072, |
| "learning_rate": 5e-06, |
| "loss": 0.8628, |
| "step": 3320 |
| }, |
| { |
| "epoch": 1.7015840572304548, |
| "grad_norm": 0.08048218877183527, |
| "learning_rate": 5e-06, |
| "loss": 0.8695, |
| "step": 3330 |
| }, |
| { |
| "epoch": 1.70669391926418, |
| "grad_norm": 0.09318443713082784, |
| "learning_rate": 5e-06, |
| "loss": 0.8517, |
| "step": 3340 |
| }, |
| { |
| "epoch": 1.711803781297905, |
| "grad_norm": 0.07475815362258889, |
| "learning_rate": 5e-06, |
| "loss": 0.8511, |
| "step": 3350 |
| }, |
| { |
| "epoch": 1.71691364333163, |
| "grad_norm": 0.17675220557910395, |
| "learning_rate": 5e-06, |
| "loss": 0.8704, |
| "step": 3360 |
| }, |
| { |
| "epoch": 1.722023505365355, |
| "grad_norm": 0.08155115744528993, |
| "learning_rate": 5e-06, |
| "loss": 0.8782, |
| "step": 3370 |
| }, |
| { |
| "epoch": 1.7271333673990803, |
| "grad_norm": 0.0688426924243207, |
| "learning_rate": 5e-06, |
| "loss": 0.8745, |
| "step": 3380 |
| }, |
| { |
| "epoch": 1.7322432294328052, |
| "grad_norm": 0.20082535327419024, |
| "learning_rate": 5e-06, |
| "loss": 0.8714, |
| "step": 3390 |
| }, |
| { |
| "epoch": 1.7373530914665305, |
| "grad_norm": 0.07080189321410434, |
| "learning_rate": 5e-06, |
| "loss": 0.8599, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.7424629535002554, |
| "grad_norm": 0.07121276144536708, |
| "learning_rate": 5e-06, |
| "loss": 0.8559, |
| "step": 3410 |
| }, |
| { |
| "epoch": 1.7475728155339807, |
| "grad_norm": 0.07900066453877426, |
| "learning_rate": 5e-06, |
| "loss": 0.8478, |
| "step": 3420 |
| }, |
| { |
| "epoch": 1.7526826775677056, |
| "grad_norm": 0.06739380270330443, |
| "learning_rate": 5e-06, |
| "loss": 0.8607, |
| "step": 3430 |
| }, |
| { |
| "epoch": 1.757792539601431, |
| "grad_norm": 0.07034902532215459, |
| "learning_rate": 5e-06, |
| "loss": 0.8571, |
| "step": 3440 |
| }, |
| { |
| "epoch": 1.7629024016351558, |
| "grad_norm": 0.12075846844627011, |
| "learning_rate": 5e-06, |
| "loss": 0.8656, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.768012263668881, |
| "grad_norm": 0.06907079017712828, |
| "learning_rate": 5e-06, |
| "loss": 0.8639, |
| "step": 3460 |
| }, |
| { |
| "epoch": 1.773122125702606, |
| "grad_norm": 0.07179248413436795, |
| "learning_rate": 5e-06, |
| "loss": 0.8576, |
| "step": 3470 |
| }, |
| { |
| "epoch": 1.778231987736331, |
| "grad_norm": 0.07132838373781546, |
| "learning_rate": 5e-06, |
| "loss": 0.8491, |
| "step": 3480 |
| }, |
| { |
| "epoch": 1.7833418497700562, |
| "grad_norm": 0.06976172529981421, |
| "learning_rate": 5e-06, |
| "loss": 0.862, |
| "step": 3490 |
| }, |
| { |
| "epoch": 1.7884517118037813, |
| "grad_norm": 0.07698270839551638, |
| "learning_rate": 5e-06, |
| "loss": 0.8798, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.7935615738375064, |
| "grad_norm": 0.07221517095149313, |
| "learning_rate": 5e-06, |
| "loss": 0.8614, |
| "step": 3510 |
| }, |
| { |
| "epoch": 1.7986714358712315, |
| "grad_norm": 0.06942929153366358, |
| "learning_rate": 5e-06, |
| "loss": 0.8516, |
| "step": 3520 |
| }, |
| { |
| "epoch": 1.8037812979049566, |
| "grad_norm": 0.07515778700424314, |
| "learning_rate": 5e-06, |
| "loss": 0.8513, |
| "step": 3530 |
| }, |
| { |
| "epoch": 1.8088911599386817, |
| "grad_norm": 0.09189050887868352, |
| "learning_rate": 5e-06, |
| "loss": 0.8516, |
| "step": 3540 |
| }, |
| { |
| "epoch": 1.8140010219724068, |
| "grad_norm": 0.0728814328355567, |
| "learning_rate": 5e-06, |
| "loss": 0.8621, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.8191108840061319, |
| "grad_norm": 0.10913457550732027, |
| "learning_rate": 5e-06, |
| "loss": 0.8646, |
| "step": 3560 |
| }, |
| { |
| "epoch": 1.824220746039857, |
| "grad_norm": 0.07052348211530159, |
| "learning_rate": 5e-06, |
| "loss": 0.8681, |
| "step": 3570 |
| }, |
| { |
| "epoch": 1.829330608073582, |
| "grad_norm": 0.07264597385076586, |
| "learning_rate": 5e-06, |
| "loss": 0.8511, |
| "step": 3580 |
| }, |
| { |
| "epoch": 1.8344404701073072, |
| "grad_norm": 0.07018720603142706, |
| "learning_rate": 5e-06, |
| "loss": 0.8496, |
| "step": 3590 |
| }, |
| { |
| "epoch": 1.839550332141032, |
| "grad_norm": 0.07405274174709763, |
| "learning_rate": 5e-06, |
| "loss": 0.8642, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.8446601941747574, |
| "grad_norm": 0.06823076228063171, |
| "learning_rate": 5e-06, |
| "loss": 0.8688, |
| "step": 3610 |
| }, |
| { |
| "epoch": 1.8497700562084822, |
| "grad_norm": 0.07162073827665592, |
| "learning_rate": 5e-06, |
| "loss": 0.8539, |
| "step": 3620 |
| }, |
| { |
| "epoch": 1.8548799182422075, |
| "grad_norm": 0.08920479490208502, |
| "learning_rate": 5e-06, |
| "loss": 0.8654, |
| "step": 3630 |
| }, |
| { |
| "epoch": 1.8599897802759324, |
| "grad_norm": 0.06984715874447373, |
| "learning_rate": 5e-06, |
| "loss": 0.8398, |
| "step": 3640 |
| }, |
| { |
| "epoch": 1.8650996423096577, |
| "grad_norm": 0.07140680546752168, |
| "learning_rate": 5e-06, |
| "loss": 0.8563, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.8702095043433826, |
| "grad_norm": 0.074558519928601, |
| "learning_rate": 5e-06, |
| "loss": 0.8538, |
| "step": 3660 |
| }, |
| { |
| "epoch": 1.875319366377108, |
| "grad_norm": 0.0721153352880791, |
| "learning_rate": 5e-06, |
| "loss": 0.8681, |
| "step": 3670 |
| }, |
| { |
| "epoch": 1.8804292284108328, |
| "grad_norm": 0.0758789196857535, |
| "learning_rate": 5e-06, |
| "loss": 0.8635, |
| "step": 3680 |
| }, |
| { |
| "epoch": 1.8855390904445581, |
| "grad_norm": 0.14485848629874987, |
| "learning_rate": 5e-06, |
| "loss": 0.8481, |
| "step": 3690 |
| }, |
| { |
| "epoch": 1.890648952478283, |
| "grad_norm": 0.0819983215124987, |
| "learning_rate": 5e-06, |
| "loss": 0.8629, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.8957588145120083, |
| "grad_norm": 0.12069351872374186, |
| "learning_rate": 5e-06, |
| "loss": 0.8586, |
| "step": 3710 |
| }, |
| { |
| "epoch": 1.9008686765457332, |
| "grad_norm": 0.07900003124096018, |
| "learning_rate": 5e-06, |
| "loss": 0.8601, |
| "step": 3720 |
| }, |
| { |
| "epoch": 1.9059785385794583, |
| "grad_norm": 0.07294378619834212, |
| "learning_rate": 5e-06, |
| "loss": 0.8587, |
| "step": 3730 |
| }, |
| { |
| "epoch": 1.9110884006131834, |
| "grad_norm": 0.07407840442755602, |
| "learning_rate": 5e-06, |
| "loss": 0.8528, |
| "step": 3740 |
| }, |
| { |
| "epoch": 1.9161982626469085, |
| "grad_norm": 0.06874497219883488, |
| "learning_rate": 5e-06, |
| "loss": 0.8605, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.9213081246806336, |
| "grad_norm": 0.07843559919629378, |
| "learning_rate": 5e-06, |
| "loss": 0.8527, |
| "step": 3760 |
| }, |
| { |
| "epoch": 1.9264179867143587, |
| "grad_norm": 0.07180996099623314, |
| "learning_rate": 5e-06, |
| "loss": 0.8523, |
| "step": 3770 |
| }, |
| { |
| "epoch": 1.9315278487480838, |
| "grad_norm": 0.08056676529863596, |
| "learning_rate": 5e-06, |
| "loss": 0.8633, |
| "step": 3780 |
| }, |
| { |
| "epoch": 1.936637710781809, |
| "grad_norm": 0.07275382170948991, |
| "learning_rate": 5e-06, |
| "loss": 0.8515, |
| "step": 3790 |
| }, |
| { |
| "epoch": 1.941747572815534, |
| "grad_norm": 0.07098939053342133, |
| "learning_rate": 5e-06, |
| "loss": 0.8547, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.946857434849259, |
| "grad_norm": 0.06701155010213629, |
| "learning_rate": 5e-06, |
| "loss": 0.8481, |
| "step": 3810 |
| }, |
| { |
| "epoch": 1.9519672968829842, |
| "grad_norm": 0.07031575089856135, |
| "learning_rate": 5e-06, |
| "loss": 0.8438, |
| "step": 3820 |
| }, |
| { |
| "epoch": 1.9570771589167093, |
| "grad_norm": 0.07564841797187823, |
| "learning_rate": 5e-06, |
| "loss": 0.8582, |
| "step": 3830 |
| }, |
| { |
| "epoch": 1.9621870209504344, |
| "grad_norm": 0.07024404592017057, |
| "learning_rate": 5e-06, |
| "loss": 0.8621, |
| "step": 3840 |
| }, |
| { |
| "epoch": 1.9672968829841593, |
| "grad_norm": 0.06812867527901585, |
| "learning_rate": 5e-06, |
| "loss": 0.8535, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.9724067450178846, |
| "grad_norm": 0.06914782077210797, |
| "learning_rate": 5e-06, |
| "loss": 0.8579, |
| "step": 3860 |
| }, |
| { |
| "epoch": 1.9775166070516095, |
| "grad_norm": 0.07264208755659882, |
| "learning_rate": 5e-06, |
| "loss": 0.8587, |
| "step": 3870 |
| }, |
| { |
| "epoch": 1.9826264690853348, |
| "grad_norm": 0.07656435460747489, |
| "learning_rate": 5e-06, |
| "loss": 0.8569, |
| "step": 3880 |
| }, |
| { |
| "epoch": 1.9877363311190597, |
| "grad_norm": 0.07973389243138154, |
| "learning_rate": 5e-06, |
| "loss": 0.8519, |
| "step": 3890 |
| }, |
| { |
| "epoch": 1.992846193152785, |
| "grad_norm": 0.07179271046170288, |
| "learning_rate": 5e-06, |
| "loss": 0.8622, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.9979560551865099, |
| "grad_norm": 0.11365614273983228, |
| "learning_rate": 5e-06, |
| "loss": 0.8573, |
| "step": 3910 |
| }, |
| { |
| "epoch": 2.003065917220235, |
| "grad_norm": 0.07141837340160201, |
| "learning_rate": 5e-06, |
| "loss": 0.8513, |
| "step": 3920 |
| }, |
| { |
| "epoch": 2.00817577925396, |
| "grad_norm": 0.10505624480952601, |
| "learning_rate": 5e-06, |
| "loss": 0.8631, |
| "step": 3930 |
| }, |
| { |
| "epoch": 2.0132856412876854, |
| "grad_norm": 0.07301133387642879, |
| "learning_rate": 5e-06, |
| "loss": 0.851, |
| "step": 3940 |
| }, |
| { |
| "epoch": 2.0183955033214103, |
| "grad_norm": 0.08124439773368344, |
| "learning_rate": 5e-06, |
| "loss": 0.8556, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.0235053653551356, |
| "grad_norm": 0.18456635683801864, |
| "learning_rate": 5e-06, |
| "loss": 0.8601, |
| "step": 3960 |
| }, |
| { |
| "epoch": 2.0286152273888605, |
| "grad_norm": 0.0839091878240074, |
| "learning_rate": 5e-06, |
| "loss": 0.8476, |
| "step": 3970 |
| }, |
| { |
| "epoch": 2.033725089422586, |
| "grad_norm": 0.06923549834655754, |
| "learning_rate": 5e-06, |
| "loss": 0.836, |
| "step": 3980 |
| }, |
| { |
| "epoch": 2.0388349514563107, |
| "grad_norm": 0.07270485015207773, |
| "learning_rate": 5e-06, |
| "loss": 0.8559, |
| "step": 3990 |
| }, |
| { |
| "epoch": 2.043944813490036, |
| "grad_norm": 0.0855738032972615, |
| "learning_rate": 5e-06, |
| "loss": 0.8527, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.049054675523761, |
| "grad_norm": 0.0737050057656388, |
| "learning_rate": 5e-06, |
| "loss": 0.8532, |
| "step": 4010 |
| }, |
| { |
| "epoch": 2.0541645375574857, |
| "grad_norm": 0.10214471523135125, |
| "learning_rate": 5e-06, |
| "loss": 0.8428, |
| "step": 4020 |
| }, |
| { |
| "epoch": 2.059274399591211, |
| "grad_norm": 0.0883446336624383, |
| "learning_rate": 5e-06, |
| "loss": 0.8451, |
| "step": 4030 |
| }, |
| { |
| "epoch": 2.064384261624936, |
| "grad_norm": 0.07417051518736814, |
| "learning_rate": 5e-06, |
| "loss": 0.8536, |
| "step": 4040 |
| }, |
| { |
| "epoch": 2.0694941236586613, |
| "grad_norm": 0.06790782347477319, |
| "learning_rate": 5e-06, |
| "loss": 0.8375, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.074603985692386, |
| "grad_norm": 0.07032456253507346, |
| "learning_rate": 5e-06, |
| "loss": 0.8429, |
| "step": 4060 |
| }, |
| { |
| "epoch": 2.0797138477261115, |
| "grad_norm": 0.10296354446960965, |
| "learning_rate": 5e-06, |
| "loss": 0.8459, |
| "step": 4070 |
| }, |
| { |
| "epoch": 2.0848237097598363, |
| "grad_norm": 0.11146967299257413, |
| "learning_rate": 5e-06, |
| "loss": 0.8368, |
| "step": 4080 |
| }, |
| { |
| "epoch": 2.0899335717935617, |
| "grad_norm": 0.07549742436944092, |
| "learning_rate": 5e-06, |
| "loss": 0.8523, |
| "step": 4090 |
| }, |
| { |
| "epoch": 2.0950434338272865, |
| "grad_norm": 0.07340256510209953, |
| "learning_rate": 5e-06, |
| "loss": 0.8322, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.100153295861012, |
| "grad_norm": 0.0757031546197513, |
| "learning_rate": 5e-06, |
| "loss": 0.8551, |
| "step": 4110 |
| }, |
| { |
| "epoch": 2.1052631578947367, |
| "grad_norm": 0.15844891302909508, |
| "learning_rate": 5e-06, |
| "loss": 0.8361, |
| "step": 4120 |
| }, |
| { |
| "epoch": 2.110373019928462, |
| "grad_norm": 0.07913043954436448, |
| "learning_rate": 5e-06, |
| "loss": 0.8478, |
| "step": 4130 |
| }, |
| { |
| "epoch": 2.115482881962187, |
| "grad_norm": 0.07876653993674046, |
| "learning_rate": 5e-06, |
| "loss": 0.8491, |
| "step": 4140 |
| }, |
| { |
| "epoch": 2.1205927439959122, |
| "grad_norm": 0.1767146264245717, |
| "learning_rate": 5e-06, |
| "loss": 0.8451, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.125702606029637, |
| "grad_norm": 0.06871322115584581, |
| "learning_rate": 5e-06, |
| "loss": 0.8586, |
| "step": 4160 |
| }, |
| { |
| "epoch": 2.1308124680633624, |
| "grad_norm": 0.07906449831613256, |
| "learning_rate": 5e-06, |
| "loss": 0.8477, |
| "step": 4170 |
| }, |
| { |
| "epoch": 2.1359223300970873, |
| "grad_norm": 0.09775775034969345, |
| "learning_rate": 5e-06, |
| "loss": 0.8522, |
| "step": 4180 |
| }, |
| { |
| "epoch": 2.1410321921308126, |
| "grad_norm": 0.0760441138088996, |
| "learning_rate": 5e-06, |
| "loss": 0.8469, |
| "step": 4190 |
| }, |
| { |
| "epoch": 2.1461420541645375, |
| "grad_norm": 0.07225182587888014, |
| "learning_rate": 5e-06, |
| "loss": 0.8608, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.151251916198263, |
| "grad_norm": 0.13154761525803269, |
| "learning_rate": 5e-06, |
| "loss": 0.8488, |
| "step": 4210 |
| }, |
| { |
| "epoch": 2.1563617782319877, |
| "grad_norm": 0.07374851045064566, |
| "learning_rate": 5e-06, |
| "loss": 0.841, |
| "step": 4220 |
| }, |
| { |
| "epoch": 2.161471640265713, |
| "grad_norm": 0.07031542242629284, |
| "learning_rate": 5e-06, |
| "loss": 0.8568, |
| "step": 4230 |
| }, |
| { |
| "epoch": 2.166581502299438, |
| "grad_norm": 0.07651902738251523, |
| "learning_rate": 5e-06, |
| "loss": 0.8352, |
| "step": 4240 |
| }, |
| { |
| "epoch": 2.171691364333163, |
| "grad_norm": 0.071926401689458, |
| "learning_rate": 5e-06, |
| "loss": 0.8558, |
| "step": 4250 |
| }, |
| { |
| "epoch": 2.176801226366888, |
| "grad_norm": 0.07423622272637805, |
| "learning_rate": 5e-06, |
| "loss": 0.8418, |
| "step": 4260 |
| }, |
| { |
| "epoch": 2.181911088400613, |
| "grad_norm": 0.07126311114741704, |
| "learning_rate": 5e-06, |
| "loss": 0.8522, |
| "step": 4270 |
| }, |
| { |
| "epoch": 2.1870209504343383, |
| "grad_norm": 0.07377430376003857, |
| "learning_rate": 5e-06, |
| "loss": 0.8446, |
| "step": 4280 |
| }, |
| { |
| "epoch": 2.192130812468063, |
| "grad_norm": 0.06949191578937867, |
| "learning_rate": 5e-06, |
| "loss": 0.8454, |
| "step": 4290 |
| }, |
| { |
| "epoch": 2.1972406745017885, |
| "grad_norm": 0.07843634966497359, |
| "learning_rate": 5e-06, |
| "loss": 0.8495, |
| "step": 4300 |
| }, |
| { |
| "epoch": 2.2023505365355134, |
| "grad_norm": 0.07396166555039149, |
| "learning_rate": 5e-06, |
| "loss": 0.8544, |
| "step": 4310 |
| }, |
| { |
| "epoch": 2.2074603985692387, |
| "grad_norm": 0.06906877269519048, |
| "learning_rate": 5e-06, |
| "loss": 0.8519, |
| "step": 4320 |
| }, |
| { |
| "epoch": 2.2125702606029636, |
| "grad_norm": 0.07738435381809824, |
| "learning_rate": 5e-06, |
| "loss": 0.8612, |
| "step": 4330 |
| }, |
| { |
| "epoch": 2.217680122636689, |
| "grad_norm": 0.07272677909652538, |
| "learning_rate": 5e-06, |
| "loss": 0.8483, |
| "step": 4340 |
| }, |
| { |
| "epoch": 2.2227899846704138, |
| "grad_norm": 0.1704602161006234, |
| "learning_rate": 5e-06, |
| "loss": 0.8494, |
| "step": 4350 |
| }, |
| { |
| "epoch": 2.227899846704139, |
| "grad_norm": 0.08280430588455355, |
| "learning_rate": 5e-06, |
| "loss": 0.8369, |
| "step": 4360 |
| }, |
| { |
| "epoch": 2.233009708737864, |
| "grad_norm": 0.0778136234869108, |
| "learning_rate": 5e-06, |
| "loss": 0.8413, |
| "step": 4370 |
| }, |
| { |
| "epoch": 2.2381195707715893, |
| "grad_norm": 0.06902177322002463, |
| "learning_rate": 5e-06, |
| "loss": 0.8555, |
| "step": 4380 |
| }, |
| { |
| "epoch": 2.243229432805314, |
| "grad_norm": 0.07436632107648229, |
| "learning_rate": 5e-06, |
| "loss": 0.8341, |
| "step": 4390 |
| }, |
| { |
| "epoch": 2.2483392948390395, |
| "grad_norm": 0.08037131223518179, |
| "learning_rate": 5e-06, |
| "loss": 0.837, |
| "step": 4400 |
| }, |
| { |
| "epoch": 2.2534491568727644, |
| "grad_norm": 0.12080203046044066, |
| "learning_rate": 5e-06, |
| "loss": 0.8484, |
| "step": 4410 |
| }, |
| { |
| "epoch": 2.2585590189064897, |
| "grad_norm": 0.08092569363882715, |
| "learning_rate": 5e-06, |
| "loss": 0.8387, |
| "step": 4420 |
| }, |
| { |
| "epoch": 2.2636688809402146, |
| "grad_norm": 0.07901582143607909, |
| "learning_rate": 5e-06, |
| "loss": 0.8436, |
| "step": 4430 |
| }, |
| { |
| "epoch": 2.26877874297394, |
| "grad_norm": 0.07599584492537564, |
| "learning_rate": 5e-06, |
| "loss": 0.8603, |
| "step": 4440 |
| }, |
| { |
| "epoch": 2.2738886050076648, |
| "grad_norm": 0.07078355889274002, |
| "learning_rate": 5e-06, |
| "loss": 0.8354, |
| "step": 4450 |
| }, |
| { |
| "epoch": 2.27899846704139, |
| "grad_norm": 0.10294533024093251, |
| "learning_rate": 5e-06, |
| "loss": 0.8515, |
| "step": 4460 |
| }, |
| { |
| "epoch": 2.284108329075115, |
| "grad_norm": 0.07272362249279035, |
| "learning_rate": 5e-06, |
| "loss": 0.8448, |
| "step": 4470 |
| }, |
| { |
| "epoch": 2.28921819110884, |
| "grad_norm": 0.07263729571802433, |
| "learning_rate": 5e-06, |
| "loss": 0.8457, |
| "step": 4480 |
| }, |
| { |
| "epoch": 2.294328053142565, |
| "grad_norm": 0.07075340939726783, |
| "learning_rate": 5e-06, |
| "loss": 0.8479, |
| "step": 4490 |
| }, |
| { |
| "epoch": 2.2994379151762905, |
| "grad_norm": 0.07782417357166051, |
| "learning_rate": 5e-06, |
| "loss": 0.8553, |
| "step": 4500 |
| }, |
| { |
| "epoch": 2.3045477772100154, |
| "grad_norm": 0.07531889060674515, |
| "learning_rate": 5e-06, |
| "loss": 0.8462, |
| "step": 4510 |
| }, |
| { |
| "epoch": 2.3096576392437402, |
| "grad_norm": 0.07208019084503213, |
| "learning_rate": 5e-06, |
| "loss": 0.8525, |
| "step": 4520 |
| }, |
| { |
| "epoch": 2.3147675012774656, |
| "grad_norm": 0.0835242862697962, |
| "learning_rate": 5e-06, |
| "loss": 0.8513, |
| "step": 4530 |
| }, |
| { |
| "epoch": 2.3198773633111904, |
| "grad_norm": 0.07119059664069095, |
| "learning_rate": 5e-06, |
| "loss": 0.8447, |
| "step": 4540 |
| }, |
| { |
| "epoch": 2.3249872253449158, |
| "grad_norm": 0.07612636819413399, |
| "learning_rate": 5e-06, |
| "loss": 0.8392, |
| "step": 4550 |
| }, |
| { |
| "epoch": 2.3300970873786406, |
| "grad_norm": 0.19261221848157942, |
| "learning_rate": 5e-06, |
| "loss": 0.8412, |
| "step": 4560 |
| }, |
| { |
| "epoch": 2.335206949412366, |
| "grad_norm": 0.07559214061873046, |
| "learning_rate": 5e-06, |
| "loss": 0.8439, |
| "step": 4570 |
| }, |
| { |
| "epoch": 2.340316811446091, |
| "grad_norm": 0.07642749847907093, |
| "learning_rate": 5e-06, |
| "loss": 0.8496, |
| "step": 4580 |
| }, |
| { |
| "epoch": 2.345426673479816, |
| "grad_norm": 0.08063004427375733, |
| "learning_rate": 5e-06, |
| "loss": 0.8409, |
| "step": 4590 |
| }, |
| { |
| "epoch": 2.350536535513541, |
| "grad_norm": 0.0720684787353252, |
| "learning_rate": 5e-06, |
| "loss": 0.8307, |
| "step": 4600 |
| }, |
| { |
| "epoch": 2.3556463975472663, |
| "grad_norm": 0.0694395988603129, |
| "learning_rate": 5e-06, |
| "loss": 0.8495, |
| "step": 4610 |
| }, |
| { |
| "epoch": 2.3607562595809912, |
| "grad_norm": 0.12026943156490484, |
| "learning_rate": 5e-06, |
| "loss": 0.853, |
| "step": 4620 |
| }, |
| { |
| "epoch": 2.3658661216147165, |
| "grad_norm": 0.07464848880206523, |
| "learning_rate": 5e-06, |
| "loss": 0.8504, |
| "step": 4630 |
| }, |
| { |
| "epoch": 2.3709759836484414, |
| "grad_norm": 0.15496993855974875, |
| "learning_rate": 5e-06, |
| "loss": 0.8441, |
| "step": 4640 |
| }, |
| { |
| "epoch": 2.3760858456821667, |
| "grad_norm": 0.07134185410989129, |
| "learning_rate": 5e-06, |
| "loss": 0.8447, |
| "step": 4650 |
| }, |
| { |
| "epoch": 2.3811957077158916, |
| "grad_norm": 0.07242368272475976, |
| "learning_rate": 5e-06, |
| "loss": 0.8406, |
| "step": 4660 |
| }, |
| { |
| "epoch": 2.386305569749617, |
| "grad_norm": 0.0697443617984476, |
| "learning_rate": 5e-06, |
| "loss": 0.8329, |
| "step": 4670 |
| }, |
| { |
| "epoch": 2.391415431783342, |
| "grad_norm": 0.07044017928388062, |
| "learning_rate": 5e-06, |
| "loss": 0.8462, |
| "step": 4680 |
| }, |
| { |
| "epoch": 2.396525293817067, |
| "grad_norm": 0.06981669634551392, |
| "learning_rate": 5e-06, |
| "loss": 0.8348, |
| "step": 4690 |
| }, |
| { |
| "epoch": 2.401635155850792, |
| "grad_norm": 0.07668972004047576, |
| "learning_rate": 5e-06, |
| "loss": 0.8398, |
| "step": 4700 |
| }, |
| { |
| "epoch": 2.406745017884517, |
| "grad_norm": 0.07415042030571524, |
| "learning_rate": 5e-06, |
| "loss": 0.8443, |
| "step": 4710 |
| }, |
| { |
| "epoch": 2.411854879918242, |
| "grad_norm": 0.07295120303267964, |
| "learning_rate": 5e-06, |
| "loss": 0.8507, |
| "step": 4720 |
| }, |
| { |
| "epoch": 2.4169647419519675, |
| "grad_norm": 0.07741393469112542, |
| "learning_rate": 5e-06, |
| "loss": 0.8275, |
| "step": 4730 |
| }, |
| { |
| "epoch": 2.4220746039856924, |
| "grad_norm": 0.07693385601963627, |
| "learning_rate": 5e-06, |
| "loss": 0.8308, |
| "step": 4740 |
| }, |
| { |
| "epoch": 2.4271844660194173, |
| "grad_norm": 0.0755618799603966, |
| "learning_rate": 5e-06, |
| "loss": 0.8405, |
| "step": 4750 |
| }, |
| { |
| "epoch": 2.4322943280531426, |
| "grad_norm": 0.09159998552156758, |
| "learning_rate": 5e-06, |
| "loss": 0.8347, |
| "step": 4760 |
| }, |
| { |
| "epoch": 2.4374041900868675, |
| "grad_norm": 0.07341845982606449, |
| "learning_rate": 5e-06, |
| "loss": 0.8322, |
| "step": 4770 |
| }, |
| { |
| "epoch": 2.442514052120593, |
| "grad_norm": 0.07237831649311181, |
| "learning_rate": 5e-06, |
| "loss": 0.8484, |
| "step": 4780 |
| }, |
| { |
| "epoch": 2.4476239141543177, |
| "grad_norm": 0.07192165535074778, |
| "learning_rate": 5e-06, |
| "loss": 0.8408, |
| "step": 4790 |
| }, |
| { |
| "epoch": 2.452733776188043, |
| "grad_norm": 0.06648324207306504, |
| "learning_rate": 5e-06, |
| "loss": 0.8425, |
| "step": 4800 |
| }, |
| { |
| "epoch": 2.457843638221768, |
| "grad_norm": 0.07448716190725979, |
| "learning_rate": 5e-06, |
| "loss": 0.8425, |
| "step": 4810 |
| }, |
| { |
| "epoch": 2.462953500255493, |
| "grad_norm": 0.07121284030804295, |
| "learning_rate": 5e-06, |
| "loss": 0.8375, |
| "step": 4820 |
| }, |
| { |
| "epoch": 2.468063362289218, |
| "grad_norm": 0.06909159105773967, |
| "learning_rate": 5e-06, |
| "loss": 0.854, |
| "step": 4830 |
| }, |
| { |
| "epoch": 2.4731732243229434, |
| "grad_norm": 0.13098921577807285, |
| "learning_rate": 5e-06, |
| "loss": 0.8384, |
| "step": 4840 |
| }, |
| { |
| "epoch": 2.4782830863566683, |
| "grad_norm": 0.07170492201621687, |
| "learning_rate": 5e-06, |
| "loss": 0.8404, |
| "step": 4850 |
| }, |
| { |
| "epoch": 2.4833929483903936, |
| "grad_norm": 0.13089324735228272, |
| "learning_rate": 5e-06, |
| "loss": 0.8348, |
| "step": 4860 |
| }, |
| { |
| "epoch": 2.4885028104241185, |
| "grad_norm": 0.08153573679797024, |
| "learning_rate": 5e-06, |
| "loss": 0.842, |
| "step": 4870 |
| }, |
| { |
| "epoch": 2.493612672457844, |
| "grad_norm": 0.07186193556818891, |
| "learning_rate": 5e-06, |
| "loss": 0.8401, |
| "step": 4880 |
| }, |
| { |
| "epoch": 2.4987225344915687, |
| "grad_norm": 0.06985782726108822, |
| "learning_rate": 5e-06, |
| "loss": 0.8358, |
| "step": 4890 |
| }, |
| { |
| "epoch": 2.5038323965252935, |
| "grad_norm": 0.08449885090909025, |
| "learning_rate": 5e-06, |
| "loss": 0.832, |
| "step": 4900 |
| }, |
| { |
| "epoch": 2.508942258559019, |
| "grad_norm": 0.07501130238223368, |
| "learning_rate": 5e-06, |
| "loss": 0.8418, |
| "step": 4910 |
| }, |
| { |
| "epoch": 2.514052120592744, |
| "grad_norm": 0.07141698939838328, |
| "learning_rate": 5e-06, |
| "loss": 0.8446, |
| "step": 4920 |
| }, |
| { |
| "epoch": 2.519161982626469, |
| "grad_norm": 0.08787977387334635, |
| "learning_rate": 5e-06, |
| "loss": 0.8452, |
| "step": 4930 |
| }, |
| { |
| "epoch": 2.524271844660194, |
| "grad_norm": 0.07291979045054302, |
| "learning_rate": 5e-06, |
| "loss": 0.8272, |
| "step": 4940 |
| }, |
| { |
| "epoch": 2.5293817066939193, |
| "grad_norm": 0.09132127326249193, |
| "learning_rate": 5e-06, |
| "loss": 0.8307, |
| "step": 4950 |
| }, |
| { |
| "epoch": 2.5344915687276446, |
| "grad_norm": 0.07043767494061426, |
| "learning_rate": 5e-06, |
| "loss": 0.8341, |
| "step": 4960 |
| }, |
| { |
| "epoch": 2.5396014307613695, |
| "grad_norm": 0.06814985809885249, |
| "learning_rate": 5e-06, |
| "loss": 0.8434, |
| "step": 4970 |
| }, |
| { |
| "epoch": 2.5447112927950943, |
| "grad_norm": 0.06881812234299259, |
| "learning_rate": 5e-06, |
| "loss": 0.8336, |
| "step": 4980 |
| }, |
| { |
| "epoch": 2.5498211548288197, |
| "grad_norm": 0.07479588795644554, |
| "learning_rate": 5e-06, |
| "loss": 0.841, |
| "step": 4990 |
| }, |
| { |
| "epoch": 2.554931016862545, |
| "grad_norm": 0.07030708837765214, |
| "learning_rate": 5e-06, |
| "loss": 0.8345, |
| "step": 5000 |
| }, |
| { |
| "epoch": 2.56004087889627, |
| "grad_norm": 0.07322531185816425, |
| "learning_rate": 5e-06, |
| "loss": 0.8522, |
| "step": 5010 |
| }, |
| { |
| "epoch": 2.5651507409299947, |
| "grad_norm": 0.0773410219834939, |
| "learning_rate": 5e-06, |
| "loss": 0.8336, |
| "step": 5020 |
| }, |
| { |
| "epoch": 2.57026060296372, |
| "grad_norm": 0.07112761941798731, |
| "learning_rate": 5e-06, |
| "loss": 0.8356, |
| "step": 5030 |
| }, |
| { |
| "epoch": 2.575370464997445, |
| "grad_norm": 0.08092671080816972, |
| "learning_rate": 5e-06, |
| "loss": 0.8395, |
| "step": 5040 |
| }, |
| { |
| "epoch": 2.5804803270311703, |
| "grad_norm": 0.0743938817129939, |
| "learning_rate": 5e-06, |
| "loss": 0.8436, |
| "step": 5050 |
| }, |
| { |
| "epoch": 2.585590189064895, |
| "grad_norm": 0.09478134737738281, |
| "learning_rate": 5e-06, |
| "loss": 0.8346, |
| "step": 5060 |
| }, |
| { |
| "epoch": 2.5907000510986204, |
| "grad_norm": 0.07135602438861294, |
| "learning_rate": 5e-06, |
| "loss": 0.8478, |
| "step": 5070 |
| }, |
| { |
| "epoch": 2.5958099131323453, |
| "grad_norm": 0.07891915237359796, |
| "learning_rate": 5e-06, |
| "loss": 0.8447, |
| "step": 5080 |
| }, |
| { |
| "epoch": 2.6009197751660706, |
| "grad_norm": 0.0677288319217476, |
| "learning_rate": 5e-06, |
| "loss": 0.837, |
| "step": 5090 |
| }, |
| { |
| "epoch": 2.6060296371997955, |
| "grad_norm": 0.07944388902515932, |
| "learning_rate": 5e-06, |
| "loss": 0.835, |
| "step": 5100 |
| }, |
| { |
| "epoch": 2.611139499233521, |
| "grad_norm": 0.08111140690525463, |
| "learning_rate": 5e-06, |
| "loss": 0.8514, |
| "step": 5110 |
| }, |
| { |
| "epoch": 2.6162493612672457, |
| "grad_norm": 0.07187495630590449, |
| "learning_rate": 5e-06, |
| "loss": 0.8218, |
| "step": 5120 |
| }, |
| { |
| "epoch": 2.6213592233009706, |
| "grad_norm": 0.07201247333070886, |
| "learning_rate": 5e-06, |
| "loss": 0.8459, |
| "step": 5130 |
| }, |
| { |
| "epoch": 2.626469085334696, |
| "grad_norm": 0.0725250109726293, |
| "learning_rate": 5e-06, |
| "loss": 0.8316, |
| "step": 5140 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 0.06989455908125097, |
| "learning_rate": 5e-06, |
| "loss": 0.8387, |
| "step": 5150 |
| }, |
| { |
| "epoch": 2.636688809402146, |
| "grad_norm": 0.0732170962484253, |
| "learning_rate": 5e-06, |
| "loss": 0.8293, |
| "step": 5160 |
| }, |
| { |
| "epoch": 2.641798671435871, |
| "grad_norm": 0.10306093874083574, |
| "learning_rate": 5e-06, |
| "loss": 0.8232, |
| "step": 5170 |
| }, |
| { |
| "epoch": 2.6469085334695963, |
| "grad_norm": 0.08216331881118105, |
| "learning_rate": 5e-06, |
| "loss": 0.8313, |
| "step": 5180 |
| }, |
| { |
| "epoch": 2.6520183955033216, |
| "grad_norm": 0.076191518190693, |
| "learning_rate": 5e-06, |
| "loss": 0.8421, |
| "step": 5190 |
| }, |
| { |
| "epoch": 2.6571282575370465, |
| "grad_norm": 0.0740134986538764, |
| "learning_rate": 5e-06, |
| "loss": 0.8299, |
| "step": 5200 |
| }, |
| { |
| "epoch": 2.6622381195707714, |
| "grad_norm": 0.07496389694090964, |
| "learning_rate": 5e-06, |
| "loss": 0.8487, |
| "step": 5210 |
| }, |
| { |
| "epoch": 2.6673479816044967, |
| "grad_norm": 0.07747091248924778, |
| "learning_rate": 5e-06, |
| "loss": 0.8347, |
| "step": 5220 |
| }, |
| { |
| "epoch": 2.672457843638222, |
| "grad_norm": 0.07574688789486558, |
| "learning_rate": 5e-06, |
| "loss": 0.8425, |
| "step": 5230 |
| }, |
| { |
| "epoch": 2.677567705671947, |
| "grad_norm": 0.6563717615599837, |
| "learning_rate": 5e-06, |
| "loss": 0.84, |
| "step": 5240 |
| }, |
| { |
| "epoch": 2.682677567705672, |
| "grad_norm": 0.0686367826525851, |
| "learning_rate": 5e-06, |
| "loss": 0.8379, |
| "step": 5250 |
| }, |
| { |
| "epoch": 2.687787429739397, |
| "grad_norm": 0.07371216319372703, |
| "learning_rate": 5e-06, |
| "loss": 0.8394, |
| "step": 5260 |
| }, |
| { |
| "epoch": 2.692897291773122, |
| "grad_norm": 0.08012300669491436, |
| "learning_rate": 5e-06, |
| "loss": 0.8384, |
| "step": 5270 |
| }, |
| { |
| "epoch": 2.6980071538068473, |
| "grad_norm": 0.07316847913612938, |
| "learning_rate": 5e-06, |
| "loss": 0.8282, |
| "step": 5280 |
| }, |
| { |
| "epoch": 2.703117015840572, |
| "grad_norm": 0.07844263026076834, |
| "learning_rate": 5e-06, |
| "loss": 0.8436, |
| "step": 5290 |
| }, |
| { |
| "epoch": 2.7082268778742975, |
| "grad_norm": 0.07050662660833308, |
| "learning_rate": 5e-06, |
| "loss": 0.8351, |
| "step": 5300 |
| }, |
| { |
| "epoch": 2.7133367399080224, |
| "grad_norm": 0.07050037035095988, |
| "learning_rate": 5e-06, |
| "loss": 0.8316, |
| "step": 5310 |
| }, |
| { |
| "epoch": 2.7184466019417477, |
| "grad_norm": 0.08412885461053499, |
| "learning_rate": 5e-06, |
| "loss": 0.8222, |
| "step": 5320 |
| }, |
| { |
| "epoch": 2.7235564639754726, |
| "grad_norm": 0.0687520823166467, |
| "learning_rate": 5e-06, |
| "loss": 0.8348, |
| "step": 5330 |
| }, |
| { |
| "epoch": 2.728666326009198, |
| "grad_norm": 0.06966312923863605, |
| "learning_rate": 5e-06, |
| "loss": 0.8441, |
| "step": 5340 |
| }, |
| { |
| "epoch": 2.7337761880429228, |
| "grad_norm": 0.07226731254674132, |
| "learning_rate": 5e-06, |
| "loss": 0.848, |
| "step": 5350 |
| }, |
| { |
| "epoch": 2.738886050076648, |
| "grad_norm": 0.06981884709594542, |
| "learning_rate": 5e-06, |
| "loss": 0.8431, |
| "step": 5360 |
| }, |
| { |
| "epoch": 2.743995912110373, |
| "grad_norm": 0.2293923868794652, |
| "learning_rate": 5e-06, |
| "loss": 0.8258, |
| "step": 5370 |
| }, |
| { |
| "epoch": 2.7491057741440983, |
| "grad_norm": 0.0711478134486219, |
| "learning_rate": 5e-06, |
| "loss": 0.8457, |
| "step": 5380 |
| }, |
| { |
| "epoch": 2.754215636177823, |
| "grad_norm": 0.0748407931600363, |
| "learning_rate": 5e-06, |
| "loss": 0.8368, |
| "step": 5390 |
| }, |
| { |
| "epoch": 2.759325498211548, |
| "grad_norm": 0.07392069328246453, |
| "learning_rate": 5e-06, |
| "loss": 0.8352, |
| "step": 5400 |
| }, |
| { |
| "epoch": 2.7644353602452734, |
| "grad_norm": 0.0721976398087471, |
| "learning_rate": 5e-06, |
| "loss": 0.8393, |
| "step": 5410 |
| }, |
| { |
| "epoch": 2.7695452222789987, |
| "grad_norm": 0.07095428053471639, |
| "learning_rate": 5e-06, |
| "loss": 0.8471, |
| "step": 5420 |
| }, |
| { |
| "epoch": 2.7746550843127236, |
| "grad_norm": 0.07005521600898579, |
| "learning_rate": 5e-06, |
| "loss": 0.8313, |
| "step": 5430 |
| }, |
| { |
| "epoch": 2.7797649463464484, |
| "grad_norm": 0.07197884091944991, |
| "learning_rate": 5e-06, |
| "loss": 0.8344, |
| "step": 5440 |
| }, |
| { |
| "epoch": 2.7848748083801738, |
| "grad_norm": 0.07344997287379904, |
| "learning_rate": 5e-06, |
| "loss": 0.8362, |
| "step": 5450 |
| }, |
| { |
| "epoch": 2.789984670413899, |
| "grad_norm": 0.08322662489974626, |
| "learning_rate": 5e-06, |
| "loss": 0.8367, |
| "step": 5460 |
| }, |
| { |
| "epoch": 2.795094532447624, |
| "grad_norm": 0.07375885412776004, |
| "learning_rate": 5e-06, |
| "loss": 0.8512, |
| "step": 5470 |
| }, |
| { |
| "epoch": 2.800204394481349, |
| "grad_norm": 0.07070472807893792, |
| "learning_rate": 5e-06, |
| "loss": 0.8253, |
| "step": 5480 |
| }, |
| { |
| "epoch": 2.805314256515074, |
| "grad_norm": 0.07428848558504005, |
| "learning_rate": 5e-06, |
| "loss": 0.8468, |
| "step": 5490 |
| }, |
| { |
| "epoch": 2.810424118548799, |
| "grad_norm": 0.07307806588861744, |
| "learning_rate": 5e-06, |
| "loss": 0.8234, |
| "step": 5500 |
| }, |
| { |
| "epoch": 2.8155339805825244, |
| "grad_norm": 0.06945402346011086, |
| "learning_rate": 5e-06, |
| "loss": 0.8249, |
| "step": 5510 |
| }, |
| { |
| "epoch": 2.8206438426162492, |
| "grad_norm": 0.07097605102696264, |
| "learning_rate": 5e-06, |
| "loss": 0.825, |
| "step": 5520 |
| }, |
| { |
| "epoch": 2.8257537046499746, |
| "grad_norm": 0.06978991852647402, |
| "learning_rate": 5e-06, |
| "loss": 0.832, |
| "step": 5530 |
| }, |
| { |
| "epoch": 2.8308635666836994, |
| "grad_norm": 0.06908078380765026, |
| "learning_rate": 5e-06, |
| "loss": 0.8399, |
| "step": 5540 |
| }, |
| { |
| "epoch": 2.8359734287174247, |
| "grad_norm": 0.07440005138379917, |
| "learning_rate": 5e-06, |
| "loss": 0.8586, |
| "step": 5550 |
| }, |
| { |
| "epoch": 2.8410832907511496, |
| "grad_norm": 0.07215664654991572, |
| "learning_rate": 5e-06, |
| "loss": 0.8344, |
| "step": 5560 |
| }, |
| { |
| "epoch": 2.846193152784875, |
| "grad_norm": 0.07245462980842311, |
| "learning_rate": 5e-06, |
| "loss": 0.8377, |
| "step": 5570 |
| }, |
| { |
| "epoch": 2.8513030148186, |
| "grad_norm": 0.09751916723568736, |
| "learning_rate": 5e-06, |
| "loss": 0.8359, |
| "step": 5580 |
| }, |
| { |
| "epoch": 2.856412876852325, |
| "grad_norm": 0.0855945660262981, |
| "learning_rate": 5e-06, |
| "loss": 0.8204, |
| "step": 5590 |
| }, |
| { |
| "epoch": 2.86152273888605, |
| "grad_norm": 0.07362144431075729, |
| "learning_rate": 5e-06, |
| "loss": 0.8339, |
| "step": 5600 |
| }, |
| { |
| "epoch": 2.8666326009197753, |
| "grad_norm": 0.13422144896661692, |
| "learning_rate": 5e-06, |
| "loss": 0.8156, |
| "step": 5610 |
| }, |
| { |
| "epoch": 2.8717424629535, |
| "grad_norm": 0.07766367283659784, |
| "learning_rate": 5e-06, |
| "loss": 0.8366, |
| "step": 5620 |
| }, |
| { |
| "epoch": 2.876852324987225, |
| "grad_norm": 0.07067764533904561, |
| "learning_rate": 5e-06, |
| "loss": 0.8319, |
| "step": 5630 |
| }, |
| { |
| "epoch": 2.8819621870209504, |
| "grad_norm": 0.07333934553515849, |
| "learning_rate": 5e-06, |
| "loss": 0.8403, |
| "step": 5640 |
| }, |
| { |
| "epoch": 2.8870720490546757, |
| "grad_norm": 0.06944274451784573, |
| "learning_rate": 5e-06, |
| "loss": 0.8147, |
| "step": 5650 |
| }, |
| { |
| "epoch": 2.8921819110884006, |
| "grad_norm": 0.07851574427605637, |
| "learning_rate": 5e-06, |
| "loss": 0.8324, |
| "step": 5660 |
| }, |
| { |
| "epoch": 2.8972917731221255, |
| "grad_norm": 0.07149894775276078, |
| "learning_rate": 5e-06, |
| "loss": 0.8299, |
| "step": 5670 |
| }, |
| { |
| "epoch": 2.902401635155851, |
| "grad_norm": 0.06665250066960869, |
| "learning_rate": 5e-06, |
| "loss": 0.8375, |
| "step": 5680 |
| }, |
| { |
| "epoch": 2.907511497189576, |
| "grad_norm": 0.07577406445198871, |
| "learning_rate": 5e-06, |
| "loss": 0.8448, |
| "step": 5690 |
| }, |
| { |
| "epoch": 2.912621359223301, |
| "grad_norm": 0.0713958097703238, |
| "learning_rate": 5e-06, |
| "loss": 0.8229, |
| "step": 5700 |
| }, |
| { |
| "epoch": 2.917731221257026, |
| "grad_norm": 0.06957330049674462, |
| "learning_rate": 5e-06, |
| "loss": 0.8183, |
| "step": 5710 |
| }, |
| { |
| "epoch": 2.922841083290751, |
| "grad_norm": 0.07288163057814427, |
| "learning_rate": 5e-06, |
| "loss": 0.8331, |
| "step": 5720 |
| }, |
| { |
| "epoch": 2.927950945324476, |
| "grad_norm": 0.08344782116194745, |
| "learning_rate": 5e-06, |
| "loss": 0.8438, |
| "step": 5730 |
| }, |
| { |
| "epoch": 2.9330608073582014, |
| "grad_norm": 0.07775582347773638, |
| "learning_rate": 5e-06, |
| "loss": 0.8366, |
| "step": 5740 |
| }, |
| { |
| "epoch": 2.9381706693919263, |
| "grad_norm": 0.07127430773847676, |
| "learning_rate": 5e-06, |
| "loss": 0.8292, |
| "step": 5750 |
| }, |
| { |
| "epoch": 2.9432805314256516, |
| "grad_norm": 0.07234002587778945, |
| "learning_rate": 5e-06, |
| "loss": 0.8243, |
| "step": 5760 |
| }, |
| { |
| "epoch": 2.9483903934593765, |
| "grad_norm": 0.07200420571093186, |
| "learning_rate": 5e-06, |
| "loss": 0.8359, |
| "step": 5770 |
| }, |
| { |
| "epoch": 2.953500255493102, |
| "grad_norm": 0.07108664442621124, |
| "learning_rate": 5e-06, |
| "loss": 0.8238, |
| "step": 5780 |
| }, |
| { |
| "epoch": 2.9586101175268267, |
| "grad_norm": 0.07157660179737228, |
| "learning_rate": 5e-06, |
| "loss": 0.8396, |
| "step": 5790 |
| }, |
| { |
| "epoch": 2.963719979560552, |
| "grad_norm": 0.0703513442497635, |
| "learning_rate": 5e-06, |
| "loss": 0.846, |
| "step": 5800 |
| }, |
| { |
| "epoch": 2.968829841594277, |
| "grad_norm": 0.07279488037158444, |
| "learning_rate": 5e-06, |
| "loss": 0.8328, |
| "step": 5810 |
| }, |
| { |
| "epoch": 2.973939703628002, |
| "grad_norm": 0.07319627670370121, |
| "learning_rate": 5e-06, |
| "loss": 0.839, |
| "step": 5820 |
| }, |
| { |
| "epoch": 2.979049565661727, |
| "grad_norm": 0.06864454810394688, |
| "learning_rate": 5e-06, |
| "loss": 0.838, |
| "step": 5830 |
| }, |
| { |
| "epoch": 2.9841594276954524, |
| "grad_norm": 0.07409945387796224, |
| "learning_rate": 5e-06, |
| "loss": 0.8325, |
| "step": 5840 |
| }, |
| { |
| "epoch": 2.9892692897291773, |
| "grad_norm": 0.0827828571554194, |
| "learning_rate": 5e-06, |
| "loss": 0.8348, |
| "step": 5850 |
| }, |
| { |
| "epoch": 2.994379151762902, |
| "grad_norm": 0.07176468776677063, |
| "learning_rate": 5e-06, |
| "loss": 0.8345, |
| "step": 5860 |
| }, |
| { |
| "epoch": 2.9994890137966275, |
| "grad_norm": 0.07248509371414884, |
| "learning_rate": 5e-06, |
| "loss": 0.8471, |
| "step": 5870 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 5871, |
| "total_flos": 1.808135467381555e+16, |
| "train_loss": 0.8794001941176538, |
| "train_runtime": 31816.8802, |
| "train_samples_per_second": 94.436, |
| "train_steps_per_second": 0.185 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 5871, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.808135467381555e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|