diff --git "a/checkpoint-1431/trainer_state.json" "b/checkpoint-1431/trainer_state.json" deleted file mode 100644--- "a/checkpoint-1431/trainer_state.json" +++ /dev/null @@ -1,10066 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 500, - "global_step": 1431, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0006988120195667365, - "grad_norm": 29.745662689208984, - "learning_rate": 3.472222222222222e-08, - "loss": 0.4892, - "step": 1 - }, - { - "epoch": 0.001397624039133473, - "grad_norm": 27.119117736816406, - "learning_rate": 6.944444444444444e-08, - "loss": 0.5036, - "step": 2 - }, - { - "epoch": 0.0020964360587002098, - "grad_norm": 26.687828063964844, - "learning_rate": 1.0416666666666667e-07, - "loss": 0.4122, - "step": 3 - }, - { - "epoch": 0.002795248078266946, - "grad_norm": 34.61165237426758, - "learning_rate": 1.3888888888888888e-07, - "loss": 0.364, - "step": 4 - }, - { - "epoch": 0.0034940600978336828, - "grad_norm": 45.05033874511719, - "learning_rate": 1.7361111111111115e-07, - "loss": 1.0297, - "step": 5 - }, - { - "epoch": 0.0041928721174004195, - "grad_norm": 26.53506851196289, - "learning_rate": 2.0833333333333333e-07, - "loss": 0.2974, - "step": 6 - }, - { - "epoch": 0.004891684136967156, - "grad_norm": 26.738407135009766, - "learning_rate": 2.430555555555556e-07, - "loss": 0.3303, - "step": 7 - }, - { - "epoch": 0.005590496156533892, - "grad_norm": 14.776915550231934, - "learning_rate": 2.7777777777777776e-07, - "loss": 0.1818, - "step": 8 - }, - { - "epoch": 0.006289308176100629, - "grad_norm": 23.260135650634766, - "learning_rate": 3.125e-07, - "loss": 0.2969, - "step": 9 - }, - { - "epoch": 0.0069881201956673656, - "grad_norm": 15.725804328918457, - "learning_rate": 3.472222222222223e-07, - "loss": 0.2365, - "step": 10 - }, - { - "epoch": 0.007686932215234102, - "grad_norm": 76.81114196777344, - "learning_rate": 3.819444444444445e-07, - "loss": 1.1389, - "step": 11 - }, - { - "epoch": 0.008385744234800839, - "grad_norm": 18.762008666992188, - "learning_rate": 4.1666666666666667e-07, - "loss": 0.2786, - "step": 12 - }, - { - "epoch": 0.009084556254367574, - "grad_norm": 19.521512985229492, - "learning_rate": 4.5138888888888893e-07, - "loss": 0.2404, - "step": 13 - }, - { - "epoch": 0.009783368273934312, - "grad_norm": 53.75028610229492, - "learning_rate": 4.861111111111112e-07, - "loss": 1.0421, - "step": 14 - }, - { - "epoch": 0.010482180293501049, - "grad_norm": 23.264202117919922, - "learning_rate": 5.208333333333334e-07, - "loss": 0.3734, - "step": 15 - }, - { - "epoch": 0.011180992313067784, - "grad_norm": 24.78360366821289, - "learning_rate": 5.555555555555555e-07, - "loss": 0.3425, - "step": 16 - }, - { - "epoch": 0.011879804332634521, - "grad_norm": 8.888618469238281, - "learning_rate": 5.902777777777778e-07, - "loss": 0.1022, - "step": 17 - }, - { - "epoch": 0.012578616352201259, - "grad_norm": 47.52095413208008, - "learning_rate": 6.25e-07, - "loss": 0.7342, - "step": 18 - }, - { - "epoch": 0.013277428371767994, - "grad_norm": 22.385765075683594, - "learning_rate": 6.597222222222223e-07, - "loss": 0.3538, - "step": 19 - }, - { - "epoch": 0.013976240391334731, - "grad_norm": 10.82557201385498, - "learning_rate": 6.944444444444446e-07, - "loss": 0.092, - "step": 20 - }, - { - "epoch": 0.014675052410901468, - "grad_norm": 22.204137802124023, - "learning_rate": 7.291666666666667e-07, - "loss": 0.3104, - "step": 21 - }, - { - "epoch": 0.015373864430468204, - "grad_norm": 19.167959213256836, - "learning_rate": 7.63888888888889e-07, - "loss": 0.3576, - "step": 22 - }, - { - "epoch": 0.01607267645003494, - "grad_norm": 17.112138748168945, - "learning_rate": 7.986111111111111e-07, - "loss": 0.3013, - "step": 23 - }, - { - "epoch": 0.016771488469601678, - "grad_norm": 10.015545845031738, - "learning_rate": 8.333333333333333e-07, - "loss": 0.1068, - "step": 24 - }, - { - "epoch": 0.017470300489168415, - "grad_norm": 10.68148136138916, - "learning_rate": 8.680555555555556e-07, - "loss": 0.1036, - "step": 25 - }, - { - "epoch": 0.01816911250873515, - "grad_norm": 25.902118682861328, - "learning_rate": 9.027777777777779e-07, - "loss": 0.2941, - "step": 26 - }, - { - "epoch": 0.018867924528301886, - "grad_norm": 14.709237098693848, - "learning_rate": 9.375000000000001e-07, - "loss": 0.1508, - "step": 27 - }, - { - "epoch": 0.019566736547868623, - "grad_norm": 11.095511436462402, - "learning_rate": 9.722222222222224e-07, - "loss": 0.0734, - "step": 28 - }, - { - "epoch": 0.02026554856743536, - "grad_norm": 6.820224761962891, - "learning_rate": 1.0069444444444447e-06, - "loss": 0.0676, - "step": 29 - }, - { - "epoch": 0.020964360587002098, - "grad_norm": 35.560279846191406, - "learning_rate": 1.0416666666666667e-06, - "loss": 0.5665, - "step": 30 - }, - { - "epoch": 0.02166317260656883, - "grad_norm": 18.7200984954834, - "learning_rate": 1.076388888888889e-06, - "loss": 0.2377, - "step": 31 - }, - { - "epoch": 0.02236198462613557, - "grad_norm": 24.008363723754883, - "learning_rate": 1.111111111111111e-06, - "loss": 0.3863, - "step": 32 - }, - { - "epoch": 0.023060796645702306, - "grad_norm": 42.345916748046875, - "learning_rate": 1.1458333333333333e-06, - "loss": 0.6234, - "step": 33 - }, - { - "epoch": 0.023759608665269043, - "grad_norm": 14.686384201049805, - "learning_rate": 1.1805555555555556e-06, - "loss": 0.1656, - "step": 34 - }, - { - "epoch": 0.02445842068483578, - "grad_norm": 15.544900894165039, - "learning_rate": 1.2152777777777778e-06, - "loss": 0.0781, - "step": 35 - }, - { - "epoch": 0.025157232704402517, - "grad_norm": 20.373491287231445, - "learning_rate": 1.25e-06, - "loss": 0.3014, - "step": 36 - }, - { - "epoch": 0.02585604472396925, - "grad_norm": 19.440364837646484, - "learning_rate": 1.2847222222222222e-06, - "loss": 0.2605, - "step": 37 - }, - { - "epoch": 0.026554856743535988, - "grad_norm": 14.13811206817627, - "learning_rate": 1.3194444444444446e-06, - "loss": 0.0976, - "step": 38 - }, - { - "epoch": 0.027253668763102725, - "grad_norm": 16.72405433654785, - "learning_rate": 1.3541666666666667e-06, - "loss": 0.189, - "step": 39 - }, - { - "epoch": 0.027952480782669462, - "grad_norm": 26.501976013183594, - "learning_rate": 1.3888888888888892e-06, - "loss": 0.5341, - "step": 40 - }, - { - "epoch": 0.0286512928022362, - "grad_norm": 16.74985122680664, - "learning_rate": 1.4236111111111112e-06, - "loss": 0.1534, - "step": 41 - }, - { - "epoch": 0.029350104821802937, - "grad_norm": 37.214256286621094, - "learning_rate": 1.4583333333333335e-06, - "loss": 0.5655, - "step": 42 - }, - { - "epoch": 0.03004891684136967, - "grad_norm": 7.750210285186768, - "learning_rate": 1.4930555555555555e-06, - "loss": 0.052, - "step": 43 - }, - { - "epoch": 0.030747728860936407, - "grad_norm": 18.785743713378906, - "learning_rate": 1.527777777777778e-06, - "loss": 0.3389, - "step": 44 - }, - { - "epoch": 0.031446540880503145, - "grad_norm": 31.0648250579834, - "learning_rate": 1.5625e-06, - "loss": 0.3845, - "step": 45 - }, - { - "epoch": 0.03214535290006988, - "grad_norm": 52.98978805541992, - "learning_rate": 1.5972222222222221e-06, - "loss": 0.8845, - "step": 46 - }, - { - "epoch": 0.03284416491963662, - "grad_norm": 26.285877227783203, - "learning_rate": 1.6319444444444446e-06, - "loss": 0.3638, - "step": 47 - }, - { - "epoch": 0.033542976939203356, - "grad_norm": 30.1326904296875, - "learning_rate": 1.6666666666666667e-06, - "loss": 0.3756, - "step": 48 - }, - { - "epoch": 0.03424178895877009, - "grad_norm": 19.73238754272461, - "learning_rate": 1.7013888888888891e-06, - "loss": 0.3055, - "step": 49 - }, - { - "epoch": 0.03494060097833683, - "grad_norm": 21.93083381652832, - "learning_rate": 1.7361111111111112e-06, - "loss": 0.2084, - "step": 50 - }, - { - "epoch": 0.03563941299790356, - "grad_norm": 44.288185119628906, - "learning_rate": 1.7708333333333337e-06, - "loss": 0.6509, - "step": 51 - }, - { - "epoch": 0.0363382250174703, - "grad_norm": 10.11754035949707, - "learning_rate": 1.8055555555555557e-06, - "loss": 0.1512, - "step": 52 - }, - { - "epoch": 0.037037037037037035, - "grad_norm": 16.429866790771484, - "learning_rate": 1.840277777777778e-06, - "loss": 0.2423, - "step": 53 - }, - { - "epoch": 0.03773584905660377, - "grad_norm": 53.626380920410156, - "learning_rate": 1.8750000000000003e-06, - "loss": 0.8959, - "step": 54 - }, - { - "epoch": 0.03843466107617051, - "grad_norm": 18.586044311523438, - "learning_rate": 1.909722222222222e-06, - "loss": 0.4167, - "step": 55 - }, - { - "epoch": 0.039133473095737246, - "grad_norm": 22.02561378479004, - "learning_rate": 1.944444444444445e-06, - "loss": 0.3638, - "step": 56 - }, - { - "epoch": 0.039832285115303984, - "grad_norm": 12.25664234161377, - "learning_rate": 1.9791666666666666e-06, - "loss": 0.1458, - "step": 57 - }, - { - "epoch": 0.04053109713487072, - "grad_norm": 26.37610626220703, - "learning_rate": 2.0138888888888893e-06, - "loss": 0.4709, - "step": 58 - }, - { - "epoch": 0.04122990915443746, - "grad_norm": 23.607717514038086, - "learning_rate": 2.048611111111111e-06, - "loss": 0.3147, - "step": 59 - }, - { - "epoch": 0.041928721174004195, - "grad_norm": 12.090802192687988, - "learning_rate": 2.0833333333333334e-06, - "loss": 0.1457, - "step": 60 - }, - { - "epoch": 0.04262753319357093, - "grad_norm": 9.616584777832031, - "learning_rate": 2.1180555555555557e-06, - "loss": 0.1364, - "step": 61 - }, - { - "epoch": 0.04332634521313766, - "grad_norm": 22.472373962402344, - "learning_rate": 2.152777777777778e-06, - "loss": 0.4109, - "step": 62 - }, - { - "epoch": 0.0440251572327044, - "grad_norm": 15.625946044921875, - "learning_rate": 2.1875000000000002e-06, - "loss": 0.1571, - "step": 63 - }, - { - "epoch": 0.04472396925227114, - "grad_norm": 15.733026504516602, - "learning_rate": 2.222222222222222e-06, - "loss": 0.2123, - "step": 64 - }, - { - "epoch": 0.045422781271837874, - "grad_norm": 17.976316452026367, - "learning_rate": 2.2569444444444448e-06, - "loss": 0.211, - "step": 65 - }, - { - "epoch": 0.04612159329140461, - "grad_norm": 33.2844123840332, - "learning_rate": 2.2916666666666666e-06, - "loss": 0.633, - "step": 66 - }, - { - "epoch": 0.04682040531097135, - "grad_norm": 46.91634750366211, - "learning_rate": 2.3263888888888893e-06, - "loss": 0.5234, - "step": 67 - }, - { - "epoch": 0.047519217330538085, - "grad_norm": 37.31273651123047, - "learning_rate": 2.361111111111111e-06, - "loss": 0.603, - "step": 68 - }, - { - "epoch": 0.04821802935010482, - "grad_norm": 37.39180374145508, - "learning_rate": 2.395833333333334e-06, - "loss": 0.6505, - "step": 69 - }, - { - "epoch": 0.04891684136967156, - "grad_norm": 23.082998275756836, - "learning_rate": 2.4305555555555557e-06, - "loss": 0.3517, - "step": 70 - }, - { - "epoch": 0.0496156533892383, - "grad_norm": 6.721218109130859, - "learning_rate": 2.465277777777778e-06, - "loss": 0.0438, - "step": 71 - }, - { - "epoch": 0.050314465408805034, - "grad_norm": 17.105623245239258, - "learning_rate": 2.5e-06, - "loss": 0.2446, - "step": 72 - }, - { - "epoch": 0.05101327742837177, - "grad_norm": 21.732906341552734, - "learning_rate": 2.5347222222222225e-06, - "loss": 0.2211, - "step": 73 - }, - { - "epoch": 0.0517120894479385, - "grad_norm": 11.384443283081055, - "learning_rate": 2.5694444444444443e-06, - "loss": 0.1657, - "step": 74 - }, - { - "epoch": 0.05241090146750524, - "grad_norm": 4.708110809326172, - "learning_rate": 2.604166666666667e-06, - "loss": 0.0341, - "step": 75 - }, - { - "epoch": 0.053109713487071976, - "grad_norm": 14.210625648498535, - "learning_rate": 2.6388888888888893e-06, - "loss": 0.1754, - "step": 76 - }, - { - "epoch": 0.05380852550663871, - "grad_norm": 13.893073081970215, - "learning_rate": 2.673611111111111e-06, - "loss": 0.1682, - "step": 77 - }, - { - "epoch": 0.05450733752620545, - "grad_norm": 18.905048370361328, - "learning_rate": 2.7083333333333334e-06, - "loss": 0.2632, - "step": 78 - }, - { - "epoch": 0.05520614954577219, - "grad_norm": 12.439525604248047, - "learning_rate": 2.743055555555556e-06, - "loss": 0.1458, - "step": 79 - }, - { - "epoch": 0.055904961565338925, - "grad_norm": 24.605615615844727, - "learning_rate": 2.7777777777777783e-06, - "loss": 0.3817, - "step": 80 - }, - { - "epoch": 0.05660377358490566, - "grad_norm": 27.12447738647461, - "learning_rate": 2.8125e-06, - "loss": 0.5378, - "step": 81 - }, - { - "epoch": 0.0573025856044724, - "grad_norm": 21.627233505249023, - "learning_rate": 2.8472222222222224e-06, - "loss": 0.3125, - "step": 82 - }, - { - "epoch": 0.058001397624039136, - "grad_norm": 2.656435966491699, - "learning_rate": 2.8819444444444443e-06, - "loss": 0.0084, - "step": 83 - }, - { - "epoch": 0.05870020964360587, - "grad_norm": 20.46363067626953, - "learning_rate": 2.916666666666667e-06, - "loss": 0.279, - "step": 84 - }, - { - "epoch": 0.0593990216631726, - "grad_norm": 17.098342895507812, - "learning_rate": 2.9513888888888892e-06, - "loss": 0.2007, - "step": 85 - }, - { - "epoch": 0.06009783368273934, - "grad_norm": 22.730363845825195, - "learning_rate": 2.986111111111111e-06, - "loss": 0.3265, - "step": 86 - }, - { - "epoch": 0.06079664570230608, - "grad_norm": 13.15532112121582, - "learning_rate": 3.0208333333333334e-06, - "loss": 0.1778, - "step": 87 - }, - { - "epoch": 0.061495457721872815, - "grad_norm": 12.832003593444824, - "learning_rate": 3.055555555555556e-06, - "loss": 0.1664, - "step": 88 - }, - { - "epoch": 0.06219426974143955, - "grad_norm": 4.212726593017578, - "learning_rate": 3.0902777777777783e-06, - "loss": 0.0126, - "step": 89 - }, - { - "epoch": 0.06289308176100629, - "grad_norm": 9.53214168548584, - "learning_rate": 3.125e-06, - "loss": 0.0876, - "step": 90 - }, - { - "epoch": 0.06359189378057302, - "grad_norm": 53.41883087158203, - "learning_rate": 3.1597222222222224e-06, - "loss": 0.9937, - "step": 91 - }, - { - "epoch": 0.06429070580013976, - "grad_norm": 16.07880973815918, - "learning_rate": 3.1944444444444443e-06, - "loss": 0.1187, - "step": 92 - }, - { - "epoch": 0.0649895178197065, - "grad_norm": 4.490603446960449, - "learning_rate": 3.229166666666667e-06, - "loss": 0.015, - "step": 93 - }, - { - "epoch": 0.06568832983927324, - "grad_norm": 32.008087158203125, - "learning_rate": 3.2638888888888892e-06, - "loss": 0.5616, - "step": 94 - }, - { - "epoch": 0.06638714185883997, - "grad_norm": 30.389848709106445, - "learning_rate": 3.2986111111111115e-06, - "loss": 0.4378, - "step": 95 - }, - { - "epoch": 0.06708595387840671, - "grad_norm": 20.06416130065918, - "learning_rate": 3.3333333333333333e-06, - "loss": 0.2875, - "step": 96 - }, - { - "epoch": 0.06778476589797344, - "grad_norm": 8.978775978088379, - "learning_rate": 3.368055555555556e-06, - "loss": 0.1084, - "step": 97 - }, - { - "epoch": 0.06848357791754019, - "grad_norm": 59.80595779418945, - "learning_rate": 3.4027777777777783e-06, - "loss": 0.9074, - "step": 98 - }, - { - "epoch": 0.06918238993710692, - "grad_norm": 18.711381912231445, - "learning_rate": 3.4375e-06, - "loss": 0.2926, - "step": 99 - }, - { - "epoch": 0.06988120195667366, - "grad_norm": 9.171241760253906, - "learning_rate": 3.4722222222222224e-06, - "loss": 0.1213, - "step": 100 - }, - { - "epoch": 0.07058001397624039, - "grad_norm": 12.976991653442383, - "learning_rate": 3.5069444444444447e-06, - "loss": 0.2017, - "step": 101 - }, - { - "epoch": 0.07127882599580712, - "grad_norm": 2.725184917449951, - "learning_rate": 3.5416666666666673e-06, - "loss": 0.0167, - "step": 102 - }, - { - "epoch": 0.07197763801537387, - "grad_norm": 12.85250473022461, - "learning_rate": 3.576388888888889e-06, - "loss": 0.1463, - "step": 103 - }, - { - "epoch": 0.0726764500349406, - "grad_norm": 23.173830032348633, - "learning_rate": 3.6111111111111115e-06, - "loss": 0.3149, - "step": 104 - }, - { - "epoch": 0.07337526205450734, - "grad_norm": 32.99692153930664, - "learning_rate": 3.6458333333333333e-06, - "loss": 0.6735, - "step": 105 - }, - { - "epoch": 0.07407407407407407, - "grad_norm": 15.19417667388916, - "learning_rate": 3.680555555555556e-06, - "loss": 0.2826, - "step": 106 - }, - { - "epoch": 0.07477288609364081, - "grad_norm": 13.61240291595459, - "learning_rate": 3.7152777777777783e-06, - "loss": 0.1674, - "step": 107 - }, - { - "epoch": 0.07547169811320754, - "grad_norm": 10.538482666015625, - "learning_rate": 3.7500000000000005e-06, - "loss": 0.1172, - "step": 108 - }, - { - "epoch": 0.07617051013277429, - "grad_norm": 9.325597763061523, - "learning_rate": 3.7847222222222224e-06, - "loss": 0.0898, - "step": 109 - }, - { - "epoch": 0.07686932215234102, - "grad_norm": 29.34699821472168, - "learning_rate": 3.819444444444444e-06, - "loss": 0.4043, - "step": 110 - }, - { - "epoch": 0.07756813417190776, - "grad_norm": 10.5206937789917, - "learning_rate": 3.854166666666667e-06, - "loss": 0.1151, - "step": 111 - }, - { - "epoch": 0.07826694619147449, - "grad_norm": 16.31255531311035, - "learning_rate": 3.88888888888889e-06, - "loss": 0.1857, - "step": 112 - }, - { - "epoch": 0.07896575821104122, - "grad_norm": 10.712072372436523, - "learning_rate": 3.9236111111111114e-06, - "loss": 0.0467, - "step": 113 - }, - { - "epoch": 0.07966457023060797, - "grad_norm": 49.4650764465332, - "learning_rate": 3.958333333333333e-06, - "loss": 0.8661, - "step": 114 - }, - { - "epoch": 0.0803633822501747, - "grad_norm": 10.439119338989258, - "learning_rate": 3.993055555555556e-06, - "loss": 0.1552, - "step": 115 - }, - { - "epoch": 0.08106219426974144, - "grad_norm": 8.641897201538086, - "learning_rate": 4.027777777777779e-06, - "loss": 0.0724, - "step": 116 - }, - { - "epoch": 0.08176100628930817, - "grad_norm": 21.5423641204834, - "learning_rate": 4.0625000000000005e-06, - "loss": 0.3887, - "step": 117 - }, - { - "epoch": 0.08245981830887492, - "grad_norm": 10.582996368408203, - "learning_rate": 4.097222222222222e-06, - "loss": 0.1831, - "step": 118 - }, - { - "epoch": 0.08315863032844165, - "grad_norm": 4.447153568267822, - "learning_rate": 4.131944444444444e-06, - "loss": 0.0429, - "step": 119 - }, - { - "epoch": 0.08385744234800839, - "grad_norm": 4.488365173339844, - "learning_rate": 4.166666666666667e-06, - "loss": 0.0286, - "step": 120 - }, - { - "epoch": 0.08455625436757512, - "grad_norm": 27.888690948486328, - "learning_rate": 4.2013888888888896e-06, - "loss": 0.5963, - "step": 121 - }, - { - "epoch": 0.08525506638714186, - "grad_norm": 22.506404876708984, - "learning_rate": 4.236111111111111e-06, - "loss": 0.4488, - "step": 122 - }, - { - "epoch": 0.0859538784067086, - "grad_norm": 12.767803192138672, - "learning_rate": 4.270833333333333e-06, - "loss": 0.0468, - "step": 123 - }, - { - "epoch": 0.08665269042627533, - "grad_norm": 21.219179153442383, - "learning_rate": 4.305555555555556e-06, - "loss": 0.396, - "step": 124 - }, - { - "epoch": 0.08735150244584207, - "grad_norm": 19.373239517211914, - "learning_rate": 4.340277777777779e-06, - "loss": 0.2474, - "step": 125 - }, - { - "epoch": 0.0880503144654088, - "grad_norm": 1.8469300270080566, - "learning_rate": 4.3750000000000005e-06, - "loss": 0.0126, - "step": 126 - }, - { - "epoch": 0.08874912648497554, - "grad_norm": 2.2161784172058105, - "learning_rate": 4.409722222222222e-06, - "loss": 0.0194, - "step": 127 - }, - { - "epoch": 0.08944793850454227, - "grad_norm": 10.892654418945312, - "learning_rate": 4.444444444444444e-06, - "loss": 0.1174, - "step": 128 - }, - { - "epoch": 0.09014675052410902, - "grad_norm": 21.090118408203125, - "learning_rate": 4.479166666666667e-06, - "loss": 0.2698, - "step": 129 - }, - { - "epoch": 0.09084556254367575, - "grad_norm": 15.25643539428711, - "learning_rate": 4.5138888888888895e-06, - "loss": 0.1317, - "step": 130 - }, - { - "epoch": 0.09154437456324249, - "grad_norm": 31.168315887451172, - "learning_rate": 4.548611111111111e-06, - "loss": 0.4776, - "step": 131 - }, - { - "epoch": 0.09224318658280922, - "grad_norm": 33.08750534057617, - "learning_rate": 4.583333333333333e-06, - "loss": 0.4328, - "step": 132 - }, - { - "epoch": 0.09294199860237597, - "grad_norm": 52.183746337890625, - "learning_rate": 4.618055555555556e-06, - "loss": 0.564, - "step": 133 - }, - { - "epoch": 0.0936408106219427, - "grad_norm": 40.50120162963867, - "learning_rate": 4.652777777777779e-06, - "loss": 0.5272, - "step": 134 - }, - { - "epoch": 0.09433962264150944, - "grad_norm": 20.32408905029297, - "learning_rate": 4.6875000000000004e-06, - "loss": 0.2767, - "step": 135 - }, - { - "epoch": 0.09503843466107617, - "grad_norm": 2.8684241771698, - "learning_rate": 4.722222222222222e-06, - "loss": 0.0144, - "step": 136 - }, - { - "epoch": 0.0957372466806429, - "grad_norm": 7.477058410644531, - "learning_rate": 4.756944444444445e-06, - "loss": 0.0493, - "step": 137 - }, - { - "epoch": 0.09643605870020965, - "grad_norm": 23.22918128967285, - "learning_rate": 4.791666666666668e-06, - "loss": 0.3952, - "step": 138 - }, - { - "epoch": 0.09713487071977638, - "grad_norm": 9.2420654296875, - "learning_rate": 4.8263888888888895e-06, - "loss": 0.0882, - "step": 139 - }, - { - "epoch": 0.09783368273934312, - "grad_norm": 27.750822067260742, - "learning_rate": 4.861111111111111e-06, - "loss": 0.3521, - "step": 140 - }, - { - "epoch": 0.09853249475890985, - "grad_norm": 12.380672454833984, - "learning_rate": 4.895833333333333e-06, - "loss": 0.1333, - "step": 141 - }, - { - "epoch": 0.0992313067784766, - "grad_norm": 25.04052734375, - "learning_rate": 4.930555555555556e-06, - "loss": 0.3795, - "step": 142 - }, - { - "epoch": 0.09993011879804332, - "grad_norm": 11.43722915649414, - "learning_rate": 4.9652777777777786e-06, - "loss": 0.1927, - "step": 143 - }, - { - "epoch": 0.10062893081761007, - "grad_norm": 55.4820442199707, - "learning_rate": 5e-06, - "loss": 0.9907, - "step": 144 - }, - { - "epoch": 0.1013277428371768, - "grad_norm": 20.55660057067871, - "learning_rate": 4.999992551780808e-06, - "loss": 0.2302, - "step": 145 - }, - { - "epoch": 0.10202655485674354, - "grad_norm": 13.871188163757324, - "learning_rate": 4.999970207167611e-06, - "loss": 0.1635, - "step": 146 - }, - { - "epoch": 0.10272536687631027, - "grad_norm": 17.654191970825195, - "learning_rate": 4.999932966293553e-06, - "loss": 0.2374, - "step": 147 - }, - { - "epoch": 0.103424178895877, - "grad_norm": 13.314811706542969, - "learning_rate": 4.9998808293805355e-06, - "loss": 0.1912, - "step": 148 - }, - { - "epoch": 0.10412299091544375, - "grad_norm": 12.17277717590332, - "learning_rate": 4.9998137967392205e-06, - "loss": 0.1267, - "step": 149 - }, - { - "epoch": 0.10482180293501048, - "grad_norm": 1.8680511713027954, - "learning_rate": 4.999731868769027e-06, - "loss": 0.0062, - "step": 150 - }, - { - "epoch": 0.10552061495457722, - "grad_norm": 3.3582608699798584, - "learning_rate": 4.999635045958129e-06, - "loss": 0.0244, - "step": 151 - }, - { - "epoch": 0.10621942697414395, - "grad_norm": 15.267073631286621, - "learning_rate": 4.999523328883451e-06, - "loss": 0.226, - "step": 152 - }, - { - "epoch": 0.1069182389937107, - "grad_norm": 13.432097434997559, - "learning_rate": 4.999396718210671e-06, - "loss": 0.1862, - "step": 153 - }, - { - "epoch": 0.10761705101327743, - "grad_norm": 34.178123474121094, - "learning_rate": 4.9992552146942054e-06, - "loss": 0.4717, - "step": 154 - }, - { - "epoch": 0.10831586303284417, - "grad_norm": 10.88023853302002, - "learning_rate": 4.999098819177214e-06, - "loss": 0.0827, - "step": 155 - }, - { - "epoch": 0.1090146750524109, - "grad_norm": 12.706320762634277, - "learning_rate": 4.998927532591592e-06, - "loss": 0.24, - "step": 156 - }, - { - "epoch": 0.10971348707197764, - "grad_norm": 33.3758544921875, - "learning_rate": 4.998741355957963e-06, - "loss": 0.3967, - "step": 157 - }, - { - "epoch": 0.11041229909154437, - "grad_norm": 9.823050498962402, - "learning_rate": 4.998540290385675e-06, - "loss": 0.1052, - "step": 158 - }, - { - "epoch": 0.1111111111111111, - "grad_norm": 21.20310401916504, - "learning_rate": 4.998324337072792e-06, - "loss": 0.3256, - "step": 159 - }, - { - "epoch": 0.11180992313067785, - "grad_norm": 3.378161668777466, - "learning_rate": 4.998093497306088e-06, - "loss": 0.0348, - "step": 160 - }, - { - "epoch": 0.11250873515024458, - "grad_norm": 16.39934539794922, - "learning_rate": 4.997847772461038e-06, - "loss": 0.2811, - "step": 161 - }, - { - "epoch": 0.11320754716981132, - "grad_norm": 11.997238159179688, - "learning_rate": 4.997587164001815e-06, - "loss": 0.1928, - "step": 162 - }, - { - "epoch": 0.11390635918937805, - "grad_norm": 11.762815475463867, - "learning_rate": 4.997311673481272e-06, - "loss": 0.1047, - "step": 163 - }, - { - "epoch": 0.1146051712089448, - "grad_norm": 41.98662185668945, - "learning_rate": 4.99702130254094e-06, - "loss": 0.8216, - "step": 164 - }, - { - "epoch": 0.11530398322851153, - "grad_norm": 0.8907880783081055, - "learning_rate": 4.996716052911017e-06, - "loss": 0.0136, - "step": 165 - }, - { - "epoch": 0.11600279524807827, - "grad_norm": 6.044946193695068, - "learning_rate": 4.996395926410354e-06, - "loss": 0.0574, - "step": 166 - }, - { - "epoch": 0.116701607267645, - "grad_norm": 7.652251243591309, - "learning_rate": 4.996060924946452e-06, - "loss": 0.1021, - "step": 167 - }, - { - "epoch": 0.11740041928721175, - "grad_norm": 16.434112548828125, - "learning_rate": 4.99571105051544e-06, - "loss": 0.3096, - "step": 168 - }, - { - "epoch": 0.11809923130677848, - "grad_norm": 8.08709716796875, - "learning_rate": 4.995346305202073e-06, - "loss": 0.0687, - "step": 169 - }, - { - "epoch": 0.1187980433263452, - "grad_norm": 16.4686222076416, - "learning_rate": 4.994966691179712e-06, - "loss": 0.3045, - "step": 170 - }, - { - "epoch": 0.11949685534591195, - "grad_norm": 28.794492721557617, - "learning_rate": 4.994572210710315e-06, - "loss": 0.3226, - "step": 171 - }, - { - "epoch": 0.12019566736547868, - "grad_norm": 1.1931850910186768, - "learning_rate": 4.994162866144425e-06, - "loss": 0.0193, - "step": 172 - }, - { - "epoch": 0.12089447938504543, - "grad_norm": 13.104757308959961, - "learning_rate": 4.993738659921153e-06, - "loss": 0.1906, - "step": 173 - }, - { - "epoch": 0.12159329140461216, - "grad_norm": 20.407514572143555, - "learning_rate": 4.993299594568163e-06, - "loss": 0.3464, - "step": 174 - }, - { - "epoch": 0.1222921034241789, - "grad_norm": 18.145898818969727, - "learning_rate": 4.992845672701658e-06, - "loss": 0.3235, - "step": 175 - }, - { - "epoch": 0.12299091544374563, - "grad_norm": 11.253594398498535, - "learning_rate": 4.9923768970263675e-06, - "loss": 0.1361, - "step": 176 - }, - { - "epoch": 0.12368972746331237, - "grad_norm": 13.6408109664917, - "learning_rate": 4.991893270335526e-06, - "loss": 0.2828, - "step": 177 - }, - { - "epoch": 0.1243885394828791, - "grad_norm": 38.79020309448242, - "learning_rate": 4.9913947955108575e-06, - "loss": 0.604, - "step": 178 - }, - { - "epoch": 0.12508735150244585, - "grad_norm": 11.21642017364502, - "learning_rate": 4.990881475522566e-06, - "loss": 0.0548, - "step": 179 - }, - { - "epoch": 0.12578616352201258, - "grad_norm": 19.890596389770508, - "learning_rate": 4.9903533134293035e-06, - "loss": 0.3927, - "step": 180 - }, - { - "epoch": 0.1264849755415793, - "grad_norm": 13.620441436767578, - "learning_rate": 4.989810312378165e-06, - "loss": 0.1395, - "step": 181 - }, - { - "epoch": 0.12718378756114604, - "grad_norm": 7.4906134605407715, - "learning_rate": 4.989252475604664e-06, - "loss": 0.0814, - "step": 182 - }, - { - "epoch": 0.1278825995807128, - "grad_norm": 9.755054473876953, - "learning_rate": 4.988679806432712e-06, - "loss": 0.1458, - "step": 183 - }, - { - "epoch": 0.12858141160027953, - "grad_norm": 14.173589706420898, - "learning_rate": 4.9880923082746015e-06, - "loss": 0.1895, - "step": 184 - }, - { - "epoch": 0.12928022361984626, - "grad_norm": 10.801349639892578, - "learning_rate": 4.987489984630985e-06, - "loss": 0.1126, - "step": 185 - }, - { - "epoch": 0.129979035639413, - "grad_norm": 21.870594024658203, - "learning_rate": 4.986872839090853e-06, - "loss": 0.3324, - "step": 186 - }, - { - "epoch": 0.13067784765897975, - "grad_norm": 32.091796875, - "learning_rate": 4.986240875331513e-06, - "loss": 0.7947, - "step": 187 - }, - { - "epoch": 0.13137665967854648, - "grad_norm": 12.568334579467773, - "learning_rate": 4.9855940971185705e-06, - "loss": 0.2354, - "step": 188 - }, - { - "epoch": 0.1320754716981132, - "grad_norm": 37.35740280151367, - "learning_rate": 4.9849325083059e-06, - "loss": 0.6075, - "step": 189 - }, - { - "epoch": 0.13277428371767994, - "grad_norm": 24.105323791503906, - "learning_rate": 4.98425611283563e-06, - "loss": 0.3857, - "step": 190 - }, - { - "epoch": 0.1334730957372467, - "grad_norm": 3.437150239944458, - "learning_rate": 4.983564914738113e-06, - "loss": 0.0299, - "step": 191 - }, - { - "epoch": 0.13417190775681342, - "grad_norm": 19.739456176757812, - "learning_rate": 4.982858918131906e-06, - "loss": 0.2763, - "step": 192 - }, - { - "epoch": 0.13487071977638015, - "grad_norm": 20.301265716552734, - "learning_rate": 4.982138127223742e-06, - "loss": 0.3133, - "step": 193 - }, - { - "epoch": 0.13556953179594688, - "grad_norm": 26.877233505249023, - "learning_rate": 4.981402546308508e-06, - "loss": 0.4081, - "step": 194 - }, - { - "epoch": 0.13626834381551362, - "grad_norm": 26.43645477294922, - "learning_rate": 4.9806521797692184e-06, - "loss": 0.2629, - "step": 195 - }, - { - "epoch": 0.13696715583508037, - "grad_norm": 23.70842933654785, - "learning_rate": 4.9798870320769884e-06, - "loss": 0.364, - "step": 196 - }, - { - "epoch": 0.1376659678546471, - "grad_norm": 17.802026748657227, - "learning_rate": 4.979107107791009e-06, - "loss": 0.2183, - "step": 197 - }, - { - "epoch": 0.13836477987421383, - "grad_norm": 24.834115982055664, - "learning_rate": 4.978312411558518e-06, - "loss": 0.2716, - "step": 198 - }, - { - "epoch": 0.13906359189378056, - "grad_norm": 22.922401428222656, - "learning_rate": 4.977502948114772e-06, - "loss": 0.2933, - "step": 199 - }, - { - "epoch": 0.13976240391334732, - "grad_norm": 34.77713394165039, - "learning_rate": 4.976678722283019e-06, - "loss": 0.7012, - "step": 200 - }, - { - "epoch": 0.14046121593291405, - "grad_norm": 12.13795280456543, - "learning_rate": 4.975839738974473e-06, - "loss": 0.1903, - "step": 201 - }, - { - "epoch": 0.14116002795248078, - "grad_norm": 16.988998413085938, - "learning_rate": 4.974986003188278e-06, - "loss": 0.2054, - "step": 202 - }, - { - "epoch": 0.1418588399720475, - "grad_norm": 19.275632858276367, - "learning_rate": 4.974117520011484e-06, - "loss": 0.2453, - "step": 203 - }, - { - "epoch": 0.14255765199161424, - "grad_norm": 35.913509368896484, - "learning_rate": 4.973234294619011e-06, - "loss": 0.5632, - "step": 204 - }, - { - "epoch": 0.143256464011181, - "grad_norm": 28.961172103881836, - "learning_rate": 4.972336332273626e-06, - "loss": 0.6512, - "step": 205 - }, - { - "epoch": 0.14395527603074773, - "grad_norm": 33.84477233886719, - "learning_rate": 4.971423638325906e-06, - "loss": 0.8773, - "step": 206 - }, - { - "epoch": 0.14465408805031446, - "grad_norm": 15.914165496826172, - "learning_rate": 4.970496218214205e-06, - "loss": 0.1928, - "step": 207 - }, - { - "epoch": 0.1453529000698812, - "grad_norm": 12.857492446899414, - "learning_rate": 4.969554077464626e-06, - "loss": 0.2481, - "step": 208 - }, - { - "epoch": 0.14605171208944795, - "grad_norm": 18.26762580871582, - "learning_rate": 4.968597221690986e-06, - "loss": 0.3223, - "step": 209 - }, - { - "epoch": 0.14675052410901468, - "grad_norm": 3.23185396194458, - "learning_rate": 4.967625656594782e-06, - "loss": 0.0309, - "step": 210 - }, - { - "epoch": 0.1474493361285814, - "grad_norm": 39.02360534667969, - "learning_rate": 4.966639387965158e-06, - "loss": 0.4517, - "step": 211 - }, - { - "epoch": 0.14814814814814814, - "grad_norm": 4.6353840827941895, - "learning_rate": 4.965638421678871e-06, - "loss": 0.0509, - "step": 212 - }, - { - "epoch": 0.1488469601677149, - "grad_norm": 4.9387993812561035, - "learning_rate": 4.964622763700252e-06, - "loss": 0.05, - "step": 213 - }, - { - "epoch": 0.14954577218728163, - "grad_norm": 3.62610125541687, - "learning_rate": 4.963592420081177e-06, - "loss": 0.0318, - "step": 214 - }, - { - "epoch": 0.15024458420684836, - "grad_norm": 39.914981842041016, - "learning_rate": 4.962547396961026e-06, - "loss": 0.6649, - "step": 215 - }, - { - "epoch": 0.1509433962264151, - "grad_norm": 29.543420791625977, - "learning_rate": 4.961487700566646e-06, - "loss": 0.4618, - "step": 216 - }, - { - "epoch": 0.15164220824598182, - "grad_norm": 15.438580513000488, - "learning_rate": 4.960413337212321e-06, - "loss": 0.0964, - "step": 217 - }, - { - "epoch": 0.15234102026554858, - "grad_norm": 12.203367233276367, - "learning_rate": 4.959324313299724e-06, - "loss": 0.1419, - "step": 218 - }, - { - "epoch": 0.1530398322851153, - "grad_norm": 36.472225189208984, - "learning_rate": 4.958220635317886e-06, - "loss": 0.7075, - "step": 219 - }, - { - "epoch": 0.15373864430468204, - "grad_norm": 15.606555938720703, - "learning_rate": 4.957102309843157e-06, - "loss": 0.1814, - "step": 220 - }, - { - "epoch": 0.15443745632424877, - "grad_norm": 34.66213607788086, - "learning_rate": 4.955969343539162e-06, - "loss": 0.4877, - "step": 221 - }, - { - "epoch": 0.15513626834381553, - "grad_norm": 2.426269292831421, - "learning_rate": 4.9548217431567665e-06, - "loss": 0.015, - "step": 222 - }, - { - "epoch": 0.15583508036338226, - "grad_norm": 7.730508804321289, - "learning_rate": 4.953659515534035e-06, - "loss": 0.1582, - "step": 223 - }, - { - "epoch": 0.15653389238294899, - "grad_norm": 34.605899810791016, - "learning_rate": 4.952482667596187e-06, - "loss": 0.3362, - "step": 224 - }, - { - "epoch": 0.15723270440251572, - "grad_norm": 48.30050277709961, - "learning_rate": 4.95129120635556e-06, - "loss": 0.9097, - "step": 225 - }, - { - "epoch": 0.15793151642208245, - "grad_norm": 5.044818878173828, - "learning_rate": 4.9500851389115645e-06, - "loss": 0.0458, - "step": 226 - }, - { - "epoch": 0.1586303284416492, - "grad_norm": 13.12427043914795, - "learning_rate": 4.948864472450646e-06, - "loss": 0.1505, - "step": 227 - }, - { - "epoch": 0.15932914046121593, - "grad_norm": 8.89181900024414, - "learning_rate": 4.947629214246238e-06, - "loss": 0.1343, - "step": 228 - }, - { - "epoch": 0.16002795248078266, - "grad_norm": 8.44274616241455, - "learning_rate": 4.946379371658717e-06, - "loss": 0.0818, - "step": 229 - }, - { - "epoch": 0.1607267645003494, - "grad_norm": 9.806259155273438, - "learning_rate": 4.9451149521353655e-06, - "loss": 0.1254, - "step": 230 - }, - { - "epoch": 0.16142557651991615, - "grad_norm": 4.928332328796387, - "learning_rate": 4.943835963210324e-06, - "loss": 0.0608, - "step": 231 - }, - { - "epoch": 0.16212438853948288, - "grad_norm": 17.608247756958008, - "learning_rate": 4.942542412504543e-06, - "loss": 0.2809, - "step": 232 - }, - { - "epoch": 0.1628232005590496, - "grad_norm": 13.168478012084961, - "learning_rate": 4.9412343077257415e-06, - "loss": 0.1497, - "step": 233 - }, - { - "epoch": 0.16352201257861634, - "grad_norm": 3.6533076763153076, - "learning_rate": 4.939911656668361e-06, - "loss": 0.0362, - "step": 234 - }, - { - "epoch": 0.1642208245981831, - "grad_norm": 26.288984298706055, - "learning_rate": 4.938574467213519e-06, - "loss": 0.46, - "step": 235 - }, - { - "epoch": 0.16491963661774983, - "grad_norm": 18.746585845947266, - "learning_rate": 4.937222747328956e-06, - "loss": 0.2843, - "step": 236 - }, - { - "epoch": 0.16561844863731656, - "grad_norm": 13.078903198242188, - "learning_rate": 4.935856505068999e-06, - "loss": 0.1926, - "step": 237 - }, - { - "epoch": 0.1663172606568833, - "grad_norm": 11.8361177444458, - "learning_rate": 4.934475748574506e-06, - "loss": 0.1823, - "step": 238 - }, - { - "epoch": 0.16701607267645002, - "grad_norm": 40.54023361206055, - "learning_rate": 4.933080486072817e-06, - "loss": 0.8266, - "step": 239 - }, - { - "epoch": 0.16771488469601678, - "grad_norm": 14.78891372680664, - "learning_rate": 4.93167072587771e-06, - "loss": 0.2339, - "step": 240 - }, - { - "epoch": 0.1684136967155835, - "grad_norm": 8.922636032104492, - "learning_rate": 4.9302464763893474e-06, - "loss": 0.1516, - "step": 241 - }, - { - "epoch": 0.16911250873515024, - "grad_norm": 28.712753295898438, - "learning_rate": 4.9288077460942266e-06, - "loss": 0.3758, - "step": 242 - }, - { - "epoch": 0.16981132075471697, - "grad_norm": 20.33342170715332, - "learning_rate": 4.927354543565131e-06, - "loss": 0.2495, - "step": 243 - }, - { - "epoch": 0.17051013277428373, - "grad_norm": 6.673890590667725, - "learning_rate": 4.925886877461076e-06, - "loss": 0.0951, - "step": 244 - }, - { - "epoch": 0.17120894479385046, - "grad_norm": 17.336618423461914, - "learning_rate": 4.924404756527262e-06, - "loss": 0.2111, - "step": 245 - }, - { - "epoch": 0.1719077568134172, - "grad_norm": 10.82662296295166, - "learning_rate": 4.9229081895950185e-06, - "loss": 0.1613, - "step": 246 - }, - { - "epoch": 0.17260656883298392, - "grad_norm": 54.12034606933594, - "learning_rate": 4.92139718558175e-06, - "loss": 0.9248, - "step": 247 - }, - { - "epoch": 0.17330538085255065, - "grad_norm": 14.005218505859375, - "learning_rate": 4.919871753490892e-06, - "loss": 0.2436, - "step": 248 - }, - { - "epoch": 0.1740041928721174, - "grad_norm": 34.445777893066406, - "learning_rate": 4.918331902411842e-06, - "loss": 0.5176, - "step": 249 - }, - { - "epoch": 0.17470300489168414, - "grad_norm": 10.388051986694336, - "learning_rate": 4.916777641519921e-06, - "loss": 0.1205, - "step": 250 - }, - { - "epoch": 0.17540181691125087, - "grad_norm": 2.350389242172241, - "learning_rate": 4.91520898007631e-06, - "loss": 0.0318, - "step": 251 - }, - { - "epoch": 0.1761006289308176, - "grad_norm": 8.809117317199707, - "learning_rate": 4.913625927427996e-06, - "loss": 0.0922, - "step": 252 - }, - { - "epoch": 0.17679944095038436, - "grad_norm": 11.93591594696045, - "learning_rate": 4.912028493007717e-06, - "loss": 0.1614, - "step": 253 - }, - { - "epoch": 0.1774982529699511, - "grad_norm": 19.281707763671875, - "learning_rate": 4.9104166863339065e-06, - "loss": 0.2842, - "step": 254 - }, - { - "epoch": 0.17819706498951782, - "grad_norm": 17.486173629760742, - "learning_rate": 4.908790517010637e-06, - "loss": 0.2744, - "step": 255 - }, - { - "epoch": 0.17889587700908455, - "grad_norm": 8.761900901794434, - "learning_rate": 4.907149994727559e-06, - "loss": 0.0757, - "step": 256 - }, - { - "epoch": 0.1795946890286513, - "grad_norm": 8.123963356018066, - "learning_rate": 4.90549512925985e-06, - "loss": 0.1441, - "step": 257 - }, - { - "epoch": 0.18029350104821804, - "grad_norm": 13.555806159973145, - "learning_rate": 4.903825930468149e-06, - "loss": 0.171, - "step": 258 - }, - { - "epoch": 0.18099231306778477, - "grad_norm": 20.24187469482422, - "learning_rate": 4.902142408298504e-06, - "loss": 0.3552, - "step": 259 - }, - { - "epoch": 0.1816911250873515, - "grad_norm": 16.279403686523438, - "learning_rate": 4.9004445727823095e-06, - "loss": 0.2269, - "step": 260 - }, - { - "epoch": 0.18238993710691823, - "grad_norm": 35.07636260986328, - "learning_rate": 4.8987324340362445e-06, - "loss": 0.6414, - "step": 261 - }, - { - "epoch": 0.18308874912648498, - "grad_norm": 7.011019706726074, - "learning_rate": 4.897006002262217e-06, - "loss": 0.0676, - "step": 262 - }, - { - "epoch": 0.18378756114605171, - "grad_norm": 25.15359115600586, - "learning_rate": 4.895265287747302e-06, - "loss": 0.3347, - "step": 263 - }, - { - "epoch": 0.18448637316561844, - "grad_norm": 1.544360637664795, - "learning_rate": 4.893510300863677e-06, - "loss": 0.0242, - "step": 264 - }, - { - "epoch": 0.18518518518518517, - "grad_norm": 28.481163024902344, - "learning_rate": 4.8917410520685635e-06, - "loss": 0.6507, - "step": 265 - }, - { - "epoch": 0.18588399720475193, - "grad_norm": 10.587766647338867, - "learning_rate": 4.889957551904164e-06, - "loss": 0.172, - "step": 266 - }, - { - "epoch": 0.18658280922431866, - "grad_norm": 4.213011264801025, - "learning_rate": 4.8881598109976e-06, - "loss": 0.0372, - "step": 267 - }, - { - "epoch": 0.1872816212438854, - "grad_norm": 4.916326999664307, - "learning_rate": 4.886347840060845e-06, - "loss": 0.0578, - "step": 268 - }, - { - "epoch": 0.18798043326345212, - "grad_norm": 17.933826446533203, - "learning_rate": 4.884521649890664e-06, - "loss": 0.344, - "step": 269 - }, - { - "epoch": 0.18867924528301888, - "grad_norm": 35.10563278198242, - "learning_rate": 4.882681251368549e-06, - "loss": 0.5693, - "step": 270 - }, - { - "epoch": 0.1893780573025856, - "grad_norm": 16.641815185546875, - "learning_rate": 4.8808266554606535e-06, - "loss": 0.313, - "step": 271 - }, - { - "epoch": 0.19007686932215234, - "grad_norm": 18.587114334106445, - "learning_rate": 4.878957873217727e-06, - "loss": 0.3172, - "step": 272 - }, - { - "epoch": 0.19077568134171907, - "grad_norm": 26.699010848999023, - "learning_rate": 4.877074915775049e-06, - "loss": 0.4658, - "step": 273 - }, - { - "epoch": 0.1914744933612858, - "grad_norm": 19.77286720275879, - "learning_rate": 4.875177794352364e-06, - "loss": 0.2559, - "step": 274 - }, - { - "epoch": 0.19217330538085256, - "grad_norm": 35.742652893066406, - "learning_rate": 4.873266520253812e-06, - "loss": 0.4641, - "step": 275 - }, - { - "epoch": 0.1928721174004193, - "grad_norm": 23.883516311645508, - "learning_rate": 4.8713411048678635e-06, - "loss": 0.3472, - "step": 276 - }, - { - "epoch": 0.19357092941998602, - "grad_norm": 7.226071357727051, - "learning_rate": 4.869401559667253e-06, - "loss": 0.0887, - "step": 277 - }, - { - "epoch": 0.19426974143955275, - "grad_norm": 1.5649971961975098, - "learning_rate": 4.867447896208906e-06, - "loss": 0.0164, - "step": 278 - }, - { - "epoch": 0.1949685534591195, - "grad_norm": 5.674210548400879, - "learning_rate": 4.865480126133872e-06, - "loss": 0.0697, - "step": 279 - }, - { - "epoch": 0.19566736547868624, - "grad_norm": 27.634450912475586, - "learning_rate": 4.863498261167258e-06, - "loss": 0.2649, - "step": 280 - }, - { - "epoch": 0.19636617749825297, - "grad_norm": 13.977226257324219, - "learning_rate": 4.861502313118157e-06, - "loss": 0.1844, - "step": 281 - }, - { - "epoch": 0.1970649895178197, - "grad_norm": 2.140982151031494, - "learning_rate": 4.859492293879574e-06, - "loss": 0.008, - "step": 282 - }, - { - "epoch": 0.19776380153738643, - "grad_norm": 8.574052810668945, - "learning_rate": 4.857468215428362e-06, - "loss": 0.158, - "step": 283 - }, - { - "epoch": 0.1984626135569532, - "grad_norm": 2.9333882331848145, - "learning_rate": 4.855430089825143e-06, - "loss": 0.0322, - "step": 284 - }, - { - "epoch": 0.19916142557651992, - "grad_norm": 31.97005271911621, - "learning_rate": 4.853377929214243e-06, - "loss": 0.8695, - "step": 285 - }, - { - "epoch": 0.19986023759608665, - "grad_norm": 18.904457092285156, - "learning_rate": 4.851311745823616e-06, - "loss": 0.2074, - "step": 286 - }, - { - "epoch": 0.20055904961565338, - "grad_norm": 7.113580226898193, - "learning_rate": 4.849231551964771e-06, - "loss": 0.0478, - "step": 287 - }, - { - "epoch": 0.20125786163522014, - "grad_norm": 14.682963371276855, - "learning_rate": 4.8471373600327e-06, - "loss": 0.2164, - "step": 288 - }, - { - "epoch": 0.20195667365478687, - "grad_norm": 17.470304489135742, - "learning_rate": 4.8450291825058036e-06, - "loss": 0.4563, - "step": 289 - }, - { - "epoch": 0.2026554856743536, - "grad_norm": 10.486756324768066, - "learning_rate": 4.842907031945815e-06, - "loss": 0.1696, - "step": 290 - }, - { - "epoch": 0.20335429769392033, - "grad_norm": 15.90971851348877, - "learning_rate": 4.84077092099773e-06, - "loss": 0.3015, - "step": 291 - }, - { - "epoch": 0.20405310971348709, - "grad_norm": 15.302689552307129, - "learning_rate": 4.838620862389727e-06, - "loss": 0.2725, - "step": 292 - }, - { - "epoch": 0.20475192173305382, - "grad_norm": 10.254291534423828, - "learning_rate": 4.83645686893309e-06, - "loss": 0.2092, - "step": 293 - }, - { - "epoch": 0.20545073375262055, - "grad_norm": 10.138662338256836, - "learning_rate": 4.834278953522139e-06, - "loss": 0.0555, - "step": 294 - }, - { - "epoch": 0.20614954577218728, - "grad_norm": 34.5139274597168, - "learning_rate": 4.8320871291341455e-06, - "loss": 0.4119, - "step": 295 - }, - { - "epoch": 0.206848357791754, - "grad_norm": 11.110896110534668, - "learning_rate": 4.829881408829262e-06, - "loss": 0.2111, - "step": 296 - }, - { - "epoch": 0.20754716981132076, - "grad_norm": 3.394921064376831, - "learning_rate": 4.827661805750438e-06, - "loss": 0.0446, - "step": 297 - }, - { - "epoch": 0.2082459818308875, - "grad_norm": 19.610963821411133, - "learning_rate": 4.825428333123346e-06, - "loss": 0.3897, - "step": 298 - }, - { - "epoch": 0.20894479385045422, - "grad_norm": 20.56781005859375, - "learning_rate": 4.823181004256301e-06, - "loss": 0.3383, - "step": 299 - }, - { - "epoch": 0.20964360587002095, - "grad_norm": 4.803562164306641, - "learning_rate": 4.8209198325401815e-06, - "loss": 0.0608, - "step": 300 - }, - { - "epoch": 0.2103424178895877, - "grad_norm": 52.001216888427734, - "learning_rate": 4.81864483144835e-06, - "loss": 0.8058, - "step": 301 - }, - { - "epoch": 0.21104122990915444, - "grad_norm": 36.58800506591797, - "learning_rate": 4.816356014536571e-06, - "loss": 0.6983, - "step": 302 - }, - { - "epoch": 0.21174004192872117, - "grad_norm": 7.013560771942139, - "learning_rate": 4.814053395442933e-06, - "loss": 0.1034, - "step": 303 - }, - { - "epoch": 0.2124388539482879, - "grad_norm": 5.222038269042969, - "learning_rate": 4.811736987887765e-06, - "loss": 0.0527, - "step": 304 - }, - { - "epoch": 0.21313766596785463, - "grad_norm": 48.45743179321289, - "learning_rate": 4.8094068056735564e-06, - "loss": 0.9699, - "step": 305 - }, - { - "epoch": 0.2138364779874214, - "grad_norm": 27.32036018371582, - "learning_rate": 4.807062862684874e-06, - "loss": 0.4697, - "step": 306 - }, - { - "epoch": 0.21453529000698812, - "grad_norm": 12.215785026550293, - "learning_rate": 4.804705172888277e-06, - "loss": 0.1393, - "step": 307 - }, - { - "epoch": 0.21523410202655485, - "grad_norm": 17.731470108032227, - "learning_rate": 4.80233375033224e-06, - "loss": 0.1351, - "step": 308 - }, - { - "epoch": 0.21593291404612158, - "grad_norm": 28.933956146240234, - "learning_rate": 4.799948609147061e-06, - "loss": 0.5294, - "step": 309 - }, - { - "epoch": 0.21663172606568834, - "grad_norm": 13.886995315551758, - "learning_rate": 4.797549763544784e-06, - "loss": 0.2022, - "step": 310 - }, - { - "epoch": 0.21733053808525507, - "grad_norm": 9.811971664428711, - "learning_rate": 4.795137227819113e-06, - "loss": 0.1662, - "step": 311 - }, - { - "epoch": 0.2180293501048218, - "grad_norm": 8.048477172851562, - "learning_rate": 4.792711016345322e-06, - "loss": 0.101, - "step": 312 - }, - { - "epoch": 0.21872816212438853, - "grad_norm": 6.266959190368652, - "learning_rate": 4.790271143580174e-06, - "loss": 0.1021, - "step": 313 - }, - { - "epoch": 0.2194269741439553, - "grad_norm": 39.531700134277344, - "learning_rate": 4.787817624061838e-06, - "loss": 0.8468, - "step": 314 - }, - { - "epoch": 0.22012578616352202, - "grad_norm": 5.592189311981201, - "learning_rate": 4.785350472409792e-06, - "loss": 0.0619, - "step": 315 - }, - { - "epoch": 0.22082459818308875, - "grad_norm": 17.003190994262695, - "learning_rate": 4.782869703324746e-06, - "loss": 0.2143, - "step": 316 - }, - { - "epoch": 0.22152341020265548, - "grad_norm": 9.305402755737305, - "learning_rate": 4.78037533158855e-06, - "loss": 0.0783, - "step": 317 - }, - { - "epoch": 0.2222222222222222, - "grad_norm": 15.373815536499023, - "learning_rate": 4.777867372064105e-06, - "loss": 0.2495, - "step": 318 - }, - { - "epoch": 0.22292103424178897, - "grad_norm": 10.769071578979492, - "learning_rate": 4.775345839695279e-06, - "loss": 0.2074, - "step": 319 - }, - { - "epoch": 0.2236198462613557, - "grad_norm": 0.8323988318443298, - "learning_rate": 4.77281074950681e-06, - "loss": 0.0114, - "step": 320 - }, - { - "epoch": 0.22431865828092243, - "grad_norm": 16.682310104370117, - "learning_rate": 4.770262116604224e-06, - "loss": 0.1843, - "step": 321 - }, - { - "epoch": 0.22501747030048916, - "grad_norm": 12.464035987854004, - "learning_rate": 4.767699956173745e-06, - "loss": 0.1565, - "step": 322 - }, - { - "epoch": 0.22571628232005592, - "grad_norm": 10.665525436401367, - "learning_rate": 4.765124283482195e-06, - "loss": 0.1886, - "step": 323 - }, - { - "epoch": 0.22641509433962265, - "grad_norm": 8.285576820373535, - "learning_rate": 4.7625351138769175e-06, - "loss": 0.1113, - "step": 324 - }, - { - "epoch": 0.22711390635918938, - "grad_norm": 13.075751304626465, - "learning_rate": 4.759932462785672e-06, - "loss": 0.1893, - "step": 325 - }, - { - "epoch": 0.2278127183787561, - "grad_norm": 18.610963821411133, - "learning_rate": 4.757316345716554e-06, - "loss": 0.2235, - "step": 326 - }, - { - "epoch": 0.22851153039832284, - "grad_norm": 35.326637268066406, - "learning_rate": 4.754686778257891e-06, - "loss": 0.8035, - "step": 327 - }, - { - "epoch": 0.2292103424178896, - "grad_norm": 18.4339542388916, - "learning_rate": 4.75204377607816e-06, - "loss": 0.4119, - "step": 328 - }, - { - "epoch": 0.22990915443745633, - "grad_norm": 34.74818420410156, - "learning_rate": 4.74938735492589e-06, - "loss": 0.6765, - "step": 329 - }, - { - "epoch": 0.23060796645702306, - "grad_norm": 1.2656896114349365, - "learning_rate": 4.746717530629565e-06, - "loss": 0.0165, - "step": 330 - }, - { - "epoch": 0.23130677847658979, - "grad_norm": 33.5367546081543, - "learning_rate": 4.744034319097536e-06, - "loss": 0.6406, - "step": 331 - }, - { - "epoch": 0.23200559049615654, - "grad_norm": 4.118407726287842, - "learning_rate": 4.741337736317919e-06, - "loss": 0.0494, - "step": 332 - }, - { - "epoch": 0.23270440251572327, - "grad_norm": 35.5570068359375, - "learning_rate": 4.738627798358506e-06, - "loss": 0.6405, - "step": 333 - }, - { - "epoch": 0.23340321453529, - "grad_norm": 13.001288414001465, - "learning_rate": 4.7359045213666675e-06, - "loss": 0.285, - "step": 334 - }, - { - "epoch": 0.23410202655485673, - "grad_norm": 10.730605125427246, - "learning_rate": 4.733167921569255e-06, - "loss": 0.1991, - "step": 335 - }, - { - "epoch": 0.2348008385744235, - "grad_norm": 0.6715048551559448, - "learning_rate": 4.7304180152725035e-06, - "loss": 0.0091, - "step": 336 - }, - { - "epoch": 0.23549965059399022, - "grad_norm": 10.875028610229492, - "learning_rate": 4.727654818861937e-06, - "loss": 0.2589, - "step": 337 - }, - { - "epoch": 0.23619846261355695, - "grad_norm": 2.7218732833862305, - "learning_rate": 4.724878348802271e-06, - "loss": 0.0154, - "step": 338 - }, - { - "epoch": 0.23689727463312368, - "grad_norm": 7.1363935470581055, - "learning_rate": 4.7220886216373095e-06, - "loss": 0.1054, - "step": 339 - }, - { - "epoch": 0.2375960866526904, - "grad_norm": 8.102635383605957, - "learning_rate": 4.719285653989852e-06, - "loss": 0.1246, - "step": 340 - }, - { - "epoch": 0.23829489867225717, - "grad_norm": 32.68269729614258, - "learning_rate": 4.716469462561595e-06, - "loss": 0.6109, - "step": 341 - }, - { - "epoch": 0.2389937106918239, - "grad_norm": 14.749852180480957, - "learning_rate": 4.7136400641330245e-06, - "loss": 0.236, - "step": 342 - }, - { - "epoch": 0.23969252271139063, - "grad_norm": 18.897247314453125, - "learning_rate": 4.710797475563327e-06, - "loss": 0.3625, - "step": 343 - }, - { - "epoch": 0.24039133473095736, - "grad_norm": 11.592190742492676, - "learning_rate": 4.707941713790279e-06, - "loss": 0.1785, - "step": 344 - }, - { - "epoch": 0.24109014675052412, - "grad_norm": 32.90082931518555, - "learning_rate": 4.7050727958301505e-06, - "loss": 0.548, - "step": 345 - }, - { - "epoch": 0.24178895877009085, - "grad_norm": 4.296587944030762, - "learning_rate": 4.702190738777608e-06, - "loss": 0.0411, - "step": 346 - }, - { - "epoch": 0.24248777078965758, - "grad_norm": 5.985422611236572, - "learning_rate": 4.699295559805606e-06, - "loss": 0.093, - "step": 347 - }, - { - "epoch": 0.2431865828092243, - "grad_norm": 11.231877326965332, - "learning_rate": 4.696387276165284e-06, - "loss": 0.1743, - "step": 348 - }, - { - "epoch": 0.24388539482879107, - "grad_norm": 10.44618034362793, - "learning_rate": 4.693465905185871e-06, - "loss": 0.0967, - "step": 349 - }, - { - "epoch": 0.2445842068483578, - "grad_norm": 11.621986389160156, - "learning_rate": 4.690531464274577e-06, - "loss": 0.2119, - "step": 350 - }, - { - "epoch": 0.24528301886792453, - "grad_norm": 9.701064109802246, - "learning_rate": 4.687583970916487e-06, - "loss": 0.1547, - "step": 351 - }, - { - "epoch": 0.24598183088749126, - "grad_norm": 0.47121426463127136, - "learning_rate": 4.684623442674463e-06, - "loss": 0.0093, - "step": 352 - }, - { - "epoch": 0.246680642907058, - "grad_norm": 12.1204195022583, - "learning_rate": 4.681649897189036e-06, - "loss": 0.2302, - "step": 353 - }, - { - "epoch": 0.24737945492662475, - "grad_norm": 18.194692611694336, - "learning_rate": 4.678663352178301e-06, - "loss": 0.2065, - "step": 354 - }, - { - "epoch": 0.24807826694619148, - "grad_norm": 20.830915451049805, - "learning_rate": 4.675663825437811e-06, - "loss": 0.3612, - "step": 355 - }, - { - "epoch": 0.2487770789657582, - "grad_norm": 10.530333518981934, - "learning_rate": 4.6726513348404736e-06, - "loss": 0.1429, - "step": 356 - }, - { - "epoch": 0.24947589098532494, - "grad_norm": 5.4854326248168945, - "learning_rate": 4.669625898336439e-06, - "loss": 0.0727, - "step": 357 - }, - { - "epoch": 0.2501747030048917, - "grad_norm": 30.590425491333008, - "learning_rate": 4.666587533952998e-06, - "loss": 0.6164, - "step": 358 - }, - { - "epoch": 0.2508735150244584, - "grad_norm": 40.59944534301758, - "learning_rate": 4.663536259794477e-06, - "loss": 0.844, - "step": 359 - }, - { - "epoch": 0.25157232704402516, - "grad_norm": 11.132962226867676, - "learning_rate": 4.660472094042121e-06, - "loss": 0.1291, - "step": 360 - }, - { - "epoch": 0.2522711390635919, - "grad_norm": 6.713598728179932, - "learning_rate": 4.657395054953992e-06, - "loss": 0.1139, - "step": 361 - }, - { - "epoch": 0.2529699510831586, - "grad_norm": 16.51830291748047, - "learning_rate": 4.65430516086486e-06, - "loss": 0.3327, - "step": 362 - }, - { - "epoch": 0.25366876310272535, - "grad_norm": 12.358148574829102, - "learning_rate": 4.6512024301860925e-06, - "loss": 0.2048, - "step": 363 - }, - { - "epoch": 0.2543675751222921, - "grad_norm": 29.781177520751953, - "learning_rate": 4.648086881405542e-06, - "loss": 0.5272, - "step": 364 - }, - { - "epoch": 0.25506638714185886, - "grad_norm": 20.371816635131836, - "learning_rate": 4.644958533087443e-06, - "loss": 0.301, - "step": 365 - }, - { - "epoch": 0.2557651991614256, - "grad_norm": 20.111852645874023, - "learning_rate": 4.641817403872293e-06, - "loss": 0.5609, - "step": 366 - }, - { - "epoch": 0.2564640111809923, - "grad_norm": 11.709222793579102, - "learning_rate": 4.638663512476748e-06, - "loss": 0.1922, - "step": 367 - }, - { - "epoch": 0.25716282320055905, - "grad_norm": 13.188800811767578, - "learning_rate": 4.635496877693507e-06, - "loss": 0.1669, - "step": 368 - }, - { - "epoch": 0.2578616352201258, - "grad_norm": 32.15465545654297, - "learning_rate": 4.632317518391203e-06, - "loss": 0.6982, - "step": 369 - }, - { - "epoch": 0.2585604472396925, - "grad_norm": 26.257347106933594, - "learning_rate": 4.629125453514286e-06, - "loss": 0.3907, - "step": 370 - }, - { - "epoch": 0.25925925925925924, - "grad_norm": 25.885942459106445, - "learning_rate": 4.625920702082918e-06, - "loss": 0.4158, - "step": 371 - }, - { - "epoch": 0.259958071278826, - "grad_norm": 15.703845024108887, - "learning_rate": 4.622703283192849e-06, - "loss": 0.1871, - "step": 372 - }, - { - "epoch": 0.2606568832983927, - "grad_norm": 11.522906303405762, - "learning_rate": 4.619473216015313e-06, - "loss": 0.168, - "step": 373 - }, - { - "epoch": 0.2613556953179595, - "grad_norm": 32.49365997314453, - "learning_rate": 4.616230519796909e-06, - "loss": 0.6912, - "step": 374 - }, - { - "epoch": 0.2620545073375262, - "grad_norm": 0.8261187672615051, - "learning_rate": 4.612975213859487e-06, - "loss": 0.0107, - "step": 375 - }, - { - "epoch": 0.26275331935709295, - "grad_norm": 25.673416137695312, - "learning_rate": 4.6097073176000325e-06, - "loss": 0.5722, - "step": 376 - }, - { - "epoch": 0.2634521313766597, - "grad_norm": 35.15298080444336, - "learning_rate": 4.606426850490551e-06, - "loss": 0.5508, - "step": 377 - }, - { - "epoch": 0.2641509433962264, - "grad_norm": 6.139308929443359, - "learning_rate": 4.603133832077953e-06, - "loss": 0.0962, - "step": 378 - }, - { - "epoch": 0.26484975541579314, - "grad_norm": 18.431100845336914, - "learning_rate": 4.599828281983938e-06, - "loss": 0.3406, - "step": 379 - }, - { - "epoch": 0.2655485674353599, - "grad_norm": 14.415935516357422, - "learning_rate": 4.596510219904874e-06, - "loss": 0.2035, - "step": 380 - }, - { - "epoch": 0.2662473794549266, - "grad_norm": 6.168896198272705, - "learning_rate": 4.593179665611685e-06, - "loss": 0.0701, - "step": 381 - }, - { - "epoch": 0.2669461914744934, - "grad_norm": 19.408126831054688, - "learning_rate": 4.589836638949729e-06, - "loss": 0.2329, - "step": 382 - }, - { - "epoch": 0.2676450034940601, - "grad_norm": 10.20333480834961, - "learning_rate": 4.586481159838682e-06, - "loss": 0.1782, - "step": 383 - }, - { - "epoch": 0.26834381551362685, - "grad_norm": 1.0518056154251099, - "learning_rate": 4.58311324827242e-06, - "loss": 0.0157, - "step": 384 - }, - { - "epoch": 0.2690426275331936, - "grad_norm": 37.60113525390625, - "learning_rate": 4.579732924318898e-06, - "loss": 0.7857, - "step": 385 - }, - { - "epoch": 0.2697414395527603, - "grad_norm": 24.878284454345703, - "learning_rate": 4.5763402081200295e-06, - "loss": 0.5532, - "step": 386 - }, - { - "epoch": 0.27044025157232704, - "grad_norm": 13.037965774536133, - "learning_rate": 4.5729351198915715e-06, - "loss": 0.1916, - "step": 387 - }, - { - "epoch": 0.27113906359189377, - "grad_norm": 0.6771875619888306, - "learning_rate": 4.569517679922997e-06, - "loss": 0.0056, - "step": 388 - }, - { - "epoch": 0.2718378756114605, - "grad_norm": 14.487314224243164, - "learning_rate": 4.566087908577382e-06, - "loss": 0.2975, - "step": 389 - }, - { - "epoch": 0.27253668763102723, - "grad_norm": 18.10540199279785, - "learning_rate": 4.5626458262912745e-06, - "loss": 0.2606, - "step": 390 - }, - { - "epoch": 0.273235499650594, - "grad_norm": 9.74409008026123, - "learning_rate": 4.559191453574582e-06, - "loss": 0.1439, - "step": 391 - }, - { - "epoch": 0.27393431167016075, - "grad_norm": 11.126801490783691, - "learning_rate": 4.555724811010447e-06, - "loss": 0.1839, - "step": 392 - }, - { - "epoch": 0.2746331236897275, - "grad_norm": 14.446985244750977, - "learning_rate": 4.5522459192551175e-06, - "loss": 0.312, - "step": 393 - }, - { - "epoch": 0.2753319357092942, - "grad_norm": 7.447902202606201, - "learning_rate": 4.548754799037834e-06, - "loss": 0.1061, - "step": 394 - }, - { - "epoch": 0.27603074772886094, - "grad_norm": 28.85816192626953, - "learning_rate": 4.545251471160698e-06, - "loss": 0.6249, - "step": 395 - }, - { - "epoch": 0.27672955974842767, - "grad_norm": 33.03690719604492, - "learning_rate": 4.541735956498555e-06, - "loss": 0.508, - "step": 396 - }, - { - "epoch": 0.2774283717679944, - "grad_norm": 20.449827194213867, - "learning_rate": 4.538208275998861e-06, - "loss": 0.2899, - "step": 397 - }, - { - "epoch": 0.2781271837875611, - "grad_norm": 16.259653091430664, - "learning_rate": 4.534668450681569e-06, - "loss": 0.292, - "step": 398 - }, - { - "epoch": 0.27882599580712786, - "grad_norm": 26.654836654663086, - "learning_rate": 4.531116501638992e-06, - "loss": 0.3557, - "step": 399 - }, - { - "epoch": 0.27952480782669464, - "grad_norm": 1.7782492637634277, - "learning_rate": 4.527552450035689e-06, - "loss": 0.0237, - "step": 400 - }, - { - "epoch": 0.2802236198462614, - "grad_norm": 20.36215591430664, - "learning_rate": 4.523976317108326e-06, - "loss": 0.3595, - "step": 401 - }, - { - "epoch": 0.2809224318658281, - "grad_norm": 4.038337707519531, - "learning_rate": 4.520388124165564e-06, - "loss": 0.0438, - "step": 402 - }, - { - "epoch": 0.28162124388539483, - "grad_norm": 17.7996768951416, - "learning_rate": 4.516787892587921e-06, - "loss": 0.3754, - "step": 403 - }, - { - "epoch": 0.28232005590496156, - "grad_norm": 33.77705764770508, - "learning_rate": 4.513175643827647e-06, - "loss": 0.5027, - "step": 404 - }, - { - "epoch": 0.2830188679245283, - "grad_norm": 1.2933781147003174, - "learning_rate": 4.509551399408598e-06, - "loss": 0.0147, - "step": 405 - }, - { - "epoch": 0.283717679944095, - "grad_norm": 37.71927261352539, - "learning_rate": 4.5059151809261085e-06, - "loss": 0.5584, - "step": 406 - }, - { - "epoch": 0.28441649196366176, - "grad_norm": 45.29424285888672, - "learning_rate": 4.50226701004686e-06, - "loss": 0.8957, - "step": 407 - }, - { - "epoch": 0.2851153039832285, - "grad_norm": 16.94281578063965, - "learning_rate": 4.498606908508754e-06, - "loss": 0.3971, - "step": 408 - }, - { - "epoch": 0.28581411600279527, - "grad_norm": 54.23704528808594, - "learning_rate": 4.494934898120779e-06, - "loss": 0.9391, - "step": 409 - }, - { - "epoch": 0.286512928022362, - "grad_norm": 13.617294311523438, - "learning_rate": 4.491251000762889e-06, - "loss": 0.1867, - "step": 410 - }, - { - "epoch": 0.28721174004192873, - "grad_norm": 4.1379923820495605, - "learning_rate": 4.487555238385862e-06, - "loss": 0.0204, - "step": 411 - }, - { - "epoch": 0.28791055206149546, - "grad_norm": 18.19028663635254, - "learning_rate": 4.483847633011177e-06, - "loss": 0.3196, - "step": 412 - }, - { - "epoch": 0.2886093640810622, - "grad_norm": 7.146092891693115, - "learning_rate": 4.480128206730881e-06, - "loss": 0.1091, - "step": 413 - }, - { - "epoch": 0.2893081761006289, - "grad_norm": 14.049055099487305, - "learning_rate": 4.476396981707454e-06, - "loss": 0.2865, - "step": 414 - }, - { - "epoch": 0.29000698812019565, - "grad_norm": 6.086685657501221, - "learning_rate": 4.4726539801736815e-06, - "loss": 0.0453, - "step": 415 - }, - { - "epoch": 0.2907058001397624, - "grad_norm": 7.355288982391357, - "learning_rate": 4.4688992244325215e-06, - "loss": 0.0962, - "step": 416 - }, - { - "epoch": 0.2914046121593291, - "grad_norm": 27.951030731201172, - "learning_rate": 4.4651327368569695e-06, - "loss": 0.4766, - "step": 417 - }, - { - "epoch": 0.2921034241788959, - "grad_norm": 10.94737720489502, - "learning_rate": 4.461354539889923e-06, - "loss": 0.1603, - "step": 418 - }, - { - "epoch": 0.29280223619846263, - "grad_norm": 5.964730262756348, - "learning_rate": 4.457564656044056e-06, - "loss": 0.0751, - "step": 419 - }, - { - "epoch": 0.29350104821802936, - "grad_norm": 18.6251277923584, - "learning_rate": 4.453763107901676e-06, - "loss": 0.4091, - "step": 420 - }, - { - "epoch": 0.2941998602375961, - "grad_norm": 36.76710891723633, - "learning_rate": 4.449949918114593e-06, - "loss": 0.8475, - "step": 421 - }, - { - "epoch": 0.2948986722571628, - "grad_norm": 16.30900764465332, - "learning_rate": 4.446125109403987e-06, - "loss": 0.229, - "step": 422 - }, - { - "epoch": 0.29559748427672955, - "grad_norm": 0.9128844738006592, - "learning_rate": 4.442288704560268e-06, - "loss": 0.0206, - "step": 423 - }, - { - "epoch": 0.2962962962962963, - "grad_norm": 5.981807231903076, - "learning_rate": 4.438440726442944e-06, - "loss": 0.0885, - "step": 424 - }, - { - "epoch": 0.296995108315863, - "grad_norm": 27.69352912902832, - "learning_rate": 4.434581197980483e-06, - "loss": 0.608, - "step": 425 - }, - { - "epoch": 0.2976939203354298, - "grad_norm": 17.880767822265625, - "learning_rate": 4.430710142170176e-06, - "loss": 0.252, - "step": 426 - }, - { - "epoch": 0.2983927323549965, - "grad_norm": 5.650568962097168, - "learning_rate": 4.426827582077999e-06, - "loss": 0.0755, - "step": 427 - }, - { - "epoch": 0.29909154437456326, - "grad_norm": 13.807109832763672, - "learning_rate": 4.422933540838481e-06, - "loss": 0.1333, - "step": 428 - }, - { - "epoch": 0.29979035639413, - "grad_norm": 22.934085845947266, - "learning_rate": 4.419028041654559e-06, - "loss": 0.4995, - "step": 429 - }, - { - "epoch": 0.3004891684136967, - "grad_norm": 21.154132843017578, - "learning_rate": 4.415111107797445e-06, - "loss": 0.2275, - "step": 430 - }, - { - "epoch": 0.30118798043326345, - "grad_norm": 45.836483001708984, - "learning_rate": 4.411182762606484e-06, - "loss": 0.6165, - "step": 431 - }, - { - "epoch": 0.3018867924528302, - "grad_norm": 18.20020866394043, - "learning_rate": 4.407243029489018e-06, - "loss": 0.3395, - "step": 432 - }, - { - "epoch": 0.3025856044723969, - "grad_norm": 6.130852699279785, - "learning_rate": 4.4032919319202415e-06, - "loss": 0.0991, - "step": 433 - }, - { - "epoch": 0.30328441649196364, - "grad_norm": 12.936942100524902, - "learning_rate": 4.399329493443067e-06, - "loss": 0.1179, - "step": 434 - }, - { - "epoch": 0.3039832285115304, - "grad_norm": 12.123394966125488, - "learning_rate": 4.3953557376679856e-06, - "loss": 0.1573, - "step": 435 - }, - { - "epoch": 0.30468204053109715, - "grad_norm": 14.226133346557617, - "learning_rate": 4.391370688272919e-06, - "loss": 0.1557, - "step": 436 - }, - { - "epoch": 0.3053808525506639, - "grad_norm": 27.995567321777344, - "learning_rate": 4.387374369003083e-06, - "loss": 0.5142, - "step": 437 - }, - { - "epoch": 0.3060796645702306, - "grad_norm": 12.208261489868164, - "learning_rate": 4.383366803670849e-06, - "loss": 0.201, - "step": 438 - }, - { - "epoch": 0.30677847658979734, - "grad_norm": 27.00516128540039, - "learning_rate": 4.379348016155596e-06, - "loss": 0.4225, - "step": 439 - }, - { - "epoch": 0.3074772886093641, - "grad_norm": 7.272449493408203, - "learning_rate": 4.375318030403573e-06, - "loss": 0.0786, - "step": 440 - }, - { - "epoch": 0.3081761006289308, - "grad_norm": 0.5066769123077393, - "learning_rate": 4.3712768704277535e-06, - "loss": 0.0102, - "step": 441 - }, - { - "epoch": 0.30887491264849753, - "grad_norm": 9.54050350189209, - "learning_rate": 4.367224560307693e-06, - "loss": 0.1596, - "step": 442 - }, - { - "epoch": 0.30957372466806427, - "grad_norm": 15.01711654663086, - "learning_rate": 4.363161124189387e-06, - "loss": 0.3198, - "step": 443 - }, - { - "epoch": 0.31027253668763105, - "grad_norm": 16.74932861328125, - "learning_rate": 4.359086586285127e-06, - "loss": 0.3108, - "step": 444 - }, - { - "epoch": 0.3109713487071978, - "grad_norm": 40.40657424926758, - "learning_rate": 4.355000970873352e-06, - "loss": 0.8194, - "step": 445 - }, - { - "epoch": 0.3116701607267645, - "grad_norm": 4.776082992553711, - "learning_rate": 4.350904302298511e-06, - "loss": 0.0333, - "step": 446 - }, - { - "epoch": 0.31236897274633124, - "grad_norm": 11.798108100891113, - "learning_rate": 4.346796604970913e-06, - "loss": 0.2779, - "step": 447 - }, - { - "epoch": 0.31306778476589797, - "grad_norm": 27.99241828918457, - "learning_rate": 4.34267790336658e-06, - "loss": 0.5368, - "step": 448 - }, - { - "epoch": 0.3137665967854647, - "grad_norm": 16.165719985961914, - "learning_rate": 4.338548222027107e-06, - "loss": 0.3069, - "step": 449 - }, - { - "epoch": 0.31446540880503143, - "grad_norm": 5.398584842681885, - "learning_rate": 4.33440758555951e-06, - "loss": 0.0882, - "step": 450 - }, - { - "epoch": 0.31516422082459816, - "grad_norm": 17.657291412353516, - "learning_rate": 4.330256018636086e-06, - "loss": 0.2464, - "step": 451 - }, - { - "epoch": 0.3158630328441649, - "grad_norm": 13.588639259338379, - "learning_rate": 4.326093545994258e-06, - "loss": 0.1828, - "step": 452 - }, - { - "epoch": 0.3165618448637317, - "grad_norm": 5.476947784423828, - "learning_rate": 4.3219201924364325e-06, - "loss": 0.0618, - "step": 453 - }, - { - "epoch": 0.3172606568832984, - "grad_norm": 50.05097198486328, - "learning_rate": 4.317735982829852e-06, - "loss": 0.9741, - "step": 454 - }, - { - "epoch": 0.31795946890286514, - "grad_norm": 3.6453871726989746, - "learning_rate": 4.313540942106445e-06, - "loss": 0.0466, - "step": 455 - }, - { - "epoch": 0.31865828092243187, - "grad_norm": 5.7373785972595215, - "learning_rate": 4.309335095262675e-06, - "loss": 0.0701, - "step": 456 - }, - { - "epoch": 0.3193570929419986, - "grad_norm": 32.402801513671875, - "learning_rate": 4.305118467359402e-06, - "loss": 0.6547, - "step": 457 - }, - { - "epoch": 0.32005590496156533, - "grad_norm": 9.464690208435059, - "learning_rate": 4.300891083521717e-06, - "loss": 0.0971, - "step": 458 - }, - { - "epoch": 0.32075471698113206, - "grad_norm": 33.40875244140625, - "learning_rate": 4.296652968938807e-06, - "loss": 0.6254, - "step": 459 - }, - { - "epoch": 0.3214535290006988, - "grad_norm": 8.698691368103027, - "learning_rate": 4.2924041488637966e-06, - "loss": 0.1062, - "step": 460 - }, - { - "epoch": 0.3221523410202656, - "grad_norm": 35.50666427612305, - "learning_rate": 4.288144648613601e-06, - "loss": 0.8304, - "step": 461 - }, - { - "epoch": 0.3228511530398323, - "grad_norm": 5.5079169273376465, - "learning_rate": 4.283874493568772e-06, - "loss": 0.0682, - "step": 462 - }, - { - "epoch": 0.32354996505939904, - "grad_norm": 29.080564498901367, - "learning_rate": 4.279593709173352e-06, - "loss": 0.4879, - "step": 463 - }, - { - "epoch": 0.32424877707896577, - "grad_norm": 0.8691240549087524, - "learning_rate": 4.2753023209347164e-06, - "loss": 0.0137, - "step": 464 - }, - { - "epoch": 0.3249475890985325, - "grad_norm": 18.309951782226562, - "learning_rate": 4.2710003544234255e-06, - "loss": 0.3709, - "step": 465 - }, - { - "epoch": 0.3256464011180992, - "grad_norm": 25.955778121948242, - "learning_rate": 4.266687835273071e-06, - "loss": 0.4848, - "step": 466 - }, - { - "epoch": 0.32634521313766596, - "grad_norm": 2.9740233421325684, - "learning_rate": 4.262364789180123e-06, - "loss": 0.0299, - "step": 467 - }, - { - "epoch": 0.3270440251572327, - "grad_norm": 24.613683700561523, - "learning_rate": 4.258031241903778e-06, - "loss": 0.4708, - "step": 468 - }, - { - "epoch": 0.3277428371767994, - "grad_norm": 32.723487854003906, - "learning_rate": 4.253687219265803e-06, - "loss": 0.6666, - "step": 469 - }, - { - "epoch": 0.3284416491963662, - "grad_norm": 9.264688491821289, - "learning_rate": 4.249332747150386e-06, - "loss": 0.1299, - "step": 470 - }, - { - "epoch": 0.32914046121593293, - "grad_norm": 16.0210018157959, - "learning_rate": 4.244967851503975e-06, - "loss": 0.2999, - "step": 471 - }, - { - "epoch": 0.32983927323549966, - "grad_norm": 14.11863899230957, - "learning_rate": 4.240592558335131e-06, - "loss": 0.1809, - "step": 472 - }, - { - "epoch": 0.3305380852550664, - "grad_norm": 5.871946811676025, - "learning_rate": 4.236206893714369e-06, - "loss": 0.0786, - "step": 473 - }, - { - "epoch": 0.3312368972746331, - "grad_norm": 24.123607635498047, - "learning_rate": 4.231810883773999e-06, - "loss": 0.4737, - "step": 474 - }, - { - "epoch": 0.33193570929419985, - "grad_norm": 36.70653533935547, - "learning_rate": 4.22740455470798e-06, - "loss": 0.8122, - "step": 475 - }, - { - "epoch": 0.3326345213137666, - "grad_norm": 8.99547290802002, - "learning_rate": 4.2229879327717545e-06, - "loss": 0.0938, - "step": 476 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 10.947245597839355, - "learning_rate": 4.218561044282099e-06, - "loss": 0.1411, - "step": 477 - }, - { - "epoch": 0.33403214535290005, - "grad_norm": 9.174323081970215, - "learning_rate": 4.21412391561696e-06, - "loss": 0.1841, - "step": 478 - }, - { - "epoch": 0.33473095737246683, - "grad_norm": 5.052950859069824, - "learning_rate": 4.209676573215304e-06, - "loss": 0.0873, - "step": 479 - }, - { - "epoch": 0.33542976939203356, - "grad_norm": 3.0600333213806152, - "learning_rate": 4.205219043576955e-06, - "loss": 0.0213, - "step": 480 - }, - { - "epoch": 0.3361285814116003, - "grad_norm": 10.695812225341797, - "learning_rate": 4.200751353262442e-06, - "loss": 0.1853, - "step": 481 - }, - { - "epoch": 0.336827393431167, - "grad_norm": 24.313112258911133, - "learning_rate": 4.196273528892831e-06, - "loss": 0.3228, - "step": 482 - }, - { - "epoch": 0.33752620545073375, - "grad_norm": 3.9740796089172363, - "learning_rate": 4.191785597149577e-06, - "loss": 0.0468, - "step": 483 - }, - { - "epoch": 0.3382250174703005, - "grad_norm": 24.344791412353516, - "learning_rate": 4.1872875847743605e-06, - "loss": 0.3732, - "step": 484 - }, - { - "epoch": 0.3389238294898672, - "grad_norm": 32.8441276550293, - "learning_rate": 4.182779518568925e-06, - "loss": 0.7411, - "step": 485 - }, - { - "epoch": 0.33962264150943394, - "grad_norm": 12.237163543701172, - "learning_rate": 4.178261425394926e-06, - "loss": 0.2246, - "step": 486 - }, - { - "epoch": 0.3403214535290007, - "grad_norm": 17.052074432373047, - "learning_rate": 4.173733332173759e-06, - "loss": 0.3097, - "step": 487 - }, - { - "epoch": 0.34102026554856746, - "grad_norm": 0.5796659588813782, - "learning_rate": 4.1691952658864106e-06, - "loss": 0.013, - "step": 488 - }, - { - "epoch": 0.3417190775681342, - "grad_norm": 20.193439483642578, - "learning_rate": 4.16464725357329e-06, - "loss": 0.2634, - "step": 489 - }, - { - "epoch": 0.3424178895877009, - "grad_norm": 13.251941680908203, - "learning_rate": 4.160089322334071e-06, - "loss": 0.2221, - "step": 490 - }, - { - "epoch": 0.34311670160726765, - "grad_norm": 44.23042678833008, - "learning_rate": 4.15552149932753e-06, - "loss": 0.9051, - "step": 491 - }, - { - "epoch": 0.3438155136268344, - "grad_norm": 10.936775207519531, - "learning_rate": 4.150943811771387e-06, - "loss": 0.2734, - "step": 492 - }, - { - "epoch": 0.3445143256464011, - "grad_norm": 11.693808555603027, - "learning_rate": 4.146356286942136e-06, - "loss": 0.1532, - "step": 493 - }, - { - "epoch": 0.34521313766596784, - "grad_norm": 0.5446651577949524, - "learning_rate": 4.1417589521748895e-06, - "loss": 0.0103, - "step": 494 - }, - { - "epoch": 0.34591194968553457, - "grad_norm": 19.49287986755371, - "learning_rate": 4.137151834863213e-06, - "loss": 0.5919, - "step": 495 - }, - { - "epoch": 0.3466107617051013, - "grad_norm": 20.148540496826172, - "learning_rate": 4.1325349624589625e-06, - "loss": 0.3516, - "step": 496 - }, - { - "epoch": 0.3473095737246681, - "grad_norm": 7.5778398513793945, - "learning_rate": 4.127908362472121e-06, - "loss": 0.0888, - "step": 497 - }, - { - "epoch": 0.3480083857442348, - "grad_norm": 16.51567840576172, - "learning_rate": 4.123272062470633e-06, - "loss": 0.3252, - "step": 498 - }, - { - "epoch": 0.34870719776380155, - "grad_norm": 10.965740203857422, - "learning_rate": 4.1186260900802405e-06, - "loss": 0.1349, - "step": 499 - }, - { - "epoch": 0.3494060097833683, - "grad_norm": 21.047245025634766, - "learning_rate": 4.11397047298432e-06, - "loss": 0.3691, - "step": 500 - }, - { - "epoch": 0.3494060097833683, - "eval_loss": 0.2600938379764557, - "eval_runtime": 305.1384, - "eval_samples_per_second": 2.084, - "eval_steps_per_second": 0.521, - "step": 500 - }, - { - "epoch": 0.350104821802935, - "grad_norm": 3.7424590587615967, - "learning_rate": 4.109305238923718e-06, - "loss": 0.0433, - "step": 501 - }, - { - "epoch": 0.35080363382250174, - "grad_norm": 11.72683334350586, - "learning_rate": 4.1046304156965825e-06, - "loss": 0.1315, - "step": 502 - }, - { - "epoch": 0.35150244584206847, - "grad_norm": 20.837553024291992, - "learning_rate": 4.0999460311582e-06, - "loss": 0.1873, - "step": 503 - }, - { - "epoch": 0.3522012578616352, - "grad_norm": 18.056699752807617, - "learning_rate": 4.095252113220827e-06, - "loss": 0.3669, - "step": 504 - }, - { - "epoch": 0.352900069881202, - "grad_norm": 4.906400680541992, - "learning_rate": 4.0905486898535305e-06, - "loss": 0.0484, - "step": 505 - }, - { - "epoch": 0.3535988819007687, - "grad_norm": 25.349031448364258, - "learning_rate": 4.0858357890820115e-06, - "loss": 0.6116, - "step": 506 - }, - { - "epoch": 0.35429769392033544, - "grad_norm": 30.679439544677734, - "learning_rate": 4.081113438988443e-06, - "loss": 0.6092, - "step": 507 - }, - { - "epoch": 0.3549965059399022, - "grad_norm": 6.323155403137207, - "learning_rate": 4.076381667711306e-06, - "loss": 0.0762, - "step": 508 - }, - { - "epoch": 0.3556953179594689, - "grad_norm": 18.520811080932617, - "learning_rate": 4.071640503445217e-06, - "loss": 0.561, - "step": 509 - }, - { - "epoch": 0.35639412997903563, - "grad_norm": 8.855299949645996, - "learning_rate": 4.066889974440757e-06, - "loss": 0.1858, - "step": 510 - }, - { - "epoch": 0.35709294199860236, - "grad_norm": 22.839412689208984, - "learning_rate": 4.062130109004313e-06, - "loss": 0.3881, - "step": 511 - }, - { - "epoch": 0.3577917540181691, - "grad_norm": 10.531706809997559, - "learning_rate": 4.057360935497903e-06, - "loss": 0.2053, - "step": 512 - }, - { - "epoch": 0.3584905660377358, - "grad_norm": 0.7138981223106384, - "learning_rate": 4.052582482339004e-06, - "loss": 0.0122, - "step": 513 - }, - { - "epoch": 0.3591893780573026, - "grad_norm": 4.182406425476074, - "learning_rate": 4.047794778000394e-06, - "loss": 0.0433, - "step": 514 - }, - { - "epoch": 0.35988819007686934, - "grad_norm": 43.830116271972656, - "learning_rate": 4.0429978510099645e-06, - "loss": 0.8811, - "step": 515 - }, - { - "epoch": 0.36058700209643607, - "grad_norm": 11.225503921508789, - "learning_rate": 4.038191729950569e-06, - "loss": 0.1653, - "step": 516 - }, - { - "epoch": 0.3612858141160028, - "grad_norm": 13.19017505645752, - "learning_rate": 4.033376443459842e-06, - "loss": 0.2209, - "step": 517 - }, - { - "epoch": 0.36198462613556953, - "grad_norm": 15.181008338928223, - "learning_rate": 4.028552020230031e-06, - "loss": 0.2336, - "step": 518 - }, - { - "epoch": 0.36268343815513626, - "grad_norm": 13.00654411315918, - "learning_rate": 4.023718489007825e-06, - "loss": 0.2148, - "step": 519 - }, - { - "epoch": 0.363382250174703, - "grad_norm": 11.28083610534668, - "learning_rate": 4.018875878594184e-06, - "loss": 0.1845, - "step": 520 - }, - { - "epoch": 0.3640810621942697, - "grad_norm": 25.007339477539062, - "learning_rate": 4.014024217844167e-06, - "loss": 0.379, - "step": 521 - }, - { - "epoch": 0.36477987421383645, - "grad_norm": 9.63041877746582, - "learning_rate": 4.009163535666761e-06, - "loss": 0.177, - "step": 522 - }, - { - "epoch": 0.36547868623340324, - "grad_norm": 14.077667236328125, - "learning_rate": 4.004293861024706e-06, - "loss": 0.291, - "step": 523 - }, - { - "epoch": 0.36617749825296997, - "grad_norm": 2.34429669380188, - "learning_rate": 3.999415222934325e-06, - "loss": 0.0159, - "step": 524 - }, - { - "epoch": 0.3668763102725367, - "grad_norm": 28.44304847717285, - "learning_rate": 3.994527650465352e-06, - "loss": 0.6444, - "step": 525 - }, - { - "epoch": 0.36757512229210343, - "grad_norm": 15.48746109008789, - "learning_rate": 3.989631172740756e-06, - "loss": 0.2803, - "step": 526 - }, - { - "epoch": 0.36827393431167016, - "grad_norm": 16.812519073486328, - "learning_rate": 3.9847258189365664e-06, - "loss": 0.2712, - "step": 527 - }, - { - "epoch": 0.3689727463312369, - "grad_norm": 17.016923904418945, - "learning_rate": 3.979811618281706e-06, - "loss": 0.2577, - "step": 528 - }, - { - "epoch": 0.3696715583508036, - "grad_norm": 12.322420120239258, - "learning_rate": 3.974888600057808e-06, - "loss": 0.1538, - "step": 529 - }, - { - "epoch": 0.37037037037037035, - "grad_norm": 8.590230941772461, - "learning_rate": 3.969956793599048e-06, - "loss": 0.1597, - "step": 530 - }, - { - "epoch": 0.3710691823899371, - "grad_norm": 8.045814514160156, - "learning_rate": 3.965016228291966e-06, - "loss": 0.0748, - "step": 531 - }, - { - "epoch": 0.37176799440950387, - "grad_norm": 23.290437698364258, - "learning_rate": 3.960066933575293e-06, - "loss": 0.2692, - "step": 532 - }, - { - "epoch": 0.3724668064290706, - "grad_norm": 1.2425870895385742, - "learning_rate": 3.955108938939774e-06, - "loss": 0.0141, - "step": 533 - }, - { - "epoch": 0.3731656184486373, - "grad_norm": 20.197912216186523, - "learning_rate": 3.950142273927996e-06, - "loss": 0.2987, - "step": 534 - }, - { - "epoch": 0.37386443046820406, - "grad_norm": 10.080740928649902, - "learning_rate": 3.9451669681342034e-06, - "loss": 0.1712, - "step": 535 - }, - { - "epoch": 0.3745632424877708, - "grad_norm": 28.413753509521484, - "learning_rate": 3.940183051204133e-06, - "loss": 0.4963, - "step": 536 - }, - { - "epoch": 0.3752620545073375, - "grad_norm": 1.5597444772720337, - "learning_rate": 3.9351905528348285e-06, - "loss": 0.0369, - "step": 537 - }, - { - "epoch": 0.37596086652690425, - "grad_norm": 15.507638931274414, - "learning_rate": 3.930189502774467e-06, - "loss": 0.2956, - "step": 538 - }, - { - "epoch": 0.376659678546471, - "grad_norm": 12.51319408416748, - "learning_rate": 3.9251799308221835e-06, - "loss": 0.1758, - "step": 539 - }, - { - "epoch": 0.37735849056603776, - "grad_norm": 15.60328483581543, - "learning_rate": 3.92016186682789e-06, - "loss": 0.4238, - "step": 540 - }, - { - "epoch": 0.3780573025856045, - "grad_norm": 28.993366241455078, - "learning_rate": 3.915135340692098e-06, - "loss": 0.6373, - "step": 541 - }, - { - "epoch": 0.3787561146051712, - "grad_norm": 19.215646743774414, - "learning_rate": 3.910100382365741e-06, - "loss": 0.2849, - "step": 542 - }, - { - "epoch": 0.37945492662473795, - "grad_norm": 9.090166091918945, - "learning_rate": 3.905057021850001e-06, - "loss": 0.1784, - "step": 543 - }, - { - "epoch": 0.3801537386443047, - "grad_norm": 18.84730339050293, - "learning_rate": 3.900005289196119e-06, - "loss": 0.4435, - "step": 544 - }, - { - "epoch": 0.3808525506638714, - "grad_norm": 14.470978736877441, - "learning_rate": 3.894945214505226e-06, - "loss": 0.2638, - "step": 545 - }, - { - "epoch": 0.38155136268343814, - "grad_norm": 5.556311130523682, - "learning_rate": 3.889876827928156e-06, - "loss": 0.0836, - "step": 546 - }, - { - "epoch": 0.3822501747030049, - "grad_norm": 1.6064200401306152, - "learning_rate": 3.8848001596652765e-06, - "loss": 0.015, - "step": 547 - }, - { - "epoch": 0.3829489867225716, - "grad_norm": 15.108182907104492, - "learning_rate": 3.879715239966294e-06, - "loss": 0.3347, - "step": 548 - }, - { - "epoch": 0.3836477987421384, - "grad_norm": 20.073755264282227, - "learning_rate": 3.874622099130087e-06, - "loss": 0.3031, - "step": 549 - }, - { - "epoch": 0.3843466107617051, - "grad_norm": 37.742095947265625, - "learning_rate": 3.869520767504521e-06, - "loss": 0.7479, - "step": 550 - }, - { - "epoch": 0.38504542278127185, - "grad_norm": 17.45064926147461, - "learning_rate": 3.8644112754862614e-06, - "loss": 0.2259, - "step": 551 - }, - { - "epoch": 0.3857442348008386, - "grad_norm": 32.154056549072266, - "learning_rate": 3.8592936535206044e-06, - "loss": 0.4873, - "step": 552 - }, - { - "epoch": 0.3864430468204053, - "grad_norm": 27.295841217041016, - "learning_rate": 3.8541679321012836e-06, - "loss": 0.5003, - "step": 553 - }, - { - "epoch": 0.38714185883997204, - "grad_norm": 7.330911636352539, - "learning_rate": 3.8490341417702985e-06, - "loss": 0.0867, - "step": 554 - }, - { - "epoch": 0.38784067085953877, - "grad_norm": 16.558582305908203, - "learning_rate": 3.843892313117724e-06, - "loss": 0.2806, - "step": 555 - }, - { - "epoch": 0.3885394828791055, - "grad_norm": 16.339736938476562, - "learning_rate": 3.838742476781535e-06, - "loss": 0.2887, - "step": 556 - }, - { - "epoch": 0.38923829489867223, - "grad_norm": 10.998993873596191, - "learning_rate": 3.833584663447418e-06, - "loss": 0.2142, - "step": 557 - }, - { - "epoch": 0.389937106918239, - "grad_norm": 29.255325317382812, - "learning_rate": 3.828418903848593e-06, - "loss": 0.5292, - "step": 558 - }, - { - "epoch": 0.39063591893780575, - "grad_norm": 17.574804306030273, - "learning_rate": 3.823245228765628e-06, - "loss": 0.4972, - "step": 559 - }, - { - "epoch": 0.3913347309573725, - "grad_norm": 12.789593696594238, - "learning_rate": 3.8180636690262565e-06, - "loss": 0.1983, - "step": 560 - }, - { - "epoch": 0.3920335429769392, - "grad_norm": 6.3378520011901855, - "learning_rate": 3.812874255505191e-06, - "loss": 0.0954, - "step": 561 - }, - { - "epoch": 0.39273235499650594, - "grad_norm": 17.002426147460938, - "learning_rate": 3.8076770191239444e-06, - "loss": 0.2236, - "step": 562 - }, - { - "epoch": 0.39343116701607267, - "grad_norm": 18.199960708618164, - "learning_rate": 3.8024719908506403e-06, - "loss": 0.3397, - "step": 563 - }, - { - "epoch": 0.3941299790356394, - "grad_norm": 16.758031845092773, - "learning_rate": 3.797259201699833e-06, - "loss": 0.2647, - "step": 564 - }, - { - "epoch": 0.39482879105520613, - "grad_norm": 0.8165547847747803, - "learning_rate": 3.7920386827323186e-06, - "loss": 0.0131, - "step": 565 - }, - { - "epoch": 0.39552760307477286, - "grad_norm": 11.709540367126465, - "learning_rate": 3.786810465054953e-06, - "loss": 0.1208, - "step": 566 - }, - { - "epoch": 0.39622641509433965, - "grad_norm": 7.707137584686279, - "learning_rate": 3.7815745798204646e-06, - "loss": 0.1561, - "step": 567 - }, - { - "epoch": 0.3969252271139064, - "grad_norm": 14.922818183898926, - "learning_rate": 3.776331058227271e-06, - "loss": 0.4501, - "step": 568 - }, - { - "epoch": 0.3976240391334731, - "grad_norm": 13.77346420288086, - "learning_rate": 3.7710799315192904e-06, - "loss": 0.2852, - "step": 569 - }, - { - "epoch": 0.39832285115303984, - "grad_norm": 10.340530395507812, - "learning_rate": 3.7658212309857576e-06, - "loss": 0.1373, - "step": 570 - }, - { - "epoch": 0.39902166317260657, - "grad_norm": 16.489418029785156, - "learning_rate": 3.7605549879610346e-06, - "loss": 0.3165, - "step": 571 - }, - { - "epoch": 0.3997204751921733, - "grad_norm": 0.4466027021408081, - "learning_rate": 3.755281233824428e-06, - "loss": 0.0088, - "step": 572 - }, - { - "epoch": 0.40041928721174, - "grad_norm": 35.0068359375, - "learning_rate": 3.7500000000000005e-06, - "loss": 0.6015, - "step": 573 - }, - { - "epoch": 0.40111809923130676, - "grad_norm": 34.37299728393555, - "learning_rate": 3.74471131795638e-06, - "loss": 0.6125, - "step": 574 - }, - { - "epoch": 0.4018169112508735, - "grad_norm": 15.488386154174805, - "learning_rate": 3.739415219206577e-06, - "loss": 0.3235, - "step": 575 - }, - { - "epoch": 0.4025157232704403, - "grad_norm": 11.364415168762207, - "learning_rate": 3.7341117353077964e-06, - "loss": 0.2425, - "step": 576 - }, - { - "epoch": 0.403214535290007, - "grad_norm": 17.152284622192383, - "learning_rate": 3.7288008978612457e-06, - "loss": 0.3821, - "step": 577 - }, - { - "epoch": 0.40391334730957373, - "grad_norm": 8.417110443115234, - "learning_rate": 3.72348273851195e-06, - "loss": 0.126, - "step": 578 - }, - { - "epoch": 0.40461215932914046, - "grad_norm": 8.288241386413574, - "learning_rate": 3.718157288948563e-06, - "loss": 0.0613, - "step": 579 - }, - { - "epoch": 0.4053109713487072, - "grad_norm": 19.758319854736328, - "learning_rate": 3.7128245809031765e-06, - "loss": 0.3416, - "step": 580 - }, - { - "epoch": 0.4060097833682739, - "grad_norm": 38.723880767822266, - "learning_rate": 3.7074846461511336e-06, - "loss": 1.0351, - "step": 581 - }, - { - "epoch": 0.40670859538784065, - "grad_norm": 31.21034049987793, - "learning_rate": 3.702137516510838e-06, - "loss": 0.8269, - "step": 582 - }, - { - "epoch": 0.4074074074074074, - "grad_norm": 9.645279884338379, - "learning_rate": 3.6967832238435645e-06, - "loss": 0.13, - "step": 583 - }, - { - "epoch": 0.40810621942697417, - "grad_norm": 49.4558219909668, - "learning_rate": 3.6914218000532697e-06, - "loss": 0.9998, - "step": 584 - }, - { - "epoch": 0.4088050314465409, - "grad_norm": 12.992410659790039, - "learning_rate": 3.686053277086401e-06, - "loss": 0.1639, - "step": 585 - }, - { - "epoch": 0.40950384346610763, - "grad_norm": 4.361360549926758, - "learning_rate": 3.6806776869317074e-06, - "loss": 0.0342, - "step": 586 - }, - { - "epoch": 0.41020265548567436, - "grad_norm": 24.56938934326172, - "learning_rate": 3.675295061620047e-06, - "loss": 0.4526, - "step": 587 - }, - { - "epoch": 0.4109014675052411, - "grad_norm": 15.994097709655762, - "learning_rate": 3.669905433224199e-06, - "loss": 0.2717, - "step": 588 - }, - { - "epoch": 0.4116002795248078, - "grad_norm": 12.24364948272705, - "learning_rate": 3.66450883385867e-06, - "loss": 0.1582, - "step": 589 - }, - { - "epoch": 0.41229909154437455, - "grad_norm": 30.853675842285156, - "learning_rate": 3.6591052956795043e-06, - "loss": 0.7669, - "step": 590 - }, - { - "epoch": 0.4129979035639413, - "grad_norm": 13.666319847106934, - "learning_rate": 3.6536948508840915e-06, - "loss": 0.1385, - "step": 591 - }, - { - "epoch": 0.413696715583508, - "grad_norm": 17.81270980834961, - "learning_rate": 3.648277531710974e-06, - "loss": 0.2861, - "step": 592 - }, - { - "epoch": 0.4143955276030748, - "grad_norm": 30.490543365478516, - "learning_rate": 3.6428533704396566e-06, - "loss": 0.5916, - "step": 593 - }, - { - "epoch": 0.41509433962264153, - "grad_norm": 4.1663618087768555, - "learning_rate": 3.637422399390413e-06, - "loss": 0.0467, - "step": 594 - }, - { - "epoch": 0.41579315164220826, - "grad_norm": 35.07849884033203, - "learning_rate": 3.631984650924094e-06, - "loss": 0.7829, - "step": 595 - }, - { - "epoch": 0.416491963661775, - "grad_norm": 9.993618965148926, - "learning_rate": 3.6265401574419316e-06, - "loss": 0.1388, - "step": 596 - }, - { - "epoch": 0.4171907756813417, - "grad_norm": 20.32595443725586, - "learning_rate": 3.621088951385353e-06, - "loss": 0.2625, - "step": 597 - }, - { - "epoch": 0.41788958770090845, - "grad_norm": 11.669280052185059, - "learning_rate": 3.615631065235779e-06, - "loss": 0.1614, - "step": 598 - }, - { - "epoch": 0.4185883997204752, - "grad_norm": 9.180257797241211, - "learning_rate": 3.6101665315144357e-06, - "loss": 0.1378, - "step": 599 - }, - { - "epoch": 0.4192872117400419, - "grad_norm": 9.609012603759766, - "learning_rate": 3.604695382782159e-06, - "loss": 0.1165, - "step": 600 - }, - { - "epoch": 0.41998602375960864, - "grad_norm": 6.068733215332031, - "learning_rate": 3.5992176516392007e-06, - "loss": 0.0884, - "step": 601 - }, - { - "epoch": 0.4206848357791754, - "grad_norm": 17.611309051513672, - "learning_rate": 3.593733370725035e-06, - "loss": 0.3073, - "step": 602 - }, - { - "epoch": 0.42138364779874216, - "grad_norm": 5.565227508544922, - "learning_rate": 3.5882425727181625e-06, - "loss": 0.0648, - "step": 603 - }, - { - "epoch": 0.4220824598183089, - "grad_norm": 9.799570083618164, - "learning_rate": 3.5827452903359174e-06, - "loss": 0.1383, - "step": 604 - }, - { - "epoch": 0.4227812718378756, - "grad_norm": 42.80298614501953, - "learning_rate": 3.5772415563342703e-06, - "loss": 0.9584, - "step": 605 - }, - { - "epoch": 0.42348008385744235, - "grad_norm": 11.341519355773926, - "learning_rate": 3.5717314035076355e-06, - "loss": 0.1623, - "step": 606 - }, - { - "epoch": 0.4241788958770091, - "grad_norm": 5.312375545501709, - "learning_rate": 3.566214864688674e-06, - "loss": 0.044, - "step": 607 - }, - { - "epoch": 0.4248777078965758, - "grad_norm": 11.391216278076172, - "learning_rate": 3.5606919727480984e-06, - "loss": 0.1592, - "step": 608 - }, - { - "epoch": 0.42557651991614254, - "grad_norm": 8.821585655212402, - "learning_rate": 3.555162760594475e-06, - "loss": 0.0636, - "step": 609 - }, - { - "epoch": 0.42627533193570927, - "grad_norm": 36.22662353515625, - "learning_rate": 3.549627261174032e-06, - "loss": 0.7925, - "step": 610 - }, - { - "epoch": 0.42697414395527605, - "grad_norm": 18.937807083129883, - "learning_rate": 3.54408550747046e-06, - "loss": 0.2626, - "step": 611 - }, - { - "epoch": 0.4276729559748428, - "grad_norm": 20.100234985351562, - "learning_rate": 3.5385375325047167e-06, - "loss": 0.4096, - "step": 612 - }, - { - "epoch": 0.4283717679944095, - "grad_norm": 24.95952033996582, - "learning_rate": 3.532983369334827e-06, - "loss": 0.5348, - "step": 613 - }, - { - "epoch": 0.42907058001397624, - "grad_norm": 13.789188385009766, - "learning_rate": 3.527423051055692e-06, - "loss": 0.2886, - "step": 614 - }, - { - "epoch": 0.429769392033543, - "grad_norm": 22.10971450805664, - "learning_rate": 3.5218566107988872e-06, - "loss": 0.3529, - "step": 615 - }, - { - "epoch": 0.4304682040531097, - "grad_norm": 12.709105491638184, - "learning_rate": 3.516284081732466e-06, - "loss": 0.1759, - "step": 616 - }, - { - "epoch": 0.43116701607267643, - "grad_norm": 9.625626564025879, - "learning_rate": 3.5107054970607624e-06, - "loss": 0.1221, - "step": 617 - }, - { - "epoch": 0.43186582809224316, - "grad_norm": 35.25808334350586, - "learning_rate": 3.505120890024195e-06, - "loss": 0.6798, - "step": 618 - }, - { - "epoch": 0.43256464011180995, - "grad_norm": 26.348852157592773, - "learning_rate": 3.499530293899064e-06, - "loss": 0.583, - "step": 619 - }, - { - "epoch": 0.4332634521313767, - "grad_norm": 23.911104202270508, - "learning_rate": 3.4939337419973584e-06, - "loss": 0.1785, - "step": 620 - }, - { - "epoch": 0.4339622641509434, - "grad_norm": 16.0843563079834, - "learning_rate": 3.4883312676665537e-06, - "loss": 0.1151, - "step": 621 - }, - { - "epoch": 0.43466107617051014, - "grad_norm": 0.3315320312976837, - "learning_rate": 3.4827229042894174e-06, - "loss": 0.0061, - "step": 622 - }, - { - "epoch": 0.43535988819007687, - "grad_norm": 16.160917282104492, - "learning_rate": 3.477108685283803e-06, - "loss": 0.3259, - "step": 623 - }, - { - "epoch": 0.4360587002096436, - "grad_norm": 4.331777095794678, - "learning_rate": 3.4714886441024576e-06, - "loss": 0.0253, - "step": 624 - }, - { - "epoch": 0.43675751222921033, - "grad_norm": 28.858049392700195, - "learning_rate": 3.4658628142328215e-06, - "loss": 0.603, - "step": 625 - }, - { - "epoch": 0.43745632424877706, - "grad_norm": 32.425533294677734, - "learning_rate": 3.460231229196826e-06, - "loss": 0.6591, - "step": 626 - }, - { - "epoch": 0.4381551362683438, - "grad_norm": 3.895393133163452, - "learning_rate": 3.4545939225506935e-06, - "loss": 0.0455, - "step": 627 - }, - { - "epoch": 0.4388539482879106, - "grad_norm": 0.6910443902015686, - "learning_rate": 3.4489509278847415e-06, - "loss": 0.0167, - "step": 628 - }, - { - "epoch": 0.4395527603074773, - "grad_norm": 0.4243311583995819, - "learning_rate": 3.443302278823178e-06, - "loss": 0.0098, - "step": 629 - }, - { - "epoch": 0.44025157232704404, - "grad_norm": 4.930887222290039, - "learning_rate": 3.437648009023905e-06, - "loss": 0.0569, - "step": 630 - }, - { - "epoch": 0.44095038434661077, - "grad_norm": 15.226705551147461, - "learning_rate": 3.431988152178315e-06, - "loss": 0.1541, - "step": 631 - }, - { - "epoch": 0.4416491963661775, - "grad_norm": 16.80103874206543, - "learning_rate": 3.4263227420110905e-06, - "loss": 0.3672, - "step": 632 - }, - { - "epoch": 0.44234800838574423, - "grad_norm": 18.1402587890625, - "learning_rate": 3.420651812280006e-06, - "loss": 0.3196, - "step": 633 - }, - { - "epoch": 0.44304682040531096, - "grad_norm": 20.149139404296875, - "learning_rate": 3.414975396775724e-06, - "loss": 0.2924, - "step": 634 - }, - { - "epoch": 0.4437456324248777, - "grad_norm": 34.62575912475586, - "learning_rate": 3.409293529321593e-06, - "loss": 0.7303, - "step": 635 - }, - { - "epoch": 0.4444444444444444, - "grad_norm": 4.3120198249816895, - "learning_rate": 3.4036062437734484e-06, - "loss": 0.0809, - "step": 636 - }, - { - "epoch": 0.4451432564640112, - "grad_norm": 18.546066284179688, - "learning_rate": 3.39791357401941e-06, - "loss": 0.2833, - "step": 637 - }, - { - "epoch": 0.44584206848357794, - "grad_norm": 3.934873342514038, - "learning_rate": 3.39221555397968e-06, - "loss": 0.0441, - "step": 638 - }, - { - "epoch": 0.44654088050314467, - "grad_norm": 9.060565948486328, - "learning_rate": 3.386512217606339e-06, - "loss": 0.1529, - "step": 639 - }, - { - "epoch": 0.4472396925227114, - "grad_norm": 18.467248916625977, - "learning_rate": 3.3808035988831483e-06, - "loss": 0.3309, - "step": 640 - }, - { - "epoch": 0.4479385045422781, - "grad_norm": 44.660743713378906, - "learning_rate": 3.3750897318253407e-06, - "loss": 0.609, - "step": 641 - }, - { - "epoch": 0.44863731656184486, - "grad_norm": 4.418940544128418, - "learning_rate": 3.369370650479425e-06, - "loss": 0.0616, - "step": 642 - }, - { - "epoch": 0.4493361285814116, - "grad_norm": 27.10004425048828, - "learning_rate": 3.363646388922978e-06, - "loss": 0.5705, - "step": 643 - }, - { - "epoch": 0.4500349406009783, - "grad_norm": 15.004708290100098, - "learning_rate": 3.3579169812644434e-06, - "loss": 0.2559, - "step": 644 - }, - { - "epoch": 0.45073375262054505, - "grad_norm": 4.751020908355713, - "learning_rate": 3.352182461642929e-06, - "loss": 0.0597, - "step": 645 - }, - { - "epoch": 0.45143256464011183, - "grad_norm": 13.198694229125977, - "learning_rate": 3.3464428642280004e-06, - "loss": 0.287, - "step": 646 - }, - { - "epoch": 0.45213137665967856, - "grad_norm": 13.90754222869873, - "learning_rate": 3.340698223219484e-06, - "loss": 0.2928, - "step": 647 - }, - { - "epoch": 0.4528301886792453, - "grad_norm": 0.8066713809967041, - "learning_rate": 3.3349485728472536e-06, - "loss": 0.0148, - "step": 648 - }, - { - "epoch": 0.453529000698812, - "grad_norm": 15.261994361877441, - "learning_rate": 3.329193947371036e-06, - "loss": 0.3029, - "step": 649 - }, - { - "epoch": 0.45422781271837875, - "grad_norm": 9.391402244567871, - "learning_rate": 3.3234343810801995e-06, - "loss": 0.0981, - "step": 650 - }, - { - "epoch": 0.4549266247379455, - "grad_norm": 7.278449058532715, - "learning_rate": 3.3176699082935546e-06, - "loss": 0.0663, - "step": 651 - }, - { - "epoch": 0.4556254367575122, - "grad_norm": 13.300775527954102, - "learning_rate": 3.3119005633591462e-06, - "loss": 0.2691, - "step": 652 - }, - { - "epoch": 0.45632424877707894, - "grad_norm": 6.767027378082275, - "learning_rate": 3.3061263806540513e-06, - "loss": 0.0936, - "step": 653 - }, - { - "epoch": 0.4570230607966457, - "grad_norm": 17.303125381469727, - "learning_rate": 3.3003473945841725e-06, - "loss": 0.3734, - "step": 654 - }, - { - "epoch": 0.45772187281621246, - "grad_norm": 15.9292631149292, - "learning_rate": 3.294563639584034e-06, - "loss": 0.2842, - "step": 655 - }, - { - "epoch": 0.4584206848357792, - "grad_norm": 15.620192527770996, - "learning_rate": 3.2887751501165755e-06, - "loss": 0.3369, - "step": 656 - }, - { - "epoch": 0.4591194968553459, - "grad_norm": 28.214311599731445, - "learning_rate": 3.282981960672948e-06, - "loss": 0.5888, - "step": 657 - }, - { - "epoch": 0.45981830887491265, - "grad_norm": 8.121265411376953, - "learning_rate": 3.2771841057723064e-06, - "loss": 0.1366, - "step": 658 - }, - { - "epoch": 0.4605171208944794, - "grad_norm": 9.672018051147461, - "learning_rate": 3.2713816199616078e-06, - "loss": 0.1612, - "step": 659 - }, - { - "epoch": 0.4612159329140461, - "grad_norm": 32.084754943847656, - "learning_rate": 3.265574537815398e-06, - "loss": 0.5014, - "step": 660 - }, - { - "epoch": 0.46191474493361284, - "grad_norm": 14.26152515411377, - "learning_rate": 3.2597628939356174e-06, - "loss": 0.2296, - "step": 661 - }, - { - "epoch": 0.46261355695317957, - "grad_norm": 37.188236236572266, - "learning_rate": 3.2539467229513816e-06, - "loss": 0.9165, - "step": 662 - }, - { - "epoch": 0.46331236897274636, - "grad_norm": 7.820224761962891, - "learning_rate": 3.248126059518785e-06, - "loss": 0.107, - "step": 663 - }, - { - "epoch": 0.4640111809923131, - "grad_norm": 15.722857475280762, - "learning_rate": 3.2423009383206876e-06, - "loss": 0.2378, - "step": 664 - }, - { - "epoch": 0.4647099930118798, - "grad_norm": 19.045873641967773, - "learning_rate": 3.236471394066515e-06, - "loss": 0.2276, - "step": 665 - }, - { - "epoch": 0.46540880503144655, - "grad_norm": 12.427132606506348, - "learning_rate": 3.2306374614920434e-06, - "loss": 0.1354, - "step": 666 - }, - { - "epoch": 0.4661076170510133, - "grad_norm": 15.619916915893555, - "learning_rate": 3.2247991753592018e-06, - "loss": 0.2482, - "step": 667 - }, - { - "epoch": 0.46680642907058, - "grad_norm": 16.29061508178711, - "learning_rate": 3.2189565704558573e-06, - "loss": 0.317, - "step": 668 - }, - { - "epoch": 0.46750524109014674, - "grad_norm": 32.359222412109375, - "learning_rate": 3.213109681595612e-06, - "loss": 0.7687, - "step": 669 - }, - { - "epoch": 0.46820405310971347, - "grad_norm": 16.902082443237305, - "learning_rate": 3.2072585436175927e-06, - "loss": 0.2544, - "step": 670 - }, - { - "epoch": 0.4689028651292802, - "grad_norm": 13.869462966918945, - "learning_rate": 3.201403191386247e-06, - "loss": 0.304, - "step": 671 - }, - { - "epoch": 0.469601677148847, - "grad_norm": 23.443647384643555, - "learning_rate": 3.195543659791132e-06, - "loss": 0.5109, - "step": 672 - }, - { - "epoch": 0.4703004891684137, - "grad_norm": 23.310325622558594, - "learning_rate": 3.189679983746708e-06, - "loss": 0.5522, - "step": 673 - }, - { - "epoch": 0.47099930118798045, - "grad_norm": 8.290467262268066, - "learning_rate": 3.1838121981921307e-06, - "loss": 0.1214, - "step": 674 - }, - { - "epoch": 0.4716981132075472, - "grad_norm": 8.271326065063477, - "learning_rate": 3.177940338091043e-06, - "loss": 0.1217, - "step": 675 - }, - { - "epoch": 0.4723969252271139, - "grad_norm": 8.18448257446289, - "learning_rate": 3.1720644384313647e-06, - "loss": 0.1252, - "step": 676 - }, - { - "epoch": 0.47309573724668064, - "grad_norm": 6.801828384399414, - "learning_rate": 3.1661845342250874e-06, - "loss": 0.0621, - "step": 677 - }, - { - "epoch": 0.47379454926624737, - "grad_norm": 30.326765060424805, - "learning_rate": 3.1603006605080642e-06, - "loss": 0.8601, - "step": 678 - }, - { - "epoch": 0.4744933612858141, - "grad_norm": 7.404317855834961, - "learning_rate": 3.154412852339798e-06, - "loss": 0.1335, - "step": 679 - }, - { - "epoch": 0.4751921733053808, - "grad_norm": 20.994054794311523, - "learning_rate": 3.1485211448032397e-06, - "loss": 0.3439, - "step": 680 - }, - { - "epoch": 0.4758909853249476, - "grad_norm": 7.629608631134033, - "learning_rate": 3.1426255730045703e-06, - "loss": 0.1024, - "step": 681 - }, - { - "epoch": 0.47658979734451434, - "grad_norm": 4.6212968826293945, - "learning_rate": 3.1367261720730007e-06, - "loss": 0.0579, - "step": 682 - }, - { - "epoch": 0.4772886093640811, - "grad_norm": 14.404751777648926, - "learning_rate": 3.1308229771605546e-06, - "loss": 0.2595, - "step": 683 - }, - { - "epoch": 0.4779874213836478, - "grad_norm": 23.75787925720215, - "learning_rate": 3.1249160234418646e-06, - "loss": 0.3531, - "step": 684 - }, - { - "epoch": 0.47868623340321453, - "grad_norm": 16.153486251831055, - "learning_rate": 3.1190053461139584e-06, - "loss": 0.3444, - "step": 685 - }, - { - "epoch": 0.47938504542278126, - "grad_norm": 14.183429718017578, - "learning_rate": 3.1130909803960533e-06, - "loss": 0.2709, - "step": 686 - }, - { - "epoch": 0.480083857442348, - "grad_norm": 38.140480041503906, - "learning_rate": 3.107172961529343e-06, - "loss": 0.7512, - "step": 687 - }, - { - "epoch": 0.4807826694619147, - "grad_norm": 32.07691955566406, - "learning_rate": 3.101251324776788e-06, - "loss": 0.5958, - "step": 688 - }, - { - "epoch": 0.48148148148148145, - "grad_norm": 15.85103988647461, - "learning_rate": 3.095326105422908e-06, - "loss": 0.3813, - "step": 689 - }, - { - "epoch": 0.48218029350104824, - "grad_norm": 24.13711166381836, - "learning_rate": 3.089397338773569e-06, - "loss": 0.3759, - "step": 690 - }, - { - "epoch": 0.48287910552061497, - "grad_norm": 5.350378036499023, - "learning_rate": 3.0834650601557724e-06, - "loss": 0.0628, - "step": 691 - }, - { - "epoch": 0.4835779175401817, - "grad_norm": 27.808002471923828, - "learning_rate": 3.07752930491745e-06, - "loss": 0.6048, - "step": 692 - }, - { - "epoch": 0.48427672955974843, - "grad_norm": 15.739814758300781, - "learning_rate": 3.071590108427244e-06, - "loss": 0.1894, - "step": 693 - }, - { - "epoch": 0.48497554157931516, - "grad_norm": 33.10055923461914, - "learning_rate": 3.0656475060743065e-06, - "loss": 0.6215, - "step": 694 - }, - { - "epoch": 0.4856743535988819, - "grad_norm": 10.470149040222168, - "learning_rate": 3.0597015332680792e-06, - "loss": 0.2044, - "step": 695 - }, - { - "epoch": 0.4863731656184486, - "grad_norm": 6.0774617195129395, - "learning_rate": 3.0537522254380902e-06, - "loss": 0.0864, - "step": 696 - }, - { - "epoch": 0.48707197763801535, - "grad_norm": 37.91016387939453, - "learning_rate": 3.047799618033739e-06, - "loss": 0.6288, - "step": 697 - }, - { - "epoch": 0.48777078965758214, - "grad_norm": 3.7894229888916016, - "learning_rate": 3.041843746524085e-06, - "loss": 0.0553, - "step": 698 - }, - { - "epoch": 0.48846960167714887, - "grad_norm": 0.4576641619205475, - "learning_rate": 3.035884646397637e-06, - "loss": 0.0051, - "step": 699 - }, - { - "epoch": 0.4891684136967156, - "grad_norm": 5.785749912261963, - "learning_rate": 3.029922353162143e-06, - "loss": 0.0615, - "step": 700 - }, - { - "epoch": 0.48986722571628233, - "grad_norm": 6.178563117980957, - "learning_rate": 3.0239569023443756e-06, - "loss": 0.0909, - "step": 701 - }, - { - "epoch": 0.49056603773584906, - "grad_norm": 7.636557102203369, - "learning_rate": 3.017988329489923e-06, - "loss": 0.1073, - "step": 702 - }, - { - "epoch": 0.4912648497554158, - "grad_norm": 41.79106903076172, - "learning_rate": 3.012016670162977e-06, - "loss": 0.7351, - "step": 703 - }, - { - "epoch": 0.4919636617749825, - "grad_norm": 0.37176260352134705, - "learning_rate": 3.00604195994612e-06, - "loss": 0.0105, - "step": 704 - }, - { - "epoch": 0.49266247379454925, - "grad_norm": 9.574935913085938, - "learning_rate": 3.0000642344401115e-06, - "loss": 0.0977, - "step": 705 - }, - { - "epoch": 0.493361285814116, - "grad_norm": 5.7922587394714355, - "learning_rate": 2.9940835292636806e-06, - "loss": 0.0841, - "step": 706 - }, - { - "epoch": 0.49406009783368277, - "grad_norm": 7.826452732086182, - "learning_rate": 2.9880998800533095e-06, - "loss": 0.079, - "step": 707 - }, - { - "epoch": 0.4947589098532495, - "grad_norm": 26.17197608947754, - "learning_rate": 2.9821133224630226e-06, - "loss": 0.4604, - "step": 708 - }, - { - "epoch": 0.4954577218728162, - "grad_norm": 7.133215427398682, - "learning_rate": 2.9761238921641753e-06, - "loss": 0.0866, - "step": 709 - }, - { - "epoch": 0.49615653389238296, - "grad_norm": 15.51923656463623, - "learning_rate": 2.970131624845239e-06, - "loss": 0.352, - "step": 710 - }, - { - "epoch": 0.4968553459119497, - "grad_norm": 39.22820281982422, - "learning_rate": 2.9641365562115886e-06, - "loss": 0.834, - "step": 711 - }, - { - "epoch": 0.4975541579315164, - "grad_norm": 17.42025375366211, - "learning_rate": 2.958138721985294e-06, - "loss": 0.2244, - "step": 712 - }, - { - "epoch": 0.49825296995108315, - "grad_norm": 34.302284240722656, - "learning_rate": 2.9521381579049026e-06, - "loss": 0.6057, - "step": 713 - }, - { - "epoch": 0.4989517819706499, - "grad_norm": 9.785982131958008, - "learning_rate": 2.9461348997252263e-06, - "loss": 0.1302, - "step": 714 - }, - { - "epoch": 0.4996505939902166, - "grad_norm": 35.624393463134766, - "learning_rate": 2.9401289832171325e-06, - "loss": 0.8042, - "step": 715 - }, - { - "epoch": 0.5003494060097834, - "grad_norm": 36.01160430908203, - "learning_rate": 2.9341204441673267e-06, - "loss": 0.7304, - "step": 716 - }, - { - "epoch": 0.5010482180293501, - "grad_norm": 1.0751196146011353, - "learning_rate": 2.9281093183781406e-06, - "loss": 0.013, - "step": 717 - }, - { - "epoch": 0.5017470300489169, - "grad_norm": 0.673009991645813, - "learning_rate": 2.922095641667322e-06, - "loss": 0.015, - "step": 718 - }, - { - "epoch": 0.5024458420684835, - "grad_norm": 10.420042991638184, - "learning_rate": 2.9160794498678164e-06, - "loss": 0.0958, - "step": 719 - }, - { - "epoch": 0.5031446540880503, - "grad_norm": 8.547834396362305, - "learning_rate": 2.9100607788275547e-06, - "loss": 0.1287, - "step": 720 - }, - { - "epoch": 0.5038434661076171, - "grad_norm": 9.815986633300781, - "learning_rate": 2.904039664409244e-06, - "loss": 0.1443, - "step": 721 - }, - { - "epoch": 0.5045422781271838, - "grad_norm": 17.126943588256836, - "learning_rate": 2.8980161424901453e-06, - "loss": 0.2772, - "step": 722 - }, - { - "epoch": 0.5052410901467506, - "grad_norm": 17.710704803466797, - "learning_rate": 2.8919902489618713e-06, - "loss": 0.2407, - "step": 723 - }, - { - "epoch": 0.5059399021663172, - "grad_norm": 17.003799438476562, - "learning_rate": 2.8859620197301584e-06, - "loss": 0.3684, - "step": 724 - }, - { - "epoch": 0.506638714185884, - "grad_norm": 45.78942108154297, - "learning_rate": 2.879931490714669e-06, - "loss": 0.6906, - "step": 725 - }, - { - "epoch": 0.5073375262054507, - "grad_norm": 17.234827041625977, - "learning_rate": 2.8738986978487625e-06, - "loss": 0.1731, - "step": 726 - }, - { - "epoch": 0.5080363382250175, - "grad_norm": 19.101106643676758, - "learning_rate": 2.8678636770792907e-06, - "loss": 0.2642, - "step": 727 - }, - { - "epoch": 0.5087351502445842, - "grad_norm": 3.6941137313842773, - "learning_rate": 2.8618264643663783e-06, - "loss": 0.0501, - "step": 728 - }, - { - "epoch": 0.5094339622641509, - "grad_norm": 41.841102600097656, - "learning_rate": 2.8557870956832135e-06, - "loss": 0.8272, - "step": 729 - }, - { - "epoch": 0.5101327742837177, - "grad_norm": 3.8646914958953857, - "learning_rate": 2.8497456070158285e-06, - "loss": 0.0449, - "step": 730 - }, - { - "epoch": 0.5108315863032844, - "grad_norm": 9.886859893798828, - "learning_rate": 2.8437020343628896e-06, - "loss": 0.1389, - "step": 731 - }, - { - "epoch": 0.5115303983228512, - "grad_norm": 13.040285110473633, - "learning_rate": 2.8376564137354797e-06, - "loss": 0.1109, - "step": 732 - }, - { - "epoch": 0.5122292103424179, - "grad_norm": 26.846677780151367, - "learning_rate": 2.831608781156885e-06, - "loss": 0.4851, - "step": 733 - }, - { - "epoch": 0.5129280223619846, - "grad_norm": 12.580215454101562, - "learning_rate": 2.82555917266238e-06, - "loss": 0.2044, - "step": 734 - }, - { - "epoch": 0.5136268343815513, - "grad_norm": 15.406930923461914, - "learning_rate": 2.8195076242990124e-06, - "loss": 0.2733, - "step": 735 - }, - { - "epoch": 0.5143256464011181, - "grad_norm": 14.674403190612793, - "learning_rate": 2.813454172125389e-06, - "loss": 0.1848, - "step": 736 - }, - { - "epoch": 0.5150244584206848, - "grad_norm": 3.3575496673583984, - "learning_rate": 2.80739885221146e-06, - "loss": 0.0383, - "step": 737 - }, - { - "epoch": 0.5157232704402516, - "grad_norm": 31.089427947998047, - "learning_rate": 2.8013417006383078e-06, - "loss": 0.6764, - "step": 738 - }, - { - "epoch": 0.5164220824598184, - "grad_norm": 16.392793655395508, - "learning_rate": 2.7952827534979247e-06, - "loss": 0.4091, - "step": 739 - }, - { - "epoch": 0.517120894479385, - "grad_norm": 29.370450973510742, - "learning_rate": 2.7892220468930044e-06, - "loss": 0.5504, - "step": 740 - }, - { - "epoch": 0.5178197064989518, - "grad_norm": 13.022504806518555, - "learning_rate": 2.783159616936723e-06, - "loss": 0.1726, - "step": 741 - }, - { - "epoch": 0.5185185185185185, - "grad_norm": 3.6434409618377686, - "learning_rate": 2.7770954997525277e-06, - "loss": 0.0425, - "step": 742 - }, - { - "epoch": 0.5192173305380853, - "grad_norm": 20.857698440551758, - "learning_rate": 2.7710297314739164e-06, - "loss": 0.4138, - "step": 743 - }, - { - "epoch": 0.519916142557652, - "grad_norm": 8.907276153564453, - "learning_rate": 2.764962348244228e-06, - "loss": 0.1691, - "step": 744 - }, - { - "epoch": 0.5206149545772187, - "grad_norm": 8.210733413696289, - "learning_rate": 2.7588933862164198e-06, - "loss": 0.1415, - "step": 745 - }, - { - "epoch": 0.5213137665967854, - "grad_norm": 9.880792617797852, - "learning_rate": 2.7528228815528622e-06, - "loss": 0.0766, - "step": 746 - }, - { - "epoch": 0.5220125786163522, - "grad_norm": 37.39931869506836, - "learning_rate": 2.746750870425114e-06, - "loss": 0.4727, - "step": 747 - }, - { - "epoch": 0.522711390635919, - "grad_norm": 16.655193328857422, - "learning_rate": 2.7406773890137104e-06, - "loss": 0.3948, - "step": 748 - }, - { - "epoch": 0.5234102026554857, - "grad_norm": 0.31611526012420654, - "learning_rate": 2.7346024735079483e-06, - "loss": 0.0077, - "step": 749 - }, - { - "epoch": 0.5241090146750524, - "grad_norm": 38.12084197998047, - "learning_rate": 2.72852616010567e-06, - "loss": 0.7144, - "step": 750 - }, - { - "epoch": 0.5248078266946191, - "grad_norm": 9.617975234985352, - "learning_rate": 2.722448485013046e-06, - "loss": 0.1428, - "step": 751 - }, - { - "epoch": 0.5255066387141859, - "grad_norm": 13.710186958312988, - "learning_rate": 2.7163694844443617e-06, - "loss": 0.1987, - "step": 752 - }, - { - "epoch": 0.5262054507337526, - "grad_norm": 13.954463005065918, - "learning_rate": 2.7102891946217998e-06, - "loss": 0.1896, - "step": 753 - }, - { - "epoch": 0.5269042627533194, - "grad_norm": 8.394665718078613, - "learning_rate": 2.7042076517752264e-06, - "loss": 0.0878, - "step": 754 - }, - { - "epoch": 0.527603074772886, - "grad_norm": 38.6686897277832, - "learning_rate": 2.6981248921419713e-06, - "loss": 0.8208, - "step": 755 - }, - { - "epoch": 0.5283018867924528, - "grad_norm": 8.702239036560059, - "learning_rate": 2.6920409519666173e-06, - "loss": 0.1404, - "step": 756 - }, - { - "epoch": 0.5290006988120196, - "grad_norm": 16.821306228637695, - "learning_rate": 2.68595586750078e-06, - "loss": 0.3416, - "step": 757 - }, - { - "epoch": 0.5296995108315863, - "grad_norm": 18.222768783569336, - "learning_rate": 2.679869675002894e-06, - "loss": 0.2438, - "step": 758 - }, - { - "epoch": 0.5303983228511531, - "grad_norm": 9.650904655456543, - "learning_rate": 2.673782410737995e-06, - "loss": 0.13, - "step": 759 - }, - { - "epoch": 0.5310971348707197, - "grad_norm": 29.527938842773438, - "learning_rate": 2.667694110977506e-06, - "loss": 0.6485, - "step": 760 - }, - { - "epoch": 0.5317959468902865, - "grad_norm": 30.556961059570312, - "learning_rate": 2.6616048119990214e-06, - "loss": 0.5844, - "step": 761 - }, - { - "epoch": 0.5324947589098532, - "grad_norm": 5.451582431793213, - "learning_rate": 2.6555145500860864e-06, - "loss": 0.0668, - "step": 762 - }, - { - "epoch": 0.53319357092942, - "grad_norm": 34.449161529541016, - "learning_rate": 2.6494233615279865e-06, - "loss": 0.6288, - "step": 763 - }, - { - "epoch": 0.5338923829489868, - "grad_norm": 31.49390983581543, - "learning_rate": 2.6433312826195266e-06, - "loss": 0.7115, - "step": 764 - }, - { - "epoch": 0.5345911949685535, - "grad_norm": 10.340435028076172, - "learning_rate": 2.637238349660819e-06, - "loss": 0.1203, - "step": 765 - }, - { - "epoch": 0.5352900069881202, - "grad_norm": 23.544458389282227, - "learning_rate": 2.6311445989570633e-06, - "loss": 0.4053, - "step": 766 - }, - { - "epoch": 0.5359888190076869, - "grad_norm": 44.20396041870117, - "learning_rate": 2.6250500668183325e-06, - "loss": 0.542, - "step": 767 - }, - { - "epoch": 0.5366876310272537, - "grad_norm": 24.0120849609375, - "learning_rate": 2.6189547895593565e-06, - "loss": 0.27, - "step": 768 - }, - { - "epoch": 0.5373864430468204, - "grad_norm": 17.73352813720703, - "learning_rate": 2.612858803499302e-06, - "loss": 0.3584, - "step": 769 - }, - { - "epoch": 0.5380852550663872, - "grad_norm": 14.346426963806152, - "learning_rate": 2.6067621449615633e-06, - "loss": 0.224, - "step": 770 - }, - { - "epoch": 0.5387840670859538, - "grad_norm": 2.1989574432373047, - "learning_rate": 2.6006648502735384e-06, - "loss": 0.0258, - "step": 771 - }, - { - "epoch": 0.5394828791055206, - "grad_norm": 15.983598709106445, - "learning_rate": 2.5945669557664176e-06, - "loss": 0.3592, - "step": 772 - }, - { - "epoch": 0.5401816911250874, - "grad_norm": 48.945289611816406, - "learning_rate": 2.588468497774965e-06, - "loss": 0.6682, - "step": 773 - }, - { - "epoch": 0.5408805031446541, - "grad_norm": 16.14139175415039, - "learning_rate": 2.582369512637302e-06, - "loss": 0.186, - "step": 774 - }, - { - "epoch": 0.5415793151642209, - "grad_norm": 23.920515060424805, - "learning_rate": 2.5762700366946915e-06, - "loss": 0.3151, - "step": 775 - }, - { - "epoch": 0.5422781271837875, - "grad_norm": 9.31468677520752, - "learning_rate": 2.5701701062913194e-06, - "loss": 0.1181, - "step": 776 - }, - { - "epoch": 0.5429769392033543, - "grad_norm": 8.389810562133789, - "learning_rate": 2.564069757774082e-06, - "loss": 0.1615, - "step": 777 - }, - { - "epoch": 0.543675751222921, - "grad_norm": 19.005733489990234, - "learning_rate": 2.557969027492364e-06, - "loss": 0.2801, - "step": 778 - }, - { - "epoch": 0.5443745632424878, - "grad_norm": 11.176656723022461, - "learning_rate": 2.5518679517978288e-06, - "loss": 0.1673, - "step": 779 - }, - { - "epoch": 0.5450733752620545, - "grad_norm": 34.44825744628906, - "learning_rate": 2.5457665670441937e-06, - "loss": 0.7358, - "step": 780 - }, - { - "epoch": 0.5457721872816212, - "grad_norm": 15.895112037658691, - "learning_rate": 2.53966490958702e-06, - "loss": 0.2741, - "step": 781 - }, - { - "epoch": 0.546470999301188, - "grad_norm": 2.891005516052246, - "learning_rate": 2.533563015783494e-06, - "loss": 0.0453, - "step": 782 - }, - { - "epoch": 0.5471698113207547, - "grad_norm": 19.366334915161133, - "learning_rate": 2.5274609219922093e-06, - "loss": 0.3381, - "step": 783 - }, - { - "epoch": 0.5478686233403215, - "grad_norm": 9.77247142791748, - "learning_rate": 2.5213586645729514e-06, - "loss": 0.0948, - "step": 784 - }, - { - "epoch": 0.5485674353598882, - "grad_norm": 11.503024101257324, - "learning_rate": 2.5152562798864816e-06, - "loss": 0.104, - "step": 785 - }, - { - "epoch": 0.549266247379455, - "grad_norm": 5.956698417663574, - "learning_rate": 2.5091538042943183e-06, - "loss": 0.0999, - "step": 786 - }, - { - "epoch": 0.5499650593990216, - "grad_norm": 29.25767707824707, - "learning_rate": 2.503051274158522e-06, - "loss": 0.6862, - "step": 787 - }, - { - "epoch": 0.5506638714185884, - "grad_norm": 9.83717155456543, - "learning_rate": 2.496948725841479e-06, - "loss": 0.1615, - "step": 788 - }, - { - "epoch": 0.5513626834381551, - "grad_norm": 16.832477569580078, - "learning_rate": 2.490846195705683e-06, - "loss": 0.3955, - "step": 789 - }, - { - "epoch": 0.5520614954577219, - "grad_norm": 2.8090572357177734, - "learning_rate": 2.4847437201135197e-06, - "loss": 0.0257, - "step": 790 - }, - { - "epoch": 0.5527603074772887, - "grad_norm": 14.68285846710205, - "learning_rate": 2.4786413354270494e-06, - "loss": 0.3496, - "step": 791 - }, - { - "epoch": 0.5534591194968553, - "grad_norm": 4.9410719871521, - "learning_rate": 2.472539078007791e-06, - "loss": 0.0648, - "step": 792 - }, - { - "epoch": 0.5541579315164221, - "grad_norm": 35.7769889831543, - "learning_rate": 2.466436984216507e-06, - "loss": 0.7493, - "step": 793 - }, - { - "epoch": 0.5548567435359888, - "grad_norm": 0.6087682843208313, - "learning_rate": 2.4603350904129802e-06, - "loss": 0.012, - "step": 794 - }, - { - "epoch": 0.5555555555555556, - "grad_norm": 12.119845390319824, - "learning_rate": 2.4542334329558075e-06, - "loss": 0.1424, - "step": 795 - }, - { - "epoch": 0.5562543675751223, - "grad_norm": 14.537439346313477, - "learning_rate": 2.4481320482021716e-06, - "loss": 0.2377, - "step": 796 - }, - { - "epoch": 0.556953179594689, - "grad_norm": 7.99770975112915, - "learning_rate": 2.4420309725076364e-06, - "loss": 0.1143, - "step": 797 - }, - { - "epoch": 0.5576519916142557, - "grad_norm": 13.68201732635498, - "learning_rate": 2.435930242225919e-06, - "loss": 0.2435, - "step": 798 - }, - { - "epoch": 0.5583508036338225, - "grad_norm": 15.178496360778809, - "learning_rate": 2.429829893708681e-06, - "loss": 0.348, - "step": 799 - }, - { - "epoch": 0.5590496156533893, - "grad_norm": 15.514969825744629, - "learning_rate": 2.4237299633053098e-06, - "loss": 0.2089, - "step": 800 - }, - { - "epoch": 0.559748427672956, - "grad_norm": 0.8043599128723145, - "learning_rate": 2.4176304873626983e-06, - "loss": 0.0103, - "step": 801 - }, - { - "epoch": 0.5604472396925227, - "grad_norm": 9.657746315002441, - "learning_rate": 2.411531502225036e-06, - "loss": 0.1336, - "step": 802 - }, - { - "epoch": 0.5611460517120894, - "grad_norm": 8.856959342956543, - "learning_rate": 2.405433044233583e-06, - "loss": 0.0432, - "step": 803 - }, - { - "epoch": 0.5618448637316562, - "grad_norm": 10.32404899597168, - "learning_rate": 2.399335149726463e-06, - "loss": 0.0961, - "step": 804 - }, - { - "epoch": 0.5625436757512229, - "grad_norm": 32.715003967285156, - "learning_rate": 2.3932378550384375e-06, - "loss": 0.6596, - "step": 805 - }, - { - "epoch": 0.5632424877707897, - "grad_norm": 33.0238151550293, - "learning_rate": 2.3871411965006985e-06, - "loss": 0.5614, - "step": 806 - }, - { - "epoch": 0.5639412997903563, - "grad_norm": 5.669223308563232, - "learning_rate": 2.3810452104406444e-06, - "loss": 0.1091, - "step": 807 - }, - { - "epoch": 0.5646401118099231, - "grad_norm": 8.27646255493164, - "learning_rate": 2.3749499331816675e-06, - "loss": 0.1514, - "step": 808 - }, - { - "epoch": 0.5653389238294899, - "grad_norm": 0.4891330897808075, - "learning_rate": 2.3688554010429376e-06, - "loss": 0.0074, - "step": 809 - }, - { - "epoch": 0.5660377358490566, - "grad_norm": 27.837779998779297, - "learning_rate": 2.3627616503391813e-06, - "loss": 0.6814, - "step": 810 - }, - { - "epoch": 0.5667365478686234, - "grad_norm": 27.580463409423828, - "learning_rate": 2.3566687173804747e-06, - "loss": 0.3439, - "step": 811 - }, - { - "epoch": 0.56743535988819, - "grad_norm": 30.333078384399414, - "learning_rate": 2.3505766384720148e-06, - "loss": 0.4608, - "step": 812 - }, - { - "epoch": 0.5681341719077568, - "grad_norm": 16.45708465576172, - "learning_rate": 2.344485449913914e-06, - "loss": 0.1964, - "step": 813 - }, - { - "epoch": 0.5688329839273235, - "grad_norm": 14.987152099609375, - "learning_rate": 2.33839518800098e-06, - "loss": 0.2649, - "step": 814 - }, - { - "epoch": 0.5695317959468903, - "grad_norm": 7.402322292327881, - "learning_rate": 2.332305889022494e-06, - "loss": 0.0971, - "step": 815 - }, - { - "epoch": 0.570230607966457, - "grad_norm": 0.4026069939136505, - "learning_rate": 2.3262175892620064e-06, - "loss": 0.0086, - "step": 816 - }, - { - "epoch": 0.5709294199860238, - "grad_norm": 10.794352531433105, - "learning_rate": 2.3201303249971068e-06, - "loss": 0.1431, - "step": 817 - }, - { - "epoch": 0.5716282320055905, - "grad_norm": 6.888916969299316, - "learning_rate": 2.3140441324992215e-06, - "loss": 0.1166, - "step": 818 - }, - { - "epoch": 0.5723270440251572, - "grad_norm": 7.584359645843506, - "learning_rate": 2.307959048033383e-06, - "loss": 0.0924, - "step": 819 - }, - { - "epoch": 0.573025856044724, - "grad_norm": 14.411758422851562, - "learning_rate": 2.3018751078580287e-06, - "loss": 0.2478, - "step": 820 - }, - { - "epoch": 0.5737246680642907, - "grad_norm": 39.14580535888672, - "learning_rate": 2.2957923482247745e-06, - "loss": 0.8627, - "step": 821 - }, - { - "epoch": 0.5744234800838575, - "grad_norm": 0.23252451419830322, - "learning_rate": 2.2897108053782e-06, - "loss": 0.0044, - "step": 822 - }, - { - "epoch": 0.5751222921034241, - "grad_norm": 4.2255024909973145, - "learning_rate": 2.283630515555639e-06, - "loss": 0.0531, - "step": 823 - }, - { - "epoch": 0.5758211041229909, - "grad_norm": 28.8110408782959, - "learning_rate": 2.2775515149869544e-06, - "loss": 0.697, - "step": 824 - }, - { - "epoch": 0.5765199161425576, - "grad_norm": 27.963674545288086, - "learning_rate": 2.271473839894331e-06, - "loss": 0.5143, - "step": 825 - }, - { - "epoch": 0.5772187281621244, - "grad_norm": 23.054174423217773, - "learning_rate": 2.265397526492052e-06, - "loss": 0.4966, - "step": 826 - }, - { - "epoch": 0.5779175401816912, - "grad_norm": 26.273439407348633, - "learning_rate": 2.2593226109862896e-06, - "loss": 0.5055, - "step": 827 - }, - { - "epoch": 0.5786163522012578, - "grad_norm": 12.187822341918945, - "learning_rate": 2.253249129574887e-06, - "loss": 0.1633, - "step": 828 - }, - { - "epoch": 0.5793151642208246, - "grad_norm": 14.102840423583984, - "learning_rate": 2.2471771184471373e-06, - "loss": 0.2118, - "step": 829 - }, - { - "epoch": 0.5800139762403913, - "grad_norm": 14.490646362304688, - "learning_rate": 2.2411066137835806e-06, - "loss": 0.2574, - "step": 830 - }, - { - "epoch": 0.5807127882599581, - "grad_norm": 16.337011337280273, - "learning_rate": 2.235037651755773e-06, - "loss": 0.2978, - "step": 831 - }, - { - "epoch": 0.5814116002795248, - "grad_norm": 19.132368087768555, - "learning_rate": 2.228970268526084e-06, - "loss": 0.292, - "step": 832 - }, - { - "epoch": 0.5821104122990916, - "grad_norm": 12.487456321716309, - "learning_rate": 2.2229045002474727e-06, - "loss": 0.2232, - "step": 833 - }, - { - "epoch": 0.5828092243186582, - "grad_norm": 8.487640380859375, - "learning_rate": 2.216840383063277e-06, - "loss": 0.1229, - "step": 834 - }, - { - "epoch": 0.583508036338225, - "grad_norm": 12.428542137145996, - "learning_rate": 2.2107779531069964e-06, - "loss": 0.1643, - "step": 835 - }, - { - "epoch": 0.5842068483577918, - "grad_norm": 10.548884391784668, - "learning_rate": 2.2047172465020757e-06, - "loss": 0.2065, - "step": 836 - }, - { - "epoch": 0.5849056603773585, - "grad_norm": 6.87788200378418, - "learning_rate": 2.1986582993616926e-06, - "loss": 0.1038, - "step": 837 - }, - { - "epoch": 0.5856044723969253, - "grad_norm": 6.321012020111084, - "learning_rate": 2.1926011477885403e-06, - "loss": 0.0783, - "step": 838 - }, - { - "epoch": 0.5863032844164919, - "grad_norm": 16.344270706176758, - "learning_rate": 2.186545827874613e-06, - "loss": 0.2885, - "step": 839 - }, - { - "epoch": 0.5870020964360587, - "grad_norm": 40.341651916503906, - "learning_rate": 2.1804923757009885e-06, - "loss": 0.7366, - "step": 840 - }, - { - "epoch": 0.5877009084556254, - "grad_norm": 6.851194858551025, - "learning_rate": 2.1744408273376204e-06, - "loss": 0.0793, - "step": 841 - }, - { - "epoch": 0.5883997204751922, - "grad_norm": 20.683561325073242, - "learning_rate": 2.1683912188431154e-06, - "loss": 0.3483, - "step": 842 - }, - { - "epoch": 0.589098532494759, - "grad_norm": 45.648109436035156, - "learning_rate": 2.1623435862645207e-06, - "loss": 0.7047, - "step": 843 - }, - { - "epoch": 0.5897973445143256, - "grad_norm": 12.754045486450195, - "learning_rate": 2.1562979656371112e-06, - "loss": 0.1772, - "step": 844 - }, - { - "epoch": 0.5904961565338924, - "grad_norm": 31.535858154296875, - "learning_rate": 2.150254392984172e-06, - "loss": 0.4651, - "step": 845 - }, - { - "epoch": 0.5911949685534591, - "grad_norm": 16.96849822998047, - "learning_rate": 2.1442129043167877e-06, - "loss": 0.1642, - "step": 846 - }, - { - "epoch": 0.5918937805730259, - "grad_norm": 38.35987854003906, - "learning_rate": 2.1381735356336225e-06, - "loss": 0.5815, - "step": 847 - }, - { - "epoch": 0.5925925925925926, - "grad_norm": 12.405257225036621, - "learning_rate": 2.1321363229207097e-06, - "loss": 0.2265, - "step": 848 - }, - { - "epoch": 0.5932914046121593, - "grad_norm": 4.138080596923828, - "learning_rate": 2.126101302151238e-06, - "loss": 0.0641, - "step": 849 - }, - { - "epoch": 0.593990216631726, - "grad_norm": 28.451597213745117, - "learning_rate": 2.1200685092853305e-06, - "loss": 0.5244, - "step": 850 - }, - { - "epoch": 0.5946890286512928, - "grad_norm": 39.756771087646484, - "learning_rate": 2.114037980269842e-06, - "loss": 0.7235, - "step": 851 - }, - { - "epoch": 0.5953878406708596, - "grad_norm": 16.830041885375977, - "learning_rate": 2.10800975103813e-06, - "loss": 0.2449, - "step": 852 - }, - { - "epoch": 0.5960866526904263, - "grad_norm": 16.128522872924805, - "learning_rate": 2.1019838575098555e-06, - "loss": 0.4214, - "step": 853 - }, - { - "epoch": 0.596785464709993, - "grad_norm": 13.651690483093262, - "learning_rate": 2.095960335590757e-06, - "loss": 0.25, - "step": 854 - }, - { - "epoch": 0.5974842767295597, - "grad_norm": 12.453420639038086, - "learning_rate": 2.089939221172446e-06, - "loss": 0.2461, - "step": 855 - }, - { - "epoch": 0.5981830887491265, - "grad_norm": 16.346458435058594, - "learning_rate": 2.0839205501321844e-06, - "loss": 0.2147, - "step": 856 - }, - { - "epoch": 0.5988819007686932, - "grad_norm": 12.999120712280273, - "learning_rate": 2.077904358332678e-06, - "loss": 0.1417, - "step": 857 - }, - { - "epoch": 0.59958071278826, - "grad_norm": 10.378223419189453, - "learning_rate": 2.07189068162186e-06, - "loss": 0.1583, - "step": 858 - }, - { - "epoch": 0.6002795248078266, - "grad_norm": 31.185504913330078, - "learning_rate": 2.0658795558326745e-06, - "loss": 0.4964, - "step": 859 - }, - { - "epoch": 0.6009783368273934, - "grad_norm": 13.377418518066406, - "learning_rate": 2.0598710167828688e-06, - "loss": 0.1833, - "step": 860 - }, - { - "epoch": 0.6016771488469602, - "grad_norm": 11.63349437713623, - "learning_rate": 2.0538651002747745e-06, - "loss": 0.1984, - "step": 861 - }, - { - "epoch": 0.6023759608665269, - "grad_norm": 33.24647903442383, - "learning_rate": 2.0478618420950987e-06, - "loss": 0.7118, - "step": 862 - }, - { - "epoch": 0.6030747728860937, - "grad_norm": 5.947911262512207, - "learning_rate": 2.0418612780147064e-06, - "loss": 0.0532, - "step": 863 - }, - { - "epoch": 0.6037735849056604, - "grad_norm": 23.14008903503418, - "learning_rate": 2.0358634437884114e-06, - "loss": 0.3858, - "step": 864 - }, - { - "epoch": 0.6044723969252271, - "grad_norm": 8.678864479064941, - "learning_rate": 2.0298683751547622e-06, - "loss": 0.112, - "step": 865 - }, - { - "epoch": 0.6051712089447938, - "grad_norm": 42.92127990722656, - "learning_rate": 2.023876107835825e-06, - "loss": 0.9151, - "step": 866 - }, - { - "epoch": 0.6058700209643606, - "grad_norm": 37.035179138183594, - "learning_rate": 2.017886677536978e-06, - "loss": 0.5857, - "step": 867 - }, - { - "epoch": 0.6065688329839273, - "grad_norm": 9.909701347351074, - "learning_rate": 2.011900119946691e-06, - "loss": 0.1413, - "step": 868 - }, - { - "epoch": 0.6072676450034941, - "grad_norm": 33.31441879272461, - "learning_rate": 2.0059164707363206e-06, - "loss": 0.8741, - "step": 869 - }, - { - "epoch": 0.6079664570230608, - "grad_norm": 30.437944412231445, - "learning_rate": 1.9999357655598894e-06, - "loss": 0.61, - "step": 870 - }, - { - "epoch": 0.6086652690426275, - "grad_norm": 0.5675687789916992, - "learning_rate": 1.993958040053881e-06, - "loss": 0.0121, - "step": 871 - }, - { - "epoch": 0.6093640810621943, - "grad_norm": 15.325254440307617, - "learning_rate": 1.987983329837024e-06, - "loss": 0.1945, - "step": 872 - }, - { - "epoch": 0.610062893081761, - "grad_norm": 4.526660919189453, - "learning_rate": 1.9820116705100778e-06, - "loss": 0.071, - "step": 873 - }, - { - "epoch": 0.6107617051013278, - "grad_norm": 10.334325790405273, - "learning_rate": 1.9760430976556257e-06, - "loss": 0.1423, - "step": 874 - }, - { - "epoch": 0.6114605171208944, - "grad_norm": 3.35660457611084, - "learning_rate": 1.970077646837858e-06, - "loss": 0.0366, - "step": 875 - }, - { - "epoch": 0.6121593291404612, - "grad_norm": 9.794960975646973, - "learning_rate": 1.9641153536023646e-06, - "loss": 0.1246, - "step": 876 - }, - { - "epoch": 0.6128581411600279, - "grad_norm": 6.1428961753845215, - "learning_rate": 1.958156253475916e-06, - "loss": 0.0725, - "step": 877 - }, - { - "epoch": 0.6135569531795947, - "grad_norm": 7.976772785186768, - "learning_rate": 1.9522003819662614e-06, - "loss": 0.084, - "step": 878 - }, - { - "epoch": 0.6142557651991615, - "grad_norm": 7.7063984870910645, - "learning_rate": 1.9462477745619106e-06, - "loss": 0.0784, - "step": 879 - }, - { - "epoch": 0.6149545772187281, - "grad_norm": 18.760948181152344, - "learning_rate": 1.940298466731922e-06, - "loss": 0.3548, - "step": 880 - }, - { - "epoch": 0.6156533892382949, - "grad_norm": 0.3327697217464447, - "learning_rate": 1.934352493925695e-06, - "loss": 0.0071, - "step": 881 - }, - { - "epoch": 0.6163522012578616, - "grad_norm": 17.469369888305664, - "learning_rate": 1.928409891572757e-06, - "loss": 0.3589, - "step": 882 - }, - { - "epoch": 0.6170510132774284, - "grad_norm": 16.62134552001953, - "learning_rate": 1.9224706950825517e-06, - "loss": 0.3401, - "step": 883 - }, - { - "epoch": 0.6177498252969951, - "grad_norm": 9.542234420776367, - "learning_rate": 1.9165349398442284e-06, - "loss": 0.151, - "step": 884 - }, - { - "epoch": 0.6184486373165619, - "grad_norm": 0.6506238579750061, - "learning_rate": 1.9106026612264316e-06, - "loss": 0.018, - "step": 885 - }, - { - "epoch": 0.6191474493361285, - "grad_norm": 5.937638282775879, - "learning_rate": 1.9046738945770932e-06, - "loss": 0.0772, - "step": 886 - }, - { - "epoch": 0.6198462613556953, - "grad_norm": 26.081083297729492, - "learning_rate": 1.8987486752232122e-06, - "loss": 0.6689, - "step": 887 - }, - { - "epoch": 0.6205450733752621, - "grad_norm": 25.548797607421875, - "learning_rate": 1.8928270384706585e-06, - "loss": 0.5399, - "step": 888 - }, - { - "epoch": 0.6212438853948288, - "grad_norm": 20.33055877685547, - "learning_rate": 1.8869090196039469e-06, - "loss": 0.3576, - "step": 889 - }, - { - "epoch": 0.6219426974143956, - "grad_norm": 9.043344497680664, - "learning_rate": 1.8809946538860427e-06, - "loss": 0.1147, - "step": 890 - }, - { - "epoch": 0.6226415094339622, - "grad_norm": 23.66865348815918, - "learning_rate": 1.875083976558136e-06, - "loss": 0.3109, - "step": 891 - }, - { - "epoch": 0.623340321453529, - "grad_norm": 18.012081146240234, - "learning_rate": 1.8691770228394458e-06, - "loss": 0.4284, - "step": 892 - }, - { - "epoch": 0.6240391334730957, - "grad_norm": 4.8346848487854, - "learning_rate": 1.863273827927e-06, - "loss": 0.0597, - "step": 893 - }, - { - "epoch": 0.6247379454926625, - "grad_norm": 25.748119354248047, - "learning_rate": 1.85737442699543e-06, - "loss": 0.6459, - "step": 894 - }, - { - "epoch": 0.6254367575122292, - "grad_norm": 15.449180603027344, - "learning_rate": 1.8514788551967616e-06, - "loss": 0.2046, - "step": 895 - }, - { - "epoch": 0.6261355695317959, - "grad_norm": 16.44902992248535, - "learning_rate": 1.8455871476602023e-06, - "loss": 0.3188, - "step": 896 - }, - { - "epoch": 0.6268343815513627, - "grad_norm": 7.270335674285889, - "learning_rate": 1.8396993394919372e-06, - "loss": 0.0761, - "step": 897 - }, - { - "epoch": 0.6275331935709294, - "grad_norm": 20.945117950439453, - "learning_rate": 1.833815465774913e-06, - "loss": 0.4962, - "step": 898 - }, - { - "epoch": 0.6282320055904962, - "grad_norm": 18.072628021240234, - "learning_rate": 1.8279355615686353e-06, - "loss": 0.2448, - "step": 899 - }, - { - "epoch": 0.6289308176100629, - "grad_norm": 18.547746658325195, - "learning_rate": 1.8220596619089576e-06, - "loss": 0.1941, - "step": 900 - }, - { - "epoch": 0.6296296296296297, - "grad_norm": 2.448774576187134, - "learning_rate": 1.8161878018078693e-06, - "loss": 0.0268, - "step": 901 - }, - { - "epoch": 0.6303284416491963, - "grad_norm": 16.698841094970703, - "learning_rate": 1.8103200162532927e-06, - "loss": 0.3668, - "step": 902 - }, - { - "epoch": 0.6310272536687631, - "grad_norm": 18.395183563232422, - "learning_rate": 1.8044563402088686e-06, - "loss": 0.3204, - "step": 903 - }, - { - "epoch": 0.6317260656883298, - "grad_norm": 27.81114959716797, - "learning_rate": 1.798596808613754e-06, - "loss": 0.4934, - "step": 904 - }, - { - "epoch": 0.6324248777078966, - "grad_norm": 12.515846252441406, - "learning_rate": 1.7927414563824077e-06, - "loss": 0.1551, - "step": 905 - }, - { - "epoch": 0.6331236897274634, - "grad_norm": 11.712455749511719, - "learning_rate": 1.7868903184043888e-06, - "loss": 0.1924, - "step": 906 - }, - { - "epoch": 0.63382250174703, - "grad_norm": 17.306638717651367, - "learning_rate": 1.7810434295441434e-06, - "loss": 0.4241, - "step": 907 - }, - { - "epoch": 0.6345213137665968, - "grad_norm": 0.378282368183136, - "learning_rate": 1.7752008246407986e-06, - "loss": 0.0091, - "step": 908 - }, - { - "epoch": 0.6352201257861635, - "grad_norm": 14.262761116027832, - "learning_rate": 1.7693625385079576e-06, - "loss": 0.2913, - "step": 909 - }, - { - "epoch": 0.6359189378057303, - "grad_norm": 5.014953136444092, - "learning_rate": 1.763528605933486e-06, - "loss": 0.059, - "step": 910 - }, - { - "epoch": 0.636617749825297, - "grad_norm": 1.9716682434082031, - "learning_rate": 1.7576990616793139e-06, - "loss": 0.0199, - "step": 911 - }, - { - "epoch": 0.6373165618448637, - "grad_norm": 35.16741943359375, - "learning_rate": 1.7518739404812158e-06, - "loss": 0.8233, - "step": 912 - }, - { - "epoch": 0.6380153738644304, - "grad_norm": 14.723204612731934, - "learning_rate": 1.7460532770486185e-06, - "loss": 0.2537, - "step": 913 - }, - { - "epoch": 0.6387141858839972, - "grad_norm": 7.040991306304932, - "learning_rate": 1.740237106064383e-06, - "loss": 0.0829, - "step": 914 - }, - { - "epoch": 0.639412997903564, - "grad_norm": 10.06245231628418, - "learning_rate": 1.7344254621846018e-06, - "loss": 0.1666, - "step": 915 - }, - { - "epoch": 0.6401118099231307, - "grad_norm": 13.61905574798584, - "learning_rate": 1.7286183800383937e-06, - "loss": 0.316, - "step": 916 - }, - { - "epoch": 0.6408106219426974, - "grad_norm": 27.60746192932129, - "learning_rate": 1.7228158942276942e-06, - "loss": 0.5873, - "step": 917 - }, - { - "epoch": 0.6415094339622641, - "grad_norm": 4.896931171417236, - "learning_rate": 1.7170180393270533e-06, - "loss": 0.0708, - "step": 918 - }, - { - "epoch": 0.6422082459818309, - "grad_norm": 14.917655944824219, - "learning_rate": 1.7112248498834256e-06, - "loss": 0.3021, - "step": 919 - }, - { - "epoch": 0.6429070580013976, - "grad_norm": 49.91603469848633, - "learning_rate": 1.705436360415966e-06, - "loss": 0.8994, - "step": 920 - }, - { - "epoch": 0.6436058700209644, - "grad_norm": 0.4870874881744385, - "learning_rate": 1.6996526054158283e-06, - "loss": 0.0053, - "step": 921 - }, - { - "epoch": 0.6443046820405312, - "grad_norm": 15.165771484375, - "learning_rate": 1.6938736193459487e-06, - "loss": 0.198, - "step": 922 - }, - { - "epoch": 0.6450034940600978, - "grad_norm": 11.082405090332031, - "learning_rate": 1.6880994366408548e-06, - "loss": 0.1458, - "step": 923 - }, - { - "epoch": 0.6457023060796646, - "grad_norm": 2.855769634246826, - "learning_rate": 1.6823300917064462e-06, - "loss": 0.0266, - "step": 924 - }, - { - "epoch": 0.6464011180992313, - "grad_norm": 11.491934776306152, - "learning_rate": 1.6765656189198013e-06, - "loss": 0.2059, - "step": 925 - }, - { - "epoch": 0.6470999301187981, - "grad_norm": 11.414868354797363, - "learning_rate": 1.6708060526289648e-06, - "loss": 0.143, - "step": 926 - }, - { - "epoch": 0.6477987421383647, - "grad_norm": 8.71941089630127, - "learning_rate": 1.6650514271527468e-06, - "loss": 0.1409, - "step": 927 - }, - { - "epoch": 0.6484975541579315, - "grad_norm": 3.4228744506835938, - "learning_rate": 1.659301776780517e-06, - "loss": 0.0383, - "step": 928 - }, - { - "epoch": 0.6491963661774982, - "grad_norm": 15.330958366394043, - "learning_rate": 1.6535571357719998e-06, - "loss": 0.2552, - "step": 929 - }, - { - "epoch": 0.649895178197065, - "grad_norm": 8.872035026550293, - "learning_rate": 1.647817538357072e-06, - "loss": 0.0741, - "step": 930 - }, - { - "epoch": 0.6505939902166318, - "grad_norm": 7.346905708312988, - "learning_rate": 1.6420830187355572e-06, - "loss": 0.1115, - "step": 931 - }, - { - "epoch": 0.6512928022361985, - "grad_norm": 25.926292419433594, - "learning_rate": 1.636353611077023e-06, - "loss": 0.5164, - "step": 932 - }, - { - "epoch": 0.6519916142557652, - "grad_norm": 7.558488368988037, - "learning_rate": 1.6306293495205758e-06, - "loss": 0.1039, - "step": 933 - }, - { - "epoch": 0.6526904262753319, - "grad_norm": 6.821887493133545, - "learning_rate": 1.6249102681746593e-06, - "loss": 0.0803, - "step": 934 - }, - { - "epoch": 0.6533892382948987, - "grad_norm": 13.661949157714844, - "learning_rate": 1.6191964011168523e-06, - "loss": 0.1932, - "step": 935 - }, - { - "epoch": 0.6540880503144654, - "grad_norm": 12.226921081542969, - "learning_rate": 1.613487782393661e-06, - "loss": 0.2131, - "step": 936 - }, - { - "epoch": 0.6547868623340322, - "grad_norm": 15.027230262756348, - "learning_rate": 1.6077844460203207e-06, - "loss": 0.2632, - "step": 937 - }, - { - "epoch": 0.6554856743535988, - "grad_norm": 9.334952354431152, - "learning_rate": 1.6020864259805902e-06, - "loss": 0.128, - "step": 938 - }, - { - "epoch": 0.6561844863731656, - "grad_norm": 8.83455753326416, - "learning_rate": 1.5963937562265524e-06, - "loss": 0.1019, - "step": 939 - }, - { - "epoch": 0.6568832983927324, - "grad_norm": 20.954299926757812, - "learning_rate": 1.5907064706784082e-06, - "loss": 0.3247, - "step": 940 - }, - { - "epoch": 0.6575821104122991, - "grad_norm": 3.849893569946289, - "learning_rate": 1.5850246032242766e-06, - "loss": 0.0554, - "step": 941 - }, - { - "epoch": 0.6582809224318659, - "grad_norm": 15.237322807312012, - "learning_rate": 1.5793481877199946e-06, - "loss": 0.2743, - "step": 942 - }, - { - "epoch": 0.6589797344514325, - "grad_norm": 18.91193962097168, - "learning_rate": 1.5736772579889102e-06, - "loss": 0.3172, - "step": 943 - }, - { - "epoch": 0.6596785464709993, - "grad_norm": 9.425347328186035, - "learning_rate": 1.5680118478216865e-06, - "loss": 0.1308, - "step": 944 - }, - { - "epoch": 0.660377358490566, - "grad_norm": 19.43840789794922, - "learning_rate": 1.5623519909760953e-06, - "loss": 0.3984, - "step": 945 - }, - { - "epoch": 0.6610761705101328, - "grad_norm": 38.46794891357422, - "learning_rate": 1.556697721176823e-06, - "loss": 0.6891, - "step": 946 - }, - { - "epoch": 0.6617749825296995, - "grad_norm": 2.573190689086914, - "learning_rate": 1.5510490721152594e-06, - "loss": 0.0392, - "step": 947 - }, - { - "epoch": 0.6624737945492662, - "grad_norm": 14.021162986755371, - "learning_rate": 1.545406077449307e-06, - "loss": 0.2038, - "step": 948 - }, - { - "epoch": 0.663172606568833, - "grad_norm": 15.211523056030273, - "learning_rate": 1.5397687708031747e-06, - "loss": 0.2286, - "step": 949 - }, - { - "epoch": 0.6638714185883997, - "grad_norm": 15.636740684509277, - "learning_rate": 1.5341371857671782e-06, - "loss": 0.2839, - "step": 950 - }, - { - "epoch": 0.6645702306079665, - "grad_norm": 39.785118103027344, - "learning_rate": 1.5285113558975429e-06, - "loss": 0.5478, - "step": 951 - }, - { - "epoch": 0.6652690426275332, - "grad_norm": 40.327266693115234, - "learning_rate": 1.5228913147161982e-06, - "loss": 0.8884, - "step": 952 - }, - { - "epoch": 0.6659678546471, - "grad_norm": 11.233820915222168, - "learning_rate": 1.5172770957105843e-06, - "loss": 0.1929, - "step": 953 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 39.03425216674805, - "learning_rate": 1.5116687323334467e-06, - "loss": 0.7189, - "step": 954 - }, - { - "epoch": 0.6673654786862334, - "grad_norm": 8.349373817443848, - "learning_rate": 1.506066258002642e-06, - "loss": 0.1376, - "step": 955 - }, - { - "epoch": 0.6680642907058001, - "grad_norm": 15.850727081298828, - "learning_rate": 1.5004697061009372e-06, - "loss": 0.2705, - "step": 956 - }, - { - "epoch": 0.6687631027253669, - "grad_norm": 18.72193145751953, - "learning_rate": 1.4948791099758052e-06, - "loss": 0.3741, - "step": 957 - }, - { - "epoch": 0.6694619147449337, - "grad_norm": 28.91965103149414, - "learning_rate": 1.489294502939238e-06, - "loss": 0.4521, - "step": 958 - }, - { - "epoch": 0.6701607267645003, - "grad_norm": 9.582024574279785, - "learning_rate": 1.4837159182675343e-06, - "loss": 0.1211, - "step": 959 - }, - { - "epoch": 0.6708595387840671, - "grad_norm": 34.728755950927734, - "learning_rate": 1.4781433892011132e-06, - "loss": 0.8332, - "step": 960 - }, - { - "epoch": 0.6715583508036338, - "grad_norm": 12.798219680786133, - "learning_rate": 1.4725769489443082e-06, - "loss": 0.1845, - "step": 961 - }, - { - "epoch": 0.6722571628232006, - "grad_norm": 22.087730407714844, - "learning_rate": 1.4670166306651734e-06, - "loss": 0.4124, - "step": 962 - }, - { - "epoch": 0.6729559748427673, - "grad_norm": 8.047623634338379, - "learning_rate": 1.4614624674952843e-06, - "loss": 0.125, - "step": 963 - }, - { - "epoch": 0.673654786862334, - "grad_norm": 17.283178329467773, - "learning_rate": 1.45591449252954e-06, - "loss": 0.296, - "step": 964 - }, - { - "epoch": 0.6743535988819007, - "grad_norm": 5.014110565185547, - "learning_rate": 1.4503727388259686e-06, - "loss": 0.0755, - "step": 965 - }, - { - "epoch": 0.6750524109014675, - "grad_norm": 3.204698085784912, - "learning_rate": 1.4448372394055249e-06, - "loss": 0.0295, - "step": 966 - }, - { - "epoch": 0.6757512229210343, - "grad_norm": 14.568187713623047, - "learning_rate": 1.4393080272519022e-06, - "loss": 0.197, - "step": 967 - }, - { - "epoch": 0.676450034940601, - "grad_norm": 6.707142353057861, - "learning_rate": 1.4337851353113264e-06, - "loss": 0.0702, - "step": 968 - }, - { - "epoch": 0.6771488469601677, - "grad_norm": 15.176682472229004, - "learning_rate": 1.4282685964923643e-06, - "loss": 0.2576, - "step": 969 - }, - { - "epoch": 0.6778476589797344, - "grad_norm": 25.824369430541992, - "learning_rate": 1.42275844366573e-06, - "loss": 0.314, - "step": 970 - }, - { - "epoch": 0.6785464709993012, - "grad_norm": 19.518863677978516, - "learning_rate": 1.4172547096640837e-06, - "loss": 0.429, - "step": 971 - }, - { - "epoch": 0.6792452830188679, - "grad_norm": 15.642006874084473, - "learning_rate": 1.4117574272818388e-06, - "loss": 0.2069, - "step": 972 - }, - { - "epoch": 0.6799440950384347, - "grad_norm": 4.075155735015869, - "learning_rate": 1.4062666292749657e-06, - "loss": 0.046, - "step": 973 - }, - { - "epoch": 0.6806429070580013, - "grad_norm": 1.8811545372009277, - "learning_rate": 1.4007823483608002e-06, - "loss": 0.0231, - "step": 974 - }, - { - "epoch": 0.6813417190775681, - "grad_norm": 3.0156357288360596, - "learning_rate": 1.3953046172178413e-06, - "loss": 0.0428, - "step": 975 - }, - { - "epoch": 0.6820405310971349, - "grad_norm": 21.55653190612793, - "learning_rate": 1.3898334684855647e-06, - "loss": 0.3441, - "step": 976 - }, - { - "epoch": 0.6827393431167016, - "grad_norm": 18.346210479736328, - "learning_rate": 1.3843689347642217e-06, - "loss": 0.3297, - "step": 977 - }, - { - "epoch": 0.6834381551362684, - "grad_norm": 21.336627960205078, - "learning_rate": 1.378911048614647e-06, - "loss": 0.3687, - "step": 978 - }, - { - "epoch": 0.684136967155835, - "grad_norm": 29.595643997192383, - "learning_rate": 1.3734598425580686e-06, - "loss": 0.5676, - "step": 979 - }, - { - "epoch": 0.6848357791754018, - "grad_norm": 17.99949836730957, - "learning_rate": 1.3680153490759074e-06, - "loss": 0.173, - "step": 980 - }, - { - "epoch": 0.6855345911949685, - "grad_norm": 7.617617130279541, - "learning_rate": 1.3625776006095882e-06, - "loss": 0.0668, - "step": 981 - }, - { - "epoch": 0.6862334032145353, - "grad_norm": 8.539046287536621, - "learning_rate": 1.3571466295603438e-06, - "loss": 0.1134, - "step": 982 - }, - { - "epoch": 0.686932215234102, - "grad_norm": 8.51557731628418, - "learning_rate": 1.3517224682890268e-06, - "loss": 0.1357, - "step": 983 - }, - { - "epoch": 0.6876310272536688, - "grad_norm": 10.750319480895996, - "learning_rate": 1.3463051491159095e-06, - "loss": 0.1554, - "step": 984 - }, - { - "epoch": 0.6883298392732355, - "grad_norm": 35.21259689331055, - "learning_rate": 1.340894704320496e-06, - "loss": 0.6226, - "step": 985 - }, - { - "epoch": 0.6890286512928022, - "grad_norm": 0.2831842005252838, - "learning_rate": 1.3354911661413305e-06, - "loss": 0.0074, - "step": 986 - }, - { - "epoch": 0.689727463312369, - "grad_norm": 0.295246958732605, - "learning_rate": 1.3300945667758015e-06, - "loss": 0.0035, - "step": 987 - }, - { - "epoch": 0.6904262753319357, - "grad_norm": 4.366105079650879, - "learning_rate": 1.3247049383799545e-06, - "loss": 0.0426, - "step": 988 - }, - { - "epoch": 0.6911250873515025, - "grad_norm": 5.727261066436768, - "learning_rate": 1.3193223130682937e-06, - "loss": 0.1059, - "step": 989 - }, - { - "epoch": 0.6918238993710691, - "grad_norm": 14.105305671691895, - "learning_rate": 1.3139467229135999e-06, - "loss": 0.2551, - "step": 990 - }, - { - "epoch": 0.6925227113906359, - "grad_norm": 10.028843879699707, - "learning_rate": 1.3085781999467303e-06, - "loss": 0.1389, - "step": 991 - }, - { - "epoch": 0.6932215234102026, - "grad_norm": 8.396821975708008, - "learning_rate": 1.3032167761564357e-06, - "loss": 0.1124, - "step": 992 - }, - { - "epoch": 0.6939203354297694, - "grad_norm": 5.298531532287598, - "learning_rate": 1.2978624834891629e-06, - "loss": 0.0734, - "step": 993 - }, - { - "epoch": 0.6946191474493362, - "grad_norm": 0.6527535319328308, - "learning_rate": 1.2925153538488666e-06, - "loss": 0.0151, - "step": 994 - }, - { - "epoch": 0.6953179594689028, - "grad_norm": 10.608854293823242, - "learning_rate": 1.2871754190968244e-06, - "loss": 0.1448, - "step": 995 - }, - { - "epoch": 0.6960167714884696, - "grad_norm": 0.40155184268951416, - "learning_rate": 1.2818427110514382e-06, - "loss": 0.0102, - "step": 996 - }, - { - "epoch": 0.6967155835080363, - "grad_norm": 13.118616104125977, - "learning_rate": 1.276517261488051e-06, - "loss": 0.1396, - "step": 997 - }, - { - "epoch": 0.6974143955276031, - "grad_norm": 8.69198226928711, - "learning_rate": 1.271199102138755e-06, - "loss": 0.1276, - "step": 998 - }, - { - "epoch": 0.6981132075471698, - "grad_norm": 33.57307815551758, - "learning_rate": 1.2658882646922036e-06, - "loss": 0.6523, - "step": 999 - }, - { - "epoch": 0.6988120195667366, - "grad_norm": 43.360130310058594, - "learning_rate": 1.2605847807934229e-06, - "loss": 0.901, - "step": 1000 - }, - { - "epoch": 0.6988120195667366, - "eval_loss": 0.25201931595802307, - "eval_runtime": 305.164, - "eval_samples_per_second": 2.084, - "eval_steps_per_second": 0.521, - "step": 1000 - }, - { - "epoch": 0.6995108315863033, - "grad_norm": 11.683773040771484, - "learning_rate": 1.2552886820436208e-06, - "loss": 0.2002, - "step": 1001 - }, - { - "epoch": 0.70020964360587, - "grad_norm": 16.589052200317383, - "learning_rate": 1.2500000000000007e-06, - "loss": 0.4033, - "step": 1002 - }, - { - "epoch": 0.7009084556254368, - "grad_norm": 15.332732200622559, - "learning_rate": 1.2447187661755717e-06, - "loss": 0.2018, - "step": 1003 - }, - { - "epoch": 0.7016072676450035, - "grad_norm": 13.976405143737793, - "learning_rate": 1.2394450120389658e-06, - "loss": 0.2187, - "step": 1004 - }, - { - "epoch": 0.7023060796645703, - "grad_norm": 15.938003540039062, - "learning_rate": 1.2341787690142436e-06, - "loss": 0.2626, - "step": 1005 - }, - { - "epoch": 0.7030048916841369, - "grad_norm": 10.803960800170898, - "learning_rate": 1.2289200684807098e-06, - "loss": 0.1176, - "step": 1006 - }, - { - "epoch": 0.7037037037037037, - "grad_norm": 4.652684211730957, - "learning_rate": 1.2236689417727297e-06, - "loss": 0.0518, - "step": 1007 - }, - { - "epoch": 0.7044025157232704, - "grad_norm": 16.630571365356445, - "learning_rate": 1.2184254201795364e-06, - "loss": 0.2343, - "step": 1008 - }, - { - "epoch": 0.7051013277428372, - "grad_norm": 27.492624282836914, - "learning_rate": 1.213189534945049e-06, - "loss": 0.4521, - "step": 1009 - }, - { - "epoch": 0.705800139762404, - "grad_norm": 19.129714965820312, - "learning_rate": 1.2079613172676824e-06, - "loss": 0.4215, - "step": 1010 - }, - { - "epoch": 0.7064989517819706, - "grad_norm": 16.36144256591797, - "learning_rate": 1.2027407983001683e-06, - "loss": 0.3727, - "step": 1011 - }, - { - "epoch": 0.7071977638015374, - "grad_norm": 12.514931678771973, - "learning_rate": 1.19752800914936e-06, - "loss": 0.1988, - "step": 1012 - }, - { - "epoch": 0.7078965758211041, - "grad_norm": 11.539505958557129, - "learning_rate": 1.1923229808760565e-06, - "loss": 0.1587, - "step": 1013 - }, - { - "epoch": 0.7085953878406709, - "grad_norm": 10.866935729980469, - "learning_rate": 1.1871257444948098e-06, - "loss": 0.1876, - "step": 1014 - }, - { - "epoch": 0.7092941998602376, - "grad_norm": 3.8440191745758057, - "learning_rate": 1.181936330973744e-06, - "loss": 0.0321, - "step": 1015 - }, - { - "epoch": 0.7099930118798043, - "grad_norm": 28.109813690185547, - "learning_rate": 1.1767547712343722e-06, - "loss": 0.4796, - "step": 1016 - }, - { - "epoch": 0.710691823899371, - "grad_norm": 17.91498374938965, - "learning_rate": 1.1715810961514073e-06, - "loss": 0.2798, - "step": 1017 - }, - { - "epoch": 0.7113906359189378, - "grad_norm": 14.650308609008789, - "learning_rate": 1.166415336552583e-06, - "loss": 0.1929, - "step": 1018 - }, - { - "epoch": 0.7120894479385046, - "grad_norm": 26.033111572265625, - "learning_rate": 1.1612575232184657e-06, - "loss": 0.5942, - "step": 1019 - }, - { - "epoch": 0.7127882599580713, - "grad_norm": 16.04847526550293, - "learning_rate": 1.1561076868822756e-06, - "loss": 0.3298, - "step": 1020 - }, - { - "epoch": 0.713487071977638, - "grad_norm": 15.109528541564941, - "learning_rate": 1.1509658582297025e-06, - "loss": 0.2602, - "step": 1021 - }, - { - "epoch": 0.7141858839972047, - "grad_norm": 26.70970916748047, - "learning_rate": 1.1458320678987166e-06, - "loss": 0.3617, - "step": 1022 - }, - { - "epoch": 0.7148846960167715, - "grad_norm": 27.35272789001465, - "learning_rate": 1.1407063464793966e-06, - "loss": 0.4271, - "step": 1023 - }, - { - "epoch": 0.7155835080363382, - "grad_norm": 10.951732635498047, - "learning_rate": 1.1355887245137383e-06, - "loss": 0.137, - "step": 1024 - }, - { - "epoch": 0.716282320055905, - "grad_norm": 16.68122673034668, - "learning_rate": 1.1304792324954796e-06, - "loss": 0.1934, - "step": 1025 - }, - { - "epoch": 0.7169811320754716, - "grad_norm": 12.747377395629883, - "learning_rate": 1.1253779008699131e-06, - "loss": 0.2297, - "step": 1026 - }, - { - "epoch": 0.7176799440950384, - "grad_norm": 17.35643768310547, - "learning_rate": 1.120284760033706e-06, - "loss": 0.3665, - "step": 1027 - }, - { - "epoch": 0.7183787561146052, - "grad_norm": 19.289302825927734, - "learning_rate": 1.1151998403347245e-06, - "loss": 0.4151, - "step": 1028 - }, - { - "epoch": 0.7190775681341719, - "grad_norm": 15.83156681060791, - "learning_rate": 1.1101231720718442e-06, - "loss": 0.1688, - "step": 1029 - }, - { - "epoch": 0.7197763801537387, - "grad_norm": 7.146099090576172, - "learning_rate": 1.1050547854947757e-06, - "loss": 0.1016, - "step": 1030 - }, - { - "epoch": 0.7204751921733054, - "grad_norm": 10.062895774841309, - "learning_rate": 1.0999947108038816e-06, - "loss": 0.153, - "step": 1031 - }, - { - "epoch": 0.7211740041928721, - "grad_norm": 26.228126525878906, - "learning_rate": 1.0949429781500002e-06, - "loss": 0.5056, - "step": 1032 - }, - { - "epoch": 0.7218728162124388, - "grad_norm": 27.05112075805664, - "learning_rate": 1.0898996176342595e-06, - "loss": 0.3457, - "step": 1033 - }, - { - "epoch": 0.7225716282320056, - "grad_norm": 17.30702781677246, - "learning_rate": 1.0848646593079028e-06, - "loss": 0.293, - "step": 1034 - }, - { - "epoch": 0.7232704402515723, - "grad_norm": 0.6861129403114319, - "learning_rate": 1.079838133172111e-06, - "loss": 0.0126, - "step": 1035 - }, - { - "epoch": 0.7239692522711391, - "grad_norm": 36.39925003051758, - "learning_rate": 1.074820069177816e-06, - "loss": 0.5264, - "step": 1036 - }, - { - "epoch": 0.7246680642907058, - "grad_norm": 10.432401657104492, - "learning_rate": 1.069810497225533e-06, - "loss": 0.129, - "step": 1037 - }, - { - "epoch": 0.7253668763102725, - "grad_norm": 7.180110931396484, - "learning_rate": 1.0648094471651723e-06, - "loss": 0.0759, - "step": 1038 - }, - { - "epoch": 0.7260656883298393, - "grad_norm": 16.11195182800293, - "learning_rate": 1.0598169487958678e-06, - "loss": 0.2345, - "step": 1039 - }, - { - "epoch": 0.726764500349406, - "grad_norm": 12.371841430664062, - "learning_rate": 1.0548330318657968e-06, - "loss": 0.1364, - "step": 1040 - }, - { - "epoch": 0.7274633123689728, - "grad_norm": 35.49949645996094, - "learning_rate": 1.049857726072005e-06, - "loss": 0.8971, - "step": 1041 - }, - { - "epoch": 0.7281621243885394, - "grad_norm": 14.55894947052002, - "learning_rate": 1.0448910610602262e-06, - "loss": 0.3012, - "step": 1042 - }, - { - "epoch": 0.7288609364081062, - "grad_norm": 3.433270215988159, - "learning_rate": 1.0399330664247077e-06, - "loss": 0.0334, - "step": 1043 - }, - { - "epoch": 0.7295597484276729, - "grad_norm": 10.520794868469238, - "learning_rate": 1.034983771708035e-06, - "loss": 0.1334, - "step": 1044 - }, - { - "epoch": 0.7302585604472397, - "grad_norm": 4.554783821105957, - "learning_rate": 1.0300432064009527e-06, - "loss": 0.0308, - "step": 1045 - }, - { - "epoch": 0.7309573724668065, - "grad_norm": 11.731820106506348, - "learning_rate": 1.0251113999421936e-06, - "loss": 0.1483, - "step": 1046 - }, - { - "epoch": 0.7316561844863732, - "grad_norm": 18.906362533569336, - "learning_rate": 1.020188381718295e-06, - "loss": 0.1836, - "step": 1047 - }, - { - "epoch": 0.7323549965059399, - "grad_norm": 2.9701437950134277, - "learning_rate": 1.0152741810634333e-06, - "loss": 0.0466, - "step": 1048 - }, - { - "epoch": 0.7330538085255066, - "grad_norm": 10.123627662658691, - "learning_rate": 1.0103688272592446e-06, - "loss": 0.1555, - "step": 1049 - }, - { - "epoch": 0.7337526205450734, - "grad_norm": 0.5355169773101807, - "learning_rate": 1.0054723495346484e-06, - "loss": 0.0113, - "step": 1050 - }, - { - "epoch": 0.7344514325646401, - "grad_norm": 20.25732421875, - "learning_rate": 1.0005847770656757e-06, - "loss": 0.3654, - "step": 1051 - }, - { - "epoch": 0.7351502445842069, - "grad_norm": 7.024266242980957, - "learning_rate": 9.957061389752948e-07, - "loss": 0.0442, - "step": 1052 - }, - { - "epoch": 0.7358490566037735, - "grad_norm": 1.0375354290008545, - "learning_rate": 9.9083646433324e-07, - "loss": 0.0126, - "step": 1053 - }, - { - "epoch": 0.7365478686233403, - "grad_norm": 13.183897018432617, - "learning_rate": 9.85975782155834e-07, - "loss": 0.2425, - "step": 1054 - }, - { - "epoch": 0.7372466806429071, - "grad_norm": 32.841346740722656, - "learning_rate": 9.811241214058168e-07, - "loss": 0.6539, - "step": 1055 - }, - { - "epoch": 0.7379454926624738, - "grad_norm": 2.4381468296051025, - "learning_rate": 9.762815109921762e-07, - "loss": 0.03, - "step": 1056 - }, - { - "epoch": 0.7386443046820406, - "grad_norm": 13.839322090148926, - "learning_rate": 9.714479797699695e-07, - "loss": 0.192, - "step": 1057 - }, - { - "epoch": 0.7393431167016072, - "grad_norm": 21.913719177246094, - "learning_rate": 9.666235565401594e-07, - "loss": 0.4449, - "step": 1058 - }, - { - "epoch": 0.740041928721174, - "grad_norm": 3.269843816757202, - "learning_rate": 9.61808270049432e-07, - "loss": 0.0238, - "step": 1059 - }, - { - "epoch": 0.7407407407407407, - "grad_norm": 28.28165054321289, - "learning_rate": 9.57002148990037e-07, - "loss": 0.3564, - "step": 1060 - }, - { - "epoch": 0.7414395527603075, - "grad_norm": 9.303828239440918, - "learning_rate": 9.522052219996072e-07, - "loss": 0.1166, - "step": 1061 - }, - { - "epoch": 0.7421383647798742, - "grad_norm": 9.00042724609375, - "learning_rate": 9.474175176609956e-07, - "loss": 0.0915, - "step": 1062 - }, - { - "epoch": 0.742837176799441, - "grad_norm": 12.855506896972656, - "learning_rate": 9.426390645020981e-07, - "loss": 0.2047, - "step": 1063 - }, - { - "epoch": 0.7435359888190077, - "grad_norm": 19.918502807617188, - "learning_rate": 9.378698909956868e-07, - "loss": 0.3605, - "step": 1064 - }, - { - "epoch": 0.7442348008385744, - "grad_norm": 12.481935501098633, - "learning_rate": 9.331100255592437e-07, - "loss": 0.2544, - "step": 1065 - }, - { - "epoch": 0.7449336128581412, - "grad_norm": 19.18770980834961, - "learning_rate": 9.283594965547846e-07, - "loss": 0.2931, - "step": 1066 - }, - { - "epoch": 0.7456324248777079, - "grad_norm": 26.039501190185547, - "learning_rate": 9.236183322886946e-07, - "loss": 0.4876, - "step": 1067 - }, - { - "epoch": 0.7463312368972747, - "grad_norm": 32.522987365722656, - "learning_rate": 9.188865610115572e-07, - "loss": 0.5723, - "step": 1068 - }, - { - "epoch": 0.7470300489168413, - "grad_norm": 16.446271896362305, - "learning_rate": 9.141642109179891e-07, - "loss": 0.3341, - "step": 1069 - }, - { - "epoch": 0.7477288609364081, - "grad_norm": 30.10618782043457, - "learning_rate": 9.094513101464697e-07, - "loss": 0.596, - "step": 1070 - }, - { - "epoch": 0.7484276729559748, - "grad_norm": 53.186527252197266, - "learning_rate": 9.047478867791732e-07, - "loss": 0.9323, - "step": 1071 - }, - { - "epoch": 0.7491264849755416, - "grad_norm": 16.313310623168945, - "learning_rate": 9.000539688418017e-07, - "loss": 0.2225, - "step": 1072 - }, - { - "epoch": 0.7498252969951084, - "grad_norm": 39.722625732421875, - "learning_rate": 8.953695843034179e-07, - "loss": 0.5108, - "step": 1073 - }, - { - "epoch": 0.750524109014675, - "grad_norm": 21.892547607421875, - "learning_rate": 8.906947610762826e-07, - "loss": 0.4294, - "step": 1074 - }, - { - "epoch": 0.7512229210342418, - "grad_norm": 45.99984359741211, - "learning_rate": 8.860295270156804e-07, - "loss": 0.8281, - "step": 1075 - }, - { - "epoch": 0.7519217330538085, - "grad_norm": 32.13447952270508, - "learning_rate": 8.813739099197597e-07, - "loss": 0.9, - "step": 1076 - }, - { - "epoch": 0.7526205450733753, - "grad_norm": 16.832719802856445, - "learning_rate": 8.767279375293672e-07, - "loss": 0.3074, - "step": 1077 - }, - { - "epoch": 0.753319357092942, - "grad_norm": 16.95676040649414, - "learning_rate": 8.720916375278782e-07, - "loss": 0.2203, - "step": 1078 - }, - { - "epoch": 0.7540181691125087, - "grad_norm": 19.983858108520508, - "learning_rate": 8.674650375410379e-07, - "loss": 0.4076, - "step": 1079 - }, - { - "epoch": 0.7547169811320755, - "grad_norm": 21.438579559326172, - "learning_rate": 8.628481651367876e-07, - "loss": 0.4041, - "step": 1080 - }, - { - "epoch": 0.7554157931516422, - "grad_norm": 6.569042682647705, - "learning_rate": 8.582410478251119e-07, - "loss": 0.0916, - "step": 1081 - }, - { - "epoch": 0.756114605171209, - "grad_norm": 14.822148323059082, - "learning_rate": 8.536437130578648e-07, - "loss": 0.2316, - "step": 1082 - }, - { - "epoch": 0.7568134171907757, - "grad_norm": 17.814233779907227, - "learning_rate": 8.490561882286136e-07, - "loss": 0.2503, - "step": 1083 - }, - { - "epoch": 0.7575122292103424, - "grad_norm": 16.003833770751953, - "learning_rate": 8.444785006724698e-07, - "loss": 0.3499, - "step": 1084 - }, - { - "epoch": 0.7582110412299091, - "grad_norm": 23.80466079711914, - "learning_rate": 8.399106776659291e-07, - "loss": 0.2041, - "step": 1085 - }, - { - "epoch": 0.7589098532494759, - "grad_norm": 3.951340913772583, - "learning_rate": 8.353527464267105e-07, - "loss": 0.0424, - "step": 1086 - }, - { - "epoch": 0.7596086652690426, - "grad_norm": 25.488731384277344, - "learning_rate": 8.308047341135899e-07, - "loss": 0.3805, - "step": 1087 - }, - { - "epoch": 0.7603074772886094, - "grad_norm": 21.661376953125, - "learning_rate": 8.262666678262415e-07, - "loss": 0.2974, - "step": 1088 - }, - { - "epoch": 0.7610062893081762, - "grad_norm": 14.710705757141113, - "learning_rate": 8.217385746050743e-07, - "loss": 0.2444, - "step": 1089 - }, - { - "epoch": 0.7617051013277428, - "grad_norm": 21.718036651611328, - "learning_rate": 8.172204814310741e-07, - "loss": 0.2034, - "step": 1090 - }, - { - "epoch": 0.7624039133473096, - "grad_norm": 15.717751502990723, - "learning_rate": 8.127124152256408e-07, - "loss": 0.3216, - "step": 1091 - }, - { - "epoch": 0.7631027253668763, - "grad_norm": 11.18538761138916, - "learning_rate": 8.082144028504233e-07, - "loss": 0.1174, - "step": 1092 - }, - { - "epoch": 0.7638015373864431, - "grad_norm": 0.560878574848175, - "learning_rate": 8.037264711071699e-07, - "loss": 0.0118, - "step": 1093 - }, - { - "epoch": 0.7645003494060097, - "grad_norm": 22.066762924194336, - "learning_rate": 7.992486467375585e-07, - "loss": 0.4938, - "step": 1094 - }, - { - "epoch": 0.7651991614255765, - "grad_norm": 7.879112243652344, - "learning_rate": 7.947809564230446e-07, - "loss": 0.1014, - "step": 1095 - }, - { - "epoch": 0.7658979734451432, - "grad_norm": 6.53630256652832, - "learning_rate": 7.903234267846965e-07, - "loss": 0.0612, - "step": 1096 - }, - { - "epoch": 0.76659678546471, - "grad_norm": 6.8303961753845215, - "learning_rate": 7.858760843830402e-07, - "loss": 0.0902, - "step": 1097 - }, - { - "epoch": 0.7672955974842768, - "grad_norm": 8.56202507019043, - "learning_rate": 7.814389557179017e-07, - "loss": 0.1365, - "step": 1098 - }, - { - "epoch": 0.7679944095038435, - "grad_norm": 5.431033611297607, - "learning_rate": 7.770120672282458e-07, - "loss": 0.0647, - "step": 1099 - }, - { - "epoch": 0.7686932215234102, - "grad_norm": 8.112288475036621, - "learning_rate": 7.725954452920212e-07, - "loss": 0.1713, - "step": 1100 - }, - { - "epoch": 0.7693920335429769, - "grad_norm": 8.673176765441895, - "learning_rate": 7.681891162260016e-07, - "loss": 0.0855, - "step": 1101 - }, - { - "epoch": 0.7700908455625437, - "grad_norm": 13.275389671325684, - "learning_rate": 7.637931062856324e-07, - "loss": 0.1583, - "step": 1102 - }, - { - "epoch": 0.7707896575821104, - "grad_norm": 16.91658592224121, - "learning_rate": 7.594074416648689e-07, - "loss": 0.2911, - "step": 1103 - }, - { - "epoch": 0.7714884696016772, - "grad_norm": 8.190262794494629, - "learning_rate": 7.550321484960252e-07, - "loss": 0.099, - "step": 1104 - }, - { - "epoch": 0.7721872816212438, - "grad_norm": 19.804262161254883, - "learning_rate": 7.506672528496148e-07, - "loss": 0.3536, - "step": 1105 - }, - { - "epoch": 0.7728860936408106, - "grad_norm": 0.4071040749549866, - "learning_rate": 7.463127807341966e-07, - "loss": 0.006, - "step": 1106 - }, - { - "epoch": 0.7735849056603774, - "grad_norm": 19.50054931640625, - "learning_rate": 7.419687580962223e-07, - "loss": 0.3748, - "step": 1107 - }, - { - "epoch": 0.7742837176799441, - "grad_norm": 30.91229248046875, - "learning_rate": 7.376352108198776e-07, - "loss": 0.5882, - "step": 1108 - }, - { - "epoch": 0.7749825296995109, - "grad_norm": 3.1740357875823975, - "learning_rate": 7.3331216472693e-07, - "loss": 0.045, - "step": 1109 - }, - { - "epoch": 0.7756813417190775, - "grad_norm": 5.362880229949951, - "learning_rate": 7.289996455765749e-07, - "loss": 0.0537, - "step": 1110 - }, - { - "epoch": 0.7763801537386443, - "grad_norm": 6.516132354736328, - "learning_rate": 7.246976790652843e-07, - "loss": 0.1104, - "step": 1111 - }, - { - "epoch": 0.777078965758211, - "grad_norm": 23.61691665649414, - "learning_rate": 7.204062908266491e-07, - "loss": 0.5486, - "step": 1112 - }, - { - "epoch": 0.7777777777777778, - "grad_norm": 50.94094467163086, - "learning_rate": 7.161255064312284e-07, - "loss": 0.7642, - "step": 1113 - }, - { - "epoch": 0.7784765897973445, - "grad_norm": 11.640260696411133, - "learning_rate": 7.118553513864002e-07, - "loss": 0.1143, - "step": 1114 - }, - { - "epoch": 0.7791754018169113, - "grad_norm": 29.011348724365234, - "learning_rate": 7.075958511362038e-07, - "loss": 0.4451, - "step": 1115 - }, - { - "epoch": 0.779874213836478, - "grad_norm": 0.5991443991661072, - "learning_rate": 7.033470310611945e-07, - "loss": 0.0139, - "step": 1116 - }, - { - "epoch": 0.7805730258560447, - "grad_norm": 32.352317810058594, - "learning_rate": 6.991089164782839e-07, - "loss": 0.4879, - "step": 1117 - }, - { - "epoch": 0.7812718378756115, - "grad_norm": 12.8659086227417, - "learning_rate": 6.948815326405994e-07, - "loss": 0.2705, - "step": 1118 - }, - { - "epoch": 0.7819706498951782, - "grad_norm": 10.678431510925293, - "learning_rate": 6.906649047373246e-07, - "loss": 0.1254, - "step": 1119 - }, - { - "epoch": 0.782669461914745, - "grad_norm": 12.482305526733398, - "learning_rate": 6.864590578935562e-07, - "loss": 0.2328, - "step": 1120 - }, - { - "epoch": 0.7833682739343116, - "grad_norm": 10.02873420715332, - "learning_rate": 6.822640171701486e-07, - "loss": 0.1594, - "step": 1121 - }, - { - "epoch": 0.7840670859538784, - "grad_norm": 14.333048820495605, - "learning_rate": 6.780798075635675e-07, - "loss": 0.2723, - "step": 1122 - }, - { - "epoch": 0.7847658979734451, - "grad_norm": 30.315757751464844, - "learning_rate": 6.739064540057425e-07, - "loss": 0.6848, - "step": 1123 - }, - { - "epoch": 0.7854647099930119, - "grad_norm": 25.60342788696289, - "learning_rate": 6.697439813639145e-07, - "loss": 0.4684, - "step": 1124 - }, - { - "epoch": 0.7861635220125787, - "grad_norm": 0.3895127475261688, - "learning_rate": 6.655924144404907e-07, - "loss": 0.0098, - "step": 1125 - }, - { - "epoch": 0.7868623340321453, - "grad_norm": 13.570385932922363, - "learning_rate": 6.614517779728943e-07, - "loss": 0.289, - "step": 1126 - }, - { - "epoch": 0.7875611460517121, - "grad_norm": 68.9692611694336, - "learning_rate": 6.573220966334207e-07, - "loss": 0.8583, - "step": 1127 - }, - { - "epoch": 0.7882599580712788, - "grad_norm": 15.602972030639648, - "learning_rate": 6.532033950290887e-07, - "loss": 0.2941, - "step": 1128 - }, - { - "epoch": 0.7889587700908456, - "grad_norm": 3.4355599880218506, - "learning_rate": 6.490956977014892e-07, - "loss": 0.0543, - "step": 1129 - }, - { - "epoch": 0.7896575821104123, - "grad_norm": 17.661762237548828, - "learning_rate": 6.449990291266486e-07, - "loss": 0.3505, - "step": 1130 - }, - { - "epoch": 0.790356394129979, - "grad_norm": 13.397007942199707, - "learning_rate": 6.409134137148737e-07, - "loss": 0.2101, - "step": 1131 - }, - { - "epoch": 0.7910552061495457, - "grad_norm": 2.0868258476257324, - "learning_rate": 6.368388758106134e-07, - "loss": 0.0249, - "step": 1132 - }, - { - "epoch": 0.7917540181691125, - "grad_norm": 40.14077377319336, - "learning_rate": 6.32775439692308e-07, - "loss": 0.9145, - "step": 1133 - }, - { - "epoch": 0.7924528301886793, - "grad_norm": 4.327674388885498, - "learning_rate": 6.28723129572247e-07, - "loss": 0.0544, - "step": 1134 - }, - { - "epoch": 0.793151642208246, - "grad_norm": 24.076866149902344, - "learning_rate": 6.246819695964274e-07, - "loss": 0.5572, - "step": 1135 - }, - { - "epoch": 0.7938504542278128, - "grad_norm": 10.14948558807373, - "learning_rate": 6.206519838444044e-07, - "loss": 0.176, - "step": 1136 - }, - { - "epoch": 0.7945492662473794, - "grad_norm": 24.600963592529297, - "learning_rate": 6.166331963291519e-07, - "loss": 0.3653, - "step": 1137 - }, - { - "epoch": 0.7952480782669462, - "grad_norm": 35.2950553894043, - "learning_rate": 6.126256309969172e-07, - "loss": 0.6447, - "step": 1138 - }, - { - "epoch": 0.7959468902865129, - "grad_norm": 1.5564249753952026, - "learning_rate": 6.086293117270822e-07, - "loss": 0.0146, - "step": 1139 - }, - { - "epoch": 0.7966457023060797, - "grad_norm": 33.24662399291992, - "learning_rate": 6.046442623320145e-07, - "loss": 0.4562, - "step": 1140 - }, - { - "epoch": 0.7973445143256463, - "grad_norm": 20.075868606567383, - "learning_rate": 6.006705065569329e-07, - "loss": 0.3227, - "step": 1141 - }, - { - "epoch": 0.7980433263452131, - "grad_norm": 43.78253936767578, - "learning_rate": 5.967080680797599e-07, - "loss": 0.912, - "step": 1142 - }, - { - "epoch": 0.7987421383647799, - "grad_norm": 9.017322540283203, - "learning_rate": 5.927569705109828e-07, - "loss": 0.1488, - "step": 1143 - }, - { - "epoch": 0.7994409503843466, - "grad_norm": 13.467599868774414, - "learning_rate": 5.888172373935161e-07, - "loss": 0.2121, - "step": 1144 - }, - { - "epoch": 0.8001397624039134, - "grad_norm": 8.279585838317871, - "learning_rate": 5.848888922025553e-07, - "loss": 0.1342, - "step": 1145 - }, - { - "epoch": 0.80083857442348, - "grad_norm": 16.24057388305664, - "learning_rate": 5.809719583454415e-07, - "loss": 0.2465, - "step": 1146 - }, - { - "epoch": 0.8015373864430468, - "grad_norm": 7.338842391967773, - "learning_rate": 5.770664591615191e-07, - "loss": 0.1038, - "step": 1147 - }, - { - "epoch": 0.8022361984626135, - "grad_norm": 30.530437469482422, - "learning_rate": 5.731724179220008e-07, - "loss": 0.5192, - "step": 1148 - }, - { - "epoch": 0.8029350104821803, - "grad_norm": 17.99674415588379, - "learning_rate": 5.692898578298253e-07, - "loss": 0.3013, - "step": 1149 - }, - { - "epoch": 0.803633822501747, - "grad_norm": 18.61930274963379, - "learning_rate": 5.654188020195173e-07, - "loss": 0.295, - "step": 1150 - }, - { - "epoch": 0.8043326345213138, - "grad_norm": 7.76812744140625, - "learning_rate": 5.615592735570563e-07, - "loss": 0.1041, - "step": 1151 - }, - { - "epoch": 0.8050314465408805, - "grad_norm": 15.843855857849121, - "learning_rate": 5.57711295439732e-07, - "loss": 0.3321, - "step": 1152 - }, - { - "epoch": 0.8057302585604472, - "grad_norm": 17.76589012145996, - "learning_rate": 5.538748905960145e-07, - "loss": 0.2668, - "step": 1153 - }, - { - "epoch": 0.806429070580014, - "grad_norm": 11.344962120056152, - "learning_rate": 5.500500818854079e-07, - "loss": 0.1651, - "step": 1154 - }, - { - "epoch": 0.8071278825995807, - "grad_norm": 6.680407524108887, - "learning_rate": 5.462368920983249e-07, - "loss": 0.0716, - "step": 1155 - }, - { - "epoch": 0.8078266946191475, - "grad_norm": 10.928862571716309, - "learning_rate": 5.424353439559446e-07, - "loss": 0.2133, - "step": 1156 - }, - { - "epoch": 0.8085255066387141, - "grad_norm": 7.662057876586914, - "learning_rate": 5.386454601100774e-07, - "loss": 0.0991, - "step": 1157 - }, - { - "epoch": 0.8092243186582809, - "grad_norm": 18.08972930908203, - "learning_rate": 5.348672631430319e-07, - "loss": 0.4483, - "step": 1158 - }, - { - "epoch": 0.8099231306778477, - "grad_norm": 8.420894622802734, - "learning_rate": 5.311007755674786e-07, - "loss": 0.1444, - "step": 1159 - }, - { - "epoch": 0.8106219426974144, - "grad_norm": 14.010246276855469, - "learning_rate": 5.273460198263192e-07, - "loss": 0.2457, - "step": 1160 - }, - { - "epoch": 0.8113207547169812, - "grad_norm": 16.338651657104492, - "learning_rate": 5.236030182925475e-07, - "loss": 0.3208, - "step": 1161 - }, - { - "epoch": 0.8120195667365478, - "grad_norm": 11.418418884277344, - "learning_rate": 5.1987179326912e-07, - "loss": 0.1766, - "step": 1162 - }, - { - "epoch": 0.8127183787561146, - "grad_norm": 15.831872940063477, - "learning_rate": 5.161523669888235e-07, - "loss": 0.2866, - "step": 1163 - }, - { - "epoch": 0.8134171907756813, - "grad_norm": 31.641719818115234, - "learning_rate": 5.124447616141382e-07, - "loss": 0.7301, - "step": 1164 - }, - { - "epoch": 0.8141160027952481, - "grad_norm": 3.373783588409424, - "learning_rate": 5.087489992371114e-07, - "loss": 0.0255, - "step": 1165 - }, - { - "epoch": 0.8148148148148148, - "grad_norm": 29.973318099975586, - "learning_rate": 5.050651018792213e-07, - "loss": 0.5794, - "step": 1166 - }, - { - "epoch": 0.8155136268343816, - "grad_norm": 0.6115692257881165, - "learning_rate": 5.013930914912477e-07, - "loss": 0.0123, - "step": 1167 - }, - { - "epoch": 0.8162124388539483, - "grad_norm": 6.854272365570068, - "learning_rate": 4.977329899531405e-07, - "loss": 0.0808, - "step": 1168 - }, - { - "epoch": 0.816911250873515, - "grad_norm": 18.352251052856445, - "learning_rate": 4.94084819073892e-07, - "loss": 0.3883, - "step": 1169 - }, - { - "epoch": 0.8176100628930818, - "grad_norm": 29.763479232788086, - "learning_rate": 4.904486005914027e-07, - "loss": 0.4594, - "step": 1170 - }, - { - "epoch": 0.8183088749126485, - "grad_norm": 2.543560266494751, - "learning_rate": 4.868243561723535e-07, - "loss": 0.0395, - "step": 1171 - }, - { - "epoch": 0.8190076869322153, - "grad_norm": 28.25546646118164, - "learning_rate": 4.832121074120794e-07, - "loss": 0.546, - "step": 1172 - }, - { - "epoch": 0.8197064989517819, - "grad_norm": 8.269977569580078, - "learning_rate": 4.796118758344354e-07, - "loss": 0.0658, - "step": 1173 - }, - { - "epoch": 0.8204053109713487, - "grad_norm": 19.697662353515625, - "learning_rate": 4.7602368289167477e-07, - "loss": 0.3166, - "step": 1174 - }, - { - "epoch": 0.8211041229909154, - "grad_norm": 3.3113882541656494, - "learning_rate": 4.7244754996431273e-07, - "loss": 0.04, - "step": 1175 - }, - { - "epoch": 0.8218029350104822, - "grad_norm": 4.408804893493652, - "learning_rate": 4.688834983610083e-07, - "loss": 0.0377, - "step": 1176 - }, - { - "epoch": 0.822501747030049, - "grad_norm": 14.940951347351074, - "learning_rate": 4.653315493184321e-07, - "loss": 0.3609, - "step": 1177 - }, - { - "epoch": 0.8232005590496156, - "grad_norm": 10.80850601196289, - "learning_rate": 4.617917240011394e-07, - "loss": 0.1914, - "step": 1178 - }, - { - "epoch": 0.8238993710691824, - "grad_norm": 10.918196678161621, - "learning_rate": 4.5826404350144597e-07, - "loss": 0.1193, - "step": 1179 - }, - { - "epoch": 0.8245981830887491, - "grad_norm": 11.238945007324219, - "learning_rate": 4.5474852883930163e-07, - "loss": 0.1728, - "step": 1180 - }, - { - "epoch": 0.8252969951083159, - "grad_norm": 8.476938247680664, - "learning_rate": 4.512452009621665e-07, - "loss": 0.1252, - "step": 1181 - }, - { - "epoch": 0.8259958071278826, - "grad_norm": 39.37675094604492, - "learning_rate": 4.4775408074488326e-07, - "loss": 0.8467, - "step": 1182 - }, - { - "epoch": 0.8266946191474493, - "grad_norm": 15.164828300476074, - "learning_rate": 4.4427518898955383e-07, - "loss": 0.2857, - "step": 1183 - }, - { - "epoch": 0.827393431167016, - "grad_norm": 26.918004989624023, - "learning_rate": 4.4080854642541833e-07, - "loss": 0.5531, - "step": 1184 - }, - { - "epoch": 0.8280922431865828, - "grad_norm": 14.009383201599121, - "learning_rate": 4.373541737087264e-07, - "loss": 0.22, - "step": 1185 - }, - { - "epoch": 0.8287910552061496, - "grad_norm": 43.12004089355469, - "learning_rate": 4.3391209142261996e-07, - "loss": 0.5798, - "step": 1186 - }, - { - "epoch": 0.8294898672257163, - "grad_norm": 18.772907257080078, - "learning_rate": 4.3048232007700363e-07, - "loss": 0.2251, - "step": 1187 - }, - { - "epoch": 0.8301886792452831, - "grad_norm": 11.540468215942383, - "learning_rate": 4.2706488010842957e-07, - "loss": 0.1528, - "step": 1188 - }, - { - "epoch": 0.8308874912648497, - "grad_norm": 0.327936589717865, - "learning_rate": 4.2365979187997094e-07, - "loss": 0.004, - "step": 1189 - }, - { - "epoch": 0.8315863032844165, - "grad_norm": 2.8561463356018066, - "learning_rate": 4.202670756811028e-07, - "loss": 0.0308, - "step": 1190 - }, - { - "epoch": 0.8322851153039832, - "grad_norm": 10.796645164489746, - "learning_rate": 4.168867517275807e-07, - "loss": 0.1616, - "step": 1191 - }, - { - "epoch": 0.83298392732355, - "grad_norm": 11.712217330932617, - "learning_rate": 4.1351884016131826e-07, - "loss": 0.1234, - "step": 1192 - }, - { - "epoch": 0.8336827393431167, - "grad_norm": 33.217811584472656, - "learning_rate": 4.101633610502717e-07, - "loss": 0.3849, - "step": 1193 - }, - { - "epoch": 0.8343815513626834, - "grad_norm": 19.651063919067383, - "learning_rate": 4.0682033438831593e-07, - "loss": 0.3681, - "step": 1194 - }, - { - "epoch": 0.8350803633822502, - "grad_norm": 44.6031379699707, - "learning_rate": 4.034897800951268e-07, - "loss": 0.8734, - "step": 1195 - }, - { - "epoch": 0.8357791754018169, - "grad_norm": 17.963214874267578, - "learning_rate": 4.0017171801606245e-07, - "loss": 0.2907, - "step": 1196 - }, - { - "epoch": 0.8364779874213837, - "grad_norm": 2.867605686187744, - "learning_rate": 3.9686616792204677e-07, - "loss": 0.0381, - "step": 1197 - }, - { - "epoch": 0.8371767994409504, - "grad_norm": 0.34205174446105957, - "learning_rate": 3.9357314950944943e-07, - "loss": 0.0069, - "step": 1198 - }, - { - "epoch": 0.8378756114605171, - "grad_norm": 6.6677751541137695, - "learning_rate": 3.902926823999681e-07, - "loss": 0.0718, - "step": 1199 - }, - { - "epoch": 0.8385744234800838, - "grad_norm": 11.719327926635742, - "learning_rate": 3.8702478614051353e-07, - "loss": 0.1409, - "step": 1200 - }, - { - "epoch": 0.8392732354996506, - "grad_norm": 11.700016021728516, - "learning_rate": 3.8376948020309083e-07, - "loss": 0.1951, - "step": 1201 - }, - { - "epoch": 0.8399720475192173, - "grad_norm": 25.646282196044922, - "learning_rate": 3.805267839846874e-07, - "loss": 0.2608, - "step": 1202 - }, - { - "epoch": 0.8406708595387841, - "grad_norm": 29.2886905670166, - "learning_rate": 3.7729671680715175e-07, - "loss": 0.6764, - "step": 1203 - }, - { - "epoch": 0.8413696715583509, - "grad_norm": 4.25199556350708, - "learning_rate": 3.7407929791708264e-07, - "loss": 0.0447, - "step": 1204 - }, - { - "epoch": 0.8420684835779175, - "grad_norm": 15.59027099609375, - "learning_rate": 3.7087454648571385e-07, - "loss": 0.1857, - "step": 1205 - }, - { - "epoch": 0.8427672955974843, - "grad_norm": 31.77241325378418, - "learning_rate": 3.6768248160879786e-07, - "loss": 0.6149, - "step": 1206 - }, - { - "epoch": 0.843466107617051, - "grad_norm": 6.0527729988098145, - "learning_rate": 3.645031223064935e-07, - "loss": 0.0655, - "step": 1207 - }, - { - "epoch": 0.8441649196366178, - "grad_norm": 12.311010360717773, - "learning_rate": 3.6133648752325253e-07, - "loss": 0.2465, - "step": 1208 - }, - { - "epoch": 0.8448637316561844, - "grad_norm": 7.464306354522705, - "learning_rate": 3.5818259612770746e-07, - "loss": 0.09, - "step": 1209 - }, - { - "epoch": 0.8455625436757512, - "grad_norm": 13.457850456237793, - "learning_rate": 3.5504146691255736e-07, - "loss": 0.236, - "step": 1210 - }, - { - "epoch": 0.8462613556953179, - "grad_norm": 7.328340530395508, - "learning_rate": 3.51913118594458e-07, - "loss": 0.1122, - "step": 1211 - }, - { - "epoch": 0.8469601677148847, - "grad_norm": 33.875457763671875, - "learning_rate": 3.487975698139084e-07, - "loss": 0.5184, - "step": 1212 - }, - { - "epoch": 0.8476589797344515, - "grad_norm": 12.644630432128906, - "learning_rate": 3.4569483913514003e-07, - "loss": 0.2363, - "step": 1213 - }, - { - "epoch": 0.8483577917540182, - "grad_norm": 9.978564262390137, - "learning_rate": 3.426049450460084e-07, - "loss": 0.1615, - "step": 1214 - }, - { - "epoch": 0.8490566037735849, - "grad_norm": 7.424810886383057, - "learning_rate": 3.3952790595787986e-07, - "loss": 0.1235, - "step": 1215 - }, - { - "epoch": 0.8497554157931516, - "grad_norm": 19.88933563232422, - "learning_rate": 3.364637402055235e-07, - "loss": 0.3339, - "step": 1216 - }, - { - "epoch": 0.8504542278127184, - "grad_norm": 15.566442489624023, - "learning_rate": 3.334124660470017e-07, - "loss": 0.2928, - "step": 1217 - }, - { - "epoch": 0.8511530398322851, - "grad_norm": 8.519716262817383, - "learning_rate": 3.3037410166356144e-07, - "loss": 0.0591, - "step": 1218 - }, - { - "epoch": 0.8518518518518519, - "grad_norm": 9.44159984588623, - "learning_rate": 3.2734866515952754e-07, - "loss": 0.1245, - "step": 1219 - }, - { - "epoch": 0.8525506638714185, - "grad_norm": 24.32775115966797, - "learning_rate": 3.2433617456218894e-07, - "loss": 0.6771, - "step": 1220 - }, - { - "epoch": 0.8532494758909853, - "grad_norm": 4.077937126159668, - "learning_rate": 3.2133664782169947e-07, - "loss": 0.0482, - "step": 1221 - }, - { - "epoch": 0.8539482879105521, - "grad_norm": 21.460363388061523, - "learning_rate": 3.1835010281096426e-07, - "loss": 0.5593, - "step": 1222 - }, - { - "epoch": 0.8546470999301188, - "grad_norm": 10.10963249206543, - "learning_rate": 3.153765573255377e-07, - "loss": 0.1257, - "step": 1223 - }, - { - "epoch": 0.8553459119496856, - "grad_norm": 16.485063552856445, - "learning_rate": 3.1241602908351404e-07, - "loss": 0.4086, - "step": 1224 - }, - { - "epoch": 0.8560447239692522, - "grad_norm": 33.40040969848633, - "learning_rate": 3.0946853572542375e-07, - "loss": 0.6702, - "step": 1225 - }, - { - "epoch": 0.856743535988819, - "grad_norm": 5.947866439819336, - "learning_rate": 3.0653409481412906e-07, - "loss": 0.0523, - "step": 1226 - }, - { - "epoch": 0.8574423480083857, - "grad_norm": 7.11247444152832, - "learning_rate": 3.036127238347164e-07, - "loss": 0.0991, - "step": 1227 - }, - { - "epoch": 0.8581411600279525, - "grad_norm": 17.405256271362305, - "learning_rate": 3.007044401943951e-07, - "loss": 0.4326, - "step": 1228 - }, - { - "epoch": 0.8588399720475192, - "grad_norm": 1.9672048091888428, - "learning_rate": 2.9780926122239206e-07, - "loss": 0.0256, - "step": 1229 - }, - { - "epoch": 0.859538784067086, - "grad_norm": 44.16718673706055, - "learning_rate": 2.9492720416985004e-07, - "loss": 0.8448, - "step": 1230 - }, - { - "epoch": 0.8602375960866527, - "grad_norm": 0.45374947786331177, - "learning_rate": 2.9205828620972267e-07, - "loss": 0.0088, - "step": 1231 - }, - { - "epoch": 0.8609364081062194, - "grad_norm": 8.271926879882812, - "learning_rate": 2.892025244366736e-07, - "loss": 0.1034, - "step": 1232 - }, - { - "epoch": 0.8616352201257862, - "grad_norm": 9.046159744262695, - "learning_rate": 2.8635993586697555e-07, - "loss": 0.1309, - "step": 1233 - }, - { - "epoch": 0.8623340321453529, - "grad_norm": 17.490642547607422, - "learning_rate": 2.8353053743840534e-07, - "loss": 0.3128, - "step": 1234 - }, - { - "epoch": 0.8630328441649197, - "grad_norm": 41.88935470581055, - "learning_rate": 2.8071434601014774e-07, - "loss": 0.7318, - "step": 1235 - }, - { - "epoch": 0.8637316561844863, - "grad_norm": 22.86333465576172, - "learning_rate": 2.779113783626916e-07, - "loss": 0.5506, - "step": 1236 - }, - { - "epoch": 0.8644304682040531, - "grad_norm": 8.263748168945312, - "learning_rate": 2.751216511977303e-07, - "loss": 0.1022, - "step": 1237 - }, - { - "epoch": 0.8651292802236199, - "grad_norm": 32.37844467163086, - "learning_rate": 2.7234518113806316e-07, - "loss": 0.5176, - "step": 1238 - }, - { - "epoch": 0.8658280922431866, - "grad_norm": 6.2310051918029785, - "learning_rate": 2.695819847274972e-07, - "loss": 0.094, - "step": 1239 - }, - { - "epoch": 0.8665269042627534, - "grad_norm": 10.591695785522461, - "learning_rate": 2.668320784307457e-07, - "loss": 0.1435, - "step": 1240 - }, - { - "epoch": 0.86722571628232, - "grad_norm": 15.819378852844238, - "learning_rate": 2.6409547863333246e-07, - "loss": 0.2305, - "step": 1241 - }, - { - "epoch": 0.8679245283018868, - "grad_norm": 36.6091194152832, - "learning_rate": 2.613722016414944e-07, - "loss": 0.6671, - "step": 1242 - }, - { - "epoch": 0.8686233403214535, - "grad_norm": 11.718859672546387, - "learning_rate": 2.586622636820818e-07, - "loss": 0.1594, - "step": 1243 - }, - { - "epoch": 0.8693221523410203, - "grad_norm": 9.152027130126953, - "learning_rate": 2.5596568090246546e-07, - "loss": 0.1939, - "step": 1244 - }, - { - "epoch": 0.870020964360587, - "grad_norm": 3.220729112625122, - "learning_rate": 2.5328246937043526e-07, - "loss": 0.0395, - "step": 1245 - }, - { - "epoch": 0.8707197763801537, - "grad_norm": 18.78268814086914, - "learning_rate": 2.5061264507411057e-07, - "loss": 0.3532, - "step": 1246 - }, - { - "epoch": 0.8714185883997205, - "grad_norm": 0.3317777216434479, - "learning_rate": 2.4795622392184e-07, - "loss": 0.0075, - "step": 1247 - }, - { - "epoch": 0.8721174004192872, - "grad_norm": 10.846556663513184, - "learning_rate": 2.4531322174210976e-07, - "loss": 0.1979, - "step": 1248 - }, - { - "epoch": 0.872816212438854, - "grad_norm": 17.63131332397461, - "learning_rate": 2.4268365428344737e-07, - "loss": 0.2031, - "step": 1249 - }, - { - "epoch": 0.8735150244584207, - "grad_norm": 6.232522964477539, - "learning_rate": 2.4006753721432794e-07, - "loss": 0.1037, - "step": 1250 - }, - { - "epoch": 0.8742138364779874, - "grad_norm": 0.34121277928352356, - "learning_rate": 2.37464886123083e-07, - "loss": 0.0057, - "step": 1251 - }, - { - "epoch": 0.8749126484975541, - "grad_norm": 40.91691207885742, - "learning_rate": 2.3487571651780534e-07, - "loss": 0.6563, - "step": 1252 - }, - { - "epoch": 0.8756114605171209, - "grad_norm": 8.928706169128418, - "learning_rate": 2.3230004382625653e-07, - "loss": 0.1043, - "step": 1253 - }, - { - "epoch": 0.8763102725366876, - "grad_norm": 11.589432716369629, - "learning_rate": 2.2973788339577613e-07, - "loss": 0.2147, - "step": 1254 - }, - { - "epoch": 0.8770090845562544, - "grad_norm": 11.781576156616211, - "learning_rate": 2.271892504931905e-07, - "loss": 0.2171, - "step": 1255 - }, - { - "epoch": 0.8777078965758212, - "grad_norm": 5.011703968048096, - "learning_rate": 2.2465416030472227e-07, - "loss": 0.0539, - "step": 1256 - }, - { - "epoch": 0.8784067085953878, - "grad_norm": 3.3927674293518066, - "learning_rate": 2.2213262793589485e-07, - "loss": 0.0462, - "step": 1257 - }, - { - "epoch": 0.8791055206149546, - "grad_norm": 5.420595169067383, - "learning_rate": 2.196246684114506e-07, - "loss": 0.101, - "step": 1258 - }, - { - "epoch": 0.8798043326345213, - "grad_norm": 9.41069221496582, - "learning_rate": 2.1713029667525422e-07, - "loss": 0.0961, - "step": 1259 - }, - { - "epoch": 0.8805031446540881, - "grad_norm": 19.567420959472656, - "learning_rate": 2.1464952759020857e-07, - "loss": 0.3023, - "step": 1260 - }, - { - "epoch": 0.8812019566736548, - "grad_norm": 6.458042621612549, - "learning_rate": 2.1218237593816305e-07, - "loss": 0.0746, - "step": 1261 - }, - { - "epoch": 0.8819007686932215, - "grad_norm": 8.432520866394043, - "learning_rate": 2.0972885641982605e-07, - "loss": 0.1698, - "step": 1262 - }, - { - "epoch": 0.8825995807127882, - "grad_norm": 22.517902374267578, - "learning_rate": 2.0728898365467903e-07, - "loss": 0.3332, - "step": 1263 - }, - { - "epoch": 0.883298392732355, - "grad_norm": 25.901752471923828, - "learning_rate": 2.0486277218088796e-07, - "loss": 0.5327, - "step": 1264 - }, - { - "epoch": 0.8839972047519218, - "grad_norm": 12.131880760192871, - "learning_rate": 2.024502364552164e-07, - "loss": 0.1544, - "step": 1265 - }, - { - "epoch": 0.8846960167714885, - "grad_norm": 3.912961959838867, - "learning_rate": 2.0005139085293945e-07, - "loss": 0.0489, - "step": 1266 - }, - { - "epoch": 0.8853948287910552, - "grad_norm": 36.00802993774414, - "learning_rate": 1.9766624966776088e-07, - "loss": 0.6536, - "step": 1267 - }, - { - "epoch": 0.8860936408106219, - "grad_norm": 2.209369421005249, - "learning_rate": 1.9529482711172305e-07, - "loss": 0.0268, - "step": 1268 - }, - { - "epoch": 0.8867924528301887, - "grad_norm": 10.653560638427734, - "learning_rate": 1.9293713731512675e-07, - "loss": 0.1398, - "step": 1269 - }, - { - "epoch": 0.8874912648497554, - "grad_norm": 16.39984703063965, - "learning_rate": 1.9059319432644412e-07, - "loss": 0.2051, - "step": 1270 - }, - { - "epoch": 0.8881900768693222, - "grad_norm": 26.631011962890625, - "learning_rate": 1.882630121122353e-07, - "loss": 0.4272, - "step": 1271 - }, - { - "epoch": 0.8888888888888888, - "grad_norm": 9.047606468200684, - "learning_rate": 1.8594660455706764e-07, - "loss": 0.1233, - "step": 1272 - }, - { - "epoch": 0.8895877009084556, - "grad_norm": 6.452094554901123, - "learning_rate": 1.8364398546342978e-07, - "loss": 0.0654, - "step": 1273 - }, - { - "epoch": 0.8902865129280224, - "grad_norm": 5.8464579582214355, - "learning_rate": 1.813551685516507e-07, - "loss": 0.0886, - "step": 1274 - }, - { - "epoch": 0.8909853249475891, - "grad_norm": 5.251763343811035, - "learning_rate": 1.790801674598186e-07, - "loss": 0.0463, - "step": 1275 - }, - { - "epoch": 0.8916841369671559, - "grad_norm": 3.5294036865234375, - "learning_rate": 1.7681899574369916e-07, - "loss": 0.0258, - "step": 1276 - }, - { - "epoch": 0.8923829489867225, - "grad_norm": 2.3838448524475098, - "learning_rate": 1.745716668766545e-07, - "loss": 0.022, - "step": 1277 - }, - { - "epoch": 0.8930817610062893, - "grad_norm": 38.03437805175781, - "learning_rate": 1.723381942495625e-07, - "loss": 0.811, - "step": 1278 - }, - { - "epoch": 0.893780573025856, - "grad_norm": 9.917598724365234, - "learning_rate": 1.701185911707387e-07, - "loss": 0.1415, - "step": 1279 - }, - { - "epoch": 0.8944793850454228, - "grad_norm": 6.581984996795654, - "learning_rate": 1.679128708658548e-07, - "loss": 0.1164, - "step": 1280 - }, - { - "epoch": 0.8951781970649895, - "grad_norm": 29.610240936279297, - "learning_rate": 1.6572104647786247e-07, - "loss": 0.5961, - "step": 1281 - }, - { - "epoch": 0.8958770090845563, - "grad_norm": 17.084474563598633, - "learning_rate": 1.6354313106691083e-07, - "loss": 0.2784, - "step": 1282 - }, - { - "epoch": 0.896575821104123, - "grad_norm": 22.56829071044922, - "learning_rate": 1.6137913761027384e-07, - "loss": 0.3331, - "step": 1283 - }, - { - "epoch": 0.8972746331236897, - "grad_norm": 14.29967975616455, - "learning_rate": 1.592290790022702e-07, - "loss": 0.2789, - "step": 1284 - }, - { - "epoch": 0.8979734451432565, - "grad_norm": 6.18997859954834, - "learning_rate": 1.5709296805418523e-07, - "loss": 0.1076, - "step": 1285 - }, - { - "epoch": 0.8986722571628232, - "grad_norm": 22.442312240600586, - "learning_rate": 1.5497081749419745e-07, - "loss": 0.4934, - "step": 1286 - }, - { - "epoch": 0.89937106918239, - "grad_norm": 38.552181243896484, - "learning_rate": 1.5286263996730027e-07, - "loss": 0.6428, - "step": 1287 - }, - { - "epoch": 0.9000698812019566, - "grad_norm": 8.054842948913574, - "learning_rate": 1.507684480352292e-07, - "loss": 0.0955, - "step": 1288 - }, - { - "epoch": 0.9007686932215234, - "grad_norm": 5.846706390380859, - "learning_rate": 1.4868825417638427e-07, - "loss": 0.0617, - "step": 1289 - }, - { - "epoch": 0.9014675052410901, - "grad_norm": 7.911886215209961, - "learning_rate": 1.4662207078575685e-07, - "loss": 0.0648, - "step": 1290 - }, - { - "epoch": 0.9021663172606569, - "grad_norm": 11.648468971252441, - "learning_rate": 1.4456991017485737e-07, - "loss": 0.1457, - "step": 1291 - }, - { - "epoch": 0.9028651292802237, - "grad_norm": 18.855669021606445, - "learning_rate": 1.425317845716384e-07, - "loss": 0.3698, - "step": 1292 - }, - { - "epoch": 0.9035639412997903, - "grad_norm": 19.89934539794922, - "learning_rate": 1.4050770612042603e-07, - "loss": 0.2361, - "step": 1293 - }, - { - "epoch": 0.9042627533193571, - "grad_norm": 10.439923286437988, - "learning_rate": 1.3849768688184357e-07, - "loss": 0.1834, - "step": 1294 - }, - { - "epoch": 0.9049615653389238, - "grad_norm": 11.241701126098633, - "learning_rate": 1.365017388327422e-07, - "loss": 0.1202, - "step": 1295 - }, - { - "epoch": 0.9056603773584906, - "grad_norm": 0.5028015971183777, - "learning_rate": 1.3451987386612852e-07, - "loss": 0.009, - "step": 1296 - }, - { - "epoch": 0.9063591893780573, - "grad_norm": 34.43354797363281, - "learning_rate": 1.3255210379109485e-07, - "loss": 0.6384, - "step": 1297 - }, - { - "epoch": 0.907058001397624, - "grad_norm": 29.990886688232422, - "learning_rate": 1.3059844033274733e-07, - "loss": 0.358, - "step": 1298 - }, - { - "epoch": 0.9077568134171907, - "grad_norm": 32.4084358215332, - "learning_rate": 1.286588951321363e-07, - "loss": 0.4848, - "step": 1299 - }, - { - "epoch": 0.9084556254367575, - "grad_norm": 12.453899383544922, - "learning_rate": 1.267334797461886e-07, - "loss": 0.1846, - "step": 1300 - }, - { - "epoch": 0.9091544374563243, - "grad_norm": 14.791337013244629, - "learning_rate": 1.2482220564763669e-07, - "loss": 0.3099, - "step": 1301 - }, - { - "epoch": 0.909853249475891, - "grad_norm": 14.566367149353027, - "learning_rate": 1.2292508422495158e-07, - "loss": 0.2153, - "step": 1302 - }, - { - "epoch": 0.9105520614954578, - "grad_norm": 32.48210144042969, - "learning_rate": 1.210421267822734e-07, - "loss": 0.4966, - "step": 1303 - }, - { - "epoch": 0.9112508735150244, - "grad_norm": 16.934797286987305, - "learning_rate": 1.191733445393467e-07, - "loss": 0.2415, - "step": 1304 - }, - { - "epoch": 0.9119496855345912, - "grad_norm": 3.7917070388793945, - "learning_rate": 1.1731874863145143e-07, - "loss": 0.0432, - "step": 1305 - }, - { - "epoch": 0.9126484975541579, - "grad_norm": 20.115760803222656, - "learning_rate": 1.154783501093365e-07, - "loss": 0.2785, - "step": 1306 - }, - { - "epoch": 0.9133473095737247, - "grad_norm": 28.754680633544922, - "learning_rate": 1.1365215993915573e-07, - "loss": 0.594, - "step": 1307 - }, - { - "epoch": 0.9140461215932913, - "grad_norm": 14.853599548339844, - "learning_rate": 1.1184018900240012e-07, - "loss": 0.265, - "step": 1308 - }, - { - "epoch": 0.9147449336128581, - "grad_norm": 31.062646865844727, - "learning_rate": 1.1004244809583591e-07, - "loss": 0.6687, - "step": 1309 - }, - { - "epoch": 0.9154437456324249, - "grad_norm": 5.976188659667969, - "learning_rate": 1.0825894793143721e-07, - "loss": 0.0938, - "step": 1310 - }, - { - "epoch": 0.9161425576519916, - "grad_norm": 11.048742294311523, - "learning_rate": 1.0648969913632401e-07, - "loss": 0.1308, - "step": 1311 - }, - { - "epoch": 0.9168413696715584, - "grad_norm": 11.694340705871582, - "learning_rate": 1.0473471225269898e-07, - "loss": 0.1481, - "step": 1312 - }, - { - "epoch": 0.9175401816911251, - "grad_norm": 2.9253950119018555, - "learning_rate": 1.0299399773778362e-07, - "loss": 0.0362, - "step": 1313 - }, - { - "epoch": 0.9182389937106918, - "grad_norm": 18.658864974975586, - "learning_rate": 1.0126756596375687e-07, - "loss": 0.3327, - "step": 1314 - }, - { - "epoch": 0.9189378057302585, - "grad_norm": 4.727551460266113, - "learning_rate": 9.955542721769156e-08, - "loss": 0.0634, - "step": 1315 - }, - { - "epoch": 0.9196366177498253, - "grad_norm": 6.097905158996582, - "learning_rate": 9.785759170149622e-08, - "loss": 0.0655, - "step": 1316 - }, - { - "epoch": 0.9203354297693921, - "grad_norm": 7.615106105804443, - "learning_rate": 9.617406953185138e-08, - "loss": 0.0755, - "step": 1317 - }, - { - "epoch": 0.9210342417889588, - "grad_norm": 25.990507125854492, - "learning_rate": 9.450487074015108e-08, - "loss": 0.5468, - "step": 1318 - }, - { - "epoch": 0.9217330538085255, - "grad_norm": 6.017211437225342, - "learning_rate": 9.285000527244181e-08, - "loss": 0.0774, - "step": 1319 - }, - { - "epoch": 0.9224318658280922, - "grad_norm": 8.59233283996582, - "learning_rate": 9.120948298936422e-08, - "loss": 0.1223, - "step": 1320 - }, - { - "epoch": 0.923130677847659, - "grad_norm": 36.531856536865234, - "learning_rate": 8.958331366609424e-08, - "loss": 0.5861, - "step": 1321 - }, - { - "epoch": 0.9238294898672257, - "grad_norm": 19.516033172607422, - "learning_rate": 8.797150699228374e-08, - "loss": 0.3335, - "step": 1322 - }, - { - "epoch": 0.9245283018867925, - "grad_norm": 17.62247657775879, - "learning_rate": 8.637407257200498e-08, - "loss": 0.4359, - "step": 1323 - }, - { - "epoch": 0.9252271139063591, - "grad_norm": 10.937044143676758, - "learning_rate": 8.479101992369038e-08, - "loss": 0.1565, - "step": 1324 - }, - { - "epoch": 0.9259259259259259, - "grad_norm": 6.678995132446289, - "learning_rate": 8.322235848007898e-08, - "loss": 0.0869, - "step": 1325 - }, - { - "epoch": 0.9266247379454927, - "grad_norm": 19.89664649963379, - "learning_rate": 8.166809758815897e-08, - "loss": 0.36, - "step": 1326 - }, - { - "epoch": 0.9273235499650594, - "grad_norm": 35.38471221923828, - "learning_rate": 8.012824650910938e-08, - "loss": 0.7401, - "step": 1327 - }, - { - "epoch": 0.9280223619846262, - "grad_norm": 23.878799438476562, - "learning_rate": 7.860281441825018e-08, - "loss": 0.4833, - "step": 1328 - }, - { - "epoch": 0.9287211740041929, - "grad_norm": 13.079452514648438, - "learning_rate": 7.709181040498253e-08, - "loss": 0.1611, - "step": 1329 - }, - { - "epoch": 0.9294199860237596, - "grad_norm": 26.776845932006836, - "learning_rate": 7.559524347273861e-08, - "loss": 0.4894, - "step": 1330 - }, - { - "epoch": 0.9301187980433263, - "grad_norm": 7.918013572692871, - "learning_rate": 7.411312253892466e-08, - "loss": 0.0912, - "step": 1331 - }, - { - "epoch": 0.9308176100628931, - "grad_norm": 14.908166885375977, - "learning_rate": 7.264545643486997e-08, - "loss": 0.1744, - "step": 1332 - }, - { - "epoch": 0.9315164220824598, - "grad_norm": 12.482393264770508, - "learning_rate": 7.119225390577383e-08, - "loss": 0.1415, - "step": 1333 - }, - { - "epoch": 0.9322152341020266, - "grad_norm": 15.868339538574219, - "learning_rate": 6.975352361065307e-08, - "loss": 0.2916, - "step": 1334 - }, - { - "epoch": 0.9329140461215933, - "grad_norm": 11.9424409866333, - "learning_rate": 6.832927412229017e-08, - "loss": 0.1109, - "step": 1335 - }, - { - "epoch": 0.93361285814116, - "grad_norm": 14.300697326660156, - "learning_rate": 6.691951392718332e-08, - "loss": 0.234, - "step": 1336 - }, - { - "epoch": 0.9343116701607268, - "grad_norm": 15.891079902648926, - "learning_rate": 6.5524251425495e-08, - "loss": 0.2725, - "step": 1337 - }, - { - "epoch": 0.9350104821802935, - "grad_norm": 15.432860374450684, - "learning_rate": 6.414349493100131e-08, - "loss": 0.3189, - "step": 1338 - }, - { - "epoch": 0.9357092941998603, - "grad_norm": 11.384847640991211, - "learning_rate": 6.277725267104489e-08, - "loss": 0.1717, - "step": 1339 - }, - { - "epoch": 0.9364081062194269, - "grad_norm": 19.854578018188477, - "learning_rate": 6.142553278648239e-08, - "loss": 0.2997, - "step": 1340 - }, - { - "epoch": 0.9371069182389937, - "grad_norm": 19.603626251220703, - "learning_rate": 6.008834333163876e-08, - "loss": 0.2613, - "step": 1341 - }, - { - "epoch": 0.9378057302585604, - "grad_norm": 5.466430187225342, - "learning_rate": 5.876569227425855e-08, - "loss": 0.0723, - "step": 1342 - }, - { - "epoch": 0.9385045422781272, - "grad_norm": 7.386483669281006, - "learning_rate": 5.745758749545749e-08, - "loss": 0.1227, - "step": 1343 - }, - { - "epoch": 0.939203354297694, - "grad_norm": 8.658585548400879, - "learning_rate": 5.616403678967625e-08, - "loss": 0.1067, - "step": 1344 - }, - { - "epoch": 0.9399021663172606, - "grad_norm": 32.88015365600586, - "learning_rate": 5.4885047864634275e-08, - "loss": 0.4002, - "step": 1345 - }, - { - "epoch": 0.9406009783368274, - "grad_norm": 43.813045501708984, - "learning_rate": 5.3620628341283234e-08, - "loss": 1.009, - "step": 1346 - }, - { - "epoch": 0.9412997903563941, - "grad_norm": 31.651718139648438, - "learning_rate": 5.2370785753763364e-08, - "loss": 0.699, - "step": 1347 - }, - { - "epoch": 0.9419986023759609, - "grad_norm": 3.451824903488159, - "learning_rate": 5.113552754935414e-08, - "loss": 0.0261, - "step": 1348 - }, - { - "epoch": 0.9426974143955276, - "grad_norm": 38.350582122802734, - "learning_rate": 4.9914861088435904e-08, - "loss": 0.9024, - "step": 1349 - }, - { - "epoch": 0.9433962264150944, - "grad_norm": 23.009504318237305, - "learning_rate": 4.870879364444109e-08, - "loss": 0.3106, - "step": 1350 - }, - { - "epoch": 0.944095038434661, - "grad_norm": 12.33083724975586, - "learning_rate": 4.75173324038139e-08, - "loss": 0.1936, - "step": 1351 - }, - { - "epoch": 0.9447938504542278, - "grad_norm": 18.856685638427734, - "learning_rate": 4.6340484465965396e-08, - "loss": 0.433, - "step": 1352 - }, - { - "epoch": 0.9454926624737946, - "grad_norm": 8.254105567932129, - "learning_rate": 4.5178256843233235e-08, - "loss": 0.0884, - "step": 1353 - }, - { - "epoch": 0.9461914744933613, - "grad_norm": 10.499476432800293, - "learning_rate": 4.40306564608381e-08, - "loss": 0.1794, - "step": 1354 - }, - { - "epoch": 0.9468902865129281, - "grad_norm": 6.47723388671875, - "learning_rate": 4.2897690156843144e-08, - "loss": 0.0639, - "step": 1355 - }, - { - "epoch": 0.9475890985324947, - "grad_norm": 24.49585723876953, - "learning_rate": 4.1779364682113796e-08, - "loss": 0.6026, - "step": 1356 - }, - { - "epoch": 0.9482879105520615, - "grad_norm": 13.254868507385254, - "learning_rate": 4.067568670027638e-08, - "loss": 0.2008, - "step": 1357 - }, - { - "epoch": 0.9489867225716282, - "grad_norm": 3.554204225540161, - "learning_rate": 3.958666278767953e-08, - "loss": 0.0433, - "step": 1358 - }, - { - "epoch": 0.949685534591195, - "grad_norm": 11.670513153076172, - "learning_rate": 3.851229943335394e-08, - "loss": 0.2552, - "step": 1359 - }, - { - "epoch": 0.9503843466107617, - "grad_norm": 7.214932441711426, - "learning_rate": 3.745260303897491e-08, - "loss": 0.0665, - "step": 1360 - }, - { - "epoch": 0.9510831586303284, - "grad_norm": 12.584773063659668, - "learning_rate": 3.640757991882349e-08, - "loss": 0.1777, - "step": 1361 - }, - { - "epoch": 0.9517819706498952, - "grad_norm": 0.3003522753715515, - "learning_rate": 3.5377236299748154e-08, - "loss": 0.0092, - "step": 1362 - }, - { - "epoch": 0.9524807826694619, - "grad_norm": 15.687874794006348, - "learning_rate": 3.43615783211293e-08, - "loss": 0.2015, - "step": 1363 - }, - { - "epoch": 0.9531795946890287, - "grad_norm": 6.1313629150390625, - "learning_rate": 3.3360612034841475e-08, - "loss": 0.0635, - "step": 1364 - }, - { - "epoch": 0.9538784067085954, - "grad_norm": 11.138212203979492, - "learning_rate": 3.237434340521789e-08, - "loss": 0.1005, - "step": 1365 - }, - { - "epoch": 0.9545772187281621, - "grad_norm": 14.754155158996582, - "learning_rate": 3.1402778309014284e-08, - "loss": 0.2354, - "step": 1366 - }, - { - "epoch": 0.9552760307477288, - "grad_norm": 14.251261711120605, - "learning_rate": 3.0445922535374263e-08, - "loss": 0.2881, - "step": 1367 - }, - { - "epoch": 0.9559748427672956, - "grad_norm": 16.188068389892578, - "learning_rate": 2.9503781785795715e-08, - "loss": 0.2261, - "step": 1368 - }, - { - "epoch": 0.9566736547868623, - "grad_norm": 9.298933982849121, - "learning_rate": 2.857636167409472e-08, - "loss": 0.1479, - "step": 1369 - }, - { - "epoch": 0.9573724668064291, - "grad_norm": 32.74161911010742, - "learning_rate": 2.766366772637391e-08, - "loss": 0.7875, - "step": 1370 - }, - { - "epoch": 0.9580712788259959, - "grad_norm": 27.600805282592773, - "learning_rate": 2.676570538098944e-08, - "loss": 0.633, - "step": 1371 - }, - { - "epoch": 0.9587700908455625, - "grad_norm": 12.529052734375, - "learning_rate": 2.5882479988517394e-08, - "loss": 0.1636, - "step": 1372 - }, - { - "epoch": 0.9594689028651293, - "grad_norm": 30.737586975097656, - "learning_rate": 2.5013996811722175e-08, - "loss": 0.6475, - "step": 1373 - }, - { - "epoch": 0.960167714884696, - "grad_norm": 9.5076322555542, - "learning_rate": 2.416026102552732e-08, - "loss": 0.1587, - "step": 1374 - }, - { - "epoch": 0.9608665269042628, - "grad_norm": 19.371639251708984, - "learning_rate": 2.332127771698084e-08, - "loss": 0.1789, - "step": 1375 - }, - { - "epoch": 0.9615653389238294, - "grad_norm": 16.10708999633789, - "learning_rate": 2.2497051885228825e-08, - "loss": 0.1858, - "step": 1376 - }, - { - "epoch": 0.9622641509433962, - "grad_norm": 12.367995262145996, - "learning_rate": 2.168758844148272e-08, - "loss": 0.1865, - "step": 1377 - }, - { - "epoch": 0.9629629629629629, - "grad_norm": 14.510953903198242, - "learning_rate": 2.089289220899099e-08, - "loss": 0.1663, - "step": 1378 - }, - { - "epoch": 0.9636617749825297, - "grad_norm": 27.010644912719727, - "learning_rate": 2.011296792301165e-08, - "loss": 0.4795, - "step": 1379 - }, - { - "epoch": 0.9643605870020965, - "grad_norm": 0.7491756677627563, - "learning_rate": 1.93478202307823e-08, - "loss": 0.0144, - "step": 1380 - }, - { - "epoch": 0.9650593990216632, - "grad_norm": 15.297125816345215, - "learning_rate": 1.8597453691492628e-08, - "loss": 0.242, - "step": 1381 - }, - { - "epoch": 0.9657582110412299, - "grad_norm": 15.847579002380371, - "learning_rate": 1.7861872776258617e-08, - "loss": 0.2409, - "step": 1382 - }, - { - "epoch": 0.9664570230607966, - "grad_norm": 17.18081283569336, - "learning_rate": 1.714108186809421e-08, - "loss": 0.3505, - "step": 1383 - }, - { - "epoch": 0.9671558350803634, - "grad_norm": 0.34237828850746155, - "learning_rate": 1.643508526188692e-08, - "loss": 0.0079, - "step": 1384 - }, - { - "epoch": 0.9678546470999301, - "grad_norm": 9.91698169708252, - "learning_rate": 1.574388716437003e-08, - "loss": 0.1179, - "step": 1385 - }, - { - "epoch": 0.9685534591194969, - "grad_norm": 9.358024597167969, - "learning_rate": 1.5067491694100156e-08, - "loss": 0.1403, - "step": 1386 - }, - { - "epoch": 0.9692522711390635, - "grad_norm": 9.932446479797363, - "learning_rate": 1.4405902881430289e-08, - "loss": 0.1974, - "step": 1387 - }, - { - "epoch": 0.9699510831586303, - "grad_norm": 1.0891270637512207, - "learning_rate": 1.3759124668487057e-08, - "loss": 0.0219, - "step": 1388 - }, - { - "epoch": 0.9706498951781971, - "grad_norm": 15.910924911499023, - "learning_rate": 1.3127160909147674e-08, - "loss": 0.2585, - "step": 1389 - }, - { - "epoch": 0.9713487071977638, - "grad_norm": 7.774021148681641, - "learning_rate": 1.2510015369015527e-08, - "loss": 0.0774, - "step": 1390 - }, - { - "epoch": 0.9720475192173306, - "grad_norm": 37.42231369018555, - "learning_rate": 1.1907691725398795e-08, - "loss": 0.6032, - "step": 1391 - }, - { - "epoch": 0.9727463312368972, - "grad_norm": 25.671794891357422, - "learning_rate": 1.132019356728853e-08, - "loss": 0.3234, - "step": 1392 - }, - { - "epoch": 0.973445143256464, - "grad_norm": 0.3582533299922943, - "learning_rate": 1.0747524395336439e-08, - "loss": 0.0073, - "step": 1393 - }, - { - "epoch": 0.9741439552760307, - "grad_norm": 12.987789154052734, - "learning_rate": 1.0189687621835198e-08, - "loss": 0.2237, - "step": 1394 - }, - { - "epoch": 0.9748427672955975, - "grad_norm": 8.105573654174805, - "learning_rate": 9.646686570697062e-09, - "loss": 0.1102, - "step": 1395 - }, - { - "epoch": 0.9755415793151643, - "grad_norm": 28.24883460998535, - "learning_rate": 9.118524477434999e-09, - "loss": 0.4962, - "step": 1396 - }, - { - "epoch": 0.976240391334731, - "grad_norm": 27.196672439575195, - "learning_rate": 8.605204489142426e-09, - "loss": 0.4905, - "step": 1397 - }, - { - "epoch": 0.9769392033542977, - "grad_norm": 30.801727294921875, - "learning_rate": 8.106729664475178e-09, - "loss": 0.7748, - "step": 1398 - }, - { - "epoch": 0.9776380153738644, - "grad_norm": 6.034183502197266, - "learning_rate": 7.62310297363289e-09, - "loss": 0.0622, - "step": 1399 - }, - { - "epoch": 0.9783368273934312, - "grad_norm": 30.760272979736328, - "learning_rate": 7.154327298342089e-09, - "loss": 0.6448, - "step": 1400 - }, - { - "epoch": 0.9790356394129979, - "grad_norm": 15.908082008361816, - "learning_rate": 6.700405431837587e-09, - "loss": 0.1951, - "step": 1401 - }, - { - "epoch": 0.9797344514325647, - "grad_norm": 24.25754737854004, - "learning_rate": 6.2613400788472115e-09, - "loss": 0.4904, - "step": 1402 - }, - { - "epoch": 0.9804332634521313, - "grad_norm": 5.621443271636963, - "learning_rate": 5.837133855574884e-09, - "loss": 0.1, - "step": 1403 - }, - { - "epoch": 0.9811320754716981, - "grad_norm": 6.193020343780518, - "learning_rate": 5.427789289685348e-09, - "loss": 0.1037, - "step": 1404 - }, - { - "epoch": 0.9818308874912649, - "grad_norm": 11.584317207336426, - "learning_rate": 5.033308820289185e-09, - "loss": 0.2061, - "step": 1405 - }, - { - "epoch": 0.9825296995108316, - "grad_norm": 0.7964586615562439, - "learning_rate": 4.653694797927544e-09, - "loss": 0.0124, - "step": 1406 - }, - { - "epoch": 0.9832285115303984, - "grad_norm": 37.44985580444336, - "learning_rate": 4.288949484559934e-09, - "loss": 0.9075, - "step": 1407 - }, - { - "epoch": 0.983927323549965, - "grad_norm": 15.352105140686035, - "learning_rate": 3.939075053548125e-09, - "loss": 0.2573, - "step": 1408 - }, - { - "epoch": 0.9846261355695318, - "grad_norm": 11.130683898925781, - "learning_rate": 3.6040735896455957e-09, - "loss": 0.1399, - "step": 1409 - }, - { - "epoch": 0.9853249475890985, - "grad_norm": 21.828716278076172, - "learning_rate": 3.283947088983663e-09, - "loss": 0.4917, - "step": 1410 - }, - { - "epoch": 0.9860237596086653, - "grad_norm": 14.967291831970215, - "learning_rate": 2.978697459060098e-09, - "loss": 0.3058, - "step": 1411 - }, - { - "epoch": 0.986722571628232, - "grad_norm": 0.42502361536026, - "learning_rate": 2.6883265187283014e-09, - "loss": 0.0065, - "step": 1412 - }, - { - "epoch": 0.9874213836477987, - "grad_norm": 7.10825252532959, - "learning_rate": 2.412835998185092e-09, - "loss": 0.0535, - "step": 1413 - }, - { - "epoch": 0.9881201956673655, - "grad_norm": 24.652862548828125, - "learning_rate": 2.1522275389615487e-09, - "loss": 0.3725, - "step": 1414 - }, - { - "epoch": 0.9888190076869322, - "grad_norm": 27.862064361572266, - "learning_rate": 1.9065026939127374e-09, - "loss": 0.4904, - "step": 1415 - }, - { - "epoch": 0.989517819706499, - "grad_norm": 31.075525283813477, - "learning_rate": 1.6756629272085545e-09, - "loss": 0.6628, - "step": 1416 - }, - { - "epoch": 0.9902166317260657, - "grad_norm": 14.951384544372559, - "learning_rate": 1.4597096143253996e-09, - "loss": 0.2783, - "step": 1417 - }, - { - "epoch": 0.9909154437456325, - "grad_norm": 39.61639404296875, - "learning_rate": 1.2586440420372936e-09, - "loss": 0.8025, - "step": 1418 - }, - { - "epoch": 0.9916142557651991, - "grad_norm": 18.797163009643555, - "learning_rate": 1.0724674084083841e-09, - "loss": 0.3223, - "step": 1419 - }, - { - "epoch": 0.9923130677847659, - "grad_norm": 16.350080490112305, - "learning_rate": 9.011808227865626e-10, - "loss": 0.3212, - "step": 1420 - }, - { - "epoch": 0.9930118798043326, - "grad_norm": 21.701143264770508, - "learning_rate": 7.447853057954146e-10, - "loss": 0.3172, - "step": 1421 - }, - { - "epoch": 0.9937106918238994, - "grad_norm": 29.53713607788086, - "learning_rate": 6.032817893297793e-10, - "loss": 0.5336, - "step": 1422 - }, - { - "epoch": 0.9944095038434662, - "grad_norm": 0.3307841420173645, - "learning_rate": 4.766711165488103e-10, - "loss": 0.009, - "step": 1423 - }, - { - "epoch": 0.9951083158630328, - "grad_norm": 26.6708984375, - "learning_rate": 3.6495404187181224e-10, - "loss": 0.4706, - "step": 1424 - }, - { - "epoch": 0.9958071278825996, - "grad_norm": 7.891889572143555, - "learning_rate": 2.681312309735229e-10, - "loss": 0.1358, - "step": 1425 - }, - { - "epoch": 0.9965059399021663, - "grad_norm": 0.35765719413757324, - "learning_rate": 1.8620326077967155e-10, - "loss": 0.0056, - "step": 1426 - }, - { - "epoch": 0.9972047519217331, - "grad_norm": 38.667030334472656, - "learning_rate": 1.191706194644815e-10, - "loss": 0.8021, - "step": 1427 - }, - { - "epoch": 0.9979035639412998, - "grad_norm": 12.724098205566406, - "learning_rate": 6.703370644706165e-11, - "loss": 0.3228, - "step": 1428 - }, - { - "epoch": 0.9986023759608665, - "grad_norm": 4.352276802062988, - "learning_rate": 2.979283238863095e-11, - "loss": 0.0337, - "step": 1429 - }, - { - "epoch": 0.9993011879804332, - "grad_norm": 16.719449996948242, - "learning_rate": 7.448219192240923e-12, - "loss": 0.3472, - "step": 1430 - }, - { - "epoch": 1.0, - "grad_norm": 15.640430450439453, - "learning_rate": 0.0, - "loss": 0.2292, - "step": 1431 - } - ], - "logging_steps": 1, - "max_steps": 1431, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 500, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": true - }, - "attributes": {} - } - }, - "total_flos": 1.4002910351553331e+17, - "train_batch_size": 4, - "trial_name": null, - "trial_params": null -}