{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.060952845355012227, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.0476422677506115e-05, "grad_norm": 3.320129632949829, "learning_rate": 4.9975e-05, "loss": 6.4044, "step": 1 }, { "epoch": 6.095284535501223e-05, "grad_norm": 3.4497978687286377, "learning_rate": 4.995e-05, "loss": 6.3701, "step": 2 }, { "epoch": 9.142926803251835e-05, "grad_norm": 3.660489797592163, "learning_rate": 4.992500000000001e-05, "loss": 6.2111, "step": 3 }, { "epoch": 0.00012190569071002446, "grad_norm": 3.9403293132781982, "learning_rate": 4.99e-05, "loss": 6.0074, "step": 4 }, { "epoch": 0.00015238211338753057, "grad_norm": 4.429495811462402, "learning_rate": 4.9875000000000006e-05, "loss": 5.897, "step": 5 }, { "epoch": 0.0001828585360650367, "grad_norm": 4.492753505706787, "learning_rate": 4.9850000000000006e-05, "loss": 5.4463, "step": 6 }, { "epoch": 0.0002133349587425428, "grad_norm": 5.5487141609191895, "learning_rate": 4.9825000000000005e-05, "loss": 5.7327, "step": 7 }, { "epoch": 0.00024381138142004892, "grad_norm": 5.252483367919922, "learning_rate": 4.9800000000000004e-05, "loss": 5.0707, "step": 8 }, { "epoch": 0.00027428780409755505, "grad_norm": 5.801187515258789, "learning_rate": 4.9775000000000004e-05, "loss": 4.7944, "step": 9 }, { "epoch": 0.00030476422677506114, "grad_norm": 6.401638984680176, "learning_rate": 4.975e-05, "loss": 4.5727, "step": 10 }, { "epoch": 0.00033524064945256724, "grad_norm": 6.714531421661377, "learning_rate": 4.9725e-05, "loss": 4.3186, "step": 11 }, { "epoch": 0.0003657170721300734, "grad_norm": 7.066725730895996, "learning_rate": 4.97e-05, "loss": 3.8646, "step": 12 }, { "epoch": 0.0003961934948075795, "grad_norm": 7.418155193328857, "learning_rate": 4.967500000000001e-05, "loss": 3.7329, "step": 13 }, { "epoch": 0.0004266699174850856, "grad_norm": 8.726635932922363, "learning_rate": 4.965e-05, "loss": 3.5928, "step": 14 }, { "epoch": 0.0004571463401625917, "grad_norm": 8.25580883026123, "learning_rate": 4.962500000000001e-05, "loss": 3.089, "step": 15 }, { "epoch": 0.00048762276284009784, "grad_norm": 9.004563331604004, "learning_rate": 4.96e-05, "loss": 2.7404, "step": 16 }, { "epoch": 0.0005180991855176039, "grad_norm": 8.678912162780762, "learning_rate": 4.9575000000000006e-05, "loss": 2.1629, "step": 17 }, { "epoch": 0.0005485756081951101, "grad_norm": 8.780608177185059, "learning_rate": 4.9550000000000005e-05, "loss": 1.9109, "step": 18 }, { "epoch": 0.0005790520308726161, "grad_norm": 8.066640853881836, "learning_rate": 4.9525000000000004e-05, "loss": 1.5925, "step": 19 }, { "epoch": 0.0006095284535501223, "grad_norm": 6.5066938400268555, "learning_rate": 4.9500000000000004e-05, "loss": 1.2806, "step": 20 }, { "epoch": 0.0006400048762276284, "grad_norm": 4.906357765197754, "learning_rate": 4.9475e-05, "loss": 1.023, "step": 21 }, { "epoch": 0.0006704812989051345, "grad_norm": 3.3437726497650146, "learning_rate": 4.945e-05, "loss": 0.8827, "step": 22 }, { "epoch": 0.0007009577215826406, "grad_norm": 2.7981982231140137, "learning_rate": 4.9425e-05, "loss": 0.7093, "step": 23 }, { "epoch": 0.0007314341442601468, "grad_norm": 1.6184637546539307, "learning_rate": 4.94e-05, "loss": 0.6417, "step": 24 }, { "epoch": 0.0007619105669376528, "grad_norm": 0.8903117775917053, "learning_rate": 4.937500000000001e-05, "loss": 0.548, "step": 25 }, { "epoch": 0.000792386989615159, "grad_norm": 0.44086983799934387, "learning_rate": 4.935e-05, "loss": 0.5151, "step": 26 }, { "epoch": 0.0008228634122926651, "grad_norm": 0.2645550072193146, "learning_rate": 4.9325000000000006e-05, "loss": 0.4919, "step": 27 }, { "epoch": 0.0008533398349701712, "grad_norm": 0.19282805919647217, "learning_rate": 4.93e-05, "loss": 0.5655, "step": 28 }, { "epoch": 0.0008838162576476773, "grad_norm": 0.18571054935455322, "learning_rate": 4.9275000000000005e-05, "loss": 0.5178, "step": 29 }, { "epoch": 0.0009142926803251834, "grad_norm": 0.2436906099319458, "learning_rate": 4.9250000000000004e-05, "loss": 0.6077, "step": 30 }, { "epoch": 0.0009447691030026895, "grad_norm": 0.21767941117286682, "learning_rate": 4.9225000000000004e-05, "loss": 0.6189, "step": 31 }, { "epoch": 0.0009752455256801957, "grad_norm": 0.2010110318660736, "learning_rate": 4.92e-05, "loss": 0.4847, "step": 32 }, { "epoch": 0.0010057219483577018, "grad_norm": 0.28035563230514526, "learning_rate": 4.9175e-05, "loss": 0.6326, "step": 33 }, { "epoch": 0.0010361983710352079, "grad_norm": 0.2148405760526657, "learning_rate": 4.915e-05, "loss": 0.5343, "step": 34 }, { "epoch": 0.001066674793712714, "grad_norm": 0.19581177830696106, "learning_rate": 4.9125e-05, "loss": 0.5139, "step": 35 }, { "epoch": 0.0010971512163902202, "grad_norm": 0.22044502198696136, "learning_rate": 4.91e-05, "loss": 0.5301, "step": 36 }, { "epoch": 0.0011276276390677262, "grad_norm": 0.18560416996479034, "learning_rate": 4.907500000000001e-05, "loss": 0.4715, "step": 37 }, { "epoch": 0.0011581040617452323, "grad_norm": 0.19371341168880463, "learning_rate": 4.905e-05, "loss": 0.5614, "step": 38 }, { "epoch": 0.0011885804844227385, "grad_norm": 0.18388929963111877, "learning_rate": 4.9025000000000006e-05, "loss": 0.5655, "step": 39 }, { "epoch": 0.0012190569071002446, "grad_norm": 0.1954178363084793, "learning_rate": 4.9e-05, "loss": 0.608, "step": 40 }, { "epoch": 0.0012495333297777506, "grad_norm": 0.17106999456882477, "learning_rate": 4.8975000000000005e-05, "loss": 0.4374, "step": 41 }, { "epoch": 0.0012800097524552569, "grad_norm": 0.16132968664169312, "learning_rate": 4.8950000000000004e-05, "loss": 0.5056, "step": 42 }, { "epoch": 0.001310486175132763, "grad_norm": 0.153102308511734, "learning_rate": 4.8925e-05, "loss": 0.4782, "step": 43 }, { "epoch": 0.001340962597810269, "grad_norm": 0.14665229618549347, "learning_rate": 4.89e-05, "loss": 0.5014, "step": 44 }, { "epoch": 0.0013714390204877752, "grad_norm": 0.13884370028972626, "learning_rate": 4.8875e-05, "loss": 0.5254, "step": 45 }, { "epoch": 0.0014019154431652813, "grad_norm": 0.16250255703926086, "learning_rate": 4.885e-05, "loss": 0.5569, "step": 46 }, { "epoch": 0.0014323918658427873, "grad_norm": 0.13465836644172668, "learning_rate": 4.8825e-05, "loss": 0.6255, "step": 47 }, { "epoch": 0.0014628682885202936, "grad_norm": 0.1271282136440277, "learning_rate": 4.88e-05, "loss": 0.4848, "step": 48 }, { "epoch": 0.0014933447111977996, "grad_norm": 0.10999290645122528, "learning_rate": 4.8775000000000007e-05, "loss": 0.5012, "step": 49 }, { "epoch": 0.0015238211338753057, "grad_norm": 0.11422800272703171, "learning_rate": 4.875e-05, "loss": 0.5386, "step": 50 }, { "epoch": 0.001554297556552812, "grad_norm": 0.11624183505773544, "learning_rate": 4.8725000000000005e-05, "loss": 0.5367, "step": 51 }, { "epoch": 0.001584773979230318, "grad_norm": 0.11618327349424362, "learning_rate": 4.87e-05, "loss": 0.5228, "step": 52 }, { "epoch": 0.001615250401907824, "grad_norm": 0.1149577870965004, "learning_rate": 4.8675000000000004e-05, "loss": 0.5569, "step": 53 }, { "epoch": 0.0016457268245853303, "grad_norm": 0.10163170844316483, "learning_rate": 4.8650000000000003e-05, "loss": 0.4849, "step": 54 }, { "epoch": 0.0016762032472628363, "grad_norm": 0.11256093531847, "learning_rate": 4.8625e-05, "loss": 0.5659, "step": 55 }, { "epoch": 0.0017066796699403424, "grad_norm": 0.11522180587053299, "learning_rate": 4.86e-05, "loss": 0.5805, "step": 56 }, { "epoch": 0.0017371560926178486, "grad_norm": 0.0993754044175148, "learning_rate": 4.8575e-05, "loss": 0.5249, "step": 57 }, { "epoch": 0.0017676325152953547, "grad_norm": 0.0922098234295845, "learning_rate": 4.855e-05, "loss": 0.4332, "step": 58 }, { "epoch": 0.0017981089379728607, "grad_norm": 0.10734675079584122, "learning_rate": 4.8525e-05, "loss": 0.4886, "step": 59 }, { "epoch": 0.0018285853606503668, "grad_norm": 0.10106879472732544, "learning_rate": 4.85e-05, "loss": 0.5352, "step": 60 }, { "epoch": 0.001859061783327873, "grad_norm": 0.09412974864244461, "learning_rate": 4.8475000000000006e-05, "loss": 0.4769, "step": 61 }, { "epoch": 0.001889538206005379, "grad_norm": 0.10016250610351562, "learning_rate": 4.845e-05, "loss": 0.5517, "step": 62 }, { "epoch": 0.001920014628682885, "grad_norm": 0.10829256474971771, "learning_rate": 4.8425000000000005e-05, "loss": 0.4812, "step": 63 }, { "epoch": 0.0019504910513603914, "grad_norm": 0.08421169966459274, "learning_rate": 4.8400000000000004e-05, "loss": 0.507, "step": 64 }, { "epoch": 0.0019809674740378976, "grad_norm": 0.0865599513053894, "learning_rate": 4.8375000000000004e-05, "loss": 0.5111, "step": 65 }, { "epoch": 0.0020114438967154037, "grad_norm": 0.08734598010778427, "learning_rate": 4.835e-05, "loss": 0.5088, "step": 66 }, { "epoch": 0.0020419203193929097, "grad_norm": 0.08866006135940552, "learning_rate": 4.8325e-05, "loss": 0.5739, "step": 67 }, { "epoch": 0.0020723967420704158, "grad_norm": 0.08538202941417694, "learning_rate": 4.83e-05, "loss": 0.6081, "step": 68 }, { "epoch": 0.002102873164747922, "grad_norm": 0.0833793357014656, "learning_rate": 4.8275e-05, "loss": 0.4843, "step": 69 }, { "epoch": 0.002133349587425428, "grad_norm": 0.07972536981105804, "learning_rate": 4.825e-05, "loss": 0.5153, "step": 70 }, { "epoch": 0.0021638260101029343, "grad_norm": 0.07567499577999115, "learning_rate": 4.822500000000001e-05, "loss": 0.4452, "step": 71 }, { "epoch": 0.0021943024327804404, "grad_norm": 0.07216943800449371, "learning_rate": 4.82e-05, "loss": 0.4234, "step": 72 }, { "epoch": 0.0022247788554579464, "grad_norm": 0.07606346160173416, "learning_rate": 4.8175000000000005e-05, "loss": 0.5006, "step": 73 }, { "epoch": 0.0022552552781354524, "grad_norm": 0.0838082805275917, "learning_rate": 4.815e-05, "loss": 0.5035, "step": 74 }, { "epoch": 0.0022857317008129585, "grad_norm": 0.0769057497382164, "learning_rate": 4.8125000000000004e-05, "loss": 0.4647, "step": 75 }, { "epoch": 0.0023162081234904645, "grad_norm": 0.07561961561441422, "learning_rate": 4.8100000000000004e-05, "loss": 0.4516, "step": 76 }, { "epoch": 0.002346684546167971, "grad_norm": 0.08561524003744125, "learning_rate": 4.8075e-05, "loss": 0.6115, "step": 77 }, { "epoch": 0.002377160968845477, "grad_norm": 0.07826441526412964, "learning_rate": 4.805e-05, "loss": 0.4395, "step": 78 }, { "epoch": 0.002407637391522983, "grad_norm": 0.07693878561258316, "learning_rate": 4.8025e-05, "loss": 0.4794, "step": 79 }, { "epoch": 0.002438113814200489, "grad_norm": 0.07945053279399872, "learning_rate": 4.8e-05, "loss": 0.5, "step": 80 }, { "epoch": 0.002468590236877995, "grad_norm": 0.07221172749996185, "learning_rate": 4.7975e-05, "loss": 0.4417, "step": 81 }, { "epoch": 0.0024990666595555012, "grad_norm": 0.08559772372245789, "learning_rate": 4.795e-05, "loss": 0.5892, "step": 82 }, { "epoch": 0.0025295430822330073, "grad_norm": 0.07304175943136215, "learning_rate": 4.7925000000000006e-05, "loss": 0.4383, "step": 83 }, { "epoch": 0.0025600195049105138, "grad_norm": 0.0767160952091217, "learning_rate": 4.79e-05, "loss": 0.4956, "step": 84 }, { "epoch": 0.00259049592758802, "grad_norm": 0.07742952555418015, "learning_rate": 4.7875000000000005e-05, "loss": 0.4888, "step": 85 }, { "epoch": 0.002620972350265526, "grad_norm": 0.08384500443935394, "learning_rate": 4.785e-05, "loss": 0.6217, "step": 86 }, { "epoch": 0.002651448772943032, "grad_norm": 0.07919060438871384, "learning_rate": 4.7825000000000004e-05, "loss": 0.4963, "step": 87 }, { "epoch": 0.002681925195620538, "grad_norm": 0.07416897267103195, "learning_rate": 4.78e-05, "loss": 0.4757, "step": 88 }, { "epoch": 0.002712401618298044, "grad_norm": 0.07384693622589111, "learning_rate": 4.7775e-05, "loss": 0.4785, "step": 89 }, { "epoch": 0.0027428780409755505, "grad_norm": 0.07896066457033157, "learning_rate": 4.775e-05, "loss": 0.4799, "step": 90 }, { "epoch": 0.0027733544636530565, "grad_norm": 0.07913591712713242, "learning_rate": 4.7725e-05, "loss": 0.6084, "step": 91 }, { "epoch": 0.0028038308863305625, "grad_norm": 0.08058233559131622, "learning_rate": 4.77e-05, "loss": 0.5222, "step": 92 }, { "epoch": 0.0028343073090080686, "grad_norm": 0.07450579106807709, "learning_rate": 4.7675e-05, "loss": 0.476, "step": 93 }, { "epoch": 0.0028647837316855746, "grad_norm": 0.07476439327001572, "learning_rate": 4.765e-05, "loss": 0.4765, "step": 94 }, { "epoch": 0.0028952601543630807, "grad_norm": 0.06974098086357117, "learning_rate": 4.7625000000000006e-05, "loss": 0.4218, "step": 95 }, { "epoch": 0.002925736577040587, "grad_norm": 0.07148098945617676, "learning_rate": 4.76e-05, "loss": 0.4566, "step": 96 }, { "epoch": 0.002956212999718093, "grad_norm": 0.06729903817176819, "learning_rate": 4.7575000000000004e-05, "loss": 0.396, "step": 97 }, { "epoch": 0.0029866894223955992, "grad_norm": 0.07787555456161499, "learning_rate": 4.755e-05, "loss": 0.5719, "step": 98 }, { "epoch": 0.0030171658450731053, "grad_norm": 0.06740543991327286, "learning_rate": 4.7525e-05, "loss": 0.4183, "step": 99 }, { "epoch": 0.0030476422677506113, "grad_norm": 0.08390388637781143, "learning_rate": 4.75e-05, "loss": 0.5187, "step": 100 }, { "epoch": 0.0030781186904281174, "grad_norm": 0.0819668099284172, "learning_rate": 4.7475e-05, "loss": 0.5681, "step": 101 }, { "epoch": 0.003108595113105624, "grad_norm": 0.07854864746332169, "learning_rate": 4.745e-05, "loss": 0.4852, "step": 102 }, { "epoch": 0.00313907153578313, "grad_norm": 0.07695122808218002, "learning_rate": 4.7425e-05, "loss": 0.448, "step": 103 }, { "epoch": 0.003169547958460636, "grad_norm": 0.08394775539636612, "learning_rate": 4.74e-05, "loss": 0.5242, "step": 104 }, { "epoch": 0.003200024381138142, "grad_norm": 0.08301324397325516, "learning_rate": 4.7375e-05, "loss": 0.5766, "step": 105 }, { "epoch": 0.003230500803815648, "grad_norm": 0.07388174533843994, "learning_rate": 4.735e-05, "loss": 0.4475, "step": 106 }, { "epoch": 0.003260977226493154, "grad_norm": 0.08008576184511185, "learning_rate": 4.7325000000000005e-05, "loss": 0.5531, "step": 107 }, { "epoch": 0.0032914536491706605, "grad_norm": 0.0802341178059578, "learning_rate": 4.73e-05, "loss": 0.5474, "step": 108 }, { "epoch": 0.0033219300718481666, "grad_norm": 0.07738009840250015, "learning_rate": 4.7275000000000004e-05, "loss": 0.5144, "step": 109 }, { "epoch": 0.0033524064945256726, "grad_norm": 0.07348138093948364, "learning_rate": 4.7249999999999997e-05, "loss": 0.5114, "step": 110 }, { "epoch": 0.0033828829172031787, "grad_norm": 0.07322855293750763, "learning_rate": 4.7225e-05, "loss": 0.4588, "step": 111 }, { "epoch": 0.0034133593398806847, "grad_norm": 0.07346679270267487, "learning_rate": 4.72e-05, "loss": 0.4489, "step": 112 }, { "epoch": 0.0034438357625581908, "grad_norm": 0.08024899661540985, "learning_rate": 4.7175e-05, "loss": 0.546, "step": 113 }, { "epoch": 0.0034743121852356972, "grad_norm": 0.08227278292179108, "learning_rate": 4.715e-05, "loss": 0.543, "step": 114 }, { "epoch": 0.0035047886079132033, "grad_norm": 0.0743311196565628, "learning_rate": 4.7125e-05, "loss": 0.4729, "step": 115 }, { "epoch": 0.0035352650305907093, "grad_norm": 0.08084755390882492, "learning_rate": 4.71e-05, "loss": 0.5497, "step": 116 }, { "epoch": 0.0035657414532682154, "grad_norm": 0.07875463366508484, "learning_rate": 4.7075e-05, "loss": 0.5547, "step": 117 }, { "epoch": 0.0035962178759457214, "grad_norm": 0.07655800879001617, "learning_rate": 4.705e-05, "loss": 0.4884, "step": 118 }, { "epoch": 0.0036266942986232275, "grad_norm": 0.07774712890386581, "learning_rate": 4.7025000000000005e-05, "loss": 0.4174, "step": 119 }, { "epoch": 0.0036571707213007335, "grad_norm": 0.07228706032037735, "learning_rate": 4.7e-05, "loss": 0.4656, "step": 120 }, { "epoch": 0.00368764714397824, "grad_norm": 0.06769265234470367, "learning_rate": 4.6975000000000003e-05, "loss": 0.4338, "step": 121 }, { "epoch": 0.003718123566655746, "grad_norm": 0.07867695391178131, "learning_rate": 4.695e-05, "loss": 0.4988, "step": 122 }, { "epoch": 0.003748599989333252, "grad_norm": 0.07736552506685257, "learning_rate": 4.6925e-05, "loss": 0.5358, "step": 123 }, { "epoch": 0.003779076412010758, "grad_norm": 0.0828608050942421, "learning_rate": 4.69e-05, "loss": 0.5636, "step": 124 }, { "epoch": 0.003809552834688264, "grad_norm": 0.07226227223873138, "learning_rate": 4.6875e-05, "loss": 0.5172, "step": 125 }, { "epoch": 0.00384002925736577, "grad_norm": 0.07456226646900177, "learning_rate": 4.685000000000001e-05, "loss": 0.4917, "step": 126 }, { "epoch": 0.0038705056800432767, "grad_norm": 0.07656902074813843, "learning_rate": 4.6825e-05, "loss": 0.5061, "step": 127 }, { "epoch": 0.0039009821027207827, "grad_norm": 0.07930399477481842, "learning_rate": 4.6800000000000006e-05, "loss": 0.5593, "step": 128 }, { "epoch": 0.003931458525398289, "grad_norm": 0.07578976452350616, "learning_rate": 4.6775000000000005e-05, "loss": 0.4713, "step": 129 }, { "epoch": 0.003961934948075795, "grad_norm": 0.08862394839525223, "learning_rate": 4.6750000000000005e-05, "loss": 0.5498, "step": 130 }, { "epoch": 0.003992411370753301, "grad_norm": 0.061442673206329346, "learning_rate": 4.6725000000000004e-05, "loss": 0.3975, "step": 131 }, { "epoch": 0.004022887793430807, "grad_norm": 0.07381784915924072, "learning_rate": 4.6700000000000003e-05, "loss": 0.4227, "step": 132 }, { "epoch": 0.004053364216108313, "grad_norm": 0.07735998928546906, "learning_rate": 4.6675e-05, "loss": 0.4878, "step": 133 }, { "epoch": 0.004083840638785819, "grad_norm": 0.07014645636081696, "learning_rate": 4.665e-05, "loss": 0.4592, "step": 134 }, { "epoch": 0.004114317061463325, "grad_norm": 0.07754282653331757, "learning_rate": 4.6625e-05, "loss": 0.5571, "step": 135 }, { "epoch": 0.0041447934841408315, "grad_norm": 0.07732149213552475, "learning_rate": 4.660000000000001e-05, "loss": 0.5258, "step": 136 }, { "epoch": 0.004175269906818338, "grad_norm": 0.07155423611402512, "learning_rate": 4.6575e-05, "loss": 0.4602, "step": 137 }, { "epoch": 0.004205746329495844, "grad_norm": 0.07008311152458191, "learning_rate": 4.655000000000001e-05, "loss": 0.4565, "step": 138 }, { "epoch": 0.00423622275217335, "grad_norm": 0.07598588615655899, "learning_rate": 4.6525e-05, "loss": 0.439, "step": 139 }, { "epoch": 0.004266699174850856, "grad_norm": 0.08095088601112366, "learning_rate": 4.6500000000000005e-05, "loss": 0.5184, "step": 140 }, { "epoch": 0.004297175597528362, "grad_norm": 0.07447018474340439, "learning_rate": 4.6475000000000005e-05, "loss": 0.4655, "step": 141 }, { "epoch": 0.004327652020205869, "grad_norm": 0.08011619001626968, "learning_rate": 4.6450000000000004e-05, "loss": 0.4703, "step": 142 }, { "epoch": 0.004358128442883374, "grad_norm": 0.07435956597328186, "learning_rate": 4.6425000000000004e-05, "loss": 0.4723, "step": 143 }, { "epoch": 0.004388604865560881, "grad_norm": 0.07260461151599884, "learning_rate": 4.64e-05, "loss": 0.4962, "step": 144 }, { "epoch": 0.004419081288238386, "grad_norm": 0.06951161473989487, "learning_rate": 4.6375e-05, "loss": 0.4169, "step": 145 }, { "epoch": 0.004449557710915893, "grad_norm": 0.09075510501861572, "learning_rate": 4.635e-05, "loss": 0.5909, "step": 146 }, { "epoch": 0.004480034133593398, "grad_norm": 0.07657283544540405, "learning_rate": 4.6325e-05, "loss": 0.4774, "step": 147 }, { "epoch": 0.004510510556270905, "grad_norm": 0.07151765376329422, "learning_rate": 4.630000000000001e-05, "loss": 0.4601, "step": 148 }, { "epoch": 0.004540986978948411, "grad_norm": 0.08092119544744492, "learning_rate": 4.6275e-05, "loss": 0.4484, "step": 149 }, { "epoch": 0.004571463401625917, "grad_norm": 0.07646743208169937, "learning_rate": 4.6250000000000006e-05, "loss": 0.4921, "step": 150 }, { "epoch": 0.0046019398243034235, "grad_norm": 0.0710703432559967, "learning_rate": 4.6225e-05, "loss": 0.4697, "step": 151 }, { "epoch": 0.004632416246980929, "grad_norm": 0.08577946573495865, "learning_rate": 4.6200000000000005e-05, "loss": 0.5943, "step": 152 }, { "epoch": 0.0046628926696584356, "grad_norm": 0.07209080457687378, "learning_rate": 4.6175000000000004e-05, "loss": 0.51, "step": 153 }, { "epoch": 0.004693369092335942, "grad_norm": 0.07447349280118942, "learning_rate": 4.6150000000000004e-05, "loss": 0.4733, "step": 154 }, { "epoch": 0.004723845515013448, "grad_norm": 0.07124325633049011, "learning_rate": 4.6125e-05, "loss": 0.426, "step": 155 }, { "epoch": 0.004754321937690954, "grad_norm": 0.07892026007175446, "learning_rate": 4.61e-05, "loss": 0.5156, "step": 156 }, { "epoch": 0.00478479836036846, "grad_norm": 0.07265159487724304, "learning_rate": 4.6075e-05, "loss": 0.5039, "step": 157 }, { "epoch": 0.004815274783045966, "grad_norm": 0.08285248279571533, "learning_rate": 4.605e-05, "loss": 0.634, "step": 158 }, { "epoch": 0.004845751205723472, "grad_norm": 0.0858485996723175, "learning_rate": 4.6025e-05, "loss": 0.5695, "step": 159 }, { "epoch": 0.004876227628400978, "grad_norm": 0.07008542865514755, "learning_rate": 4.600000000000001e-05, "loss": 0.4579, "step": 160 }, { "epoch": 0.004906704051078485, "grad_norm": 0.078867107629776, "learning_rate": 4.5975e-05, "loss": 0.5282, "step": 161 }, { "epoch": 0.00493718047375599, "grad_norm": 0.07972768694162369, "learning_rate": 4.5950000000000006e-05, "loss": 0.5581, "step": 162 }, { "epoch": 0.004967656896433497, "grad_norm": 0.08281397074460983, "learning_rate": 4.5925e-05, "loss": 0.5694, "step": 163 }, { "epoch": 0.0049981333191110025, "grad_norm": 0.07624434679746628, "learning_rate": 4.5900000000000004e-05, "loss": 0.4934, "step": 164 }, { "epoch": 0.005028609741788509, "grad_norm": 0.08031974732875824, "learning_rate": 4.5875000000000004e-05, "loss": 0.4571, "step": 165 }, { "epoch": 0.0050590861644660146, "grad_norm": 0.07153106480836868, "learning_rate": 4.585e-05, "loss": 0.4341, "step": 166 }, { "epoch": 0.005089562587143521, "grad_norm": 0.08010619878768921, "learning_rate": 4.5825e-05, "loss": 0.4946, "step": 167 }, { "epoch": 0.0051200390098210275, "grad_norm": 0.0832223892211914, "learning_rate": 4.58e-05, "loss": 0.5153, "step": 168 }, { "epoch": 0.005150515432498533, "grad_norm": 0.08256994187831879, "learning_rate": 4.5775e-05, "loss": 0.4561, "step": 169 }, { "epoch": 0.00518099185517604, "grad_norm": 0.08391042053699493, "learning_rate": 4.575e-05, "loss": 0.5744, "step": 170 }, { "epoch": 0.005211468277853545, "grad_norm": 0.07512183487415314, "learning_rate": 4.5725e-05, "loss": 0.4818, "step": 171 }, { "epoch": 0.005241944700531052, "grad_norm": 0.08564597368240356, "learning_rate": 4.5700000000000006e-05, "loss": 0.563, "step": 172 }, { "epoch": 0.005272421123208558, "grad_norm": 0.07766963541507721, "learning_rate": 4.5675e-05, "loss": 0.5249, "step": 173 }, { "epoch": 0.005302897545886064, "grad_norm": 0.07498954981565475, "learning_rate": 4.5650000000000005e-05, "loss": 0.4853, "step": 174 }, { "epoch": 0.00533337396856357, "grad_norm": 0.0733124241232872, "learning_rate": 4.5625e-05, "loss": 0.4352, "step": 175 }, { "epoch": 0.005363850391241076, "grad_norm": 0.07112406939268112, "learning_rate": 4.5600000000000004e-05, "loss": 0.4009, "step": 176 }, { "epoch": 0.005394326813918582, "grad_norm": 0.0815376490354538, "learning_rate": 4.5575e-05, "loss": 0.5653, "step": 177 }, { "epoch": 0.005424803236596088, "grad_norm": 0.08396834880113602, "learning_rate": 4.555e-05, "loss": 0.5032, "step": 178 }, { "epoch": 0.005455279659273594, "grad_norm": 0.07345952838659286, "learning_rate": 4.5525e-05, "loss": 0.4274, "step": 179 }, { "epoch": 0.005485756081951101, "grad_norm": 0.06867942959070206, "learning_rate": 4.55e-05, "loss": 0.438, "step": 180 }, { "epoch": 0.0055162325046286065, "grad_norm": 0.0842433050274849, "learning_rate": 4.5475e-05, "loss": 0.5374, "step": 181 }, { "epoch": 0.005546708927306113, "grad_norm": 0.07024747133255005, "learning_rate": 4.545000000000001e-05, "loss": 0.4512, "step": 182 }, { "epoch": 0.005577185349983619, "grad_norm": 0.07151992619037628, "learning_rate": 4.5425e-05, "loss": 0.4949, "step": 183 }, { "epoch": 0.005607661772661125, "grad_norm": 0.07465283572673798, "learning_rate": 4.5400000000000006e-05, "loss": 0.4864, "step": 184 }, { "epoch": 0.0056381381953386316, "grad_norm": 0.08061084896326065, "learning_rate": 4.5375e-05, "loss": 0.4922, "step": 185 }, { "epoch": 0.005668614618016137, "grad_norm": 0.07567588239908218, "learning_rate": 4.5350000000000005e-05, "loss": 0.4245, "step": 186 }, { "epoch": 0.005699091040693644, "grad_norm": 0.07391155511140823, "learning_rate": 4.5325000000000004e-05, "loss": 0.4927, "step": 187 }, { "epoch": 0.005729567463371149, "grad_norm": 0.08357247710227966, "learning_rate": 4.53e-05, "loss": 0.5125, "step": 188 }, { "epoch": 0.005760043886048656, "grad_norm": 0.07513033598661423, "learning_rate": 4.5275e-05, "loss": 0.4754, "step": 189 }, { "epoch": 0.005790520308726161, "grad_norm": 0.07378851622343063, "learning_rate": 4.525e-05, "loss": 0.4472, "step": 190 }, { "epoch": 0.005820996731403668, "grad_norm": 0.06821310520172119, "learning_rate": 4.5225e-05, "loss": 0.4188, "step": 191 }, { "epoch": 0.005851473154081174, "grad_norm": 0.07594940811395645, "learning_rate": 4.52e-05, "loss": 0.4764, "step": 192 }, { "epoch": 0.00588194957675868, "grad_norm": 0.06853988766670227, "learning_rate": 4.5175e-05, "loss": 0.4233, "step": 193 }, { "epoch": 0.005912425999436186, "grad_norm": 0.08140639960765839, "learning_rate": 4.5150000000000006e-05, "loss": 0.5761, "step": 194 }, { "epoch": 0.005942902422113692, "grad_norm": 0.07199399173259735, "learning_rate": 4.5125e-05, "loss": 0.4233, "step": 195 }, { "epoch": 0.0059733788447911985, "grad_norm": 0.07677506655454636, "learning_rate": 4.5100000000000005e-05, "loss": 0.5331, "step": 196 }, { "epoch": 0.006003855267468705, "grad_norm": 0.08087047934532166, "learning_rate": 4.5075e-05, "loss": 0.5559, "step": 197 }, { "epoch": 0.0060343316901462106, "grad_norm": 0.0826466903090477, "learning_rate": 4.5050000000000004e-05, "loss": 0.481, "step": 198 }, { "epoch": 0.006064808112823717, "grad_norm": 0.08658347278833389, "learning_rate": 4.5025000000000003e-05, "loss": 0.5096, "step": 199 }, { "epoch": 0.006095284535501223, "grad_norm": 0.07301410287618637, "learning_rate": 4.5e-05, "loss": 0.4696, "step": 200 }, { "epoch": 0.006125760958178729, "grad_norm": 0.07605516910552979, "learning_rate": 4.4975e-05, "loss": 0.488, "step": 201 }, { "epoch": 0.006156237380856235, "grad_norm": 0.08129167556762695, "learning_rate": 4.495e-05, "loss": 0.5479, "step": 202 }, { "epoch": 0.006186713803533741, "grad_norm": 0.07311762124300003, "learning_rate": 4.4925e-05, "loss": 0.4846, "step": 203 }, { "epoch": 0.006217190226211248, "grad_norm": 0.08050026744604111, "learning_rate": 4.49e-05, "loss": 0.4928, "step": 204 }, { "epoch": 0.006247666648888753, "grad_norm": 0.08124244213104248, "learning_rate": 4.4875e-05, "loss": 0.5505, "step": 205 }, { "epoch": 0.00627814307156626, "grad_norm": 0.0760040357708931, "learning_rate": 4.4850000000000006e-05, "loss": 0.4735, "step": 206 }, { "epoch": 0.006308619494243765, "grad_norm": 0.06538926064968109, "learning_rate": 4.4825e-05, "loss": 0.3924, "step": 207 }, { "epoch": 0.006339095916921272, "grad_norm": 0.07014009356498718, "learning_rate": 4.4800000000000005e-05, "loss": 0.4242, "step": 208 }, { "epoch": 0.0063695723395987775, "grad_norm": 0.07288139313459396, "learning_rate": 4.4775e-05, "loss": 0.4578, "step": 209 }, { "epoch": 0.006400048762276284, "grad_norm": 0.08024341613054276, "learning_rate": 4.4750000000000004e-05, "loss": 0.4984, "step": 210 }, { "epoch": 0.0064305251849537904, "grad_norm": 0.08177649229764938, "learning_rate": 4.4725e-05, "loss": 0.5086, "step": 211 }, { "epoch": 0.006461001607631296, "grad_norm": 0.07111845165491104, "learning_rate": 4.47e-05, "loss": 0.4412, "step": 212 }, { "epoch": 0.0064914780303088025, "grad_norm": 0.0800376608967781, "learning_rate": 4.4675e-05, "loss": 0.4914, "step": 213 }, { "epoch": 0.006521954452986308, "grad_norm": 0.07600034028291702, "learning_rate": 4.465e-05, "loss": 0.4273, "step": 214 }, { "epoch": 0.006552430875663815, "grad_norm": 0.08243297040462494, "learning_rate": 4.4625e-05, "loss": 0.5404, "step": 215 }, { "epoch": 0.006582907298341321, "grad_norm": 0.07431615144014359, "learning_rate": 4.46e-05, "loss": 0.5037, "step": 216 }, { "epoch": 0.006613383721018827, "grad_norm": 0.07259438931941986, "learning_rate": 4.4575e-05, "loss": 0.4363, "step": 217 }, { "epoch": 0.006643860143696333, "grad_norm": 0.07400614768266678, "learning_rate": 4.4550000000000005e-05, "loss": 0.5001, "step": 218 }, { "epoch": 0.006674336566373839, "grad_norm": 0.0698956847190857, "learning_rate": 4.4525e-05, "loss": 0.4161, "step": 219 }, { "epoch": 0.006704812989051345, "grad_norm": 0.07952745258808136, "learning_rate": 4.4500000000000004e-05, "loss": 0.5533, "step": 220 }, { "epoch": 0.006735289411728851, "grad_norm": 0.06805063784122467, "learning_rate": 4.4475e-05, "loss": 0.4282, "step": 221 }, { "epoch": 0.006765765834406357, "grad_norm": 0.07276663929224014, "learning_rate": 4.445e-05, "loss": 0.4399, "step": 222 }, { "epoch": 0.006796242257083864, "grad_norm": 0.07740099728107452, "learning_rate": 4.4425e-05, "loss": 0.4615, "step": 223 }, { "epoch": 0.0068267186797613694, "grad_norm": 0.07705219835042953, "learning_rate": 4.44e-05, "loss": 0.4655, "step": 224 }, { "epoch": 0.006857195102438876, "grad_norm": 0.07430753856897354, "learning_rate": 4.4375e-05, "loss": 0.4607, "step": 225 }, { "epoch": 0.0068876715251163815, "grad_norm": 0.0762786790728569, "learning_rate": 4.435e-05, "loss": 0.4725, "step": 226 }, { "epoch": 0.006918147947793888, "grad_norm": 0.07193771004676819, "learning_rate": 4.4325e-05, "loss": 0.4486, "step": 227 }, { "epoch": 0.0069486243704713945, "grad_norm": 0.08445839583873749, "learning_rate": 4.43e-05, "loss": 0.5583, "step": 228 }, { "epoch": 0.0069791007931489, "grad_norm": 0.07651213556528091, "learning_rate": 4.4275e-05, "loss": 0.4419, "step": 229 }, { "epoch": 0.007009577215826407, "grad_norm": 0.07532670348882675, "learning_rate": 4.4250000000000005e-05, "loss": 0.5153, "step": 230 }, { "epoch": 0.007040053638503912, "grad_norm": 0.07118629664182663, "learning_rate": 4.4225e-05, "loss": 0.4457, "step": 231 }, { "epoch": 0.007070530061181419, "grad_norm": 0.08137690275907516, "learning_rate": 4.4200000000000004e-05, "loss": 0.5275, "step": 232 }, { "epoch": 0.007101006483858924, "grad_norm": 0.08533833175897598, "learning_rate": 4.4174999999999996e-05, "loss": 0.4327, "step": 233 }, { "epoch": 0.007131482906536431, "grad_norm": 0.07369881123304367, "learning_rate": 4.415e-05, "loss": 0.4467, "step": 234 }, { "epoch": 0.007161959329213937, "grad_norm": 0.07385102659463882, "learning_rate": 4.4125e-05, "loss": 0.4016, "step": 235 }, { "epoch": 0.007192435751891443, "grad_norm": 0.07528825104236603, "learning_rate": 4.41e-05, "loss": 0.3836, "step": 236 }, { "epoch": 0.007222912174568949, "grad_norm": 0.0802421048283577, "learning_rate": 4.4075e-05, "loss": 0.4791, "step": 237 }, { "epoch": 0.007253388597246455, "grad_norm": 0.08030351996421814, "learning_rate": 4.405e-05, "loss": 0.5105, "step": 238 }, { "epoch": 0.007283865019923961, "grad_norm": 0.0701889768242836, "learning_rate": 4.4025e-05, "loss": 0.417, "step": 239 }, { "epoch": 0.007314341442601467, "grad_norm": 0.07961532473564148, "learning_rate": 4.4000000000000006e-05, "loss": 0.5295, "step": 240 }, { "epoch": 0.0073448178652789735, "grad_norm": 0.07134562730789185, "learning_rate": 4.3975e-05, "loss": 0.4093, "step": 241 }, { "epoch": 0.00737529428795648, "grad_norm": 0.07273057848215103, "learning_rate": 4.3950000000000004e-05, "loss": 0.4682, "step": 242 }, { "epoch": 0.007405770710633986, "grad_norm": 0.07073590159416199, "learning_rate": 4.3925e-05, "loss": 0.4467, "step": 243 }, { "epoch": 0.007436247133311492, "grad_norm": 0.07955696433782578, "learning_rate": 4.39e-05, "loss": 0.4688, "step": 244 }, { "epoch": 0.007466723555988998, "grad_norm": 0.0687423050403595, "learning_rate": 4.3875e-05, "loss": 0.4017, "step": 245 }, { "epoch": 0.007497199978666504, "grad_norm": 0.0788503885269165, "learning_rate": 4.385e-05, "loss": 0.5184, "step": 246 }, { "epoch": 0.007527676401344011, "grad_norm": 0.08318697661161423, "learning_rate": 4.3825e-05, "loss": 0.4574, "step": 247 }, { "epoch": 0.007558152824021516, "grad_norm": 0.07369694113731384, "learning_rate": 4.38e-05, "loss": 0.4392, "step": 248 }, { "epoch": 0.007588629246699023, "grad_norm": 0.0802166610956192, "learning_rate": 4.3775e-05, "loss": 0.4789, "step": 249 }, { "epoch": 0.007619105669376528, "grad_norm": 0.0804591178894043, "learning_rate": 4.375e-05, "loss": 0.5674, "step": 250 }, { "epoch": 0.007649582092054035, "grad_norm": 0.07581956684589386, "learning_rate": 4.3725000000000006e-05, "loss": 0.4913, "step": 251 }, { "epoch": 0.00768005851473154, "grad_norm": 0.08410049974918365, "learning_rate": 4.3700000000000005e-05, "loss": 0.5396, "step": 252 }, { "epoch": 0.007710534937409047, "grad_norm": 0.07544326037168503, "learning_rate": 4.3675000000000005e-05, "loss": 0.496, "step": 253 }, { "epoch": 0.007741011360086553, "grad_norm": 0.07511407136917114, "learning_rate": 4.3650000000000004e-05, "loss": 0.4467, "step": 254 }, { "epoch": 0.007771487782764059, "grad_norm": 0.07699119299650192, "learning_rate": 4.3625e-05, "loss": 0.4485, "step": 255 }, { "epoch": 0.0078019642054415654, "grad_norm": 0.0734848603606224, "learning_rate": 4.36e-05, "loss": 0.4556, "step": 256 }, { "epoch": 0.007832440628119072, "grad_norm": 0.07248286157846451, "learning_rate": 4.3575e-05, "loss": 0.4617, "step": 257 }, { "epoch": 0.007862917050796578, "grad_norm": 0.07695899903774261, "learning_rate": 4.355e-05, "loss": 0.4675, "step": 258 }, { "epoch": 0.007893393473474083, "grad_norm": 0.07150145620107651, "learning_rate": 4.352500000000001e-05, "loss": 0.4387, "step": 259 }, { "epoch": 0.00792386989615159, "grad_norm": 0.09068621695041656, "learning_rate": 4.35e-05, "loss": 0.5776, "step": 260 }, { "epoch": 0.007954346318829096, "grad_norm": 0.07706692069768906, "learning_rate": 4.3475000000000006e-05, "loss": 0.4651, "step": 261 }, { "epoch": 0.007984822741506602, "grad_norm": 0.09030276536941528, "learning_rate": 4.345e-05, "loss": 0.5912, "step": 262 }, { "epoch": 0.008015299164184107, "grad_norm": 0.08644349873065948, "learning_rate": 4.3425000000000005e-05, "loss": 0.5507, "step": 263 }, { "epoch": 0.008045775586861615, "grad_norm": 0.08443138748407364, "learning_rate": 4.3400000000000005e-05, "loss": 0.4633, "step": 264 }, { "epoch": 0.00807625200953912, "grad_norm": 0.08237074315547943, "learning_rate": 4.3375000000000004e-05, "loss": 0.459, "step": 265 }, { "epoch": 0.008106728432216626, "grad_norm": 0.072572261095047, "learning_rate": 4.335e-05, "loss": 0.435, "step": 266 }, { "epoch": 0.008137204854894133, "grad_norm": 0.06919915229082108, "learning_rate": 4.3325e-05, "loss": 0.3942, "step": 267 }, { "epoch": 0.008167681277571639, "grad_norm": 0.0756046250462532, "learning_rate": 4.33e-05, "loss": 0.4857, "step": 268 }, { "epoch": 0.008198157700249144, "grad_norm": 0.08680189400911331, "learning_rate": 4.3275e-05, "loss": 0.5668, "step": 269 }, { "epoch": 0.00822863412292665, "grad_norm": 0.07228045910596848, "learning_rate": 4.325e-05, "loss": 0.4254, "step": 270 }, { "epoch": 0.008259110545604157, "grad_norm": 0.08690252900123596, "learning_rate": 4.322500000000001e-05, "loss": 0.5195, "step": 271 }, { "epoch": 0.008289586968281663, "grad_norm": 0.08255655318498611, "learning_rate": 4.32e-05, "loss": 0.5488, "step": 272 }, { "epoch": 0.008320063390959169, "grad_norm": 0.07025208324193954, "learning_rate": 4.3175000000000006e-05, "loss": 0.4198, "step": 273 }, { "epoch": 0.008350539813636676, "grad_norm": 0.08045826852321625, "learning_rate": 4.315e-05, "loss": 0.5334, "step": 274 }, { "epoch": 0.008381016236314182, "grad_norm": 0.08307645469903946, "learning_rate": 4.3125000000000005e-05, "loss": 0.5511, "step": 275 }, { "epoch": 0.008411492658991687, "grad_norm": 0.0827789455652237, "learning_rate": 4.3100000000000004e-05, "loss": 0.5713, "step": 276 }, { "epoch": 0.008441969081669195, "grad_norm": 0.0748307853937149, "learning_rate": 4.3075000000000003e-05, "loss": 0.5015, "step": 277 }, { "epoch": 0.0084724455043467, "grad_norm": 0.07697857171297073, "learning_rate": 4.305e-05, "loss": 0.4674, "step": 278 }, { "epoch": 0.008502921927024206, "grad_norm": 0.06781286746263504, "learning_rate": 4.3025e-05, "loss": 0.4186, "step": 279 }, { "epoch": 0.008533398349701711, "grad_norm": 0.07750406861305237, "learning_rate": 4.3e-05, "loss": 0.5024, "step": 280 }, { "epoch": 0.008563874772379219, "grad_norm": 0.07747860997915268, "learning_rate": 4.2975e-05, "loss": 0.4938, "step": 281 }, { "epoch": 0.008594351195056724, "grad_norm": 0.08144353330135345, "learning_rate": 4.295e-05, "loss": 0.4669, "step": 282 }, { "epoch": 0.00862482761773423, "grad_norm": 0.07401426136493683, "learning_rate": 4.2925000000000007e-05, "loss": 0.4699, "step": 283 }, { "epoch": 0.008655304040411737, "grad_norm": 0.06882119923830032, "learning_rate": 4.29e-05, "loss": 0.3806, "step": 284 }, { "epoch": 0.008685780463089243, "grad_norm": 0.08108849078416824, "learning_rate": 4.2875000000000005e-05, "loss": 0.5306, "step": 285 }, { "epoch": 0.008716256885766748, "grad_norm": 0.07456759363412857, "learning_rate": 4.285e-05, "loss": 0.4699, "step": 286 }, { "epoch": 0.008746733308444254, "grad_norm": 0.07641758024692535, "learning_rate": 4.2825000000000004e-05, "loss": 0.48, "step": 287 }, { "epoch": 0.008777209731121761, "grad_norm": 0.09046676754951477, "learning_rate": 4.2800000000000004e-05, "loss": 0.5725, "step": 288 }, { "epoch": 0.008807686153799267, "grad_norm": 0.07710941880941391, "learning_rate": 4.2775e-05, "loss": 0.4937, "step": 289 }, { "epoch": 0.008838162576476773, "grad_norm": 0.07338641583919525, "learning_rate": 4.275e-05, "loss": 0.4494, "step": 290 }, { "epoch": 0.00886863899915428, "grad_norm": 0.07254870235919952, "learning_rate": 4.2725e-05, "loss": 0.4465, "step": 291 }, { "epoch": 0.008899115421831786, "grad_norm": 0.08015207201242447, "learning_rate": 4.27e-05, "loss": 0.4431, "step": 292 }, { "epoch": 0.008929591844509291, "grad_norm": 0.07529203593730927, "learning_rate": 4.2675e-05, "loss": 0.4618, "step": 293 }, { "epoch": 0.008960068267186797, "grad_norm": 0.07689110934734344, "learning_rate": 4.265e-05, "loss": 0.4062, "step": 294 }, { "epoch": 0.008990544689864304, "grad_norm": 0.08753497153520584, "learning_rate": 4.2625000000000006e-05, "loss": 0.5656, "step": 295 }, { "epoch": 0.00902102111254181, "grad_norm": 0.08756504207849503, "learning_rate": 4.26e-05, "loss": 0.5694, "step": 296 }, { "epoch": 0.009051497535219315, "grad_norm": 0.07585353404283524, "learning_rate": 4.2575000000000005e-05, "loss": 0.4272, "step": 297 }, { "epoch": 0.009081973957896823, "grad_norm": 0.07256894558668137, "learning_rate": 4.2550000000000004e-05, "loss": 0.3954, "step": 298 }, { "epoch": 0.009112450380574328, "grad_norm": 0.09153769910335541, "learning_rate": 4.2525000000000004e-05, "loss": 0.5122, "step": 299 }, { "epoch": 0.009142926803251834, "grad_norm": 0.0816190093755722, "learning_rate": 4.25e-05, "loss": 0.5171, "step": 300 }, { "epoch": 0.00917340322592934, "grad_norm": 0.0805639773607254, "learning_rate": 4.2475e-05, "loss": 0.5511, "step": 301 }, { "epoch": 0.009203879648606847, "grad_norm": 0.07218771427869797, "learning_rate": 4.245e-05, "loss": 0.3859, "step": 302 }, { "epoch": 0.009234356071284353, "grad_norm": 0.07436776161193848, "learning_rate": 4.2425e-05, "loss": 0.4575, "step": 303 }, { "epoch": 0.009264832493961858, "grad_norm": 0.07286684960126877, "learning_rate": 4.24e-05, "loss": 0.4137, "step": 304 }, { "epoch": 0.009295308916639365, "grad_norm": 0.07623466104269028, "learning_rate": 4.237500000000001e-05, "loss": 0.4005, "step": 305 }, { "epoch": 0.009325785339316871, "grad_norm": 0.0739443451166153, "learning_rate": 4.235e-05, "loss": 0.4484, "step": 306 }, { "epoch": 0.009356261761994377, "grad_norm": 0.08217545598745346, "learning_rate": 4.2325000000000006e-05, "loss": 0.5258, "step": 307 }, { "epoch": 0.009386738184671884, "grad_norm": 0.07618866115808487, "learning_rate": 4.23e-05, "loss": 0.4903, "step": 308 }, { "epoch": 0.00941721460734939, "grad_norm": 0.07885143160820007, "learning_rate": 4.2275000000000004e-05, "loss": 0.4987, "step": 309 }, { "epoch": 0.009447691030026895, "grad_norm": 0.07813480496406555, "learning_rate": 4.2250000000000004e-05, "loss": 0.494, "step": 310 }, { "epoch": 0.009478167452704401, "grad_norm": 0.07637833803892136, "learning_rate": 4.2225e-05, "loss": 0.4203, "step": 311 }, { "epoch": 0.009508643875381908, "grad_norm": 0.0785045176744461, "learning_rate": 4.22e-05, "loss": 0.4664, "step": 312 }, { "epoch": 0.009539120298059414, "grad_norm": 0.06332698464393616, "learning_rate": 4.2175e-05, "loss": 0.3423, "step": 313 }, { "epoch": 0.00956959672073692, "grad_norm": 0.08062972128391266, "learning_rate": 4.215e-05, "loss": 0.516, "step": 314 }, { "epoch": 0.009600073143414427, "grad_norm": 0.0749763622879982, "learning_rate": 4.2125e-05, "loss": 0.4377, "step": 315 }, { "epoch": 0.009630549566091932, "grad_norm": 0.07362157106399536, "learning_rate": 4.21e-05, "loss": 0.4897, "step": 316 }, { "epoch": 0.009661025988769438, "grad_norm": 0.07914413511753082, "learning_rate": 4.2075000000000006e-05, "loss": 0.5261, "step": 317 }, { "epoch": 0.009691502411446944, "grad_norm": 0.08899474889039993, "learning_rate": 4.205e-05, "loss": 0.5316, "step": 318 }, { "epoch": 0.009721978834124451, "grad_norm": 0.07666253298521042, "learning_rate": 4.2025000000000005e-05, "loss": 0.4479, "step": 319 }, { "epoch": 0.009752455256801957, "grad_norm": 0.07678390294313431, "learning_rate": 4.2e-05, "loss": 0.4688, "step": 320 }, { "epoch": 0.009782931679479462, "grad_norm": 0.06993228942155838, "learning_rate": 4.1975000000000004e-05, "loss": 0.4112, "step": 321 }, { "epoch": 0.00981340810215697, "grad_norm": 0.07682249695062637, "learning_rate": 4.195e-05, "loss": 0.4844, "step": 322 }, { "epoch": 0.009843884524834475, "grad_norm": 0.08613816648721695, "learning_rate": 4.1925e-05, "loss": 0.5618, "step": 323 }, { "epoch": 0.00987436094751198, "grad_norm": 0.09769878536462784, "learning_rate": 4.19e-05, "loss": 0.6224, "step": 324 }, { "epoch": 0.009904837370189486, "grad_norm": 0.0781012549996376, "learning_rate": 4.1875e-05, "loss": 0.4844, "step": 325 }, { "epoch": 0.009935313792866994, "grad_norm": 0.08791881054639816, "learning_rate": 4.185e-05, "loss": 0.5745, "step": 326 }, { "epoch": 0.0099657902155445, "grad_norm": 0.07679228484630585, "learning_rate": 4.1825e-05, "loss": 0.5058, "step": 327 }, { "epoch": 0.009996266638222005, "grad_norm": 0.07798115164041519, "learning_rate": 4.18e-05, "loss": 0.4785, "step": 328 }, { "epoch": 0.010026743060899512, "grad_norm": 0.0843651294708252, "learning_rate": 4.1775000000000006e-05, "loss": 0.5538, "step": 329 }, { "epoch": 0.010057219483577018, "grad_norm": 0.07032376527786255, "learning_rate": 4.175e-05, "loss": 0.4209, "step": 330 }, { "epoch": 0.010087695906254524, "grad_norm": 0.07439099252223969, "learning_rate": 4.1725000000000005e-05, "loss": 0.4053, "step": 331 }, { "epoch": 0.010118172328932029, "grad_norm": 0.08256080001592636, "learning_rate": 4.17e-05, "loss": 0.4582, "step": 332 }, { "epoch": 0.010148648751609536, "grad_norm": 0.07892946898937225, "learning_rate": 4.1675e-05, "loss": 0.4831, "step": 333 }, { "epoch": 0.010179125174287042, "grad_norm": 0.07524366676807404, "learning_rate": 4.165e-05, "loss": 0.4616, "step": 334 }, { "epoch": 0.010209601596964548, "grad_norm": 0.07178211212158203, "learning_rate": 4.1625e-05, "loss": 0.3863, "step": 335 }, { "epoch": 0.010240078019642055, "grad_norm": 0.09030681103467941, "learning_rate": 4.16e-05, "loss": 0.5006, "step": 336 }, { "epoch": 0.01027055444231956, "grad_norm": 0.08173321187496185, "learning_rate": 4.1575e-05, "loss": 0.4858, "step": 337 }, { "epoch": 0.010301030864997066, "grad_norm": 0.07854704558849335, "learning_rate": 4.155e-05, "loss": 0.4673, "step": 338 }, { "epoch": 0.010331507287674574, "grad_norm": 0.0789709985256195, "learning_rate": 4.1525e-05, "loss": 0.4963, "step": 339 }, { "epoch": 0.01036198371035208, "grad_norm": 0.08882351219654083, "learning_rate": 4.15e-05, "loss": 0.5322, "step": 340 }, { "epoch": 0.010392460133029585, "grad_norm": 0.08192622661590576, "learning_rate": 4.1475000000000005e-05, "loss": 0.5095, "step": 341 }, { "epoch": 0.01042293655570709, "grad_norm": 0.08845946937799454, "learning_rate": 4.145e-05, "loss": 0.5734, "step": 342 }, { "epoch": 0.010453412978384598, "grad_norm": 0.07699241489171982, "learning_rate": 4.1425000000000004e-05, "loss": 0.487, "step": 343 }, { "epoch": 0.010483889401062103, "grad_norm": 0.08124350011348724, "learning_rate": 4.14e-05, "loss": 0.4916, "step": 344 }, { "epoch": 0.010514365823739609, "grad_norm": 0.07038760930299759, "learning_rate": 4.1375e-05, "loss": 0.3755, "step": 345 }, { "epoch": 0.010544842246417116, "grad_norm": 0.08216731995344162, "learning_rate": 4.135e-05, "loss": 0.4822, "step": 346 }, { "epoch": 0.010575318669094622, "grad_norm": 0.06939674913883209, "learning_rate": 4.1325e-05, "loss": 0.3854, "step": 347 }, { "epoch": 0.010605795091772128, "grad_norm": 0.08228998631238937, "learning_rate": 4.13e-05, "loss": 0.508, "step": 348 }, { "epoch": 0.010636271514449633, "grad_norm": 0.08826014399528503, "learning_rate": 4.1275e-05, "loss": 0.583, "step": 349 }, { "epoch": 0.01066674793712714, "grad_norm": 0.07431170344352722, "learning_rate": 4.125e-05, "loss": 0.4318, "step": 350 }, { "epoch": 0.010697224359804646, "grad_norm": 0.07770109921693802, "learning_rate": 4.1225e-05, "loss": 0.4391, "step": 351 }, { "epoch": 0.010727700782482152, "grad_norm": 0.0851755365729332, "learning_rate": 4.12e-05, "loss": 0.5036, "step": 352 }, { "epoch": 0.010758177205159659, "grad_norm": 0.0783093273639679, "learning_rate": 4.1175000000000005e-05, "loss": 0.4355, "step": 353 }, { "epoch": 0.010788653627837165, "grad_norm": 0.07728315889835358, "learning_rate": 4.115e-05, "loss": 0.4403, "step": 354 }, { "epoch": 0.01081913005051467, "grad_norm": 0.07687405496835709, "learning_rate": 4.1125000000000004e-05, "loss": 0.4331, "step": 355 }, { "epoch": 0.010849606473192176, "grad_norm": 0.08391179889440536, "learning_rate": 4.11e-05, "loss": 0.5219, "step": 356 }, { "epoch": 0.010880082895869683, "grad_norm": 0.08034833520650864, "learning_rate": 4.1075e-05, "loss": 0.462, "step": 357 }, { "epoch": 0.010910559318547189, "grad_norm": 0.08306483179330826, "learning_rate": 4.105e-05, "loss": 0.4798, "step": 358 }, { "epoch": 0.010941035741224694, "grad_norm": 0.07427525520324707, "learning_rate": 4.1025e-05, "loss": 0.3977, "step": 359 }, { "epoch": 0.010971512163902202, "grad_norm": 0.07523825764656067, "learning_rate": 4.1e-05, "loss": 0.4161, "step": 360 }, { "epoch": 0.011001988586579707, "grad_norm": 0.08339823782444, "learning_rate": 4.0975e-05, "loss": 0.4884, "step": 361 }, { "epoch": 0.011032465009257213, "grad_norm": 0.07922948151826859, "learning_rate": 4.095e-05, "loss": 0.4612, "step": 362 }, { "epoch": 0.011062941431934719, "grad_norm": 0.0810355469584465, "learning_rate": 4.0925000000000005e-05, "loss": 0.5522, "step": 363 }, { "epoch": 0.011093417854612226, "grad_norm": 0.08766781538724899, "learning_rate": 4.09e-05, "loss": 0.5884, "step": 364 }, { "epoch": 0.011123894277289732, "grad_norm": 0.08863656222820282, "learning_rate": 4.0875000000000004e-05, "loss": 0.5265, "step": 365 }, { "epoch": 0.011154370699967237, "grad_norm": 0.0837583914399147, "learning_rate": 4.085e-05, "loss": 0.4875, "step": 366 }, { "epoch": 0.011184847122644745, "grad_norm": 0.08287142962217331, "learning_rate": 4.0825e-05, "loss": 0.5187, "step": 367 }, { "epoch": 0.01121532354532225, "grad_norm": 0.08429639041423798, "learning_rate": 4.08e-05, "loss": 0.5085, "step": 368 }, { "epoch": 0.011245799967999756, "grad_norm": 0.08869506418704987, "learning_rate": 4.0775e-05, "loss": 0.5489, "step": 369 }, { "epoch": 0.011276276390677263, "grad_norm": 0.07599665224552155, "learning_rate": 4.075e-05, "loss": 0.4202, "step": 370 }, { "epoch": 0.011306752813354769, "grad_norm": 0.07747755944728851, "learning_rate": 4.0725e-05, "loss": 0.4555, "step": 371 }, { "epoch": 0.011337229236032274, "grad_norm": 0.07758960872888565, "learning_rate": 4.07e-05, "loss": 0.5277, "step": 372 }, { "epoch": 0.01136770565870978, "grad_norm": 0.10279912501573563, "learning_rate": 4.0675e-05, "loss": 0.5704, "step": 373 }, { "epoch": 0.011398182081387287, "grad_norm": 0.09146252274513245, "learning_rate": 4.065e-05, "loss": 0.4633, "step": 374 }, { "epoch": 0.011428658504064793, "grad_norm": 0.07463935762643814, "learning_rate": 4.0625000000000005e-05, "loss": 0.442, "step": 375 }, { "epoch": 0.011459134926742299, "grad_norm": 0.07800383120775223, "learning_rate": 4.0600000000000004e-05, "loss": 0.4684, "step": 376 }, { "epoch": 0.011489611349419806, "grad_norm": 0.0858907699584961, "learning_rate": 4.0575000000000004e-05, "loss": 0.5116, "step": 377 }, { "epoch": 0.011520087772097311, "grad_norm": 0.08336257934570312, "learning_rate": 4.055e-05, "loss": 0.4999, "step": 378 }, { "epoch": 0.011550564194774817, "grad_norm": 0.06975117325782776, "learning_rate": 4.0525e-05, "loss": 0.36, "step": 379 }, { "epoch": 0.011581040617452323, "grad_norm": 0.08228785544633865, "learning_rate": 4.05e-05, "loss": 0.5401, "step": 380 }, { "epoch": 0.01161151704012983, "grad_norm": 0.07256509363651276, "learning_rate": 4.0475e-05, "loss": 0.4582, "step": 381 }, { "epoch": 0.011641993462807336, "grad_norm": 0.09035415947437286, "learning_rate": 4.045000000000001e-05, "loss": 0.5545, "step": 382 }, { "epoch": 0.011672469885484841, "grad_norm": 0.08884069323539734, "learning_rate": 4.0425e-05, "loss": 0.5202, "step": 383 }, { "epoch": 0.011702946308162349, "grad_norm": 0.08722904324531555, "learning_rate": 4.0400000000000006e-05, "loss": 0.4609, "step": 384 }, { "epoch": 0.011733422730839854, "grad_norm": 0.1007859855890274, "learning_rate": 4.0375e-05, "loss": 0.6291, "step": 385 }, { "epoch": 0.01176389915351736, "grad_norm": 0.08049754798412323, "learning_rate": 4.0350000000000005e-05, "loss": 0.509, "step": 386 }, { "epoch": 0.011794375576194865, "grad_norm": 0.07452309876680374, "learning_rate": 4.0325000000000004e-05, "loss": 0.4385, "step": 387 }, { "epoch": 0.011824851998872373, "grad_norm": 0.08714322000741959, "learning_rate": 4.0300000000000004e-05, "loss": 0.5173, "step": 388 }, { "epoch": 0.011855328421549878, "grad_norm": 0.09337788820266724, "learning_rate": 4.0275e-05, "loss": 0.5304, "step": 389 }, { "epoch": 0.011885804844227384, "grad_norm": 0.08082461357116699, "learning_rate": 4.025e-05, "loss": 0.402, "step": 390 }, { "epoch": 0.011916281266904891, "grad_norm": 0.09719090163707733, "learning_rate": 4.0225e-05, "loss": 0.5822, "step": 391 }, { "epoch": 0.011946757689582397, "grad_norm": 0.07829541712999344, "learning_rate": 4.02e-05, "loss": 0.4693, "step": 392 }, { "epoch": 0.011977234112259903, "grad_norm": 0.08262904733419418, "learning_rate": 4.0175e-05, "loss": 0.531, "step": 393 }, { "epoch": 0.01200771053493741, "grad_norm": 0.08425929397344589, "learning_rate": 4.015000000000001e-05, "loss": 0.5226, "step": 394 }, { "epoch": 0.012038186957614916, "grad_norm": 0.078396275639534, "learning_rate": 4.0125e-05, "loss": 0.467, "step": 395 }, { "epoch": 0.012068663380292421, "grad_norm": 0.07454272359609604, "learning_rate": 4.0100000000000006e-05, "loss": 0.4209, "step": 396 }, { "epoch": 0.012099139802969927, "grad_norm": 0.08931227773427963, "learning_rate": 4.0075e-05, "loss": 0.508, "step": 397 }, { "epoch": 0.012129616225647434, "grad_norm": 0.07294146716594696, "learning_rate": 4.0050000000000004e-05, "loss": 0.4024, "step": 398 }, { "epoch": 0.01216009264832494, "grad_norm": 0.07619839906692505, "learning_rate": 4.0025000000000004e-05, "loss": 0.4567, "step": 399 }, { "epoch": 0.012190569071002445, "grad_norm": 0.07713538408279419, "learning_rate": 4e-05, "loss": 0.4528, "step": 400 }, { "epoch": 0.012221045493679953, "grad_norm": 0.07649137824773788, "learning_rate": 3.9975e-05, "loss": 0.4859, "step": 401 }, { "epoch": 0.012251521916357458, "grad_norm": 0.0757165402173996, "learning_rate": 3.995e-05, "loss": 0.3973, "step": 402 }, { "epoch": 0.012281998339034964, "grad_norm": 0.0760243758559227, "learning_rate": 3.9925e-05, "loss": 0.4527, "step": 403 }, { "epoch": 0.01231247476171247, "grad_norm": 0.08435600996017456, "learning_rate": 3.99e-05, "loss": 0.4858, "step": 404 }, { "epoch": 0.012342951184389977, "grad_norm": 0.08177062124013901, "learning_rate": 3.9875e-05, "loss": 0.5606, "step": 405 }, { "epoch": 0.012373427607067482, "grad_norm": 0.08372743427753448, "learning_rate": 3.9850000000000006e-05, "loss": 0.457, "step": 406 }, { "epoch": 0.012403904029744988, "grad_norm": 0.07480288296937943, "learning_rate": 3.9825e-05, "loss": 0.4508, "step": 407 }, { "epoch": 0.012434380452422495, "grad_norm": 0.0682869479060173, "learning_rate": 3.9800000000000005e-05, "loss": 0.3992, "step": 408 }, { "epoch": 0.012464856875100001, "grad_norm": 0.07428453117609024, "learning_rate": 3.9775e-05, "loss": 0.4312, "step": 409 }, { "epoch": 0.012495333297777507, "grad_norm": 0.08548396080732346, "learning_rate": 3.9750000000000004e-05, "loss": 0.5507, "step": 410 }, { "epoch": 0.012525809720455012, "grad_norm": 0.0731261596083641, "learning_rate": 3.9725e-05, "loss": 0.3561, "step": 411 }, { "epoch": 0.01255628614313252, "grad_norm": 0.07011257857084274, "learning_rate": 3.97e-05, "loss": 0.4081, "step": 412 }, { "epoch": 0.012586762565810025, "grad_norm": 0.08660565316677094, "learning_rate": 3.9675e-05, "loss": 0.5573, "step": 413 }, { "epoch": 0.01261723898848753, "grad_norm": 0.07212682068347931, "learning_rate": 3.965e-05, "loss": 0.374, "step": 414 }, { "epoch": 0.012647715411165038, "grad_norm": 0.08162117004394531, "learning_rate": 3.9625e-05, "loss": 0.4749, "step": 415 }, { "epoch": 0.012678191833842544, "grad_norm": 0.08177854865789413, "learning_rate": 3.960000000000001e-05, "loss": 0.491, "step": 416 }, { "epoch": 0.01270866825652005, "grad_norm": 0.07734846323728561, "learning_rate": 3.9575e-05, "loss": 0.4729, "step": 417 }, { "epoch": 0.012739144679197555, "grad_norm": 0.08370812982320786, "learning_rate": 3.9550000000000006e-05, "loss": 0.5116, "step": 418 }, { "epoch": 0.012769621101875062, "grad_norm": 0.07237836718559265, "learning_rate": 3.9525e-05, "loss": 0.3803, "step": 419 }, { "epoch": 0.012800097524552568, "grad_norm": 0.07796831429004669, "learning_rate": 3.9500000000000005e-05, "loss": 0.4851, "step": 420 }, { "epoch": 0.012830573947230074, "grad_norm": 0.08543843030929565, "learning_rate": 3.9475000000000004e-05, "loss": 0.5465, "step": 421 }, { "epoch": 0.012861050369907581, "grad_norm": 0.08032520115375519, "learning_rate": 3.9450000000000003e-05, "loss": 0.5003, "step": 422 }, { "epoch": 0.012891526792585086, "grad_norm": 0.09619452059268951, "learning_rate": 3.9425e-05, "loss": 0.5955, "step": 423 }, { "epoch": 0.012922003215262592, "grad_norm": 0.07409845292568207, "learning_rate": 3.94e-05, "loss": 0.3718, "step": 424 }, { "epoch": 0.0129524796379401, "grad_norm": 0.08596406131982803, "learning_rate": 3.9375e-05, "loss": 0.5497, "step": 425 }, { "epoch": 0.012982956060617605, "grad_norm": 0.08141975849866867, "learning_rate": 3.935e-05, "loss": 0.4538, "step": 426 }, { "epoch": 0.01301343248329511, "grad_norm": 0.08574359118938446, "learning_rate": 3.9325e-05, "loss": 0.5416, "step": 427 }, { "epoch": 0.013043908905972616, "grad_norm": 0.0776650458574295, "learning_rate": 3.9300000000000007e-05, "loss": 0.4761, "step": 428 }, { "epoch": 0.013074385328650124, "grad_norm": 0.0675501599907875, "learning_rate": 3.9275e-05, "loss": 0.3704, "step": 429 }, { "epoch": 0.01310486175132763, "grad_norm": 0.07565828412771225, "learning_rate": 3.9250000000000005e-05, "loss": 0.4601, "step": 430 }, { "epoch": 0.013135338174005135, "grad_norm": 0.09003283828496933, "learning_rate": 3.9225e-05, "loss": 0.4576, "step": 431 }, { "epoch": 0.013165814596682642, "grad_norm": 0.07453721761703491, "learning_rate": 3.9200000000000004e-05, "loss": 0.4726, "step": 432 }, { "epoch": 0.013196291019360148, "grad_norm": 0.07131854444742203, "learning_rate": 3.9175000000000004e-05, "loss": 0.3941, "step": 433 }, { "epoch": 0.013226767442037653, "grad_norm": 0.08867894858121872, "learning_rate": 3.915e-05, "loss": 0.5736, "step": 434 }, { "epoch": 0.013257243864715159, "grad_norm": 0.07429534196853638, "learning_rate": 3.9125e-05, "loss": 0.475, "step": 435 }, { "epoch": 0.013287720287392666, "grad_norm": 0.07305336743593216, "learning_rate": 3.91e-05, "loss": 0.3877, "step": 436 }, { "epoch": 0.013318196710070172, "grad_norm": 0.08547985553741455, "learning_rate": 3.9075e-05, "loss": 0.5192, "step": 437 }, { "epoch": 0.013348673132747678, "grad_norm": 0.07932808995246887, "learning_rate": 3.905e-05, "loss": 0.4495, "step": 438 }, { "epoch": 0.013379149555425185, "grad_norm": 0.07845611870288849, "learning_rate": 3.9025e-05, "loss": 0.4142, "step": 439 }, { "epoch": 0.01340962597810269, "grad_norm": 0.0850796028971672, "learning_rate": 3.9000000000000006e-05, "loss": 0.5536, "step": 440 }, { "epoch": 0.013440102400780196, "grad_norm": 0.07283198833465576, "learning_rate": 3.8975e-05, "loss": 0.4152, "step": 441 }, { "epoch": 0.013470578823457702, "grad_norm": 0.07944342494010925, "learning_rate": 3.8950000000000005e-05, "loss": 0.473, "step": 442 }, { "epoch": 0.013501055246135209, "grad_norm": 0.07776287198066711, "learning_rate": 3.8925e-05, "loss": 0.454, "step": 443 }, { "epoch": 0.013531531668812715, "grad_norm": 0.08026792854070663, "learning_rate": 3.8900000000000004e-05, "loss": 0.4772, "step": 444 }, { "epoch": 0.01356200809149022, "grad_norm": 0.07469011843204498, "learning_rate": 3.8875e-05, "loss": 0.408, "step": 445 }, { "epoch": 0.013592484514167728, "grad_norm": 0.07390125095844269, "learning_rate": 3.885e-05, "loss": 0.4134, "step": 446 }, { "epoch": 0.013622960936845233, "grad_norm": 0.0784907266497612, "learning_rate": 3.8825e-05, "loss": 0.4678, "step": 447 }, { "epoch": 0.013653437359522739, "grad_norm": 0.07014909386634827, "learning_rate": 3.88e-05, "loss": 0.4212, "step": 448 }, { "epoch": 0.013683913782200244, "grad_norm": 0.07684614509344101, "learning_rate": 3.8775e-05, "loss": 0.4614, "step": 449 }, { "epoch": 0.013714390204877752, "grad_norm": 0.08842401206493378, "learning_rate": 3.875e-05, "loss": 0.5443, "step": 450 }, { "epoch": 0.013744866627555257, "grad_norm": 0.08343005180358887, "learning_rate": 3.8725e-05, "loss": 0.4957, "step": 451 }, { "epoch": 0.013775343050232763, "grad_norm": 0.07054363191127777, "learning_rate": 3.8700000000000006e-05, "loss": 0.4017, "step": 452 }, { "epoch": 0.01380581947291027, "grad_norm": 0.07983597368001938, "learning_rate": 3.8675e-05, "loss": 0.4475, "step": 453 }, { "epoch": 0.013836295895587776, "grad_norm": 0.08152148872613907, "learning_rate": 3.8650000000000004e-05, "loss": 0.452, "step": 454 }, { "epoch": 0.013866772318265282, "grad_norm": 0.08927030861377716, "learning_rate": 3.8625e-05, "loss": 0.5574, "step": 455 }, { "epoch": 0.013897248740942789, "grad_norm": 0.08453473448753357, "learning_rate": 3.86e-05, "loss": 0.5184, "step": 456 }, { "epoch": 0.013927725163620295, "grad_norm": 0.09216178953647614, "learning_rate": 3.8575e-05, "loss": 0.5366, "step": 457 }, { "epoch": 0.0139582015862978, "grad_norm": 0.08244302123785019, "learning_rate": 3.855e-05, "loss": 0.4558, "step": 458 }, { "epoch": 0.013988678008975306, "grad_norm": 0.10122604668140411, "learning_rate": 3.8525e-05, "loss": 0.6865, "step": 459 }, { "epoch": 0.014019154431652813, "grad_norm": 0.07872531563043594, "learning_rate": 3.85e-05, "loss": 0.4727, "step": 460 }, { "epoch": 0.014049630854330319, "grad_norm": 0.07340526580810547, "learning_rate": 3.8475e-05, "loss": 0.4166, "step": 461 }, { "epoch": 0.014080107277007824, "grad_norm": 0.08701716363430023, "learning_rate": 3.845e-05, "loss": 0.5126, "step": 462 }, { "epoch": 0.014110583699685332, "grad_norm": 0.08246690779924393, "learning_rate": 3.8425e-05, "loss": 0.5024, "step": 463 }, { "epoch": 0.014141060122362837, "grad_norm": 0.07318553328514099, "learning_rate": 3.8400000000000005e-05, "loss": 0.3916, "step": 464 }, { "epoch": 0.014171536545040343, "grad_norm": 0.0717618316411972, "learning_rate": 3.8375e-05, "loss": 0.4433, "step": 465 }, { "epoch": 0.014202012967717849, "grad_norm": 0.07733088731765747, "learning_rate": 3.8350000000000004e-05, "loss": 0.4767, "step": 466 }, { "epoch": 0.014232489390395356, "grad_norm": 0.08398626744747162, "learning_rate": 3.8324999999999996e-05, "loss": 0.4199, "step": 467 }, { "epoch": 0.014262965813072861, "grad_norm": 0.07931336015462875, "learning_rate": 3.83e-05, "loss": 0.4861, "step": 468 }, { "epoch": 0.014293442235750367, "grad_norm": 0.08510344475507736, "learning_rate": 3.8275e-05, "loss": 0.5148, "step": 469 }, { "epoch": 0.014323918658427874, "grad_norm": 0.07216447591781616, "learning_rate": 3.825e-05, "loss": 0.3918, "step": 470 }, { "epoch": 0.01435439508110538, "grad_norm": 0.0710480809211731, "learning_rate": 3.8225e-05, "loss": 0.4232, "step": 471 }, { "epoch": 0.014384871503782886, "grad_norm": 0.08191974461078644, "learning_rate": 3.82e-05, "loss": 0.467, "step": 472 }, { "epoch": 0.014415347926460391, "grad_norm": 0.0795518159866333, "learning_rate": 3.8175e-05, "loss": 0.4143, "step": 473 }, { "epoch": 0.014445824349137899, "grad_norm": 0.084683358669281, "learning_rate": 3.8150000000000006e-05, "loss": 0.5059, "step": 474 }, { "epoch": 0.014476300771815404, "grad_norm": 0.08683113008737564, "learning_rate": 3.8125e-05, "loss": 0.5581, "step": 475 }, { "epoch": 0.01450677719449291, "grad_norm": 0.07901822775602341, "learning_rate": 3.8100000000000005e-05, "loss": 0.4858, "step": 476 }, { "epoch": 0.014537253617170417, "grad_norm": 0.0714401826262474, "learning_rate": 3.8075e-05, "loss": 0.4008, "step": 477 }, { "epoch": 0.014567730039847923, "grad_norm": 0.08758770674467087, "learning_rate": 3.805e-05, "loss": 0.5007, "step": 478 }, { "epoch": 0.014598206462525428, "grad_norm": 0.07747769355773926, "learning_rate": 3.8025e-05, "loss": 0.4646, "step": 479 }, { "epoch": 0.014628682885202934, "grad_norm": 0.06961502134799957, "learning_rate": 3.8e-05, "loss": 0.4005, "step": 480 }, { "epoch": 0.014659159307880441, "grad_norm": 0.08239906281232834, "learning_rate": 3.7975e-05, "loss": 0.4928, "step": 481 }, { "epoch": 0.014689635730557947, "grad_norm": 0.07400859892368317, "learning_rate": 3.795e-05, "loss": 0.4319, "step": 482 }, { "epoch": 0.014720112153235453, "grad_norm": 0.0799640417098999, "learning_rate": 3.7925e-05, "loss": 0.5039, "step": 483 }, { "epoch": 0.01475058857591296, "grad_norm": 0.07632756233215332, "learning_rate": 3.79e-05, "loss": 0.4618, "step": 484 }, { "epoch": 0.014781064998590466, "grad_norm": 0.07667844742536545, "learning_rate": 3.7875e-05, "loss": 0.3596, "step": 485 }, { "epoch": 0.014811541421267971, "grad_norm": 0.0750662237405777, "learning_rate": 3.7850000000000005e-05, "loss": 0.4274, "step": 486 }, { "epoch": 0.014842017843945478, "grad_norm": 0.07854458689689636, "learning_rate": 3.7825e-05, "loss": 0.4358, "step": 487 }, { "epoch": 0.014872494266622984, "grad_norm": 0.08175184577703476, "learning_rate": 3.7800000000000004e-05, "loss": 0.4799, "step": 488 }, { "epoch": 0.01490297068930049, "grad_norm": 0.0801885649561882, "learning_rate": 3.7775e-05, "loss": 0.4828, "step": 489 }, { "epoch": 0.014933447111977995, "grad_norm": 0.07974839955568314, "learning_rate": 3.775e-05, "loss": 0.4403, "step": 490 }, { "epoch": 0.014963923534655503, "grad_norm": 0.07385674118995667, "learning_rate": 3.7725e-05, "loss": 0.4234, "step": 491 }, { "epoch": 0.014994399957333008, "grad_norm": 0.07916196435689926, "learning_rate": 3.77e-05, "loss": 0.5123, "step": 492 }, { "epoch": 0.015024876380010514, "grad_norm": 0.08391543477773666, "learning_rate": 3.7675e-05, "loss": 0.4743, "step": 493 }, { "epoch": 0.015055352802688021, "grad_norm": 0.07716374099254608, "learning_rate": 3.765e-05, "loss": 0.4162, "step": 494 }, { "epoch": 0.015085829225365527, "grad_norm": 0.0842309296131134, "learning_rate": 3.7625e-05, "loss": 0.5166, "step": 495 }, { "epoch": 0.015116305648043032, "grad_norm": 0.08199356496334076, "learning_rate": 3.76e-05, "loss": 0.4381, "step": 496 }, { "epoch": 0.015146782070720538, "grad_norm": 0.08166930079460144, "learning_rate": 3.7575e-05, "loss": 0.4459, "step": 497 }, { "epoch": 0.015177258493398045, "grad_norm": 0.09283913671970367, "learning_rate": 3.7550000000000005e-05, "loss": 0.4972, "step": 498 }, { "epoch": 0.015207734916075551, "grad_norm": 0.07399161159992218, "learning_rate": 3.7525e-05, "loss": 0.433, "step": 499 }, { "epoch": 0.015238211338753057, "grad_norm": 0.077593132853508, "learning_rate": 3.7500000000000003e-05, "loss": 0.4577, "step": 500 }, { "epoch": 0.015268687761430564, "grad_norm": 0.07779756188392639, "learning_rate": 3.7475e-05, "loss": 0.4599, "step": 501 }, { "epoch": 0.01529916418410807, "grad_norm": 0.0814337283372879, "learning_rate": 3.745e-05, "loss": 0.5169, "step": 502 }, { "epoch": 0.015329640606785575, "grad_norm": 0.07928173243999481, "learning_rate": 3.7425e-05, "loss": 0.5128, "step": 503 }, { "epoch": 0.01536011702946308, "grad_norm": 0.09029681980609894, "learning_rate": 3.74e-05, "loss": 0.5687, "step": 504 }, { "epoch": 0.015390593452140588, "grad_norm": 0.0836619883775711, "learning_rate": 3.737500000000001e-05, "loss": 0.5252, "step": 505 }, { "epoch": 0.015421069874818094, "grad_norm": 0.07593099772930145, "learning_rate": 3.735e-05, "loss": 0.4365, "step": 506 }, { "epoch": 0.0154515462974956, "grad_norm": 0.0810251533985138, "learning_rate": 3.7325000000000006e-05, "loss": 0.4729, "step": 507 }, { "epoch": 0.015482022720173107, "grad_norm": 0.08296098560094833, "learning_rate": 3.73e-05, "loss": 0.5088, "step": 508 }, { "epoch": 0.015512499142850612, "grad_norm": 0.08177246153354645, "learning_rate": 3.7275000000000005e-05, "loss": 0.4731, "step": 509 }, { "epoch": 0.015542975565528118, "grad_norm": 0.07968758046627045, "learning_rate": 3.7250000000000004e-05, "loss": 0.4485, "step": 510 }, { "epoch": 0.015573451988205625, "grad_norm": 0.08020075410604477, "learning_rate": 3.7225000000000004e-05, "loss": 0.5004, "step": 511 }, { "epoch": 0.015603928410883131, "grad_norm": 0.07310120016336441, "learning_rate": 3.72e-05, "loss": 0.4016, "step": 512 }, { "epoch": 0.015634404833560638, "grad_norm": 0.0734933614730835, "learning_rate": 3.7175e-05, "loss": 0.459, "step": 513 }, { "epoch": 0.015664881256238144, "grad_norm": 0.0773499608039856, "learning_rate": 3.715e-05, "loss": 0.45, "step": 514 }, { "epoch": 0.01569535767891565, "grad_norm": 0.0709955021739006, "learning_rate": 3.7125e-05, "loss": 0.3958, "step": 515 }, { "epoch": 0.015725834101593155, "grad_norm": 0.07795226573944092, "learning_rate": 3.71e-05, "loss": 0.4406, "step": 516 }, { "epoch": 0.01575631052427066, "grad_norm": 0.0725317969918251, "learning_rate": 3.707500000000001e-05, "loss": 0.4192, "step": 517 }, { "epoch": 0.015786786946948166, "grad_norm": 0.08699551224708557, "learning_rate": 3.705e-05, "loss": 0.4687, "step": 518 }, { "epoch": 0.015817263369625672, "grad_norm": 0.07942593097686768, "learning_rate": 3.7025000000000005e-05, "loss": 0.4259, "step": 519 }, { "epoch": 0.01584773979230318, "grad_norm": 0.08367237448692322, "learning_rate": 3.7e-05, "loss": 0.527, "step": 520 }, { "epoch": 0.015878216214980687, "grad_norm": 0.09568676352500916, "learning_rate": 3.6975000000000004e-05, "loss": 0.6368, "step": 521 }, { "epoch": 0.015908692637658192, "grad_norm": 0.07774842530488968, "learning_rate": 3.6950000000000004e-05, "loss": 0.496, "step": 522 }, { "epoch": 0.015939169060335698, "grad_norm": 0.07695405930280685, "learning_rate": 3.6925e-05, "loss": 0.4343, "step": 523 }, { "epoch": 0.015969645483013203, "grad_norm": 0.0844259113073349, "learning_rate": 3.69e-05, "loss": 0.5288, "step": 524 }, { "epoch": 0.01600012190569071, "grad_norm": 0.07857687026262283, "learning_rate": 3.6875e-05, "loss": 0.4853, "step": 525 }, { "epoch": 0.016030598328368215, "grad_norm": 0.08118993788957596, "learning_rate": 3.685e-05, "loss": 0.4744, "step": 526 }, { "epoch": 0.016061074751045724, "grad_norm": 0.08050089329481125, "learning_rate": 3.6825e-05, "loss": 0.4623, "step": 527 }, { "epoch": 0.01609155117372323, "grad_norm": 0.07659171521663666, "learning_rate": 3.68e-05, "loss": 0.4638, "step": 528 }, { "epoch": 0.016122027596400735, "grad_norm": 0.07600045204162598, "learning_rate": 3.6775000000000006e-05, "loss": 0.4405, "step": 529 }, { "epoch": 0.01615250401907824, "grad_norm": 0.07693934440612793, "learning_rate": 3.675e-05, "loss": 0.4888, "step": 530 }, { "epoch": 0.016182980441755746, "grad_norm": 0.08221478015184402, "learning_rate": 3.6725000000000005e-05, "loss": 0.4671, "step": 531 }, { "epoch": 0.016213456864433252, "grad_norm": 0.07940209656953812, "learning_rate": 3.6700000000000004e-05, "loss": 0.3986, "step": 532 }, { "epoch": 0.016243933287110757, "grad_norm": 0.07854613661766052, "learning_rate": 3.6675000000000004e-05, "loss": 0.5168, "step": 533 }, { "epoch": 0.016274409709788266, "grad_norm": 0.08529219031333923, "learning_rate": 3.665e-05, "loss": 0.5551, "step": 534 }, { "epoch": 0.016304886132465772, "grad_norm": 0.08174476027488708, "learning_rate": 3.6625e-05, "loss": 0.4518, "step": 535 }, { "epoch": 0.016335362555143278, "grad_norm": 0.08744428306818008, "learning_rate": 3.66e-05, "loss": 0.5202, "step": 536 }, { "epoch": 0.016365838977820783, "grad_norm": 0.0846613198518753, "learning_rate": 3.6575e-05, "loss": 0.503, "step": 537 }, { "epoch": 0.01639631540049829, "grad_norm": 0.07582278549671173, "learning_rate": 3.655e-05, "loss": 0.4123, "step": 538 }, { "epoch": 0.016426791823175795, "grad_norm": 0.07609230279922485, "learning_rate": 3.652500000000001e-05, "loss": 0.4099, "step": 539 }, { "epoch": 0.0164572682458533, "grad_norm": 0.08397295325994492, "learning_rate": 3.65e-05, "loss": 0.5449, "step": 540 }, { "epoch": 0.01648774466853081, "grad_norm": 0.07257740944623947, "learning_rate": 3.6475000000000006e-05, "loss": 0.4276, "step": 541 }, { "epoch": 0.016518221091208315, "grad_norm": 0.07120921462774277, "learning_rate": 3.645e-05, "loss": 0.4355, "step": 542 }, { "epoch": 0.01654869751388582, "grad_norm": 0.0811382308602333, "learning_rate": 3.6425000000000004e-05, "loss": 0.4625, "step": 543 }, { "epoch": 0.016579173936563326, "grad_norm": 0.0823940858244896, "learning_rate": 3.6400000000000004e-05, "loss": 0.4965, "step": 544 }, { "epoch": 0.01660965035924083, "grad_norm": 0.07318028062582016, "learning_rate": 3.6375e-05, "loss": 0.4174, "step": 545 }, { "epoch": 0.016640126781918337, "grad_norm": 0.07844001054763794, "learning_rate": 3.635e-05, "loss": 0.4374, "step": 546 }, { "epoch": 0.016670603204595843, "grad_norm": 0.07991214841604233, "learning_rate": 3.6325e-05, "loss": 0.4634, "step": 547 }, { "epoch": 0.016701079627273352, "grad_norm": 0.075746089220047, "learning_rate": 3.63e-05, "loss": 0.4628, "step": 548 }, { "epoch": 0.016731556049950858, "grad_norm": 0.07438556104898453, "learning_rate": 3.6275e-05, "loss": 0.4489, "step": 549 }, { "epoch": 0.016762032472628363, "grad_norm": 0.07881918549537659, "learning_rate": 3.625e-05, "loss": 0.469, "step": 550 }, { "epoch": 0.01679250889530587, "grad_norm": 0.07700483500957489, "learning_rate": 3.6225000000000006e-05, "loss": 0.446, "step": 551 }, { "epoch": 0.016822985317983374, "grad_norm": 0.09192753583192825, "learning_rate": 3.62e-05, "loss": 0.4477, "step": 552 }, { "epoch": 0.01685346174066088, "grad_norm": 0.07975888252258301, "learning_rate": 3.6175000000000005e-05, "loss": 0.4426, "step": 553 }, { "epoch": 0.01688393816333839, "grad_norm": 0.07312572002410889, "learning_rate": 3.615e-05, "loss": 0.424, "step": 554 }, { "epoch": 0.016914414586015895, "grad_norm": 0.07869100570678711, "learning_rate": 3.6125000000000004e-05, "loss": 0.4192, "step": 555 }, { "epoch": 0.0169448910086934, "grad_norm": 0.0741017609834671, "learning_rate": 3.61e-05, "loss": 0.4058, "step": 556 }, { "epoch": 0.016975367431370906, "grad_norm": 0.0810854360461235, "learning_rate": 3.6075e-05, "loss": 0.4976, "step": 557 }, { "epoch": 0.01700584385404841, "grad_norm": 0.07737844437360764, "learning_rate": 3.605e-05, "loss": 0.4122, "step": 558 }, { "epoch": 0.017036320276725917, "grad_norm": 0.08047479391098022, "learning_rate": 3.6025e-05, "loss": 0.4309, "step": 559 }, { "epoch": 0.017066796699403423, "grad_norm": 0.08581901341676712, "learning_rate": 3.6e-05, "loss": 0.412, "step": 560 }, { "epoch": 0.017097273122080932, "grad_norm": 0.07605123519897461, "learning_rate": 3.5975e-05, "loss": 0.4368, "step": 561 }, { "epoch": 0.017127749544758437, "grad_norm": 0.077277272939682, "learning_rate": 3.595e-05, "loss": 0.4576, "step": 562 }, { "epoch": 0.017158225967435943, "grad_norm": 0.07902626693248749, "learning_rate": 3.5925000000000006e-05, "loss": 0.4672, "step": 563 }, { "epoch": 0.01718870239011345, "grad_norm": 0.08601362258195877, "learning_rate": 3.59e-05, "loss": 0.5897, "step": 564 }, { "epoch": 0.017219178812790954, "grad_norm": 0.07740458846092224, "learning_rate": 3.5875000000000005e-05, "loss": 0.4956, "step": 565 }, { "epoch": 0.01724965523546846, "grad_norm": 0.08294559270143509, "learning_rate": 3.585e-05, "loss": 0.5526, "step": 566 }, { "epoch": 0.017280131658145965, "grad_norm": 0.07695913314819336, "learning_rate": 3.5825000000000003e-05, "loss": 0.4564, "step": 567 }, { "epoch": 0.017310608080823475, "grad_norm": 0.07989615947008133, "learning_rate": 3.58e-05, "loss": 0.4695, "step": 568 }, { "epoch": 0.01734108450350098, "grad_norm": 0.08057578653097153, "learning_rate": 3.5775e-05, "loss": 0.4767, "step": 569 }, { "epoch": 0.017371560926178486, "grad_norm": 0.07041118294000626, "learning_rate": 3.575e-05, "loss": 0.3817, "step": 570 }, { "epoch": 0.01740203734885599, "grad_norm": 0.0779685229063034, "learning_rate": 3.5725e-05, "loss": 0.4344, "step": 571 }, { "epoch": 0.017432513771533497, "grad_norm": 0.07803837954998016, "learning_rate": 3.57e-05, "loss": 0.4933, "step": 572 }, { "epoch": 0.017462990194211003, "grad_norm": 0.088725745677948, "learning_rate": 3.5675e-05, "loss": 0.4925, "step": 573 }, { "epoch": 0.017493466616888508, "grad_norm": 0.06865601986646652, "learning_rate": 3.565e-05, "loss": 0.3975, "step": 574 }, { "epoch": 0.017523943039566017, "grad_norm": 0.07504341006278992, "learning_rate": 3.5625000000000005e-05, "loss": 0.4534, "step": 575 }, { "epoch": 0.017554419462243523, "grad_norm": 0.07205109298229218, "learning_rate": 3.56e-05, "loss": 0.4362, "step": 576 }, { "epoch": 0.01758489588492103, "grad_norm": 0.07844554632902145, "learning_rate": 3.5575000000000004e-05, "loss": 0.4585, "step": 577 }, { "epoch": 0.017615372307598534, "grad_norm": 0.06853543221950531, "learning_rate": 3.555e-05, "loss": 0.3719, "step": 578 }, { "epoch": 0.01764584873027604, "grad_norm": 0.18647930026054382, "learning_rate": 3.5525e-05, "loss": 0.5017, "step": 579 }, { "epoch": 0.017676325152953545, "grad_norm": 0.07756783813238144, "learning_rate": 3.55e-05, "loss": 0.3943, "step": 580 }, { "epoch": 0.01770680157563105, "grad_norm": 0.0708971843123436, "learning_rate": 3.5475e-05, "loss": 0.4018, "step": 581 }, { "epoch": 0.01773727799830856, "grad_norm": 0.07733050733804703, "learning_rate": 3.545e-05, "loss": 0.4522, "step": 582 }, { "epoch": 0.017767754420986066, "grad_norm": 0.0759468674659729, "learning_rate": 3.5425e-05, "loss": 0.4469, "step": 583 }, { "epoch": 0.01779823084366357, "grad_norm": 0.06872480362653732, "learning_rate": 3.54e-05, "loss": 0.3963, "step": 584 }, { "epoch": 0.017828707266341077, "grad_norm": 0.08087463676929474, "learning_rate": 3.5375e-05, "loss": 0.4555, "step": 585 }, { "epoch": 0.017859183689018582, "grad_norm": 0.0749768316745758, "learning_rate": 3.535e-05, "loss": 0.4522, "step": 586 }, { "epoch": 0.017889660111696088, "grad_norm": 0.07711078971624374, "learning_rate": 3.5325000000000005e-05, "loss": 0.4122, "step": 587 }, { "epoch": 0.017920136534373594, "grad_norm": 0.08123817294836044, "learning_rate": 3.53e-05, "loss": 0.428, "step": 588 }, { "epoch": 0.017950612957051103, "grad_norm": 0.07939854264259338, "learning_rate": 3.5275000000000004e-05, "loss": 0.4735, "step": 589 }, { "epoch": 0.01798108937972861, "grad_norm": 0.07783577591180801, "learning_rate": 3.525e-05, "loss": 0.4987, "step": 590 }, { "epoch": 0.018011565802406114, "grad_norm": 0.08213981986045837, "learning_rate": 3.5225e-05, "loss": 0.5222, "step": 591 }, { "epoch": 0.01804204222508362, "grad_norm": 0.07623777538537979, "learning_rate": 3.52e-05, "loss": 0.4323, "step": 592 }, { "epoch": 0.018072518647761125, "grad_norm": 0.0808035135269165, "learning_rate": 3.5175e-05, "loss": 0.4738, "step": 593 }, { "epoch": 0.01810299507043863, "grad_norm": 0.07478028535842896, "learning_rate": 3.515e-05, "loss": 0.4136, "step": 594 }, { "epoch": 0.018133471493116136, "grad_norm": 0.09484824538230896, "learning_rate": 3.5125e-05, "loss": 0.5827, "step": 595 }, { "epoch": 0.018163947915793646, "grad_norm": 0.08181550353765488, "learning_rate": 3.51e-05, "loss": 0.4586, "step": 596 }, { "epoch": 0.01819442433847115, "grad_norm": 0.07361916452646255, "learning_rate": 3.5075000000000006e-05, "loss": 0.43, "step": 597 }, { "epoch": 0.018224900761148657, "grad_norm": 0.07526674121618271, "learning_rate": 3.505e-05, "loss": 0.4537, "step": 598 }, { "epoch": 0.018255377183826162, "grad_norm": 0.07827012240886688, "learning_rate": 3.5025000000000004e-05, "loss": 0.466, "step": 599 }, { "epoch": 0.018285853606503668, "grad_norm": 0.07435450702905655, "learning_rate": 3.5e-05, "loss": 0.398, "step": 600 }, { "epoch": 0.018316330029181174, "grad_norm": 0.08013113588094711, "learning_rate": 3.4975e-05, "loss": 0.4742, "step": 601 }, { "epoch": 0.01834680645185868, "grad_norm": 0.07617498934268951, "learning_rate": 3.495e-05, "loss": 0.3745, "step": 602 }, { "epoch": 0.018377282874536188, "grad_norm": 0.0836411640048027, "learning_rate": 3.4925e-05, "loss": 0.4773, "step": 603 }, { "epoch": 0.018407759297213694, "grad_norm": 0.07737065851688385, "learning_rate": 3.49e-05, "loss": 0.4808, "step": 604 }, { "epoch": 0.0184382357198912, "grad_norm": 0.08309837430715561, "learning_rate": 3.4875e-05, "loss": 0.5497, "step": 605 }, { "epoch": 0.018468712142568705, "grad_norm": 0.08329971134662628, "learning_rate": 3.485e-05, "loss": 0.4662, "step": 606 }, { "epoch": 0.01849918856524621, "grad_norm": 0.07565900683403015, "learning_rate": 3.4825e-05, "loss": 0.4143, "step": 607 }, { "epoch": 0.018529664987923716, "grad_norm": 0.06912257522344589, "learning_rate": 3.48e-05, "loss": 0.3433, "step": 608 }, { "epoch": 0.018560141410601222, "grad_norm": 0.07574879378080368, "learning_rate": 3.4775000000000005e-05, "loss": 0.44, "step": 609 }, { "epoch": 0.01859061783327873, "grad_norm": 0.08195056021213531, "learning_rate": 3.475e-05, "loss": 0.5004, "step": 610 }, { "epoch": 0.018621094255956237, "grad_norm": 0.07795493304729462, "learning_rate": 3.4725000000000004e-05, "loss": 0.4295, "step": 611 }, { "epoch": 0.018651570678633742, "grad_norm": 0.07339303940534592, "learning_rate": 3.4699999999999996e-05, "loss": 0.4587, "step": 612 }, { "epoch": 0.018682047101311248, "grad_norm": 0.0827263817191124, "learning_rate": 3.4675e-05, "loss": 0.5198, "step": 613 }, { "epoch": 0.018712523523988753, "grad_norm": 0.08158499747514725, "learning_rate": 3.465e-05, "loss": 0.4936, "step": 614 }, { "epoch": 0.01874299994666626, "grad_norm": 0.0794842317700386, "learning_rate": 3.4625e-05, "loss": 0.4749, "step": 615 }, { "epoch": 0.018773476369343768, "grad_norm": 0.08031702786684036, "learning_rate": 3.46e-05, "loss": 0.5039, "step": 616 }, { "epoch": 0.018803952792021274, "grad_norm": 0.08018321543931961, "learning_rate": 3.4575e-05, "loss": 0.48, "step": 617 }, { "epoch": 0.01883442921469878, "grad_norm": 0.08102942258119583, "learning_rate": 3.455e-05, "loss": 0.5215, "step": 618 }, { "epoch": 0.018864905637376285, "grad_norm": 0.09557921439409256, "learning_rate": 3.4525e-05, "loss": 0.6062, "step": 619 }, { "epoch": 0.01889538206005379, "grad_norm": 0.07677757740020752, "learning_rate": 3.45e-05, "loss": 0.4489, "step": 620 }, { "epoch": 0.018925858482731296, "grad_norm": 0.06773815304040909, "learning_rate": 3.4475000000000005e-05, "loss": 0.3749, "step": 621 }, { "epoch": 0.018956334905408802, "grad_norm": 0.08537542074918747, "learning_rate": 3.445e-05, "loss": 0.5621, "step": 622 }, { "epoch": 0.01898681132808631, "grad_norm": 0.08308160305023193, "learning_rate": 3.4425e-05, "loss": 0.507, "step": 623 }, { "epoch": 0.019017287750763816, "grad_norm": 0.08689706772565842, "learning_rate": 3.4399999999999996e-05, "loss": 0.5036, "step": 624 }, { "epoch": 0.019047764173441322, "grad_norm": 0.08027336746454239, "learning_rate": 3.4375e-05, "loss": 0.4623, "step": 625 }, { "epoch": 0.019078240596118828, "grad_norm": 0.08224154263734818, "learning_rate": 3.435e-05, "loss": 0.4322, "step": 626 }, { "epoch": 0.019108717018796333, "grad_norm": 0.09157747030258179, "learning_rate": 3.4325e-05, "loss": 0.5332, "step": 627 }, { "epoch": 0.01913919344147384, "grad_norm": 0.08036287128925323, "learning_rate": 3.430000000000001e-05, "loss": 0.4672, "step": 628 }, { "epoch": 0.019169669864151345, "grad_norm": 0.07607066631317139, "learning_rate": 3.4275e-05, "loss": 0.4179, "step": 629 }, { "epoch": 0.019200146286828854, "grad_norm": 0.07857298851013184, "learning_rate": 3.4250000000000006e-05, "loss": 0.4675, "step": 630 }, { "epoch": 0.01923062270950636, "grad_norm": 0.07547951489686966, "learning_rate": 3.4225e-05, "loss": 0.4227, "step": 631 }, { "epoch": 0.019261099132183865, "grad_norm": 0.0746256411075592, "learning_rate": 3.4200000000000005e-05, "loss": 0.4105, "step": 632 }, { "epoch": 0.01929157555486137, "grad_norm": 0.07292158901691437, "learning_rate": 3.4175000000000004e-05, "loss": 0.4084, "step": 633 }, { "epoch": 0.019322051977538876, "grad_norm": 0.09091036766767502, "learning_rate": 3.415e-05, "loss": 0.5911, "step": 634 }, { "epoch": 0.01935252840021638, "grad_norm": 0.06717415153980255, "learning_rate": 3.4125e-05, "loss": 0.377, "step": 635 }, { "epoch": 0.019383004822893887, "grad_norm": 0.08688321709632874, "learning_rate": 3.41e-05, "loss": 0.5281, "step": 636 }, { "epoch": 0.019413481245571396, "grad_norm": 0.07493972778320312, "learning_rate": 3.4075e-05, "loss": 0.431, "step": 637 }, { "epoch": 0.019443957668248902, "grad_norm": 0.08446397632360458, "learning_rate": 3.405e-05, "loss": 0.5203, "step": 638 }, { "epoch": 0.019474434090926408, "grad_norm": 0.08509645611047745, "learning_rate": 3.4025e-05, "loss": 0.5303, "step": 639 }, { "epoch": 0.019504910513603913, "grad_norm": 0.07615584880113602, "learning_rate": 3.4000000000000007e-05, "loss": 0.4332, "step": 640 }, { "epoch": 0.01953538693628142, "grad_norm": 0.0785723403096199, "learning_rate": 3.3975e-05, "loss": 0.4253, "step": 641 }, { "epoch": 0.019565863358958924, "grad_norm": 0.07296749204397202, "learning_rate": 3.3950000000000005e-05, "loss": 0.4352, "step": 642 }, { "epoch": 0.01959633978163643, "grad_norm": 0.08098521083593369, "learning_rate": 3.3925e-05, "loss": 0.5159, "step": 643 }, { "epoch": 0.01962681620431394, "grad_norm": 0.08699838072061539, "learning_rate": 3.3900000000000004e-05, "loss": 0.5175, "step": 644 }, { "epoch": 0.019657292626991445, "grad_norm": 0.0788174569606781, "learning_rate": 3.3875000000000003e-05, "loss": 0.459, "step": 645 }, { "epoch": 0.01968776904966895, "grad_norm": 0.07544863224029541, "learning_rate": 3.385e-05, "loss": 0.4557, "step": 646 }, { "epoch": 0.019718245472346456, "grad_norm": 0.0815189778804779, "learning_rate": 3.3825e-05, "loss": 0.4863, "step": 647 }, { "epoch": 0.01974872189502396, "grad_norm": 0.07696584612131119, "learning_rate": 3.38e-05, "loss": 0.4678, "step": 648 }, { "epoch": 0.019779198317701467, "grad_norm": 0.07731449604034424, "learning_rate": 3.3775e-05, "loss": 0.4392, "step": 649 }, { "epoch": 0.019809674740378973, "grad_norm": 0.08522268384695053, "learning_rate": 3.375000000000001e-05, "loss": 0.5187, "step": 650 }, { "epoch": 0.019840151163056482, "grad_norm": 0.12538625299930573, "learning_rate": 3.3725e-05, "loss": 0.4393, "step": 651 }, { "epoch": 0.019870627585733987, "grad_norm": 0.07827665656805038, "learning_rate": 3.3700000000000006e-05, "loss": 0.4816, "step": 652 }, { "epoch": 0.019901104008411493, "grad_norm": 0.09126632660627365, "learning_rate": 3.3675e-05, "loss": 0.4677, "step": 653 }, { "epoch": 0.019931580431089, "grad_norm": 0.08352585136890411, "learning_rate": 3.3650000000000005e-05, "loss": 0.4932, "step": 654 }, { "epoch": 0.019962056853766504, "grad_norm": 0.07250239700078964, "learning_rate": 3.3625000000000004e-05, "loss": 0.4051, "step": 655 }, { "epoch": 0.01999253327644401, "grad_norm": 0.0738164484500885, "learning_rate": 3.3600000000000004e-05, "loss": 0.4314, "step": 656 }, { "epoch": 0.020023009699121515, "grad_norm": 0.08510053157806396, "learning_rate": 3.3575e-05, "loss": 0.5533, "step": 657 }, { "epoch": 0.020053486121799025, "grad_norm": 0.07941199839115143, "learning_rate": 3.355e-05, "loss": 0.4264, "step": 658 }, { "epoch": 0.02008396254447653, "grad_norm": 0.08894859254360199, "learning_rate": 3.3525e-05, "loss": 0.5238, "step": 659 }, { "epoch": 0.020114438967154036, "grad_norm": 0.0749766007065773, "learning_rate": 3.35e-05, "loss": 0.4699, "step": 660 }, { "epoch": 0.02014491538983154, "grad_norm": 0.08259838074445724, "learning_rate": 3.3475e-05, "loss": 0.5334, "step": 661 }, { "epoch": 0.020175391812509047, "grad_norm": 0.07347257435321808, "learning_rate": 3.345000000000001e-05, "loss": 0.4126, "step": 662 }, { "epoch": 0.020205868235186553, "grad_norm": 0.07784102112054825, "learning_rate": 3.3425e-05, "loss": 0.4744, "step": 663 }, { "epoch": 0.020236344657864058, "grad_norm": 0.07555169612169266, "learning_rate": 3.3400000000000005e-05, "loss": 0.4153, "step": 664 }, { "epoch": 0.020266821080541567, "grad_norm": 0.07731566578149796, "learning_rate": 3.3375e-05, "loss": 0.4747, "step": 665 }, { "epoch": 0.020297297503219073, "grad_norm": 0.08030174672603607, "learning_rate": 3.3350000000000004e-05, "loss": 0.4968, "step": 666 }, { "epoch": 0.02032777392589658, "grad_norm": 0.08051862567663193, "learning_rate": 3.3325000000000004e-05, "loss": 0.5116, "step": 667 }, { "epoch": 0.020358250348574084, "grad_norm": 0.08271659910678864, "learning_rate": 3.33e-05, "loss": 0.5148, "step": 668 }, { "epoch": 0.02038872677125159, "grad_norm": 0.0845998004078865, "learning_rate": 3.3275e-05, "loss": 0.5518, "step": 669 }, { "epoch": 0.020419203193929095, "grad_norm": 0.07129185646772385, "learning_rate": 3.325e-05, "loss": 0.4154, "step": 670 }, { "epoch": 0.020449679616606604, "grad_norm": 0.08390328288078308, "learning_rate": 3.3225e-05, "loss": 0.5765, "step": 671 }, { "epoch": 0.02048015603928411, "grad_norm": 0.08010441064834595, "learning_rate": 3.32e-05, "loss": 0.4579, "step": 672 }, { "epoch": 0.020510632461961616, "grad_norm": 0.08220945298671722, "learning_rate": 3.3175e-05, "loss": 0.5184, "step": 673 }, { "epoch": 0.02054110888463912, "grad_norm": 0.0890781581401825, "learning_rate": 3.3150000000000006e-05, "loss": 0.6132, "step": 674 }, { "epoch": 0.020571585307316627, "grad_norm": 0.07920479029417038, "learning_rate": 3.3125e-05, "loss": 0.4162, "step": 675 }, { "epoch": 0.020602061729994132, "grad_norm": 0.07725699990987778, "learning_rate": 3.3100000000000005e-05, "loss": 0.4816, "step": 676 }, { "epoch": 0.020632538152671638, "grad_norm": 0.08018038421869278, "learning_rate": 3.3075e-05, "loss": 0.4981, "step": 677 }, { "epoch": 0.020663014575349147, "grad_norm": 0.08136797696352005, "learning_rate": 3.3050000000000004e-05, "loss": 0.5078, "step": 678 }, { "epoch": 0.020693490998026653, "grad_norm": 0.2175755649805069, "learning_rate": 3.3025e-05, "loss": 0.3835, "step": 679 }, { "epoch": 0.02072396742070416, "grad_norm": 0.07626160979270935, "learning_rate": 3.3e-05, "loss": 0.5028, "step": 680 }, { "epoch": 0.020754443843381664, "grad_norm": 0.07610258460044861, "learning_rate": 3.2975e-05, "loss": 0.3755, "step": 681 }, { "epoch": 0.02078492026605917, "grad_norm": 0.07580359280109406, "learning_rate": 3.295e-05, "loss": 0.4479, "step": 682 }, { "epoch": 0.020815396688736675, "grad_norm": 0.09018397331237793, "learning_rate": 3.2925e-05, "loss": 0.5776, "step": 683 }, { "epoch": 0.02084587311141418, "grad_norm": 0.07913251221179962, "learning_rate": 3.29e-05, "loss": 0.4948, "step": 684 }, { "epoch": 0.02087634953409169, "grad_norm": 0.07612618058919907, "learning_rate": 3.2875e-05, "loss": 0.4493, "step": 685 }, { "epoch": 0.020906825956769196, "grad_norm": 0.08366430550813675, "learning_rate": 3.2850000000000006e-05, "loss": 0.4791, "step": 686 }, { "epoch": 0.0209373023794467, "grad_norm": 0.07185101509094238, "learning_rate": 3.2825e-05, "loss": 0.4288, "step": 687 }, { "epoch": 0.020967778802124207, "grad_norm": 0.08122757822275162, "learning_rate": 3.2800000000000004e-05, "loss": 0.487, "step": 688 }, { "epoch": 0.020998255224801712, "grad_norm": 0.08130741119384766, "learning_rate": 3.2775e-05, "loss": 0.5136, "step": 689 }, { "epoch": 0.021028731647479218, "grad_norm": 0.07637075334787369, "learning_rate": 3.275e-05, "loss": 0.4971, "step": 690 }, { "epoch": 0.021059208070156724, "grad_norm": 0.06764618307352066, "learning_rate": 3.2725e-05, "loss": 0.3474, "step": 691 }, { "epoch": 0.021089684492834233, "grad_norm": 0.07166243344545364, "learning_rate": 3.27e-05, "loss": 0.389, "step": 692 }, { "epoch": 0.02112016091551174, "grad_norm": 0.07807081192731857, "learning_rate": 3.2675e-05, "loss": 0.4769, "step": 693 }, { "epoch": 0.021150637338189244, "grad_norm": 0.07140066474676132, "learning_rate": 3.265e-05, "loss": 0.3885, "step": 694 }, { "epoch": 0.02118111376086675, "grad_norm": 0.06999816745519638, "learning_rate": 3.2625e-05, "loss": 0.393, "step": 695 }, { "epoch": 0.021211590183544255, "grad_norm": 0.07473816722631454, "learning_rate": 3.26e-05, "loss": 0.4723, "step": 696 }, { "epoch": 0.02124206660622176, "grad_norm": 0.07314229756593704, "learning_rate": 3.2575e-05, "loss": 0.4172, "step": 697 }, { "epoch": 0.021272543028899266, "grad_norm": 0.07526496052742004, "learning_rate": 3.2550000000000005e-05, "loss": 0.4456, "step": 698 }, { "epoch": 0.021303019451576775, "grad_norm": 0.07897503674030304, "learning_rate": 3.2525e-05, "loss": 0.5004, "step": 699 }, { "epoch": 0.02133349587425428, "grad_norm": 0.06984610110521317, "learning_rate": 3.2500000000000004e-05, "loss": 0.4189, "step": 700 }, { "epoch": 0.021363972296931787, "grad_norm": 0.1035536453127861, "learning_rate": 3.2474999999999997e-05, "loss": 0.3995, "step": 701 }, { "epoch": 0.021394448719609292, "grad_norm": 0.08006973564624786, "learning_rate": 3.245e-05, "loss": 0.4859, "step": 702 }, { "epoch": 0.021424925142286798, "grad_norm": 0.0787738636136055, "learning_rate": 3.2425e-05, "loss": 0.476, "step": 703 }, { "epoch": 0.021455401564964303, "grad_norm": 0.06794199347496033, "learning_rate": 3.24e-05, "loss": 0.3785, "step": 704 }, { "epoch": 0.02148587798764181, "grad_norm": 0.06511055678129196, "learning_rate": 3.2375e-05, "loss": 0.3517, "step": 705 }, { "epoch": 0.021516354410319318, "grad_norm": 0.07990792393684387, "learning_rate": 3.235e-05, "loss": 0.4686, "step": 706 }, { "epoch": 0.021546830832996824, "grad_norm": 0.07449106127023697, "learning_rate": 3.2325e-05, "loss": 0.5081, "step": 707 }, { "epoch": 0.02157730725567433, "grad_norm": 0.08249309659004211, "learning_rate": 3.2300000000000006e-05, "loss": 0.6008, "step": 708 }, { "epoch": 0.021607783678351835, "grad_norm": 0.07654834538698196, "learning_rate": 3.2275e-05, "loss": 0.4558, "step": 709 }, { "epoch": 0.02163826010102934, "grad_norm": 0.08403019607067108, "learning_rate": 3.2250000000000005e-05, "loss": 0.5323, "step": 710 }, { "epoch": 0.021668736523706846, "grad_norm": 0.08114602416753769, "learning_rate": 3.2225e-05, "loss": 0.4803, "step": 711 }, { "epoch": 0.021699212946384352, "grad_norm": 0.07625332474708557, "learning_rate": 3.2200000000000003e-05, "loss": 0.4384, "step": 712 }, { "epoch": 0.02172968936906186, "grad_norm": 0.07679414004087448, "learning_rate": 3.2175e-05, "loss": 0.4011, "step": 713 }, { "epoch": 0.021760165791739366, "grad_norm": 0.08277739584445953, "learning_rate": 3.215e-05, "loss": 0.5307, "step": 714 }, { "epoch": 0.021790642214416872, "grad_norm": 0.07288917154073715, "learning_rate": 3.2125e-05, "loss": 0.4146, "step": 715 }, { "epoch": 0.021821118637094378, "grad_norm": 0.07741447538137436, "learning_rate": 3.21e-05, "loss": 0.4549, "step": 716 }, { "epoch": 0.021851595059771883, "grad_norm": 0.08456466346979141, "learning_rate": 3.2075e-05, "loss": 0.4929, "step": 717 }, { "epoch": 0.02188207148244939, "grad_norm": 0.07214605063199997, "learning_rate": 3.205e-05, "loss": 0.3766, "step": 718 }, { "epoch": 0.021912547905126895, "grad_norm": 0.07888051122426987, "learning_rate": 3.2025e-05, "loss": 0.4265, "step": 719 }, { "epoch": 0.021943024327804404, "grad_norm": 0.08320137113332748, "learning_rate": 3.2000000000000005e-05, "loss": 0.5246, "step": 720 }, { "epoch": 0.02197350075048191, "grad_norm": 0.07392632961273193, "learning_rate": 3.1975e-05, "loss": 0.4598, "step": 721 }, { "epoch": 0.022003977173159415, "grad_norm": 0.08072426915168762, "learning_rate": 3.1950000000000004e-05, "loss": 0.5273, "step": 722 }, { "epoch": 0.02203445359583692, "grad_norm": 0.0754271000623703, "learning_rate": 3.1925e-05, "loss": 0.4529, "step": 723 }, { "epoch": 0.022064930018514426, "grad_norm": 0.0795695036649704, "learning_rate": 3.19e-05, "loss": 0.4801, "step": 724 }, { "epoch": 0.02209540644119193, "grad_norm": 0.07074704021215439, "learning_rate": 3.1875e-05, "loss": 0.4173, "step": 725 }, { "epoch": 0.022125882863869437, "grad_norm": 0.07941755652427673, "learning_rate": 3.185e-05, "loss": 0.501, "step": 726 }, { "epoch": 0.022156359286546946, "grad_norm": 0.07171125710010529, "learning_rate": 3.1825e-05, "loss": 0.4274, "step": 727 }, { "epoch": 0.022186835709224452, "grad_norm": 0.07893607765436172, "learning_rate": 3.18e-05, "loss": 0.5057, "step": 728 }, { "epoch": 0.022217312131901958, "grad_norm": 0.08008928596973419, "learning_rate": 3.1775e-05, "loss": 0.4689, "step": 729 }, { "epoch": 0.022247788554579463, "grad_norm": 0.06810446828603745, "learning_rate": 3.175e-05, "loss": 0.4119, "step": 730 }, { "epoch": 0.02227826497725697, "grad_norm": 0.07091254740953445, "learning_rate": 3.1725e-05, "loss": 0.4232, "step": 731 }, { "epoch": 0.022308741399934474, "grad_norm": 0.07897093892097473, "learning_rate": 3.1700000000000005e-05, "loss": 0.4323, "step": 732 }, { "epoch": 0.022339217822611983, "grad_norm": 0.08282682299613953, "learning_rate": 3.1675e-05, "loss": 0.4838, "step": 733 }, { "epoch": 0.02236969424528949, "grad_norm": 0.07797695696353912, "learning_rate": 3.1650000000000004e-05, "loss": 0.4952, "step": 734 }, { "epoch": 0.022400170667966995, "grad_norm": 0.07180066406726837, "learning_rate": 3.1624999999999996e-05, "loss": 0.4105, "step": 735 }, { "epoch": 0.0224306470906445, "grad_norm": 0.07071620225906372, "learning_rate": 3.16e-05, "loss": 0.389, "step": 736 }, { "epoch": 0.022461123513322006, "grad_norm": 0.0747050866484642, "learning_rate": 3.1575e-05, "loss": 0.4194, "step": 737 }, { "epoch": 0.02249159993599951, "grad_norm": 0.07323236018419266, "learning_rate": 3.155e-05, "loss": 0.4489, "step": 738 }, { "epoch": 0.022522076358677017, "grad_norm": 0.0762617215514183, "learning_rate": 3.1525e-05, "loss": 0.4291, "step": 739 }, { "epoch": 0.022552552781354526, "grad_norm": 0.07539180666208267, "learning_rate": 3.15e-05, "loss": 0.4426, "step": 740 }, { "epoch": 0.022583029204032032, "grad_norm": 0.07351407408714294, "learning_rate": 3.1475e-05, "loss": 0.4275, "step": 741 }, { "epoch": 0.022613505626709537, "grad_norm": 0.08230896294116974, "learning_rate": 3.145e-05, "loss": 0.4817, "step": 742 }, { "epoch": 0.022643982049387043, "grad_norm": 0.08832620829343796, "learning_rate": 3.1425e-05, "loss": 0.555, "step": 743 }, { "epoch": 0.02267445847206455, "grad_norm": 0.07944190502166748, "learning_rate": 3.1400000000000004e-05, "loss": 0.4806, "step": 744 }, { "epoch": 0.022704934894742054, "grad_norm": 0.07576034963130951, "learning_rate": 3.1375e-05, "loss": 0.4409, "step": 745 }, { "epoch": 0.02273541131741956, "grad_norm": 0.078110471367836, "learning_rate": 3.135e-05, "loss": 0.4491, "step": 746 }, { "epoch": 0.02276588774009707, "grad_norm": 0.07695135474205017, "learning_rate": 3.1324999999999996e-05, "loss": 0.4612, "step": 747 }, { "epoch": 0.022796364162774575, "grad_norm": 0.07881361991167068, "learning_rate": 3.13e-05, "loss": 0.4933, "step": 748 }, { "epoch": 0.02282684058545208, "grad_norm": 0.08591780811548233, "learning_rate": 3.1275e-05, "loss": 0.4795, "step": 749 }, { "epoch": 0.022857317008129586, "grad_norm": 0.0761345773935318, "learning_rate": 3.125e-05, "loss": 0.4048, "step": 750 }, { "epoch": 0.02288779343080709, "grad_norm": 0.09167242795228958, "learning_rate": 3.122500000000001e-05, "loss": 0.5331, "step": 751 }, { "epoch": 0.022918269853484597, "grad_norm": 0.08001095801591873, "learning_rate": 3.12e-05, "loss": 0.4798, "step": 752 }, { "epoch": 0.022948746276162103, "grad_norm": 0.07636299729347229, "learning_rate": 3.1175000000000006e-05, "loss": 0.463, "step": 753 }, { "epoch": 0.02297922269883961, "grad_norm": 0.07507184892892838, "learning_rate": 3.115e-05, "loss": 0.4459, "step": 754 }, { "epoch": 0.023009699121517117, "grad_norm": 0.07079699635505676, "learning_rate": 3.1125000000000004e-05, "loss": 0.385, "step": 755 }, { "epoch": 0.023040175544194623, "grad_norm": 0.0726543515920639, "learning_rate": 3.1100000000000004e-05, "loss": 0.4348, "step": 756 }, { "epoch": 0.02307065196687213, "grad_norm": 0.0678820088505745, "learning_rate": 3.1075e-05, "loss": 0.4063, "step": 757 }, { "epoch": 0.023101128389549634, "grad_norm": 0.07501821219921112, "learning_rate": 3.105e-05, "loss": 0.4158, "step": 758 }, { "epoch": 0.02313160481222714, "grad_norm": 0.09139441698789597, "learning_rate": 3.1025e-05, "loss": 0.6703, "step": 759 }, { "epoch": 0.023162081234904645, "grad_norm": 0.0785442665219307, "learning_rate": 3.1e-05, "loss": 0.4562, "step": 760 }, { "epoch": 0.023192557657582154, "grad_norm": 0.07859134674072266, "learning_rate": 3.0975e-05, "loss": 0.5137, "step": 761 }, { "epoch": 0.02322303408025966, "grad_norm": 0.08164365589618683, "learning_rate": 3.095e-05, "loss": 0.5048, "step": 762 }, { "epoch": 0.023253510502937166, "grad_norm": 0.0774446502327919, "learning_rate": 3.0925000000000006e-05, "loss": 0.4901, "step": 763 }, { "epoch": 0.02328398692561467, "grad_norm": 0.07779061794281006, "learning_rate": 3.09e-05, "loss": 0.4621, "step": 764 }, { "epoch": 0.023314463348292177, "grad_norm": 0.06896117329597473, "learning_rate": 3.0875000000000005e-05, "loss": 0.4264, "step": 765 }, { "epoch": 0.023344939770969683, "grad_norm": 0.07755974680185318, "learning_rate": 3.0850000000000004e-05, "loss": 0.5308, "step": 766 }, { "epoch": 0.023375416193647188, "grad_norm": 0.07218644767999649, "learning_rate": 3.0825000000000004e-05, "loss": 0.4374, "step": 767 }, { "epoch": 0.023405892616324697, "grad_norm": 0.07598605751991272, "learning_rate": 3.08e-05, "loss": 0.4497, "step": 768 }, { "epoch": 0.023436369039002203, "grad_norm": 0.3519637882709503, "learning_rate": 3.0775e-05, "loss": 0.4559, "step": 769 }, { "epoch": 0.02346684546167971, "grad_norm": 0.07984943687915802, "learning_rate": 3.075e-05, "loss": 0.4657, "step": 770 }, { "epoch": 0.023497321884357214, "grad_norm": 0.08020984381437302, "learning_rate": 3.0725e-05, "loss": 0.4857, "step": 771 }, { "epoch": 0.02352779830703472, "grad_norm": 0.08672421425580978, "learning_rate": 3.07e-05, "loss": 0.529, "step": 772 }, { "epoch": 0.023558274729712225, "grad_norm": 0.08113051950931549, "learning_rate": 3.067500000000001e-05, "loss": 0.4514, "step": 773 }, { "epoch": 0.02358875115238973, "grad_norm": 0.075934499502182, "learning_rate": 3.065e-05, "loss": 0.4351, "step": 774 }, { "epoch": 0.02361922757506724, "grad_norm": 0.08796574920415878, "learning_rate": 3.0625000000000006e-05, "loss": 0.5286, "step": 775 }, { "epoch": 0.023649703997744746, "grad_norm": 0.07502539455890656, "learning_rate": 3.06e-05, "loss": 0.4233, "step": 776 }, { "epoch": 0.02368018042042225, "grad_norm": 0.09704858809709549, "learning_rate": 3.0575000000000005e-05, "loss": 0.556, "step": 777 }, { "epoch": 0.023710656843099757, "grad_norm": 0.07310730218887329, "learning_rate": 3.0550000000000004e-05, "loss": 0.4294, "step": 778 }, { "epoch": 0.023741133265777262, "grad_norm": 0.08146729320287704, "learning_rate": 3.0525e-05, "loss": 0.5302, "step": 779 }, { "epoch": 0.023771609688454768, "grad_norm": 0.07184258103370667, "learning_rate": 3.05e-05, "loss": 0.4402, "step": 780 }, { "epoch": 0.023802086111132274, "grad_norm": 0.06501343846321106, "learning_rate": 3.0475000000000002e-05, "loss": 0.3524, "step": 781 }, { "epoch": 0.023832562533809783, "grad_norm": 0.08379477262496948, "learning_rate": 3.045e-05, "loss": 0.5233, "step": 782 }, { "epoch": 0.02386303895648729, "grad_norm": 0.08310538530349731, "learning_rate": 3.0425000000000004e-05, "loss": 0.4919, "step": 783 }, { "epoch": 0.023893515379164794, "grad_norm": 0.0763951763510704, "learning_rate": 3.04e-05, "loss": 0.4556, "step": 784 }, { "epoch": 0.0239239918018423, "grad_norm": 0.07481810450553894, "learning_rate": 3.0375000000000003e-05, "loss": 0.4579, "step": 785 }, { "epoch": 0.023954468224519805, "grad_norm": 0.0734109878540039, "learning_rate": 3.035e-05, "loss": 0.4327, "step": 786 }, { "epoch": 0.02398494464719731, "grad_norm": 0.07707050442695618, "learning_rate": 3.0325000000000002e-05, "loss": 0.3948, "step": 787 }, { "epoch": 0.02401542106987482, "grad_norm": 0.07522984594106674, "learning_rate": 3.03e-05, "loss": 0.444, "step": 788 }, { "epoch": 0.024045897492552325, "grad_norm": 0.07829128950834274, "learning_rate": 3.0275000000000004e-05, "loss": 0.4071, "step": 789 }, { "epoch": 0.02407637391522983, "grad_norm": 0.07951958477497101, "learning_rate": 3.025e-05, "loss": 0.4332, "step": 790 }, { "epoch": 0.024106850337907337, "grad_norm": 0.07313957065343857, "learning_rate": 3.0225000000000003e-05, "loss": 0.442, "step": 791 }, { "epoch": 0.024137326760584842, "grad_norm": 0.07657737284898758, "learning_rate": 3.02e-05, "loss": 0.4563, "step": 792 }, { "epoch": 0.024167803183262348, "grad_norm": 0.07271629571914673, "learning_rate": 3.0175e-05, "loss": 0.3932, "step": 793 }, { "epoch": 0.024198279605939853, "grad_norm": 0.0736134797334671, "learning_rate": 3.015e-05, "loss": 0.4575, "step": 794 }, { "epoch": 0.024228756028617363, "grad_norm": 0.07868345826864243, "learning_rate": 3.0125000000000004e-05, "loss": 0.5188, "step": 795 }, { "epoch": 0.024259232451294868, "grad_norm": 0.08168003708124161, "learning_rate": 3.01e-05, "loss": 0.5095, "step": 796 }, { "epoch": 0.024289708873972374, "grad_norm": 0.07721247524023056, "learning_rate": 3.0075000000000003e-05, "loss": 0.4524, "step": 797 }, { "epoch": 0.02432018529664988, "grad_norm": 0.07874132692813873, "learning_rate": 3.0050000000000002e-05, "loss": 0.5256, "step": 798 }, { "epoch": 0.024350661719327385, "grad_norm": 0.06846034526824951, "learning_rate": 3.0025000000000005e-05, "loss": 0.4124, "step": 799 }, { "epoch": 0.02438113814200489, "grad_norm": 0.07739362120628357, "learning_rate": 3e-05, "loss": 0.4493, "step": 800 }, { "epoch": 0.024411614564682396, "grad_norm": 0.07584405690431595, "learning_rate": 2.9975000000000004e-05, "loss": 0.472, "step": 801 }, { "epoch": 0.024442090987359905, "grad_norm": 0.08582182228565216, "learning_rate": 2.995e-05, "loss": 0.6001, "step": 802 }, { "epoch": 0.02447256741003741, "grad_norm": 0.07978557050228119, "learning_rate": 2.9925000000000002e-05, "loss": 0.4817, "step": 803 }, { "epoch": 0.024503043832714917, "grad_norm": 0.08911380171775818, "learning_rate": 2.9900000000000002e-05, "loss": 0.5408, "step": 804 }, { "epoch": 0.024533520255392422, "grad_norm": 0.07432105392217636, "learning_rate": 2.9875000000000004e-05, "loss": 0.472, "step": 805 }, { "epoch": 0.024563996678069928, "grad_norm": 0.07940880209207535, "learning_rate": 2.985e-05, "loss": 0.4402, "step": 806 }, { "epoch": 0.024594473100747433, "grad_norm": 0.07357704639434814, "learning_rate": 2.9825000000000003e-05, "loss": 0.452, "step": 807 }, { "epoch": 0.02462494952342494, "grad_norm": 0.07442279905080795, "learning_rate": 2.98e-05, "loss": 0.436, "step": 808 }, { "epoch": 0.024655425946102448, "grad_norm": 0.07187695801258087, "learning_rate": 2.9775000000000002e-05, "loss": 0.3862, "step": 809 }, { "epoch": 0.024685902368779954, "grad_norm": 0.07520197331905365, "learning_rate": 2.975e-05, "loss": 0.4595, "step": 810 }, { "epoch": 0.02471637879145746, "grad_norm": 0.07910121232271194, "learning_rate": 2.9725000000000004e-05, "loss": 0.3978, "step": 811 }, { "epoch": 0.024746855214134965, "grad_norm": 0.0761004164814949, "learning_rate": 2.97e-05, "loss": 0.4813, "step": 812 }, { "epoch": 0.02477733163681247, "grad_norm": 0.0793793648481369, "learning_rate": 2.9675000000000003e-05, "loss": 0.4374, "step": 813 }, { "epoch": 0.024807808059489976, "grad_norm": 0.07510001957416534, "learning_rate": 2.965e-05, "loss": 0.4357, "step": 814 }, { "epoch": 0.02483828448216748, "grad_norm": 0.0735272616147995, "learning_rate": 2.9625000000000002e-05, "loss": 0.4432, "step": 815 }, { "epoch": 0.02486876090484499, "grad_norm": 0.08128804713487625, "learning_rate": 2.96e-05, "loss": 0.4972, "step": 816 }, { "epoch": 0.024899237327522496, "grad_norm": 0.07027821987867355, "learning_rate": 2.9575000000000004e-05, "loss": 0.3901, "step": 817 }, { "epoch": 0.024929713750200002, "grad_norm": 0.07340038567781448, "learning_rate": 2.955e-05, "loss": 0.4242, "step": 818 }, { "epoch": 0.024960190172877508, "grad_norm": 0.07834784686565399, "learning_rate": 2.9525000000000003e-05, "loss": 0.4452, "step": 819 }, { "epoch": 0.024990666595555013, "grad_norm": 0.07630225270986557, "learning_rate": 2.95e-05, "loss": 0.4342, "step": 820 }, { "epoch": 0.02502114301823252, "grad_norm": 0.07949775457382202, "learning_rate": 2.9475e-05, "loss": 0.4565, "step": 821 }, { "epoch": 0.025051619440910024, "grad_norm": 0.07719437032938004, "learning_rate": 2.945e-05, "loss": 0.4949, "step": 822 }, { "epoch": 0.025082095863587534, "grad_norm": 0.07572416961193085, "learning_rate": 2.9425000000000004e-05, "loss": 0.4684, "step": 823 }, { "epoch": 0.02511257228626504, "grad_norm": 0.07777194678783417, "learning_rate": 2.94e-05, "loss": 0.4592, "step": 824 }, { "epoch": 0.025143048708942545, "grad_norm": 0.06550128757953644, "learning_rate": 2.9375000000000003e-05, "loss": 0.3597, "step": 825 }, { "epoch": 0.02517352513162005, "grad_norm": 0.07669232785701752, "learning_rate": 2.935e-05, "loss": 0.4399, "step": 826 }, { "epoch": 0.025204001554297556, "grad_norm": 0.07982314378023148, "learning_rate": 2.9325e-05, "loss": 0.5271, "step": 827 }, { "epoch": 0.02523447797697506, "grad_norm": 0.08627428859472275, "learning_rate": 2.93e-05, "loss": 0.4853, "step": 828 }, { "epoch": 0.025264954399652567, "grad_norm": 0.07127828896045685, "learning_rate": 2.9275000000000003e-05, "loss": 0.411, "step": 829 }, { "epoch": 0.025295430822330076, "grad_norm": 0.0848105251789093, "learning_rate": 2.925e-05, "loss": 0.5029, "step": 830 }, { "epoch": 0.025325907245007582, "grad_norm": 0.0756024420261383, "learning_rate": 2.9225000000000002e-05, "loss": 0.4459, "step": 831 }, { "epoch": 0.025356383667685087, "grad_norm": 0.08566156774759293, "learning_rate": 2.9199999999999998e-05, "loss": 0.5292, "step": 832 }, { "epoch": 0.025386860090362593, "grad_norm": 0.06998670101165771, "learning_rate": 2.9175e-05, "loss": 0.3855, "step": 833 }, { "epoch": 0.0254173365130401, "grad_norm": 0.07515319436788559, "learning_rate": 2.915e-05, "loss": 0.4355, "step": 834 }, { "epoch": 0.025447812935717604, "grad_norm": 0.08191827684640884, "learning_rate": 2.9125000000000003e-05, "loss": 0.5317, "step": 835 }, { "epoch": 0.02547828935839511, "grad_norm": 0.09056302160024643, "learning_rate": 2.91e-05, "loss": 0.53, "step": 836 }, { "epoch": 0.02550876578107262, "grad_norm": 0.08108913153409958, "learning_rate": 2.9075000000000002e-05, "loss": 0.5123, "step": 837 }, { "epoch": 0.025539242203750125, "grad_norm": 0.07883120328187943, "learning_rate": 2.9049999999999998e-05, "loss": 0.5174, "step": 838 }, { "epoch": 0.02556971862642763, "grad_norm": 0.06701305508613586, "learning_rate": 2.9025e-05, "loss": 0.3485, "step": 839 }, { "epoch": 0.025600195049105136, "grad_norm": 0.07452358305454254, "learning_rate": 2.9e-05, "loss": 0.48, "step": 840 }, { "epoch": 0.02563067147178264, "grad_norm": 0.07280770689249039, "learning_rate": 2.8975000000000003e-05, "loss": 0.4057, "step": 841 }, { "epoch": 0.025661147894460147, "grad_norm": 0.08248447626829147, "learning_rate": 2.895e-05, "loss": 0.5287, "step": 842 }, { "epoch": 0.025691624317137653, "grad_norm": 0.07846277207136154, "learning_rate": 2.8925000000000002e-05, "loss": 0.5575, "step": 843 }, { "epoch": 0.025722100739815162, "grad_norm": 0.07679636776447296, "learning_rate": 2.8899999999999998e-05, "loss": 0.4295, "step": 844 }, { "epoch": 0.025752577162492667, "grad_norm": 0.07498426735401154, "learning_rate": 2.8875e-05, "loss": 0.4015, "step": 845 }, { "epoch": 0.025783053585170173, "grad_norm": 0.07252594083547592, "learning_rate": 2.885e-05, "loss": 0.4005, "step": 846 }, { "epoch": 0.02581353000784768, "grad_norm": 0.0790233165025711, "learning_rate": 2.8825000000000003e-05, "loss": 0.4808, "step": 847 }, { "epoch": 0.025844006430525184, "grad_norm": 0.07086073607206345, "learning_rate": 2.88e-05, "loss": 0.3973, "step": 848 }, { "epoch": 0.02587448285320269, "grad_norm": 0.074672631919384, "learning_rate": 2.8775e-05, "loss": 0.485, "step": 849 }, { "epoch": 0.0259049592758802, "grad_norm": 0.07642818987369537, "learning_rate": 2.8749999999999997e-05, "loss": 0.4712, "step": 850 }, { "epoch": 0.025935435698557704, "grad_norm": 0.08130404353141785, "learning_rate": 2.8725e-05, "loss": 0.476, "step": 851 }, { "epoch": 0.02596591212123521, "grad_norm": 0.0773453637957573, "learning_rate": 2.87e-05, "loss": 0.4313, "step": 852 }, { "epoch": 0.025996388543912716, "grad_norm": 0.07671422511339188, "learning_rate": 2.8675000000000002e-05, "loss": 0.4634, "step": 853 }, { "epoch": 0.02602686496659022, "grad_norm": 0.07637978345155716, "learning_rate": 2.865e-05, "loss": 0.4494, "step": 854 }, { "epoch": 0.026057341389267727, "grad_norm": 0.08301280438899994, "learning_rate": 2.8625e-05, "loss": 0.4933, "step": 855 }, { "epoch": 0.026087817811945233, "grad_norm": 0.07347893714904785, "learning_rate": 2.86e-05, "loss": 0.4504, "step": 856 }, { "epoch": 0.02611829423462274, "grad_norm": 0.06668021529912949, "learning_rate": 2.8575000000000003e-05, "loss": 0.3645, "step": 857 }, { "epoch": 0.026148770657300247, "grad_norm": 0.0817548856139183, "learning_rate": 2.855e-05, "loss": 0.5139, "step": 858 }, { "epoch": 0.026179247079977753, "grad_norm": 0.07030104845762253, "learning_rate": 2.8525000000000002e-05, "loss": 0.3698, "step": 859 }, { "epoch": 0.02620972350265526, "grad_norm": 0.07268773019313812, "learning_rate": 2.8499999999999998e-05, "loss": 0.447, "step": 860 }, { "epoch": 0.026240199925332764, "grad_norm": 0.07412206381559372, "learning_rate": 2.8475e-05, "loss": 0.4167, "step": 861 }, { "epoch": 0.02627067634801027, "grad_norm": 0.07715493440628052, "learning_rate": 2.845e-05, "loss": 0.4785, "step": 862 }, { "epoch": 0.026301152770687775, "grad_norm": 0.07998108863830566, "learning_rate": 2.8425000000000003e-05, "loss": 0.5194, "step": 863 }, { "epoch": 0.026331629193365284, "grad_norm": 0.08235049992799759, "learning_rate": 2.84e-05, "loss": 0.5274, "step": 864 }, { "epoch": 0.02636210561604279, "grad_norm": 0.08126657456159592, "learning_rate": 2.8375000000000002e-05, "loss": 0.5257, "step": 865 }, { "epoch": 0.026392582038720296, "grad_norm": 0.07651044428348541, "learning_rate": 2.8349999999999998e-05, "loss": 0.4761, "step": 866 }, { "epoch": 0.0264230584613978, "grad_norm": 0.08697587996721268, "learning_rate": 2.8325e-05, "loss": 0.5337, "step": 867 }, { "epoch": 0.026453534884075307, "grad_norm": 0.07071240246295929, "learning_rate": 2.83e-05, "loss": 0.4247, "step": 868 }, { "epoch": 0.026484011306752812, "grad_norm": 0.08284544199705124, "learning_rate": 2.8275000000000003e-05, "loss": 0.4647, "step": 869 }, { "epoch": 0.026514487729430318, "grad_norm": 0.0824238657951355, "learning_rate": 2.825e-05, "loss": 0.5029, "step": 870 }, { "epoch": 0.026544964152107827, "grad_norm": 0.0750720426440239, "learning_rate": 2.8225e-05, "loss": 0.3417, "step": 871 }, { "epoch": 0.026575440574785333, "grad_norm": 0.0718650221824646, "learning_rate": 2.8199999999999998e-05, "loss": 0.4061, "step": 872 }, { "epoch": 0.02660591699746284, "grad_norm": 0.07437416166067123, "learning_rate": 2.8175e-05, "loss": 0.4062, "step": 873 }, { "epoch": 0.026636393420140344, "grad_norm": 0.07978783547878265, "learning_rate": 2.815e-05, "loss": 0.4686, "step": 874 }, { "epoch": 0.02666686984281785, "grad_norm": 0.08174781501293182, "learning_rate": 2.8125000000000003e-05, "loss": 0.4964, "step": 875 }, { "epoch": 0.026697346265495355, "grad_norm": 0.07302827388048172, "learning_rate": 2.8100000000000005e-05, "loss": 0.4249, "step": 876 }, { "epoch": 0.02672782268817286, "grad_norm": 0.08205913752317429, "learning_rate": 2.8075e-05, "loss": 0.4578, "step": 877 }, { "epoch": 0.02675829911085037, "grad_norm": 0.07700104266405106, "learning_rate": 2.8050000000000004e-05, "loss": 0.416, "step": 878 }, { "epoch": 0.026788775533527875, "grad_norm": 0.07367152720689774, "learning_rate": 2.8025e-05, "loss": 0.4031, "step": 879 }, { "epoch": 0.02681925195620538, "grad_norm": 0.06761021912097931, "learning_rate": 2.8000000000000003e-05, "loss": 0.3699, "step": 880 }, { "epoch": 0.026849728378882887, "grad_norm": 0.07701084017753601, "learning_rate": 2.7975000000000002e-05, "loss": 0.4377, "step": 881 }, { "epoch": 0.026880204801560392, "grad_norm": 0.08449683338403702, "learning_rate": 2.7950000000000005e-05, "loss": 0.5478, "step": 882 }, { "epoch": 0.026910681224237898, "grad_norm": 0.07704976201057434, "learning_rate": 2.7925e-05, "loss": 0.4683, "step": 883 }, { "epoch": 0.026941157646915403, "grad_norm": 0.07891270518302917, "learning_rate": 2.7900000000000004e-05, "loss": 0.523, "step": 884 }, { "epoch": 0.026971634069592913, "grad_norm": 0.08331729471683502, "learning_rate": 2.7875e-05, "loss": 0.522, "step": 885 }, { "epoch": 0.027002110492270418, "grad_norm": 0.08033519983291626, "learning_rate": 2.7850000000000003e-05, "loss": 0.4894, "step": 886 }, { "epoch": 0.027032586914947924, "grad_norm": 0.081100232899189, "learning_rate": 2.7825000000000002e-05, "loss": 0.4492, "step": 887 }, { "epoch": 0.02706306333762543, "grad_norm": 0.07502605020999908, "learning_rate": 2.7800000000000005e-05, "loss": 0.4425, "step": 888 }, { "epoch": 0.027093539760302935, "grad_norm": 0.09724737703800201, "learning_rate": 2.7775e-05, "loss": 0.3933, "step": 889 }, { "epoch": 0.02712401618298044, "grad_norm": 0.07818922400474548, "learning_rate": 2.7750000000000004e-05, "loss": 0.4874, "step": 890 }, { "epoch": 0.027154492605657946, "grad_norm": 0.07230567932128906, "learning_rate": 2.7725e-05, "loss": 0.3946, "step": 891 }, { "epoch": 0.027184969028335455, "grad_norm": 0.07353516668081284, "learning_rate": 2.7700000000000002e-05, "loss": 0.4133, "step": 892 }, { "epoch": 0.02721544545101296, "grad_norm": 0.076216921210289, "learning_rate": 2.7675000000000002e-05, "loss": 0.4539, "step": 893 }, { "epoch": 0.027245921873690467, "grad_norm": 0.08195734769105911, "learning_rate": 2.7650000000000005e-05, "loss": 0.524, "step": 894 }, { "epoch": 0.027276398296367972, "grad_norm": 0.09346943348646164, "learning_rate": 2.7625e-05, "loss": 0.5292, "step": 895 }, { "epoch": 0.027306874719045478, "grad_norm": 0.07986555993556976, "learning_rate": 2.7600000000000003e-05, "loss": 0.4649, "step": 896 }, { "epoch": 0.027337351141722983, "grad_norm": 0.07472653687000275, "learning_rate": 2.7575e-05, "loss": 0.4971, "step": 897 }, { "epoch": 0.02736782756440049, "grad_norm": 0.07901741564273834, "learning_rate": 2.7550000000000002e-05, "loss": 0.4489, "step": 898 }, { "epoch": 0.027398303987077998, "grad_norm": 0.07170873135328293, "learning_rate": 2.7525e-05, "loss": 0.387, "step": 899 }, { "epoch": 0.027428780409755504, "grad_norm": 0.08016093075275421, "learning_rate": 2.7500000000000004e-05, "loss": 0.4551, "step": 900 }, { "epoch": 0.02745925683243301, "grad_norm": 0.08166582882404327, "learning_rate": 2.7475e-05, "loss": 0.4825, "step": 901 }, { "epoch": 0.027489733255110515, "grad_norm": 0.08589409291744232, "learning_rate": 2.7450000000000003e-05, "loss": 0.5278, "step": 902 }, { "epoch": 0.02752020967778802, "grad_norm": 0.08061208575963974, "learning_rate": 2.7425e-05, "loss": 0.5019, "step": 903 }, { "epoch": 0.027550686100465526, "grad_norm": 0.0761689618229866, "learning_rate": 2.7400000000000002e-05, "loss": 0.3898, "step": 904 }, { "epoch": 0.027581162523143035, "grad_norm": 0.06770871579647064, "learning_rate": 2.7375e-05, "loss": 0.3516, "step": 905 }, { "epoch": 0.02761163894582054, "grad_norm": 0.07166413962841034, "learning_rate": 2.7350000000000004e-05, "loss": 0.4102, "step": 906 }, { "epoch": 0.027642115368498046, "grad_norm": 0.06481317430734634, "learning_rate": 2.7325e-05, "loss": 0.3763, "step": 907 }, { "epoch": 0.027672591791175552, "grad_norm": 0.08476804196834564, "learning_rate": 2.7300000000000003e-05, "loss": 0.521, "step": 908 }, { "epoch": 0.027703068213853058, "grad_norm": 0.07578303664922714, "learning_rate": 2.7275e-05, "loss": 0.4646, "step": 909 }, { "epoch": 0.027733544636530563, "grad_norm": 0.07119432091712952, "learning_rate": 2.725e-05, "loss": 0.3692, "step": 910 }, { "epoch": 0.02776402105920807, "grad_norm": 0.07629730552434921, "learning_rate": 2.7225e-05, "loss": 0.5038, "step": 911 }, { "epoch": 0.027794497481885578, "grad_norm": 0.08684691041707993, "learning_rate": 2.7200000000000004e-05, "loss": 0.5172, "step": 912 }, { "epoch": 0.027824973904563084, "grad_norm": 0.08575067669153214, "learning_rate": 2.7175e-05, "loss": 0.5569, "step": 913 }, { "epoch": 0.02785545032724059, "grad_norm": 0.08256448805332184, "learning_rate": 2.7150000000000003e-05, "loss": 0.4824, "step": 914 }, { "epoch": 0.027885926749918095, "grad_norm": 0.07907640933990479, "learning_rate": 2.7125000000000002e-05, "loss": 0.5108, "step": 915 }, { "epoch": 0.0279164031725956, "grad_norm": 0.09498827159404755, "learning_rate": 2.7100000000000005e-05, "loss": 0.6134, "step": 916 }, { "epoch": 0.027946879595273106, "grad_norm": 0.07079599797725677, "learning_rate": 2.7075e-05, "loss": 0.4347, "step": 917 }, { "epoch": 0.02797735601795061, "grad_norm": 0.08141549676656723, "learning_rate": 2.7050000000000004e-05, "loss": 0.5286, "step": 918 }, { "epoch": 0.02800783244062812, "grad_norm": 0.07951369136571884, "learning_rate": 2.7025e-05, "loss": 0.4147, "step": 919 }, { "epoch": 0.028038308863305626, "grad_norm": 0.08554822206497192, "learning_rate": 2.7000000000000002e-05, "loss": 0.5255, "step": 920 }, { "epoch": 0.028068785285983132, "grad_norm": 0.08277765661478043, "learning_rate": 2.6975000000000002e-05, "loss": 0.5137, "step": 921 }, { "epoch": 0.028099261708660637, "grad_norm": 0.07661718875169754, "learning_rate": 2.6950000000000005e-05, "loss": 0.4252, "step": 922 }, { "epoch": 0.028129738131338143, "grad_norm": 0.08322782069444656, "learning_rate": 2.6925e-05, "loss": 0.5585, "step": 923 }, { "epoch": 0.02816021455401565, "grad_norm": 0.076521135866642, "learning_rate": 2.6900000000000003e-05, "loss": 0.3946, "step": 924 }, { "epoch": 0.028190690976693154, "grad_norm": 0.07340560108423233, "learning_rate": 2.6875e-05, "loss": 0.4031, "step": 925 }, { "epoch": 0.028221167399370663, "grad_norm": 0.07542892545461655, "learning_rate": 2.6850000000000002e-05, "loss": 0.4292, "step": 926 }, { "epoch": 0.02825164382204817, "grad_norm": 0.0771617665886879, "learning_rate": 2.6825e-05, "loss": 0.4353, "step": 927 }, { "epoch": 0.028282120244725675, "grad_norm": 0.07270224392414093, "learning_rate": 2.6800000000000004e-05, "loss": 0.4466, "step": 928 }, { "epoch": 0.02831259666740318, "grad_norm": 0.07965298742055893, "learning_rate": 2.6775e-05, "loss": 0.4467, "step": 929 }, { "epoch": 0.028343073090080686, "grad_norm": 0.07434156537055969, "learning_rate": 2.6750000000000003e-05, "loss": 0.4064, "step": 930 }, { "epoch": 0.02837354951275819, "grad_norm": 0.07496625930070877, "learning_rate": 2.6725e-05, "loss": 0.4218, "step": 931 }, { "epoch": 0.028404025935435697, "grad_norm": 0.0878630131483078, "learning_rate": 2.6700000000000002e-05, "loss": 0.5542, "step": 932 }, { "epoch": 0.028434502358113206, "grad_norm": 0.07509927451610565, "learning_rate": 2.6675e-05, "loss": 0.4269, "step": 933 }, { "epoch": 0.028464978780790712, "grad_norm": 0.07052884995937347, "learning_rate": 2.6650000000000004e-05, "loss": 0.4122, "step": 934 }, { "epoch": 0.028495455203468217, "grad_norm": 0.07624265551567078, "learning_rate": 2.6625e-05, "loss": 0.4548, "step": 935 }, { "epoch": 0.028525931626145723, "grad_norm": 0.07866404205560684, "learning_rate": 2.6600000000000003e-05, "loss": 0.5231, "step": 936 }, { "epoch": 0.02855640804882323, "grad_norm": 0.0840945839881897, "learning_rate": 2.6575e-05, "loss": 0.5385, "step": 937 }, { "epoch": 0.028586884471500734, "grad_norm": 0.07228624820709229, "learning_rate": 2.655e-05, "loss": 0.434, "step": 938 }, { "epoch": 0.02861736089417824, "grad_norm": 0.07409662008285522, "learning_rate": 2.6525e-05, "loss": 0.4441, "step": 939 }, { "epoch": 0.02864783731685575, "grad_norm": 0.07441885024309158, "learning_rate": 2.6500000000000004e-05, "loss": 0.4043, "step": 940 }, { "epoch": 0.028678313739533254, "grad_norm": 0.08886075019836426, "learning_rate": 2.6475e-05, "loss": 0.5384, "step": 941 }, { "epoch": 0.02870879016221076, "grad_norm": 0.07670284807682037, "learning_rate": 2.6450000000000003e-05, "loss": 0.4259, "step": 942 }, { "epoch": 0.028739266584888266, "grad_norm": 0.08708427846431732, "learning_rate": 2.6425e-05, "loss": 0.5333, "step": 943 }, { "epoch": 0.02876974300756577, "grad_norm": 0.08084359019994736, "learning_rate": 2.64e-05, "loss": 0.537, "step": 944 }, { "epoch": 0.028800219430243277, "grad_norm": 0.07058343291282654, "learning_rate": 2.6375e-05, "loss": 0.3948, "step": 945 }, { "epoch": 0.028830695852920783, "grad_norm": 0.07939262688159943, "learning_rate": 2.6350000000000004e-05, "loss": 0.5028, "step": 946 }, { "epoch": 0.02886117227559829, "grad_norm": 0.07882367819547653, "learning_rate": 2.6325e-05, "loss": 0.4717, "step": 947 }, { "epoch": 0.028891648698275797, "grad_norm": 0.08548135310411453, "learning_rate": 2.6300000000000002e-05, "loss": 0.57, "step": 948 }, { "epoch": 0.028922125120953303, "grad_norm": 0.07225639373064041, "learning_rate": 2.6275e-05, "loss": 0.4474, "step": 949 }, { "epoch": 0.02895260154363081, "grad_norm": 0.07281869649887085, "learning_rate": 2.625e-05, "loss": 0.4399, "step": 950 }, { "epoch": 0.028983077966308314, "grad_norm": 0.07666197419166565, "learning_rate": 2.6225e-05, "loss": 0.4405, "step": 951 }, { "epoch": 0.02901355438898582, "grad_norm": 0.07335663586854935, "learning_rate": 2.6200000000000003e-05, "loss": 0.4114, "step": 952 }, { "epoch": 0.029044030811663325, "grad_norm": 0.08124499022960663, "learning_rate": 2.6175e-05, "loss": 0.5423, "step": 953 }, { "epoch": 0.029074507234340834, "grad_norm": 0.08024904876947403, "learning_rate": 2.6150000000000002e-05, "loss": 0.4505, "step": 954 }, { "epoch": 0.02910498365701834, "grad_norm": 0.07474556565284729, "learning_rate": 2.6124999999999998e-05, "loss": 0.4776, "step": 955 }, { "epoch": 0.029135460079695846, "grad_norm": 0.07457344233989716, "learning_rate": 2.61e-05, "loss": 0.4128, "step": 956 }, { "epoch": 0.02916593650237335, "grad_norm": 0.08057667315006256, "learning_rate": 2.6075e-05, "loss": 0.5739, "step": 957 }, { "epoch": 0.029196412925050857, "grad_norm": 0.07493437826633453, "learning_rate": 2.6050000000000003e-05, "loss": 0.4698, "step": 958 }, { "epoch": 0.029226889347728362, "grad_norm": 0.0789019912481308, "learning_rate": 2.6025e-05, "loss": 0.5278, "step": 959 }, { "epoch": 0.029257365770405868, "grad_norm": 0.07884068787097931, "learning_rate": 2.6000000000000002e-05, "loss": 0.4758, "step": 960 }, { "epoch": 0.029287842193083377, "grad_norm": 0.07119706273078918, "learning_rate": 2.5974999999999998e-05, "loss": 0.4247, "step": 961 }, { "epoch": 0.029318318615760883, "grad_norm": 0.09300888329744339, "learning_rate": 2.595e-05, "loss": 0.6038, "step": 962 }, { "epoch": 0.02934879503843839, "grad_norm": 0.07084882259368896, "learning_rate": 2.5925e-05, "loss": 0.3875, "step": 963 }, { "epoch": 0.029379271461115894, "grad_norm": 0.06928104907274246, "learning_rate": 2.5900000000000003e-05, "loss": 0.404, "step": 964 }, { "epoch": 0.0294097478837934, "grad_norm": 0.08157572150230408, "learning_rate": 2.5875e-05, "loss": 0.5659, "step": 965 }, { "epoch": 0.029440224306470905, "grad_norm": 0.0773928090929985, "learning_rate": 2.585e-05, "loss": 0.445, "step": 966 }, { "epoch": 0.029470700729148414, "grad_norm": 0.08616477251052856, "learning_rate": 2.5824999999999998e-05, "loss": 0.5504, "step": 967 }, { "epoch": 0.02950117715182592, "grad_norm": 0.08309312164783478, "learning_rate": 2.58e-05, "loss": 0.4992, "step": 968 }, { "epoch": 0.029531653574503425, "grad_norm": 0.08155832439661026, "learning_rate": 2.5775e-05, "loss": 0.488, "step": 969 }, { "epoch": 0.02956212999718093, "grad_norm": 0.08447880297899246, "learning_rate": 2.5750000000000002e-05, "loss": 0.4991, "step": 970 }, { "epoch": 0.029592606419858437, "grad_norm": 0.07492344081401825, "learning_rate": 2.5725e-05, "loss": 0.4153, "step": 971 }, { "epoch": 0.029623082842535942, "grad_norm": 0.07410462200641632, "learning_rate": 2.57e-05, "loss": 0.4247, "step": 972 }, { "epoch": 0.029653559265213448, "grad_norm": 0.06824444234371185, "learning_rate": 2.5675e-05, "loss": 0.3794, "step": 973 }, { "epoch": 0.029684035687890957, "grad_norm": 0.07797446101903915, "learning_rate": 2.5650000000000003e-05, "loss": 0.4936, "step": 974 }, { "epoch": 0.029714512110568463, "grad_norm": 0.08781326562166214, "learning_rate": 2.5625e-05, "loss": 0.6152, "step": 975 }, { "epoch": 0.029744988533245968, "grad_norm": 0.08396873623132706, "learning_rate": 2.5600000000000002e-05, "loss": 0.4852, "step": 976 }, { "epoch": 0.029775464955923474, "grad_norm": 0.08080101758241653, "learning_rate": 2.5574999999999998e-05, "loss": 0.5057, "step": 977 }, { "epoch": 0.02980594137860098, "grad_norm": 0.07078608870506287, "learning_rate": 2.555e-05, "loss": 0.4046, "step": 978 }, { "epoch": 0.029836417801278485, "grad_norm": 0.07440809160470963, "learning_rate": 2.5525e-05, "loss": 0.3887, "step": 979 }, { "epoch": 0.02986689422395599, "grad_norm": 0.07184373587369919, "learning_rate": 2.5500000000000003e-05, "loss": 0.3493, "step": 980 }, { "epoch": 0.0298973706466335, "grad_norm": 0.07845433056354523, "learning_rate": 2.5475e-05, "loss": 0.4305, "step": 981 }, { "epoch": 0.029927847069311005, "grad_norm": 0.08690222352743149, "learning_rate": 2.5450000000000002e-05, "loss": 0.5809, "step": 982 }, { "epoch": 0.02995832349198851, "grad_norm": 0.07446234673261642, "learning_rate": 2.5424999999999998e-05, "loss": 0.4632, "step": 983 }, { "epoch": 0.029988799914666017, "grad_norm": 0.07507109642028809, "learning_rate": 2.54e-05, "loss": 0.4395, "step": 984 }, { "epoch": 0.030019276337343522, "grad_norm": 0.07372098416090012, "learning_rate": 2.5375e-05, "loss": 0.4281, "step": 985 }, { "epoch": 0.030049752760021028, "grad_norm": 0.07454129308462143, "learning_rate": 2.5350000000000003e-05, "loss": 0.4241, "step": 986 }, { "epoch": 0.030080229182698533, "grad_norm": 0.12441176176071167, "learning_rate": 2.5325e-05, "loss": 0.5339, "step": 987 }, { "epoch": 0.030110705605376042, "grad_norm": 0.07522255182266235, "learning_rate": 2.5300000000000002e-05, "loss": 0.4418, "step": 988 }, { "epoch": 0.030141182028053548, "grad_norm": 0.07934848219156265, "learning_rate": 2.5274999999999998e-05, "loss": 0.4889, "step": 989 }, { "epoch": 0.030171658450731054, "grad_norm": 0.07478386908769608, "learning_rate": 2.525e-05, "loss": 0.4189, "step": 990 }, { "epoch": 0.03020213487340856, "grad_norm": 0.0787413939833641, "learning_rate": 2.5225e-05, "loss": 0.4782, "step": 991 }, { "epoch": 0.030232611296086065, "grad_norm": 0.0785098597407341, "learning_rate": 2.5200000000000003e-05, "loss": 0.4785, "step": 992 }, { "epoch": 0.03026308771876357, "grad_norm": 0.07964067906141281, "learning_rate": 2.5175e-05, "loss": 0.4919, "step": 993 }, { "epoch": 0.030293564141441076, "grad_norm": 0.07061087340116501, "learning_rate": 2.515e-05, "loss": 0.3951, "step": 994 }, { "epoch": 0.030324040564118585, "grad_norm": 0.08392026275396347, "learning_rate": 2.5124999999999997e-05, "loss": 0.5764, "step": 995 }, { "epoch": 0.03035451698679609, "grad_norm": 0.08350448310375214, "learning_rate": 2.51e-05, "loss": 0.4744, "step": 996 }, { "epoch": 0.030384993409473596, "grad_norm": 0.07740837335586548, "learning_rate": 2.5075e-05, "loss": 0.4701, "step": 997 }, { "epoch": 0.030415469832151102, "grad_norm": 0.0745137557387352, "learning_rate": 2.5050000000000002e-05, "loss": 0.4496, "step": 998 }, { "epoch": 0.030445946254828608, "grad_norm": 0.08276493847370148, "learning_rate": 2.5025e-05, "loss": 0.5201, "step": 999 }, { "epoch": 0.030476422677506113, "grad_norm": 0.08082837611436844, "learning_rate": 2.5e-05, "loss": 0.4861, "step": 1000 }, { "epoch": 0.03050689910018362, "grad_norm": 0.0765823945403099, "learning_rate": 2.4975e-05, "loss": 0.4687, "step": 1001 }, { "epoch": 0.030537375522861128, "grad_norm": 0.07668869197368622, "learning_rate": 2.495e-05, "loss": 0.4395, "step": 1002 }, { "epoch": 0.030567851945538634, "grad_norm": 0.0867798924446106, "learning_rate": 2.4925000000000003e-05, "loss": 0.4856, "step": 1003 }, { "epoch": 0.03059832836821614, "grad_norm": 0.0923258364200592, "learning_rate": 2.4900000000000002e-05, "loss": 0.5496, "step": 1004 }, { "epoch": 0.030628804790893645, "grad_norm": 0.0768980011343956, "learning_rate": 2.4875e-05, "loss": 0.4848, "step": 1005 }, { "epoch": 0.03065928121357115, "grad_norm": 0.0749521553516388, "learning_rate": 2.485e-05, "loss": 0.4404, "step": 1006 }, { "epoch": 0.030689757636248656, "grad_norm": 0.07797399163246155, "learning_rate": 2.4825e-05, "loss": 0.5125, "step": 1007 }, { "epoch": 0.03072023405892616, "grad_norm": 0.0802532434463501, "learning_rate": 2.48e-05, "loss": 0.4721, "step": 1008 }, { "epoch": 0.03075071048160367, "grad_norm": 0.0725870355963707, "learning_rate": 2.4775000000000003e-05, "loss": 0.4355, "step": 1009 }, { "epoch": 0.030781186904281176, "grad_norm": 0.06679749488830566, "learning_rate": 2.4750000000000002e-05, "loss": 0.3417, "step": 1010 }, { "epoch": 0.030811663326958682, "grad_norm": 0.07796447724103928, "learning_rate": 2.4725e-05, "loss": 0.4859, "step": 1011 }, { "epoch": 0.030842139749636188, "grad_norm": 0.08769549429416656, "learning_rate": 2.47e-05, "loss": 0.489, "step": 1012 }, { "epoch": 0.030872616172313693, "grad_norm": 0.07319208979606628, "learning_rate": 2.4675e-05, "loss": 0.4097, "step": 1013 }, { "epoch": 0.0309030925949912, "grad_norm": 0.08083212375640869, "learning_rate": 2.465e-05, "loss": 0.4675, "step": 1014 }, { "epoch": 0.030933569017668704, "grad_norm": 0.07354455441236496, "learning_rate": 2.4625000000000002e-05, "loss": 0.4276, "step": 1015 }, { "epoch": 0.030964045440346213, "grad_norm": 0.07098808139562607, "learning_rate": 2.46e-05, "loss": 0.4185, "step": 1016 }, { "epoch": 0.03099452186302372, "grad_norm": 0.08009263873100281, "learning_rate": 2.4575e-05, "loss": 0.4794, "step": 1017 }, { "epoch": 0.031024998285701225, "grad_norm": 0.07026191055774689, "learning_rate": 2.455e-05, "loss": 0.3992, "step": 1018 }, { "epoch": 0.03105547470837873, "grad_norm": 0.07998433709144592, "learning_rate": 2.4525e-05, "loss": 0.4768, "step": 1019 }, { "epoch": 0.031085951131056236, "grad_norm": 0.08261123299598694, "learning_rate": 2.45e-05, "loss": 0.469, "step": 1020 }, { "epoch": 0.03111642755373374, "grad_norm": 0.07796258479356766, "learning_rate": 2.4475000000000002e-05, "loss": 0.4781, "step": 1021 }, { "epoch": 0.03114690397641125, "grad_norm": 0.07386413216590881, "learning_rate": 2.445e-05, "loss": 0.4127, "step": 1022 }, { "epoch": 0.031177380399088756, "grad_norm": 0.0796252116560936, "learning_rate": 2.4425e-05, "loss": 0.4637, "step": 1023 }, { "epoch": 0.031207856821766262, "grad_norm": 0.07099957019090652, "learning_rate": 2.44e-05, "loss": 0.4352, "step": 1024 }, { "epoch": 0.031238333244443767, "grad_norm": 0.0747579038143158, "learning_rate": 2.4375e-05, "loss": 0.4347, "step": 1025 }, { "epoch": 0.031268809667121276, "grad_norm": 0.08040732145309448, "learning_rate": 2.435e-05, "loss": 0.4864, "step": 1026 }, { "epoch": 0.03129928608979878, "grad_norm": 0.07697156071662903, "learning_rate": 2.4325000000000002e-05, "loss": 0.4501, "step": 1027 }, { "epoch": 0.03132976251247629, "grad_norm": 0.07704800367355347, "learning_rate": 2.43e-05, "loss": 0.3996, "step": 1028 }, { "epoch": 0.03136023893515379, "grad_norm": 0.08603644371032715, "learning_rate": 2.4275e-05, "loss": 0.4852, "step": 1029 }, { "epoch": 0.0313907153578313, "grad_norm": 0.06943400204181671, "learning_rate": 2.425e-05, "loss": 0.4087, "step": 1030 }, { "epoch": 0.031421191780508805, "grad_norm": 0.0687209814786911, "learning_rate": 2.4225e-05, "loss": 0.3905, "step": 1031 }, { "epoch": 0.03145166820318631, "grad_norm": 0.07091062515974045, "learning_rate": 2.4200000000000002e-05, "loss": 0.3903, "step": 1032 }, { "epoch": 0.031482144625863816, "grad_norm": 0.07964222878217697, "learning_rate": 2.4175e-05, "loss": 0.5127, "step": 1033 }, { "epoch": 0.03151262104854132, "grad_norm": 0.06999529153108597, "learning_rate": 2.415e-05, "loss": 0.3735, "step": 1034 }, { "epoch": 0.03154309747121883, "grad_norm": 0.07633005082607269, "learning_rate": 2.4125e-05, "loss": 0.4103, "step": 1035 }, { "epoch": 0.03157357389389633, "grad_norm": 0.0741332545876503, "learning_rate": 2.41e-05, "loss": 0.4819, "step": 1036 }, { "epoch": 0.03160405031657384, "grad_norm": 0.08161796629428864, "learning_rate": 2.4075e-05, "loss": 0.5078, "step": 1037 }, { "epoch": 0.031634526739251344, "grad_norm": 0.08213040977716446, "learning_rate": 2.4050000000000002e-05, "loss": 0.4623, "step": 1038 }, { "epoch": 0.03166500316192885, "grad_norm": 0.09008892625570297, "learning_rate": 2.4025e-05, "loss": 0.5129, "step": 1039 }, { "epoch": 0.03169547958460636, "grad_norm": 0.08843938261270523, "learning_rate": 2.4e-05, "loss": 0.5664, "step": 1040 }, { "epoch": 0.03172595600728387, "grad_norm": 0.07861369848251343, "learning_rate": 2.3975e-05, "loss": 0.4899, "step": 1041 }, { "epoch": 0.03175643242996137, "grad_norm": 0.07106230407953262, "learning_rate": 2.395e-05, "loss": 0.4135, "step": 1042 }, { "epoch": 0.03178690885263888, "grad_norm": 0.07541561871767044, "learning_rate": 2.3925e-05, "loss": 0.4119, "step": 1043 }, { "epoch": 0.031817385275316384, "grad_norm": 0.07962796837091446, "learning_rate": 2.39e-05, "loss": 0.3887, "step": 1044 }, { "epoch": 0.03184786169799389, "grad_norm": 0.08364412188529968, "learning_rate": 2.3875e-05, "loss": 0.4993, "step": 1045 }, { "epoch": 0.031878338120671396, "grad_norm": 0.07957728952169418, "learning_rate": 2.385e-05, "loss": 0.477, "step": 1046 }, { "epoch": 0.0319088145433489, "grad_norm": 0.08205942064523697, "learning_rate": 2.3825e-05, "loss": 0.5142, "step": 1047 }, { "epoch": 0.03193929096602641, "grad_norm": 0.0743609219789505, "learning_rate": 2.38e-05, "loss": 0.383, "step": 1048 }, { "epoch": 0.03196976738870391, "grad_norm": 0.07717466354370117, "learning_rate": 2.3775e-05, "loss": 0.4751, "step": 1049 }, { "epoch": 0.03200024381138142, "grad_norm": 0.07468174397945404, "learning_rate": 2.375e-05, "loss": 0.4429, "step": 1050 }, { "epoch": 0.032030720234058924, "grad_norm": 0.07713703066110611, "learning_rate": 2.3725e-05, "loss": 0.4514, "step": 1051 }, { "epoch": 0.03206119665673643, "grad_norm": 0.08319132775068283, "learning_rate": 2.37e-05, "loss": 0.4885, "step": 1052 }, { "epoch": 0.03209167307941394, "grad_norm": 0.0779431015253067, "learning_rate": 2.3675e-05, "loss": 0.4399, "step": 1053 }, { "epoch": 0.03212214950209145, "grad_norm": 0.0810658410191536, "learning_rate": 2.365e-05, "loss": 0.5057, "step": 1054 }, { "epoch": 0.03215262592476895, "grad_norm": 0.08209062367677689, "learning_rate": 2.3624999999999998e-05, "loss": 0.527, "step": 1055 }, { "epoch": 0.03218310234744646, "grad_norm": 0.10149164497852325, "learning_rate": 2.36e-05, "loss": 0.4868, "step": 1056 }, { "epoch": 0.032213578770123964, "grad_norm": 0.08067110925912857, "learning_rate": 2.3575e-05, "loss": 0.4618, "step": 1057 }, { "epoch": 0.03224405519280147, "grad_norm": 0.07035715132951736, "learning_rate": 2.355e-05, "loss": 0.4405, "step": 1058 }, { "epoch": 0.032274531615478975, "grad_norm": 0.08269621431827545, "learning_rate": 2.3525e-05, "loss": 0.4732, "step": 1059 }, { "epoch": 0.03230500803815648, "grad_norm": 0.0866980254650116, "learning_rate": 2.35e-05, "loss": 0.4837, "step": 1060 }, { "epoch": 0.03233548446083399, "grad_norm": 0.08218075335025787, "learning_rate": 2.3475e-05, "loss": 0.4886, "step": 1061 }, { "epoch": 0.03236596088351149, "grad_norm": 0.07841754704713821, "learning_rate": 2.345e-05, "loss": 0.4752, "step": 1062 }, { "epoch": 0.032396437306189, "grad_norm": 0.08677341789007187, "learning_rate": 2.3425000000000004e-05, "loss": 0.5399, "step": 1063 }, { "epoch": 0.032426913728866504, "grad_norm": 0.0809488296508789, "learning_rate": 2.3400000000000003e-05, "loss": 0.5302, "step": 1064 }, { "epoch": 0.03245739015154401, "grad_norm": 0.08245817571878433, "learning_rate": 2.3375000000000002e-05, "loss": 0.5192, "step": 1065 }, { "epoch": 0.032487866574221515, "grad_norm": 0.07782723009586334, "learning_rate": 2.3350000000000002e-05, "loss": 0.4288, "step": 1066 }, { "epoch": 0.03251834299689903, "grad_norm": 0.07693643867969513, "learning_rate": 2.3325e-05, "loss": 0.3899, "step": 1067 }, { "epoch": 0.03254881941957653, "grad_norm": 0.09324256330728531, "learning_rate": 2.3300000000000004e-05, "loss": 0.5727, "step": 1068 }, { "epoch": 0.03257929584225404, "grad_norm": 0.07531018555164337, "learning_rate": 2.3275000000000003e-05, "loss": 0.4651, "step": 1069 }, { "epoch": 0.032609772264931544, "grad_norm": 0.07966288924217224, "learning_rate": 2.3250000000000003e-05, "loss": 0.4741, "step": 1070 }, { "epoch": 0.03264024868760905, "grad_norm": 0.07804643362760544, "learning_rate": 2.3225000000000002e-05, "loss": 0.4181, "step": 1071 }, { "epoch": 0.032670725110286555, "grad_norm": 0.07963328063488007, "learning_rate": 2.32e-05, "loss": 0.4917, "step": 1072 }, { "epoch": 0.03270120153296406, "grad_norm": 0.08076660335063934, "learning_rate": 2.3175e-05, "loss": 0.4764, "step": 1073 }, { "epoch": 0.03273167795564157, "grad_norm": 0.07797010987997055, "learning_rate": 2.3150000000000004e-05, "loss": 0.4746, "step": 1074 }, { "epoch": 0.03276215437831907, "grad_norm": 0.08103783428668976, "learning_rate": 2.3125000000000003e-05, "loss": 0.4934, "step": 1075 }, { "epoch": 0.03279263080099658, "grad_norm": 0.07599592953920364, "learning_rate": 2.3100000000000002e-05, "loss": 0.4207, "step": 1076 }, { "epoch": 0.03282310722367408, "grad_norm": 0.08597654104232788, "learning_rate": 2.3075000000000002e-05, "loss": 0.4915, "step": 1077 }, { "epoch": 0.03285358364635159, "grad_norm": 0.06513240933418274, "learning_rate": 2.305e-05, "loss": 0.3613, "step": 1078 }, { "epoch": 0.032884060069029095, "grad_norm": 0.07519528269767761, "learning_rate": 2.3025e-05, "loss": 0.4315, "step": 1079 }, { "epoch": 0.0329145364917066, "grad_norm": 0.08215155452489853, "learning_rate": 2.3000000000000003e-05, "loss": 0.5149, "step": 1080 }, { "epoch": 0.03294501291438411, "grad_norm": 0.08189955353736877, "learning_rate": 2.2975000000000003e-05, "loss": 0.4347, "step": 1081 }, { "epoch": 0.03297548933706162, "grad_norm": 0.083504818379879, "learning_rate": 2.2950000000000002e-05, "loss": 0.5089, "step": 1082 }, { "epoch": 0.033005965759739124, "grad_norm": 0.07619626075029373, "learning_rate": 2.2925e-05, "loss": 0.5036, "step": 1083 }, { "epoch": 0.03303644218241663, "grad_norm": 0.0740126296877861, "learning_rate": 2.29e-05, "loss": 0.4296, "step": 1084 }, { "epoch": 0.033066918605094135, "grad_norm": 0.0683075338602066, "learning_rate": 2.2875e-05, "loss": 0.3989, "step": 1085 }, { "epoch": 0.03309739502777164, "grad_norm": 0.07235126197338104, "learning_rate": 2.2850000000000003e-05, "loss": 0.3836, "step": 1086 }, { "epoch": 0.033127871450449146, "grad_norm": 0.08330126106739044, "learning_rate": 2.2825000000000003e-05, "loss": 0.5118, "step": 1087 }, { "epoch": 0.03315834787312665, "grad_norm": 0.08101587742567062, "learning_rate": 2.2800000000000002e-05, "loss": 0.4827, "step": 1088 }, { "epoch": 0.03318882429580416, "grad_norm": 0.07654716074466705, "learning_rate": 2.2775e-05, "loss": 0.427, "step": 1089 }, { "epoch": 0.03321930071848166, "grad_norm": 0.08190520107746124, "learning_rate": 2.275e-05, "loss": 0.4827, "step": 1090 }, { "epoch": 0.03324977714115917, "grad_norm": 0.08001712709665298, "learning_rate": 2.2725000000000003e-05, "loss": 0.501, "step": 1091 }, { "epoch": 0.033280253563836674, "grad_norm": 0.07717631012201309, "learning_rate": 2.2700000000000003e-05, "loss": 0.4546, "step": 1092 }, { "epoch": 0.03331072998651418, "grad_norm": 0.08625360578298569, "learning_rate": 2.2675000000000002e-05, "loss": 0.5332, "step": 1093 }, { "epoch": 0.033341206409191686, "grad_norm": 0.07608518749475479, "learning_rate": 2.265e-05, "loss": 0.4944, "step": 1094 }, { "epoch": 0.0333716828318692, "grad_norm": 0.07326430082321167, "learning_rate": 2.2625e-05, "loss": 0.4256, "step": 1095 }, { "epoch": 0.033402159254546704, "grad_norm": 0.08827457576990128, "learning_rate": 2.26e-05, "loss": 0.5121, "step": 1096 }, { "epoch": 0.03343263567722421, "grad_norm": 0.07429133355617523, "learning_rate": 2.2575000000000003e-05, "loss": 0.4654, "step": 1097 }, { "epoch": 0.033463112099901715, "grad_norm": 0.08406919240951538, "learning_rate": 2.2550000000000003e-05, "loss": 0.5647, "step": 1098 }, { "epoch": 0.03349358852257922, "grad_norm": 0.06593562662601471, "learning_rate": 2.2525000000000002e-05, "loss": 0.3606, "step": 1099 }, { "epoch": 0.033524064945256726, "grad_norm": 0.07565759122371674, "learning_rate": 2.25e-05, "loss": 0.4296, "step": 1100 }, { "epoch": 0.03355454136793423, "grad_norm": 0.08141256868839264, "learning_rate": 2.2475e-05, "loss": 0.4753, "step": 1101 }, { "epoch": 0.03358501779061174, "grad_norm": 0.07651394605636597, "learning_rate": 2.245e-05, "loss": 0.4714, "step": 1102 }, { "epoch": 0.03361549421328924, "grad_norm": 0.08785784244537354, "learning_rate": 2.2425000000000003e-05, "loss": 0.5682, "step": 1103 }, { "epoch": 0.03364597063596675, "grad_norm": 0.0739833191037178, "learning_rate": 2.2400000000000002e-05, "loss": 0.426, "step": 1104 }, { "epoch": 0.033676447058644254, "grad_norm": 0.07747345417737961, "learning_rate": 2.2375000000000002e-05, "loss": 0.4796, "step": 1105 }, { "epoch": 0.03370692348132176, "grad_norm": 0.07984206080436707, "learning_rate": 2.235e-05, "loss": 0.4807, "step": 1106 }, { "epoch": 0.033737399903999266, "grad_norm": 0.09307409077882767, "learning_rate": 2.2325e-05, "loss": 0.5738, "step": 1107 }, { "epoch": 0.03376787632667678, "grad_norm": 0.07208666950464249, "learning_rate": 2.23e-05, "loss": 0.4287, "step": 1108 }, { "epoch": 0.033798352749354284, "grad_norm": 0.0826752707362175, "learning_rate": 2.2275000000000003e-05, "loss": 0.5225, "step": 1109 }, { "epoch": 0.03382882917203179, "grad_norm": 0.07515248656272888, "learning_rate": 2.2250000000000002e-05, "loss": 0.4369, "step": 1110 }, { "epoch": 0.033859305594709295, "grad_norm": 0.08508395403623581, "learning_rate": 2.2225e-05, "loss": 0.3917, "step": 1111 }, { "epoch": 0.0338897820173868, "grad_norm": 0.07607904821634293, "learning_rate": 2.22e-05, "loss": 0.4606, "step": 1112 }, { "epoch": 0.033920258440064306, "grad_norm": 0.08213415741920471, "learning_rate": 2.2175e-05, "loss": 0.3844, "step": 1113 }, { "epoch": 0.03395073486274181, "grad_norm": 0.07202441990375519, "learning_rate": 2.215e-05, "loss": 0.4316, "step": 1114 }, { "epoch": 0.03398121128541932, "grad_norm": 0.07517337799072266, "learning_rate": 2.2125000000000002e-05, "loss": 0.4374, "step": 1115 }, { "epoch": 0.03401168770809682, "grad_norm": 0.0745258629322052, "learning_rate": 2.2100000000000002e-05, "loss": 0.4593, "step": 1116 }, { "epoch": 0.03404216413077433, "grad_norm": 0.07741015404462814, "learning_rate": 2.2075e-05, "loss": 0.4665, "step": 1117 }, { "epoch": 0.034072640553451834, "grad_norm": 0.08271410316228867, "learning_rate": 2.205e-05, "loss": 0.5311, "step": 1118 }, { "epoch": 0.03410311697612934, "grad_norm": 0.06921856850385666, "learning_rate": 2.2025e-05, "loss": 0.3996, "step": 1119 }, { "epoch": 0.034133593398806845, "grad_norm": 0.07232721894979477, "learning_rate": 2.2000000000000003e-05, "loss": 0.4296, "step": 1120 }, { "epoch": 0.03416406982148435, "grad_norm": 0.07791001349687576, "learning_rate": 2.1975000000000002e-05, "loss": 0.4685, "step": 1121 }, { "epoch": 0.034194546244161864, "grad_norm": 0.07291356474161148, "learning_rate": 2.195e-05, "loss": 0.4806, "step": 1122 }, { "epoch": 0.03422502266683937, "grad_norm": 0.07396282255649567, "learning_rate": 2.1925e-05, "loss": 0.4167, "step": 1123 }, { "epoch": 0.034255499089516875, "grad_norm": 0.08300532400608063, "learning_rate": 2.19e-05, "loss": 0.5254, "step": 1124 }, { "epoch": 0.03428597551219438, "grad_norm": 0.08437091112136841, "learning_rate": 2.1875e-05, "loss": 0.5553, "step": 1125 }, { "epoch": 0.034316451934871886, "grad_norm": 0.07955759018659592, "learning_rate": 2.1850000000000003e-05, "loss": 0.4964, "step": 1126 }, { "epoch": 0.03434692835754939, "grad_norm": 0.07973140478134155, "learning_rate": 2.1825000000000002e-05, "loss": 0.4587, "step": 1127 }, { "epoch": 0.0343774047802269, "grad_norm": 0.07252024114131927, "learning_rate": 2.18e-05, "loss": 0.4263, "step": 1128 }, { "epoch": 0.0344078812029044, "grad_norm": 0.08075224608182907, "learning_rate": 2.1775e-05, "loss": 0.4558, "step": 1129 }, { "epoch": 0.03443835762558191, "grad_norm": 0.08054304867982864, "learning_rate": 2.175e-05, "loss": 0.5025, "step": 1130 }, { "epoch": 0.034468834048259414, "grad_norm": 0.08686905354261398, "learning_rate": 2.1725e-05, "loss": 0.533, "step": 1131 }, { "epoch": 0.03449931047093692, "grad_norm": 0.07592800259590149, "learning_rate": 2.1700000000000002e-05, "loss": 0.4517, "step": 1132 }, { "epoch": 0.034529786893614425, "grad_norm": 0.07240843772888184, "learning_rate": 2.1675e-05, "loss": 0.424, "step": 1133 }, { "epoch": 0.03456026331629193, "grad_norm": 0.08290556818246841, "learning_rate": 2.165e-05, "loss": 0.537, "step": 1134 }, { "epoch": 0.03459073973896944, "grad_norm": 0.08035808056592941, "learning_rate": 2.1625e-05, "loss": 0.4907, "step": 1135 }, { "epoch": 0.03462121616164695, "grad_norm": 0.07493937015533447, "learning_rate": 2.16e-05, "loss": 0.461, "step": 1136 }, { "epoch": 0.034651692584324455, "grad_norm": 0.07884912192821503, "learning_rate": 2.1575e-05, "loss": 0.4931, "step": 1137 }, { "epoch": 0.03468216900700196, "grad_norm": 0.0698591098189354, "learning_rate": 2.1550000000000002e-05, "loss": 0.4105, "step": 1138 }, { "epoch": 0.034712645429679466, "grad_norm": 0.0825762003660202, "learning_rate": 2.1525e-05, "loss": 0.502, "step": 1139 }, { "epoch": 0.03474312185235697, "grad_norm": 0.07466436177492142, "learning_rate": 2.15e-05, "loss": 0.4333, "step": 1140 }, { "epoch": 0.03477359827503448, "grad_norm": 0.07135630398988724, "learning_rate": 2.1475e-05, "loss": 0.4073, "step": 1141 }, { "epoch": 0.03480407469771198, "grad_norm": 0.0685432180762291, "learning_rate": 2.145e-05, "loss": 0.4197, "step": 1142 }, { "epoch": 0.03483455112038949, "grad_norm": 0.08008960634469986, "learning_rate": 2.1425e-05, "loss": 0.4526, "step": 1143 }, { "epoch": 0.034865027543066994, "grad_norm": 0.06698304414749146, "learning_rate": 2.1400000000000002e-05, "loss": 0.3521, "step": 1144 }, { "epoch": 0.0348955039657445, "grad_norm": 0.07457142323255539, "learning_rate": 2.1375e-05, "loss": 0.4461, "step": 1145 }, { "epoch": 0.034925980388422005, "grad_norm": 0.07921376079320908, "learning_rate": 2.135e-05, "loss": 0.4893, "step": 1146 }, { "epoch": 0.03495645681109951, "grad_norm": 0.08625560253858566, "learning_rate": 2.1325e-05, "loss": 0.4858, "step": 1147 }, { "epoch": 0.034986933233777016, "grad_norm": 0.08166279643774033, "learning_rate": 2.13e-05, "loss": 0.4689, "step": 1148 }, { "epoch": 0.03501740965645452, "grad_norm": 0.07195094972848892, "learning_rate": 2.1275000000000002e-05, "loss": 0.3271, "step": 1149 }, { "epoch": 0.035047886079132035, "grad_norm": 0.07523542642593384, "learning_rate": 2.125e-05, "loss": 0.4601, "step": 1150 }, { "epoch": 0.03507836250180954, "grad_norm": 0.07776416838169098, "learning_rate": 2.1225e-05, "loss": 0.5004, "step": 1151 }, { "epoch": 0.035108838924487046, "grad_norm": 0.07545484602451324, "learning_rate": 2.12e-05, "loss": 0.413, "step": 1152 }, { "epoch": 0.03513931534716455, "grad_norm": 0.07746530324220657, "learning_rate": 2.1175e-05, "loss": 0.4691, "step": 1153 }, { "epoch": 0.03516979176984206, "grad_norm": 0.07687117159366608, "learning_rate": 2.115e-05, "loss": 0.4827, "step": 1154 }, { "epoch": 0.03520026819251956, "grad_norm": 0.06418263167142868, "learning_rate": 2.1125000000000002e-05, "loss": 0.3552, "step": 1155 }, { "epoch": 0.03523074461519707, "grad_norm": 0.07973786443471909, "learning_rate": 2.11e-05, "loss": 0.4919, "step": 1156 }, { "epoch": 0.035261221037874574, "grad_norm": 0.07666247338056564, "learning_rate": 2.1075e-05, "loss": 0.4579, "step": 1157 }, { "epoch": 0.03529169746055208, "grad_norm": 0.07535306364297867, "learning_rate": 2.105e-05, "loss": 0.4586, "step": 1158 }, { "epoch": 0.035322173883229585, "grad_norm": 0.0748143121600151, "learning_rate": 2.1025e-05, "loss": 0.3831, "step": 1159 }, { "epoch": 0.03535265030590709, "grad_norm": 0.07700170576572418, "learning_rate": 2.1e-05, "loss": 0.4547, "step": 1160 }, { "epoch": 0.035383126728584596, "grad_norm": 0.07464930415153503, "learning_rate": 2.0975e-05, "loss": 0.4116, "step": 1161 }, { "epoch": 0.0354136031512621, "grad_norm": 0.07612532377243042, "learning_rate": 2.095e-05, "loss": 0.4754, "step": 1162 }, { "epoch": 0.03544407957393961, "grad_norm": 0.07881446182727814, "learning_rate": 2.0925e-05, "loss": 0.4717, "step": 1163 }, { "epoch": 0.03547455599661712, "grad_norm": 0.07158850133419037, "learning_rate": 2.09e-05, "loss": 0.4734, "step": 1164 }, { "epoch": 0.035505032419294626, "grad_norm": 0.07024332880973816, "learning_rate": 2.0875e-05, "loss": 0.3968, "step": 1165 }, { "epoch": 0.03553550884197213, "grad_norm": 0.07495395839214325, "learning_rate": 2.085e-05, "loss": 0.419, "step": 1166 }, { "epoch": 0.03556598526464964, "grad_norm": 0.07781948149204254, "learning_rate": 2.0825e-05, "loss": 0.4702, "step": 1167 }, { "epoch": 0.03559646168732714, "grad_norm": 0.0794079378247261, "learning_rate": 2.08e-05, "loss": 0.4804, "step": 1168 }, { "epoch": 0.03562693811000465, "grad_norm": 0.07464304566383362, "learning_rate": 2.0775e-05, "loss": 0.4692, "step": 1169 }, { "epoch": 0.035657414532682154, "grad_norm": 0.07628969848155975, "learning_rate": 2.075e-05, "loss": 0.4865, "step": 1170 }, { "epoch": 0.03568789095535966, "grad_norm": 0.07347798347473145, "learning_rate": 2.0725e-05, "loss": 0.4095, "step": 1171 }, { "epoch": 0.035718367378037165, "grad_norm": 0.07283708453178406, "learning_rate": 2.07e-05, "loss": 0.4342, "step": 1172 }, { "epoch": 0.03574884380071467, "grad_norm": 0.08202147483825684, "learning_rate": 2.0675e-05, "loss": 0.5221, "step": 1173 }, { "epoch": 0.035779320223392176, "grad_norm": 0.0768025666475296, "learning_rate": 2.065e-05, "loss": 0.4803, "step": 1174 }, { "epoch": 0.03580979664606968, "grad_norm": 0.07391737401485443, "learning_rate": 2.0625e-05, "loss": 0.3898, "step": 1175 }, { "epoch": 0.03584027306874719, "grad_norm": 0.07677934318780899, "learning_rate": 2.06e-05, "loss": 0.4294, "step": 1176 }, { "epoch": 0.0358707494914247, "grad_norm": 0.07796715945005417, "learning_rate": 2.0575e-05, "loss": 0.5415, "step": 1177 }, { "epoch": 0.035901225914102206, "grad_norm": 0.0745825543999672, "learning_rate": 2.055e-05, "loss": 0.4434, "step": 1178 }, { "epoch": 0.03593170233677971, "grad_norm": 0.07720299810171127, "learning_rate": 2.0525e-05, "loss": 0.4652, "step": 1179 }, { "epoch": 0.03596217875945722, "grad_norm": 0.08553807437419891, "learning_rate": 2.05e-05, "loss": 0.5047, "step": 1180 }, { "epoch": 0.03599265518213472, "grad_norm": 0.07886727899312973, "learning_rate": 2.0475e-05, "loss": 0.4567, "step": 1181 }, { "epoch": 0.03602313160481223, "grad_norm": 0.09122951328754425, "learning_rate": 2.045e-05, "loss": 0.5206, "step": 1182 }, { "epoch": 0.036053608027489734, "grad_norm": 0.07745413482189178, "learning_rate": 2.0425e-05, "loss": 0.4244, "step": 1183 }, { "epoch": 0.03608408445016724, "grad_norm": 0.08161917328834534, "learning_rate": 2.04e-05, "loss": 0.4644, "step": 1184 }, { "epoch": 0.036114560872844745, "grad_norm": 0.0715264305472374, "learning_rate": 2.0375e-05, "loss": 0.4041, "step": 1185 }, { "epoch": 0.03614503729552225, "grad_norm": 0.08074958622455597, "learning_rate": 2.035e-05, "loss": 0.4708, "step": 1186 }, { "epoch": 0.036175513718199756, "grad_norm": 0.08088172972202301, "learning_rate": 2.0325e-05, "loss": 0.4905, "step": 1187 }, { "epoch": 0.03620599014087726, "grad_norm": 0.08671984821557999, "learning_rate": 2.0300000000000002e-05, "loss": 0.6188, "step": 1188 }, { "epoch": 0.03623646656355477, "grad_norm": 0.07810551673173904, "learning_rate": 2.0275e-05, "loss": 0.3925, "step": 1189 }, { "epoch": 0.03626694298623227, "grad_norm": 0.07822590321302414, "learning_rate": 2.025e-05, "loss": 0.5208, "step": 1190 }, { "epoch": 0.036297419408909785, "grad_norm": 0.07870861142873764, "learning_rate": 2.0225000000000004e-05, "loss": 0.4608, "step": 1191 }, { "epoch": 0.03632789583158729, "grad_norm": 0.08404915779829025, "learning_rate": 2.0200000000000003e-05, "loss": 0.4985, "step": 1192 }, { "epoch": 0.0363583722542648, "grad_norm": 0.07792908698320389, "learning_rate": 2.0175000000000003e-05, "loss": 0.4578, "step": 1193 }, { "epoch": 0.0363888486769423, "grad_norm": 0.09054426103830338, "learning_rate": 2.0150000000000002e-05, "loss": 0.4623, "step": 1194 }, { "epoch": 0.03641932509961981, "grad_norm": 0.08585979789495468, "learning_rate": 2.0125e-05, "loss": 0.4885, "step": 1195 }, { "epoch": 0.03644980152229731, "grad_norm": 0.0759185180068016, "learning_rate": 2.01e-05, "loss": 0.4226, "step": 1196 }, { "epoch": 0.03648027794497482, "grad_norm": 0.07759178429841995, "learning_rate": 2.0075000000000003e-05, "loss": 0.4601, "step": 1197 }, { "epoch": 0.036510754367652325, "grad_norm": 0.08275208622217178, "learning_rate": 2.0050000000000003e-05, "loss": 0.4494, "step": 1198 }, { "epoch": 0.03654123079032983, "grad_norm": 0.07568763941526413, "learning_rate": 2.0025000000000002e-05, "loss": 0.4405, "step": 1199 }, { "epoch": 0.036571707213007336, "grad_norm": 0.08068037778139114, "learning_rate": 2e-05, "loss": 0.5111, "step": 1200 }, { "epoch": 0.03660218363568484, "grad_norm": 0.0741753801703453, "learning_rate": 1.9975e-05, "loss": 0.4562, "step": 1201 }, { "epoch": 0.03663266005836235, "grad_norm": 0.07798675447702408, "learning_rate": 1.995e-05, "loss": 0.4887, "step": 1202 }, { "epoch": 0.03666313648103985, "grad_norm": 0.08292707800865173, "learning_rate": 1.9925000000000003e-05, "loss": 0.4787, "step": 1203 }, { "epoch": 0.03669361290371736, "grad_norm": 0.07597538828849792, "learning_rate": 1.9900000000000003e-05, "loss": 0.3853, "step": 1204 }, { "epoch": 0.03672408932639487, "grad_norm": 0.08731599897146225, "learning_rate": 1.9875000000000002e-05, "loss": 0.4662, "step": 1205 }, { "epoch": 0.036754565749072377, "grad_norm": 0.07231872528791428, "learning_rate": 1.985e-05, "loss": 0.3849, "step": 1206 }, { "epoch": 0.03678504217174988, "grad_norm": 0.07160863280296326, "learning_rate": 1.9825e-05, "loss": 0.4233, "step": 1207 }, { "epoch": 0.03681551859442739, "grad_norm": 0.0762363150715828, "learning_rate": 1.9800000000000004e-05, "loss": 0.4441, "step": 1208 }, { "epoch": 0.03684599501710489, "grad_norm": 0.07061326503753662, "learning_rate": 1.9775000000000003e-05, "loss": 0.4129, "step": 1209 }, { "epoch": 0.0368764714397824, "grad_norm": 0.0764552652835846, "learning_rate": 1.9750000000000002e-05, "loss": 0.4945, "step": 1210 }, { "epoch": 0.036906947862459905, "grad_norm": 0.0739632397890091, "learning_rate": 1.9725000000000002e-05, "loss": 0.4594, "step": 1211 }, { "epoch": 0.03693742428513741, "grad_norm": 0.072509765625, "learning_rate": 1.97e-05, "loss": 0.3982, "step": 1212 }, { "epoch": 0.036967900707814916, "grad_norm": 0.08223304897546768, "learning_rate": 1.9675e-05, "loss": 0.4628, "step": 1213 }, { "epoch": 0.03699837713049242, "grad_norm": 0.08050574362277985, "learning_rate": 1.9650000000000003e-05, "loss": 0.494, "step": 1214 }, { "epoch": 0.03702885355316993, "grad_norm": 0.08207545429468155, "learning_rate": 1.9625000000000003e-05, "loss": 0.5023, "step": 1215 }, { "epoch": 0.03705932997584743, "grad_norm": 0.07208550721406937, "learning_rate": 1.9600000000000002e-05, "loss": 0.4129, "step": 1216 }, { "epoch": 0.03708980639852494, "grad_norm": 0.07769951969385147, "learning_rate": 1.9575e-05, "loss": 0.4303, "step": 1217 }, { "epoch": 0.037120282821202444, "grad_norm": 0.07532592862844467, "learning_rate": 1.955e-05, "loss": 0.4043, "step": 1218 }, { "epoch": 0.037150759243879956, "grad_norm": 0.08695685118436813, "learning_rate": 1.9525e-05, "loss": 0.5592, "step": 1219 }, { "epoch": 0.03718123566655746, "grad_norm": 0.07999631762504578, "learning_rate": 1.9500000000000003e-05, "loss": 0.4476, "step": 1220 }, { "epoch": 0.03721171208923497, "grad_norm": 0.07893343269824982, "learning_rate": 1.9475000000000002e-05, "loss": 0.4531, "step": 1221 }, { "epoch": 0.03724218851191247, "grad_norm": 0.07094209641218185, "learning_rate": 1.9450000000000002e-05, "loss": 0.4165, "step": 1222 }, { "epoch": 0.03727266493458998, "grad_norm": 0.07335719466209412, "learning_rate": 1.9425e-05, "loss": 0.4259, "step": 1223 }, { "epoch": 0.037303141357267484, "grad_norm": 0.07681035250425339, "learning_rate": 1.94e-05, "loss": 0.478, "step": 1224 }, { "epoch": 0.03733361777994499, "grad_norm": 0.07244287431240082, "learning_rate": 1.9375e-05, "loss": 0.4482, "step": 1225 }, { "epoch": 0.037364094202622496, "grad_norm": 0.0703640729188919, "learning_rate": 1.9350000000000003e-05, "loss": 0.3774, "step": 1226 }, { "epoch": 0.0373945706253, "grad_norm": 0.07905355840921402, "learning_rate": 1.9325000000000002e-05, "loss": 0.4847, "step": 1227 }, { "epoch": 0.03742504704797751, "grad_norm": 0.08674014359712601, "learning_rate": 1.93e-05, "loss": 0.5161, "step": 1228 }, { "epoch": 0.03745552347065501, "grad_norm": 0.07755231857299805, "learning_rate": 1.9275e-05, "loss": 0.4749, "step": 1229 }, { "epoch": 0.03748599989333252, "grad_norm": 0.07406751066446304, "learning_rate": 1.925e-05, "loss": 0.4157, "step": 1230 }, { "epoch": 0.037516476316010024, "grad_norm": 0.08149239420890808, "learning_rate": 1.9225e-05, "loss": 0.4002, "step": 1231 }, { "epoch": 0.037546952738687536, "grad_norm": 0.07023477554321289, "learning_rate": 1.9200000000000003e-05, "loss": 0.3804, "step": 1232 }, { "epoch": 0.03757742916136504, "grad_norm": 0.08459515869617462, "learning_rate": 1.9175000000000002e-05, "loss": 0.5369, "step": 1233 }, { "epoch": 0.03760790558404255, "grad_norm": 0.07303836941719055, "learning_rate": 1.915e-05, "loss": 0.4008, "step": 1234 }, { "epoch": 0.03763838200672005, "grad_norm": 0.07417949289083481, "learning_rate": 1.9125e-05, "loss": 0.4551, "step": 1235 }, { "epoch": 0.03766885842939756, "grad_norm": 0.09014063328504562, "learning_rate": 1.91e-05, "loss": 0.4222, "step": 1236 }, { "epoch": 0.037699334852075064, "grad_norm": 0.07361666113138199, "learning_rate": 1.9075000000000003e-05, "loss": 0.461, "step": 1237 }, { "epoch": 0.03772981127475257, "grad_norm": 0.076505146920681, "learning_rate": 1.9050000000000002e-05, "loss": 0.4269, "step": 1238 }, { "epoch": 0.037760287697430076, "grad_norm": 0.08712539076805115, "learning_rate": 1.9025e-05, "loss": 0.5857, "step": 1239 }, { "epoch": 0.03779076412010758, "grad_norm": 0.07382533699274063, "learning_rate": 1.9e-05, "loss": 0.4554, "step": 1240 }, { "epoch": 0.03782124054278509, "grad_norm": 0.08178618550300598, "learning_rate": 1.8975e-05, "loss": 0.4937, "step": 1241 }, { "epoch": 0.03785171696546259, "grad_norm": 0.07598119229078293, "learning_rate": 1.895e-05, "loss": 0.4479, "step": 1242 }, { "epoch": 0.0378821933881401, "grad_norm": 0.07282229512929916, "learning_rate": 1.8925000000000003e-05, "loss": 0.4081, "step": 1243 }, { "epoch": 0.037912669810817604, "grad_norm": 0.0718529224395752, "learning_rate": 1.8900000000000002e-05, "loss": 0.4307, "step": 1244 }, { "epoch": 0.03794314623349511, "grad_norm": 0.08369870483875275, "learning_rate": 1.8875e-05, "loss": 0.5558, "step": 1245 }, { "epoch": 0.03797362265617262, "grad_norm": 0.07728356868028641, "learning_rate": 1.885e-05, "loss": 0.4659, "step": 1246 }, { "epoch": 0.03800409907885013, "grad_norm": 0.07424064725637436, "learning_rate": 1.8825e-05, "loss": 0.4399, "step": 1247 }, { "epoch": 0.03803457550152763, "grad_norm": 0.07464594393968582, "learning_rate": 1.88e-05, "loss": 0.4429, "step": 1248 }, { "epoch": 0.03806505192420514, "grad_norm": 0.07556020468473434, "learning_rate": 1.8775000000000002e-05, "loss": 0.4582, "step": 1249 }, { "epoch": 0.038095528346882644, "grad_norm": 0.0755869448184967, "learning_rate": 1.8750000000000002e-05, "loss": 0.4944, "step": 1250 }, { "epoch": 0.03812600476956015, "grad_norm": 0.07357589155435562, "learning_rate": 1.8725e-05, "loss": 0.455, "step": 1251 }, { "epoch": 0.038156481192237655, "grad_norm": 0.06875104457139969, "learning_rate": 1.87e-05, "loss": 0.3694, "step": 1252 }, { "epoch": 0.03818695761491516, "grad_norm": 0.09266966581344604, "learning_rate": 1.8675e-05, "loss": 0.4946, "step": 1253 }, { "epoch": 0.03821743403759267, "grad_norm": 0.073293037712574, "learning_rate": 1.865e-05, "loss": 0.4177, "step": 1254 }, { "epoch": 0.03824791046027017, "grad_norm": 0.07828037440776825, "learning_rate": 1.8625000000000002e-05, "loss": 0.4475, "step": 1255 }, { "epoch": 0.03827838688294768, "grad_norm": 0.07746947556734085, "learning_rate": 1.86e-05, "loss": 0.446, "step": 1256 }, { "epoch": 0.03830886330562518, "grad_norm": 0.08287563174962997, "learning_rate": 1.8575e-05, "loss": 0.4994, "step": 1257 }, { "epoch": 0.03833933972830269, "grad_norm": 0.07723773270845413, "learning_rate": 1.855e-05, "loss": 0.4631, "step": 1258 }, { "epoch": 0.038369816150980195, "grad_norm": 0.08758226037025452, "learning_rate": 1.8525e-05, "loss": 0.5756, "step": 1259 }, { "epoch": 0.03840029257365771, "grad_norm": 0.08764380216598511, "learning_rate": 1.85e-05, "loss": 0.5305, "step": 1260 }, { "epoch": 0.03843076899633521, "grad_norm": 0.19418556988239288, "learning_rate": 1.8475000000000002e-05, "loss": 0.4831, "step": 1261 }, { "epoch": 0.03846124541901272, "grad_norm": 0.08309505134820938, "learning_rate": 1.845e-05, "loss": 0.4798, "step": 1262 }, { "epoch": 0.038491721841690224, "grad_norm": 0.08765313029289246, "learning_rate": 1.8425e-05, "loss": 0.478, "step": 1263 }, { "epoch": 0.03852219826436773, "grad_norm": 0.07946054637432098, "learning_rate": 1.84e-05, "loss": 0.5272, "step": 1264 }, { "epoch": 0.038552674687045235, "grad_norm": 0.07755643129348755, "learning_rate": 1.8375e-05, "loss": 0.4266, "step": 1265 }, { "epoch": 0.03858315110972274, "grad_norm": 0.07644887268543243, "learning_rate": 1.8350000000000002e-05, "loss": 0.4718, "step": 1266 }, { "epoch": 0.038613627532400246, "grad_norm": 0.0864272341132164, "learning_rate": 1.8325e-05, "loss": 0.5723, "step": 1267 }, { "epoch": 0.03864410395507775, "grad_norm": 0.07859522104263306, "learning_rate": 1.83e-05, "loss": 0.4579, "step": 1268 }, { "epoch": 0.03867458037775526, "grad_norm": 0.08723235130310059, "learning_rate": 1.8275e-05, "loss": 0.5646, "step": 1269 }, { "epoch": 0.03870505680043276, "grad_norm": 0.0810532420873642, "learning_rate": 1.825e-05, "loss": 0.5175, "step": 1270 }, { "epoch": 0.03873553322311027, "grad_norm": 0.0706227496266365, "learning_rate": 1.8225e-05, "loss": 0.4075, "step": 1271 }, { "epoch": 0.038766009645787775, "grad_norm": 0.07583838701248169, "learning_rate": 1.8200000000000002e-05, "loss": 0.4286, "step": 1272 }, { "epoch": 0.03879648606846528, "grad_norm": 0.08726687729358673, "learning_rate": 1.8175e-05, "loss": 0.5414, "step": 1273 }, { "epoch": 0.03882696249114279, "grad_norm": 0.07915574312210083, "learning_rate": 1.815e-05, "loss": 0.4739, "step": 1274 }, { "epoch": 0.0388574389138203, "grad_norm": 0.08250943571329117, "learning_rate": 1.8125e-05, "loss": 0.5209, "step": 1275 }, { "epoch": 0.038887915336497804, "grad_norm": 0.07677184790372849, "learning_rate": 1.81e-05, "loss": 0.4572, "step": 1276 }, { "epoch": 0.03891839175917531, "grad_norm": 0.0822749137878418, "learning_rate": 1.8075e-05, "loss": 0.4909, "step": 1277 }, { "epoch": 0.038948868181852815, "grad_norm": 0.07724782079458237, "learning_rate": 1.805e-05, "loss": 0.496, "step": 1278 }, { "epoch": 0.03897934460453032, "grad_norm": 0.08193520456552505, "learning_rate": 1.8025e-05, "loss": 0.556, "step": 1279 }, { "epoch": 0.039009821027207826, "grad_norm": 0.08037828654050827, "learning_rate": 1.8e-05, "loss": 0.541, "step": 1280 }, { "epoch": 0.03904029744988533, "grad_norm": 0.07217898219823837, "learning_rate": 1.7975e-05, "loss": 0.44, "step": 1281 }, { "epoch": 0.03907077387256284, "grad_norm": 0.07741450518369675, "learning_rate": 1.795e-05, "loss": 0.4886, "step": 1282 }, { "epoch": 0.03910125029524034, "grad_norm": 0.07329129427671432, "learning_rate": 1.7925e-05, "loss": 0.443, "step": 1283 }, { "epoch": 0.03913172671791785, "grad_norm": 0.0705774575471878, "learning_rate": 1.79e-05, "loss": 0.388, "step": 1284 }, { "epoch": 0.039162203140595354, "grad_norm": 0.07700534909963608, "learning_rate": 1.7875e-05, "loss": 0.4541, "step": 1285 }, { "epoch": 0.03919267956327286, "grad_norm": 0.07576808333396912, "learning_rate": 1.785e-05, "loss": 0.4603, "step": 1286 }, { "epoch": 0.03922315598595037, "grad_norm": 0.07201448082923889, "learning_rate": 1.7825e-05, "loss": 0.4406, "step": 1287 }, { "epoch": 0.03925363240862788, "grad_norm": 0.06906180828809738, "learning_rate": 1.78e-05, "loss": 0.4295, "step": 1288 }, { "epoch": 0.039284108831305384, "grad_norm": 0.0749930664896965, "learning_rate": 1.7775e-05, "loss": 0.4351, "step": 1289 }, { "epoch": 0.03931458525398289, "grad_norm": 0.08230794221162796, "learning_rate": 1.775e-05, "loss": 0.3935, "step": 1290 }, { "epoch": 0.039345061676660395, "grad_norm": 0.07884281128644943, "learning_rate": 1.7725e-05, "loss": 0.4385, "step": 1291 }, { "epoch": 0.0393755380993379, "grad_norm": 0.07391180098056793, "learning_rate": 1.77e-05, "loss": 0.4198, "step": 1292 }, { "epoch": 0.039406014522015406, "grad_norm": 0.07893234491348267, "learning_rate": 1.7675e-05, "loss": 0.4893, "step": 1293 }, { "epoch": 0.03943649094469291, "grad_norm": 0.07637498527765274, "learning_rate": 1.765e-05, "loss": 0.4134, "step": 1294 }, { "epoch": 0.03946696736737042, "grad_norm": 0.07906269282102585, "learning_rate": 1.7625e-05, "loss": 0.4643, "step": 1295 }, { "epoch": 0.03949744379004792, "grad_norm": 0.08669058233499527, "learning_rate": 1.76e-05, "loss": 0.5243, "step": 1296 }, { "epoch": 0.03952792021272543, "grad_norm": 0.07281150668859482, "learning_rate": 1.7575e-05, "loss": 0.4245, "step": 1297 }, { "epoch": 0.039558396635402934, "grad_norm": 0.07432803511619568, "learning_rate": 1.755e-05, "loss": 0.4213, "step": 1298 }, { "epoch": 0.03958887305808044, "grad_norm": 0.06927355378866196, "learning_rate": 1.7525e-05, "loss": 0.3857, "step": 1299 }, { "epoch": 0.039619349480757945, "grad_norm": 0.07519862800836563, "learning_rate": 1.75e-05, "loss": 0.4312, "step": 1300 }, { "epoch": 0.03964982590343546, "grad_norm": 0.075869120657444, "learning_rate": 1.7475e-05, "loss": 0.4424, "step": 1301 }, { "epoch": 0.039680302326112964, "grad_norm": 0.07063412666320801, "learning_rate": 1.745e-05, "loss": 0.4157, "step": 1302 }, { "epoch": 0.03971077874879047, "grad_norm": 0.08617142587900162, "learning_rate": 1.7425e-05, "loss": 0.5357, "step": 1303 }, { "epoch": 0.039741255171467975, "grad_norm": 0.07373583316802979, "learning_rate": 1.74e-05, "loss": 0.4181, "step": 1304 }, { "epoch": 0.03977173159414548, "grad_norm": 0.07437223196029663, "learning_rate": 1.7375e-05, "loss": 0.4516, "step": 1305 }, { "epoch": 0.039802208016822986, "grad_norm": 0.07732459157705307, "learning_rate": 1.7349999999999998e-05, "loss": 0.4003, "step": 1306 }, { "epoch": 0.03983268443950049, "grad_norm": 0.07903821766376495, "learning_rate": 1.7325e-05, "loss": 0.513, "step": 1307 }, { "epoch": 0.039863160862178, "grad_norm": 0.08046909421682358, "learning_rate": 1.73e-05, "loss": 0.4922, "step": 1308 }, { "epoch": 0.0398936372848555, "grad_norm": 0.0848076269030571, "learning_rate": 1.7275e-05, "loss": 0.5094, "step": 1309 }, { "epoch": 0.03992411370753301, "grad_norm": 0.07986974716186523, "learning_rate": 1.725e-05, "loss": 0.5305, "step": 1310 }, { "epoch": 0.039954590130210514, "grad_norm": 0.08204922080039978, "learning_rate": 1.7225e-05, "loss": 0.5376, "step": 1311 }, { "epoch": 0.03998506655288802, "grad_norm": 0.0841006487607956, "learning_rate": 1.7199999999999998e-05, "loss": 0.5722, "step": 1312 }, { "epoch": 0.040015542975565525, "grad_norm": 0.07188112288713455, "learning_rate": 1.7175e-05, "loss": 0.4555, "step": 1313 }, { "epoch": 0.04004601939824303, "grad_norm": 0.07414884120225906, "learning_rate": 1.7150000000000004e-05, "loss": 0.4273, "step": 1314 }, { "epoch": 0.040076495820920544, "grad_norm": 0.08127425611019135, "learning_rate": 1.7125000000000003e-05, "loss": 0.5072, "step": 1315 }, { "epoch": 0.04010697224359805, "grad_norm": 0.06895403563976288, "learning_rate": 1.7100000000000002e-05, "loss": 0.3862, "step": 1316 }, { "epoch": 0.040137448666275555, "grad_norm": 0.07139305770397186, "learning_rate": 1.7075e-05, "loss": 0.4108, "step": 1317 }, { "epoch": 0.04016792508895306, "grad_norm": 0.08269815146923065, "learning_rate": 1.705e-05, "loss": 0.4733, "step": 1318 }, { "epoch": 0.040198401511630566, "grad_norm": 0.07380036264657974, "learning_rate": 1.7025e-05, "loss": 0.4376, "step": 1319 }, { "epoch": 0.04022887793430807, "grad_norm": 0.0878400057554245, "learning_rate": 1.7000000000000003e-05, "loss": 0.4823, "step": 1320 }, { "epoch": 0.04025935435698558, "grad_norm": 0.07100950926542282, "learning_rate": 1.6975000000000003e-05, "loss": 0.3904, "step": 1321 }, { "epoch": 0.04028983077966308, "grad_norm": 0.07543230801820755, "learning_rate": 1.6950000000000002e-05, "loss": 0.4158, "step": 1322 }, { "epoch": 0.04032030720234059, "grad_norm": 0.07148663699626923, "learning_rate": 1.6925e-05, "loss": 0.4065, "step": 1323 }, { "epoch": 0.040350783625018094, "grad_norm": 0.06970249861478806, "learning_rate": 1.69e-05, "loss": 0.4002, "step": 1324 }, { "epoch": 0.0403812600476956, "grad_norm": 0.08213993906974792, "learning_rate": 1.6875000000000004e-05, "loss": 0.5263, "step": 1325 }, { "epoch": 0.040411736470373105, "grad_norm": 0.07497018575668335, "learning_rate": 1.6850000000000003e-05, "loss": 0.4498, "step": 1326 }, { "epoch": 0.04044221289305061, "grad_norm": 0.07344295084476471, "learning_rate": 1.6825000000000002e-05, "loss": 0.3977, "step": 1327 }, { "epoch": 0.040472689315728116, "grad_norm": 0.0752490758895874, "learning_rate": 1.6800000000000002e-05, "loss": 0.4209, "step": 1328 }, { "epoch": 0.04050316573840563, "grad_norm": 0.08633843064308167, "learning_rate": 1.6775e-05, "loss": 0.532, "step": 1329 }, { "epoch": 0.040533642161083135, "grad_norm": 0.07450416684150696, "learning_rate": 1.675e-05, "loss": 0.4435, "step": 1330 }, { "epoch": 0.04056411858376064, "grad_norm": 0.0810224860906601, "learning_rate": 1.6725000000000003e-05, "loss": 0.4607, "step": 1331 }, { "epoch": 0.040594595006438146, "grad_norm": 0.0794248878955841, "learning_rate": 1.6700000000000003e-05, "loss": 0.4462, "step": 1332 }, { "epoch": 0.04062507142911565, "grad_norm": 0.07931298762559891, "learning_rate": 1.6675000000000002e-05, "loss": 0.4444, "step": 1333 }, { "epoch": 0.04065554785179316, "grad_norm": 0.0816156342625618, "learning_rate": 1.665e-05, "loss": 0.4856, "step": 1334 }, { "epoch": 0.04068602427447066, "grad_norm": 0.07542359083890915, "learning_rate": 1.6625e-05, "loss": 0.4295, "step": 1335 }, { "epoch": 0.04071650069714817, "grad_norm": 0.07764028012752533, "learning_rate": 1.66e-05, "loss": 0.4539, "step": 1336 }, { "epoch": 0.040746977119825674, "grad_norm": 0.07967660576105118, "learning_rate": 1.6575000000000003e-05, "loss": 0.4628, "step": 1337 }, { "epoch": 0.04077745354250318, "grad_norm": 0.07214836031198502, "learning_rate": 1.6550000000000002e-05, "loss": 0.408, "step": 1338 }, { "epoch": 0.040807929965180685, "grad_norm": 0.08198299258947372, "learning_rate": 1.6525000000000002e-05, "loss": 0.4604, "step": 1339 }, { "epoch": 0.04083840638785819, "grad_norm": 0.07514683157205582, "learning_rate": 1.65e-05, "loss": 0.4098, "step": 1340 }, { "epoch": 0.040868882810535696, "grad_norm": 0.0824052169919014, "learning_rate": 1.6475e-05, "loss": 0.5145, "step": 1341 }, { "epoch": 0.04089935923321321, "grad_norm": 0.07004890590906143, "learning_rate": 1.645e-05, "loss": 0.4169, "step": 1342 }, { "epoch": 0.040929835655890714, "grad_norm": 0.07192550599575043, "learning_rate": 1.6425000000000003e-05, "loss": 0.391, "step": 1343 }, { "epoch": 0.04096031207856822, "grad_norm": 0.0827094167470932, "learning_rate": 1.6400000000000002e-05, "loss": 0.4576, "step": 1344 }, { "epoch": 0.040990788501245726, "grad_norm": 0.07640732824802399, "learning_rate": 1.6375e-05, "loss": 0.4672, "step": 1345 }, { "epoch": 0.04102126492392323, "grad_norm": 0.08194056898355484, "learning_rate": 1.635e-05, "loss": 0.443, "step": 1346 }, { "epoch": 0.04105174134660074, "grad_norm": 0.08737534284591675, "learning_rate": 1.6325e-05, "loss": 0.5544, "step": 1347 }, { "epoch": 0.04108221776927824, "grad_norm": 0.07147364318370819, "learning_rate": 1.63e-05, "loss": 0.3979, "step": 1348 }, { "epoch": 0.04111269419195575, "grad_norm": 0.07589215785264969, "learning_rate": 1.6275000000000003e-05, "loss": 0.4731, "step": 1349 }, { "epoch": 0.041143170614633254, "grad_norm": 0.0841900110244751, "learning_rate": 1.6250000000000002e-05, "loss": 0.4957, "step": 1350 }, { "epoch": 0.04117364703731076, "grad_norm": 0.07811616361141205, "learning_rate": 1.6225e-05, "loss": 0.3695, "step": 1351 }, { "epoch": 0.041204123459988265, "grad_norm": 0.08196917176246643, "learning_rate": 1.62e-05, "loss": 0.4785, "step": 1352 }, { "epoch": 0.04123459988266577, "grad_norm": 0.0879179984331131, "learning_rate": 1.6175e-05, "loss": 0.5787, "step": 1353 }, { "epoch": 0.041265076305343276, "grad_norm": 0.07145990431308746, "learning_rate": 1.6150000000000003e-05, "loss": 0.3674, "step": 1354 }, { "epoch": 0.04129555272802078, "grad_norm": 0.07658839970827103, "learning_rate": 1.6125000000000002e-05, "loss": 0.4768, "step": 1355 }, { "epoch": 0.041326029150698294, "grad_norm": 0.0794522687792778, "learning_rate": 1.6100000000000002e-05, "loss": 0.4687, "step": 1356 }, { "epoch": 0.0413565055733758, "grad_norm": 0.07868267595767975, "learning_rate": 1.6075e-05, "loss": 0.4903, "step": 1357 }, { "epoch": 0.041386981996053306, "grad_norm": 0.0771985873579979, "learning_rate": 1.605e-05, "loss": 0.4618, "step": 1358 }, { "epoch": 0.04141745841873081, "grad_norm": 0.07792288064956665, "learning_rate": 1.6025e-05, "loss": 0.4639, "step": 1359 }, { "epoch": 0.04144793484140832, "grad_norm": 0.07387889176607132, "learning_rate": 1.6000000000000003e-05, "loss": 0.4081, "step": 1360 }, { "epoch": 0.04147841126408582, "grad_norm": 0.08136799931526184, "learning_rate": 1.5975000000000002e-05, "loss": 0.4985, "step": 1361 }, { "epoch": 0.04150888768676333, "grad_norm": 0.0819934755563736, "learning_rate": 1.595e-05, "loss": 0.4757, "step": 1362 }, { "epoch": 0.041539364109440834, "grad_norm": 0.080718033015728, "learning_rate": 1.5925e-05, "loss": 0.5134, "step": 1363 }, { "epoch": 0.04156984053211834, "grad_norm": 0.0758088082075119, "learning_rate": 1.59e-05, "loss": 0.4543, "step": 1364 }, { "epoch": 0.041600316954795845, "grad_norm": 0.06949636340141296, "learning_rate": 1.5875e-05, "loss": 0.3899, "step": 1365 }, { "epoch": 0.04163079337747335, "grad_norm": 0.08283908665180206, "learning_rate": 1.5850000000000002e-05, "loss": 0.4816, "step": 1366 }, { "epoch": 0.041661269800150856, "grad_norm": 0.127402201294899, "learning_rate": 1.5825000000000002e-05, "loss": 0.4355, "step": 1367 }, { "epoch": 0.04169174622282836, "grad_norm": 0.07779651880264282, "learning_rate": 1.58e-05, "loss": 0.4879, "step": 1368 }, { "epoch": 0.04172222264550587, "grad_norm": 0.0884762778878212, "learning_rate": 1.5775e-05, "loss": 0.5863, "step": 1369 }, { "epoch": 0.04175269906818338, "grad_norm": 0.07251448184251785, "learning_rate": 1.575e-05, "loss": 0.4659, "step": 1370 }, { "epoch": 0.041783175490860885, "grad_norm": 0.07333655655384064, "learning_rate": 1.5725e-05, "loss": 0.4441, "step": 1371 }, { "epoch": 0.04181365191353839, "grad_norm": 0.07164372503757477, "learning_rate": 1.5700000000000002e-05, "loss": 0.3952, "step": 1372 }, { "epoch": 0.0418441283362159, "grad_norm": 0.0811358317732811, "learning_rate": 1.5675e-05, "loss": 0.4957, "step": 1373 }, { "epoch": 0.0418746047588934, "grad_norm": 0.08031062036752701, "learning_rate": 1.565e-05, "loss": 0.5266, "step": 1374 }, { "epoch": 0.04190508118157091, "grad_norm": 0.0779448002576828, "learning_rate": 1.5625e-05, "loss": 0.4621, "step": 1375 }, { "epoch": 0.041935557604248413, "grad_norm": 0.07950543612241745, "learning_rate": 1.56e-05, "loss": 0.4864, "step": 1376 }, { "epoch": 0.04196603402692592, "grad_norm": 0.07414623349905014, "learning_rate": 1.5575e-05, "loss": 0.4162, "step": 1377 }, { "epoch": 0.041996510449603425, "grad_norm": 0.06943826377391815, "learning_rate": 1.5550000000000002e-05, "loss": 0.4311, "step": 1378 }, { "epoch": 0.04202698687228093, "grad_norm": 0.09015795588493347, "learning_rate": 1.5525e-05, "loss": 0.5526, "step": 1379 }, { "epoch": 0.042057463294958436, "grad_norm": 0.07757078856229782, "learning_rate": 1.55e-05, "loss": 0.4618, "step": 1380 }, { "epoch": 0.04208793971763594, "grad_norm": 0.0741356909275055, "learning_rate": 1.5475e-05, "loss": 0.4105, "step": 1381 }, { "epoch": 0.04211841614031345, "grad_norm": 0.0724051222205162, "learning_rate": 1.545e-05, "loss": 0.4195, "step": 1382 }, { "epoch": 0.04214889256299095, "grad_norm": 0.07285245507955551, "learning_rate": 1.5425000000000002e-05, "loss": 0.4208, "step": 1383 }, { "epoch": 0.042179368985668465, "grad_norm": 0.08056759834289551, "learning_rate": 1.54e-05, "loss": 0.4881, "step": 1384 }, { "epoch": 0.04220984540834597, "grad_norm": 0.07745154201984406, "learning_rate": 1.5375e-05, "loss": 0.4697, "step": 1385 }, { "epoch": 0.04224032183102348, "grad_norm": 0.07305365800857544, "learning_rate": 1.535e-05, "loss": 0.4626, "step": 1386 }, { "epoch": 0.04227079825370098, "grad_norm": 0.07827824354171753, "learning_rate": 1.5325e-05, "loss": 0.4693, "step": 1387 }, { "epoch": 0.04230127467637849, "grad_norm": 0.08982706069946289, "learning_rate": 1.53e-05, "loss": 0.5624, "step": 1388 }, { "epoch": 0.04233175109905599, "grad_norm": 0.07565366476774216, "learning_rate": 1.5275000000000002e-05, "loss": 0.4629, "step": 1389 }, { "epoch": 0.0423622275217335, "grad_norm": 0.07691759616136551, "learning_rate": 1.525e-05, "loss": 0.495, "step": 1390 }, { "epoch": 0.042392703944411005, "grad_norm": 0.07332286983728409, "learning_rate": 1.5225e-05, "loss": 0.447, "step": 1391 }, { "epoch": 0.04242318036708851, "grad_norm": 0.08093613386154175, "learning_rate": 1.52e-05, "loss": 0.4826, "step": 1392 }, { "epoch": 0.042453656789766016, "grad_norm": 0.08450298756361008, "learning_rate": 1.5175e-05, "loss": 0.5489, "step": 1393 }, { "epoch": 0.04248413321244352, "grad_norm": 0.07857246696949005, "learning_rate": 1.515e-05, "loss": 0.5218, "step": 1394 }, { "epoch": 0.04251460963512103, "grad_norm": 0.08307208120822906, "learning_rate": 1.5125e-05, "loss": 0.55, "step": 1395 }, { "epoch": 0.04254508605779853, "grad_norm": 0.08828859776258469, "learning_rate": 1.51e-05, "loss": 0.5447, "step": 1396 }, { "epoch": 0.042575562480476045, "grad_norm": 0.07342839986085892, "learning_rate": 1.5075e-05, "loss": 0.4692, "step": 1397 }, { "epoch": 0.04260603890315355, "grad_norm": 0.08195146918296814, "learning_rate": 1.505e-05, "loss": 0.4144, "step": 1398 }, { "epoch": 0.042636515325831056, "grad_norm": 0.07663796097040176, "learning_rate": 1.5025000000000001e-05, "loss": 0.4749, "step": 1399 }, { "epoch": 0.04266699174850856, "grad_norm": 0.07473456114530563, "learning_rate": 1.5e-05, "loss": 0.3314, "step": 1400 }, { "epoch": 0.04269746817118607, "grad_norm": 0.07448071241378784, "learning_rate": 1.4975e-05, "loss": 0.4536, "step": 1401 }, { "epoch": 0.04272794459386357, "grad_norm": 0.08641796559095383, "learning_rate": 1.4950000000000001e-05, "loss": 0.5049, "step": 1402 }, { "epoch": 0.04275842101654108, "grad_norm": 0.0777096077799797, "learning_rate": 1.4925e-05, "loss": 0.3811, "step": 1403 }, { "epoch": 0.042788897439218584, "grad_norm": 0.07325203716754913, "learning_rate": 1.49e-05, "loss": 0.4331, "step": 1404 }, { "epoch": 0.04281937386189609, "grad_norm": 0.08329159021377563, "learning_rate": 1.4875e-05, "loss": 0.525, "step": 1405 }, { "epoch": 0.042849850284573596, "grad_norm": 0.07711724191904068, "learning_rate": 1.485e-05, "loss": 0.5006, "step": 1406 }, { "epoch": 0.0428803267072511, "grad_norm": 0.07511943578720093, "learning_rate": 1.4825e-05, "loss": 0.4492, "step": 1407 }, { "epoch": 0.04291080312992861, "grad_norm": 0.0826931744813919, "learning_rate": 1.48e-05, "loss": 0.5101, "step": 1408 }, { "epoch": 0.04294127955260611, "grad_norm": 0.07374748587608337, "learning_rate": 1.4775e-05, "loss": 0.4067, "step": 1409 }, { "epoch": 0.04297175597528362, "grad_norm": 0.07727071642875671, "learning_rate": 1.475e-05, "loss": 0.4859, "step": 1410 }, { "epoch": 0.04300223239796113, "grad_norm": 0.07446695119142532, "learning_rate": 1.4725e-05, "loss": 0.4596, "step": 1411 }, { "epoch": 0.043032708820638636, "grad_norm": 0.09568074345588684, "learning_rate": 1.47e-05, "loss": 0.4934, "step": 1412 }, { "epoch": 0.04306318524331614, "grad_norm": 0.07729993760585785, "learning_rate": 1.4675e-05, "loss": 0.4416, "step": 1413 }, { "epoch": 0.04309366166599365, "grad_norm": 0.0782594084739685, "learning_rate": 1.465e-05, "loss": 0.4922, "step": 1414 }, { "epoch": 0.04312413808867115, "grad_norm": 0.08092372119426727, "learning_rate": 1.4625e-05, "loss": 0.5551, "step": 1415 }, { "epoch": 0.04315461451134866, "grad_norm": 0.07719599455595016, "learning_rate": 1.4599999999999999e-05, "loss": 0.5078, "step": 1416 }, { "epoch": 0.043185090934026164, "grad_norm": 0.07646114379167557, "learning_rate": 1.4575e-05, "loss": 0.4799, "step": 1417 }, { "epoch": 0.04321556735670367, "grad_norm": 0.08060050010681152, "learning_rate": 1.455e-05, "loss": 0.4847, "step": 1418 }, { "epoch": 0.043246043779381176, "grad_norm": 0.07836181670427322, "learning_rate": 1.4524999999999999e-05, "loss": 0.4849, "step": 1419 }, { "epoch": 0.04327652020205868, "grad_norm": 0.07964999973773956, "learning_rate": 1.45e-05, "loss": 0.5031, "step": 1420 }, { "epoch": 0.04330699662473619, "grad_norm": 0.06717615574598312, "learning_rate": 1.4475e-05, "loss": 0.3788, "step": 1421 }, { "epoch": 0.04333747304741369, "grad_norm": 0.07017872482538223, "learning_rate": 1.4449999999999999e-05, "loss": 0.3979, "step": 1422 }, { "epoch": 0.0433679494700912, "grad_norm": 0.07596758753061295, "learning_rate": 1.4425e-05, "loss": 0.4836, "step": 1423 }, { "epoch": 0.043398425892768704, "grad_norm": 0.07396314293146133, "learning_rate": 1.44e-05, "loss": 0.4246, "step": 1424 }, { "epoch": 0.043428902315446216, "grad_norm": 0.08051574975252151, "learning_rate": 1.4374999999999999e-05, "loss": 0.435, "step": 1425 }, { "epoch": 0.04345937873812372, "grad_norm": 0.1934771090745926, "learning_rate": 1.435e-05, "loss": 0.591, "step": 1426 }, { "epoch": 0.04348985516080123, "grad_norm": 0.07806826382875443, "learning_rate": 1.4325e-05, "loss": 0.37, "step": 1427 }, { "epoch": 0.04352033158347873, "grad_norm": 0.07940944284200668, "learning_rate": 1.43e-05, "loss": 0.5217, "step": 1428 }, { "epoch": 0.04355080800615624, "grad_norm": 0.08246316015720367, "learning_rate": 1.4275e-05, "loss": 0.5142, "step": 1429 }, { "epoch": 0.043581284428833744, "grad_norm": 0.09141865372657776, "learning_rate": 1.4249999999999999e-05, "loss": 0.5191, "step": 1430 }, { "epoch": 0.04361176085151125, "grad_norm": 0.07262786477804184, "learning_rate": 1.4225e-05, "loss": 0.4278, "step": 1431 }, { "epoch": 0.043642237274188755, "grad_norm": 0.07474478334188461, "learning_rate": 1.42e-05, "loss": 0.4663, "step": 1432 }, { "epoch": 0.04367271369686626, "grad_norm": 0.07299743592739105, "learning_rate": 1.4174999999999999e-05, "loss": 0.4558, "step": 1433 }, { "epoch": 0.04370319011954377, "grad_norm": 0.06480547040700912, "learning_rate": 1.415e-05, "loss": 0.3831, "step": 1434 }, { "epoch": 0.04373366654222127, "grad_norm": 0.08862403780221939, "learning_rate": 1.4125e-05, "loss": 0.5612, "step": 1435 }, { "epoch": 0.04376414296489878, "grad_norm": 0.07097696512937546, "learning_rate": 1.4099999999999999e-05, "loss": 0.3946, "step": 1436 }, { "epoch": 0.043794619387576283, "grad_norm": 0.08596453815698624, "learning_rate": 1.4075e-05, "loss": 0.4981, "step": 1437 }, { "epoch": 0.04382509581025379, "grad_norm": 0.07930775731801987, "learning_rate": 1.4050000000000003e-05, "loss": 0.4674, "step": 1438 }, { "epoch": 0.0438555722329313, "grad_norm": 0.08267464488744736, "learning_rate": 1.4025000000000002e-05, "loss": 0.4617, "step": 1439 }, { "epoch": 0.04388604865560881, "grad_norm": 0.08595051616430283, "learning_rate": 1.4000000000000001e-05, "loss": 0.5556, "step": 1440 }, { "epoch": 0.04391652507828631, "grad_norm": 0.07905665785074234, "learning_rate": 1.3975000000000003e-05, "loss": 0.4859, "step": 1441 }, { "epoch": 0.04394700150096382, "grad_norm": 0.08077443391084671, "learning_rate": 1.3950000000000002e-05, "loss": 0.4961, "step": 1442 }, { "epoch": 0.043977477923641324, "grad_norm": 0.08551418036222458, "learning_rate": 1.3925000000000001e-05, "loss": 0.5332, "step": 1443 }, { "epoch": 0.04400795434631883, "grad_norm": 0.08028308302164078, "learning_rate": 1.3900000000000002e-05, "loss": 0.4949, "step": 1444 }, { "epoch": 0.044038430768996335, "grad_norm": 0.07687711715698242, "learning_rate": 1.3875000000000002e-05, "loss": 0.4874, "step": 1445 }, { "epoch": 0.04406890719167384, "grad_norm": 0.07333803921937943, "learning_rate": 1.3850000000000001e-05, "loss": 0.4293, "step": 1446 }, { "epoch": 0.044099383614351347, "grad_norm": 0.08115465193986893, "learning_rate": 1.3825000000000002e-05, "loss": 0.5239, "step": 1447 }, { "epoch": 0.04412986003702885, "grad_norm": 0.07846251875162125, "learning_rate": 1.3800000000000002e-05, "loss": 0.4807, "step": 1448 }, { "epoch": 0.04416033645970636, "grad_norm": 0.07172375172376633, "learning_rate": 1.3775000000000001e-05, "loss": 0.4196, "step": 1449 }, { "epoch": 0.04419081288238386, "grad_norm": 0.07956532388925552, "learning_rate": 1.3750000000000002e-05, "loss": 0.4848, "step": 1450 }, { "epoch": 0.04422128930506137, "grad_norm": 0.07288660854101181, "learning_rate": 1.3725000000000002e-05, "loss": 0.3994, "step": 1451 }, { "epoch": 0.044251765727738875, "grad_norm": 0.08195853978395462, "learning_rate": 1.3700000000000001e-05, "loss": 0.493, "step": 1452 }, { "epoch": 0.04428224215041639, "grad_norm": 0.0703754872083664, "learning_rate": 1.3675000000000002e-05, "loss": 0.3787, "step": 1453 }, { "epoch": 0.04431271857309389, "grad_norm": 0.08334983140230179, "learning_rate": 1.3650000000000001e-05, "loss": 0.4741, "step": 1454 }, { "epoch": 0.0443431949957714, "grad_norm": 0.07160279154777527, "learning_rate": 1.3625e-05, "loss": 0.4121, "step": 1455 }, { "epoch": 0.044373671418448904, "grad_norm": 0.07017668336629868, "learning_rate": 1.3600000000000002e-05, "loss": 0.3477, "step": 1456 }, { "epoch": 0.04440414784112641, "grad_norm": 0.07109489291906357, "learning_rate": 1.3575000000000001e-05, "loss": 0.4086, "step": 1457 }, { "epoch": 0.044434624263803915, "grad_norm": 0.0873413234949112, "learning_rate": 1.3550000000000002e-05, "loss": 0.5219, "step": 1458 }, { "epoch": 0.04446510068648142, "grad_norm": 0.08211690187454224, "learning_rate": 1.3525000000000002e-05, "loss": 0.5062, "step": 1459 }, { "epoch": 0.044495577109158926, "grad_norm": 0.07990244030952454, "learning_rate": 1.3500000000000001e-05, "loss": 0.4869, "step": 1460 }, { "epoch": 0.04452605353183643, "grad_norm": 0.07822046428918839, "learning_rate": 1.3475000000000002e-05, "loss": 0.4578, "step": 1461 }, { "epoch": 0.04455652995451394, "grad_norm": 0.08237213641405106, "learning_rate": 1.3450000000000002e-05, "loss": 0.4071, "step": 1462 }, { "epoch": 0.04458700637719144, "grad_norm": 0.07966063916683197, "learning_rate": 1.3425000000000001e-05, "loss": 0.4871, "step": 1463 }, { "epoch": 0.04461748279986895, "grad_norm": 0.07626551389694214, "learning_rate": 1.3400000000000002e-05, "loss": 0.4598, "step": 1464 }, { "epoch": 0.044647959222546454, "grad_norm": 0.08216128498315811, "learning_rate": 1.3375000000000002e-05, "loss": 0.5012, "step": 1465 }, { "epoch": 0.04467843564522397, "grad_norm": 0.07186061888933182, "learning_rate": 1.3350000000000001e-05, "loss": 0.408, "step": 1466 }, { "epoch": 0.04470891206790147, "grad_norm": 0.07121529430150986, "learning_rate": 1.3325000000000002e-05, "loss": 0.3997, "step": 1467 }, { "epoch": 0.04473938849057898, "grad_norm": 0.07302889227867126, "learning_rate": 1.3300000000000001e-05, "loss": 0.4438, "step": 1468 }, { "epoch": 0.044769864913256484, "grad_norm": 0.07514402270317078, "learning_rate": 1.3275e-05, "loss": 0.4989, "step": 1469 }, { "epoch": 0.04480034133593399, "grad_norm": 0.08196177333593369, "learning_rate": 1.3250000000000002e-05, "loss": 0.4492, "step": 1470 }, { "epoch": 0.044830817758611495, "grad_norm": 0.08083102107048035, "learning_rate": 1.3225000000000001e-05, "loss": 0.4876, "step": 1471 }, { "epoch": 0.044861294181289, "grad_norm": 0.07384220510721207, "learning_rate": 1.32e-05, "loss": 0.3914, "step": 1472 }, { "epoch": 0.044891770603966506, "grad_norm": 0.07721889764070511, "learning_rate": 1.3175000000000002e-05, "loss": 0.4163, "step": 1473 }, { "epoch": 0.04492224702664401, "grad_norm": 0.07626696676015854, "learning_rate": 1.3150000000000001e-05, "loss": 0.4573, "step": 1474 }, { "epoch": 0.04495272344932152, "grad_norm": 0.07802897691726685, "learning_rate": 1.3125e-05, "loss": 0.4802, "step": 1475 }, { "epoch": 0.04498319987199902, "grad_norm": 0.07238595187664032, "learning_rate": 1.3100000000000002e-05, "loss": 0.4184, "step": 1476 }, { "epoch": 0.04501367629467653, "grad_norm": 0.08563128113746643, "learning_rate": 1.3075000000000001e-05, "loss": 0.4999, "step": 1477 }, { "epoch": 0.045044152717354034, "grad_norm": 0.07522845268249512, "learning_rate": 1.305e-05, "loss": 0.4162, "step": 1478 }, { "epoch": 0.04507462914003154, "grad_norm": 0.07230788469314575, "learning_rate": 1.3025000000000002e-05, "loss": 0.4259, "step": 1479 }, { "epoch": 0.04510510556270905, "grad_norm": 0.07959429174661636, "learning_rate": 1.3000000000000001e-05, "loss": 0.4784, "step": 1480 }, { "epoch": 0.04513558198538656, "grad_norm": 0.07231426239013672, "learning_rate": 1.2975e-05, "loss": 0.4482, "step": 1481 }, { "epoch": 0.045166058408064064, "grad_norm": 0.07715311646461487, "learning_rate": 1.2950000000000001e-05, "loss": 0.4279, "step": 1482 }, { "epoch": 0.04519653483074157, "grad_norm": 0.07568646967411041, "learning_rate": 1.2925e-05, "loss": 0.4293, "step": 1483 }, { "epoch": 0.045227011253419075, "grad_norm": 0.0802227333188057, "learning_rate": 1.29e-05, "loss": 0.4926, "step": 1484 }, { "epoch": 0.04525748767609658, "grad_norm": 0.08185547590255737, "learning_rate": 1.2875000000000001e-05, "loss": 0.5119, "step": 1485 }, { "epoch": 0.045287964098774086, "grad_norm": 0.07622049748897552, "learning_rate": 1.285e-05, "loss": 0.434, "step": 1486 }, { "epoch": 0.04531844052145159, "grad_norm": 0.07052507996559143, "learning_rate": 1.2825000000000002e-05, "loss": 0.3659, "step": 1487 }, { "epoch": 0.0453489169441291, "grad_norm": 0.07093793153762817, "learning_rate": 1.2800000000000001e-05, "loss": 0.4393, "step": 1488 }, { "epoch": 0.0453793933668066, "grad_norm": 0.08474712818861008, "learning_rate": 1.2775e-05, "loss": 0.4501, "step": 1489 }, { "epoch": 0.04540986978948411, "grad_norm": 0.08003673702478409, "learning_rate": 1.2750000000000002e-05, "loss": 0.476, "step": 1490 }, { "epoch": 0.045440346212161614, "grad_norm": 0.07535851746797562, "learning_rate": 1.2725000000000001e-05, "loss": 0.4909, "step": 1491 }, { "epoch": 0.04547082263483912, "grad_norm": 0.0806087851524353, "learning_rate": 1.27e-05, "loss": 0.3457, "step": 1492 }, { "epoch": 0.045501299057516625, "grad_norm": 0.07954234629869461, "learning_rate": 1.2675000000000001e-05, "loss": 0.501, "step": 1493 }, { "epoch": 0.04553177548019414, "grad_norm": 0.07901369780302048, "learning_rate": 1.2650000000000001e-05, "loss": 0.4544, "step": 1494 }, { "epoch": 0.045562251902871644, "grad_norm": 0.0758800283074379, "learning_rate": 1.2625e-05, "loss": 0.4425, "step": 1495 }, { "epoch": 0.04559272832554915, "grad_norm": 0.08573015034198761, "learning_rate": 1.2600000000000001e-05, "loss": 0.4915, "step": 1496 }, { "epoch": 0.045623204748226655, "grad_norm": 0.07737643271684647, "learning_rate": 1.2575e-05, "loss": 0.4235, "step": 1497 }, { "epoch": 0.04565368117090416, "grad_norm": 0.0755729079246521, "learning_rate": 1.255e-05, "loss": 0.4521, "step": 1498 }, { "epoch": 0.045684157593581666, "grad_norm": 0.07465959340333939, "learning_rate": 1.2525000000000001e-05, "loss": 0.4069, "step": 1499 }, { "epoch": 0.04571463401625917, "grad_norm": 0.08361887186765671, "learning_rate": 1.25e-05, "loss": 0.5487, "step": 1500 }, { "epoch": 0.04574511043893668, "grad_norm": 0.07683122903108597, "learning_rate": 1.2475e-05, "loss": 0.485, "step": 1501 }, { "epoch": 0.04577558686161418, "grad_norm": 0.07726240158081055, "learning_rate": 1.2450000000000001e-05, "loss": 0.4374, "step": 1502 }, { "epoch": 0.04580606328429169, "grad_norm": 0.07609997689723969, "learning_rate": 1.2425e-05, "loss": 0.4544, "step": 1503 }, { "epoch": 0.045836539706969194, "grad_norm": 0.08178794384002686, "learning_rate": 1.24e-05, "loss": 0.3743, "step": 1504 }, { "epoch": 0.0458670161296467, "grad_norm": 0.31682145595550537, "learning_rate": 1.2375000000000001e-05, "loss": 0.437, "step": 1505 }, { "epoch": 0.045897492552324205, "grad_norm": 0.07925023138523102, "learning_rate": 1.235e-05, "loss": 0.4757, "step": 1506 }, { "epoch": 0.04592796897500171, "grad_norm": 0.08774270862340927, "learning_rate": 1.2325e-05, "loss": 0.387, "step": 1507 }, { "epoch": 0.04595844539767922, "grad_norm": 0.08554140478372574, "learning_rate": 1.23e-05, "loss": 0.5501, "step": 1508 }, { "epoch": 0.04598892182035673, "grad_norm": 0.08531056344509125, "learning_rate": 1.2275e-05, "loss": 0.4768, "step": 1509 }, { "epoch": 0.046019398243034235, "grad_norm": 0.07456015050411224, "learning_rate": 1.225e-05, "loss": 0.4448, "step": 1510 }, { "epoch": 0.04604987466571174, "grad_norm": 0.09413668513298035, "learning_rate": 1.2225e-05, "loss": 0.5449, "step": 1511 }, { "epoch": 0.046080351088389246, "grad_norm": 0.07510751485824585, "learning_rate": 1.22e-05, "loss": 0.4725, "step": 1512 }, { "epoch": 0.04611082751106675, "grad_norm": 0.07561711221933365, "learning_rate": 1.2175e-05, "loss": 0.4253, "step": 1513 }, { "epoch": 0.04614130393374426, "grad_norm": 0.07376419752836227, "learning_rate": 1.215e-05, "loss": 0.454, "step": 1514 }, { "epoch": 0.04617178035642176, "grad_norm": 0.0736355260014534, "learning_rate": 1.2125e-05, "loss": 0.4487, "step": 1515 }, { "epoch": 0.04620225677909927, "grad_norm": 0.07239922881126404, "learning_rate": 1.2100000000000001e-05, "loss": 0.4101, "step": 1516 }, { "epoch": 0.046232733201776774, "grad_norm": 0.07095226645469666, "learning_rate": 1.2075e-05, "loss": 0.439, "step": 1517 }, { "epoch": 0.04626320962445428, "grad_norm": 0.07873702794313431, "learning_rate": 1.205e-05, "loss": 0.4685, "step": 1518 }, { "epoch": 0.046293686047131785, "grad_norm": 0.07642990350723267, "learning_rate": 1.2025000000000001e-05, "loss": 0.4764, "step": 1519 }, { "epoch": 0.04632416246980929, "grad_norm": 0.06726355105638504, "learning_rate": 1.2e-05, "loss": 0.3664, "step": 1520 }, { "epoch": 0.0463546388924868, "grad_norm": 0.06575615704059601, "learning_rate": 1.1975e-05, "loss": 0.3491, "step": 1521 }, { "epoch": 0.04638511531516431, "grad_norm": 0.07432980090379715, "learning_rate": 1.195e-05, "loss": 0.4308, "step": 1522 }, { "epoch": 0.046415591737841815, "grad_norm": 0.0760628879070282, "learning_rate": 1.1925e-05, "loss": 0.4794, "step": 1523 }, { "epoch": 0.04644606816051932, "grad_norm": 0.08656267821788788, "learning_rate": 1.19e-05, "loss": 0.536, "step": 1524 }, { "epoch": 0.046476544583196826, "grad_norm": 0.08315083384513855, "learning_rate": 1.1875e-05, "loss": 0.5122, "step": 1525 }, { "epoch": 0.04650702100587433, "grad_norm": 0.08372493833303452, "learning_rate": 1.185e-05, "loss": 0.4872, "step": 1526 }, { "epoch": 0.04653749742855184, "grad_norm": 0.07720647007226944, "learning_rate": 1.1825e-05, "loss": 0.4287, "step": 1527 }, { "epoch": 0.04656797385122934, "grad_norm": 0.07871419936418533, "learning_rate": 1.18e-05, "loss": 0.4823, "step": 1528 }, { "epoch": 0.04659845027390685, "grad_norm": 0.07749909162521362, "learning_rate": 1.1775e-05, "loss": 0.4733, "step": 1529 }, { "epoch": 0.046628926696584354, "grad_norm": 0.08232264220714569, "learning_rate": 1.175e-05, "loss": 0.4192, "step": 1530 }, { "epoch": 0.04665940311926186, "grad_norm": 0.08905529230833054, "learning_rate": 1.1725e-05, "loss": 0.5156, "step": 1531 }, { "epoch": 0.046689879541939365, "grad_norm": 0.07463119924068451, "learning_rate": 1.1700000000000001e-05, "loss": 0.4837, "step": 1532 }, { "epoch": 0.04672035596461687, "grad_norm": 0.07844197005033493, "learning_rate": 1.1675000000000001e-05, "loss": 0.4491, "step": 1533 }, { "epoch": 0.046750832387294376, "grad_norm": 0.08200187236070633, "learning_rate": 1.1650000000000002e-05, "loss": 0.4877, "step": 1534 }, { "epoch": 0.04678130880997189, "grad_norm": 0.07673025131225586, "learning_rate": 1.1625000000000001e-05, "loss": 0.4712, "step": 1535 }, { "epoch": 0.046811785232649394, "grad_norm": 0.0789891704916954, "learning_rate": 1.16e-05, "loss": 0.5123, "step": 1536 }, { "epoch": 0.0468422616553269, "grad_norm": 0.07714387029409409, "learning_rate": 1.1575000000000002e-05, "loss": 0.4388, "step": 1537 }, { "epoch": 0.046872738078004406, "grad_norm": 0.07514838129281998, "learning_rate": 1.1550000000000001e-05, "loss": 0.4478, "step": 1538 }, { "epoch": 0.04690321450068191, "grad_norm": 0.07515661418437958, "learning_rate": 1.1525e-05, "loss": 0.4324, "step": 1539 }, { "epoch": 0.04693369092335942, "grad_norm": 0.0791865661740303, "learning_rate": 1.1500000000000002e-05, "loss": 0.4744, "step": 1540 }, { "epoch": 0.04696416734603692, "grad_norm": 0.07427757233381271, "learning_rate": 1.1475000000000001e-05, "loss": 0.417, "step": 1541 }, { "epoch": 0.04699464376871443, "grad_norm": 0.07577263563871384, "learning_rate": 1.145e-05, "loss": 0.4798, "step": 1542 }, { "epoch": 0.047025120191391934, "grad_norm": 0.07467955350875854, "learning_rate": 1.1425000000000002e-05, "loss": 0.4413, "step": 1543 }, { "epoch": 0.04705559661406944, "grad_norm": 0.0739821046590805, "learning_rate": 1.1400000000000001e-05, "loss": 0.4313, "step": 1544 }, { "epoch": 0.047086073036746945, "grad_norm": 0.0700758770108223, "learning_rate": 1.1375e-05, "loss": 0.3735, "step": 1545 }, { "epoch": 0.04711654945942445, "grad_norm": 0.07600681483745575, "learning_rate": 1.1350000000000001e-05, "loss": 0.4384, "step": 1546 }, { "epoch": 0.047147025882101956, "grad_norm": 0.0828060433268547, "learning_rate": 1.1325e-05, "loss": 0.4905, "step": 1547 }, { "epoch": 0.04717750230477946, "grad_norm": 0.0800706297159195, "learning_rate": 1.13e-05, "loss": 0.4561, "step": 1548 }, { "epoch": 0.047207978727456974, "grad_norm": 0.08213678747415543, "learning_rate": 1.1275000000000001e-05, "loss": 0.5365, "step": 1549 }, { "epoch": 0.04723845515013448, "grad_norm": 0.08435729146003723, "learning_rate": 1.125e-05, "loss": 0.4673, "step": 1550 }, { "epoch": 0.047268931572811985, "grad_norm": 0.07643872499465942, "learning_rate": 1.1225e-05, "loss": 0.4718, "step": 1551 }, { "epoch": 0.04729940799548949, "grad_norm": 0.07975197583436966, "learning_rate": 1.1200000000000001e-05, "loss": 0.5325, "step": 1552 }, { "epoch": 0.047329884418167, "grad_norm": 0.07199688255786896, "learning_rate": 1.1175e-05, "loss": 0.4177, "step": 1553 }, { "epoch": 0.0473603608408445, "grad_norm": 0.08104916661977768, "learning_rate": 1.115e-05, "loss": 0.5026, "step": 1554 }, { "epoch": 0.04739083726352201, "grad_norm": 0.07149991393089294, "learning_rate": 1.1125000000000001e-05, "loss": 0.3888, "step": 1555 }, { "epoch": 0.047421313686199514, "grad_norm": 0.07827671617269516, "learning_rate": 1.11e-05, "loss": 0.4331, "step": 1556 }, { "epoch": 0.04745179010887702, "grad_norm": 0.07857640087604523, "learning_rate": 1.1075e-05, "loss": 0.4593, "step": 1557 }, { "epoch": 0.047482266531554525, "grad_norm": 0.07794690877199173, "learning_rate": 1.1050000000000001e-05, "loss": 0.502, "step": 1558 }, { "epoch": 0.04751274295423203, "grad_norm": 0.08332689851522446, "learning_rate": 1.1025e-05, "loss": 0.4798, "step": 1559 }, { "epoch": 0.047543219376909536, "grad_norm": 0.07607225328683853, "learning_rate": 1.1000000000000001e-05, "loss": 0.4345, "step": 1560 }, { "epoch": 0.04757369579958704, "grad_norm": 0.09533950686454773, "learning_rate": 1.0975e-05, "loss": 0.4041, "step": 1561 }, { "epoch": 0.04760417222226455, "grad_norm": 0.08793961256742477, "learning_rate": 1.095e-05, "loss": 0.5031, "step": 1562 }, { "epoch": 0.04763464864494206, "grad_norm": 0.07501745969057083, "learning_rate": 1.0925000000000001e-05, "loss": 0.4428, "step": 1563 }, { "epoch": 0.047665125067619565, "grad_norm": 0.08227071911096573, "learning_rate": 1.09e-05, "loss": 0.4483, "step": 1564 }, { "epoch": 0.04769560149029707, "grad_norm": 0.07390928268432617, "learning_rate": 1.0875e-05, "loss": 0.409, "step": 1565 }, { "epoch": 0.04772607791297458, "grad_norm": 0.07477901130914688, "learning_rate": 1.0850000000000001e-05, "loss": 0.4269, "step": 1566 }, { "epoch": 0.04775655433565208, "grad_norm": 0.07622239738702774, "learning_rate": 1.0825e-05, "loss": 0.4348, "step": 1567 }, { "epoch": 0.04778703075832959, "grad_norm": 0.07148583233356476, "learning_rate": 1.08e-05, "loss": 0.4247, "step": 1568 }, { "epoch": 0.04781750718100709, "grad_norm": 0.07793376594781876, "learning_rate": 1.0775000000000001e-05, "loss": 0.5062, "step": 1569 }, { "epoch": 0.0478479836036846, "grad_norm": 0.07209542393684387, "learning_rate": 1.075e-05, "loss": 0.4136, "step": 1570 }, { "epoch": 0.047878460026362105, "grad_norm": 0.07281986624002457, "learning_rate": 1.0725e-05, "loss": 0.4291, "step": 1571 }, { "epoch": 0.04790893644903961, "grad_norm": 0.08049772679805756, "learning_rate": 1.0700000000000001e-05, "loss": 0.4944, "step": 1572 }, { "epoch": 0.047939412871717116, "grad_norm": 0.08272287994623184, "learning_rate": 1.0675e-05, "loss": 0.5117, "step": 1573 }, { "epoch": 0.04796988929439462, "grad_norm": 0.07369446754455566, "learning_rate": 1.065e-05, "loss": 0.4696, "step": 1574 }, { "epoch": 0.04800036571707213, "grad_norm": 0.07067371904850006, "learning_rate": 1.0625e-05, "loss": 0.3691, "step": 1575 }, { "epoch": 0.04803084213974964, "grad_norm": 0.07755018025636673, "learning_rate": 1.06e-05, "loss": 0.402, "step": 1576 }, { "epoch": 0.048061318562427145, "grad_norm": 0.0775521844625473, "learning_rate": 1.0575e-05, "loss": 0.4463, "step": 1577 }, { "epoch": 0.04809179498510465, "grad_norm": 0.07601470500230789, "learning_rate": 1.055e-05, "loss": 0.4552, "step": 1578 }, { "epoch": 0.048122271407782156, "grad_norm": 0.07766881585121155, "learning_rate": 1.0525e-05, "loss": 0.3986, "step": 1579 }, { "epoch": 0.04815274783045966, "grad_norm": 0.07652290165424347, "learning_rate": 1.05e-05, "loss": 0.4764, "step": 1580 }, { "epoch": 0.04818322425313717, "grad_norm": 0.07004466652870178, "learning_rate": 1.0475e-05, "loss": 0.3846, "step": 1581 }, { "epoch": 0.04821370067581467, "grad_norm": 0.09664234519004822, "learning_rate": 1.045e-05, "loss": 0.5923, "step": 1582 }, { "epoch": 0.04824417709849218, "grad_norm": 0.09495330601930618, "learning_rate": 1.0425e-05, "loss": 0.4445, "step": 1583 }, { "epoch": 0.048274653521169685, "grad_norm": 0.07758696377277374, "learning_rate": 1.04e-05, "loss": 0.4234, "step": 1584 }, { "epoch": 0.04830512994384719, "grad_norm": 0.08628136664628983, "learning_rate": 1.0375e-05, "loss": 0.5404, "step": 1585 }, { "epoch": 0.048335606366524696, "grad_norm": 0.07584135234355927, "learning_rate": 1.035e-05, "loss": 0.4793, "step": 1586 }, { "epoch": 0.0483660827892022, "grad_norm": 0.0767350047826767, "learning_rate": 1.0325e-05, "loss": 0.4676, "step": 1587 }, { "epoch": 0.04839655921187971, "grad_norm": 0.07993650436401367, "learning_rate": 1.03e-05, "loss": 0.5038, "step": 1588 }, { "epoch": 0.04842703563455721, "grad_norm": 0.08115538209676743, "learning_rate": 1.0275e-05, "loss": 0.4845, "step": 1589 }, { "epoch": 0.048457512057234725, "grad_norm": 0.07026893645524979, "learning_rate": 1.025e-05, "loss": 0.3824, "step": 1590 }, { "epoch": 0.04848798847991223, "grad_norm": 0.08964451402425766, "learning_rate": 1.0225e-05, "loss": 0.5417, "step": 1591 }, { "epoch": 0.048518464902589736, "grad_norm": 0.07860507071018219, "learning_rate": 1.02e-05, "loss": 0.4819, "step": 1592 }, { "epoch": 0.04854894132526724, "grad_norm": 0.09121429175138474, "learning_rate": 1.0175e-05, "loss": 0.5481, "step": 1593 }, { "epoch": 0.04857941774794475, "grad_norm": 0.07564505934715271, "learning_rate": 1.0150000000000001e-05, "loss": 0.4043, "step": 1594 }, { "epoch": 0.04860989417062225, "grad_norm": 0.10292312502861023, "learning_rate": 1.0125e-05, "loss": 0.3885, "step": 1595 }, { "epoch": 0.04864037059329976, "grad_norm": 0.08386413007974625, "learning_rate": 1.0100000000000002e-05, "loss": 0.5494, "step": 1596 }, { "epoch": 0.048670847015977264, "grad_norm": 0.07438459992408752, "learning_rate": 1.0075000000000001e-05, "loss": 0.4147, "step": 1597 }, { "epoch": 0.04870132343865477, "grad_norm": 0.07893188297748566, "learning_rate": 1.005e-05, "loss": 0.5138, "step": 1598 }, { "epoch": 0.048731799861332276, "grad_norm": 0.07363870739936829, "learning_rate": 1.0025000000000001e-05, "loss": 0.4339, "step": 1599 }, { "epoch": 0.04876227628400978, "grad_norm": 0.07932309061288834, "learning_rate": 1e-05, "loss": 0.5056, "step": 1600 }, { "epoch": 0.04879275270668729, "grad_norm": 0.08778295665979385, "learning_rate": 9.975e-06, "loss": 0.4718, "step": 1601 }, { "epoch": 0.04882322912936479, "grad_norm": 0.08145146071910858, "learning_rate": 9.950000000000001e-06, "loss": 0.4561, "step": 1602 }, { "epoch": 0.0488537055520423, "grad_norm": 0.07066711038351059, "learning_rate": 9.925e-06, "loss": 0.3991, "step": 1603 }, { "epoch": 0.04888418197471981, "grad_norm": 0.07630852609872818, "learning_rate": 9.900000000000002e-06, "loss": 0.4562, "step": 1604 }, { "epoch": 0.048914658397397316, "grad_norm": 0.07682674378156662, "learning_rate": 9.875000000000001e-06, "loss": 0.4198, "step": 1605 }, { "epoch": 0.04894513482007482, "grad_norm": 0.07642398029565811, "learning_rate": 9.85e-06, "loss": 0.4994, "step": 1606 }, { "epoch": 0.04897561124275233, "grad_norm": 0.0764671266078949, "learning_rate": 9.825000000000002e-06, "loss": 0.4857, "step": 1607 }, { "epoch": 0.04900608766542983, "grad_norm": 0.07510103285312653, "learning_rate": 9.800000000000001e-06, "loss": 0.4343, "step": 1608 }, { "epoch": 0.04903656408810734, "grad_norm": 0.08515484631061554, "learning_rate": 9.775e-06, "loss": 0.4636, "step": 1609 }, { "epoch": 0.049067040510784844, "grad_norm": 0.08086752891540527, "learning_rate": 9.750000000000002e-06, "loss": 0.486, "step": 1610 }, { "epoch": 0.04909751693346235, "grad_norm": 0.0797349363565445, "learning_rate": 9.725000000000001e-06, "loss": 0.4864, "step": 1611 }, { "epoch": 0.049127993356139855, "grad_norm": 0.07715785503387451, "learning_rate": 9.7e-06, "loss": 0.4641, "step": 1612 }, { "epoch": 0.04915846977881736, "grad_norm": 0.08772648125886917, "learning_rate": 9.675000000000001e-06, "loss": 0.5467, "step": 1613 }, { "epoch": 0.04918894620149487, "grad_norm": 0.074705109000206, "learning_rate": 9.65e-06, "loss": 0.3988, "step": 1614 }, { "epoch": 0.04921942262417237, "grad_norm": 0.07447940856218338, "learning_rate": 9.625e-06, "loss": 0.4138, "step": 1615 }, { "epoch": 0.04924989904684988, "grad_norm": 0.07370208948850632, "learning_rate": 9.600000000000001e-06, "loss": 0.4762, "step": 1616 }, { "epoch": 0.049280375469527384, "grad_norm": 0.0795862078666687, "learning_rate": 9.575e-06, "loss": 0.4755, "step": 1617 }, { "epoch": 0.049310851892204896, "grad_norm": 0.07293259352445602, "learning_rate": 9.55e-06, "loss": 0.376, "step": 1618 }, { "epoch": 0.0493413283148824, "grad_norm": 0.08106304705142975, "learning_rate": 9.525000000000001e-06, "loss": 0.5023, "step": 1619 }, { "epoch": 0.04937180473755991, "grad_norm": 0.08154899626970291, "learning_rate": 9.5e-06, "loss": 0.492, "step": 1620 }, { "epoch": 0.04940228116023741, "grad_norm": 0.08138760924339294, "learning_rate": 9.475e-06, "loss": 0.5429, "step": 1621 }, { "epoch": 0.04943275758291492, "grad_norm": 0.07741197198629379, "learning_rate": 9.450000000000001e-06, "loss": 0.4868, "step": 1622 }, { "epoch": 0.049463234005592424, "grad_norm": 0.08139384537935257, "learning_rate": 9.425e-06, "loss": 0.4312, "step": 1623 }, { "epoch": 0.04949371042826993, "grad_norm": 0.08226098865270615, "learning_rate": 9.4e-06, "loss": 0.4896, "step": 1624 }, { "epoch": 0.049524186850947435, "grad_norm": 0.0756540521979332, "learning_rate": 9.375000000000001e-06, "loss": 0.4812, "step": 1625 }, { "epoch": 0.04955466327362494, "grad_norm": 0.07895086705684662, "learning_rate": 9.35e-06, "loss": 0.4224, "step": 1626 }, { "epoch": 0.04958513969630245, "grad_norm": 0.08456684648990631, "learning_rate": 9.325e-06, "loss": 0.565, "step": 1627 }, { "epoch": 0.04961561611897995, "grad_norm": 0.07961149513721466, "learning_rate": 9.3e-06, "loss": 0.4948, "step": 1628 }, { "epoch": 0.04964609254165746, "grad_norm": 0.07539098709821701, "learning_rate": 9.275e-06, "loss": 0.4824, "step": 1629 }, { "epoch": 0.04967656896433496, "grad_norm": 0.07281839102506638, "learning_rate": 9.25e-06, "loss": 0.4086, "step": 1630 }, { "epoch": 0.049707045387012476, "grad_norm": 0.07497972249984741, "learning_rate": 9.225e-06, "loss": 0.4555, "step": 1631 }, { "epoch": 0.04973752180968998, "grad_norm": 0.07533575594425201, "learning_rate": 9.2e-06, "loss": 0.4906, "step": 1632 }, { "epoch": 0.04976799823236749, "grad_norm": 0.07593446224927902, "learning_rate": 9.175000000000001e-06, "loss": 0.4677, "step": 1633 }, { "epoch": 0.04979847465504499, "grad_norm": 0.08968568593263626, "learning_rate": 9.15e-06, "loss": 0.5374, "step": 1634 }, { "epoch": 0.0498289510777225, "grad_norm": 0.06974641978740692, "learning_rate": 9.125e-06, "loss": 0.4263, "step": 1635 }, { "epoch": 0.049859427500400004, "grad_norm": 0.07877788692712784, "learning_rate": 9.100000000000001e-06, "loss": 0.5004, "step": 1636 }, { "epoch": 0.04988990392307751, "grad_norm": 0.08161775767803192, "learning_rate": 9.075e-06, "loss": 0.4918, "step": 1637 }, { "epoch": 0.049920380345755015, "grad_norm": 0.07277511060237885, "learning_rate": 9.05e-06, "loss": 0.4059, "step": 1638 }, { "epoch": 0.04995085676843252, "grad_norm": 0.09041672199964523, "learning_rate": 9.025e-06, "loss": 0.5613, "step": 1639 }, { "epoch": 0.049981333191110026, "grad_norm": 0.08147478848695755, "learning_rate": 9e-06, "loss": 0.5465, "step": 1640 }, { "epoch": 0.05001180961378753, "grad_norm": 0.08431069552898407, "learning_rate": 8.975e-06, "loss": 0.5082, "step": 1641 }, { "epoch": 0.05004228603646504, "grad_norm": 0.07264046370983124, "learning_rate": 8.95e-06, "loss": 0.4337, "step": 1642 }, { "epoch": 0.05007276245914254, "grad_norm": 0.08510204404592514, "learning_rate": 8.925e-06, "loss": 0.5272, "step": 1643 }, { "epoch": 0.05010323888182005, "grad_norm": 0.06979744136333466, "learning_rate": 8.9e-06, "loss": 0.406, "step": 1644 }, { "epoch": 0.05013371530449756, "grad_norm": 0.08001603931188583, "learning_rate": 8.875e-06, "loss": 0.4923, "step": 1645 }, { "epoch": 0.05016419172717507, "grad_norm": 0.08247562497854233, "learning_rate": 8.85e-06, "loss": 0.5113, "step": 1646 }, { "epoch": 0.05019466814985257, "grad_norm": 0.0748908594250679, "learning_rate": 8.825e-06, "loss": 0.431, "step": 1647 }, { "epoch": 0.05022514457253008, "grad_norm": 0.07201673835515976, "learning_rate": 8.8e-06, "loss": 0.4027, "step": 1648 }, { "epoch": 0.050255620995207584, "grad_norm": 0.07885918021202087, "learning_rate": 8.775e-06, "loss": 0.5214, "step": 1649 }, { "epoch": 0.05028609741788509, "grad_norm": 0.0764293372631073, "learning_rate": 8.75e-06, "loss": 0.4592, "step": 1650 }, { "epoch": 0.050316573840562595, "grad_norm": 0.07469349354505539, "learning_rate": 8.725e-06, "loss": 0.4147, "step": 1651 }, { "epoch": 0.0503470502632401, "grad_norm": 0.07175737619400024, "learning_rate": 8.7e-06, "loss": 0.4032, "step": 1652 }, { "epoch": 0.050377526685917606, "grad_norm": 0.07444446533918381, "learning_rate": 8.674999999999999e-06, "loss": 0.4438, "step": 1653 }, { "epoch": 0.05040800310859511, "grad_norm": 0.07730165123939514, "learning_rate": 8.65e-06, "loss": 0.4436, "step": 1654 }, { "epoch": 0.05043847953127262, "grad_norm": 0.07565020769834518, "learning_rate": 8.625e-06, "loss": 0.4167, "step": 1655 }, { "epoch": 0.05046895595395012, "grad_norm": 0.07432191073894501, "learning_rate": 8.599999999999999e-06, "loss": 0.4649, "step": 1656 }, { "epoch": 0.05049943237662763, "grad_norm": 0.0723196268081665, "learning_rate": 8.575000000000002e-06, "loss": 0.3885, "step": 1657 }, { "epoch": 0.050529908799305134, "grad_norm": 0.07886788249015808, "learning_rate": 8.550000000000001e-06, "loss": 0.5012, "step": 1658 }, { "epoch": 0.05056038522198265, "grad_norm": 0.07642892748117447, "learning_rate": 8.525e-06, "loss": 0.4409, "step": 1659 }, { "epoch": 0.05059086164466015, "grad_norm": 0.0766199454665184, "learning_rate": 8.500000000000002e-06, "loss": 0.454, "step": 1660 }, { "epoch": 0.05062133806733766, "grad_norm": 0.07638928294181824, "learning_rate": 8.475000000000001e-06, "loss": 0.4486, "step": 1661 }, { "epoch": 0.050651814490015164, "grad_norm": 0.07845763117074966, "learning_rate": 8.45e-06, "loss": 0.4947, "step": 1662 }, { "epoch": 0.05068229091269267, "grad_norm": 0.07834794372320175, "learning_rate": 8.425000000000001e-06, "loss": 0.4813, "step": 1663 }, { "epoch": 0.050712767335370175, "grad_norm": 0.07533837109804153, "learning_rate": 8.400000000000001e-06, "loss": 0.4447, "step": 1664 }, { "epoch": 0.05074324375804768, "grad_norm": 0.06866245716810226, "learning_rate": 8.375e-06, "loss": 0.388, "step": 1665 }, { "epoch": 0.050773720180725186, "grad_norm": 0.08451104164123535, "learning_rate": 8.350000000000001e-06, "loss": 0.5121, "step": 1666 }, { "epoch": 0.05080419660340269, "grad_norm": 0.07676304131746292, "learning_rate": 8.325e-06, "loss": 0.4096, "step": 1667 }, { "epoch": 0.0508346730260802, "grad_norm": 0.07662507146596909, "learning_rate": 8.3e-06, "loss": 0.4457, "step": 1668 }, { "epoch": 0.0508651494487577, "grad_norm": 0.09124063700437546, "learning_rate": 8.275000000000001e-06, "loss": 0.49, "step": 1669 }, { "epoch": 0.05089562587143521, "grad_norm": 0.07442934811115265, "learning_rate": 8.25e-06, "loss": 0.4374, "step": 1670 }, { "epoch": 0.050926102294112714, "grad_norm": 0.07832320034503937, "learning_rate": 8.225e-06, "loss": 0.4906, "step": 1671 }, { "epoch": 0.05095657871679022, "grad_norm": 0.07228587567806244, "learning_rate": 8.200000000000001e-06, "loss": 0.4291, "step": 1672 }, { "epoch": 0.05098705513946773, "grad_norm": 0.07382132112979889, "learning_rate": 8.175e-06, "loss": 0.4638, "step": 1673 }, { "epoch": 0.05101753156214524, "grad_norm": 0.07105421274900436, "learning_rate": 8.15e-06, "loss": 0.3888, "step": 1674 }, { "epoch": 0.051048007984822744, "grad_norm": 0.07065872848033905, "learning_rate": 8.125000000000001e-06, "loss": 0.4482, "step": 1675 }, { "epoch": 0.05107848440750025, "grad_norm": 0.07212594896554947, "learning_rate": 8.1e-06, "loss": 0.4375, "step": 1676 }, { "epoch": 0.051108960830177755, "grad_norm": 0.07495900988578796, "learning_rate": 8.075000000000001e-06, "loss": 0.4702, "step": 1677 }, { "epoch": 0.05113943725285526, "grad_norm": 0.0754467025399208, "learning_rate": 8.050000000000001e-06, "loss": 0.4893, "step": 1678 }, { "epoch": 0.051169913675532766, "grad_norm": 0.06905972957611084, "learning_rate": 8.025e-06, "loss": 0.3701, "step": 1679 }, { "epoch": 0.05120039009821027, "grad_norm": 0.0763591006398201, "learning_rate": 8.000000000000001e-06, "loss": 0.4553, "step": 1680 }, { "epoch": 0.05123086652088778, "grad_norm": 0.0739377811551094, "learning_rate": 7.975e-06, "loss": 0.4887, "step": 1681 }, { "epoch": 0.05126134294356528, "grad_norm": 0.08291000127792358, "learning_rate": 7.95e-06, "loss": 0.5487, "step": 1682 }, { "epoch": 0.05129181936624279, "grad_norm": 0.08113480359315872, "learning_rate": 7.925000000000001e-06, "loss": 0.523, "step": 1683 }, { "epoch": 0.051322295788920294, "grad_norm": 0.07983608543872833, "learning_rate": 7.9e-06, "loss": 0.4041, "step": 1684 }, { "epoch": 0.0513527722115978, "grad_norm": 0.08505839109420776, "learning_rate": 7.875e-06, "loss": 0.5457, "step": 1685 }, { "epoch": 0.051383248634275305, "grad_norm": 0.08129873871803284, "learning_rate": 7.850000000000001e-06, "loss": 0.4212, "step": 1686 }, { "epoch": 0.05141372505695282, "grad_norm": 0.07141125202178955, "learning_rate": 7.825e-06, "loss": 0.3897, "step": 1687 }, { "epoch": 0.051444201479630323, "grad_norm": 0.08046659827232361, "learning_rate": 7.8e-06, "loss": 0.4156, "step": 1688 }, { "epoch": 0.05147467790230783, "grad_norm": 0.07678405195474625, "learning_rate": 7.775000000000001e-06, "loss": 0.4674, "step": 1689 }, { "epoch": 0.051505154324985335, "grad_norm": 0.11052624881267548, "learning_rate": 7.75e-06, "loss": 0.4796, "step": 1690 }, { "epoch": 0.05153563074766284, "grad_norm": 0.07501202076673508, "learning_rate": 7.725e-06, "loss": 0.451, "step": 1691 }, { "epoch": 0.051566107170340346, "grad_norm": 0.07760925590991974, "learning_rate": 7.7e-06, "loss": 0.4655, "step": 1692 }, { "epoch": 0.05159658359301785, "grad_norm": 0.07183408737182617, "learning_rate": 7.675e-06, "loss": 0.4463, "step": 1693 }, { "epoch": 0.05162706001569536, "grad_norm": 0.07133106887340546, "learning_rate": 7.65e-06, "loss": 0.3951, "step": 1694 }, { "epoch": 0.05165753643837286, "grad_norm": 0.07419022172689438, "learning_rate": 7.625e-06, "loss": 0.4731, "step": 1695 }, { "epoch": 0.05168801286105037, "grad_norm": 0.07380285114049911, "learning_rate": 7.6e-06, "loss": 0.4409, "step": 1696 }, { "epoch": 0.051718489283727874, "grad_norm": 0.07903948426246643, "learning_rate": 7.575e-06, "loss": 0.5022, "step": 1697 }, { "epoch": 0.05174896570640538, "grad_norm": 0.06858405470848083, "learning_rate": 7.55e-06, "loss": 0.3492, "step": 1698 }, { "epoch": 0.051779442129082885, "grad_norm": 0.07356146723031998, "learning_rate": 7.525e-06, "loss": 0.4262, "step": 1699 }, { "epoch": 0.0518099185517604, "grad_norm": 0.07331035286188126, "learning_rate": 7.5e-06, "loss": 0.371, "step": 1700 }, { "epoch": 0.0518403949744379, "grad_norm": 0.07909146696329117, "learning_rate": 7.4750000000000004e-06, "loss": 0.4679, "step": 1701 }, { "epoch": 0.05187087139711541, "grad_norm": 0.08103639632463455, "learning_rate": 7.45e-06, "loss": 0.5368, "step": 1702 }, { "epoch": 0.051901347819792915, "grad_norm": 0.06804439425468445, "learning_rate": 7.425e-06, "loss": 0.3737, "step": 1703 }, { "epoch": 0.05193182424247042, "grad_norm": 0.06998948752880096, "learning_rate": 7.4e-06, "loss": 0.4258, "step": 1704 }, { "epoch": 0.051962300665147926, "grad_norm": 0.07609688490629196, "learning_rate": 7.375e-06, "loss": 0.458, "step": 1705 }, { "epoch": 0.05199277708782543, "grad_norm": 0.0767679288983345, "learning_rate": 7.35e-06, "loss": 0.4544, "step": 1706 }, { "epoch": 0.05202325351050294, "grad_norm": 0.07537072896957397, "learning_rate": 7.325e-06, "loss": 0.4388, "step": 1707 }, { "epoch": 0.05205372993318044, "grad_norm": 0.06975585967302322, "learning_rate": 7.2999999999999996e-06, "loss": 0.4082, "step": 1708 }, { "epoch": 0.05208420635585795, "grad_norm": 0.0888831689953804, "learning_rate": 7.275e-06, "loss": 0.5451, "step": 1709 }, { "epoch": 0.052114682778535454, "grad_norm": 0.07357228547334671, "learning_rate": 7.25e-06, "loss": 0.3948, "step": 1710 }, { "epoch": 0.05214515920121296, "grad_norm": 0.07749288529157639, "learning_rate": 7.2249999999999994e-06, "loss": 0.4503, "step": 1711 }, { "epoch": 0.052175635623890465, "grad_norm": 0.07862765341997147, "learning_rate": 7.2e-06, "loss": 0.4795, "step": 1712 }, { "epoch": 0.05220611204656797, "grad_norm": 0.07246437668800354, "learning_rate": 7.175e-06, "loss": 0.3829, "step": 1713 }, { "epoch": 0.05223658846924548, "grad_norm": 0.0723225399851799, "learning_rate": 7.15e-06, "loss": 0.4354, "step": 1714 }, { "epoch": 0.05226706489192299, "grad_norm": 0.07389179617166519, "learning_rate": 7.1249999999999995e-06, "loss": 0.4407, "step": 1715 }, { "epoch": 0.052297541314600494, "grad_norm": 0.0705588236451149, "learning_rate": 7.1e-06, "loss": 0.4027, "step": 1716 }, { "epoch": 0.052328017737278, "grad_norm": 0.07581262290477753, "learning_rate": 7.075e-06, "loss": 0.4178, "step": 1717 }, { "epoch": 0.052358494159955506, "grad_norm": 0.07841658592224121, "learning_rate": 7.049999999999999e-06, "loss": 0.4861, "step": 1718 }, { "epoch": 0.05238897058263301, "grad_norm": 0.07903047651052475, "learning_rate": 7.025000000000001e-06, "loss": 0.44, "step": 1719 }, { "epoch": 0.05241944700531052, "grad_norm": 0.09011761099100113, "learning_rate": 7.000000000000001e-06, "loss": 0.5644, "step": 1720 }, { "epoch": 0.05244992342798802, "grad_norm": 0.06914852559566498, "learning_rate": 6.975000000000001e-06, "loss": 0.3571, "step": 1721 }, { "epoch": 0.05248039985066553, "grad_norm": 0.08034972846508026, "learning_rate": 6.950000000000001e-06, "loss": 0.4278, "step": 1722 }, { "epoch": 0.052510876273343034, "grad_norm": 0.08054415136575699, "learning_rate": 6.925000000000001e-06, "loss": 0.4755, "step": 1723 }, { "epoch": 0.05254135269602054, "grad_norm": 0.06979429721832275, "learning_rate": 6.900000000000001e-06, "loss": 0.3797, "step": 1724 }, { "epoch": 0.052571829118698045, "grad_norm": 0.07440292835235596, "learning_rate": 6.875000000000001e-06, "loss": 0.3936, "step": 1725 }, { "epoch": 0.05260230554137555, "grad_norm": 0.0711861252784729, "learning_rate": 6.8500000000000005e-06, "loss": 0.369, "step": 1726 }, { "epoch": 0.052632781964053056, "grad_norm": 0.07376895844936371, "learning_rate": 6.825000000000001e-06, "loss": 0.4493, "step": 1727 }, { "epoch": 0.05266325838673057, "grad_norm": 0.084828682243824, "learning_rate": 6.800000000000001e-06, "loss": 0.4828, "step": 1728 }, { "epoch": 0.052693734809408074, "grad_norm": 0.07862475514411926, "learning_rate": 6.775000000000001e-06, "loss": 0.5283, "step": 1729 }, { "epoch": 0.05272421123208558, "grad_norm": 0.07115122675895691, "learning_rate": 6.750000000000001e-06, "loss": 0.404, "step": 1730 }, { "epoch": 0.052754687654763086, "grad_norm": 0.08562794327735901, "learning_rate": 6.725000000000001e-06, "loss": 0.5205, "step": 1731 }, { "epoch": 0.05278516407744059, "grad_norm": 0.08060447126626968, "learning_rate": 6.700000000000001e-06, "loss": 0.4737, "step": 1732 }, { "epoch": 0.0528156405001181, "grad_norm": 0.08253117650747299, "learning_rate": 6.6750000000000005e-06, "loss": 0.4869, "step": 1733 }, { "epoch": 0.0528461169227956, "grad_norm": 0.07961471378803253, "learning_rate": 6.650000000000001e-06, "loss": 0.5275, "step": 1734 }, { "epoch": 0.05287659334547311, "grad_norm": 0.07735777646303177, "learning_rate": 6.625000000000001e-06, "loss": 0.4375, "step": 1735 }, { "epoch": 0.052907069768150614, "grad_norm": 0.08185034990310669, "learning_rate": 6.6e-06, "loss": 0.4912, "step": 1736 }, { "epoch": 0.05293754619082812, "grad_norm": 0.07915080338716507, "learning_rate": 6.5750000000000006e-06, "loss": 0.5189, "step": 1737 }, { "epoch": 0.052968022613505625, "grad_norm": 0.08062077313661575, "learning_rate": 6.550000000000001e-06, "loss": 0.4407, "step": 1738 }, { "epoch": 0.05299849903618313, "grad_norm": 0.07715611159801483, "learning_rate": 6.525e-06, "loss": 0.4722, "step": 1739 }, { "epoch": 0.053028975458860636, "grad_norm": 0.0786377415060997, "learning_rate": 6.5000000000000004e-06, "loss": 0.4482, "step": 1740 }, { "epoch": 0.05305945188153814, "grad_norm": 0.08128920197486877, "learning_rate": 6.475000000000001e-06, "loss": 0.5028, "step": 1741 }, { "epoch": 0.053089928304215654, "grad_norm": 0.07736604660749435, "learning_rate": 6.45e-06, "loss": 0.4583, "step": 1742 }, { "epoch": 0.05312040472689316, "grad_norm": 0.08506511151790619, "learning_rate": 6.425e-06, "loss": 0.5355, "step": 1743 }, { "epoch": 0.053150881149570665, "grad_norm": 0.08120521903038025, "learning_rate": 6.4000000000000006e-06, "loss": 0.4936, "step": 1744 }, { "epoch": 0.05318135757224817, "grad_norm": 0.12086557596921921, "learning_rate": 6.375000000000001e-06, "loss": 0.5081, "step": 1745 }, { "epoch": 0.05321183399492568, "grad_norm": 0.07926332205533981, "learning_rate": 6.35e-06, "loss": 0.4891, "step": 1746 }, { "epoch": 0.05324231041760318, "grad_norm": 0.08611719310283661, "learning_rate": 6.3250000000000004e-06, "loss": 0.5308, "step": 1747 }, { "epoch": 0.05327278684028069, "grad_norm": 0.08155818283557892, "learning_rate": 6.300000000000001e-06, "loss": 0.4706, "step": 1748 }, { "epoch": 0.05330326326295819, "grad_norm": 0.07776990532875061, "learning_rate": 6.275e-06, "loss": 0.4821, "step": 1749 }, { "epoch": 0.0533337396856357, "grad_norm": 0.07190857827663422, "learning_rate": 6.25e-06, "loss": 0.3891, "step": 1750 }, { "epoch": 0.053364216108313205, "grad_norm": 0.07988125085830688, "learning_rate": 6.2250000000000005e-06, "loss": 0.4685, "step": 1751 }, { "epoch": 0.05339469253099071, "grad_norm": 0.07777253538370132, "learning_rate": 6.2e-06, "loss": 0.4853, "step": 1752 }, { "epoch": 0.053425168953668216, "grad_norm": 0.08099931478500366, "learning_rate": 6.175e-06, "loss": 0.5142, "step": 1753 }, { "epoch": 0.05345564537634572, "grad_norm": 0.07605674117803574, "learning_rate": 6.15e-06, "loss": 0.4291, "step": 1754 }, { "epoch": 0.053486121799023234, "grad_norm": 0.06488881260156631, "learning_rate": 6.125e-06, "loss": 0.3659, "step": 1755 }, { "epoch": 0.05351659822170074, "grad_norm": 0.07975669205188751, "learning_rate": 6.1e-06, "loss": 0.5397, "step": 1756 }, { "epoch": 0.053547074644378245, "grad_norm": 0.07339906692504883, "learning_rate": 6.075e-06, "loss": 0.4241, "step": 1757 }, { "epoch": 0.05357755106705575, "grad_norm": 0.08850807696580887, "learning_rate": 6.0500000000000005e-06, "loss": 0.5001, "step": 1758 }, { "epoch": 0.053608027489733256, "grad_norm": 0.0716274231672287, "learning_rate": 6.025e-06, "loss": 0.452, "step": 1759 }, { "epoch": 0.05363850391241076, "grad_norm": 0.08363133668899536, "learning_rate": 6e-06, "loss": 0.4856, "step": 1760 }, { "epoch": 0.05366898033508827, "grad_norm": 0.07937105000019073, "learning_rate": 5.975e-06, "loss": 0.4635, "step": 1761 }, { "epoch": 0.05369945675776577, "grad_norm": 0.08995947241783142, "learning_rate": 5.95e-06, "loss": 0.4628, "step": 1762 }, { "epoch": 0.05372993318044328, "grad_norm": 0.08080995827913284, "learning_rate": 5.925e-06, "loss": 0.4531, "step": 1763 }, { "epoch": 0.053760409603120785, "grad_norm": 0.068483866751194, "learning_rate": 5.9e-06, "loss": 0.4203, "step": 1764 }, { "epoch": 0.05379088602579829, "grad_norm": 0.08373936265707016, "learning_rate": 5.875e-06, "loss": 0.5224, "step": 1765 }, { "epoch": 0.053821362448475796, "grad_norm": 0.07308318465948105, "learning_rate": 5.850000000000001e-06, "loss": 0.4074, "step": 1766 }, { "epoch": 0.0538518388711533, "grad_norm": 0.079605333507061, "learning_rate": 5.825000000000001e-06, "loss": 0.4819, "step": 1767 }, { "epoch": 0.05388231529383081, "grad_norm": 0.07526987791061401, "learning_rate": 5.8e-06, "loss": 0.4551, "step": 1768 }, { "epoch": 0.05391279171650832, "grad_norm": 0.07402167469263077, "learning_rate": 5.775000000000001e-06, "loss": 0.4314, "step": 1769 }, { "epoch": 0.053943268139185825, "grad_norm": 0.07275580614805222, "learning_rate": 5.750000000000001e-06, "loss": 0.3781, "step": 1770 }, { "epoch": 0.05397374456186333, "grad_norm": 0.07869464159011841, "learning_rate": 5.725e-06, "loss": 0.4588, "step": 1771 }, { "epoch": 0.054004220984540836, "grad_norm": 0.09349686652421951, "learning_rate": 5.7000000000000005e-06, "loss": 0.5665, "step": 1772 }, { "epoch": 0.05403469740721834, "grad_norm": 0.07086145877838135, "learning_rate": 5.675000000000001e-06, "loss": 0.3958, "step": 1773 }, { "epoch": 0.05406517382989585, "grad_norm": 0.07265803217887878, "learning_rate": 5.65e-06, "loss": 0.4375, "step": 1774 }, { "epoch": 0.05409565025257335, "grad_norm": 0.07727787643671036, "learning_rate": 5.625e-06, "loss": 0.438, "step": 1775 }, { "epoch": 0.05412612667525086, "grad_norm": 0.08220883458852768, "learning_rate": 5.600000000000001e-06, "loss": 0.5267, "step": 1776 }, { "epoch": 0.054156603097928364, "grad_norm": 0.078182652592659, "learning_rate": 5.575e-06, "loss": 0.4958, "step": 1777 }, { "epoch": 0.05418707952060587, "grad_norm": 0.07792630791664124, "learning_rate": 5.55e-06, "loss": 0.4712, "step": 1778 }, { "epoch": 0.054217555943283376, "grad_norm": 0.07781917601823807, "learning_rate": 5.5250000000000005e-06, "loss": 0.4751, "step": 1779 }, { "epoch": 0.05424803236596088, "grad_norm": 0.08237753063440323, "learning_rate": 5.500000000000001e-06, "loss": 0.3547, "step": 1780 }, { "epoch": 0.05427850878863839, "grad_norm": 0.08097647875547409, "learning_rate": 5.475e-06, "loss": 0.4954, "step": 1781 }, { "epoch": 0.05430898521131589, "grad_norm": 0.08303692936897278, "learning_rate": 5.45e-06, "loss": 0.5413, "step": 1782 }, { "epoch": 0.054339461633993405, "grad_norm": 0.06994259357452393, "learning_rate": 5.4250000000000006e-06, "loss": 0.409, "step": 1783 }, { "epoch": 0.05436993805667091, "grad_norm": 0.06907018274068832, "learning_rate": 5.4e-06, "loss": 0.3782, "step": 1784 }, { "epoch": 0.054400414479348416, "grad_norm": 0.07399621605873108, "learning_rate": 5.375e-06, "loss": 0.4477, "step": 1785 }, { "epoch": 0.05443089090202592, "grad_norm": 0.08269914239645004, "learning_rate": 5.3500000000000004e-06, "loss": 0.5439, "step": 1786 }, { "epoch": 0.05446136732470343, "grad_norm": 0.06781642138957977, "learning_rate": 5.325e-06, "loss": 0.3769, "step": 1787 }, { "epoch": 0.05449184374738093, "grad_norm": 0.07652926445007324, "learning_rate": 5.3e-06, "loss": 0.4843, "step": 1788 }, { "epoch": 0.05452232017005844, "grad_norm": 0.07510750740766525, "learning_rate": 5.275e-06, "loss": 0.4159, "step": 1789 }, { "epoch": 0.054552796592735944, "grad_norm": 0.07404252141714096, "learning_rate": 5.25e-06, "loss": 0.4543, "step": 1790 }, { "epoch": 0.05458327301541345, "grad_norm": 0.07301948219537735, "learning_rate": 5.225e-06, "loss": 0.3802, "step": 1791 }, { "epoch": 0.054613749438090956, "grad_norm": 0.0774642750620842, "learning_rate": 5.2e-06, "loss": 0.4172, "step": 1792 }, { "epoch": 0.05464422586076846, "grad_norm": 0.08397026360034943, "learning_rate": 5.175e-06, "loss": 0.4165, "step": 1793 }, { "epoch": 0.05467470228344597, "grad_norm": 0.07271844893693924, "learning_rate": 5.15e-06, "loss": 0.4012, "step": 1794 }, { "epoch": 0.05470517870612347, "grad_norm": 0.08337218314409256, "learning_rate": 5.125e-06, "loss": 0.4869, "step": 1795 }, { "epoch": 0.05473565512880098, "grad_norm": 0.07798396795988083, "learning_rate": 5.1e-06, "loss": 0.4662, "step": 1796 }, { "epoch": 0.05476613155147849, "grad_norm": 0.07565627992153168, "learning_rate": 5.0750000000000005e-06, "loss": 0.4712, "step": 1797 }, { "epoch": 0.054796607974155996, "grad_norm": 0.07714686542749405, "learning_rate": 5.050000000000001e-06, "loss": 0.4807, "step": 1798 }, { "epoch": 0.0548270843968335, "grad_norm": 0.07284951210021973, "learning_rate": 5.025e-06, "loss": 0.368, "step": 1799 }, { "epoch": 0.05485756081951101, "grad_norm": 0.07004691660404205, "learning_rate": 5e-06, "loss": 0.3735, "step": 1800 }, { "epoch": 0.05488803724218851, "grad_norm": 0.08440929651260376, "learning_rate": 4.975000000000001e-06, "loss": 0.5274, "step": 1801 }, { "epoch": 0.05491851366486602, "grad_norm": 0.08080580085515976, "learning_rate": 4.950000000000001e-06, "loss": 0.4999, "step": 1802 }, { "epoch": 0.054948990087543524, "grad_norm": 0.06864457577466965, "learning_rate": 4.925e-06, "loss": 0.3673, "step": 1803 }, { "epoch": 0.05497946651022103, "grad_norm": 0.07758983969688416, "learning_rate": 4.9000000000000005e-06, "loss": 0.4789, "step": 1804 }, { "epoch": 0.055009942932898535, "grad_norm": 0.08826585859060287, "learning_rate": 4.875000000000001e-06, "loss": 0.5679, "step": 1805 }, { "epoch": 0.05504041935557604, "grad_norm": 0.07601723074913025, "learning_rate": 4.85e-06, "loss": 0.4106, "step": 1806 }, { "epoch": 0.05507089577825355, "grad_norm": 0.07155335694551468, "learning_rate": 4.825e-06, "loss": 0.3888, "step": 1807 }, { "epoch": 0.05510137220093105, "grad_norm": 0.07949995994567871, "learning_rate": 4.800000000000001e-06, "loss": 0.4915, "step": 1808 }, { "epoch": 0.05513184862360856, "grad_norm": 0.07787677645683289, "learning_rate": 4.775e-06, "loss": 0.4713, "step": 1809 }, { "epoch": 0.05516232504628607, "grad_norm": 0.07640416920185089, "learning_rate": 4.75e-06, "loss": 0.4393, "step": 1810 }, { "epoch": 0.055192801468963576, "grad_norm": 0.08414927124977112, "learning_rate": 4.7250000000000005e-06, "loss": 0.5233, "step": 1811 }, { "epoch": 0.05522327789164108, "grad_norm": 0.0717151090502739, "learning_rate": 4.7e-06, "loss": 0.4319, "step": 1812 }, { "epoch": 0.05525375431431859, "grad_norm": 0.08354421705007553, "learning_rate": 4.675e-06, "loss": 0.5151, "step": 1813 }, { "epoch": 0.05528423073699609, "grad_norm": 0.0800461545586586, "learning_rate": 4.65e-06, "loss": 0.4779, "step": 1814 }, { "epoch": 0.0553147071596736, "grad_norm": 0.07104060053825378, "learning_rate": 4.625e-06, "loss": 0.4068, "step": 1815 }, { "epoch": 0.055345183582351104, "grad_norm": 0.07663567364215851, "learning_rate": 4.6e-06, "loss": 0.4561, "step": 1816 }, { "epoch": 0.05537566000502861, "grad_norm": 0.07491706311702728, "learning_rate": 4.575e-06, "loss": 0.4219, "step": 1817 }, { "epoch": 0.055406136427706115, "grad_norm": 0.08497320860624313, "learning_rate": 4.5500000000000005e-06, "loss": 0.5462, "step": 1818 }, { "epoch": 0.05543661285038362, "grad_norm": 0.07618972659111023, "learning_rate": 4.525e-06, "loss": 0.4827, "step": 1819 }, { "epoch": 0.055467089273061126, "grad_norm": 0.07458259910345078, "learning_rate": 4.5e-06, "loss": 0.4165, "step": 1820 }, { "epoch": 0.05549756569573863, "grad_norm": 0.07624120265245438, "learning_rate": 4.475e-06, "loss": 0.4327, "step": 1821 }, { "epoch": 0.05552804211841614, "grad_norm": 0.08696374297142029, "learning_rate": 4.45e-06, "loss": 0.5047, "step": 1822 }, { "epoch": 0.05555851854109364, "grad_norm": 0.0834360346198082, "learning_rate": 4.425e-06, "loss": 0.5383, "step": 1823 }, { "epoch": 0.055588994963771156, "grad_norm": 0.07860411703586578, "learning_rate": 4.4e-06, "loss": 0.4545, "step": 1824 }, { "epoch": 0.05561947138644866, "grad_norm": 0.08337979763746262, "learning_rate": 4.375e-06, "loss": 0.5042, "step": 1825 }, { "epoch": 0.05564994780912617, "grad_norm": 0.07183274626731873, "learning_rate": 4.35e-06, "loss": 0.4241, "step": 1826 }, { "epoch": 0.05568042423180367, "grad_norm": 0.0660005509853363, "learning_rate": 4.325e-06, "loss": 0.3585, "step": 1827 }, { "epoch": 0.05571090065448118, "grad_norm": 0.08253025263547897, "learning_rate": 4.2999999999999995e-06, "loss": 0.4495, "step": 1828 }, { "epoch": 0.055741377077158684, "grad_norm": 0.09679601341485977, "learning_rate": 4.2750000000000006e-06, "loss": 0.4807, "step": 1829 }, { "epoch": 0.05577185349983619, "grad_norm": 0.07512461394071579, "learning_rate": 4.250000000000001e-06, "loss": 0.4696, "step": 1830 }, { "epoch": 0.055802329922513695, "grad_norm": 0.08214529603719711, "learning_rate": 4.225e-06, "loss": 0.5397, "step": 1831 }, { "epoch": 0.0558328063451912, "grad_norm": 0.07940132915973663, "learning_rate": 4.2000000000000004e-06, "loss": 0.3591, "step": 1832 }, { "epoch": 0.055863282767868706, "grad_norm": 0.07063466310501099, "learning_rate": 4.175000000000001e-06, "loss": 0.3941, "step": 1833 }, { "epoch": 0.05589375919054621, "grad_norm": 0.07677746564149857, "learning_rate": 4.15e-06, "loss": 0.442, "step": 1834 }, { "epoch": 0.05592423561322372, "grad_norm": 0.07896104454994202, "learning_rate": 4.125e-06, "loss": 0.4757, "step": 1835 }, { "epoch": 0.05595471203590122, "grad_norm": 0.07606010884046555, "learning_rate": 4.1000000000000006e-06, "loss": 0.4444, "step": 1836 }, { "epoch": 0.05598518845857873, "grad_norm": 0.07802238315343857, "learning_rate": 4.075e-06, "loss": 0.4214, "step": 1837 }, { "epoch": 0.05601566488125624, "grad_norm": 0.07207982987165451, "learning_rate": 4.05e-06, "loss": 0.3938, "step": 1838 }, { "epoch": 0.05604614130393375, "grad_norm": 0.07555796951055527, "learning_rate": 4.0250000000000004e-06, "loss": 0.3955, "step": 1839 }, { "epoch": 0.05607661772661125, "grad_norm": 0.08003240823745728, "learning_rate": 4.000000000000001e-06, "loss": 0.4872, "step": 1840 }, { "epoch": 0.05610709414928876, "grad_norm": 0.07844400405883789, "learning_rate": 3.975e-06, "loss": 0.4945, "step": 1841 }, { "epoch": 0.056137570571966264, "grad_norm": 0.07300937175750732, "learning_rate": 3.95e-06, "loss": 0.4776, "step": 1842 }, { "epoch": 0.05616804699464377, "grad_norm": 0.07540492713451385, "learning_rate": 3.9250000000000005e-06, "loss": 0.4307, "step": 1843 }, { "epoch": 0.056198523417321275, "grad_norm": 0.07466063648462296, "learning_rate": 3.9e-06, "loss": 0.4662, "step": 1844 }, { "epoch": 0.05622899983999878, "grad_norm": 0.07338997721672058, "learning_rate": 3.875e-06, "loss": 0.3346, "step": 1845 }, { "epoch": 0.056259476262676286, "grad_norm": 0.08118147403001785, "learning_rate": 3.85e-06, "loss": 0.5191, "step": 1846 }, { "epoch": 0.05628995268535379, "grad_norm": 0.07481807470321655, "learning_rate": 3.825e-06, "loss": 0.4029, "step": 1847 }, { "epoch": 0.0563204291080313, "grad_norm": 0.07008731365203857, "learning_rate": 3.8e-06, "loss": 0.3911, "step": 1848 }, { "epoch": 0.0563509055307088, "grad_norm": 0.0755765289068222, "learning_rate": 3.775e-06, "loss": 0.4395, "step": 1849 }, { "epoch": 0.05638138195338631, "grad_norm": 0.07991984486579895, "learning_rate": 3.75e-06, "loss": 0.479, "step": 1850 }, { "epoch": 0.056411858376063814, "grad_norm": 0.08616796135902405, "learning_rate": 3.725e-06, "loss": 0.5003, "step": 1851 }, { "epoch": 0.05644233479874133, "grad_norm": 0.07634107768535614, "learning_rate": 3.7e-06, "loss": 0.4874, "step": 1852 }, { "epoch": 0.05647281122141883, "grad_norm": 0.08248892426490784, "learning_rate": 3.675e-06, "loss": 0.5017, "step": 1853 }, { "epoch": 0.05650328764409634, "grad_norm": 0.0769951269030571, "learning_rate": 3.6499999999999998e-06, "loss": 0.4517, "step": 1854 }, { "epoch": 0.056533764066773844, "grad_norm": 0.07078733295202255, "learning_rate": 3.625e-06, "loss": 0.4122, "step": 1855 }, { "epoch": 0.05656424048945135, "grad_norm": 0.07455853372812271, "learning_rate": 3.6e-06, "loss": 0.4442, "step": 1856 }, { "epoch": 0.056594716912128855, "grad_norm": 0.0808526948094368, "learning_rate": 3.575e-06, "loss": 0.4675, "step": 1857 }, { "epoch": 0.05662519333480636, "grad_norm": 0.08268711715936661, "learning_rate": 3.55e-06, "loss": 0.4969, "step": 1858 }, { "epoch": 0.056655669757483866, "grad_norm": 0.07475420087575912, "learning_rate": 3.5249999999999997e-06, "loss": 0.4762, "step": 1859 }, { "epoch": 0.05668614618016137, "grad_norm": 0.07622849196195602, "learning_rate": 3.5000000000000004e-06, "loss": 0.4371, "step": 1860 }, { "epoch": 0.05671662260283888, "grad_norm": 0.0740659162402153, "learning_rate": 3.4750000000000006e-06, "loss": 0.4601, "step": 1861 }, { "epoch": 0.05674709902551638, "grad_norm": 0.08383522927761078, "learning_rate": 3.4500000000000004e-06, "loss": 0.5344, "step": 1862 }, { "epoch": 0.05677757544819389, "grad_norm": 0.07983031123876572, "learning_rate": 3.4250000000000002e-06, "loss": 0.442, "step": 1863 }, { "epoch": 0.056808051870871394, "grad_norm": 0.07575611770153046, "learning_rate": 3.4000000000000005e-06, "loss": 0.4183, "step": 1864 }, { "epoch": 0.05683852829354891, "grad_norm": 0.06798914819955826, "learning_rate": 3.3750000000000003e-06, "loss": 0.3884, "step": 1865 }, { "epoch": 0.05686900471622641, "grad_norm": 0.0825703889131546, "learning_rate": 3.3500000000000005e-06, "loss": 0.4912, "step": 1866 }, { "epoch": 0.05689948113890392, "grad_norm": 0.07204671204090118, "learning_rate": 3.3250000000000004e-06, "loss": 0.4147, "step": 1867 }, { "epoch": 0.056929957561581424, "grad_norm": 0.0720638781785965, "learning_rate": 3.3e-06, "loss": 0.3834, "step": 1868 }, { "epoch": 0.05696043398425893, "grad_norm": 0.08080707490444183, "learning_rate": 3.2750000000000004e-06, "loss": 0.5463, "step": 1869 }, { "epoch": 0.056990910406936435, "grad_norm": 0.07068087160587311, "learning_rate": 3.2500000000000002e-06, "loss": 0.3694, "step": 1870 }, { "epoch": 0.05702138682961394, "grad_norm": 0.07543907314538956, "learning_rate": 3.225e-06, "loss": 0.4822, "step": 1871 }, { "epoch": 0.057051863252291446, "grad_norm": 0.08117543905973434, "learning_rate": 3.2000000000000003e-06, "loss": 0.523, "step": 1872 }, { "epoch": 0.05708233967496895, "grad_norm": 0.08211101591587067, "learning_rate": 3.175e-06, "loss": 0.5152, "step": 1873 }, { "epoch": 0.05711281609764646, "grad_norm": 0.0837220698595047, "learning_rate": 3.1500000000000003e-06, "loss": 0.509, "step": 1874 }, { "epoch": 0.05714329252032396, "grad_norm": 0.08205706626176834, "learning_rate": 3.125e-06, "loss": 0.4757, "step": 1875 }, { "epoch": 0.05717376894300147, "grad_norm": 0.07633814960718155, "learning_rate": 3.1e-06, "loss": 0.4744, "step": 1876 }, { "epoch": 0.057204245365678974, "grad_norm": 0.07428915053606033, "learning_rate": 3.075e-06, "loss": 0.445, "step": 1877 }, { "epoch": 0.05723472178835648, "grad_norm": 0.07590076327323914, "learning_rate": 3.05e-06, "loss": 0.4011, "step": 1878 }, { "epoch": 0.05726519821103399, "grad_norm": 0.08229390531778336, "learning_rate": 3.0250000000000003e-06, "loss": 0.502, "step": 1879 }, { "epoch": 0.0572956746337115, "grad_norm": 0.08211617916822433, "learning_rate": 3e-06, "loss": 0.5022, "step": 1880 }, { "epoch": 0.057326151056389, "grad_norm": 0.08459153771400452, "learning_rate": 2.975e-06, "loss": 0.5531, "step": 1881 }, { "epoch": 0.05735662747906651, "grad_norm": 0.07254498451948166, "learning_rate": 2.95e-06, "loss": 0.4235, "step": 1882 }, { "epoch": 0.057387103901744015, "grad_norm": 0.07736314088106155, "learning_rate": 2.9250000000000004e-06, "loss": 0.4423, "step": 1883 }, { "epoch": 0.05741758032442152, "grad_norm": 0.08175991475582123, "learning_rate": 2.9e-06, "loss": 0.4285, "step": 1884 }, { "epoch": 0.057448056747099026, "grad_norm": 0.07864397019147873, "learning_rate": 2.8750000000000004e-06, "loss": 0.4642, "step": 1885 }, { "epoch": 0.05747853316977653, "grad_norm": 0.07438363134860992, "learning_rate": 2.8500000000000002e-06, "loss": 0.44, "step": 1886 }, { "epoch": 0.05750900959245404, "grad_norm": 0.0803142562508583, "learning_rate": 2.825e-06, "loss": 0.4522, "step": 1887 }, { "epoch": 0.05753948601513154, "grad_norm": 0.07140840590000153, "learning_rate": 2.8000000000000003e-06, "loss": 0.4316, "step": 1888 }, { "epoch": 0.05756996243780905, "grad_norm": 0.06882616132497787, "learning_rate": 2.775e-06, "loss": 0.3946, "step": 1889 }, { "epoch": 0.057600438860486554, "grad_norm": 0.07327203452587128, "learning_rate": 2.7500000000000004e-06, "loss": 0.4212, "step": 1890 }, { "epoch": 0.05763091528316406, "grad_norm": 0.08073710650205612, "learning_rate": 2.725e-06, "loss": 0.4819, "step": 1891 }, { "epoch": 0.057661391705841565, "grad_norm": 0.08377823233604431, "learning_rate": 2.7e-06, "loss": 0.5784, "step": 1892 }, { "epoch": 0.05769186812851908, "grad_norm": 0.09281211346387863, "learning_rate": 2.6750000000000002e-06, "loss": 0.5763, "step": 1893 }, { "epoch": 0.05772234455119658, "grad_norm": 0.07777675241231918, "learning_rate": 2.65e-06, "loss": 0.4665, "step": 1894 }, { "epoch": 0.05775282097387409, "grad_norm": 0.07077455520629883, "learning_rate": 2.625e-06, "loss": 0.4047, "step": 1895 }, { "epoch": 0.057783297396551594, "grad_norm": 0.09553859382867813, "learning_rate": 2.6e-06, "loss": 0.6183, "step": 1896 }, { "epoch": 0.0578137738192291, "grad_norm": 0.08617044240236282, "learning_rate": 2.575e-06, "loss": 0.5409, "step": 1897 }, { "epoch": 0.057844250241906606, "grad_norm": 0.08568374067544937, "learning_rate": 2.55e-06, "loss": 0.5053, "step": 1898 }, { "epoch": 0.05787472666458411, "grad_norm": 0.07996345311403275, "learning_rate": 2.5250000000000004e-06, "loss": 0.4839, "step": 1899 }, { "epoch": 0.05790520308726162, "grad_norm": 0.08515867590904236, "learning_rate": 2.5e-06, "loss": 0.5723, "step": 1900 }, { "epoch": 0.05793567950993912, "grad_norm": 0.07514118403196335, "learning_rate": 2.4750000000000004e-06, "loss": 0.4108, "step": 1901 }, { "epoch": 0.05796615593261663, "grad_norm": 0.0790206640958786, "learning_rate": 2.4500000000000003e-06, "loss": 0.4441, "step": 1902 }, { "epoch": 0.057996632355294134, "grad_norm": 0.08381094038486481, "learning_rate": 2.425e-06, "loss": 0.4827, "step": 1903 }, { "epoch": 0.05802710877797164, "grad_norm": 0.07124467194080353, "learning_rate": 2.4000000000000003e-06, "loss": 0.3635, "step": 1904 }, { "epoch": 0.058057585200649145, "grad_norm": 0.07671955227851868, "learning_rate": 2.375e-06, "loss": 0.4435, "step": 1905 }, { "epoch": 0.05808806162332665, "grad_norm": 0.06961016356945038, "learning_rate": 2.35e-06, "loss": 0.3803, "step": 1906 }, { "epoch": 0.05811853804600416, "grad_norm": 0.07823831588029861, "learning_rate": 2.325e-06, "loss": 0.4838, "step": 1907 }, { "epoch": 0.05814901446868167, "grad_norm": 0.07713828980922699, "learning_rate": 2.3e-06, "loss": 0.4588, "step": 1908 }, { "epoch": 0.058179490891359174, "grad_norm": 0.07507798075675964, "learning_rate": 2.2750000000000002e-06, "loss": 0.4297, "step": 1909 }, { "epoch": 0.05820996731403668, "grad_norm": 0.08098819106817245, "learning_rate": 2.25e-06, "loss": 0.504, "step": 1910 }, { "epoch": 0.058240443736714186, "grad_norm": 0.07353410869836807, "learning_rate": 2.225e-06, "loss": 0.417, "step": 1911 }, { "epoch": 0.05827092015939169, "grad_norm": 0.08222463726997375, "learning_rate": 2.2e-06, "loss": 0.4958, "step": 1912 }, { "epoch": 0.0583013965820692, "grad_norm": 0.07737500965595245, "learning_rate": 2.175e-06, "loss": 0.5237, "step": 1913 }, { "epoch": 0.0583318730047467, "grad_norm": 0.07270614802837372, "learning_rate": 2.1499999999999997e-06, "loss": 0.4188, "step": 1914 }, { "epoch": 0.05836234942742421, "grad_norm": 0.07766509056091309, "learning_rate": 2.1250000000000004e-06, "loss": 0.4247, "step": 1915 }, { "epoch": 0.058392825850101714, "grad_norm": 0.0751839429140091, "learning_rate": 2.1000000000000002e-06, "loss": 0.4496, "step": 1916 }, { "epoch": 0.05842330227277922, "grad_norm": 0.0660671815276146, "learning_rate": 2.075e-06, "loss": 0.3416, "step": 1917 }, { "epoch": 0.058453778695456725, "grad_norm": 0.08144040405750275, "learning_rate": 2.0500000000000003e-06, "loss": 0.5381, "step": 1918 }, { "epoch": 0.05848425511813423, "grad_norm": 0.08119518309831619, "learning_rate": 2.025e-06, "loss": 0.4636, "step": 1919 }, { "epoch": 0.058514731540811736, "grad_norm": 0.07839170098304749, "learning_rate": 2.0000000000000003e-06, "loss": 0.4968, "step": 1920 }, { "epoch": 0.05854520796348925, "grad_norm": 0.07117439061403275, "learning_rate": 1.975e-06, "loss": 0.3631, "step": 1921 }, { "epoch": 0.058575684386166754, "grad_norm": 0.07680480182170868, "learning_rate": 1.95e-06, "loss": 0.4813, "step": 1922 }, { "epoch": 0.05860616080884426, "grad_norm": 0.08159365504980087, "learning_rate": 1.925e-06, "loss": 0.5391, "step": 1923 }, { "epoch": 0.058636637231521765, "grad_norm": 0.08709205687046051, "learning_rate": 1.9e-06, "loss": 0.5165, "step": 1924 }, { "epoch": 0.05866711365419927, "grad_norm": 0.08084145188331604, "learning_rate": 1.875e-06, "loss": 0.4691, "step": 1925 }, { "epoch": 0.05869759007687678, "grad_norm": 0.0735328271985054, "learning_rate": 1.85e-06, "loss": 0.4508, "step": 1926 }, { "epoch": 0.05872806649955428, "grad_norm": 0.06936170905828476, "learning_rate": 1.8249999999999999e-06, "loss": 0.3906, "step": 1927 }, { "epoch": 0.05875854292223179, "grad_norm": 0.07574455440044403, "learning_rate": 1.8e-06, "loss": 0.4429, "step": 1928 }, { "epoch": 0.058789019344909293, "grad_norm": 0.08336243033409119, "learning_rate": 1.775e-06, "loss": 0.4983, "step": 1929 }, { "epoch": 0.0588194957675868, "grad_norm": 0.0730966255068779, "learning_rate": 1.7500000000000002e-06, "loss": 0.4207, "step": 1930 }, { "epoch": 0.058849972190264305, "grad_norm": 0.08508003503084183, "learning_rate": 1.7250000000000002e-06, "loss": 0.5282, "step": 1931 }, { "epoch": 0.05888044861294181, "grad_norm": 0.08668641746044159, "learning_rate": 1.7000000000000002e-06, "loss": 0.5213, "step": 1932 }, { "epoch": 0.058910925035619316, "grad_norm": 0.07657695561647415, "learning_rate": 1.6750000000000003e-06, "loss": 0.4525, "step": 1933 }, { "epoch": 0.05894140145829683, "grad_norm": 0.07091470807790756, "learning_rate": 1.65e-06, "loss": 0.3995, "step": 1934 }, { "epoch": 0.058971877880974334, "grad_norm": 0.08255702257156372, "learning_rate": 1.6250000000000001e-06, "loss": 0.5159, "step": 1935 }, { "epoch": 0.05900235430365184, "grad_norm": 0.07477465271949768, "learning_rate": 1.6000000000000001e-06, "loss": 0.4578, "step": 1936 }, { "epoch": 0.059032830726329345, "grad_norm": 0.0878307893872261, "learning_rate": 1.5750000000000002e-06, "loss": 0.5358, "step": 1937 }, { "epoch": 0.05906330714900685, "grad_norm": 0.07151155173778534, "learning_rate": 1.55e-06, "loss": 0.3745, "step": 1938 }, { "epoch": 0.059093783571684357, "grad_norm": 0.07488836348056793, "learning_rate": 1.525e-06, "loss": 0.4471, "step": 1939 }, { "epoch": 0.05912425999436186, "grad_norm": 0.07117241621017456, "learning_rate": 1.5e-06, "loss": 0.4015, "step": 1940 }, { "epoch": 0.05915473641703937, "grad_norm": 0.07607581466436386, "learning_rate": 1.475e-06, "loss": 0.4584, "step": 1941 }, { "epoch": 0.05918521283971687, "grad_norm": 0.07796807587146759, "learning_rate": 1.45e-06, "loss": 0.4771, "step": 1942 }, { "epoch": 0.05921568926239438, "grad_norm": 0.07287101447582245, "learning_rate": 1.4250000000000001e-06, "loss": 0.3865, "step": 1943 }, { "epoch": 0.059246165685071885, "grad_norm": 0.08055838197469711, "learning_rate": 1.4000000000000001e-06, "loss": 0.5086, "step": 1944 }, { "epoch": 0.05927664210774939, "grad_norm": 0.0863788053393364, "learning_rate": 1.3750000000000002e-06, "loss": 0.5338, "step": 1945 }, { "epoch": 0.059307118530426896, "grad_norm": 0.07795803993940353, "learning_rate": 1.35e-06, "loss": 0.4711, "step": 1946 }, { "epoch": 0.0593375949531044, "grad_norm": 0.07831612974405289, "learning_rate": 1.325e-06, "loss": 0.4713, "step": 1947 }, { "epoch": 0.059368071375781914, "grad_norm": 0.07482554763555527, "learning_rate": 1.3e-06, "loss": 0.464, "step": 1948 }, { "epoch": 0.05939854779845942, "grad_norm": 0.07675724476575851, "learning_rate": 1.275e-06, "loss": 0.4891, "step": 1949 }, { "epoch": 0.059429024221136925, "grad_norm": 0.06598085165023804, "learning_rate": 1.25e-06, "loss": 0.3318, "step": 1950 }, { "epoch": 0.05945950064381443, "grad_norm": 0.06993094086647034, "learning_rate": 1.2250000000000001e-06, "loss": 0.3843, "step": 1951 }, { "epoch": 0.059489977066491936, "grad_norm": 0.06824394315481186, "learning_rate": 1.2000000000000002e-06, "loss": 0.3787, "step": 1952 }, { "epoch": 0.05952045348916944, "grad_norm": 0.0701347216963768, "learning_rate": 1.175e-06, "loss": 0.3889, "step": 1953 }, { "epoch": 0.05955092991184695, "grad_norm": 0.0751233845949173, "learning_rate": 1.15e-06, "loss": 0.4744, "step": 1954 }, { "epoch": 0.05958140633452445, "grad_norm": 0.08175686001777649, "learning_rate": 1.125e-06, "loss": 0.4995, "step": 1955 }, { "epoch": 0.05961188275720196, "grad_norm": 0.08348920941352844, "learning_rate": 1.1e-06, "loss": 0.4702, "step": 1956 }, { "epoch": 0.059642359179879464, "grad_norm": 0.08014917373657227, "learning_rate": 1.0749999999999999e-06, "loss": 0.4755, "step": 1957 }, { "epoch": 0.05967283560255697, "grad_norm": 0.07679284363985062, "learning_rate": 1.0500000000000001e-06, "loss": 0.5111, "step": 1958 }, { "epoch": 0.059703312025234476, "grad_norm": 0.08479301631450653, "learning_rate": 1.0250000000000001e-06, "loss": 0.5415, "step": 1959 }, { "epoch": 0.05973378844791198, "grad_norm": 0.07514672726392746, "learning_rate": 1.0000000000000002e-06, "loss": 0.4777, "step": 1960 }, { "epoch": 0.05976426487058949, "grad_norm": 0.0823245570063591, "learning_rate": 9.75e-07, "loss": 0.4523, "step": 1961 }, { "epoch": 0.059794741293267, "grad_norm": 0.06955621391534805, "learning_rate": 9.5e-07, "loss": 0.3664, "step": 1962 }, { "epoch": 0.059825217715944505, "grad_norm": 0.07471796125173569, "learning_rate": 9.25e-07, "loss": 0.4355, "step": 1963 }, { "epoch": 0.05985569413862201, "grad_norm": 0.06898004561662674, "learning_rate": 9e-07, "loss": 0.3721, "step": 1964 }, { "epoch": 0.059886170561299516, "grad_norm": 0.07481876015663147, "learning_rate": 8.750000000000001e-07, "loss": 0.4541, "step": 1965 }, { "epoch": 0.05991664698397702, "grad_norm": 0.08357566595077515, "learning_rate": 8.500000000000001e-07, "loss": 0.5171, "step": 1966 }, { "epoch": 0.05994712340665453, "grad_norm": 0.0776282325387001, "learning_rate": 8.25e-07, "loss": 0.5076, "step": 1967 }, { "epoch": 0.05997759982933203, "grad_norm": 0.07517019659280777, "learning_rate": 8.000000000000001e-07, "loss": 0.4226, "step": 1968 }, { "epoch": 0.06000807625200954, "grad_norm": 0.07620959728956223, "learning_rate": 7.75e-07, "loss": 0.4712, "step": 1969 }, { "epoch": 0.060038552674687044, "grad_norm": 0.07347417622804642, "learning_rate": 7.5e-07, "loss": 0.4205, "step": 1970 }, { "epoch": 0.06006902909736455, "grad_norm": 0.07087808847427368, "learning_rate": 7.25e-07, "loss": 0.3555, "step": 1971 }, { "epoch": 0.060099505520042056, "grad_norm": 0.07126067578792572, "learning_rate": 7.000000000000001e-07, "loss": 0.4166, "step": 1972 }, { "epoch": 0.06012998194271956, "grad_norm": 0.07367749512195587, "learning_rate": 6.75e-07, "loss": 0.4399, "step": 1973 }, { "epoch": 0.06016045836539707, "grad_norm": 0.0732448399066925, "learning_rate": 6.5e-07, "loss": 0.4137, "step": 1974 }, { "epoch": 0.06019093478807457, "grad_norm": 0.08127132803201675, "learning_rate": 6.25e-07, "loss": 0.5205, "step": 1975 }, { "epoch": 0.060221411210752085, "grad_norm": 0.0776558667421341, "learning_rate": 6.000000000000001e-07, "loss": 0.4273, "step": 1976 }, { "epoch": 0.06025188763342959, "grad_norm": 0.07769973576068878, "learning_rate": 5.75e-07, "loss": 0.457, "step": 1977 }, { "epoch": 0.060282364056107096, "grad_norm": 0.07712487876415253, "learning_rate": 5.5e-07, "loss": 0.4336, "step": 1978 }, { "epoch": 0.0603128404787846, "grad_norm": 0.08771371841430664, "learning_rate": 5.250000000000001e-07, "loss": 0.5787, "step": 1979 }, { "epoch": 0.06034331690146211, "grad_norm": 0.0856744647026062, "learning_rate": 5.000000000000001e-07, "loss": 0.5608, "step": 1980 }, { "epoch": 0.06037379332413961, "grad_norm": 0.08874399214982986, "learning_rate": 4.75e-07, "loss": 0.5566, "step": 1981 }, { "epoch": 0.06040426974681712, "grad_norm": 0.0749705359339714, "learning_rate": 4.5e-07, "loss": 0.4146, "step": 1982 }, { "epoch": 0.060434746169494624, "grad_norm": 0.07864365726709366, "learning_rate": 4.2500000000000006e-07, "loss": 0.4227, "step": 1983 }, { "epoch": 0.06046522259217213, "grad_norm": 0.07229630649089813, "learning_rate": 4.0000000000000003e-07, "loss": 0.4497, "step": 1984 }, { "epoch": 0.060495699014849635, "grad_norm": 0.07828386127948761, "learning_rate": 3.75e-07, "loss": 0.4999, "step": 1985 }, { "epoch": 0.06052617543752714, "grad_norm": 0.08361735939979553, "learning_rate": 3.5000000000000004e-07, "loss": 0.5418, "step": 1986 }, { "epoch": 0.06055665186020465, "grad_norm": 0.07871618121862411, "learning_rate": 3.25e-07, "loss": 0.4616, "step": 1987 }, { "epoch": 0.06058712828288215, "grad_norm": 0.08241317421197891, "learning_rate": 3.0000000000000004e-07, "loss": 0.5237, "step": 1988 }, { "epoch": 0.060617604705559665, "grad_norm": 0.08095940202474594, "learning_rate": 2.75e-07, "loss": 0.4683, "step": 1989 }, { "epoch": 0.06064808112823717, "grad_norm": 0.08275561034679413, "learning_rate": 2.5000000000000004e-07, "loss": 0.5142, "step": 1990 }, { "epoch": 0.060678557550914676, "grad_norm": 0.07975674420595169, "learning_rate": 2.25e-07, "loss": 0.4293, "step": 1991 }, { "epoch": 0.06070903397359218, "grad_norm": 0.08442219346761703, "learning_rate": 2.0000000000000002e-07, "loss": 0.4832, "step": 1992 }, { "epoch": 0.06073951039626969, "grad_norm": 0.08482984453439713, "learning_rate": 1.7500000000000002e-07, "loss": 0.4675, "step": 1993 }, { "epoch": 0.06076998681894719, "grad_norm": 0.08494574576616287, "learning_rate": 1.5000000000000002e-07, "loss": 0.5115, "step": 1994 }, { "epoch": 0.0608004632416247, "grad_norm": 0.07198780030012131, "learning_rate": 1.2500000000000002e-07, "loss": 0.4037, "step": 1995 }, { "epoch": 0.060830939664302204, "grad_norm": 0.08138151466846466, "learning_rate": 1.0000000000000001e-07, "loss": 0.5054, "step": 1996 }, { "epoch": 0.06086141608697971, "grad_norm": 0.07247910648584366, "learning_rate": 7.500000000000001e-08, "loss": 0.3885, "step": 1997 }, { "epoch": 0.060891892509657215, "grad_norm": 0.07957529276609421, "learning_rate": 5.0000000000000004e-08, "loss": 0.5059, "step": 1998 }, { "epoch": 0.06092236893233472, "grad_norm": 0.08180355280637741, "learning_rate": 2.5000000000000002e-08, "loss": 0.4757, "step": 1999 }, { "epoch": 0.060952845355012227, "grad_norm": 0.08512572944164276, "learning_rate": 0.0, "loss": 0.5458, "step": 2000 }, { "epoch": 0.060952845355012227, "step": 2000, "total_flos": 1.04520375926784e+18, "train_loss": 0.5059917987585068, "train_runtime": 72742.6851, "train_samples_per_second": 0.88, "train_steps_per_second": 0.027 } ], "logging_steps": 1.0, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.04520375926784e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }